diff --git a/.gitignore b/.gitignore index c9f46de8fb8..99bb80d6380 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,8 @@ data.prototext* # Can also ignore all directories and files in a directory. # tmp/**/* build +spack_environments/users/ + + +# we don't want to collect slurm output +**/slurm-*.out diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000000..4afaeaac3b8 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,11 @@ +[submodule "applications/graph/snap"] + path = applications/graph/snap + url = https://github.com/snap-stanford/snap + ignore = dirty +[submodule "applications/graph/largescale_node2vec"] + path = applications/graph/largescale_node2vec + url = https://lc.llnl.gov/bitbucket/scm/havoq/largescale_node2vec.git + ignore = dirty +[submodule "applications/ATOM/moses"] + path = applications/ATOM/moses + url = git@github.com:samadejacobs/moses.git diff --git a/.readthedocs.yml b/.readthedocs.yml index 9e2728c0935..dd95022b107 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -1,7 +1,24 @@ # .readthedocs.yml +# Config file for Read the Docs +# https://docs.readthedocs.io/en/stable/config-file/v2.html + +version: 2 + +sphinx: + builder: html + configuration: docs/conf.py + +formats: [] build: image: latest python: version: 3.7 + install: + - requirements: docs/sphinx_requirements.txt + +submodules: + include: [] + + diff --git a/CMakeLists.txt b/CMakeLists.txt index 4dfb77a0e19..807adfb27da 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.12) +cmake_minimum_required(VERSION 3.13) project(LBANN CXX) @@ -48,7 +48,7 @@ endif () # set(LBANN_VERSION_MAJOR 0) -set(LBANN_VERSION_MINOR 99) +set(LBANN_VERSION_MINOR 100) set(LBANN_VERSION_PATCH 0) set(LBANN_VERSION "${LBANN_VERSION_MAJOR}.${LBANN_VERSION_MINOR}.${LBANN_VERSION_PATCH}") @@ -104,6 +104,20 @@ option(LBANN_WITH_CONDUIT "Enable Conduit library" ON) option(LBANN_WITH_CUDNN "Include Nvidia cuDNN" ON) +option(LBANN_WITH_DIHYDROGEN "Build with DiHydrogen support" OFF) +if (LBANN_WITH_DIHYDROGEN) + message(WARNING "DiHydrogen support is currently expermimental. " + "There is no stable interface. " + "Use caution before using any features.") +endif (LBANN_WITH_DIHYDROGEN) + +option(LBANN_WITH_DISTCONV "Enable DiHydrogen's Distconv" OFF) +if (LBANN_WITH_DISTCONV) + message(WARNING "Distconv support is currently expermimental. " + "There is no stable interface. " + "Use caution before using any features.") +endif (LBANN_WITH_DISTCONV) + option(LBANN_WITH_HWLOC "Enable topology-aware optimizations" ON) @@ -121,13 +135,10 @@ option(LBANN_WITH_VTUNE option(LBANN_WITH_UNIT_TESTING "Enable the unit testing framework (requires Catch2)" OFF) -# Enable parallel random matrix generation, if possible +# Use deterministic GPU algorithms and layer operations option(LBANN_DETERMINISTIC "Use deterministic algorithms as much as possible." OFF) -option(LBANN_SEQUENTIAL_INITIALIZATION - "Sequentially consistent initialization" OFF) - option(LBANN_DEBUG_PRINT_SUBTARGETS "Turn on debugging output of internal target properties." 
OFF) mark_as_advanced(LBANN_DEBUG_PRINT_SUBTARGETS) @@ -161,6 +172,11 @@ include(SetupCXX) ################################################################ # Required dependencies +find_package(Threads REQUIRED) + +# Argument parsing backend +find_package(Clara REQUIRED) + find_package(CEREAL NO_MODULE HINTS ${CEREAL_DIR} $ENV{CEREAL_DIR} PATH_SUFFIXES share/cmake/cereal @@ -172,16 +188,50 @@ set(LBANN_HAS_CEREAL ${CEREAL_FOUND}) # The imported target is just called "cereal". Super. # Setup the linear algebra library -find_package(Hydrogen 1.2.0 NO_MODULE QUIET +find_package(Hydrogen 1.3.3 NO_MODULE QUIET HINTS ${Hydrogen_DIR} ${HYDROGEN_DIR} $ENV{Hydrogen_DIR} $ENV{HYDROGEN_DIR} PATH_SUFFIXES lib/cmake/hydrogen NO_DEFAULT_PATH) if (NOT Hydrogen_FOUND) - find_package(Hydrogen 1.2.0 NO_MODULE QUIET REQUIRED) + find_package(Hydrogen 1.3.3 NO_MODULE QUIET REQUIRED) endif () message(STATUS "Found Hydrogen: ${Hydrogen_DIR}") set(LBANN_HAS_HYDROGEN ${Hydrogen_FOUND}) +# DiHydrogen and Distconv +if (LBANN_WITH_DISTCONV AND NOT LBANN_WITH_DIHYDROGEN) + message(FATAL_ERROR "Distconv requires DiHydrogen. Enable DiHydrogen to use Distconv.") +endif () + +if (LBANN_WITH_DIHYDROGEN) + if (LBANN_WITH_DISTCONV) + find_package(DiHydrogen CONFIG COMPONENTS Meta Patterns DistConv + HINTS ${DIHYDROGEN_DIR} $ENV{DIHYDROGEN_DIR} + ${H2_DIR} $ENV{H2_DIR} + PATH_SUFFIXES install/lib64/cmake install/lib/cmake + NO_DEFAULT_PATH) + find_package(DiHydrogen CONFIG REQUIRED COMPONENTS Meta Patterns DistConv) + set(LBANN_HAS_DISTCONV TRUE) + else () + find_package(DiHydrogen CONFIG COMPONENTS Meta Patterns + HINTS ${DIHYDROGEN_DIR} $ENV{DIHYDROGEN_DIR} + ${H2_DIR} $ENV{H2_DIR} + PATH_SUFFIXES install/lib64/cmake install/lib/cmake + NO_DEFAULT_PATH) + find_package(DiHydrogen CONFIG REQUIRED COMPONENTS Meta Patterns) + endif () + set(LBANN_HAS_DIHYDROGEN TRUE) +endif () + +# Inherit half-precision stuff from Hydrogen +set(LBANN_HAS_HALF ${HYDROGEN_HAVE_HALF}) # This is CPU-only + +# Not the ideal fix, but should be fine for now. +if (Aluminum_FOUND) + message(STATUS "Aluminum found in Hydrogen. Using Aluminum.") + set(LBANN_WITH_ALUMINUM ON CACHE BOOL "Use aluminum." FORCE) +endif () + include(SetupOpenMP) include(SetupMPI) include(SetupProtobuf) @@ -201,6 +251,11 @@ set(LBANN_HAS_OPENCV ${OpenCV_FOUND}) set(LBANN_HAS_CUDA ${_HYDROGEN_HAVE_CUDA}) set(LBANN_WITH_CUDA ${LBANN_HAS_CUDA}) +# Only used if have GPU and have CPU half. 
+if (LBANN_HAS_CUDA AND LBANN_HAS_HALF) + set(LBANN_HAS_GPU_FP16 ${HYDROGEN_GPU_USE_FP16}) +endif () + if (LBANN_HAS_CUDA) enable_language(CUDA) @@ -214,13 +269,15 @@ endif () if (LBANN_WITH_ALUMINUM) # Aluminum may have already been found by Hydrogen if (NOT Aluminum_FOUND) - find_package(Aluminum 0.2.0 NO_MODULE QUIET + message(WARNING + "Using Aluminum without Hydrogen support may not be well-supported.") + find_package(Aluminum 0.3.0 NO_MODULE QUIET HINTS ${Aluminum_DIR} ${ALUMINUM_DIR} ${AL_DIR} $ENV{Aluminum_DIR} $ENV{ALUMINUM_DIR} $ENV{AL_DIR} PATH_SUFFIXES lib64/cmake/aluminum lib/cmake/aluminum NO_DEFAULT_PATH) if (NOT Aluminum_FOUND) - find_package(Aluminum 0.2.0 NO_MODULE QUIET) + find_package(Aluminum 0.3.0 NO_MODULE QUIET) endif () endif () set(LBANN_HAS_ALUMINUM ${Aluminum_FOUND}) @@ -264,6 +321,11 @@ if (LBANN_HAS_CUDA) include(SetupCUDAToolkit) + if (LBANN_HAS_GPU_FP16) + set_property(TARGET cuda::toolkit PROPERTY + INTERFACE_COMPILE_OPTIONS $<$:-arch=sm_60>) + endif (LBANN_HAS_GPU_FP16) + set(LBANN_HAS_CUDNN ${CUDNN_FOUND}) if (LBANN_HAS_ALUMINUM AND AL_HAS_NCCL) @@ -271,6 +333,16 @@ if (LBANN_HAS_CUDA) else () set(LBANN_HAS_NCCL2 FALSE) endif () + + if (LBANN_WITH_NVSHMEM) + find_package(NVSHMEM REQUIRED) + set_property(TARGET cuda::toolkit PROPERTY + INTERFACE_COMPILE_OPTIONS $<$:-arch=sm_70>) + # Build LBANN as a static library to get around a bug in NVSHMEM + set(BUILD_SHARED_LIBS OFF) + endif () + set(LBANN_HAS_NVSHMEM "${NVSHMEM_FOUND}") + endif (LBANN_HAS_CUDA) # This shouldn't be here, but is ok for now. This will occasionally be @@ -415,22 +487,28 @@ if (LBANN_WITH_CONDUIT) endif () endforeach () + get_filename_component(_conduit_include_dirs + "${CONDUIT_INCLUDE_DIRS}" DIRECTORY) + if (HDF5_FOUND_WITH_MODULE) list(APPEND _conduit_interface_link_libs ${HDF5_LIBRARIES}) - set_target_properties(conduit::conduit - PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${HDF5_INCLUDE_DIRS}") + list(APPEND _conduit_include_dirs + "${HDF5_INCLUDE_DIRS}") endif () + set_property(TARGET conduit::conduit + PROPERTY + INTERFACE_INCLUDE_DIRECTORIES + "${_conduit_include_dirs}") + set_target_properties(conduit::conduit PROPERTIES INTERFACE_LINK_LIBRARIES "${_conduit_interface_link_libs}") set(CONDUIT_LIBRARIES conduit::conduit) - set(LBANN_HAS_CONDUIT ${Conduit_FOUND}) endif (LBANN_WITH_CONDUIT) if (LBANN_WITH_UNIT_TESTING) @@ -446,7 +524,11 @@ if (LBANN_WITH_UNIT_TESTING) # Now that Catch2 has been found, start adding the unit tests include(CTest) include(Catch) + add_subdirectory(src/proto/unit_test) add_subdirectory(src/utils/unit_test) + add_subdirectory(src/weights/unit_test) + add_subdirectory(src/transforms/unit_test) + add_subdirectory(src/transforms/vision/unit_test) # Add this one last add_subdirectory(unit_test) @@ -459,16 +541,16 @@ add_subdirectory(docs) # Build LBANN ################################################################ +# Add LBANN source files +add_subdirectory(include) +add_subdirectory(src) + # Write the configure file configure_file( "${CMAKE_SOURCE_DIR}/cmake/configure_files/lbann_config.hpp.in" "${CMAKE_BINARY_DIR}/lbann_config.hpp" @ONLY) -# Add LBANN source files -add_subdirectory(include) -add_subdirectory(src) - # Create the LBANN library add_library(lbann ${LBANN_SOURCES} ${LBANN_HEADERS} ${LBANN_CUDA_SOURCES}) @@ -477,12 +559,10 @@ target_include_directories(lbann PUBLIC $ $) -if (LBANN_HAS_PYTHON) - target_include_directories(lbann PUBLIC ${Python_INCLUDE_DIRS}) -endif () - # Use the IMPORTED targets when possible. 
target_link_libraries(lbann PUBLIC LbannProto) +target_link_libraries(lbann PUBLIC Threads::Threads) +target_link_libraries(lbann PUBLIC clara::clara) target_link_libraries(lbann PUBLIC cereal) target_link_libraries(lbann PUBLIC OpenMP::OpenMP_CXX) target_link_libraries(lbann PUBLIC MPI::MPI_CXX) @@ -491,6 +571,15 @@ target_link_libraries(lbann PUBLIC ${HYDROGEN_LIBRARIES}) target_link_libraries(lbann PUBLIC ${OpenCV_LIBRARIES}) target_link_libraries(lbann PUBLIC ${CONDUIT_LIBRARIES}) +target_link_libraries(lbann PUBLIC + $ + $ + ) + +if (LBANN_WITH_DISTCONV) + target_link_libraries(lbann PUBLIC H2::H2DistConv) +endif () + if (LBANN_HAS_TBINF) target_link_libraries(lbann PUBLIC TBinf) endif () @@ -512,7 +601,12 @@ if (LBANN_HAS_VTUNE) endif () if (LBANN_HAS_PYTHON) - target_link_libraries(lbann PUBLIC ${Python_LIBRARIES}) + target_link_libraries(lbann PUBLIC Python::Python) +endif () + +if (LBANN_HAS_NVSHMEM) + set_property(TARGET lbann PROPERTY CUDA_SEPARABLE_COMPILATION ON) + target_link_libraries(lbann PUBLIC NVSHMEM::NVSHMEM) endif () if (TARGET LBANN_CXX_FLAGS_werror) @@ -521,6 +615,27 @@ endif () target_link_libraries(lbann PUBLIC ${DL_LIBRARY}) +# Fix the -g issue with Clang on OSX +if (APPLE) + # Remove -g from the options + string(REPLACE "-g" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + string(REPLACE "-g" "" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}") + + # Get all the sources and add "-g" to all of them. + get_target_property(_LBANN_SRCS lbann SOURCES) + set_source_files_properties(${_LBANN_SRCS} + PROPERTIES COMPILE_OPTIONS "-g") + + # Cleanup source files + foreach (bad_file IN LISTS _LBANN_SRCS) + get_source_file_property( + _SRC_COMPILE_OPTS "${bad_file}" COMPILE_OPTIONS) + string(REPLACE "-g" "" _SRC_COMPILE_OPTS "${COMPILE_OPTIONS}") + set_source_files_properties( + "${bad_file}" PROPERTIES COMPILE_OPTIONS "${_SRC_COMPILE_OPTS}") + endforeach () +endif () + # Clean things up include(LBANNDebugUtilities) lbann_remove_default_include_paths_from_all_subtargets(lbann) @@ -539,6 +654,8 @@ endif () add_subdirectory(model_zoo) add_subdirectory(model_zoo/tests) add_subdirectory(model_zoo/jag_utils) +add_subdirectory(applications/CANDLE/pilot2/tools) +add_subdirectory(applications/ATOM/utils) add_subdirectory(tests) add_subdirectory(scripts) @@ -733,6 +850,8 @@ string(APPEND _str "\n") #Print the true/false guys append_str_tf(_str LBANN_GNU_LINUX + LBANN_HAS_DIHYDROGEN + LBANN_HAS_DISTCONV LBANN_HAS_HYDROGEN LBANN_HAS_OPENCV LBANN_HAS_CEREAL @@ -747,7 +866,6 @@ append_str_tf(_str LBANN_HAS_DOXYGEN LBANN_HAS_LBANN_PROTO LBANN_HAS_ALUMINUM - LBANN_HAS_CONDUIT LBANN_HAS_PYTHON) string(APPEND _str "\n== End LBANN Configuration Summary ==\n") @@ -774,6 +892,13 @@ configure_file( "${CMAKE_SOURCE_DIR}/cmake/configure_files/lbann_module.lua.in" "${CMAKE_BINARY_DIR}/lbann_module.lua.install" @ONLY) +configure_file( + "${CMAKE_SOURCE_DIR}/cmake/configure_files/lbann_module.tcl.in" + "${CMAKE_BINARY_DIR}/lbann_module.tcl.install") + install(FILES "${CMAKE_BINARY_DIR}/lbann_module.lua.install" RENAME "${LBANN_MODULEFILE_NAME}" DESTINATION "${CMAKE_INSTALL_SYSCONFDIR}/modulefiles") +install(FILES "${CMAKE_BINARY_DIR}/lbann_module.tcl.install" + RENAME "${LBANN_VERSION}" + DESTINATION "${CMAKE_INSTALL_SYSCONFDIR}/modulefiles/lbann") diff --git a/ReleaseNotes.txt b/ReleaseNotes.txt index 13418207629..1ebd8e4a2b8 100644 --- a/ReleaseNotes.txt +++ b/ReleaseNotes.txt @@ -21,6 +21,102 @@ Bug fixes: Retired features: +============================== Release Notes: v0.100 
============================== +Support for new network structures: + - 3D molecular generation models for Metal Organic Frameworks from the CoRE MOF Database. + - 3D CosmoFlow Model + - DenseNet + - ATOM LSTM model + - RAS state classifier + - node2vec + - Transformer and other attention-based models + - ExaGAN (formerly CosmoGAN) + - MaCC ICF surrogate model + +Applications: + - Created a directory of example applications, deprecating the "model zoo" directory + +Support for new layers: + - Embedding layer + - Distributed embedding layer + - Channel-wise scale/bias layer + - Entry-wise scale/bias layer + - Gated-Recurrent Units (GRU) + - Entry-wise batchnorm + - Argmax, Argmin, and one-hot layers + - Layer norm + - Deconvolution layer (transposed convolution) + - Layers for channel-wise operations (channel-wise fully-connected, channel-wise softmax, channel-wise scale/bias, instance norm) + - Matrix multiply layer + +Python front-end: + - Can now configure contrib launcher with environment variables + - Added NERSC compute center + - Per-layer specification of compute device (CPU or GPU) + - Option to write custom batch scripts with Python front-end + +Performance optimizations: + - Parallelized Python data reader with "multiprocessing" module + - Fuse batchnorm stats allreduces in FP/BP. + - Tuned concatenate and slice layer + - Dynamically allocate and free memory for layer error signals (halves LBANN's memory footprint) + +Model portability & usability: + - Bamboo tests for individual layers + +Internal features: + - Added support for DistConv features (distributed, generalized, + parallel convolution) + - Added support for NVSHMEM 1.0 API (used in distributed embedding + layer and DistConv halo exchange) + - Support for multiple data types per model (per-layer) + - Support for per-layer mixed-precision weight training and inference, + includes per-weight object and objective function mixed-precision. + - Improved how and when the RNGs are initialized + - Callback to dump images to TensorBoard + - Callback to save model weights (useful to export to PyTorch) + - Callback to save top K models (LTFB) + - Improved run-to-run reproducibility by initializing weights in alphabetical order + - Moved models from model_zoo directory to applications directory + - Cleanup and refactoring of callbacks and layer instantiation + - Grouped batchnorm statistics + - Callback to print model description + - Refactored trainer and training-state out of the model class + - Support for transposing data in matrix multiply layers + - Added DiHydrogen tensor and DistConv library + - Added parallel strategy to layer class to support DistConv + - LBANN inference mode supports loading models from multiple directories + - Cleanup of checkpoint and restart logic + +I/O & data readers: + - Added in-memory data store that caches samples in CPU memory. 
It can be loaded + during the first epoch or preloaded + - Added new "transform" data preprocessing ingestion pipeline + - Added sample list format for specifying data sets + - Introduced data coordinator that manages data readers and extracts them from + the input layers + - Data store is able to checkpoint / spill its contents to local disk + - Data reader for SMILES strings + +Build system: + - Hydrogen 1.3.4 + - Aluminum 0.3.3 + - Improved documentation on Read the Docs (RTD) + - Robust support for using Spack as a build system around CMake + - Identified compute centers for specifying build and run dependencies + - Added Catch2-based tests + +Bug fixes: + - Fixed path resolution for dump weights, save model, and checkpoint callbacks + - Added mutexes for preloading the data store + - Fixed the LTFB exchange to include all ADAM optimizer state + - Fixed the mapping of I/O RNGs to I/O processing threads to ensure + consistent and correct multi-threaded performance + +Retired features: + - moving MNIST data reader is replaced by the Python data reader + - ASCII data reader is deprecated + ============================== Release Notes: v0.99 ============================== Support for new training algorithms: - Improvements to LTFB infrastructure (including transfer of SGD and Adam hyperparameters) diff --git a/applications/.gitignore b/applications/.gitignore new file mode 100644 index 00000000000..aa6a015fd1e --- /dev/null +++ b/applications/.gitignore @@ -0,0 +1,21 @@ +# Setup standard ignores to keep the applications directory hierarchy clean + +# Building in source tree garbage +.cproject +.project +*.o +*.a + +# Emacs backup garbage +.backup/ + +# Other standard ignores +*~ +*.pyc +\#*# +.#* +.*.swp +.DS_Store + +# Python garbage +__pycache__/ diff --git a/applications/ATOM/README.md b/applications/ATOM/README.md new file mode 100644 index 00000000000..1789b609f4a --- /dev/null +++ b/applications/ATOM/README.md @@ -0,0 +1,46 @@ +## Accelerating Therapeutics for Opportunities in Medicine (ATOM) + +Models for training neural networks to support the [ATOM](https://atomscience.org) project + +The train_atom_char_rnn.py script implements a GRU-based recurrent model for generating new SMILES strings. +Original neural network model and training hyperparameters are described in the [MOSES benchmark](https://github.com/samadejacobs/moses/tree/master/moses/char_rnn). Please see the LBANN documentation on how to install, build and run LBANN code.
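The model consumes SMILES strings as sequences of integer token IDs, looked up in a character vocabulary (one `token index` pair per line, as in `data/vocab_universal.txt`) and padded to a fixed length. The sketch below is only an illustration of that preprocessing, mirroring the padding logic in `dataset.py`; the function names are hypothetical, and the default `max_seq_len=57` / `pad_index=28` values are assumptions taken from `zinc_data_config.json` that must match whatever vocabulary is actually used.

```python
import numpy as np

def load_vocab(path):
    """Read a vocabulary file with one 'token index' pair per line."""
    vocab = {}
    with open(path) as f:
        for line in f:
            fields = line.rstrip("\n").split(" ")
            if len(fields) == 2 and fields[1].isdigit():
                vocab[fields[0]] = int(fields[1])
    return vocab

def encode_smiles(smiles, vocab, max_seq_len=57, pad_index=28):
    """Map characters to token IDs and pad/truncate to max_seq_len, as dataset.py does."""
    ids = [vocab[c] for c in smiles if c in vocab]
    if len(ids) < max_seq_len:
        ids = ids + [pad_index] * (max_seq_len - len(ids))
    return np.asarray(ids[:max_seq_len], dtype=np.int64)

# Example: encode_smiles("CC(=O)Oc1ccccc1C(=O)O", load_vocab("data/vocab_universal.txt"))
```

In the actual training pipeline these IDs come from the Python or SMILES data readers; the snippet is only meant to make the expected input format concrete.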
+ +### How to train +```bash +run python3 train_atom_char_rnn.py +``` + +Expected training output in LBANN (250K ZINC training dataset, on a single LLNL Pascal GPU) is shown below: +``` +-------------------------------------------------------------------------------- +[0] Epoch : stats formated [tr/v/te] iter/epoch = [3907/0/0] + global MB = [ 64/ 0/ 0] global last MB = [ 16 / 0 / 0 ] + local MB = [ 64/ 0/ 0] local last MB = [ 16+0/ 0+0/ 0+0] +-------------------------------------------------------------------------------- +model0 (instance 0) training epoch 0 objective function : 0.438031 +model0 (instance 0) training epoch 0 run time : 1009.55s +model0 (instance 0) training epoch 0 mini-batch time statistics : 0.257328s mean, 1.89938s max, 0.15177s min, 0.0331048s stdev +-------------------------------------------------------------------------------- +[1] Epoch : stats formated [tr/v/te] iter/epoch = [3907/0/0] + global MB = [ 64/ 0/ 0] global last MB = [ 16 / 0 / 0 ] + local MB = [ 64/ 0/ 0] local last MB = [ 16+0/ 0+0/ 0+0] +-------------------------------------------------------------------------------- +model0 (instance 0) training epoch 1 objective function : 0.37321 +model0 (instance 0) training epoch 1 run time : 1006.6s +model0 (instance 0) training epoch 1 mini-batch time statistics : 0.256573s mean, 0.912742s max, 0.158709s min, 0.0193512s stdev +``` + +### Inference and Sampling + +1. Clone this version of the [MOSES benchmark repository](https://github.com/samadejacobs/moses) and follow its installation instructions +2. Run inference using LBANN pre-trained model parameters + +```bash + + python3 MOSES_DIR/scripts/run.py --model char_rnn --n_samples NUM_SAMPLES \ + --lbann_weights_dir LBANN_WEIGHTS_DIR \ + --lbann_epoch_counts EPOCHS + +``` + +The command above will load pre-trained LBANN weights and biases from LBANN_WEIGHTS_DIR at the specified EPOCH counts, generate up to NUM_SAMPLES new molecules, and calculate metrics on the new molecules, some of which are computed relative to the test (validation) dataset. diff --git a/applications/ATOM/data/vocab_universal.txt b/applications/ATOM/data/vocab_universal.txt new file mode 100644 index 00000000000..6bca1c7fff5 --- /dev/null +++ b/applications/ATOM/data/vocab_universal.txt @@ -0,0 +1,98 @@ +a 0 +b 1 +c 2 +d 3 +e 4 +f 5 +g 6 +h 7 +i 8 +j 9 +k 10 +l 11 +m 12 +n 13 +o 14 +p 15 +q 16 +r 17 +s 18 +t 19 +u 20 +v 21 +w 22 +x 23 +y 24 +z 25 +A 26 +B 27 +C 28 +D 29 +E 30 +F 31 +G 32 +H 33 +I 34 +J 35 +K 36 +L 37 +M 38 +N 39 +O 40 +P 41 +Q 42 +R 43 +S 44 +T 45 +U 46 +V 47 +W 48 +X 49 +Y 50 +Z 51 +0 52 +1 53 +2 54 +3 55 +4 56 +5 57 +6 58 +7 59 +8 60 +9 61 +! 62 +" 63 +# 64 +$ 65 +% 66 +& 67 +' 68 +( 69 +) 70 +* 71 ++ 72 +, 73 +- 74 +. 75 +/ 76 +: 77 +; 78 +< 79 += 80 +> 81 +? 
82 +@ 83 +[ 84 +\ 85 +] 86 +^ 87 +_ 88 +` 89 +{ 90 +| 91 +} 92 +~ 93 + 94 + 95 + 96 + 97 diff --git a/applications/ATOM/dataset.py b/applications/ATOM/dataset.py new file mode 100644 index 00000000000..35bf0faa699 --- /dev/null +++ b/applications/ATOM/dataset.py @@ -0,0 +1,29 @@ +import os +import numpy as np +import json + + +# the idea here is to use the same code with abritrary sets of data +with open(os.environ['DATA_CONFIG'], 'rb') as handle: + config = json.load(handle) + +pad_index = config['pad_index'] +max_seq_len = config['max_seq_len'] + +samples = np.load(config['data_path'], allow_pickle=True) + +# Sample access functions +def get_sample(index): + sample = samples[index] + if len(sample) < max_seq_len: + sample = np.concatenate((sample, np.full(max_seq_len-len(sample), pad_index))) + else: + sample = np.resize(sample, max_seq_len) + return sample + +def num_samples(): + return samples.shape[0] + +def sample_dims(): + return [max_seq_len] + diff --git a/applications/ATOM/moses b/applications/ATOM/moses new file mode 160000 index 00000000000..28932ce6ff6 --- /dev/null +++ b/applications/ATOM/moses @@ -0,0 +1 @@ +Subproject commit 28932ce6ff6fb1883be888a48b431e17835be1c8 diff --git a/applications/ATOM/readme_smiles_data_reader.txt b/applications/ATOM/readme_smiles_data_reader.txt new file mode 100644 index 00000000000..fa669ca9e15 --- /dev/null +++ b/applications/ATOM/readme_smiles_data_reader.txt @@ -0,0 +1,46 @@ +# Example execution line for running with the smiles_data_reader + +setenv BASE /usr/workspace/wsb/hysom/corona/applications/ATOM + +run python3 train_atom_char_rnn_REV.py \ + --nodes=16 \ + --batch-size=1024 \ + --sequence-length=57 \ + --embedding-dim=30 \ + --num-embeddings=30 \ + --pad-index=28 \ + --vocab=/p/lustre2/brainusr/datasets/zinc/vocab_train.txt \ + --data-reader-prototext=$BASE/smiles_data_reader.prototext \ + |& tee out + +WARNING: at present, code assumes the input file is in csv format with + tab delimiters + +Optional arguments: + + --num-samples= # If not given, uses all samples in the file + +Notes: + --sequence-length, --vocab, --num-embeddings, and --embedding-dim should + match the data set; vocabs for various datasets are in /p/lustre2/brainusr/datasets/zinc, + /p/lustre2/brainusr/datasets/enamine, etc. + For now, assume num-embeddings = embedding-dim = vocab.size(), and + pad-index= vocab.size()-2 + + If --sequence-length is too short, portions of some samples will be discarded. + + The smiles_data_reader dtor prints any characters that were not + found in the vocabulary, and the number of characters (if any) that + were discarded (but note, statistics are only gathered for P_0) + +WARNING (when running with the Python data_reader): + ensure that "--sequence-length" matches "max_seq_len" in the + json file; this is not error-checked (as of this writing). + Also ensure "--pad-index" matches the entry in the json file; + also not error checked. 
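Because neither value is error-checked, a small pre-launch check along the following lines can catch mismatches before a long run is submitted. This is only a sketch: the helper name is hypothetical, and the key names (max_seq_len, pad_index) are taken from zinc_data_config.json in this directory.

```python
import json
import sys

def check_data_config(json_path, sequence_length, pad_index):
    """Fail early if CLI settings disagree with the data-config JSON."""
    with open(json_path) as f:
        cfg = json.load(f)
    mismatches = []
    if cfg.get("max_seq_len") != sequence_length:
        mismatches.append(f"--sequence-length={sequence_length} but max_seq_len={cfg.get('max_seq_len')}")
    if cfg.get("pad_index") != pad_index:
        mismatches.append(f"--pad-index={pad_index} but pad_index={cfg.get('pad_index')}")
    if mismatches:
        sys.exit("data config mismatch: " + "; ".join(mismatches))

# e.g. check_data_config("zinc_data_config.json", sequence_length=57, pad_index=28)
```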
+ +Verification: to test the above cmd line against output using the python data reader: +run python3 ./train_atom_char_rnn.py --nodes=16 --pad-index=28 --sequence-length=57 --embedding-dim=30 --num-embeddings=30 --batch-size=1024 |& tee out + + + diff --git a/applications/ATOM/requirements.txt b/applications/ATOM/requirements.txt new file mode 100644 index 00000000000..8c5fcd94d99 --- /dev/null +++ b/applications/ATOM/requirements.txt @@ -0,0 +1,4 @@ +numpy +protobuf +six +torch diff --git a/applications/ATOM/smiles_data_reader.prototext b/applications/ATOM/smiles_data_reader.prototext new file mode 100644 index 00000000000..d56e9e7e91c --- /dev/null +++ b/applications/ATOM/smiles_data_reader.prototext @@ -0,0 +1,10 @@ +data_reader { + reader { + name: "smiles" + role: "train" + shuffle: true + percent_of_data_to_use: 1.0 + data_filedir: "/p/lustre2/brainusr/datasets/zinc" + data_filename: "train.csv" + } +} diff --git a/applications/ATOM/train_atom_char_rnn.py b/applications/ATOM/train_atom_char_rnn.py new file mode 100644 index 00000000000..dc808ac1ff4 --- /dev/null +++ b/applications/ATOM/train_atom_char_rnn.py @@ -0,0 +1,305 @@ +import argparse +import datetime +import os +import os.path +import sys + +from google.protobuf import text_format as txtf +import json +import numpy as np +import torch + +import lbann +import lbann.contrib.launcher +import lbann.modules +from lbann.util import str_list + + +def construct_lc_launcher_args(): + + # defaults correspond to the settings needed for training on the moses dataset + parser = argparse.ArgumentParser(prog="lbann charVAE training") + parser.add_argument("--partition", default=None) + parser.add_argument("--account", default="hpcdl") + parser.add_argument("--scheduler", default="slurm") + parser.add_argument( + "--data-module-file", + default="dataset.py", + help="specifies the module that contains the logic for loading data", + ) + parser.add_argument( + "--data-config", + default=os.path.join( + os.path.abspath(os.path.dirname(__file__)), "zinc_data_config.json" + ), + help="path to a data config file that is used for the construction of python data reader", + ) + parser.add_argument( + "--time-limit", + type=int, + default=720, + help="specified time limit in number of minutes", + ) + parser.add_argument("--nodes", type=int, default=1) + parser.add_argument("--job-name", default="atom_char_rnn") + parser.add_argument("--embedding-dim", type=int, default=None) + parser.add_argument("--num-embeddings", type=int, default=None) + parser.add_argument("--batch-size", type=int, default=64) + parser.add_argument("--num-epochs", type=int, default=10) + parser.add_argument("--data-reader-prototext", default=None) + parser.add_argument("--pad-index", type=int, default=None) + parser.add_argument("--sequence-length", type=int, default=None) + parser.add_argument("--dump_weights_dir", type=str, default="weights") + parser.add_argument("--num-samples", type=int, default=None) + parser.add_argument("--num-io-threads", type=int, default=11) + parser.add_argument("--vocab", default=None) + parser.add_argument("--delimiter", default="c") + parser.add_argument("--no-header", type=bool, default=True) + + # these are specific to the Trainer object + parser.add_argument( + "--procs-per-trainer", + type=int, + default=0, + help="number of processes to use per trainer", + ) + + # these are the bits and pieces required for loading the model in the moses library...may be useful for evaluation tasks/continuing training/etc + parser.add_argument("--gamma", 
type=float, default=0.5, help="") + parser.add_argument( + "--hidden", type=int, default=768, help="size of the hidden layer" + ) + parser.add_argument( + "--lr", + type=float, + default=1e-3, + help="optimizer learning rate to use for training", + ) + parser.add_argument( + "--num-layers", type=int, default=1, help="number of LSTM layers" + ) + parser.add_argument( + "--step-size", type=int, default=10, help="learning rate decay step size" + ) + + # this is just for compatiblity with the moses code + parser.add_argument("--dropout", type=float, default=0.5, help="") + return parser.parse_args() + + +# ============================================== +# Setup and launch experiment +# ============================================== + + +def construct_model(run_args): + """Construct LBANN model. + + Initial model for ATOM molecular SMILES generation + Network architecture and training hyperparameters from + https://github.com/samadejacobs/moses/tree/master/moses/char_rnn + + """ + + pad_index = run_args.pad_index + assert pad_index is not None + + sequence_length = run_args.sequence_length + assert sequence_length is not None + + print("sequence length is {}".format(sequence_length)) + data_layout = "data_parallel" + + # Layer graph + _input = lbann.Input(name="inp_tensor", target_mode="N/A") + print(sequence_length) + x_slice = lbann.Slice( + _input, + axis=0, + slice_points=str_list(range(sequence_length + 1)), + name="inp_slice", + ) + + # embedding layer + emb = [] + embedding_dim = run_args.embedding_dim + num_embeddings = run_args.num_embeddings + assert embedding_dim is not None + assert num_embeddings is not None + + emb_weights = lbann.Weights( + initializer=lbann.NormalInitializer(mean=0, standard_deviation=1), + name="emb_matrix", + ) + + lstm1 = lbann.modules.GRU(size=run_args.hidden, data_layout=data_layout) + fc = lbann.modules.FullyConnectedModule( + size=num_embeddings, data_layout=data_layout + ) + + last_output = lbann.Constant( + value=0.0, + num_neurons="{}".format(run_args.hidden), + data_layout=data_layout, + name="lstm_init_output", + ) + + lstm1_prev_state = [last_output] + + loss = [] + idl = [] + for i in range(sequence_length): + idl.append(lbann.Identity(x_slice, name="slice_idl_" + str(i), device="CPU")) + + for i in range(sequence_length - 1): + + emb_l = lbann.Embedding( + idl[i], + name="emb_" + str(i), + weights=emb_weights, + embedding_dim=embedding_dim, + num_embeddings=num_embeddings, + ) + + x, lstm1_prev_state = lstm1(emb_l, lstm1_prev_state) + fc_l = fc(x) + y_soft = lbann.Softmax(fc_l, name="soft_" + str(i)) + gt = lbann.OneHot(idl[i + 1], size=num_embeddings) + ce = lbann.CrossEntropy([y_soft, gt], name="loss_" + str(i)) + # mask padding in input + pad_mask = lbann.NotEqual( + [idl[i], lbann.Constant(value=pad_index, num_neurons="1")], + ) + ce_mask = lbann.Multiply([pad_mask, ce], name="loss_mask_" + str(i)) + loss.append(lbann.LayerTerm(ce_mask, scale=1 / (sequence_length - 1))) + + layers = list(lbann.traverse_layer_graph(_input)) + # Setup objective function + weights = set() + for l in layers: + weights.update(l.weights) + obj = lbann.ObjectiveFunction(loss) + + callbacks = [ + lbann.CallbackPrint(), + lbann.CallbackTimer(), + lbann.CallbackStepLearningRate(step=run_args.step_size, amt=run_args.gamma), + lbann.CallbackDumpWeights(directory=run_args.dump_weights_dir, epoch_interval=1), + ] + + # Construct model + return lbann.Model( + run_args.num_epochs, + layers=layers, + weights=weights, + objective_function=obj, + callbacks=callbacks + ) + 
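# Note: the objective assembled above is a masked, length-normalized cross entropy.
# At each of the (sequence_length - 1) prediction steps, NotEqual(token, pad_index)
# produces a 0/1 mask that zeroes the cross-entropy term for padded positions, and
# the LayerTerm scale of 1/(sequence_length - 1) averages the remaining terms over
# time steps. The GRU and FullyConnectedModule objects are created once and called
# at every step, so their weights are shared across the unrolled sequence.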
#callbacks=callbacks + + +def construct_data_reader(run_args): + """ + Construct Protobuf message for Python data reader. + + The Python data reader will import this Python file to access the + sample access functions. + + """ + + module_file = os.path.abspath(run_args.data_module_file) + os.environ["DATA_CONFIG"] = os.path.abspath(run_args.data_config) + + module_name = os.path.splitext(os.path.basename(module_file))[0] + module_dir = os.path.dirname(module_file) + + print("module_name: {}\tmodule_dir: {}".format(module_name, module_dir)) + + # Base data reader message + message = lbann.reader_pb2.DataReader() + + # Training set data reader + data_reader = message.reader.add() + data_reader.name = "python" + data_reader.role = "train" + data_reader.shuffle = True + data_reader.percent_of_data_to_use = 1.0 + data_reader.python.module = module_name + data_reader.python.module_dir = module_dir + data_reader.python.sample_function = "get_sample" + data_reader.python.num_samples_function = "num_samples" + data_reader.python.sample_dims_function = "sample_dims" + + return message + + +def main(): + run_args = construct_lc_launcher_args() + + # add data_config data + # and do not overwrite args if data_reader_prototext is enabled + if os.path.isfile(run_args.data_config) and not run_args.data_reader_prototext: + with open(run_args.data_config, "r") as f: + config = json.load(f) + for k, v in config.items(): + setattr(run_args, k, v) + + trainer = lbann.Trainer( + run_args.batch_size, + name=None, + procs_per_trainer=run_args.procs_per_trainer, + ) + + # define data_reader + if run_args.data_reader_prototext: + print("Using data_reader_prototext") + assert run_args.sequence_length is not None + assert run_args.vocab is not None + + data_reader_proto = lbann.lbann_pb2.LbannPB() + with open(run_args.data_reader_prototext, "r") as f: + txtf.Merge(f.read(), data_reader_proto) + data_reader = data_reader_proto.data_reader + else: + data_reader = construct_data_reader(run_args) + + if "LBANN_EXPERIMENT_DIR" in os.environ: + work_dir = os.environ["LBANN_EXPERIMENT_DIR"] + else: + work_dir = os.path.join(os.getcwd()) + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + experiment_dir = os.path.join( + work_dir, "{}_{}".format(timestamp, run_args.job_name) + ) + if not os.path.exists(experiment_dir): + os.makedirs(experiment_dir) + + # model and optimizer + model = construct_model(run_args) + opt = lbann.Adam(learn_rate=run_args.lr, beta1=0.9, beta2=0.99, eps=1e-8) + + # dump the config to the experiment_dir so that it can be used to load the model in pytorch (moses codebase) + ppn = 4 if run_args.scheduler == "lsf" else 2 + print("args:\n" + str(run_args)) + torch.save(run_args, "{}/{}_config.pt".format(experiment_dir, run_args.job_name)) + status = lbann.contrib.launcher.run( + trainer, + model, + data_reader, + opt, + partition=run_args.partition, + scheduler=run_args.scheduler, + account=run_args.account, + time_limit=run_args.time_limit, + nodes=run_args.nodes, + procs_per_node=ppn, + job_name=run_args.job_name, + experiment_dir=experiment_dir, + lbann_args=f"--vocab={run_args.vocab} --num_samples={run_args.num_samples} --sequence_length={run_args.sequence_length} --num_io_threads={run_args.num_io_threads} --no_header={run_args.no_header} --delimiter={run_args.delimiter}", + ) + + print("LBANN launcher status:\n" + str(status)) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/applications/ATOM/utils/CMakeLists.txt b/applications/ATOM/utils/CMakeLists.txt new file mode 
100644 index 00000000000..7d76fb6d6f5 --- /dev/null +++ b/applications/ATOM/utils/CMakeLists.txt @@ -0,0 +1,25 @@ +# Add a target to control building all the utilities +add_custom_target(atom-utils) + +add_executable(compute_vocab + EXCLUDE_FROM_ALL compute_vocab.cpp) + target_link_libraries(compute_vocab lbann) + add_dependencies(atom-utils compute_vocab) + +# Install the binaries +install( + TARGETS compute_vocab + OPTIONAL + EXPORT LBANNTargets + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ) + +#(from Tom) +# The use of `OPTIONAL` here will trigger CMake warnings. These can +# safely be ignored and tests confirm that. See these for more info: +# +# https://gitlab.kitware.com/cmake/cmake/issues/18258 +# https://cmake.org/pipermail/cmake/2011-August/046014.html + diff --git a/applications/ATOM/utils/build_universal_vocab.py b/applications/ATOM/utils/build_universal_vocab.py new file mode 100644 index 00000000000..344dda974e4 --- /dev/null +++ b/applications/ATOM/utils/build_universal_vocab.py @@ -0,0 +1,26 @@ +# +# run with python 2.7 +# +import string + +a1 = string.letters +a2 = string.digits +a3 = string.punctuation +a4 = a1 + a2 + a3 + +out = open('vocab_universal.txt', 'w') +id = 0 +for c in a4 : + out.write(c + ' ' + str(id) + '\n') + id += 1 +out.write(' ' + str(id) + '\n') +id += 1 +out.write(' ' + str(id) + '\n') +id += 1 +out.write(' ' + str(id) + '\n') +id += 1 +out.write(' ' + str(id) + '\n') +id += 1 + +out.close() +print('\nwrote file: vocab_universal.txt\n') diff --git a/applications/ATOM/utils/compute_profile.py b/applications/ATOM/utils/compute_profile.py new file mode 100644 index 00000000000..2d54c0d597d --- /dev/null +++ b/applications/ATOM/utils/compute_profile.py @@ -0,0 +1,50 @@ +import sys + +if len(sys.argv) != 3 : + print('usage:') + print(' ' + sys.argv[0] + ' input_fn output_fn') + print('function:') + print(' writes data for plotting num_sequences as a function') + print(' of sequence length to "output_fn"; prints length') + print(' of longest sequence to cout (add two for , )') + print('delimiter:') + print(' is hard-coded for comma\n') + exit(9) + +a = open(sys.argv[1]) +a.readline() #discard header +out = open(sys.argv[2], 'w') + +longest = 0 +longest_seq = '' +longest_line_num = 0 + +data = {} +j = 0 +for line in a : + j += 1 + if j % 1000 == 0 : print(str(j/1000) + 'K lines processed') + t = line.split(',') + x = len(t[0]) + if x not in data : + data[x] = 0 + data[x] += 1 + if x > longest : + longest = x + longest_seq = t[0] + longest_line_num = j-1 + +v = [] +for ell in data : + v.append( (ell, data[ell]) ) +v.sort() + + +for d in v : + out.write(str(d[0]) + ' ' + str(d[1]) + '\n') +print('\noutput written to: ', sys.argv[2] + '\n') +out.close() + +print('\nlongest sequence length: ' + str(longest)) +print('line number of longest: ' + str(longest_line_num)) +print('longest sequence length: ' + longest_seq) diff --git a/applications/ATOM/utils/compute_sample_lengths.py b/applications/ATOM/utils/compute_sample_lengths.py new file mode 100644 index 00000000000..838779d50a9 --- /dev/null +++ b/applications/ATOM/utils/compute_sample_lengths.py @@ -0,0 +1,43 @@ +import sys + +if len(sys.argv) != 3 : + print(F''' + usage: {sys.argv[0]} input_fn output_fn + function: computes the length of each SMILES string + output: each line of output contains a file name, followed by + the length of each string + where: + "input_fn" contains the names of one or more smiles files; + 
Assumes each file contains a single header line; + Assumes delimiter is either tab or comma + ''' + ) + exit(9) + +a = open(sys.argv[1]) +out = open(sys.argv[2], 'w') + +sample_id = -1 +num_files = -1 +for line in a : + out.write(line[:-1]) + print('opening: ' + line[:-1]) + b = open(line[:-1]) + num_files += 1 + b.readline() #discard header + for line in b : + sample_id += 1 + j = line.find(',') + if j == -1 : + j == line.find('\t') + if j == -1 : + print(f"failed to find delimiting character (comma or tab) on line # {sample_id} of file: {line[:-1]}") + exit(9) + out.write(' ' + str(len( line[:j] ))) + out.write('\n') + b.close() + if num_files == 3 : break + +a.close() +out.close() +print(F'\noutput has been written to: {sys.argv[2]}\n') diff --git a/applications/ATOM/utils/compute_vocab.cpp b/applications/ATOM/utils/compute_vocab.cpp new file mode 100644 index 00000000000..4c3675dc4ad --- /dev/null +++ b/applications/ATOM/utils/compute_vocab.cpp @@ -0,0 +1,109 @@ +#include +#include +#include +#include +#include "lbann/utils/options.hpp" +#include "lbann/utils/exception.hpp" +#include "lbann/utils/timer.hpp" +#include "lbann/utils/commify.hpp" + +using namespace lbann; + +int main(int argc, char **argv) { + lbann::world_comm_ptr comm = lbann::initialize(argc, argv); + int np = comm->get_procs_in_world(); + + std::cerr << "STARTED!\n"; + + try { + + if (np != 1) { + LBANN_ERROR("please run with a single processor"); + } + if (argc < 3) { + std::cerr + << "usage: " << argv[0] + << " --input_fn= --output_fn= --delimiter=\n" + << "where: input_fn is csv file containing SMILES strings;\n" + << " --delimiter is c (comma), t (tab) or 0 (none)\n" + << "function: computes vocabulary\n"; + exit(9); + } + + options *opts = options::get(); + opts->init(argc, argv); + double tm1 = get_time(); + + const std::string input_fn = opts->get_string("input_fn"); + std::ifstream in(input_fn.c_str()); + if (!in) { + LBANN_ERROR("failed to open ", input_fn , " for reading"); + } + + const std::string output_fn = opts->get_string("output_fn"); + std::ofstream out(output_fn.c_str(), std::ios::binary); + if (!out) { + LBANN_ERROR("failed to open ", output_fn, " for writing"); + } + + const std::string w = opts->get_string("delimiter"); + const char ww = w[0]; + char d = 0; + switch (ww) { + case 'c' : + d = ','; + break; + case 't' : + d = '\t'; + break; + case '0' : + d = '\0'; + break; + default : + LBANN_ERROR("Invalid delimiter character; should be 'c', 't', '0'; you passed: ", ww); + } + + std::set s; + + std::string line; + getline(in, line); //discard header + size_t j = 1; + while (!in.eof()) { + ++j; + if (j % 1000 == 0) std::cout << j/1000 << "K lines processed" << std::endl; + getline(in, line); + if (line.size() < 5) continue; + size_t h = line.find(d); + if (h == std::string::npos) { + LBANN_ERROR("failed to find delimiter: ", d, " on line ", j); + } + const std::string smiles = line.substr(0, h); + for (const auto &t : smiles) { + s.insert(t); + } + } + + int idx = 0; + for (const auto &t : s) { + out << t << " " << idx++ << std::endl; + } + out << " " << idx++ << std::endl; + out << " " << idx++ << std::endl; + out << " " << idx++ << std::endl; + out << " " << idx++ << std::endl; + + in.close(); + out.close(); + + std::cout << "\nprocessing time: " << get_time() - tm1 << std::endl; + + } catch (lbann::exception& e) { + El::ReportException(e); + return EXIT_FAILURE; + } catch (std::exception& e) { + El::ReportException(e); + return EXIT_FAILURE; + } + + return EXIT_SUCCESS; +} diff --git 
a/applications/ATOM/utils/compute_vocab_frequencies.py b/applications/ATOM/utils/compute_vocab_frequencies.py new file mode 100644 index 00000000000..a0d04590bd9 --- /dev/null +++ b/applications/ATOM/utils/compute_vocab_frequencies.py @@ -0,0 +1,45 @@ +import sys + +if len(sys.argv) != 3 : + print(F''' + usage: {sys.argv[0]} input_filename output_filename + where: + "input_filename" is a SMILES csv filename + function: + computes the frequency of each character in the vocabulary, + and prints same to "output_filename" + ''' + ) + exit(9) + +out = open(sys.argv[2], 'w') + +a = open(sys.argv[1]) +a.readline() # discard header +h = {} +j = 1 +for line in a : + k1 = line.find(',') + if k1 == -1 : + k1 = line.find('\t') + if k1 == -1 : + print('failed to find comma or tab delimiter on line # ' + str(j)) + exit(9) + s = line[:k1] + for c in s : + if c not in h : + h[c] = 0 + h[c] += 1 + j += 1 + if j % 1000 == 0 : print(str(j/1000) + 'K samples processed') + +v = [] +for c in h.keys() : + v.append( (h[c], c) ) +v.sort() + +for x in v : + print(x) + out.write(str(x[0]) + ' ' + str(x[1]) + '\n') +out.close() +print('\n\nOutput has also been written to: ' + sys.argv[2] + '\n') diff --git a/applications/ATOM/zinc_data_config.json b/applications/ATOM/zinc_data_config.json new file mode 100644 index 00000000000..a583ce84eea --- /dev/null +++ b/applications/ATOM/zinc_data_config.json @@ -0,0 +1,10 @@ +{ + + "pad_index": 28, + "sequence_length": 57, + "max_seq_len": 57, + "data_path": "/p/lustre2/brainusr/datasets/zinc/moses_zinc_train250K.npy", + "embedding_dim": 30, + "num_embeddings": 30 + +} diff --git a/applications/CANDLE/pilot2/data/Min_Max_Mean_Std-Dev.txt b/applications/CANDLE/pilot2/data/Min_Max_Mean_Std-Dev.txt new file mode 100644 index 00000000000..951825fe7a7 --- /dev/null +++ b/applications/CANDLE/pilot2/data/Min_Max_Mean_Std-Dev.txt @@ -0,0 +1,5 @@ +max x/y/z: 9.6067 7.46754 3.14485 +min x/y/z: -3.53206 -5.68766 -7.9664 +mean x/y/z: 1.66227 -0.00509318 -2.23788 +std dev: 1.17969 0.773128 1.04863 + diff --git a/applications/CANDLE/pilot2/data/pilot2_normalization.txt b/applications/CANDLE/pilot2/data/pilot2_normalization.txt new file mode 100644 index 00000000000..635bb276d2b --- /dev/null +++ b/applications/CANDLE/pilot2/data/pilot2_normalization.txt @@ -0,0 +1,15 @@ +max min mean std_dev: +2.40001 0 0.36777 0.248641 +1.98564 0 0.186917 0.158741 +1.95085 0 0.138673 0.127767 +2.53137 3.24127e-08 0.447338 0.240423 +2.48228 0 0.273659 0.220102 +2.3157 0 0.408046 0.198704 +2.06274 0 0.0375413 0.0741625 +2.50623 3.25748e-05 0.663633 0.218187 +2.6136 6.98301e-06 0.712461 0.265916 +2.41215 0 0.2882 0.213314 +2.80934 0 0.0411697 0.0691358 +2.4133 0 0.151854 0.150553 +2.69434 0 0.724106 0.293084 +2.71227 0.00951633 0.893146 0.222718 diff --git a/applications/CANDLE/pilot2/tools/CMakeLists.txt b/applications/CANDLE/pilot2/tools/CMakeLists.txt new file mode 100644 index 00000000000..e95927d0252 --- /dev/null +++ b/applications/CANDLE/pilot2/tools/CMakeLists.txt @@ -0,0 +1,50 @@ +# Add a target to control building all the utilities +add_custom_target(pilot2-utils) + +add_executable(test_ras_lipid_data_files_for_errors + EXCLUDE_FROM_ALL test_ras_lipid_data_files_for_errors.cpp) + target_link_libraries(test_ras_lipid_data_files_for_errors lbann) + add_dependencies(pilot2-utils test_ras_lipid_data_files_for_errors) + +add_executable(compute_ras_lipid_sig1_normalization + EXCLUDE_FROM_ALL compute_ras_lipid_sig1_normalization.cpp) + target_link_libraries(compute_ras_lipid_sig1_normalization lbann) + 
add_dependencies(pilot2-utils compute_ras_lipid_sig1_normalization) + +add_executable(compute_ras_lipid_bbs_euclid_distances + EXCLUDE_FROM_ALL compute_ras_lipid_bbs_euclid_distances.cpp) + target_link_libraries(compute_ras_lipid_bbs_euclid_distances lbann) + add_dependencies(pilot2-utils compute_ras_lipid_bbs_euclid_distances) + +add_executable(compute_ras_lipid_bbs_max_min + EXCLUDE_FROM_ALL compute_ras_lipid_bbs_max_min.cpp) + target_link_libraries(compute_ras_lipid_bbs_max_min lbann) + add_dependencies(pilot2-utils compute_ras_lipid_bbs_max_min) + +add_executable(compute_ras_lipid_bbs_euclid_normalization + EXCLUDE_FROM_ALL compute_ras_lipid_bbs_euclid_normalization.cpp) + target_link_libraries(compute_ras_lipid_bbs_euclid_normalization lbann) + add_dependencies(pilot2-utils compute_ras_lipid_bbs_euclid_normalization) + +# Install the binaries +install( TARGETS + test_ras_lipid_data_files_for_errors + compute_ras_lipid_sig1_normalization + compute_ras_lipid_bbs_euclid_distances + compute_ras_lipid_bbs_euclid_normalization + + OPTIONAL + EXPORT LBANNTargets + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} + ) + +#(from Tom) +# The use of `OPTIONAL` here will trigger CMake warnings. These can +# safely be ignored and tests confirm that. See these for more info: +# +# https://gitlab.kitware.com/cmake/cmake/issues/18258 +# https://cmake.org/pipermail/cmake/2011-August/046014.html + diff --git a/applications/CANDLE/pilot2/tools/Notes.txt b/applications/CANDLE/pilot2/tools/Notes.txt new file mode 100644 index 00000000000..c3a27db71d3 --- /dev/null +++ b/applications/CANDLE/pilot2/tools/Notes.txt @@ -0,0 +1,39 @@ +These notes are for use during development +========================================================================= +(mail 1/21) +Good question. It's a good idea to scale these values, especially since we are mixing them into the input with lipid densities. Can you look at doing the same three options here? + +a) raw values w/o scaling +b) min-max +c) mean/stddev (z-scale) + +(mail from 1/18/2020) +Hi Dave, +We currently have the models reading in the lipid densities (13x13x14) cube. We next want to also include information about the RAS BB positions. Each sample (frame) has a string of 184 RAS protein backbone beads, and we have the (x,y,z) coordinates of each one. You can see this as the 184x3 portion below: + +In [1]: import numpy as np + +In [2]: dat = np.load('/p/gpfs1/moody20/pilot2/lipid_density/sr4/pfpatch_000000917813_sig1.npz') + +In [3]: dat.keys() +Out[3]: ['rots', 'states', 'tilts', 'density_sig1', 'frames', 'bbs', 'probs'] + +In [4]: dat['bbs'].shape +Out[4]: (586, 184, 3) + +That particular patch file has 586 frames, and each frame has the xyz-coords for 184 backbone beads. + +There are lots of ways we can represent this data. To get started, we are planning to use a matrix giving the distance between each pair of the RAS BB beads. That gives a 184x184 matrix, where each row is the Euclidean distance from a particular bead to every bead in the chain (including itself). + +In addition to that, we'd like to tack on a last column to this matrix that encodes the z-value for each bead. We may want to normalize that by using the relative distance to the z-position of the first bead in the chain (or maybe the last bead, I forget). 
That particular bead, which ever one it is, is considered to be attached to the cell membrane, so it serves as a good baseline. + +Can you please help us extend the LBANN data reader to support this additional input? + +There is one extra complication in that some of these patch files have more than one RAS. I think to get started, we only want to consider those patches with a single RAS. + +That might be hard to express well in email, so give me a call if you'd like to talk through the details. +========= +For "z-value", I just mean the value of the z-coordinate of each bead. It turns out the cell membrane lies in the xy-plane so that the z-coordinate encodes the distance each bead is from the plane, once we subtract off the value of z-coordinate for the anchor bead that is attached the membrane. + +There is an sr4_counts.npz file that list the count of RAS in each patch. We can use that to filter out patches with more than one RAS. To get started, it might be ok to ignore that fact. I think I have only captured a single RAS backbone in each patch, rather than a 184-chain for each one. At some point when we care about patches with more than one RAS, I'll likely have to regenerate the dataset to make that clean. + diff --git a/applications/CANDLE/pilot2/tools/README.txt b/applications/CANDLE/pilot2/tools/README.txt new file mode 100644 index 00000000000..9dea7ae2278 --- /dev/null +++ b/applications/CANDLE/pilot2/tools/README.txt @@ -0,0 +1,7 @@ +The tools in this directory are embarrassingly +parallel. They don't use GPUs, so you are advised to compile +lbann without CUDA, in order to use all avalailable CPUs +on your nodes. + +Typical invocation on lassen: + $ jsrun -n 8 -a 40 -d packed -b "packed:10" -r 1 -c 40 diff --git a/applications/CANDLE/pilot2/tools/README_bbs_binary_file_format.txt b/applications/CANDLE/pilot2/tools/README_bbs_binary_file_format.txt new file mode 100644 index 00000000000..c7a3a98b105 --- /dev/null +++ b/applications/CANDLE/pilot2/tools/README_bbs_binary_file_format.txt @@ -0,0 +1,17 @@ +Format for binary files; all values are floats + +num_frames #aka, num_samples +num_beads #for now, will always be 184 + +#Repeating, for each frame: + + z-coordinates for each bead #184 entries + + #Repeating, for each beads in the current frame: + euclidean distance between beads j and k, + j=0..num_beads-1, k=j+1..num_beads (16836 entries per frame) + +Notes: + Optional normalization of euclidean distances and/or Z-coordinates + will be computed during the data_reader load method + diff --git a/applications/CANDLE/pilot2/tools/common.hpp b/applications/CANDLE/pilot2/tools/common.hpp new file mode 100644 index 00000000000..2732d238091 --- /dev/null +++ b/applications/CANDLE/pilot2/tools/common.hpp @@ -0,0 +1,112 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef __PILOT2_TOOLS_COMMON_HPP_ +#define __PILOT2_TOOLS_COMMON_HPP_ + + +namespace lbann { + +const int Num_beads = 184; +const int Dims = 3; +const int Word_size = 4; + +const int Num_dist = 16836; + // 16836 is number of euclid distances + // for j in range(0, 183): + // for k in range(j+1, 184): + // t += 1 + +//======================================================================= +struct xyz { + xyz() {} + xyz(float xx, float yy, float zz) : x(xx), y(yy), z(zz) { } + + float x; + float y; + float z; + + float dist(const xyz &p) { + return sqrt( + (pow( (x-p.x), 2) + + pow( (x-p.x), 2) + + pow( (x-p.x), 2)) + ); + } + friend std::ostream& operator<<(std::ostream& os, const xyz& p); +}; + +std::ostream& operator<<(std::ostream& os, const xyz &p) { + os << p.x << "," << p.y << "," << p.z << " "; + return os; +} + +//======================================================================= + +//void testme(); + +bool sanity_check_npz_file(std::map &a, const std::string filename) { + const std::vector shape = a["bbs"].shape; + const float num_samples = static_cast(shape[0]); + const int word_size = static_cast(a["bbs"].word_size); + bool is_good = true; + if (shape[1] != Num_beads || shape[2] != Dims || word_size != Word_size) { + is_good = false; + std::stringstream s3; + for (auto t : shape) { s3 << t << " "; } + LBANN_WARNING("Bad file: ", filename, " word_size: ", word_size, " dinum_samples: ", num_samples, " shape: ", s3.str()); + } + return is_good; +} + +void read_sample( + int id, + std::vector &data, + std::vector &z_coordinates, + std::vector &distances) { + + size_t offset = 2 /* n_frames, n_beads */ + id * (Num_beads + Num_dist); + z_coordinates.resize(Num_beads); + for (size_t j=offset; j < offset + Num_beads; j++) { + z_coordinates[j-offset] = data[j]; + } + offset += Num_beads; + for (size_t j = offset; j < offset + Num_dist; j++) { + if (j >= data.size()) { + LBANN_ERROR("j >= data.size(); j: ",j, " datalsize: ", data.size(), " offset: ", offset, " Num_beads: ",Num_beads, " Num_dist: ", Num_dist); + } + if (j-offset >= distances.size()) { + LBANN_ERROR("j-offset >= data.size(); j-offset: ", j-offset, " data.size: ", data.size()); + } + distances[j-offset] = data[j]; + } +} + + +} //namespace lbann + +#endif // __PILOT2_TOOLS_COMMON_HPP_ diff --git a/applications/CANDLE/pilot2/tools/compute_ras_lipid_bbs_euclid_distances.cpp b/applications/CANDLE/pilot2/tools/compute_ras_lipid_bbs_euclid_distances.cpp new file mode 100644 index 00000000000..6c3fe6ae9d6 --- /dev/null +++ b/applications/CANDLE/pilot2/tools/compute_ras_lipid_bbs_euclid_distances.cpp @@ -0,0 +1,140 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. 
+// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/comm.hpp" +#include "lbann/utils/options.hpp" +#include "lbann/utils/exception.hpp" +#include "lbann/utils/jag_utils.hpp" +#include "lbann/utils/commify.hpp" +#include +#include +#include +#include "common.hpp" + +using namespace lbann; + +int main(int argc, char *argv[]) { + world_comm_ptr comm = initialize(argc, argv); + bool master = comm->am_world_master(); + + try { + options *opts = options::get(); + opts->init(argc, argv); + + if (! opts->has_string("filelist")) { + LBANN_ERROR("usage: ", argv[0], " --filelist="); + } + std::string input_fn = opts->get_string("filelist"); + + int rank = comm->get_rank_in_world(); + int np = comm->get_procs_in_world(); + + // get list of input filenames + std::vector filenames; + read_filelist(comm.get(), input_fn, filenames); + + size_t nn = 0; // only used for user feedback + std::vector beads(Num_beads); + for (size_t j=rank; j a = cnpy::npz_load(filenames[j]); + bool is_good = sanity_check_npz_file(a, filenames[j]); + + // Open output file + std::string fn = filenames[j] + ".bbs_stats"; + if (!is_good) { + fn += ".bad"; + } + std::ofstream out(fn.c_str(), std::ios::binary); + if (!out) { + LBANN_ERROR("failed to open ", fn, "for writing"); + } + + if (is_good) { + const std::vector shape = a["bbs"].shape; + const float num_frames = static_cast(shape[0]); + + // output number of frames and beads + out.write((char*)&num_frames, sizeof(float)); + float nbeads = static_cast(Num_beads); + out.write((char*)&nbeads, sizeof(float)); + + // Get the bbs data array + const float *bd = a["bbs"].data(); + + // Loop over the samples (frames) + for (int k=0; k= i + for (int i=0; i +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/comm.hpp" +#include "lbann/utils/options.hpp" +#include "lbann/utils/exception.hpp" +#include "lbann/utils/jag_utils.hpp" +#include "lbann/utils/commify.hpp" +#include +#include +#include +#include "common.hpp" + +using namespace lbann; + +void read_file(const std::string &filename, std::vector &data); + +int main(int argc, char *argv[]) { + world_comm_ptr comm = initialize(argc, argv); + bool master = comm->am_world_master(); + + + try { + options *opts = options::get(); + opts->init(argc, argv); + + if (! opts->has_string("filelist")) { + LBANN_ERROR("usage: ", argv[0], " --filelist="); + } + + std::string input_fn = opts->get_string("filelist"); + + int rank = comm->get_rank_in_world(); + int np = comm->get_procs_in_world(); + + // get list of input filenames + std::vector filenames; + read_filelist(comm.get(), input_fn, filenames); + + std::vector data; + std::vector z_coords(Num_beads); + std::vector distances(Num_dist); + + double max = FLT_MIN; + double min = FLT_MAX; + double total= 0; //for computing mean + double n_samples = 0; //for computing mean + + size_t nn = 0; + for (size_t j=rank; j(*w++); + int n_beads = static_cast(*w++); + if (n_beads != Num_beads) { + LBANN_ERROR("n_beads != Num_beads; n_beads: ", n_beads, " Num_beads: ", Num_beads); + } + for (int h=0; h max) { max = dist_h_to_i; } + total += dist_h_to_i; + ++n_samples; + offset++; + } + } + } + + // User feedback + ++nn; + if (!rank) { + std::cerr << "approx " << (nn*np) << " files of " + << filenames.size() << " processed\n"; + } + } + + //================================================================== + + // Collect and report global min/max/mean/std-dev values + // (using MPI native calls because having separate calls for root/non-root + // processes is just annoying. 
We also have well over a dozen reduce + // methods, and I can never remember which to use) + // + double max_all; + double min_all; + double total_all; + double n_samples_all; + + // only master needs to know min and max + MPI_Reduce(&max, &max_all, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + MPI_Reduce(&min, &min_all, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD); + // all ranks need to know totals and num_samples, in order to compute + // std deviation + MPI_Allreduce(&total, &total_all, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + MPI_Allreduce(&n_samples, &n_samples_all, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + double mean = (total_all / n_samples_all); + + // compute standard deviation + double v_minus_mean_squared = 0.; + nn = 0; + for (size_t j=rank; j(*w++); + int n_beads = static_cast(*w++); + if (n_beads != Num_beads) { + LBANN_ERROR("n_beads != Num_beads"); + } + + for (int h=0; h &data) { + std::ifstream in(filename, std::ios::binary); + if (!in) { + LBANN_ERROR("failed to open ", filename, " for reading"); + } + in.seekg(0, in.end); + size_t n = in.tellg(); + in.seekg(0, in.beg); + data.resize(n); + char *work = reinterpret_cast(data.data()); + in.read(work, n); + if (static_cast(in.gcount()) != n) { + LBANN_ERROR("in.gcount() != n (gcount: ", in.gcount(), "; n: ", n, ") for file: ", filename); + } +} diff --git a/applications/CANDLE/pilot2/tools/compute_ras_lipid_bbs_max_min.cpp b/applications/CANDLE/pilot2/tools/compute_ras_lipid_bbs_max_min.cpp new file mode 100644 index 00000000000..04d5a06918a --- /dev/null +++ b/applications/CANDLE/pilot2/tools/compute_ras_lipid_bbs_max_min.cpp @@ -0,0 +1,187 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/comm.hpp" +#include "lbann/utils/options.hpp" +#include "lbann/utils/exception.hpp" +#include "lbann/utils/jag_utils.hpp" +#include "lbann/utils/commify.hpp" +#include +#include +#include + +using namespace lbann; + +int main(int argc, char *argv[]) { + world_comm_ptr comm = initialize(argc, argv); + bool master = comm->am_world_master(); + + try { + options *opts = options::get(); + opts->init(argc, argv); + + if (! 
opts->has_string("filelist")) { + LBANN_ERROR("usage: ", argv[0], " --filelist="); + } + + std::string input_fn = opts->get_string("filelist"); + + int rank = comm->get_rank_in_world(); + int np = comm->get_procs_in_world(); + + // get list of input filenames + std::vector filenames; + read_filelist(comm.get(), input_fn, filenames); + + size_t nn = 0; // only for user feedback + std::vector max(3, FLT_MIN); + std::vector min(3, FLT_MAX); + std::vector total(3, 0.); //for computing mean + size_t count = 0; //for compputing mean + for (size_t j=rank; j a = cnpy::npz_load(filenames[j]); + const std::vector shape = a["bbs"].shape; + const size_t num_frames = shape[0]; + const size_t word_size = a["bbs"].word_size; + bool is_good = true; + if (shape[1] != 184 || shape[2] != 3 || word_size != 4) { + is_good = false; + std::stringstream s3; + for (auto t : shape) { s3 << t << " "; } + LBANN_WARNING("Bad file: ", filenames[j], " word_size: ", word_size, " dinum_frames: ", num_frames, " shape: ", s3.str()); + } + + if (is_good) { + + // Get the bbs data array + const float *data = a["bbs"].data(); + + // Loop over the bbs entries + for (size_t k=0; k max[0]) max[0] = xx; + if (yy < min[1]) min[1] = yy; + if (yy > max[1]) max[1] = yy; + if (zz < min[2]) min[2] = zz; + if (zz > max[2]) max[2] = zz; + total[0] += xx; + total[1] += yy; + total[2] += zz; + data += 3; + ++count; + } + + ++nn; + if (!rank) { + std::cout << "approx " << utils::commify(nn*np) << " files of " + << utils::commify(filenames.size()) << " processed\n"; + } + } + } // END: for (size_t j=rank; j max_all(3); + std::vector min_all(3); + std::vector mean(3); + size_t count_all; + + // only master needs to know min and max + MPI_Reduce(max.data(), max_all.data(), 3, MPI_FLOAT, MPI_MAX, 0, MPI_COMM_WORLD); + MPI_Reduce(min.data(), min_all.data(), 3, MPI_FLOAT, MPI_MIN, 0, MPI_COMM_WORLD); + // all ranks need to know totals and num_samples, in order to compute + // std deviation + MPI_Allreduce(total.data(), mean.data(), 3, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + MPI_Allreduce(&count, &count_all, 3, MPI_LONG, MPI_SUM, MPI_COMM_WORLD); + + for (size_t i=0; i<3; i++) { + mean[i] /= count_all; + } + + // compute standard deviation + std::vector v_minus_mean_squared(3, 0); + for (size_t j=rank; j a = cnpy::npz_load(filenames[j]); + const std::vector shape = a["bbs"].shape; + const size_t word_size = a["bbs"].word_size; + const size_t num_samples = shape[0]; + bool is_good = true; + if (shape[1] != 184) { is_good = false; } + if (shape[2] != 3) { is_good = false; } + if (word_size != 4) { is_good = false; } + if (is_good) { + const float *data = a["bbs"].data(); + for (size_t k=0; k all_minus_mean_squared(3, 0.); + std::vector std_dev(3, 0.); + MPI_Reduce(v_minus_mean_squared.data(), all_minus_mean_squared.data(), 3, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + if (!rank) { + for (size_t i=0; i<3; i++) { + double v3 = all_minus_mean_squared[i] / count_all; + std_dev[i] = sqrt(v3); + } + + std::cout << "\nmax x/y/z: "; + for (auto t : max_all) std::cout << t << " "; + std::cout << std::endl; + std::cout << "min x/y/z: "; + for (auto t : min_all) std::cout << t << " "; + std::cout << std::endl; + std::cout << "mean x/y/z: "; + for (auto t : mean) std::cout << t << " "; + std::cout << std::endl; + std::cout << "std dev: "; + for (auto t : std_dev) std::cout << t << " "; + std::cout << std::endl; + } + + } catch (std::exception const &e) { + if (master) std::cerr << "caught exception: " << e.what() << "\n"; + return EXIT_FAILURE; + } catch (...) 
{ + std::cerr << "unknown exception in main\n"; + return EXIT_FAILURE; + } + return EXIT_SUCCESS; +} diff --git a/applications/CANDLE/pilot2/tools/compute_ras_lipid_sig1_normalization.cpp b/applications/CANDLE/pilot2/tools/compute_ras_lipid_sig1_normalization.cpp new file mode 100644 index 00000000000..343c9a9ce31 --- /dev/null +++ b/applications/CANDLE/pilot2/tools/compute_ras_lipid_sig1_normalization.cpp @@ -0,0 +1,219 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +//////////////////////////////////////////////////////////////////////////////// + +#include +#include "lbann/comm.hpp" +#include "lbann/utils/options.hpp" +#include "lbann/utils/exception.hpp" +#include "lbann/utils/jag_utils.hpp" +#include "lbann/utils/commify.hpp" +#include +#include + +using namespace lbann; + +int main(int argc, char *argv[]) { + world_comm_ptr comm = initialize(argc, argv); + bool master = comm->am_world_master(); + + try { + // Initialize options db (this parses the command line) + options *opts = options::get(); + opts->init(argc, argv); + + if (argc == 1) { + if (master) { + std::cerr << "usage: " << argv[0] << " --filelist= --output_fn=" << std::endl; + } + return EXIT_FAILURE; + } + + if (! 
(opts->has_string("filelist") && opts->has_string("output_fn"))) { + throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: improper invocation; run with no cmd line args for proper invocation"); + } + + const std::string input_fn = opts->get_string("filelist"); + const std::string output_fn = opts->get_string("output_fn"); + + //sanity check that we can write to the output file + if (master) { + std::ofstream out(output_fn.c_str()); + if (!out) { + LBANN_ERROR("failed to open ", output_fn, " for writing"); + } + out.close(); + } + + int rank = comm->get_rank_in_world(); + int np = comm->get_procs_in_world(); + + // get list of input filenames + std::vector filenames; + read_filelist(comm.get(), input_fn, filenames); + + size_t total_elts_per_channel = 0; + std::vector v_max(14, 0.); + std::vector v_min(14, std::numeric_limits::max()); + std::vector v_mean(14, 0); + for (size_t j=rank; j a = cnpy::npz_load(filenames[j]); + size_t n_elts = a["density_sig1"].num_vals; + double *data = reinterpret_cast(a["density_sig1"].data_holder->data()); + + int s = 0; + for (size_t i=0; i v_max[s]) v_max[s] = vv; + if (vv < v_min[s]) v_min[s] = vv; + ++s; + if (s == 14) { + s = 0; + } + } + if (master) { + std::cerr << "approx " << utils::commify(total_elts_per_channel*np) << " samples processed" << std::endl; + } + } + // ==================== finished processing all files ======================== + + std::vector f_max(14, 0.); + std::vector f_min(14, 0.); + std::vector f_mean(14, 0.); + + comm->trainer_allreduce(v_max.data(), v_max.size(), f_max.data(), El::mpi::MAX); + comm->trainer_allreduce(v_min.data(), v_min.size(), f_min.data(), El::mpi::MIN); + comm->trainer_allreduce(v_mean.data(), v_mean.size(), f_mean.data(), El::mpi::SUM); + size_t n3 = comm->trainer_allreduce(total_elts_per_channel); + for (size_t j=0; j v_minus_mean_squared(14, 0.); + std::vector stdev(14, 0.); + for (size_t j=rank; j a = cnpy::npz_load(filenames[j]); + size_t n_elts = a["density_sig1"].num_vals; + double *data = reinterpret_cast(a["density_sig1"].data_holder->data()); + + int s = 0; + for (size_t i=0; i f_minus_mean_squared(14, 0.); + std::vector f_std_dev(14, 0.); + comm->trainer_allreduce(v_minus_mean_squared.data(), v_minus_mean_squared.size(), f_minus_mean_squared.data(), El::mpi::SUM); + if (master) std::cout << "n3: " << n3 << std::endl; + for (size_t j=0; j +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
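// The pilot2 statistics tools above all follow the same two-pass pattern for
// distributed mean / standard deviation. A minimal sketch with the MPI calls
// used above ("values" stands in for the per-rank stream of samples):
//
//   double total = 0., count = 0.;
//   for (double v : values) { total += v; ++count; }            // pass 1
//   double total_all = 0., count_all = 0.;
//   MPI_Allreduce(&total, &total_all, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
//   MPI_Allreduce(&count, &count_all, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
//   const double mean = total_all / count_all;
//   double sq = 0.;
//   for (double v : values) { sq += (v - mean) * (v - mean); }  // pass 2
//   double sq_all = 0.;
//   MPI_Reduce(&sq, &sq_all, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
//   const double std_dev = std::sqrt(sq_all / count_all);       // rank 0 only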
+// +//////////////////////////////////////////////////////////////////////////////// + +#include +#include "lbann/comm.hpp" +#include "lbann/utils/options.hpp" +#include "lbann/utils/exception.hpp" +#include "lbann/utils/jag_utils.hpp" +#include "lbann/utils/commify.hpp" +#include +#include + +using namespace lbann; + +int main(int argc, char *argv[]) { + world_comm_ptr comm = initialize(argc, argv); + bool master = comm->am_world_master(); + + try { + // Initialize options db (this parses the command line) + options *opts = options::get(); + opts->init(argc, argv); + + if (argc == 1) { + if (master) { + std::cerr << "usage: " << argv[0] << " --filelist=" << std::endl; + } + return EXIT_FAILURE; + } + + if (! (opts->has_string("filelist"))) { + throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: improper invocation; run with no cmd line args for proper invocation"); + } + + const std::string input_fn = opts->get_string("filelist"); + + int rank = comm->get_rank_in_world(); + int np = comm->get_procs_in_world(); + + char b[1024]; + sprintf(b, "debug.%d", rank); + std::ofstream out(b); + if (! out) { + LBANN_ERROR("failed to open ", b, " for reading"); + } + + // get list of input filenames + std::vector filenames; + read_filelist(comm.get(), input_fn, filenames); + + for (size_t j=rank; j a = cnpy::npz_load(filenames[j]); + + out << "DONE! opening: " << filenames[j] << std::endl; + out.close(); + out.open(b, std::ofstream::out | std::ofstream::app); + } + } catch (std::exception const &e) { + if (master) std::cerr << "caught exception: " << e.what() << "\n"; + return EXIT_FAILURE; + } catch (...) { + std::cerr << "unknown exception in main\n"; + return EXIT_FAILURE; + } + + // Clean up + return EXIT_SUCCESS; +} diff --git a/applications/CANDLE/pilot2/train_ras_classifier.py b/applications/CANDLE/pilot2/train_ras_classifier.py new file mode 100644 index 00000000000..d6de5664fea --- /dev/null +++ b/applications/CANDLE/pilot2/train_ras_classifier.py @@ -0,0 +1,133 @@ +import numpy as np +import lbann +import lbann.modules +from util import preprocess_data + +# Data paths, directory where patches are located +data_dir = 'data' +samples = preprocess_data(data_dir) + +dims = len(samples[0]) + + +num_classes = 3 +num_channels = 14 + +# Sample access functions +def get_sample(index): + sample = samples[index] + return sample + +def num_samples(): + return samples.shape[0] + +def sample_dims(): + return [dims] + +def str_list(l): + return ' '.join([str(i) for i in l]) +# ============================================== +# Setup and launch experiment +# ============================================== + +def construct_model(): + """Model description + + """ + import lbann + import lbann.modules + + + fc = lbann.modules.FullyConnectedModule + conv = lbann.modules.Convolution2dModule + + conv1 = conv(20, 3, stride=1, padding=1,name='conv1') + conv2 = conv(20, 3, stride=1, padding=1,name='conv2') + fc1 = fc(100, name='fc1') + fc2 = fc(20, name='fc2') + fc3 = fc(num_classes, name='fc3') + # Layer graph + input = lbann.Input(name='inp_tensor') + inp_slice = lbann.Slice(input, axis=0, slice_points=str_list([0, dims-1, dims]),name='inp_slice') + xdata = lbann.Identity(inp_slice) + ylabel = lbann.Identity(inp_slice, name='gt_y') + #NHWC to NCHW + x = lbann.Reshape(xdata, dims='14 13 13') + x = conv2(conv1(x)) + x = lbann.Reshape(x, dims='3380') + x = lbann.Dropout(lbann.Relu(fc1(x)),keep_prob=0.5) + x = lbann.Dropout(fc2(x),keep_prob=0.5) + pred = lbann.Softmax(fc3(x)) + 
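    # Classification head: fc3 emits one score per RAS state, the integer
    # label is expanded to a one-hot vector, and the two are compared with
    # cross entropy (objective) and categorical accuracy (metric) below.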
gt_label = lbann.OneHot(ylabel, size=num_classes) + loss = lbann.CrossEntropy([pred,gt_label],name='loss') + acc = lbann.CategoricalAccuracy([pred, gt_label]) + + + layers = list(lbann.traverse_layer_graph(input)) + # Setup objective function + weights = set() + for l in layers: + weights.update(l.weights) + obj = lbann.ObjectiveFunction(loss) + + + callbacks = [lbann.CallbackPrint(), + lbann.CallbackTimer()] + + # Construct model + num_epochs = 10 + return lbann.Model(num_epochs, + weights=weights, + layers=layers, + metrics=[lbann.Metric(acc, name='accuracy', unit='%')], + objective_function=obj, + callbacks=callbacks) + +def construct_data_reader(): + """Construct Protobuf message for Python data reader. + + The Python data reader will import this Python file to access the + sample access functions. + + """ + import os.path + import lbann + module_file = os.path.abspath(__file__) + module_name = os.path.splitext(os.path.basename(module_file))[0] + module_dir = os.path.dirname(module_file) + + # Base data reader message + message = lbann.reader_pb2.DataReader() + + # Training set data reader + data_reader = message.reader.add() + data_reader.name = 'python' + data_reader.role = 'train' + data_reader.shuffle = True + data_reader.percent_of_data_to_use = 1.0 + data_reader.python.module = module_name + data_reader.python.module_dir = module_dir + data_reader.python.sample_function = 'get_sample' + data_reader.python.num_samples_function = 'num_samples' + data_reader.python.sample_dims_function = 'sample_dims' + + return message + +if __name__ == '__main__': + import lbann + import lbann.contrib.launcher + mini_batch_size = 64 + trainer = lbann.Trainer(mini_batch_size=mini_batch_size) + model = construct_model() + opt = lbann.Adam(learn_rate=0.001,beta1=0.9,beta2=0.99,eps=1e-8) + data_reader = construct_data_reader() + status = lbann.contrib.launcher.run( + trainer, model, data_reader, opt, + account='hpcdl', + scheduler='slurm', + time_limit=720, + nodes=1, + procs_per_node=1, + setup_only=False, + job_name='candle_p2_ras_classifier') + print(status) diff --git a/applications/CANDLE/pilot2/util.py b/applications/CANDLE/pilot2/util.py new file mode 100644 index 00000000000..9584a3c0941 --- /dev/null +++ b/applications/CANDLE/pilot2/util.py @@ -0,0 +1,95 @@ +import os +import sys +import random +import numpy as np + + +p0_thresh = 0.55 +p1_thresh = 0.85 +p2_thresh = 0.85 + +def preprocess_data(dirspath,channels=None): +# define a tuple of specific channels if user listed them + channels_tuple = tuple(range(14)) + if channels is not None: + channels_tuple = tuple(channels) + + files_train = [] + states = [] + cons = [] + + #for d in dirspath: + for _ in range(1): + # get list of all files in datapath and shuffle them + # sort by filename before shuffle so we could generate + # a consistent list if using the same random seed + filenames = os.listdir(dirspath) + filenames.sort() + random.shuffle(filenames) + + filenames_divide = int(1.0 * len(filenames)) + filenames_train = filenames[:filenames_divide] + + files_train.append([dirspath + "/" + f for f in filenames_train]) + + frame_start = 0 + + for f in filenames_train: + # read in the data file + d = np.load(dirspath + '/' + f) + + # extract fields + p = d['probs'][d['frames'] >= frame_start] + s = d['states'][d['frames'] >= frame_start] + #n = d['density_sig1p5'][d['frames'] >= frame_start] + n = d['density_sig1'][d['frames'] >= frame_start] + #print p.shape, s.shape + + s = s[(p[:,0] > p0_thresh) | (p[:,1] > p1_thresh) | (p[:,2] > p2_thresh)] 
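        # Keep only frames where at least one state probability clears its
        # threshold (p0/p1/p2_thresh above); the same mask is applied to the
        # density maps on the next line so states and densities stay aligned.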
+ n = n[(p[:,0] > p0_thresh) | (p[:,1] > p1_thresh) | (p[:,2] > p2_thresh)] + + states.append(s) + + + # append concentrations, filter out by channel id(s) if given + # can we do channel first here, transpose?, move axis? + n = np.array(n) + n = n.astype(np.float32) + if channels: + cons.append(n[:,:,:,channels_tuple]) + else: + cons.append(n) + + + states = np.concatenate(states,axis=0) + cons = np.concatenate(cons,axis=0) + + # print list of unique state labels and number of each + (values, cnt) = np.unique(states, return_counts=True) + + min_cnt = np.min(cnt) + idx_0 = np.where(states == 0) + idx_0 = idx_0[0][:min_cnt] + idx_1 = np.where(states == 1) + idx_1 = idx_1[0][:min_cnt] + idx_2 = np.where(states == 2) + idx_2 = idx_2[0][:min_cnt] + ids = np.concatenate([idx_0, idx_1, idx_2], axis=0) + states = states[ids] + cons = cons[ids] + + + # normalize each concentration channel independently + mins = cons.min(axis=(0,1,2), keepdims=True) + maxs = cons.max(axis=(0,1,2), keepdims=True) + + cons /= maxs + labels = states + + #transpose to NCHW + cons = cons.transpose(0,3,1,2) + + X = cons.reshape(cons.shape[0],-1) + y = labels.reshape(-1,1) + Xy_data = np.hstack((X,y)) + return Xy_data diff --git a/applications/CONTRIBUTING.md b/applications/CONTRIBUTING.md new file mode 100644 index 00000000000..72ff8591e3f --- /dev/null +++ b/applications/CONTRIBUTING.md @@ -0,0 +1,49 @@ +## Contributing Applications: + +The application directory contains the user-facing code for projects +to use LBANN. Each project directory should contain the python code +to instantiate the model, run both training and inference, an +experiments directory, as well as utility / helper code to pre- or +post-process data. In addition to project-specific directories the +directory hierarchy groups together similar projects into broader +categories, such as vision-based networks. + +### Directory Structure: + +``` +applications +└─── ATOM +``` + +The applications directory has primary __projects__ directories as well +as __categories__ that contain related __projects__. + +### Project Directory Structure: + +The general structure of a project directory should be: + +``` + +└─── README.md +└─── .py +└─── lib_.py +└─── experiments + └─── run_.py +└─── utils + +``` + +* README.md + * Describe the project, how to run it, etc. +* `.py` + * Python code that builds the model's compute graph +* `lib_.py` + * Common Python code that builds common substructurs used by the + application +* experiments + * Directory to run an experiment. Should include launcher scripts, + etc. + * `run_.py` + * Launcher script to run the model in train or inference mode +* utils + * Directory for holding pre- and post-processing scripts diff --git a/applications/MOF/MOFae.py b/applications/MOF/MOFae.py new file mode 100644 index 00000000000..bfef507aaad --- /dev/null +++ b/applications/MOF/MOFae.py @@ -0,0 +1,115 @@ +import lbann +import os +import os.path + +# ---------------------------------- +# Construct Graph +# ---------------------------------- +def gen_layers(latent_dim, number_of_atoms): + ''' Generates the model for the 3D Convolutional Auto Encoder. + + returns the Directed Acyclic Graph (DAG) that the lbann + model will run on. 
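    Shapes, as set up below: four kernel-4, stride-2 convolutions reduce the
    (number_of_atoms)x32x32x32 input to (latent_dim)x2x2x2, a final kernel-2
    convolution yields the (latent_dim)x1x1x1 code, and the decoder mirrors
    this with deconvolutions back to (number_of_atoms)x32x32x32.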
+ ''' + input_ = lbann.Input( target_mode = "reconstruction") + tensors = lbann.Identity(input_) + + tensors = lbann.Reshape(tensors, dims="11 32 32 32", name="Sample") + # Input tensor shape is (number_of_atoms)x32x32x32 + + # Encoder + + x = lbann.Identity(tensors) + for i in range(4): + out_channels = latent_dim // (2 ** (3-i)) + + x = lbann.Convolution(x, + num_dims = 3, + num_output_channels = out_channels, + num_groups = 1, + conv_dims_i = 4, + conv_strides_i = 2, + conv_dilations_i = 1, + conv_pads_i = 1, + has_bias = True, + name="Conv_{0}".format(i)) + + x = lbann.BatchNormalization(x, name="Batch_NORM_{0}".format(i+1)) + x = lbann.LeakyRelu(x, name="Conv_{0}_Activation".format(i+1)) + + # Shape: (latent_dim)x2x2x2 + encoded = lbann.Convolution(x, + num_dims = 3, + num_output_channels = latent_dim, + num_groups = 1, + conv_dims_i = 2, + conv_strides_i = 2, + conv_dilations_i = 1, + conv_pads_i = 0, + has_bias = True, + name ="encoded") + + # Shape: (latent_dim)1x1x1 + + # Decoder + + x = lbann.Deconvolution(encoded, + num_dims = 3, + num_output_channels = number_of_atoms * 16, + num_groups = 1, + conv_dims_i = 4, + conv_pads_i = 0, + conv_strides_i = 2, + conv_dilations_i = 1, + has_bias = True, + name="Deconv_1" + ) + x = lbann.BatchNormalization(x, name="BN_D1") + x = lbann.Tanh(x, name="Deconv_1_Activation") + + for i in range(3): + out_channels = number_of_atoms * (2 ** (2-i)) + x = lbann.Deconvolution(x, + num_dims = 3, + num_output_channels = out_channels, + num_groups = 1, + conv_dims_i = 4, + conv_pads_i = 1, + conv_strides_i = 2, + conv_dilations_i = 1, + has_bias = True, + name="Deconv_{0}".format(i+2) + ) + x = lbann.BatchNormalization(x, name="BN_D{0}".format(i+2)) + + if (i != 2): #Save the last activation layer because we want to dump the outputs + x = lbann.Tanh(x, name="Deconv_{0}_Activation".format(i+2)) + + decoded = lbann.Tanh(x, + name = "decoded") + + img_loss = lbann.MeanSquaredError([decoded, tensors]) + + metrics = [lbann.Metric(img_loss, name='recon_error')] + # ---------------------------------- + # Set up DAG + # ---------------------------------- + + layers = lbann.traverse_layer_graph(input_) #Generate Model DAG + return layers, img_loss, metrics +def make_data_reader(): + reader = lbann.reader_pb2.DataReader() + _reader = reader.reader.add() + _reader.name = 'python' + _reader.role = 'train' + _reader.shuffle = True + _reader.percent_of_data_to_use = 1.0 + _reader.python.module = 'dataset' + _reader.python.module_dir = os.path.dirname(os.path.realpath(__file__)) + _reader.python.sample_function = 'get_train' + _reader.python.num_samples_function = 'num_train_samples' + _reader.python.sample_dims_function = 'sample_dims' + + return reader + + diff --git a/applications/MOF/README.md b/applications/MOF/README.md new file mode 100644 index 00000000000..d5444889519 --- /dev/null +++ b/applications/MOF/README.md @@ -0,0 +1,52 @@ +# Example models for 3D molecular generation + +This directory contains LBANN implementations of 3D molecular generation models for Metal Organic Frameworks from the CoRE MOF Database. The models are based on 3D convolutional fliters on periodic 3D voxel grids. + + +## Dataset Information + +The dataset used is a subset of the [CoRE MOF Database](https://gregchung.github.io/CoRE-MOFs/). Each Metal Organic Framework is represented as a 32x32x32x11 tensor. + +The representation is channel-wise concatenation of 11 32x32x32 voxel grids, where each voxel grid represents the location of a particular element. 
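For orientation, a minimal sketch of how one sample reaches the model (illustrative only; the real reader lives in `dataset.py`, the reshape in `MOFae.gen_layers`, and the channel-last layout of the stand-in array is an assumption):

```python
import numpy as np

# Hypothetical stand-in for one CoRE MOF sample: 11 element channels,
# each a 32x32x32 voxel grid.
mof = np.random.rand(32, 32, 32, 11).astype(np.float32)

sample = mof.flatten()                        # what get_train(index) returns
assert sample.shape == (32 * 32 * 32 * 11,)   # matches sample_dims()

# Inside the model, the flat vector is reshaped back to "11 32 32 32"
# (channels first) before the 3D convolutions are applied.
```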
+ +## Running Instructions + +Run in conjuction with the correct slurm / lsf command: + +``` +python3 main.py --nodes N --procs-per-node P --mini-batch-size B +``` +## Testing Dataset + +To test the dataset: + +``` +python3 -m unittest test/* +``` + +To test integration and performance: + +``` +cd test +python3 -m pytest +``` +## Links + +For more information on the data representation: + + + +@article {Kimeaax9324, + author = {Kim, Baekjun and Lee, Sangwon and Kim, Jihan}, + title = {Inverse design of porous materials using artificial neural networks}, + volume = {6}, + number = {1}, + elocation-id = {eaax9324}, + year = {2020}, + doi = {10.1126/sciadv.aax9324}, + publisher = {American Association for the Advancement of Science}, + } + eprint = {https://advances.sciencemag.org/content/6/1/eaax9324.full.pdf}, + journal = {Science Advances} +} + diff --git a/applications/MOF/data/MOFdataset.py b/applications/MOF/data/MOFdataset.py new file mode 100755 index 00000000000..19439578be2 --- /dev/null +++ b/applications/MOF/data/MOFdataset.py @@ -0,0 +1,36 @@ +from pathlib import Path +from typing import List +import os +import numpy as np + + +class MOFDataset(): + ''' + Custom Dataset loader for MOF data. + ''' + def __init__(self, path, transform=None): + self.path = path + path = Path(path) + self.data = np.load(path) + self.transform = transform + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + if self.transform is not None: + return self.transform(self.data[idx]) + else: + return self.data[idx] + + +def test(): + data_dir = os.path.dirname(os.path.realpath(__file__)) + test_file_path = os.path.join(data_dir, 'mofs.npy') + test_data = MOFDataset(test_file_path) + + print(test_data[0].shape) + print(len(test_data)) + +if __name__ == '__main__': + test() diff --git a/applications/MOF/data/__init__.py b/applications/MOF/data/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/applications/MOF/dataset.py b/applications/MOF/dataset.py new file mode 100755 index 00000000000..a88f80f450a --- /dev/null +++ b/applications/MOF/dataset.py @@ -0,0 +1,50 @@ +import os +import numpy as np +from data.MOFdataset import MOFDataset + +# MOFdaset is a custom dataset class extending torch.utils.data.Dataset + +## +## For an example look at: +## https://github.com/LLNL/lbann/blob/develop/applications/nlp/transformer/dataset.py +## + +data_dir = os.path.dirname(os.path.realpath(__file__)) + +## Add CLI arguments for training file location and error handling +train_file_path = os.path.join(data_dir, 'data/train_mofs.npy') +test_file_path = os.path.join(data_dir, 'data/test_mofs.npy') + + +training_data = MOFDataset(train_file_path) +test_data = MOFDataset(test_file_path) + +def get_train (index): + return np.float32(training_data[index].flatten()) #Iterable or 1 D array + +def get_test (index): + return np.float32(test_data[index].flatten()) #Iterable or 1D array +def num_train_samples(): + return len(training_data) + +def num_test_samples(): + return len(test_data) + +def sample_dims(): + return (32*32*32*11, ) + +if __name__ == '__main__': + data_dir = os.path.dirname(os.path.realpath(__file__)) + +## Add CLI arguments for training file location and error handling + train_file_path = os.path.join(data_dir, 'data/train_mofs.npy') + test_file_path = os.path.join(data_dir, 'data/test_mofs.npy') + + training_data =MOFDataset(train_file_path, no_grid=True) + test_data = MOFDataset(test_file_path, no_grid=True) + + print(len(training_data)) + 
print(training_data[0].shape) + + + diff --git a/applications/MOF/main.py b/applications/MOF/main.py new file mode 100644 index 00000000000..decc30ba52d --- /dev/null +++ b/applications/MOF/main.py @@ -0,0 +1,92 @@ +import argparse +import lbann +import MOFae +import dataset +import os +import lbann.contrib.launcher +import lbann.contrib.args +# ---------------------------------- +# Command-line arguments +# ---------------------------------- + + +desc = ("Training 3D-CAE on 4D MOF Data using LBANN") + +parser = argparse.ArgumentParser(description = desc) + +parser.add_argument( + '--zdim', action='store',default = 2048, type=int, + help="dimensionality of latent space (dedfault: 2048)", metavar = 'NUM') +parser.add_argument( + '--atoms', action='store', default = 11,type=int, + help="Number of atom species (default: 11)", metavar = 'NUM') +parser.add_argument( + '--job-name', action='store', default='mofae', type=str, + help='job name', metavar='NAME') +parser.add_argument( + '--mini-batch-size', action='store', default=128, type=int, + help='mini-batch size (default: 128)', metavar='NUM') +parser.add_argument( + '--num-epochs', action='store', default=100, type=int, + help='number of epochs (default: 100)', metavar='NUM') + +lbann.contrib.args.add_scheduler_arguments(parser) +args = parser.parse_args() + + +latent_dim = args.zdim +number_of_atoms = args.atoms + + +layers, img_loss, metrics = MOFae.gen_layers(latent_dim, number_of_atoms) +mini_batch_size = args.mini_batch_size +num_epochs = args.num_epochs + +# Callbacks for Debug and Running Model + +print_model = lbann.CallbackPrintModelDescription() #Prints initial Model after Setup + +training_output = lbann.CallbackPrint( interval = 1, + print_global_stat_only = False) #Prints training progress + +gpu_usage = lbann.CallbackGPUMemoryUsage() + +encoded_output = lbann.CallbackDumpOutputs( layers = "decoded", batch_interval = 400, directory = os.path.dirname(os.path.realpath(__file__)), format="npy") + +# ---------------------------------- +# Set up Experiment +# ---------------------------------- + +#Generate Model +model = lbann.Model(num_epochs, + layers = layers, + objective_function = img_loss, + metrics = metrics, + callbacks = [print_model, training_output, gpu_usage, encoded_output] + ) + +#Optimizer + +opt = lbann.Adam(learn_rate = 1e-2, + beta1 = 0.9, + beta2 = 0.99, + eps = 1e-8 + ) + +data_reader = MOFae.make_data_reader() + + +#Trainer + +trainer = lbann.Trainer(mini_batch_size = mini_batch_size, + name = "MOF_AE_1" + ) + +# ---------------------------------- +# Run Experiment +# ---------------------------------- + +kwargs = lbann.contrib.args.get_scheduler_kwargs(args) + + +lbann.contrib.launcher.run(trainer, model, data_reader, opt, **kwargs) diff --git a/applications/MOF/test/__init__.py b/applications/MOF/test/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/applications/MOF/test/conftest.py b/applications/MOF/test/conftest.py new file mode 100644 index 00000000000..e62cd503f49 --- /dev/null +++ b/applications/MOF/test/conftest.py @@ -0,0 +1,41 @@ +import sys +sys.path.insert(0, '../../../bamboo/common_python') +import tools +import pytest, re, subprocess + + +def pytest_addoption(parser): + cluster = re.sub('[0-9]+', '', subprocess.check_output( + 'hostname'.split()).decode('utf-8').strip()) + default_dirname = subprocess.check_output( + 'git rev-parse --show-toplevel'.split()).decode('utf-8').strip() + default_exes = tools.get_default_exes(default_dirname, cluster) + + 
parser.addoption('--cluster', action='store', default=cluster, + help='--cluster= to specify the cluster being run on, for the purpose of determing which commands to use. Default the current cluster') + parser.addoption('--dirname', action='store', default=default_dirname, + help='--dirname= to specify the top-level directory. Default directory of build_lbann_lc executable') + parser.addoption('--exes', action='store', default=default_exes, + help='--exes={compiler_name: path}') + parser.addoption('--weekly', action='store_true', default=False, + help='--weekly specifies that the test should ONLY be run weekly, not nightly. Default False') + + +@pytest.fixture +def cluster(request): + return request.config.getoption('--cluster') + + +@pytest.fixture +def dirname(request): + return request.config.getoption('--dirname') + + +@pytest.fixture +def exes(request): + return request.config.getoption('--exes') + + +@pytest.fixture +def weekly(request): + return request.config.getoption('--weekly') diff --git a/applications/MOF/test/dataset_test.py b/applications/MOF/test/dataset_test.py new file mode 100644 index 00000000000..4c4d243ab9c --- /dev/null +++ b/applications/MOF/test/dataset_test.py @@ -0,0 +1,39 @@ +import unittest +import os.path +import sys +import numpy as np + + +root_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + + +sys.path.append(root_dir) + +import dataset + + + +# TO DO: Add data to lustre + gpfs for easier testing + +class dataset_test(unittest.TestCase): + + def test_num_train_samples(self): + #print("Testing num train samples") + self.assertEqual(dataset.num_train_samples(), 64) + + def test_get_train(self): + + #print("Testing get train") + for i in range(dataset.num_train_samples()): + mof = dataset.get_train(i) + self.assertIsInstance(mof, np.ndarray) + + + def test_sample_dims(self): + # print("Testing Sample Dims") + self.assertEqual(dataset.sample_dims()[0], dataset.get_train(0).size) + + + +if __name__ == '__main__': + unittest.main() diff --git a/applications/MOF/test/test_integration_mof.py b/applications/MOF/test/test_integration_mof.py new file mode 100644 index 00000000000..a0b0be9c738 --- /dev/null +++ b/applications/MOF/test/test_integration_mof.py @@ -0,0 +1,160 @@ +import functools +import operator +import os +import os.path +import re +import sys +import pytest + +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +root_dir = os.path.dirname(current_dir) + +sys.path.append(root_dir) # Added lbann/applications/MOF directory + + +import dataset +import MOFae +applications_dir = os.path.dirname(root_dir) +lbann_dir = os.path.dirname(applications_dir) +common_python_dir = os.path.join(lbann_dir, 'bamboo/common_python')# Added lbann/bamboo/common_python +sys.path.append(common_python_dir) +import tools + +#Training options +num_epochs = 10 +mini_batch_size = 64 +num_nodes = 2 + +# Error + +expected_MSE_range = (0.09, 0.11) + +expected_mini_batch_times = { + 'ray': .35, + 'pascal':.35 + } + + +def setup_experiment(lbann): + """Construct LBANN experiment. 
+ + args: + lbann (module): Module for LBANN Python frontend + + """ + + trainer = lbann.Trainer(mini_batch_size=mini_batch_size) + model = construct_model(lbann) + reader = make_data_reader(lbann) + + # No validation set + + optimizer = lbann.Adam(learn_rate=0.01, beta1=0.9, beta2=0.99, eps=1e-8 ) + return trainer, model, reader, optimizer + +def make_data_reader(lbann): + """Construct LBANN data reader + + """ + reader = lbann.reader_pb2.DataReader() + _reader = reader.reader.add() + _reader.name = 'python' + _reader.role = 'train' + _reader.shuffle = True + _reader.percent_of_data_to_use = 1.0 + _reader.python.module = 'dataset' + _reader.python.module_dir = root_dir + _reader.python.sample_function = 'get_train' + _reader.python.num_samples_function = 'num_train_samples' + _reader.python.sample_dims_function = 'sample_dims' + + return reader +def construct_model(lbann): + + latent_dim = 2048 + number_of_atoms = 11 + layers, img_loss, metrics = MOFae.gen_layers(latent_dim, number_of_atoms) + callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer()] + + return lbann.Model(num_epochs, + layers = layers, + objective_function = img_loss, + metrics = metrics, + callbacks = callbacks + ) + +# ============================================== +# Setup PyTest +# ============================================== + +def augment_test_func(test_func): + """Augment test function to parse log files. + + `tools.create_tests` creates functions that run an LBANN + experiment. This function creates augmented functions that parse + the log files after LBANN finishes running, e.g. to check metrics + or runtimes. + + Note: The naive approach is to define the augmented test functions + in a loop. However, Python closures are late binding. In other + words, the function would be overwritten every time we define it. + We get around this overwriting problem by defining the augmented + function in the local scope of another function. + + Args: + test_func (function): Test function created by + `tools.create_tests`. + + Returns: + function: Test that can interact with PyTest. + + """ + test_name = test_func.__name__ + + # Define test function + def func(cluster, exes, dirname): + # Run LBANN experiment + experiment_output = test_func(cluster, exes, dirname) + + # Parse LBANN log file + train_accuracy = None + test_accuracy = None + mini_batch_times = [] + with open(experiment_output['stdout_log_file']) as f: + for line in f: + match = re.search('training epoch [0-9]+ recon_error : ([0-9.]+)', line) + if match: + train_accuracy = float(match.group(1)) + match = re.search('training epoch [0-9]+ mini-batch time statistics : ([0-9.]+)s mean', line) + if match: + mini_batch_times.append(float(match.group(1))) + + # Check if training accuracy is within expected range + assert (expected_MSE_range[0] + < train_accuracy + = 2: + max_id = max(max_id, int(line[0])) + max_id = max(max_id, int(line[1])) + if max_id < 0: + raise RuntimeError('Graph has no non-negative node IDs') + return max_id diff --git a/applications/graph/evaluate.py b/applications/graph/evaluate.py new file mode 100644 index 00000000000..8b67d2b9e38 --- /dev/null +++ b/applications/graph/evaluate.py @@ -0,0 +1,49 @@ +"""Helper script to evaluate quality of node embeddings. + +Converts the embedding weights computed by LBANN into a format that +can be read by Keita's evaluation script. 
+ +""" +import argparse +import os.path +import sys + +import numpy as np + +# Command-line arguments +parser = argparse.ArgumentParser() +parser.add_argument( + 'embedding_file', type=str, + help='node embeddings computed by LBANN', metavar='EMBEDDING_FILE') +parser.add_argument( + 'label_file', type=str, + help='node labels', metavar='LABEL_FILE') +parser.add_argument( + '--snap-embedding-file', default='results.emb', type=str, + help='node embeddings in SNAP format', metavar='FILE') +args = parser.parse_args() + +# Construct embedding file in SNAP's format +embeddings = np.loadtxt(args.embedding_file) +embeddings = np.transpose(embeddings) +with open(args.snap_embedding_file, 'w') as f: + f.write(f'{embeddings.shape[0]} {embeddings.shape[1]}\n') + for index, embedding in enumerate(embeddings): + f.write(f'{index} {" ".join(str(x) for x in embedding)}\n') + +# Evaluate embeddings with Keita's evaluation script +root_dir = os.path.dirname(os.path.realpath(__file__)) +eval_script_dir = os.path.join( + root_dir, + 'largescale_node2vec', + 'evaluation', + 'multi_label_classification' +) +sys.path.append(eval_script_dir) +import multi_label_classification +multi_label_classification.main([ + '-x', args.snap_embedding_file, + '-y', args.label_file, + '-r', 0.9, + '-n', 10 +]) diff --git a/applications/graph/largescale_node2vec b/applications/graph/largescale_node2vec new file mode 160000 index 00000000000..1b0aa43fdf5 --- /dev/null +++ b/applications/graph/largescale_node2vec @@ -0,0 +1 @@ +Subproject commit 1b0aa43fdf5f8e956915926305f3e55c2c17972e diff --git a/applications/graph/main.py b/applications/graph/main.py new file mode 100644 index 00000000000..82c63a912b7 --- /dev/null +++ b/applications/graph/main.py @@ -0,0 +1,149 @@ +"""Learn embedding weights with LBANN.""" +import argparse +import os.path + +import lbann +import lbann.contrib.launcher +import lbann.contrib.args + +import dataset +from utils import make_iterable, str_list +import utils.snap + +# ---------------------------------- +# Options +# ---------------------------------- + +# Command-line arguments +parser = argparse.ArgumentParser() +lbann.contrib.args.add_scheduler_arguments(parser) +parser.add_argument( + '--job-name', action='store', default='lbann_node2vec', type=str, + help='job name', metavar='NAME') +parser.add_argument( + '--mini-batch-size', action='store', default=256, type=int, + help='mini-batch size (default: 256)', metavar='NUM') +parser.add_argument( + '--num-epochs', action='store', default=1, type=int, + help='number of epochs (default: 1)', metavar='NUM') +parser.add_argument( + '--latent-dim', action='store', default=128, type=int, + help='latent space dimensions (default: 128)', metavar='NUM') +parser.add_argument( + '--learning-rate', action='store', default=-1, type=float, + help='learning rate (default: 0.025*mbsize)', metavar='VAL') +parser.add_argument( + '--work-dir', action='store', default=None, type=str, + help='working directory', metavar='DIR') +args = parser.parse_args() + +# ---------------------------------- +# Embedding weights +# ---------------------------------- + +encoder_embeddings_weights = lbann.Weights( + initializer=lbann.NormalInitializer( + mean=0, standard_deviation=1/args.latent_dim, + ), + name='embeddings', +) +decoder_embeddings_weights = lbann.Weights( + initializer=lbann.ConstantInitializer(value=0), + name='decoder_embeddings', +) + +# ---------------------------------- +# Construct layer graph +# ---------------------------------- + +# Properties of graph and random 
walk +num_graph_nodes = dataset.max_graph_node_id() + 1 +walk_length = dataset.walk_context_length +num_negative_samples = dataset.num_negative_samples +input_size = dataset.sample_dims()[0] + +# Embedding vectors, including negative sampling +# Note: Input is sequence of graph node IDs +input_ = lbann.Identity(lbann.Input()) +input_slice = lbann.Slice( + input_, + slice_points=f'0 {num_negative_samples+1} {input_size}' +) +decoder_embeddings = lbann.Embedding( + input_slice, + weights=decoder_embeddings_weights, + num_embeddings=num_graph_nodes, + embedding_dim=args.latent_dim, +) +encoder_embeddings = lbann.Embedding( + input_slice, + weights=encoder_embeddings_weights, + num_embeddings=num_graph_nodes, + embedding_dim=args.latent_dim, +) + +# Skip-Gram with negative sampling +preds = lbann.MatMul(decoder_embeddings, encoder_embeddings, transpose_b=True) +preds_slice = lbann.Slice( + preds, + axis=0, + slice_points=f'0 {num_negative_samples} {num_negative_samples+1}') +preds_negative = lbann.Identity(preds_slice) +preds_positive = lbann.Identity(preds_slice) +obj_positive = lbann.LogSigmoid(preds_positive) +obj_positive = lbann.Reduction(obj_positive, mode='sum') +obj_negative = lbann.WeightedSum(preds_negative, scaling_factors='-1') +obj_negative = lbann.LogSigmoid(obj_negative) +obj_negative = lbann.Reduction(obj_negative, mode='sum') +obj = [ + lbann.LayerTerm(obj_positive, scale=-1), + lbann.LayerTerm(obj_negative, scale=-1/num_negative_samples), +] + +# ---------------------------------- +# Create data reader +# ---------------------------------- + +reader = lbann.reader_pb2.DataReader() +_reader = reader.reader.add() +_reader.name = 'python' +_reader.role = 'train' +_reader.shuffle = True +_reader.percent_of_data_to_use = 1.0 +_reader.python.module = 'dataset' +_reader.python.module_dir = os.path.dirname(os.path.realpath(__file__)) +_reader.python.sample_function = 'get_sample' +_reader.python.num_samples_function = 'num_samples' +_reader.python.sample_dims_function = 'sample_dims' + +# ---------------------------------- +# Run LBANN +# ---------------------------------- + +# Create optimizer +# Note: Learning rate in original word2vec is 0.025 +learning_rate = args.learning_rate +if learning_rate < 0: + learning_rate = 0.025 * args.mini_batch_size +opt = lbann.SGD(learn_rate=learning_rate) + +# Create LBANN objects +trainer = lbann.Trainer(mini_batch_size=args.mini_batch_size) +callbacks = [ + lbann.CallbackPrint(), + lbann.CallbackTimer(), + lbann.CallbackDumpWeights(basename='embeddings', + epoch_interval=args.num_epochs), +] +model = lbann.Model(args.num_epochs, + layers=lbann.traverse_layer_graph(input_), + objective_function=obj, + callbacks=callbacks) + +# Run LBANN +kwargs = lbann.contrib.args.get_scheduler_kwargs(args) +lbann.contrib.launcher.run(trainer, model, reader, opt, + job_name=args.job_name, + work_dir=args.work_dir, + overwrite_script=True, + **kwargs) diff --git a/applications/graph/snap b/applications/graph/snap new file mode 160000 index 00000000000..907c34aac6b --- /dev/null +++ b/applications/graph/snap @@ -0,0 +1 @@ +Subproject commit 907c34aac6bcddc7c2f8efb64be76e87dd7e4ea5 diff --git a/applications/graph/test/test_dataset.py b/applications/graph/test/test_dataset.py new file mode 100644 index 00000000000..85525ae0c47 --- /dev/null +++ b/applications/graph/test/test_dataset.py @@ -0,0 +1,38 @@ +import os.path +import random +import sys + +# Local paths +root_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) +sys.path.append(root_dir) + 
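# Aside on the objective built in graph/main.py above: it is the standard
# skip-gram-with-negative-sampling loss. Schematically (a NumPy-style sketch,
# not the actual LBANN layer graph; k = num_negative_samples):
#
#   scores   = dec_emb[candidates] @ enc_emb[walk].T   # (k + 1) x walk_length
#   neg, pos = scores[:k], scores[k]                   # last row = true node
#   loss     = -logsigmoid(pos).sum() - logsigmoid(-neg).sum() / k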
+def test_dataset(): + import dataset + + # Check max node ID + max_graph_node_id = dataset.max_graph_node_id() + assert max_graph_node_id >= 0, 'Negative graph node ID' + assert max_graph_node_id != 0, \ + 'Max graph node ID is zero, ' \ + 'which implies graph has only one node or node IDs are negative' + + # Check sample dimensions + sample_dims = dataset.sample_dims() + assert len(sample_dims) == 1, 'Unexpected dimensions for data sample' + assert sample_dims[0] > 0, 'Invalid dimensions for data sample' + + # Check number of samples + num_samples = dataset.num_samples() + assert num_samples >= 0, 'Invalid number of data samples' + assert num_samples != 0, 'Dataset has no data samples' + + # Check samples + indices = [random.randint(0, num_samples-1) for _ in range(20)] + indices.append(0) + indices.append(num_samples-1) + for index in indices: + sample = dataset.get_sample(index) + assert sample.shape == sample_dims, 'Unexpected dimensions for data sample' + for node in sample: + assert 0 <= node <= max_graph_node_id, \ + 'Invalid graph node ID in data sample' diff --git a/applications/graph/utils/__init__.py b/applications/graph/utils/__init__.py new file mode 100644 index 00000000000..370660d532f --- /dev/null +++ b/applications/graph/utils/__init__.py @@ -0,0 +1,17 @@ +"""Utilities for LBANN graph models""" +import collections.abc + +def make_iterable(obj): + """Convert to an iterable object. + + Simply returns `obj` if it is alredy iterable. Otherwise returns a + 1-tuple containing `obj`. + """ + if isinstance(obj, collections.abc.Iterable) and not isinstance(obj, str): + return obj + else: + return (obj,) + +def str_list(it): + """Convert an iterable object to a space-separated string.""" + return ' '.join([str(i) for i in make_iterable(it)]) diff --git a/applications/graph/utils/snap.py b/applications/graph/utils/snap.py new file mode 100644 index 00000000000..2c65bf7231c --- /dev/null +++ b/applications/graph/utils/snap.py @@ -0,0 +1,121 @@ +"""Utilities to interact with SNAP. + +SNAP is the Stanford Network Analysis Platform. See +https://snap.stanford.edu. + +""" +import os +import os.path +import urllib.request +import gzip +import subprocess + +# Root directory for LBANN graph application +_root_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + + +def download_graph(name='ego-Facebook', + graph_file=None): + """Download graph edgelist file from SNAP website. + + Args: + name (str): Name of graph. + graph_file (str, optional): File where uncompressed edge list + will be saved (default: in 'data' directory). + + Returns: + str: Uncompressed edge list file. 
+ + """ + + # Graphs from SNAP + download_urls = { + 'ego-Facebook': 'http://snap.stanford.edu/data/facebook_combined.txt.gz', + } + + # Paths + if not graph_file: + graph_file = os.path.join(_root_dir, 'data', name, 'graph.txt') + data_dir = os.path.dirname(graph_file) + if not os.path.isdir(data_dir): + os.makedirs(data_dir) + data_dir = os.path.realpath(data_dir) + graph_file = os.path.realpath(graph_file) + compressed_file = graph_file + '.gz' + + # Download and uncompress graph file + urllib.request.urlretrieve(download_urls[name], + filename=compressed_file) + with gzip.open(compressed_file, 'rb') as in_file: + with open(graph_file, 'wb') as out_file: + out_file.write(in_file.read()) + + return graph_file + + +def node2vec_walk(graph_file, + walk_file, + walk_length, + walks_per_node, + return_param=1.0, + inout_param=1.0, + directed=False, + weighted=False, + verbose=False): + """Perform random walk on graph for node2vec. + + See https://cs.stanford.edu/~jure/pubs/node2vec-kdd16.pdf + + Args: + graph_file (str): Uncompressed edge list file. + walk_file (str): File where random walks will be saved. + walk_length (int): Walk length. + walks_per_node (int): Number of walks per graph vertex. + return_param (float, optional): p-parameter for random walk + (default: 1.0). + inout_param (float, optional): q-parameter for random walk + (default: 1.0). + directed (bool, optional): Graph is directed (default: False). + weighted (bool, optional): Graph is weighted (default: False). + verbose (bool, optional): Verbose output (default: False). + + """ + + # Check executable + node2vec_exe = os.path.join(_root_dir, 'snap', 'examples', + 'node2vec', 'node2vec') + if not os.path.isfile(node2vec_exe): + raise FileNotFoundError( + 'Could not find node2vec executable at {}. ' + 'Has SNAP been built?' + .format(node2vec_exe) + ) + + # Make sure output directory exists + output_dir = os.path.dirname(os.path.realpath(walk_file)) + if not os.path.isdir(output_dir): + os.makedirs(output_dir) + + # Construct invocation + command = [ + node2vec_exe, + '-i:{}'.format(graph_file), + '-o:{}'.format(walk_file), + '-d:-1', + '-l:{}'.format(walk_length), + '-r:{}'.format(walks_per_node), + '-k:-1', + '-e:-1', + '-p:{}'.format(return_param), + '-q:{}'.format(inout_param), + '-ow', + ] + if verbose: + command.append('-v') + if directed: + command.append('-dr') + if weighted: + command.append('-w') + + # Run executable + return subprocess.call(command) diff --git a/applications/nlp/README.md b/applications/nlp/README.md new file mode 100644 index 00000000000..a60513ee0f1 --- /dev/null +++ b/applications/nlp/README.md @@ -0,0 +1,22 @@ +# Example models for natural language processing + +This directory contains LBANN experiments with text data, with the +goal of developing and optimizing NLP functionality. It will +eventually contain reference implementations of widely-used NLP +models. + +## Dependencies + +- PyTorch + +- Transformers: NLP library for TensorFlow and PyTorch. Install with: + +```bash +pip3 install transformers +``` + +- PyTorch-NLP: PyTorch utilities for NLP applications. 
Install with: + +```bash +pip3 install pytorch-nlp +``` diff --git a/applications/nlp/data/.gitignore b/applications/nlp/data/.gitignore new file mode 100644 index 00000000000..d6b7ef32c84 --- /dev/null +++ b/applications/nlp/data/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/applications/nlp/experiments/.gitignore b/applications/nlp/experiments/.gitignore new file mode 100644 index 00000000000..d6b7ef32c84 --- /dev/null +++ b/applications/nlp/experiments/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/applications/nlp/rnn/dataset.py b/applications/nlp/rnn/dataset.py new file mode 100644 index 00000000000..66fb7c0fd88 --- /dev/null +++ b/applications/nlp/rnn/dataset.py @@ -0,0 +1,25 @@ +import os.path +import sys + +# Local imports +current_file = os.path.realpath(__file__) +root_dir = os.path.dirname(os.path.dirname(current_file)) +sys.path.append(root_dir) +import utils.gutenberg + +# Options +text_name = 'frankenstein' +sequence_length = 10 + +# Download and tokenize text data, if needed +data_url = utils.gutenberg.get_url(text_name) +data_dir = os.path.join(root_dir, 'data', text_name) +corpus = utils.gutenberg.GutenbergCorpus(data_dir, data_url) + +# Sample access functions +def get_sample(index): + return corpus[index:index+sequence_length] +def num_samples(): + return len(corpus) - sequence_length + 1 +def sample_dims(): + return (sequence_length,) diff --git a/applications/nlp/rnn/main.py b/applications/nlp/rnn/main.py new file mode 100644 index 00000000000..3aa00463919 --- /dev/null +++ b/applications/nlp/rnn/main.py @@ -0,0 +1,118 @@ +"""Simple recurrent network on tokenized text data.""" +import argparse +import os.path +import sys + +import lbann +import lbann.modules +import lbann.contrib.launcher +import lbann.contrib.args + +# Local imports +current_dir = os.path.dirname(os.path.realpath(__file__)) +root_dir = os.path.dirname(current_dir) +sys.path.append(root_dir) +import dataset +from utils import str_list + +# ---------------------------------- +# Options +# ---------------------------------- + +# Command-line arguments +parser = argparse.ArgumentParser() +lbann.contrib.args.add_scheduler_arguments(parser) +parser.add_argument( + '--job-name', action='store', default='lbann_textrnn', type=str, + help='job name', metavar='NAME') +parser.add_argument( + '--mini-batch-size', action='store', default=256, type=int, + help='mini-batch size (default: 256)', metavar='NUM') +parser.add_argument( + '--num-epochs', action='store', default=20, type=int, + help='number of epochs (default: 20)', metavar='NUM') +parser.add_argument( + '--latent-dim', action='store', default=128, type=int, + help='latent space dimensions (default: 128)', metavar='NUM') +args = parser.parse_args() + +# ---------------------------------- +# Construct layer graph +# ---------------------------------- + +# Dataset properties +vocab_size = dataset.corpus.vocab_size +sequence_length = dataset.sample_dims()[0] + +# Input is a sequence of token IDs +input_ = lbann.Identity(lbann.Input()) +input_slice = lbann.Slice(input_, + slice_points=str_list(range(sequence_length+1))) +tokens_list = [lbann.Identity(input_slice) for _ in range(sequence_length)] + +# Get sequence of embedding vectors +embeddings = lbann.Embedding(input_, + num_embeddings=vocab_size, + embedding_dim=args.latent_dim) +embeddings_slice = lbann.Slice(embeddings, + axis=0, + slice_points=str_list(range(sequence_length+1))) +embeddings_list = [lbann.Reshape(embeddings_slice, dims='-1') + for _ in range(sequence_length)] + +# Layer 
modules +lstm = lbann.modules.LSTMCell(args.latent_dim) +lstm_state = [lbann.Constant(value=0, num_neurons=str_list(args.latent_dim)), + lbann.Constant(value=0, num_neurons=str_list(args.latent_dim))] +pred_fc = lbann.modules.FullyConnectedModule(vocab_size, + data_layout='model_parallel') + +# Iterate through RNN steps +loss = [] +for step in range(sequence_length-1): + + # Predict next token with RNN + x = embeddings_list[step] + x, lstm_state = lstm(x, lstm_state) + x = pred_fc(x) + pred = lbann.Softmax(x) + + # Evaluate prediction with cross entropy + ground_truth = lbann.OneHot(tokens_list[step+1], size=vocab_size) + cross_entropy = lbann.CrossEntropy([pred, ground_truth]) + loss.append(lbann.LayerTerm(cross_entropy, scale=1/(sequence_length-1))) + +# ---------------------------------- +# Create data reader +# ---------------------------------- + +reader = lbann.reader_pb2.DataReader() +_reader = reader.reader.add() +_reader.name = 'python' +_reader.role = 'train' +_reader.shuffle = True +_reader.percent_of_data_to_use = 1.0 +_reader.python.module = 'dataset' +_reader.python.module_dir = current_dir +_reader.python.sample_function = 'get_sample' +_reader.python.num_samples_function = 'num_samples' +_reader.python.sample_dims_function = 'sample_dims' + +# ---------------------------------- +# Run LBANN +# ---------------------------------- + +# Create LBANN objects +trainer = lbann.Trainer(mini_batch_size=args.mini_batch_size) +model = lbann.Model(args.num_epochs, + layers=lbann.traverse_layer_graph(input_), + objective_function=loss, + callbacks=[lbann.CallbackPrint(), + lbann.CallbackTimer()]) +opt = lbann.SGD(learn_rate=0.01, momentum=0.9) + +# Run LBANN +kwargs = lbann.contrib.args.get_scheduler_kwargs(args) +lbann.contrib.launcher.run(trainer, model, reader, opt, + job_name=args.job_name, + **kwargs) diff --git a/applications/nlp/transformer/dataset.py b/applications/nlp/transformer/dataset.py new file mode 100644 index 00000000000..4780a035654 --- /dev/null +++ b/applications/nlp/transformer/dataset.py @@ -0,0 +1,129 @@ +"""WMT 2014 dataset for English-German translation.""" +import os.path +import sys + +import numpy as np +import torchnlp.datasets + +# Local imports +current_file = os.path.realpath(__file__) +root_dir = os.path.dirname(os.path.dirname(current_file)) +sys.path.append(root_dir) +import utils.paths + +# ---------------------------------------------- +# Options +# ---------------------------------------------- + +# Note: Sequence lengths for WMT 2014 have mean 29.05, standard +# deviation 16.20, and max 484. +sequence_length = 64 + +# ---------------------------------------------- +# Setup +# ---------------------------------------------- + +# Load WMT 2014 dataset +data_dir = utils.paths.wmt_dir() +dataset_train, dataset_val = torchnlp.datasets.wmt_dataset( + directory=data_dir, + train=True, + dev=True, +) + +# Load token vocabulary +with open(os.path.join(data_dir, 'vocab.bpe.32000')) as f: + tokens = f.read().splitlines() +tokens.extend(['', '', '', '']) +token_indices = dict(zip(tokens, range(len(tokens)))) +unk_index = token_indices.get('', -1) +bos_index = token_indices.get('', -1) +eos_index = token_indices.get('', -1) +pad_index = token_indices.get('', -1) + +# ---------------------------------------------- +# Tokenization +# ---------------------------------------------- + +def tokenize(text): + """Convert string to list of token indices. + + WMT 2014 has already been tokenized with byte-pair encoding. We + add BOS and EOS tokens. 
+ + """ + indices = [bos_index] + indices.extend( + token_indices.get(token, unk_index) + for token in text.split(' ') + ) + indices.append(eos_index) + return indices + +def detokenize(indices): + """Convert token indices to string. + + Stops at the first EOS token. All other special tokens are + ignored. + + """ + text = '' + for index in indices: + if index == eos_index: + break + elif index in (unk_index, bos_index, pad_index): + continue + else: + text += f' {tokens[index]}' + return text + +# ---------------------------------------------- +# Sample access functions +# ---------------------------------------------- + +def get_train_sample(index): + """Token indices for a data sample from the training set. + + The English and German text samples are tokenized, + padded/subsampled to sequence_length tokens, and concatenated. + + """ + + # Tokenize text data + text = dataset_train[index] + sample_en = tokenize(text['en']) + sample_de = tokenize(text['de']) + + # Randomly subsample sequences if they are too long + if len(sample_en) > sequence_length or len(sample_de) > sequence_length: + pos = np.random.rand() + if len(sample_en) > sequence_length: + offset = (len(sample_en) - sequence_length + 1) * pos + offset = int(np.floor(offset)) + sample_en = sample_en[offset:offset+sequence_length] + if len(sample_de) > sequence_length: + offset = (len(sample_de) - sequence_length + 1) * pos + offset = int(np.floor(offset)) + sample_de = sample_de[offset:offset+sequence_length] + + # Concatenate sequences and return + sample = np.full(2*sequence_length, pad_index, dtype=int) + sample[0:len(sample_en)] = sample_en + sample[sequence_length:sequence_length+len(sample_de)] = sample_de + return sample + +def get_val_sample(index): + """Token indices for a data sample from the validation set.""" + text = dataset_val[index] + sample_en = tokenize(text['en']) + sample_de = tokenize(text['de']) + return sample_en, sample_de + +def num_train_samples(): + return len(dataset_train) +def num_val_samples(): + return len(dataset_val) +def sample_dims(): + return (2*sequence_length+1,) +def vocab_size(): + return len(tokens) diff --git a/applications/nlp/transformer/evaluate.py b/applications/nlp/transformer/evaluate.py new file mode 100644 index 00000000000..91ec00fad95 --- /dev/null +++ b/applications/nlp/transformer/evaluate.py @@ -0,0 +1,322 @@ +"""Evaluate Transformer example. + +The LBANN Transformer model is assumed to have saved its weights to +weight files with the "dump weights" callback. These weights are +loaded into a PyTorch model and English-German translation is +performed with greedy decoding on the WMT 2014 validation dataset. +BLEU scores are computed for the predicted translations. + +""" + +import argparse +import os.path +import sys + +import numpy as np +import torch +import torch.nn +import torchnlp.metrics + +# Local imports +current_file = os.path.realpath(__file__) +root_dir = os.path.dirname(os.path.dirname(current_file)) +sys.path.append(root_dir) +import dataset +import utils +import utils.paths + +# ---------------------------------------------- +# Options +# ---------------------------------------------- + +# Evaluation options +mini_batch_size = 64 # Doesn't need to match training + +# Hard-coded model parameters +# Note: Must match parameters from training. 
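+# Note: These defaults appear to be the base configuration from Vaswani et
+# al., "Attention Is All You Need" (2017): 512-d embeddings, 8 heads, 6
+# encoder/decoder layers, 2048-d feedforward, dropout 0.1.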
+embed_dim = 512 +num_heads = 8 +num_encoder_layers = 6 +num_decoder_layers = 6 +filter_dim = 2048 +dropout = 0.1 + +# Dataset properties +vocab_size = dataset.vocab_size() +max_sequence_length = dataset.sequence_length +bos_index = dataset.bos_index +eos_index = dataset.eos_index +pad_index = dataset.pad_index +num_samples = dataset.num_val_samples() + +# ---------------------------------------------- +# Evaluation data +# ---------------------------------------------- + +def get_batch(indices): + """Get a batch of samples from the evaluation dataset. + + The sequences are padded to the length of the longest sequence in + the batch. + + """ + + # Get data samples + indices = utils.make_iterable(indices) + tokens_list_en = [] + tokens_list_de = [] + for index in indices: + tokens_en, tokens_de = dataset.get_val_sample(index) + tokens_list_en.append(tokens_en) + tokens_list_de.append(tokens_de) + + # Convert tokens to PyTorch tensors + tokens_en = np.full( + (max(len(seq) for seq in tokens_list_en), len(indices)), + pad_index, + dtype=int, + ) + tokens_de = np.full( + (max(len(seq) for seq in tokens_list_de), len(indices)), + pad_index, + dtype=int, + ) + for i, seq in enumerate(tokens_list_en): + tokens_en[:len(seq), i] = seq + for i, seq in enumerate(tokens_list_de): + tokens_de[:len(seq), i] = seq + tokens_en = torch.from_numpy(tokens_en) + tokens_de = torch.from_numpy(tokens_de) + return tokens_en, tokens_de + +# ---------------------------------------------- +# Load model from file +# ---------------------------------------------- + +def load_parameter(weight_file): + """Create a PyTorch Parameter object with weights from LBANN. + + Weight file is assumed to have been created by the "dump weights" + callback in LBANN. + + """ + data = np.loadtxt(weight_file, dtype=np.float32) + return torch.nn.Parameter( + data=torch.from_numpy(data), + requires_grad=False + ) + +def load_embedding_layer(weights_prefix): + """Create a PyTorch embedding layer with weights from LBANN. + + Weight files are assumed to have been created by the "dump + weights" callback in LBANN. They should be in the form + -embeddings-Weights.txt. + + """ + weight_file = f'{weights_prefix}-embeddings-Weights.txt' + weight = load_parameter(weight_file).transpose(1,0) + return torch.nn.Embedding( + num_embeddings=vocab_size, + embedding_dim=embed_dim, + padding_idx=pad_index, + _weight=weight, + ) + +def load_transformer(weights_prefix): + """Create a PyTorch transformer with weights from LBANN. + + Weight files are assumed to have been created by the "dump + weights" callback in LBANN. They should be in the form + --Weights.txt. 
+ + """ + + # PyTorch transformer model + transformer = torch.nn.Transformer( + d_model=embed_dim, + nhead=num_heads, + num_encoder_layers=num_encoder_layers, + num_decoder_layers=num_decoder_layers, + dim_feedforward=filter_dim, + dropout=dropout, + ) + + # Set transformer to evaluation mode + transformer.eval() + + # Load weights for encoder + for i, layer in enumerate(transformer.encoder.layers): + + # Load weights for self-attention + attention = layer.self_attn + attention._qkv_same_embed_dim = False + prefix = f'{weights_prefix}-transformer_encoder{i}_attention' + attention.q_proj_weight = load_parameter(f'{prefix}_query_matrix-Weights.txt') + attention.q_proj_bias = load_parameter(f'{prefix}_query_bias-Weights.txt') + attention.k_proj_weight = load_parameter(f'{prefix}_key_matrix-Weights.txt') + attention.k_proj_bias = load_parameter(f'{prefix}_key_bias-Weights.txt') + attention.v_proj_weight = load_parameter(f'{prefix}_value_matrix-Weights.txt') + attention.v_proj_bias = load_parameter(f'{prefix}_value_bias-Weights.txt') + attention.out_proj_weight = load_parameter(f'{prefix}_output_matrix-Weights.txt') + attention.out_proj_bias = load_parameter(f'{prefix}_output_bias-Weights.txt') + + # Load weights for feedforward network + prefix = f'{weights_prefix}-transformer_encoder{i}' + layer.linear1.weight = load_parameter(f'{prefix}_fc1_matrix-Weights.txt') + layer.linear1.bias = load_parameter(f'{prefix}_fc1_bias-Weights.txt') + layer.linear2.weight = load_parameter(f'{prefix}_fc2_matrix-Weights.txt') + layer.linear2.bias = load_parameter(f'{prefix}_fc2_bias-Weights.txt') + + # Load weights for decoder + for i, layer in enumerate(transformer.decoder.layers): + + # Load weights for self-attention + attention = layer.self_attn + attention._qkv_same_embed_dim = False + prefix = f'{weights_prefix}-transformer_decoder{i}_attention1' + attention.q_proj_weight = load_parameter(f'{prefix}_query_matrix-Weights.txt') + attention.q_proj_bias = load_parameter(f'{prefix}_query_bias-Weights.txt') + attention.k_proj_weight = load_parameter(f'{prefix}_key_matrix-Weights.txt') + attention.k_proj_bias = load_parameter(f'{prefix}_key_bias-Weights.txt') + attention.v_proj_weight = load_parameter(f'{prefix}_value_matrix-Weights.txt') + attention.v_proj_bias = load_parameter(f'{prefix}_value_bias-Weights.txt') + attention.out_proj_weight = load_parameter(f'{prefix}_output_matrix-Weights.txt') + attention.out_proj_bias = load_parameter(f'{prefix}_output_bias-Weights.txt') + + # Load weights for attention with memory + attention = layer.multihead_attn + attention._qkv_same_embed_dim = False + prefix = f'{weights_prefix}-transformer_decoder{i}_attention2' + attention.q_proj_weight = load_parameter(f'{prefix}_query_matrix-Weights.txt') + attention.q_proj_bias = load_parameter(f'{prefix}_query_bias-Weights.txt') + attention.k_proj_weight = load_parameter(f'{prefix}_key_matrix-Weights.txt') + attention.k_proj_bias = load_parameter(f'{prefix}_key_bias-Weights.txt') + attention.v_proj_weight = load_parameter(f'{prefix}_value_matrix-Weights.txt') + attention.v_proj_bias = load_parameter(f'{prefix}_value_bias-Weights.txt') + attention.out_proj_weight = load_parameter(f'{prefix}_output_matrix-Weights.txt') + attention.out_proj_bias = load_parameter(f'{prefix}_output_bias-Weights.txt') + + # Load weights for feedforward network + prefix = f'{weights_prefix}-transformer_decoder{i}' + layer.linear1.weight = load_parameter(f'{prefix}_fc1_matrix-Weights.txt') + layer.linear1.bias = 
load_parameter(f'{prefix}_fc1_bias-Weights.txt') + layer.linear2.weight = load_parameter(f'{prefix}_fc2_matrix-Weights.txt') + layer.linear2.bias = load_parameter(f'{prefix}_fc2_bias-Weights.txt') + + return transformer + +# ---------------------------------------------- +# Evaluate transformer model +# ---------------------------------------------- + +def add_positional_encoding(x): + """Add positional encoding for transformer model.""" + sequence_length = x.shape[0] + embed_dim = x.shape[2] + encoding = np.zeros(x.shape, dtype=np.float32) + for i in range((embed_dim+1) // 2): + pos = np.arange(sequence_length).reshape(-1,1) + encoding[:,:,2*i] = np.sin(pos / 10000**(2*i/embed_dim)) + for i in range(embed_dim // 2): + pos = np.arange(sequence_length).reshape(-1,1) + encoding[:,:,2*i+1] = np.cos(pos / 10000**(2*i/embed_dim)) + return x + torch.from_numpy(encoding) + +def greedy_decode(tokens_en, embedding_layer, transformer, classifier): + """Generate sequence with transformer. + + Predict tokens one at a time by choosing the one that maximizes + the classification score. + + """ + + # Encode English sequence + embeddings_en = embedding_layer(tokens_en) + memory = transformer.encoder( + add_positional_encoding(embeddings_en * np.sqrt(embed_dim)) + ) + + # Decode German sequence + # TODO: Only perform compute for last sequence entry + # TODO: Detect EOS tokens and stop early + tokens_de = torch.full((1,tokens_en.shape[1]), bos_index, dtype=int) + for i in range(1, max_sequence_length): + embeddings_de = embedding_layer(tokens_de) + preds = transformer.decoder( + add_positional_encoding(embeddings_de * np.sqrt(embed_dim)), + memory, + tgt_mask=transformer.generate_square_subsequent_mask(i), + ) + preds = classifier(preds[-1,:,:]) + preds = preds.argmax(dim=1) + tokens_de = torch.cat([tokens_de, preds.reshape(1,-1)], dim=0) + return tokens_de + +def evaluate_transformer(weights_prefix): + """Evaluate transformer model with weights from LBANN. + + Weight files are assumed to have been created by the "dump + weights" callback in LBANN. They should be in the form + --Weights.txt. 
+ + """ + + # Load model weights from file + embedding_layer = load_embedding_layer(weights_prefix) + transformer = load_transformer(weights_prefix) + classifier = torch.nn.Linear(embed_dim, vocab_size, bias=False) + classifier.weight = embedding_layer.weight + + # Evaluate model + bleu_scores = [] + for batch, index_start in enumerate(range(0, num_samples, mini_batch_size)): + index_end = min(index_start+mini_batch_size, num_samples) + indices = list(range(index_start, index_end)) + batch_size = len(indices) + + # Translate English sequence to German + # TODO: Decoding with beam search + tokens_en, true_tokens_de = get_batch(indices) + pred_tokens_de = greedy_decode( + tokens_en, + embedding_layer, + transformer, + classifier, + ) + + # Compute BLEU score + for i in range(batch_size): + hypothesis = dataset.detokenize(pred_tokens_de[:,i].numpy()) + reference = dataset.detokenize(true_tokens_de[:,i].numpy()) + bleu_scores.append( + torchnlp.metrics.get_moses_multi_bleu( + [hypothesis], + [reference], + ) + ) + + # Print results + print( + f'BLEU score: ' + f'mean={np.mean(bleu_scores)}, ' + f'stdev={np.std(bleu_scores)}, ' + f'min={np.min(bleu_scores)}, ' + f'max={np.max(bleu_scores)}' + ) + +# ---------------------------------------------- +# Command-line options if run as script +# ---------------------------------------------- + +if __name__ == "__main__": + + # Command-line arguments + parser = argparse.ArgumentParser() + parser.add_argument( + 'weights_prefix', type=str, + help='prefix for saved weights from LBANN') + args = parser.parse_args() + + # Evaluate model + evaluate_transformer(args.weights_prefix) diff --git a/applications/nlp/transformer/main.py b/applications/nlp/transformer/main.py new file mode 100644 index 00000000000..179fb59afa3 --- /dev/null +++ b/applications/nlp/transformer/main.py @@ -0,0 +1,94 @@ +"""Driver script for training Transformer example.""" +import argparse +import datetime +import math +import os +import os.path +import sys + +import lbann +import lbann.contrib.args + +# Local imports +current_dir = os.path.dirname(os.path.realpath(__file__)) +root_dir = os.path.dirname(current_dir) +sys.path.append(root_dir) +import train +import evaluate +import utils.paths + +# ---------------------------------------------- +# Options +# ---------------------------------------------- + +# Command-line arguments +parser = argparse.ArgumentParser() +lbann.contrib.args.add_scheduler_arguments(parser) +parser.add_argument( + '--job-name', action='store', default='lbann_transformer', type=str, + help='job name', metavar='NAME') +parser.add_argument( + '--mini-batch-size', action='store', default=256, type=int, + help='mini-batch size (default: 256)', metavar='NUM') +parser.add_argument( + '--num-epochs', action='store', default=20, type=int, + help='number of epochs (default: 20)', metavar='NUM') +parser.add_argument( + '--num-attention-heads', action='store', default=8, type=int, + help='number of parallel attention layers (default: 8)', metavar='NUM') +parser.add_argument( + '--embed-dim', action='store', default=512, type=int, + help='embedding space dimensions (default: 512)', metavar='NUM') +args = parser.parse_args() + +# Hard-coded options +label_smoothing = 0.1 + +# ---------------------------------------------- +# Work directory +# ---------------------------------------------- + +timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') +work_dir = os.path.join( + utils.paths.root_dir(), + 'experiments', + f'{timestamp}_{args.job_name}', +) 
+os.makedirs(work_dir, exist_ok=True) + +# ---------------------------------------------- +# Train +# ---------------------------------------------- + +# Create batch script +trainer_params = { + 'mini_batch_size': args.mini_batch_size, +} +model_params = { + 'num_epochs': args.num_epochs, + 'embed_dim': args.embed_dim, + 'num_heads': args.num_attention_heads, + 'label_smoothing': label_smoothing, +} +script_params = lbann.contrib.args.get_scheduler_kwargs(args) +script_params['work_dir'] = work_dir +script_params['job_name'] = args.job_name +train_script = train.make_batch_script( + trainer_params=trainer_params, + model_params=model_params, + script_params=script_params, +) +weights_prefix = os.path.join( + work_dir, + 'weights', + f'model0-epoch{args.num_epochs-1}', +) +train_script.add_command( + f'# python3 {utils.paths.root_dir()}/transformer/evaluate.py {weights_prefix}' +) +train_script.run(overwrite=True) + +# ---------------------------------------------- +# Evaluate +# ---------------------------------------------- +evaluate.evaluate_transformer(weights_prefix) diff --git a/applications/nlp/transformer/train.py b/applications/nlp/transformer/train.py new file mode 100644 index 00000000000..45c2eb3e237 --- /dev/null +++ b/applications/nlp/transformer/train.py @@ -0,0 +1,230 @@ +"""Configure LBANN experiment with Transformer model.""" +import math +import os.path + +import lbann +import lbann.models +import lbann.contrib.launcher +from lbann.util import str_list + +import dataset + +# ---------------------------------------------- +# Options +# ---------------------------------------------- + +# Dataset properties +vocab_size = dataset.vocab_size() +sequence_length = dataset.sequence_length +pad_index = dataset.pad_index + +# ---------------------------------------------- +# Model +# ---------------------------------------------- + +def make_model( + num_epochs, + embed_dim, + num_heads, + label_smoothing, +): + + # Embedding weights + var = 2 / (embed_dim + vocab_size) # Glorot initialization + embedding_weights = lbann.Weights( + name='embeddings', + initializer=lbann.NormalInitializer(standard_deviation=math.sqrt(var)), + ) + + # Input is two sequences of token IDs + input_ = lbann.Identity(lbann.Input()) + + # Get sequences of embedding vectors + # Note: Scale embeddings by sqrt(embed_dim). + # Note: Decoder input is shifted right, so embedding for last + # token isn't needed. 
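+    # For example, with the default sequence_length=64 each sample holds 128
+    # token IDs: positions [0, 64) are the English (encoder) sequence and
+    # [64, 128) are the German (decoder) sequence, so slicing at
+    # 2*sequence_length-1 below keeps everything except the final decoder token.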
+ embeddings_tokens = lbann.Identity(lbann.Slice( + input_, + axis=0, + slice_points=str_list([0, 2*sequence_length-1]), + )) + embeddings = lbann.Embedding( + embeddings_tokens, + weights=embedding_weights, + num_embeddings=vocab_size, + embedding_dim=embed_dim, + padding_idx=pad_index, + ) + embeddings = lbann.WeightedSum( + embeddings, + scaling_factors=str(math.sqrt(embed_dim)), + ) + embeddings_slice = lbann.Slice( + embeddings, + axis=0, + slice_points=str_list([0, sequence_length, 2*sequence_length-1]), + ) + encoder_input = lbann.Identity(embeddings_slice) + decoder_input = lbann.Identity(embeddings_slice) + + # Apply transformer model + transformer = lbann.models.Transformer( + hidden_size=embed_dim, + num_heads=num_heads, + name='transformer', + ) + result = transformer( + encoder_input, sequence_length, + decoder_input, sequence_length-1, + ) + + # Reconstruct decoder input + preds = lbann.ChannelwiseFullyConnected( + result, + weights=embedding_weights, + output_channel_dims=[vocab_size], + bias=False, + transpose=True, + ) + preds = lbann.ChannelwiseSoftmax(preds) + preds = lbann.Slice(preds, axis=0, slice_points=str_list(range(sequence_length))) + preds = [lbann.Identity(preds) for _ in range(sequence_length-1)] + + # Count number of non-pad tokens + label_tokens = lbann.Identity(lbann.Slice( + input_, + slice_points=str_list([sequence_length+1, 2*sequence_length]), + )) + pads = lbann.Constant(value=pad_index, num_neurons=str(sequence_length-1)) + is_not_pad = lbann.NotEqual(label_tokens, pads) + num_not_pad = lbann.Reduction(is_not_pad, mode='sum') + + # Cross entropy loss with label smoothing + label_tokens = lbann.Slice( + label_tokens, + slice_points=str_list(range(sequence_length)), + ) + label_tokens = [lbann.Identity(label_tokens) for _ in range(sequence_length-1)] + if label_smoothing > 0: + uniform_label = lbann.Constant( + value=1/vocab_size, + num_neurons=str_list([1, vocab_size]) + ) + loss = [] + for i in range(sequence_length-1): + label = lbann.OneHot(label_tokens[i], size=vocab_size) + label = lbann.Reshape(label, dims=str_list([1, vocab_size])) + if label_smoothing > 0: + label = lbann.WeightedSum( + label, + uniform_label, + scaling_factors=str_list([1-label_smoothing, label_smoothing]), + ) + loss.append(lbann.CrossEntropy(preds[i], label)) + loss = lbann.Concatenation(loss) + + # Average cross entropy over non-pad tokens + loss_scales = lbann.Divide( + is_not_pad, + lbann.Tessellate(num_not_pad, hint_layer=is_not_pad), + ) + loss = lbann.Multiply(loss, loss_scales) + loss = lbann.Reduction(loss, mode='sum') + + # Construct model + metrics = [] + callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer()] + return lbann.Model( + num_epochs, + layers=lbann.traverse_layer_graph(input_), + objective_function=loss, + metrics=metrics, + callbacks=callbacks, + ) + +# ---------------------------------------------- +# Data reader +# ---------------------------------------------- + +def make_data_reader(): + reader = lbann.reader_pb2.DataReader() + _reader = reader.reader.add() + _reader.name = 'python' + _reader.role = 'train' + _reader.shuffle = True + _reader.percent_of_data_to_use = 1.0 + _reader.python.module = 'dataset' + _reader.python.module_dir = os.path.dirname(os.path.realpath(__file__)) + _reader.python.sample_function = 'get_train_sample' + _reader.python.num_samples_function = 'num_train_samples' + _reader.python.sample_dims_function = 'sample_dims' + return reader + +# ---------------------------------------------- +# Batch script +# 
---------------------------------------------- + +def make_batch_script(trainer_params, model_params, script_params): + + # Create LBANN objects + trainer = lbann.Trainer(mini_batch_size=trainer_params.mini_batch_size) + model = make_model(**model_params) + reader = make_data_reader() + + # Optimizer with learning rate schedule + # Note: Rough approximation of + # embed_dim^-0.5 * min(step^-0.5, step*warmup^-1.5) + # with embed_dim=512 and warmup=4000. + opt = lbann.Adam(learn_rate=0.0001, beta1=0.9, beta2=0.98, eps=1e-9) + model.callbacks.append( + lbann.CallbackDropFixedLearningRate( + drop_epoch=[1], + amt=2, + ) + ) + model.callbacks.append( + lbann.CallbackDropFixedLearningRate( + drop_epoch=[2,4,8,12], + amt=0.75, + ) + ) + + # Checkpoint after every epoch + trainer.callbacks.append( + lbann.CallbackCheckpoint( + checkpoint_dir=os.path.join(script_params['work_dir'], 'checkpoint'), + checkpoint_epochs=1, + ) + ) + + # Dump weights after every epoch + model.callbacks.append( + lbann.CallbackDumpWeights( + basename=os.path.join(script_params['work_dir'], 'weights'), + epoch_interval=1, + ) + ) + + # Create Protobuf file + protobuf_file = os.path.join(script_params['work_dir'], 'experiment.prototext') + lbann.proto.save_prototext( + protobuf_file, + trainer=trainer, + model=model, + data_reader=reader, + optimizer=opt, + ) + + # Create batch script + script = lbann.contrib.launcher.make_batch_script( + **script_params, + ) + script.add_command('echo "Started training at $(date)"') + script.add_parallel_command([ + lbann.lbann_exe(), + f'--prototext={protobuf_file}', + ]) + script.add_command('status=$?') + script.add_command('echo "Finished training at $(date)"') + script.add_command('exit ${status}') + return script diff --git a/applications/nlp/utils/__init__.py b/applications/nlp/utils/__init__.py new file mode 100644 index 00000000000..29d6d9d3e7f --- /dev/null +++ b/applications/nlp/utils/__init__.py @@ -0,0 +1,17 @@ +import collections.abc + +def make_iterable(obj): + """Convert to an iterable object. + + Simply returns `obj` if it is alredy iterable. Otherwise returns a + 1-tuple containing `obj`. + + """ + if isinstance(obj, collections.abc.Iterable) and not isinstance(obj, str): + return obj + else: + return (obj,) + +def str_list(it): + """Convert an iterable object to a space-separated string.""" + return ' '.join([str(i) for i in make_iterable(it)]) diff --git a/applications/nlp/utils/gutenberg.py b/applications/nlp/utils/gutenberg.py new file mode 100644 index 00000000000..42d5d6da16d --- /dev/null +++ b/applications/nlp/utils/gutenberg.py @@ -0,0 +1,146 @@ +"""Helper functions for text data from Project Gutenberg.""" +import array +import os +import os.path +import re +import urllib.request +import numpy as np + + +def get_url(name): + """URL to Project Gutenberg text file.""" + urls = { + 'frankenstein': 'https://www.gutenberg.org/files/84/84-0.txt', + 'shakespeare': 'https://www.gutenberg.org/files/100/100-0.txt', + } + return urls[name.lower()] + + +def strip_boilerplate(raw_file, stripped_file): + """Remove header and footer from Project Gutenberg text file. + + See: + + https://www.gutenberg.org/wiki/Gutenberg:Project_Gutenberg_Header_How-To + + Args: + raw_file (str): Text file downloaded from Project Gutenberg. + stripped_file (str): Path where the stripped file will be + saved. 
+ + """ + with open(raw_file, 'r') as in_file, \ + open(stripped_file, 'w') as out_file: + started = False + begin_regex = re.compile('^\*\*\* START OF THIS PROJECT GUTENBERG EBOOK .* \*\*\*$') + end_regex = re.compile('^\*\*\* END OF THIS PROJECT GUTENBERG EBOOK .* \*\*\*$') + for line in in_file: + if started: + if end_regex.match(line): + break + else: + out_file.write(line) + elif begin_regex.match(line): + started = True + + +def tokenize(text_file, + encoded_file=None, + vocab_file=None, + ignore_whitespace=True): + """Convert text file to sequence of token IDs. + + Tokenization is performed with BERT tokenizer. + + Args: + text_file (str): Text file to be encoded. + encoded_file (str, optional): If provided, path where the + encoded data will be saved as an .npz file. The sequence of + token IDs is saved as 'encoded_data' and the vocabulary + size is saved as 'vocab_size'. + vocab_file (str, optional): If provided, path where the + vocabulary will be saved as a text file. + ignore_whitespace (bool, optional): Whether to ignore text + lines that are purely made of whitespace (default: True). + + Returns: + array of int: Sequence of token IDs. + int: Number of tokens in vocabulary. + + """ + + # Get BERT tokenizer from Transformers + import transformers + tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased') + vocab_size = tokenizer.vocab_size + if vocab_file: + tokenizer.save_vocabulary(vocab_file) + + # Apply tokenizer to text file + encoded_data = array.array('l') + with open(text_file) as f: + for line in f: + if ignore_whitespace and line.isspace(): + continue + encoded_data.extend(tokenizer.encode(line)) + if encoded_file: + np.savez_compressed(encoded_file, + encoded_data=encoded_data, + vocab_size=vocab_size) + return encoded_data, vocab_size + + +class GutenbergCorpus(): + """Tokenized text from Project Gutenberg. + + Args: + data_dir (str): Directory for downloading data and + intermediate. + data_url (str): URL to Project Gutenberg text file. + + Attributes: + token_data (array of int): Sequence of token IDs. + vocab_size (int): Number of tokens in vocabulary. + + """ + def __init__(self, data_dir, data_url): + + # Create data directory if needed + if not os.path.isdir(data_dir): + os.makedirs(data_dir) + data_dir = os.path.realpath(data_dir) + + # Load tokenized data + # Note: If needed, download the text data from Project + # Gutenberg and tokenize it. 
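+        # Files cached in data_dir: raw.txt (downloaded text), text_data.txt
+        # (boilerplate stripped), token_data.npz (token IDs and vocab size),
+        # and vocab.txt (tokenizer vocabulary).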
+        token_data_file = os.path.join(data_dir, 'token_data.npz')
+        if os.path.isfile(token_data_file):
+            data = np.load(token_data_file)
+            token_data = data['encoded_data']
+            vocab_size = int(data['vocab_size'])
+        else:
+            text_data_file = os.path.join(data_dir, 'text_data.txt')
+            if not os.path.isfile(text_data_file):
+                raw_file = os.path.join(data_dir, 'raw.txt')
+                if not os.path.isfile(raw_file):
+                    urllib.request.urlretrieve(data_url,
+                                               filename=raw_file)
+                strip_boilerplate(raw_file, text_data_file)
+            vocab_file = os.path.join(data_dir, 'vocab.txt')
+            token_data, vocab_size = tokenize(text_data_file,
+                                              token_data_file,
+                                              vocab_file)
+
+        # Class members
+        self.token_data = token_data
+        self.vocab_size = vocab_size
+
+    def __iter__(self):
+        """Iterator through token IDs."""
+        return self.token_data.__iter__()
+    def __getitem__(self, key):
+        """Get token ID."""
+        return self.token_data.__getitem__(key)
+    def __len__(self):
+        """Get total number of tokens in corpus."""
+        return self.token_data.__len__()
diff --git a/applications/nlp/utils/paths.py b/applications/nlp/utils/paths.py
new file mode 100644
index 00000000000..70d968230c5
--- /dev/null
+++ b/applications/nlp/utils/paths.py
@@ -0,0 +1,44 @@
+"""Useful file paths."""
+import os
+import os.path
+import re
+import socket
+
+
+def system():
+    """Name of current compute system.
+
+    Primarily used to detect LLNL LC systems.
+
+    """
+    return re.sub(r'\d+', '', socket.gethostname())
+
+
+def root_dir():
+    """Root directory for LBANN NLP application."""
+    return os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
+
+
+def wmt_dir(system=system()):
+    """Data directory for the WMT 2014 dataset.
+
+    See https://pytorchnlp.readthedocs.io/en/latest/source/torchnlp.datasets.html#torchnlp.datasets.wmt_dataset.
+
+    The dataset has already been downloaded on LLNL LC systems and is
+    available to anyone in the "brainusr" group. If the dataset is not
+    accessible, a path within the application directory is returned.
+
+    """
+
+    # Cached datasets on LC systems
+    path = None
+    if system in ('lassen', 'sierra'):
+        path = '/p/gpfs1/brainusr/datasets/wmt16_en_de'
+    elif system in ('pascal', 'catalyst', 'quartz', 'surface'):
+        path = '/p/lscratchh/brainusr/datasets/wmt16_en_de'
+
+    # Default path if cached dataset isn't available
+    if not path or not os.access(path, os.R_OK):
+        path = os.path.join(root_dir(), 'data', 'wmt16_en_de')
+
+    return path
diff --git a/applications/physics/ICF/README.md b/applications/physics/ICF/README.md
new file mode 100644
index 00000000000..7f134908581
--- /dev/null
+++ b/applications/physics/ICF/README.md
@@ -0,0 +1,93 @@
+## Surrogate Models for Inertial Confinement Fusion
+
+The scripts in this directory implement surrogate deep learning models that bridge simulation and experimental datasets from an inertial confinement fusion (high energy physics) application. Please see [link](https://github.com/rushilanirudh/macc) for more details on the model architectures and the dataset. Also, see the LBANN documentation for how to install, build, and run LBANN.
+ +### Pre-train Wasserstein autoencoder (WAE) +```bash +python3 pre_train_jag_wae.py +``` +### Train surrogate model using pre-trained WAE +```bash +python3 train_macc_surrogate.py +``` +Expected output of pre-training WAE in LBANN (90K training, 10K validation dataset, on a single LLNL Pascal GPU) is shown: +``` +-------------------------------------------------------------------------------- + +[0] Epoch : stats formated [tr/v/te] iter/epoch = [719/80/79] + + global MB = [ 128/ 128/ 128] global last MB = [ 33 / 103 / 16 ] + + local MB = [ 128/ 128/ 128] local last MB = [ 33+0/ 103+0/ 16+0] + +-------------------------------------------------------------------------------- + +model0 (instance 0) training epoch 0 objective function : 940.059 + +model0 (instance 0) training epoch 0 recon_error : 0.0572849 + +model0 (instance 0) training epoch 0 run time : 5.74964s + +model0 (instance 0) validation objective function : 34.3421 + +model0 (instance 0) validation recon_error : 0.00208194 + +model0 (instance 0) validation run time : 0.494716s + +-------------------------------------------------------------------------------- + +[1] Epoch : stats formated [tr/v/te] iter/epoch = [719/80/79] + + global MB = [ 128/ 128/ 128] global last MB = [ 33 / 103 / 16 ] + + local MB = [ 128/ 128/ 128] local last MB = [ 33+0/ 103+0/ 16+0] + +-------------------------------------------------------------------------------- + +model0 (instance 0) training epoch 1 objective function : 22.2183 + +model0 (instance 0) training epoch 1 recon_error : 0.00134448 + +model0 (instance 0) training epoch 1 run time : 5.53825s + +model0 (instance 0) validation objective function : 11.6158 + +model0 (instance 0) validation recon_error : 0.000693222 + +model0 (instance 0) validation run time : 0.317699s + +-------------------------------------------------------------------------------- + +[2] Epoch : stats formated [tr/v/te] iter/epoch = [719/80/79] + + global MB = [ 128/ 128/ 128] global last MB = [ 33 / 103 / 16 ] + + local MB = [ 128/ 128/ 128] local last MB = [ 33+0/ 103+0/ 16+0] + +-------------------------------------------------------------------------------- + +model0 (instance 0) training epoch 2 objective function : 9.18846 + +model0 (instance 0) training epoch 2 recon_error : 0.000554316 + +model0 (instance 0) training epoch 2 run time : 5.69306s + +model0 (instance 0) validation objective function : 6.96061 + +model0 (instance 0) validation recon_error : 0.00039013 + +model0 (instance 0) validation run time : 0.315543s + +``` + +### Visual Outputs +##### Ground Truth Images + +![alt text](lbann_gt_img.png) + +##### Predicted Images +![alt text](lbann_pred_img.png) + +##### Ground Truth and Predicted Scalars +![alt text](lbann_gt_pred_sca.png) + diff --git a/applications/physics/ICF/check_all_scalar.py b/applications/physics/ICF/check_all_scalar.py new file mode 100644 index 00000000000..cf1c068a5e2 --- /dev/null +++ b/applications/physics/ICF/check_all_scalar.py @@ -0,0 +1,25 @@ +import numpy as np +import sys +import glob + +#Check if there are scalars with all zero values +#Input is scalar values dumped from LBANN input layer +fdir = sys.argv[1] +epoch = sys.argv[2] +print(fdir) +scalar_files = glob.glob(fdir+"*training-epoch"+str(epoch)+"*gt_sca*.npy") +scalar_jag = np.load(scalar_files[0]) +print("First JAG param shape " , scalar_jag.shape) +print("param jag ", scalar_jag) +for i, f in enumerate(scalar_files): + if(i > 0) : + scalar_jag = np.concatenate((scalar_jag, np.load(f))) + +print("Final JAG param shape " 
, scalar_jag.shape) + + +num_cols = scalar_jag.shape[1] +print("Num cols ", num_cols) + +zeros = np.where(np.all(np.isclose(scalar_jag, 0), axis=1)) +print("Num of zerors ", zeros[0].shape , " ", zeros) diff --git a/applications/physics/ICF/eval_macc_surrogate.py b/applications/physics/ICF/eval_macc_surrogate.py new file mode 100644 index 00000000000..53f6665c687 --- /dev/null +++ b/applications/physics/ICF/eval_macc_surrogate.py @@ -0,0 +1,218 @@ +import macc_models +import argparse +import os +from os.path import abspath, dirname, join +import google.protobuf.text_format as txtf +import lbann.contrib.launcher +import lbann.contrib.args +from lbann.util import str_list +import datetime + +# ============================================== +# Setup and launch experiment +# ============================================== + +# Default data reader +cur_dir = dirname(abspath(__file__)) +data_reader_prototext = join(dirname(cur_dir), + 'data', + 'jag_conduit_reader.prototext') +metadata_prototext = join(dirname(cur_dir), + 'data', + 'jag_100M_metadata.prototext') + +# Initialize LBANN inf executable +lbann_exe = abspath(lbann.lbann_exe()) +lbann_exe = join(dirname(lbann_exe), 'lbann_inf') + +# Command-line arguments +parser = argparse.ArgumentParser() +lbann.contrib.args.add_scheduler_arguments(parser) +parser.add_argument( + '--job-name', action='store', default='eval', type=str, + help='job name', metavar='NAME') +parser.add_argument( + '--mini-batch-size', action='store', default=4096, type=int, + help='mini-batch size (default: 128)', metavar='NUM') +parser.add_argument( + '--num-nodes', action='store', default=4, type=int, + help='number of nodes (default: 4)', metavar='NUM') +parser.add_argument( + '--ppn', action='store', default=4, type=int, + help='processes per node (default: 4)', metavar='NUM') +parser.add_argument( + '--ydim', action='store', default=16399, type=int, + help='image+scalar dim (default: 64*64*4+15=16399)', metavar='NUM') +parser.add_argument( + '--zdim', action='store', default=20, type=int, + help='latent space dim (default: 20)', metavar='NUM') +parser.add_argument( + '--xdim', action='store', default=5, type=int, + help='input (x) dim (default: 5)', metavar='NUM') +parser.add_argument( + '--lamda-cyc', action='store', default=1e-3, type=float, + help='lamda-cyc (default: 1e-3)', metavar='NUM') +parser.add_argument( + '--useCNN', action='store', default=False, type=bool, + help='use CNN', metavar='BOOL') +parser.add_argument( + '--data-filedir-train', action='store', default='/p/gpfs1/brainusr/datasets/10MJAG/1M_A/', type=str, + help='data filedir (default train dir is 10MJAG/1M_A)', metavar='NAME') +parser.add_argument( + '--data-filedir-test', action='store', default='/p/gpfs1/brainusr/datasets/10MJAG/1M_B/', type=str, + help='data filedir (default test dir is 10MJAG/1M_B)', metavar='NAME') +parser.add_argument( + '--index-list-train', action='store', default='index_eight.txt', type=str, + help='index list (default index_eight 8 samples)', metavar='NAME') +parser.add_argument( + '--index-list-test', action='store', default='t2_index.txt', type=str, + help='index list (default index.txt)', metavar='NAME') +parser.add_argument( + '--percent-of-data-to-use', action='store', default=0.01, type=float, + help='percent of data to use (default: 0.01)', metavar='NUM') +parser.add_argument( + '--dump-outputs', action='store', default='dump_outs', type=str, + help='dump outputs dir (default: jobdir/dump_outs)', metavar='NAME') +parser.add_argument( + '--pretrained-dir', 
action='store', default=None, type=str, + help='pretrained WAE surrogate dir (default: ' ')', metavar='NAME') +parser.add_argument( + '--procs-per-trainer', action='store', default=0, type=int, + help='processes per trainer (default: 0)', metavar='NUM') +args = parser.parse_args() + +print("Pretrained dir ", args.pretrained_dir) +assert args.pretrained_dir, "evaluate script asssumes a pretrained MaCC model" + +def list2str(l): + return ' '.join(l) + +def construct_model(): + """Construct MACC surrogate model. + + See https://arxiv.org/pdf/1912.08113.pdf model architecture and other details + + """ + import lbann + + # Layer graph + input = lbann.Input(target_mode='N/A',name='inp_data') + # data is 64*64*4 images + 15 scalar + 5 param + inp_slice = lbann.Slice(input, axis=0, slice_points=str_list([0,args.ydim,args.ydim+args.xdim]),name='inp_slice') + gt_y = lbann.Identity(inp_slice,name='gt_y') + gt_x = lbann.Identity(inp_slice, name='gt_x') #param not used + + zero = lbann.Constant(value=0.0,num_neurons='1',name='zero') + one = lbann.Constant(value=1.0,num_neurons='1',name='one') + + + z = lbann.Gaussian(mean=0.0,stdev=1.0, neuron_dims="20") + wae = macc_models.MACCWAE(args.zdim,args.ydim,use_CNN=args.useCNN) #pretrained, freeze + inv = macc_models.MACCInverse(args.xdim) + fwd = macc_models.MACCForward(args.zdim) + + + y_pred_fwd = wae.encoder(gt_y) + + param_pred_ = wae.encoder(gt_y) + input_fake = inv(param_pred_) + + output_cyc = fwd(input_fake) + y_image_re2 = wae.decoder(output_cyc) + + '''**** Train cycleGAN input params <--> latent space of (images, scalars) ****''' + output_fake = fwd(gt_x) + y_image_re = wae.decoder(output_fake) + + y_out = wae.decoder(y_pred_fwd) + + param_pred2_ = wae.encoder(y_image_re) + input_cyc = inv(param_pred2_) + + L_l2_x = lbann.MeanSquaredError(input_fake,gt_x) #(x,inv(enc(y)), (encoder+)inverse loss + L_cyc_x = lbann.MeanSquaredError(input_cyc,gt_x) #param, x cycle loss, from latent space + + L_l2_y = lbann.MeanSquaredError(output_fake,y_pred_fwd) #pred error into latent space (enc(y),fw(x)) + L_cyc_y = lbann.MeanSquaredError(output_cyc,y_pred_fwd) # pred error into latent space (enc(y), fw(inv(enc(y)))) + + + #@todo slice here to separate scalar from image + img_sca_loss = lbann.MeanSquaredError(y_image_re,gt_y) # (y,dec(fw(x))) #forward model to decoder, no latent space + dec_fw_inv_enc_y = lbann.MeanSquaredError(y_image_re2,gt_y) #(y, dec(fw(inv(enc(y))))) y->enc_z->x'->fw_z->y' + wae_loss = lbann.MeanSquaredError(y_out,gt_y) #(y, dec(enc(y)) ' + #L_cyc = L_cyc_y + L_cyc_x + L_cyc = lbann.Add(L_cyc_y, L_cyc_x) + + #loss_gen0 = L_l2_y + lamda_cyc*L_cyc + loss_gen0 = lbann.WeightedSum([L_l2_y,L_cyc], scaling_factors=f'1 {args.lamda_cyc}') + loss_gen1 = lbann.WeightedSum([L_l2_x,L_cyc_y], scaling_factors=f'1 {args.lamda_cyc}') + #loss_gen1 = L_l2_x + lamda_cyc*L_cyc_y + + + conc_out = lbann.Concatenation([gt_x,wae_loss,img_sca_loss,dec_fw_inv_enc_y, + L_l2_x], name='x_errors') + layers = list(lbann.traverse_layer_graph(input)) + weights = set() + for l in layers: + weights.update(l.weights) + + # Setup objective function + obj = lbann.ObjectiveFunction([loss_gen0,loss_gen1]) + # Initialize check metric callback + metrics = [lbann.Metric(img_sca_loss, name='img_re1'), + lbann.Metric(dec_fw_inv_enc_y, name='img_re2'), + lbann.Metric(wae_loss, name='wae_loss'), + lbann.Metric(L_l2_x, name='inverse loss'), + lbann.Metric(L_cyc_y, name='output cycle loss'), + lbann.Metric(L_cyc_x, name='param cycle loss')] + + callbacks = [lbann.CallbackPrint(), + 
lbann.CallbackDumpOutputs(layers=f'{conc_out.name}', + execution_modes='test', + directory=args.dump_outputs, + batch_interval=1, + format='npy'), + lbann.CallbackTimer()] + + # Construct model + num_epochs =1 + return lbann.Model(num_epochs, + weights=weights, + layers=layers, + serialize_io=True, + metrics=metrics, + objective_function=obj, + callbacks=callbacks) + + +if __name__ == '__main__': + + trainer = lbann.Trainer(mini_batch_size=args.mini_batch_size, + procs_per_trainer=args.procs_per_trainer) + model = construct_model() + # Setup optimizer + opt = lbann.Adam(learn_rate=0.0001,beta1=0.9,beta2=0.99,eps=1e-8) + # Load data reader from prototext + data_reader_proto = lbann.lbann_pb2.LbannPB() + with open(data_reader_prototext, 'r') as f: + txtf.Merge(f.read(), data_reader_proto) + data_reader_proto = data_reader_proto.data_reader + + kwargs = lbann.contrib.args.get_scheduler_kwargs(args) + status = lbann.contrib.launcher.run(trainer,model, data_reader_proto, opt, + lbann_exe, + scheduler='lsf', + partition='pdebug', + nodes=args.num_nodes, + procs_per_node=args.ppn, + time_limit=30, + setup_only=False, + batch_job=False, + job_name=args.job_name, + lbann_args=['--preload_data_store --use_data_store --load_model_weights_dir_is_complete', + f'--metadata={metadata_prototext}', + f'--load_model_weights_dir={args.pretrained_dir}', + f'--index_list_test={args.index_list_test}', + f'--data_filedir_test={args.data_filedir_test}'], + **kwargs) + print(status) diff --git a/applications/physics/ICF/jag_models.py b/applications/physics/ICF/jag_models.py new file mode 100644 index 00000000000..47399bee33f --- /dev/null +++ b/applications/physics/ICF/jag_models.py @@ -0,0 +1,71 @@ +import lbann +import lbann.modules.base + + +class WAE(lbann.modules.Module): + + global_count = 0 # Static counter, used for default names + + def __init__(self, encoder_out_dim, decoder_out_dim, name=None): + self.instance = 0 + self.name = (name if name + else 'wae{0}'.format(WAE.global_count)) + + fc = lbann.modules.FullyConnectedModule + disc_neurons = [128,64,1] + encoder_neurons = [32,256,128] + decoder_neurons = [64,128,256] + + #Encoder + self.enc_fc0 = fc(encoder_neurons[0],activation=lbann.Elu,name=self.name+'_enc_fc0') + self.enc_fc1 = fc(encoder_neurons[1],activation=lbann.Tanh,name=self.name+'_enc_fc1') + self.enc_fc2 = fc(encoder_neurons[2],activation=lbann.Tanh,name=self.name+'_enc_fc2') + self.enc_out = fc(encoder_out_dim,name='enc_out') + + #Decoder + self.dec_fc0 = fc(decoder_neurons[0],activation=lbann.Elu,name=self.name+'_dec_fc0') + self.dec_fc1 = fc(decoder_neurons[1],activation=lbann.Tanh,name=self.name+'_dec_fc1') + self.dec_fc2 = fc(decoder_neurons[2],activation=lbann.Tanh,name=self.name+'_dec_fc2') + self.dec_out = fc(decoder_out_dim,name='pred_y') + + #Discriminator1 + self.d0_fc0 = fc(disc_neurons[0],activation=lbann.Relu,name=self.name+'_disc0_fc0') + self.d0_fc1 = fc(disc_neurons[1],activation=lbann.Relu,name=self.name+'_disc0_fc1') + self.d0_fc2 = fc(disc_neurons[2],name=self.name+'_disc0_fc2') + + #Discriminator2 + #stacked_discriminator, this will be frozen, no optimizer, + #layer has to be named for replace layer callback + self.d1_fc0 = fc(disc_neurons[0],activation=lbann.Relu,name=self.name+'_disc1_fc0') + self.d1_fc1 = fc(disc_neurons[1],activation=lbann.Relu,name=self.name+'_disc1_fc1') + self.d1_fc2 = fc(disc_neurons[2],name=self.name+'_disc1_fc2') + + + def forward(self, z, y): + + z_sample = self.forward_encoder(y) + + y_recon = self.forward_decoder(z_sample) + + #d real/fake 
share weights, shared weights is copied to d_adv + #(through replace weight callback) and freeze + d_real = self.forward_discriminator0(lbann.Concatenation([y,z],axis=0)) + y_z_sample = lbann.Concatenation([y,z_sample],axis=0) + d_fake = self.forward_discriminator0(lbann.StopGradient(y_z_sample)) + d_adv = self.forward_discriminator1(y_z_sample) #freeze + + return d_real, d_fake, d_adv,y_recon + + def forward_encoder(self,y): + bn = lbann.BatchNormalization + return self.enc_out(bn(self.enc_fc2(bn(self.enc_fc1(bn(self.enc_fc0(y),epsilon=1e-3) + ),epsilon=1e-3)),epsilon=1e-3)) + + def forward_decoder(self,z): + return self.dec_out(self.dec_fc2(self.dec_fc1(self.dec_fc0(z)))) + + def forward_discriminator0(self,input): + return self.d0_fc2(self.d0_fc1(self.d0_fc0(input))) + + def forward_discriminator1(self,input): + return self.d1_fc2(self.d1_fc1(self.d1_fc0(input))) diff --git a/applications/physics/ICF/lbann_gt_img.png b/applications/physics/ICF/lbann_gt_img.png new file mode 100644 index 00000000000..c0cd09203f9 Binary files /dev/null and b/applications/physics/ICF/lbann_gt_img.png differ diff --git a/applications/physics/ICF/lbann_gt_pred_sca.png b/applications/physics/ICF/lbann_gt_pred_sca.png new file mode 100644 index 00000000000..961653aac4a Binary files /dev/null and b/applications/physics/ICF/lbann_gt_pred_sca.png differ diff --git a/applications/physics/ICF/lbann_pred_img.png b/applications/physics/ICF/lbann_pred_img.png new file mode 100644 index 00000000000..8395d408889 Binary files /dev/null and b/applications/physics/ICF/lbann_pred_img.png differ diff --git a/applications/physics/ICF/macc_models.py b/applications/physics/ICF/macc_models.py new file mode 100644 index 00000000000..bd6e9c9202d --- /dev/null +++ b/applications/physics/ICF/macc_models.py @@ -0,0 +1,156 @@ +import lbann +import lbann.modules.base + + +#Synonymous to fc_gen0 +class MACCForward(lbann.modules.Module): + + global_count = 0 # Static counter, used for default names + + def __init__(self, out_dim,name=None): + self.instance = 0 + self.name = (name if name + else 'macc_forward{0}'.format(MACCForward.global_count)) + + fc = lbann.modules.FullyConnectedModule + + #generator #fc2_gen0 + g_neurons = [32,256,1024] + self.gen_fc = [fc(g_neurons[i],activation=lbann.Relu, name=self.name+'gen_fc'+str(i)) + for i in range(len(g_neurons))] + self.predy = fc(out_dim,name=self.name+'pred_out') + + def forward(self,x): + return self.predy(self.gen_fc[2](self.gen_fc[1](self.gen_fc[0](x)))) + +#Synonymous to fc_gen1 +class MACCInverse(lbann.modules.Module): + + global_count = 0 # Static counter, used for default names + + def __init__(self, out_dim,name=None): + self.instance = 0 + self.name = (name if name + else 'macc_inverse{0}'.format(MACCInverse.global_count)) + + fc = lbann.modules.FullyConnectedModule + + #generator #fc_gen1 + g_neurons = [16,128,64] + self.gen_fc = [fc(g_neurons[i],activation=lbann.Relu, name=self.name+'gen_fc'+str(i)) + for i in range(len(g_neurons))] + self.predx = fc(out_dim,name=self.name+'pred_out') + + def forward(self,y): + return self.predx(self.gen_fc[2](self.gen_fc[1](self.gen_fc[0](y)))) + + +class MACCWAE(lbann.modules.Module): + + global_count = 0 # Static counter, used for default names + + def __init__(self, encoder_out_dim, decoder_out_dim, scalar_dim = 15, use_CNN=False, name=None): + self.instance = 0 + self.name = (name if name + else 'macc_wae{0}'.format(MACCWAE.global_count)) + + self.use_CNN = use_CNN + + fc = lbann.modules.FullyConnectedModule + conv = 
lbann.modules.Convolution2dModule + + disc_neurons = [128,64,1] + encoder_neurons = [32,256,128] + decoder_neurons = [64,128,256] + + enc_outc = [64,32,16] + dec_outc = [32,16,4] + + #Encoder + self.enc_fc0 = fc(encoder_neurons[0],activation=lbann.Elu,name=self.name+'_enc_fc0') + self.enc_fc1 = fc(encoder_neurons[1],activation=lbann.Tanh,name=self.name+'_enc_fc1') + self.enc_fc2 = fc(encoder_neurons[2],activation=lbann.Tanh,name=self.name+'_enc_fc2') + self.enc_out = fc(encoder_out_dim,name=self.name+'enc_out') + + #Decoder + self.dec_fc0 = fc(decoder_neurons[0],activation=lbann.Elu,name=self.name+'_dec_fc0') + self.dec_fc1 = fc(decoder_neurons[1],activation=lbann.Tanh,name=self.name+'_dec_fc1') + self.dec_fc2 = fc(decoder_neurons[2],activation=lbann.Tanh,name=self.name+'_dec_fc2') + self.dec_out = fc(decoder_out_dim,name=self.name+'pred_y') + + #Discriminator1 + self.d0_fc0 = fc(disc_neurons[0],activation=lbann.Relu,name=self.name+'_disc0_fc0') + self.d0_fc1 = fc(disc_neurons[1],activation=lbann.Relu,name=self.name+'_disc0_fc1') + self.d0_fc2 = fc(disc_neurons[2],name=self.name+'_disc0_fc2') + + #Discriminator2 + #stacked_discriminator, this will be frozen, no optimizer, + #layer has to be named for replace layer callback + self.d1_fc0 = fc(disc_neurons[0],activation=lbann.Relu,name=self.name+'_disc1_fc0') + self.d1_fc1 = fc(disc_neurons[1],activation=lbann.Relu,name=self.name+'_disc1_fc1') + self.d1_fc2 = fc(disc_neurons[2],name=self.name+'_disc1_fc2') + + #Encoder_CNN + self.enc_conv = [conv(enc_outc[i], 4, stride=2, padding=1, activation=lbann.Relu, + name=self.name+'_enc_conv'+str(i)) for i in range(len(enc_outc))] + + #Decoder_CNN + #Arxiv paper/PNAS configuration is D1: Dense(32,1024) + self.dec_cnn_fc = fc(16*8*8,activation=lbann.Relu,name=self.name+'_dec_cnn_fc') + self.dec_fc_sca = fc(scalar_dim, name=self.name+'_dec_sca_fc') + self.dec_convT = [conv(dec_outc[i], 4, stride=2, padding=1, + transpose=True, name=self.name+'_dec_conv'+str(i)) + for i in range(len(dec_outc))] + + def forward(self, z, y): + + z_sample = self.encoder(y) + + y_recon = self.decoder(z_sample) + + #d real/fake share weights, shared weights is copied to d_adv + #(through replace weight callback) and freeze + d_real = self.discriminator0(lbann.Concatenation([y,z],axis=0)) + y_z_sample = lbann.Concatenation([y,z_sample],axis=0) + d_fake = self.discriminator0(lbann.StopGradient(y_z_sample)) + d_adv = self.discriminator1(y_z_sample) #freeze + + return d_real, d_fake, d_adv,y_recon + + def encoder(self, y): + return self.encoder_cnn(y) if self.use_CNN else self.encoder_fc(y) + + def encoder_fc(self,y): + return self.enc_out(self.enc_fc2(self.enc_fc1(self.enc_fc0(y)))) + + def encoder_cnn(self,y): + img_sca = lbann.Slice(y, axis=0, slice_points="0 16384 16399", name=self.name+'_y_slice') + #assume C first, is data C first? 
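+        # Slice points: [0, 16384) is the image (4x64x64 = 16384 values,
+        # assumed channels-first for the reshape below) and [16384, 16399)
+        # holds the 15 scalar outputs.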
+ img = lbann.Reshape(img_sca, dims='4 64 64',name=self.name+'enc_reshape0') + x = self.enc_conv[2](self.enc_conv[1](self.enc_conv[0](img))) + x = lbann.Reshape(x, dims=str(16*8*8), name=self.name+'enc_reshape1') + h_stack = lbann.Concatenation([x,img_sca],axis=0) + z = self.enc_out(h_stack) + return z + + def decoder(self, z): + return self.decoder_cnn(z) if self.use_CNN else self.decoder_fc(z) + + def decoder_fc(self,z): + return self.dec_out(self.dec_fc2(self.dec_fc1(self.dec_fc0(z)))) + + def decoder_cnn(self,z): + x = self.dec_cnn_fc(z) + sca = self.dec_fc_sca(lbann.Identity(x)) + img = lbann.Reshape(lbann.Identity(x), dims="16 8 8", name=self.name+'dec_reshape0') + img = self.dec_convT[2](lbann.Relu(self.dec_convT[1](lbann.Relu(self.dec_convT[0](img))))) + #concat for common interface, slice in output + img = lbann.Reshape(img, dims=str(64*64*4), name=self.name+'dec_reshape1') #?? check tensor shape + #todo check that concat size == dec_out_dim + return lbann.Concatenation([img,sca],axis=0) + + def discriminator0(self,input): + return self.d0_fc2(self.d0_fc1(self.d0_fc0(input))) + + def discriminator1(self,input): + return self.d1_fc2(self.d1_fc1(self.d1_fc0(input))) diff --git a/applications/physics/ICF/pre_train_jag_wae.py b/applications/physics/ICF/pre_train_jag_wae.py new file mode 100644 index 00000000000..997e7a26a39 --- /dev/null +++ b/applications/physics/ICF/pre_train_jag_wae.py @@ -0,0 +1,172 @@ +import macc_models +import argparse +from os.path import abspath, dirname, join +import google.protobuf.text_format as txtf +import lbann.contrib.launcher +import lbann.contrib.args +from lbann.util import str_list + +# ============================================== +# Setup and launch experiment +# ============================================== + +# Default data reader +model_zoo_dir = dirname(dirname(abspath(__file__))) +data_reader_prototext = join(model_zoo_dir, + 'data', + 'jag_conduit_reader.prototext') +metadata_prototext = join(model_zoo_dir, + 'data', + 'jag_100M_metadata.prototext') + +# Command-line arguments +parser = argparse.ArgumentParser() +lbann.contrib.args.add_scheduler_arguments(parser) +parser.add_argument( + '--job-name', action='store', default='wae', type=str, + help='job name', metavar='NAME') +parser.add_argument( + '--mini-batch-size', action='store', default=128, type=int, + help='mini-batch size (default: 128)', metavar='NUM') +parser.add_argument( + '--num-epochs', action='store', default=100, type=int, + help='number of epochs (default: 100)', metavar='NUM') +parser.add_argument( + '--num-nodes', action='store', default=4, type=int, + help='number of nodes (default: 4)', metavar='NUM') +parser.add_argument( + '--ppn', action='store', default=4, type=int, + help='processes per node (default: 4)', metavar='NUM') +parser.add_argument( + '--ydim', action='store', default=16399, type=int, + help='image+scalar dim (default: 64*64*4+15=16399)', metavar='NUM') +parser.add_argument( + '--zdim', action='store', default=20, type=int, + help='latent space dim (default: 20)', metavar='NUM') +parser.add_argument( + '--useCNN', action='store', default=False, type=bool, + help='use CNN', metavar='BOOL') +parser.add_argument( + '--data-filedir-train', action='store', default='/p/gpfs1/brainusr/datasets/10MJAG/1M_A/', type=str, + help='data filedir (default train dir is 10MJAG/1M_A)', metavar='NAME') +parser.add_argument( + '--data-filedir-test', action='store', default='/p/gpfs1/brainusr/datasets/10MJAG/1M_B/', type=str, + help='data filedir (default test dir is 
10MJAG/1M_B)', metavar='NAME') +parser.add_argument( + '--index-list-train', action='store', default='index.txt', type=str, + help='index list (default index.txt)', metavar='NAME') +parser.add_argument( + '--index-list-test', action='store', default='t0_sample_list_multi_10K.txt', type=str, + help='index list (default t0_sample_list_multi_10K.txt, 100 samples)', metavar='NAME') +parser.add_argument( + '--dump-outputs', action='store', default='dump_outs', type=str, + help='dump outputs dir (default: jobdir/dump_outs)', metavar='NAME') +parser.add_argument( + '--dump-models', action='store', default='dump_models', type=str, + help='dump models dir (default: jobdir/dump_models)', metavar='NAME') +parser.add_argument( + '--procs-per-trainer', action='store', default=0, type=int, + help='processes per trainer (default: 0)', metavar='NUM') +args = parser.parse_args() + + +def list2str(l): + return ' '.join(l) + +def construct_model(): + """Construct LBANN model. + + JAG Wasserstein autoencoder model + + """ + import lbann + + # Layer graph + input = lbann.Input(target_mode='N/A', name='inp_data') + # data is 64*64*4 images + 15 scalar + 5 param + #inp_slice = lbann.Slice(input, axis=0, slice_points="0 16399 16404",name='inp_slice') + inp_slice = lbann.Slice(input, axis=0, slice_points=str_list([0,args.ydim,args.ydim+5]),name='inp_slice') + gt_y = lbann.Identity(inp_slice,name='gt_y') + gt_x = lbann.Identity(inp_slice, name='gt_x') #param not used + + zero = lbann.Constant(value=0.0,num_neurons='1',name='zero') + one = lbann.Constant(value=1.0,num_neurons='1',name='one') + + z_dim = 20 #Latent space dim + + z = lbann.Gaussian(mean=0.0,stdev=1.0, neuron_dims="20") + model = macc_models.MACCWAE(args.zdim,args.ydim,use_CNN=args.useCNN) + d1_real, d1_fake, d_adv, pred_y = model(z,gt_y) + + d1_real_bce = lbann.SigmoidBinaryCrossEntropy([d1_real,one],name='d1_real_bce') + d1_fake_bce = lbann.SigmoidBinaryCrossEntropy([d1_fake,zero],name='d1_fake_bce') + d_adv_bce = lbann.SigmoidBinaryCrossEntropy([d_adv,one],name='d_adv_bce') + img_loss = lbann.MeanSquaredError([pred_y,gt_y]) + rec_error = lbann.L2Norm2(lbann.WeightedSum([pred_y,gt_y], scaling_factors="1 -1")) + + layers = list(lbann.traverse_layer_graph(input)) + # Setup objective function + weights = set() + src_layers = [] + dst_layers = [] + for l in layers: + if(l.weights and "disc0" in l.name and "instance1" in l.name): + src_layers.append(l.name) + #freeze weights in disc2 + if(l.weights and "disc1" in l.name): + dst_layers.append(l.name) + for idx in range(len(l.weights)): + l.weights[idx].optimizer = lbann.NoOptimizer() + weights.update(l.weights) + l2_reg = lbann.L2WeightRegularization(weights=weights, scale=1e-4) + d_adv_bce = lbann.LayerTerm(d_adv_bce,scale=0.01) + obj = lbann.ObjectiveFunction([d1_real_bce,d1_fake_bce,d_adv_bce,img_loss,rec_error,l2_reg]) + # Initialize check metric callback + metrics = [lbann.Metric(img_loss, name='recon_error')] + #pred_y = macc_models.MACCWAE.pred_y_name + callbacks = [lbann.CallbackPrint(), + lbann.CallbackTimer(), + lbann.CallbackSaveModel(dir=args.dump_models), + lbann.CallbackReplaceWeights(source_layers=list2str(src_layers), + destination_layers=list2str(dst_layers), + batch_interval=2)] + + # Construct model + return lbann.Model(args.num_epochs, + serialize_io=True, + weights=weights, + layers=layers, + metrics=metrics, + objective_function=obj, + callbacks=callbacks) + + +if __name__ == '__main__': + import lbann + + trainer = lbann.Trainer(mini_batch_size=args.mini_batch_size, + 
procs_per_trainer=args.procs_per_trainer) + model = construct_model() + # Setup optimizer + opt = lbann.Adam(learn_rate=0.0001,beta1=0.9,beta2=0.99,eps=1e-8) + # Load data reader from prototext + data_reader_proto = lbann.lbann_pb2.LbannPB() + with open(data_reader_prototext, 'r') as f: + txtf.Merge(f.read(), data_reader_proto) + data_reader_proto = data_reader_proto.data_reader + + kwargs = lbann.contrib.args.get_scheduler_kwargs(args) + status = lbann.contrib.launcher.run(trainer,model, data_reader_proto, opt, + nodes=args.num_nodes, + procs_per_node=args.ppn, + time_limit=720, + setup_only=False, + job_name=args.job_name, + lbann_args=['--use_data_store --preload_data_store', + f'--metadata={metadata_prototext}', + f'--index_list_train={args.index_list_train}', + f'--index_list_test={args.index_list_test}', + f'--data_filedir_train={args.data_filedir_train}', + f'--data_filedir_test={args.data_filedir_test}'], + **kwargs) + print(status) diff --git a/applications/physics/ICF/train_jag_wae.py b/applications/physics/ICF/train_jag_wae.py new file mode 100644 index 00000000000..b6f41e8c493 --- /dev/null +++ b/applications/physics/ICF/train_jag_wae.py @@ -0,0 +1,107 @@ +import jag_models +from os.path import abspath, dirname, join +import google.protobuf.text_format as txtf + +# ============================================== +# Setup and launch experiment +# ============================================== + +# Default data reader +model_zoo_dir = dirname(dirname(abspath(__file__))) +data_reader_prototext = join(model_zoo_dir, + 'data', + 'jag_100Kdata.prototext') + + +def list2str(l): + return ' '.join(l) + +def construct_model(): + """Construct LBANN model. + + JAG Wasserstein autoencoder model + + """ + import lbann + + # Layer graph + input = lbann.Input(target_mode='N/A',name='inp_data') + # data is 64*64*4 images + 15 scalar + 5 param + inp_slice = lbann.Slice(input, axis=0, slice_points="0 16399 16404",name='inp_slice') + gt_y = lbann.Identity(inp_slice,name='gt_y') + gt_x = lbann.Identity(inp_slice, name='gt_x') #param not used + + zero = lbann.Constant(value=0.0,num_neurons='1',name='zero') + one = lbann.Constant(value=1.0,num_neurons='1',name='one') + + y_dim = 16399 #image+scalar shape + z_dim = 20 #Latent space dim + + z = lbann.Gaussian(mean=0.0,stdev=1.0, neuron_dims="20") + d1_real, d1_fake, d_adv, pred_y = jag_models.WAE(z_dim,y_dim)(z,gt_y) + + d1_real_bce = lbann.SigmoidBinaryCrossEntropy([d1_real,one],name='d1_real_bce') + d1_fake_bce = lbann.SigmoidBinaryCrossEntropy([d1_fake,zero],name='d1_fake_bce') + d_adv_bce = lbann.SigmoidBinaryCrossEntropy([d_adv,one],name='d_adv_bce') + + img_loss = lbann.MeanSquaredError([pred_y,gt_y]) + rec_error = lbann.L2Norm2(lbann.WeightedSum([pred_y,gt_y], scaling_factors="1 -1")) + + layers = list(lbann.traverse_layer_graph(input)) + # Setup objective function + weights = set() + src_layers = [] + dst_layers = [] + for l in layers: + if(l.weights and "disc0" in l.name and "instance1" in l.name): + src_layers.append(l.name) + #freeze weights in disc2 + if(l.weights and "disc1" in l.name): + dst_layers.append(l.name) + for idx in range(len(l.weights)): + l.weights[idx].optimizer = lbann.NoOptimizer() + weights.update(l.weights) + l2_reg = lbann.L2WeightRegularization(weights=weights, scale=1e-4) + d_adv_bce = lbann.LayerTerm(d_adv_bce,scale=0.01) + obj = lbann.ObjectiveFunction([d1_real_bce,d1_fake_bce,d_adv_bce,img_loss,rec_error,l2_reg]) + # Initialize check metric callback + metrics = [lbann.Metric(img_loss, name='recon_error')] + + 
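+    # Note: CallbackReplaceWeights below copies the trained discriminator
+    # weights ("disc0"/instance1 layers collected in src_layers) into the frozen
+    # "disc1" copies (dst_layers, set to NoOptimizer above) every 2 mini-batches,
+    # so d_adv_bce is computed against an up-to-date but non-trainable
+    # discriminator (analogous to discriminator.trainable=False in Keras).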
callbacks = [lbann.CallbackPrint(), + lbann.CallbackTimer(), + lbann.CallbackReplaceWeights(source_layers=list2str(src_layers), + destination_layers=list2str(dst_layers), + batch_interval=2)] + + # Construct model + num_epochs = 100 + return lbann.Model(num_epochs, + weights=weights, + layers=layers, + metrics=metrics, + objective_function=obj, + callbacks=callbacks) + + +if __name__ == '__main__': + import lbann + + mini_batch_size = 128 + trainer = lbann.Trainer(mini_batch_size=mini_batch_size) + model = construct_model() + # Setup optimizer + opt = lbann.Adam(learn_rate=0.0001,beta1=0.9,beta2=0.99,eps=1e-8) + # Load data reader from prototext + data_reader_proto = lbann.lbann_pb2.LbannPB() + with open(data_reader_prototext, 'r') as f: + txtf.Merge(f.read(), data_reader_proto) + data_reader_proto = data_reader_proto.data_reader + + status = lbann.run(trainer,model, data_reader_proto, opt, + scheduler='slurm', + nodes=1, + procs_per_node=1, + time_limit=360, + setup_only=True, + job_name='jag_wae') + print(status) diff --git a/applications/physics/ICF/train_macc_surrogate.py b/applications/physics/ICF/train_macc_surrogate.py new file mode 100644 index 00000000000..396465d5838 --- /dev/null +++ b/applications/physics/ICF/train_macc_surrogate.py @@ -0,0 +1,218 @@ +import macc_models +import argparse +from os.path import abspath, dirname, join +import google.protobuf.text_format as txtf +import lbann.contrib.launcher +import lbann.contrib.args +from lbann.util import str_list + +# ============================================== +# Setup and launch experiment +# ============================================== + +# Default data reader +cur_dir = dirname(abspath(__file__)) +data_reader_prototext = join(dirname(cur_dir), + 'data', + 'jag_conduit_reader.prototext') +metadata_prototext = join(dirname(cur_dir), + 'data', + 'jag_100M_metadata.prototext') + +#model_dir='' +#Load at least pretrained WAE model +#assert model_dir, 'pre_trained_dir should not be empty' +#Assume pre_trained model is in current directory, change path if not +#pre_trained_dir=join(cur_dir,model_dir) + +# Command-line arguments +parser = argparse.ArgumentParser() +lbann.contrib.args.add_scheduler_arguments(parser) +parser.add_argument( + '--job-name', action='store', default='surrogate', type=str, + help='job name', metavar='NAME') +parser.add_argument( + '--mini-batch-size', action='store', default=128, type=int, + help='mini-batch size (default: 128)', metavar='NUM') +parser.add_argument( + '--num-epochs', action='store', default=100, type=int, + help='number of epochs (default: 100)', metavar='NUM') +parser.add_argument( + '--num-nodes', action='store', default=4, type=int, + help='number of nodes (default: 4)', metavar='NUM') +parser.add_argument( + '--ppn', action='store', default=4, type=int, + help='processes per node (default: 4)', metavar='NUM') +parser.add_argument( + '--ydim', action='store', default=16399, type=int, + help='image+scalar dim (default: 64*64*4+15=16399)', metavar='NUM') +parser.add_argument( + '--zdim', action='store', default=20, type=int, + help='latent space dim (default: 20)', metavar='NUM') +parser.add_argument( + '--xdim', action='store', default=5, type=int, + help='input (x) dim (default: 5)', metavar='NUM') +parser.add_argument( + '--lamda-cyc', action='store', default=1e-3, type=float, + help='lamda-cyc (default: 1e-3)', metavar='NUM') +parser.add_argument( + '--useCNN', action='store', default=False, type=bool, + help='use CNN', metavar='BOOL') +parser.add_argument( + 
'--data-filedir-train', action='store', default='/p/gpfs1/brainusr/datasets/10MJAG/1M_A/', type=str, + help='data filedir (default train dir is 10MJAG/1M_A)', metavar='NAME') +parser.add_argument( + '--data-filedir-test', action='store', default='/p/gpfs1/brainusr/datasets/10MJAG/1M_B/', type=str, + help='data filedir (default test dir is 10MJAG/1M_B)', metavar='NAME') +parser.add_argument( + '--index-list-train', action='store', default='index.txt', type=str, + help='index list (default index.txt)', metavar='NAME') +parser.add_argument( + '--index-list-test', action='store', default='t0_sample_list_multi_10K.txt', type=str, + help='index list (default t0_sample_list_multi_10K.txt, 100 samples)', metavar='NAME') +parser.add_argument( + '--dump-outputs', action='store', default='dump_outs', type=str, + help='dump outputs dir (default: jobdir/dump_outs)', metavar='NAME') +parser.add_argument( + '--dump-models', action='store', default='dump_models', type=str, + help='dump models dir (default: jobdir/dump_models)', metavar='NAME') +parser.add_argument( + '--pretrained-dir', action='store', default=' ', type=str, + help='pretrained WAE dir (default: empty)', metavar='NAME') +parser.add_argument( + '--procs-per-trainer', action='store', default=0, type=int, + help='processes per trainer (default: 0)', metavar='NUM') +args = parser.parse_args() + +if not(args.pretrained_dir): + print("WARNING pretrained dir ", args.pretrained_dir, " is empty, default option assumes + pretrained autoencoder") + +def list2str(l): + return ' '.join(l) + +def construct_model(): + """Construct MACC surrogate model. + + See https://arxiv.org/pdf/1912.08113.pdf model architecture and other details + + """ + import lbann + + # Layer graph + input = lbann.Input(target_mode='N/A',name='inp_data') + # data is 64*64*4 images + 15 scalar + 5 param + inp_slice = lbann.Slice(input, axis=0, slice_points=str_list([0,args.ydim,args.ydim+args.xdim]),name='inp_slice') + gt_y = lbann.Identity(inp_slice,name='gt_y') + gt_x = lbann.Identity(inp_slice, name='gt_x') #param not used + + zero = lbann.Constant(value=0.0,num_neurons='1',name='zero') + one = lbann.Constant(value=1.0,num_neurons='1',name='one') + + + z = lbann.Gaussian(mean=0.0,stdev=1.0, neuron_dims="20") + wae = macc_models.MACCWAE(args.zdim,args.ydim,use_CNN=args.useCNN) #pretrained, freeze + inv = macc_models.MACCInverse(args.xdim) + fwd = macc_models.MACCForward(args.zdim) + + + y_pred_fwd = wae.encoder(gt_y) + + param_pred_ = wae.encoder(gt_y) + input_fake = inv(param_pred_) + + output_cyc = fwd(input_fake) + y_image_re2 = wae.decoder(output_cyc) + + '''**** Train cycleGAN input params <--> latent space of (images, scalars) ****''' + output_fake = fwd(gt_x) + y_image_re = wae.decoder(output_fake) + + param_pred2_ = wae.encoder(y_image_re) + input_cyc = inv(param_pred2_) + + L_l2_x = lbann.MeanSquaredError(input_fake,gt_x) + L_cyc_x = lbann.MeanSquaredError(input_cyc,gt_x) + + L_l2_y = lbann.MeanSquaredError(output_fake,y_pred_fwd) + L_cyc_y = lbann.MeanSquaredError(output_cyc,y_pred_fwd) + + + #@todo slice here to separate scalar from image + img_sca_loss = lbann.MeanSquaredError(y_image_re,gt_y) + #L_cyc = L_cyc_y + L_cyc_x + L_cyc = lbann.Add(L_cyc_y, L_cyc_x) + + #loss_gen0 = L_l2_y + lamda_cyc*L_cyc + loss_gen0 = lbann.WeightedSum([L_l2_y,L_cyc], scaling_factors=f'1 {args.lamda_cyc}') + loss_gen1 = lbann.WeightedSum([L_l2_x,L_cyc_y], scaling_factors=f'1 {args.lamda_cyc}') + #loss_gen1 = L_l2_x + lamda_cyc*L_cyc_y + + + layers = 
list(lbann.traverse_layer_graph(input)) + weights = set() + #Freeze appropriate (pretrained) weights + pretrained_models = ["wae"] #add macc? + for l in layers: + for idx in range(len(pretrained_models)): + if(l.weights and pretrained_models[idx] in l.name): + for w in range(len(l.weights)): + l.weights[w].optimizer = lbann.NoOptimizer() + weights.update(l.weights) + + l2_reg = lbann.L2WeightRegularization(weights=weights, scale=1e-4) + #d_adv_bce = lbann.LayerTerm(d_adv_bce,scale=0.01) + # Setup objective function + obj = lbann.ObjectiveFunction([loss_gen0,loss_gen1,l2_reg]) + # Initialize check metric callback + metrics = [lbann.Metric(img_sca_loss, name='fw_loss'), + lbann.Metric(L_l2_x, name='inverse loss'), + lbann.Metric(L_cyc_y, name='output cycle loss'), + lbann.Metric(L_cyc_x, name='param cycle loss')] + + callbacks = [lbann.CallbackPrint(), + lbann.CallbackSaveModel(dir=args.dump_models), + lbann.CallbackLoadModel(dirs=str(args.pretrained_dir)), + lbann.CallbackTimer()] + + # Construct model + return lbann.Model(args.num_epochs, + weights=weights, + serialize_io=True, + layers=layers, + metrics=metrics, + objective_function=obj, + callbacks=callbacks) + + +if __name__ == '__main__': + import lbann + + trainer = lbann.Trainer(mini_batch_size=args.mini_batch_size, + procs_per_trainer=args.procs_per_trainer) + model = construct_model() + # Setup optimizer + opt = lbann.Adam(learn_rate=0.0001,beta1=0.9,beta2=0.99,eps=1e-8) + # Load data reader from prototext + data_reader_proto = lbann.lbann_pb2.LbannPB() + with open(data_reader_prototext, 'r') as f: + txtf.Merge(f.read(), data_reader_proto) + data_reader_proto = data_reader_proto.data_reader + + kwargs = lbann.contrib.args.get_scheduler_kwargs(args) + status = lbann.contrib.launcher.run(trainer,model, data_reader_proto, opt, + scheduler='lsf', + nodes=args.num_nodes, + procs_per_node=args.ppn, + partition='pbatch', + time_limit=480, + setup_only=False, + job_name=args.job_name, + lbann_args=['--preload_data_store --use_data_store', + f'--metadata={metadata_prototext}', + f'--index_list_train={args.index_list_train}', + f'--index_list_test={args.index_list_test}', + f'--data_filedir_train={args.data_filedir_train}', + f'--data_filedir_test={args.data_filedir_test}'], + **kwargs) + print(status) diff --git a/applications/physics/cosmology/ExaGAN/ExaGAN.py b/applications/physics/cosmology/ExaGAN/ExaGAN.py new file mode 100644 index 00000000000..17a8bc90dfb --- /dev/null +++ b/applications/physics/cosmology/ExaGAN/ExaGAN.py @@ -0,0 +1,99 @@ +import lbann +import lbann.modules.base +import lbann.models.resnet + + +class CosmoGAN(lbann.modules.Module): + + global_count = 0 # Static counter, used for default names + + def __init__(self, name=None): + self.instance = 0 + self.name = (name if name + else 'ExaGAN{0}'.format(CosmoGAN.global_count)) + + convbnrelu = lbann.models.resnet.ConvBNRelu + fc = lbann.modules.FullyConnectedModule + conv = lbann.modules.Convolution2dModule + #bn_stats_grp_sz = 0 #0 global, 1 local + bn_stats_grp_sz = -1 #0 global, 1 local + + ##MCR properties #@todo: make multichannel optional + self.datascale = 4 + self.linear_scaler=1000. 
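+        # Note: datascale and linear_scaler parameterize inv_transform() below,
+        # which forms the second discriminator input channel as
+        # tanh( datascale * (1 + x) / (1 - x) / linear_scaler ).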
+ + self.inits = {'dense': lbann.NormalInitializer(mean=0,standard_deviation=0.02), + 'conv': lbann.NormalInitializer(mean=0,standard_deviation=0.02), #should be truncated Normal + 'convT':lbann.NormalInitializer(mean=0,standard_deviation=0.02)} + + d_neurons = [64,128,256,512] + self.d1_conv = [convbnrelu(d_neurons[i], 4, 2, 1, False, bn_stats_grp_sz, False,name=self.name+'_disc1_conv'+str(i)) + for i in range(len(d_neurons))] + self.d1_fc = fc(1,name=self.name+'_disc1_fc', + weights=[lbann.Weights(initializer=self.inits['dense'])]) + + #stacked_discriminator, this will be frozen, no optimizer, + #layer has to be named for callback + self.d2_conv = [convbnrelu(d_neurons[i], 4, 2, 1, False, bn_stats_grp_sz, False,name=self.name+'_disc2_conv'+str(i)) + for i in range(len(d_neurons))] + self.d2_fc = fc(1,name=self.name+'_disc2_fc', + weights=[lbann.Weights(initializer=self.inits['dense'])]) + #generator + g_neurons = [256,128,64] + + self.g_convT = [conv(g_neurons[i], 5, stride=2, padding=2, transpose=True, + weights=[lbann.Weights(initializer=self.inits['convT'])]) + for i in range(len(g_neurons))] + + self.g_fc1 = fc(32768,name=self.name+'_gen_fc1', + weights=[lbann.Weights(initializer=self.inits['dense'])]) + self.g_convT3 = conv(1, 5, stride=2, padding=2, activation=lbann.Tanh,name='gen_img',transpose=True, + weights=[lbann.Weights(initializer=self.inits['convT'])]) + + def forward(self, img, z): + #description + d1_real = self.forward_discriminator1(img) #instance1 + gen_img = self.forward_generator(z) + d1_fake = self.forward_discriminator1(lbann.StopGradient(gen_img)) #instance2 + d_adv = self.forward_discriminator2(gen_img) #instance 3 //need to freeze + #d1s share weights, d1_w is copied to d_adv (through replace weight callback) and freeze + return d1_real, d1_fake, d_adv,gen_img + + def forward_discriminator1(self,y): + ch2 = self.inv_transform(lbann.Identity(y)) + y = lbann.Concatenation(lbann.Identity(y),ch2,axis=0) + img = lbann.Reshape(y, dims='2 128 128') + x = lbann.LeakyRelu(self.d1_conv[0](img), negative_slope=0.2) + x = lbann.LeakyRelu(self.d1_conv[1](x), negative_slope=0.2) + x = lbann.LeakyRelu(self.d1_conv[2](x), negative_slope=0.2) + x = lbann.LeakyRelu(self.d1_conv[3](x), negative_slope=0.2) + return self.d1_fc(lbann.Reshape(x,dims='32768')) + + def forward_discriminator2(self,y): + ch2 = self.inv_transform(lbann.Identity(y)) + y = lbann.Concatenation(lbann.Identity(y),ch2,axis=0) + img = lbann.Reshape(y, dims='2 128 128') + x = lbann.LeakyRelu(self.d2_conv[0](img), negative_slope=0.2) + x = lbann.LeakyRelu(self.d2_conv[1](x), negative_slope=0.2) + x = lbann.LeakyRelu(self.d2_conv[2](x), negative_slope=0.2) + x = lbann.LeakyRelu(self.d2_conv[3](x), negative_slope=0.2) + return self.d2_fc(lbann.Reshape(x,dims='32768')) + + def forward_generator(self,z): + x = lbann.Relu(lbann.BatchNormalization(self.g_fc1(z),decay=0.9,scale_init=1.0,epsilon=1e-5)) + x = lbann.Reshape(x, dims='512 8 8') #channel first + x = lbann.Relu(lbann.BatchNormalization(self.g_convT[0](x),decay=0.9,scale_init=1.0,epsilon=1e-5)) + x = lbann.Relu(lbann.BatchNormalization(self.g_convT[1](x),decay=0.9,scale_init=1.0,epsilon=1e-5)) + x = lbann.Relu(lbann.BatchNormalization(self.g_convT[2](x),decay=0.9,scale_init=1.0,epsilon=1e-5)) + return self.g_convT3(x) + + def inv_transform(self,y): + inv_transform = lbann.WeightedSum( + lbann.SafeDivide( + lbann.Add(lbann.Constant(value=1.0, hint_layer=y),lbann.Identity(y)), + lbann.Subtract(lbann.Constant(value=1.0, hint_layer=y),lbann.Identity(y))), + 
scaling_factors=str(self.datascale)) + linear_scale = 1/self.linear_scaler + CH2 = lbann.Tanh(lbann.WeightedSum(inv_transform,scaling_factors=str(linear_scale))) + return CH2 + diff --git a/applications/physics/cosmology/ExaGAN/README.md b/applications/physics/cosmology/ExaGAN/README.md new file mode 100644 index 00000000000..ca5f0540537 --- /dev/null +++ b/applications/physics/cosmology/ExaGAN/README.md @@ -0,0 +1,8 @@ +## Generative Models for Cosmology - Understanding the Nature of the Universe at Exascale + +LBANN implementation of a number of generative models for cosmology. Please see [link](https://github.com/pzharrington/ExaGAN/) for original Keras implementation of code in this directory and other details. Also, see LBANN documentations on how to install, build and run LBANN code. + +### How to Train +```bash +run python3 train_exagan.py +``` diff --git a/applications/physics/cosmology/ExaGAN/dataset.py b/applications/physics/cosmology/ExaGAN/dataset.py new file mode 100644 index 00000000000..c55f8c902a4 --- /dev/null +++ b/applications/physics/cosmology/ExaGAN/dataset.py @@ -0,0 +1,25 @@ +import numpy as np +from os.path import abspath, dirname, join +import google.protobuf.text_format as txtf + +# Data paths +data_dir = '/p/lustre2/brainusr/datasets/cosmoflow/norm_train200K.npy' + +samples = np.load(data_dir, allow_pickle=True) +samples = samples.transpose(0,3,1,2) + + +dims = 128*128*1 + +# Sample access functions +def get_sample(index): + sample = samples[index].flatten() + #normalization here if unnormalized + return sample + +def num_samples(): + return samples.shape[0] + +def sample_dims(): + return [dims] + diff --git a/applications/physics/cosmology/ExaGAN/train_exagan.py b/applications/physics/cosmology/ExaGAN/train_exagan.py new file mode 100644 index 00000000000..d27999d82b4 --- /dev/null +++ b/applications/physics/cosmology/ExaGAN/train_exagan.py @@ -0,0 +1,126 @@ +import ExaGAN +import dataset +import lbann.contrib.launcher + +# ============================================== +# Setup and launch experiment +# ============================================== + +def list2str(l): + return ' '.join(l) + +def construct_model(): + """Construct LBANN model. 
+ + ExaGAN model + + """ + import lbann + + # Layer graph + input = lbann.Input(target_mode='N/A',name='inp_img') + #label flipping + label_flip_rand = lbann.Uniform(min=0,max=1, neuron_dims='1') + label_flip_prob = lbann.Constant(value=0.01, num_neurons='1') + one = lbann.GreaterEqual(label_flip_rand,label_flip_prob, name='is_real') + zero = lbann.LogicalNot(one,name='is_fake') + + z = lbann.Reshape(lbann.Gaussian(mean=0.0,stdev=1.0, neuron_dims="64", name='noise_vec'),dims='1 64') + d1_real, d1_fake, d_adv, gen_img = ExaGAN.CosmoGAN()(input,z) + + d1_real_bce = lbann.SigmoidBinaryCrossEntropy([d1_real,one],name='d1_real_bce') + d1_fake_bce = lbann.SigmoidBinaryCrossEntropy([d1_fake,zero],name='d1_fake_bce') + d_adv_bce = lbann.SigmoidBinaryCrossEntropy([d_adv,one],name='d_adv_bce') + + layers = list(lbann.traverse_layer_graph(input)) + # Setup objective function + weights = set() + src_layers = [] + dst_layers = [] + for l in layers: + if(l.weights and "disc1" in l.name and "instance1" in l.name): + src_layers.append(l.name) + #freeze weights in disc2, analogous to discrim.trainable=False in Keras + if(l.weights and "disc2" in l.name): + dst_layers.append(l.name) + for idx in range(len(l.weights)): + l.weights[idx].optimizer = lbann.NoOptimizer() + weights.update(l.weights) + #l2_reg = lbann.L2WeightRegularization(weights=weights, scale=1e-4) + obj = lbann.ObjectiveFunction([d1_real_bce,d1_fake_bce,d_adv_bce]) + # Initialize check metric callback + metrics = [lbann.Metric(d1_real_bce,name='d_real'), + lbann.Metric(d1_fake_bce, name='d_fake'), + lbann.Metric(d_adv_bce,name='gen')] + + callbacks = [lbann.CallbackPrint(), + lbann.CallbackTimer(), + #Uncomment to dump output for plotting and further statistical analysis + #lbann.CallbackDumpOutputs(layers='inp_img gen_img_instance1_activation', + # execution_modes='train validation', + # directory='dump_outs', + # batch_interval=100, + # format='npy'), + lbann.CallbackReplaceWeights(source_layers=list2str(src_layers), + destination_layers=list2str(dst_layers), + batch_interval=2)] + + # Construct model + num_epochs = 20 + return lbann.Model(num_epochs, + weights=weights, + layers=layers, + metrics=metrics, + objective_function=obj, + callbacks=callbacks) + +def construct_data_reader(): + """Construct Protobuf message for Python data reader. + + The Python data reader will import this Python file to access the + sample access functions. 
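+
+    The reader is wired to the dataset module in this directory: get_sample(i)
+    returns one flattened image sample, num_samples() the sample count, and
+    sample_dims() the flattened shape, matching the python.* fields set below.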
+ + """ + import os.path + import lbann + module_file = os.path.abspath(__file__) + module_name = os.path.splitext(os.path.basename(module_file))[0] + module_dir = os.path.dirname(module_file) + + # Base data reader message + message = lbann.reader_pb2.DataReader() + + # Training set data reader + data_reader = message.reader.add() + data_reader.name = 'python' + data_reader.role = 'train' + data_reader.shuffle = True + data_reader.percent_of_data_to_use = 1.0 + data_reader.validation_percent = 0.1 + data_reader.python.module = 'dataset' + data_reader.python.module_dir = module_dir + data_reader.python.sample_function = 'get_sample' + data_reader.python.num_samples_function = 'num_samples' + data_reader.python.sample_dims_function = 'sample_dims' + + return message + +if __name__ == '__main__': + import lbann + + mini_batch_size = 64 + trainer = lbann.Trainer(mini_batch_size=mini_batch_size) + model = construct_model() + # Setup optimizer + opt = lbann.Adam(learn_rate=0.0002,beta1=0.5,beta2=0.99,eps=1e-8) + # Load data reader from prototext + data_reader = construct_data_reader() + status = lbann.contrib.launcher.run(trainer,model, data_reader, opt, + scheduler='slurm', + #account='lbpm', + nodes=1, + procs_per_node=1, + time_limit=1440, + setup_only=False, + job_name='exagan') + print(status) diff --git a/applications/physics/data/hydra_metadata.prototext b/applications/physics/data/hydra_metadata.prototext new file mode 100644 index 00000000000..ec1701f9042 --- /dev/null +++ b/applications/physics/data/hydra_metadata.prototext @@ -0,0 +1,114 @@ +######################################################################## +# The HYDRA normalization values were computed over the +# the 00008 set of 100 files (10K samples), June, 2019 +# John Field cautions that the HYDRA schema will change in +# the future +######################################################################## + +data_set_metadata { + schema { + split_jag_image_channels: false + + # JAG_Image, JAG_Scalar, JAG_Input + independent: [ { pieces: [ JAG_Image, JAG_Scalar ] }, { pieces: [ JAG_Input ] } ] + dependent: [ { pieces: [ JAG_Input ] } ] + + image_prefix: "/images/" + + ## all hydra image keys: + # "(90,0)/bang/image/data" + # "(90,0)/0.03/image/data" + # "(90,0)/0.02/image/data" + # "(90,0)/0.01/image/data" + # + # "(0,0)/bang/image/data" + # "(0,0)/0.03/image/data" + # "(0,0)/0.02/image/data" + # "(0,0)/0.01/image/data" + jag_image_keys: ["(90,0)/bang/image/data", "(0,0)/bang/image/data"] + + scalar_prefix: "/scalars/" + + # An empty list indicates to use all + # The commented out variables are not on the Jim's original list but used in the numpy-based format + jag_scalar_keys: + [ "avg_rhor", + "peak_eprod", + "peak_tion_bw_DT", + "bt_tion_bw_DT", + "avg_tion_bw_DT", + "adiabat", + "bangt", + "burnwidth", + "bt_rhor", + "bt_eprodr", + "peak_eprodr" + ] + + # When using all the keys without explicit selection, key filters can be used + # to explicitly exclude the particular variables with keys that matches a filter. + # 'jag_scalar_filters' and 'jag_input_filters' rely on exact key string matching. + # 'jag_scalar_prefix_filters' and 'jag_input_prefix_filters' define a filter as + # the pair of a prefix substring and the minimum key length. + # For example, with the example below, any key that has a length no shorter + # than 26 and starts with the substring "image_(" is excluded. 
+ + jag_scalar_prefix_filters: [ { key_prefix: "image_(" min_len: 26} ] + jag_scalar_filters: [ "iBT" ] + + input_prefix: "/inputs/" + + jag_input_keys: ["preheat", + "sc_peak", + "t_3rd", + "t_end" + ] + } + + jag_input_normalization_params: [ + { scale: 0.0337373 bias: -0.0105617 }, #p_preheat avg= 15.4355 + { scale: 1.04127 bias: 0.49368 }, #sc_peak avg= 0.00650919 + { scale: 1.00482 bias: 0.499533 }, #t_3rd avg= -0.0241983 + { scale: 1.00725 bias: 0.496931 } #t_end avg= -0.00750582 + ] + + jag_scalar_normalization_params: [ + { scale: 1.82482 bias: -0.511432 }, #avg_rhor avg= 0.529763 + { scale: 0.681226 bias: -0.0150223 }, #peak_eprod avg= 0.201066 + { scale: 0.198934 bias: -0.801525 }, #peak_tion_bw_DT avg= 6.37529 + { scale: 0.244173 bias: -0.604468 }, #bt_tion_bw_DT avg= 4.0855 + { scale: 0.269391 bias: -0.656678 }, #avg_tion_bw_DT avg= 3.91583 + { scale: 0.0492209 bias: -0.186354 }, #adiabat avg= 10.6166 + { scale: 522.423 bias: -3.80809 }, #bangt avg= 0.00814444 + { scale: 3787.06 bias: -0.274563 }, #burnwidth avg= 0.000173271 + { scale: 1.68807 bias: -0.510794 }, #bt_rhor avg= 0.578218 + { scale: 5.27623e-05 bias: -0.00320741 }, #bt_eprodr avg= 1572.53 + { scale: 5.21263e-05 bias: -0.00322019 } #peak_eprodr avg= 1587.55 + ] + + # image data shape is (3,3,64,64) + # from John Field: sets of three: {absorption, emission forward, + # and emission back} # Since we are in 1D, forward and back emission + # are the same. + jag_image_normalization_params: [ + { scale: 1.31227 bias: -5.2241e-05 }, #(90,0)/bang/image/data + { scale: 1.5386e-05 bias: 8.4296e-05 }, #(90,0)/bang/image/data + { scale: 1.5386e-05 bias: 8.4296e-05 }, #(90,0)/bang/image/data + { scale: 1.28446 bias: -0.18841 }, #(90,0)/bang/image/data + { scale: 4.06761e-05 bias: 1.03167e-06 }, #(90,0)/bang/image/data + { scale: 4.06761e-05 bias: 1.03167e-06 }, #(90,0)/bang/image/data + { scale: 1.44979 bias: -0.289003 }, #(90,0)/bang/image/data + { scale: 0.00024344 bias: 7.96652e-08 }, #(90,0)/bang/image/data + { scale: 0.00024344 bias: 7.96652e-08 }, #(90,0)/bang/image/data + { scale: 1.31227 bias: -5.2241e-05 } #(0,0)/bang/image/data + { scale: 1.5386e-05 bias: 8.4296e-05 } #(0,0)/bang/image/data + { scale: 1.5386e-05 bias: 8.4296e-05 } #(0,0)/bang/image/data + { scale: 1.28446 bias: -0.18841 } #(0,0)/bang/image/data + { scale: 4.06761e-05 bias: 1.03167e-06 } #(0,0)/bang/image/data + { scale: 4.06761e-05 bias: 1.03167e-06 } #(0,0)/bang/image/data + { scale: 1.44979 bias: -0.289003 } #(0,0)/bang/image/data + { scale: 0.00024344 bias: 7.96652e-08 } #(0,0)/bang/image/data + { scale: 0.00024344 bias: 7.96652e-08 } #(0,0)/bang/image/data + ] + +} diff --git a/applications/physics/data/jag_100Kdata.prototext b/applications/physics/data/jag_100Kdata.prototext new file mode 100644 index 00000000000..da5ac912dd8 --- /dev/null +++ b/applications/physics/data/jag_100Kdata.prototext @@ -0,0 +1,12 @@ +data_reader { + reader { + name: "numpy" + role: "train" + shuffle: true + data_filename: "/p/lustre2/brainusr/datasets/jag/jag100K1vw_train.npy" + validation_percent: 0.1 + percent_of_data_to_use: 1.0 + disable_responses: true + disable_labels: true + } +} diff --git a/model_zoo/models/jag/ae_cycle_gan/jag_100M_metadata.prototext b/applications/physics/data/jag_100M_metadata.prototext similarity index 95% rename from model_zoo/models/jag/ae_cycle_gan/jag_100M_metadata.prototext rename to applications/physics/data/jag_100M_metadata.prototext index 1643b6db51a..7e22e71f0a9 100644 --- 
a/model_zoo/models/jag/ae_cycle_gan/jag_100M_metadata.prototext +++ b/applications/physics/data/jag_100M_metadata.prototext @@ -20,7 +20,12 @@ data_set_metadata { image_prefix: "/outputs/images/" - jag_image_keys: ["(0.0, 0.0)/0.0/emi", "(90.0, 0.0)/0.0/emi", "(90.0, 78.0)/0.0/emi"] + image_width: 64 + image_height: 64 + image_num_channels: 4 + + #jag_image_keys: ["(0.0, 0.0)/0.0/emi", "(90.0, 0.0)/0.0/emi", "(90.0, 78.0)/0.0/emi"] #3 views + jag_image_keys: ["(0.0, 0.0)/0.0/emi"] #1 view, default scalar_prefix: "/outputs/scalars/" diff --git a/applications/physics/data/jag_conduit_reader.prototext b/applications/physics/data/jag_conduit_reader.prototext new file mode 100644 index 00000000000..9e72ce41fb8 --- /dev/null +++ b/applications/physics/data/jag_conduit_reader.prototext @@ -0,0 +1,50 @@ +######################################################################## +# The JAG normalization values were computed over the 10M + 1MA + 1MB random +# pulls from the 100M data set. They are valid for the directories: +# /p/lustre2/brainusr/datasets/10MJAG/ (10M | 1M_A | 1M_B) +# /p/lustre2/brainusr/datasets/10MJAG_balanced_1K/ (1M_A | 1M_B) +# /p/gpfs1/brainusr/datasets/10MJAG/10M | 1M_A | 1M_B +# /p/gpfs1/brainusr/datasets/10MJAG_balanced_1K/ (1M_A | 1M_B) +######################################################################## + +data_reader { + requires_data_set_metadata: true + + reader { + name: "jag_conduit" + role: "train" + shuffle: true + # change to a lustre path + data_filedir: "/p/gpfs1/brainusr/datasets/10MJAG/1M_A/" + index_list: "index.txt" + index_list_per_trainer: true + index_list_per_model: false + + validation_percent: 0.1 + absolute_sample_count: 0 + percent_of_data_to_use: 1.0 + disable_responses: true + disable_labels: true + + num_labels: 5 + } + + reader { + name: "jag_conduit" + role: "test" + shuffle: true + # change to a lustre path + data_filedir: "/p/gpfs1/brainusr/datasets/10MJAG/1M_B" + index_list: "t0_sample_list_multi_10K.txt" #100 samples + index_list_per_trainer: false + index_list_per_model: false + + validation_percent: 0 + absolute_sample_count: 0 + percent_of_data_to_use: 0.1 + disable_responses: true + disable_labels: true + + num_labels: 5 + } +} diff --git a/applications/selfsupervised/.gitignore b/applications/selfsupervised/.gitignore new file mode 100644 index 00000000000..3dd91f59ec8 --- /dev/null +++ b/applications/selfsupervised/.gitignore @@ -0,0 +1,2 @@ +__pycache__ +experiments diff --git a/applications/selfsupervised/README.md b/applications/selfsupervised/README.md new file mode 100644 index 00000000000..d40bb6aec1c --- /dev/null +++ b/applications/selfsupervised/README.md @@ -0,0 +1,26 @@ +# Experiments in self-supervised learning with Siamese networks + +Reference: + +Nathan T. Mundhenk, Daniel Ho, and Barry Y. Chen. "Improvements to +context based self-supervised learning." In Computer Vision and +Pattern Recognition (CVPR). 2018. + +## Dependencies + +- NumPy +- SciPy +- OpenCV + +``` +pip3 install numpy scipy opencv-python +``` + +## Usage + +``` +python3 main.py +``` + +Data paths are hardcoded for the Pascal cluster at LLNL. Users must be +in the `brainusr` group. 
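+
+`main.py` also exposes a few options (see its argparse block); for example:
+
+```
+# pretrain the Siamese model (the default) for 20 epochs, then fine-tune on CUB-200
+python3 main.py --pretrain siamese --num-patches 3 --pretrain-epochs 20
+
+# generate the experiment scripts and submit them as a batch job
+python3 main.py --batch-job
+```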
diff --git a/applications/selfsupervised/classifier.py b/applications/selfsupervised/classifier.py new file mode 100644 index 00000000000..a6585144811 --- /dev/null +++ b/applications/selfsupervised/classifier.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python3 +import os.path +import google.protobuf.text_format +import lbann +import modules + +def setup(data_reader_file, + name='classifier', + num_labels=200, + mini_batch_size=128, + num_epochs=1000, + learning_rate=0.1, + bn_statistics_group_size=2, + fc_data_layout='model_parallel', + warmup_epochs=50, + learning_rate_drop_interval=50, + learning_rate_drop_factor=0.25, + checkpoint_interval=None): + + # Setup input data + input = lbann.Input() + images = lbann.Identity(input) + labels = lbann.Identity(input) + + # Classification network + head_cnn = modules.ResNet(bn_statistics_group_size=bn_statistics_group_size) + class_fc = lbann.modules.FullyConnectedModule(num_labels, + activation=lbann.Softmax, + name=f'{name}_fc', + data_layout=fc_data_layout) + x = head_cnn(images) + probs = class_fc(x) + + # Setup objective function + cross_entropy = lbann.CrossEntropy([probs, labels]) + l2_reg_weights = set() + for l in lbann.traverse_layer_graph(input): + if type(l) == lbann.Convolution or type(l) == lbann.FullyConnected: + l2_reg_weights.update(l.weights) + l2_reg = lbann.L2WeightRegularization(weights=l2_reg_weights, scale=0.0002) + obj = lbann.ObjectiveFunction([cross_entropy, l2_reg]) + + # Setup model + metrics = [lbann.Metric(lbann.CategoricalAccuracy([probs, labels]), + name='accuracy', unit='%')] + callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer()] + if checkpoint_interval: + callbacks.append( + lbann.CallbackCheckpoint( + checkpoint_dir='ckpt', + checkpoint_epochs=5 + ) + ) + + # Learning rate schedules + if warmup_epochs: + callbacks.append( + lbann.CallbackLinearGrowthLearningRate( + target=learning_rate * mini_batch_size / 128, + num_epochs=warmup_epochs + ) + ) + if learning_rate_drop_factor: + callbacks.append( + lbann.CallbackDropFixedLearningRate( + drop_epoch=list(range(0, num_epochs, learning_rate_drop_interval)), + amt=learning_rate_drop_factor) + ) + + # Construct model + model = lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(input), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + + # Setup optimizer + # opt = lbann.Adam(learn_rate=learning_rate, beta1=0.9, beta2=0.999, eps=1e-8) + opt = lbann.SGD(learn_rate=learning_rate, momentum=0.9) + + # Load data reader from prototext + data_reader_proto = lbann.lbann_pb2.LbannPB() + with open(data_reader_file, 'r') as f: + google.protobuf.text_format.Merge(f.read(), data_reader_proto) + data_reader_proto = data_reader_proto.data_reader + for reader_proto in data_reader_proto.reader: + reader_proto.python.module_dir = os.path.dirname(os.path.realpath(__file__)) + + # Return experiment objects + return model, data_reader_proto, opt + +if __name__ == "__main__": + import argparse + import lbann.contrib.args + import lbann.contrib.launcher + + # Command-line arguments + parser = argparse.ArgumentParser() + lbann.contrib.args.add_scheduler_arguments(parser) + parser.add_argument( + '--job-name', action='store', default='lbann_siamese_finetune', type=str, + help=('scheduler job name')) + parser.add_argument( + '--mini-batch-size', action='store', default=128, type=int, + help='mini-batch size (default: 128)', metavar='NUM') + parser.add_argument( + '--num-epochs', action='store', default=1000, type=int, + help='number of epochs (default: 1000)', 
metavar='NUM') + parser.add_argument( + '--learning-rate', action='store', default=0.1, type=float, + help='learning rate (default: 0.1)', metavar='LR') + parser.add_argument( + '--bn-statistics-group-size', action='store', default=2, type=int, + help=('group size for batch norm statistics (default: 2)')) + parser.add_argument( + '--fc-data-layout', action='store', default='model_parallel', type=str, + help=('data layout for fully-connected layers ' + '(default: "model_parallel")')) + args = parser.parse_args() + + # Setup experiment + trainer = lbann.Trainer(mini_batch_size=args.mini_batch_size) + current_dir = os.path.dirname(os.path.realpath(__file__)) + data_reader_file = os.path.join(current_dir, 'data_reader_cub.prototext') + model, data_reader, opt = setup( + data_reader_file=data_reader_file, + mini_batch_size=args.mini_batch_size, + num_epochs=args.num_epochs, + learning_rate=args.learning_rate, + bn_statistics_group_size=args.bn_statistics_group_size, + fc_data_layout=args.fc_data_layout, + ) + + # Run experiment + kwargs = lbann.contrib.args.get_scheduler_kwargs(args) + lbann.contrib.launcher.run( + trainer, model, data_reader, opt, + job_name=args.job_name, + **kwargs, + ) diff --git a/applications/selfsupervised/data_reader_cub.prototext b/applications/selfsupervised/data_reader_cub.prototext new file mode 100644 index 00000000000..b63e19e8cef --- /dev/null +++ b/applications/selfsupervised/data_reader_cub.prototext @@ -0,0 +1,61 @@ +data_reader { + reader { + name: "imagenet" + role: "train" + shuffle: true + data_filedir: "/p/lscratchh/brainusr/datasets/CUB_200_2011/basic_set_256x256/" + data_filename: "/p/lscratchh/brainusr/datasets/CUB_200_2011_list/train_list.txt" + validation_percent: 0.0 + percent_of_data_to_use: 1.0 + num_labels: 200 + + transforms { + random_resized_crop { + height: 224 + width: 224 + } + } + transforms { + horizontal_flip { + p: 0.5 + } + } + transforms { + colorize {} + } + transforms { + normalize_to_lbann_layout { + means: "0.406 0.456 0.485" + stddevs: "0.225 0.224 0.229" + } + } + + } + + reader { + name: "imagenet" + role: "test" + shuffle: true + data_filedir: "/p/lscratchh/brainusr/datasets/CUB_200_2011/basic_set_256x256/" + data_filename: "/p/lscratchh/brainusr/datasets/CUB_200_2011_list/test_list.txt" + percent_of_data_to_use: 1.0 + num_labels: 200 + + transforms { + center_crop { + height: 224 + width: 224 + } + } + transforms { + colorize {} + } + transforms { + normalize_to_lbann_layout { + means: "0.406 0.456 0.485" + stddevs: "0.225 0.224 0.229" + } + } + + } +} diff --git a/applications/selfsupervised/data_reader_imagenet.prototext b/applications/selfsupervised/data_reader_imagenet.prototext new file mode 100644 index 00000000000..733b8fa80bf --- /dev/null +++ b/applications/selfsupervised/data_reader_imagenet.prototext @@ -0,0 +1,33 @@ +data_reader { + reader { + name: "imagenet" + role: "train" + shuffle: true + data_filedir: "/p/lscratchh/brainusr/ILSVRC2012/original/train/" + data_filename: "/p/lscratchh/brainusr/ILSVRC2012/labels/train.txt" + #data_filename: "/p/lscratchh/brainusr/ILSVRC2012/labels/train_c0-9.txt" + percent_of_data_to_use: 1.0 + num_labels: 1000 + + transforms { + random_resized_crop { + height: 224 + width: 224 + } + } + transforms { + horizontal_flip { + p: 0.5 + } + } + transforms { + colorize {} + } + transforms { + normalize_to_lbann_layout { + means: "0.406 0.456 0.485" + stddevs: "0.225 0.224 0.229" + } + } + } +} diff --git a/applications/selfsupervised/main.py b/applications/selfsupervised/main.py new 
file mode 100644 index 00000000000..6364df48b6d --- /dev/null +++ b/applications/selfsupervised/main.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python3 +import os.path +import argparse +import random +import lbann +import lbann.contrib.launcher +import lbann.contrib.args +import lbann.proto +import classifier +import pretrain_siamese +import util + +# Paths +current_dir = os.path.dirname(os.path.realpath(__file__)) + +# ============================================== +# Options +# ============================================== + +# Command-line options +parser = argparse.ArgumentParser() +lbann.contrib.args.add_scheduler_arguments(parser) +parser.add_argument( + '--job-name', action='store', default='lbann_siamese', type=str, + help='job name', metavar='NAME') +parser.add_argument( + '--pretrain', action='store', default='siamese', type=str, + help='pretraining model (default: siamese)') +parser.add_argument( + '--num-patches', action='store', default=3, type=int, + help='number of patches and Siamese heads (default: 3)', metavar='NUM') +parser.add_argument( + '--pretrain-epochs', action='store', default=20, type=int, + help='number of pretraining epochs (default: 20)', metavar='NUM') +parser.add_argument( + '--batch-job', action='store_true', + help='submit script as batch job') +parser.add_argument( + '--checkpoint-interval', action='store', default=0, type=int, + help='epoch frequency for checkpointing') +args = parser.parse_args() + +# ============================================== +# Setup experiment +# ============================================== + +# Pretraining model +if not args.pretrain or args.pretrain == 'siamese': + model1, reader1, opt1 = pretrain_siamese.setup( + num_patches=args.num_patches, + mini_batch_size=512, + num_epochs=args.pretrain_epochs, + learning_rate=0.005, + checkpoint_interval=args.checkpoint_interval, + ) +elif args.pretrain == 'supervised': + data_reader_file = os.path.join(current_dir, 'data_reader_imagenet.prototext') + model1, reader1, opt1 = classifier.setup( + data_reader_file=data_reader_file, + name='supervised', + num_labels=1000, + mini_batch_size=512, + num_epochs=args.pretrain_epochs, + learning_rate=0.1, + warmup_epochs=5, + learning_rate_drop_interval=30, + learning_rate_drop_factor=0.1, + checkpoint_interval=args.checkpoint_interval, + ) +else: + raise Exception(f'"{args.pretrain}" is an invalid pretraining model') +model1.random_seed = random.getrandbits(32) + +# Fine-tuning model +data_reader_file = os.path.join(current_dir, 'data_reader_cub.prototext') +model2, reader2, opt2 = classifier.setup( + data_reader_file=data_reader_file, + name='finetune', + num_labels=200, + mini_batch_size=128, + num_epochs=500, + learning_rate=0.1, + warmup_epochs=50, + learning_rate_drop_interval=50, + learning_rate_drop_factor=0.25, +) + +# ============================================== +# Construct LBANN invocation +# ============================================== + +# Initialize LBANN executable and command-line arguments +lbann_exe = os.path.realpath(lbann.lbann_exe()) +lbann_exe = os.path.join(os.path.dirname(lbann_exe), 'lbann2') +lbann_command = [lbann_exe] + +# Construct experiment directory +experiment_dir = util.make_experiment_dir(args.job_name) + +# Export model prototext files +# Note: lbann2 driver doesn't have a command-line argument to get +# trainer. 
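+# The command line assembled below passes brace-delimited pairs of prototext
+# files (e.g. --model={model1.prototext,model2.prototext}), so the pretraining
+# and fine-tuning experiments are described by parallel model/reader/optimizer
+# files and handed to lbann2 in a single invocation.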
+file1 = os.path.join(experiment_dir, 'model1.prototext') +file2 = os.path.join(experiment_dir, 'model2.prototext') +lbann.proto.save_prototext(file1, model=model1, trainer=lbann.Trainer(mini_batch_size=512)) +lbann.proto.save_prototext(file2, model=model2, trainer=lbann.Trainer(mini_batch_size=512)) +lbann_command.append(f'--model={{{file1},{file2}}}') + +# Export data reader prototext files +file1 = os.path.join(experiment_dir, 'reader1.prototext') +file2 = os.path.join(experiment_dir, 'reader2.prototext') +lbann.proto.save_prototext(file1, data_reader=reader1) +lbann.proto.save_prototext(file2, data_reader=reader2) +lbann_command.append(f'--reader={{{file1},{file2}}}') + +# Export optimizer prototext files +file1 = os.path.join(experiment_dir, 'opt1.prototext') +file2 = os.path.join(experiment_dir, 'opt2.prototext') +lbann.proto.save_prototext(file1, optimizer=opt1) +lbann.proto.save_prototext(file2, optimizer=opt2) +lbann_command.append(f'--optimizer={{{file1},{file2}}}') + +# ============================================== +# Launch experiment +# ============================================== + +# Construct batch script +kwargs = lbann.contrib.args.get_scheduler_kwargs(args) +script = lbann.contrib.launcher.make_batch_script( + work_dir=experiment_dir, + job_name=args.job_name, + **kwargs, +) +script.add_parallel_command(lbann_command) + +# Launch LBANN +if args.batch_job: + script.submit() +else: + script.run() diff --git a/applications/selfsupervised/modules.py b/applications/selfsupervised/modules.py new file mode 100644 index 00000000000..fe5c1f66c1e --- /dev/null +++ b/applications/selfsupervised/modules.py @@ -0,0 +1,190 @@ +import lbann +import lbann.modules +import resnet + +class BatchNormModule(lbann.modules.Module): + + global_count = 0 # Static counter, used for default names + + def __init__(self, + statistics_group_size=1, + name=None, + data_layout='data_parallel'): + super().__init__() + BatchNormModule.global_count += 1 + self.instance = 0 + self.statistics_group_size = statistics_group_size + self.name = (name + if name + else 'bnmodule{0}'.format(BatchNormModule.global_count)) + self.data_layout = data_layout + + # Initialize weights + self.scale = lbann.Weights( + initializer=lbann.ConstantInitializer(value=1.0), + name=self.name + '_scale') + self.bias = lbann.Weights( + initializer=lbann.ConstantInitializer(value=0.0), + name=self.name + '_bias') + self.running_mean = lbann.Weights( + initializer=lbann.ConstantInitializer(value=0.0), + name=self.name + '_running_mean') + self.running_variance = lbann.Weights( + initializer=lbann.ConstantInitializer(value=1.0), + name=self.name + '_running_variance') + + def forward(self, x): + self.instance += 1 + name = '{0}_instance{1}'.format(self.name, self.instance) + return lbann.BatchNormalization( + x, + weights=[self.scale, self.bias, + self.running_mean, self.running_variance], + decay=0.9, + scale_init=1.0, + bias_init=0.0, + epsilon=1e-5, + statistics_group_size=self.statistics_group_size, + name=name, + data_layout=self.data_layout) + +class ConvBnRelu(lbann.modules.Module): + + global_count = 0 # Static counter, used for default names + + def __init__(self, + out_channels, kernel_size, + stride=1, padding=0, + statistics_group_size=1, + name=None): + super().__init__() + ConvBnRelu.global_count += 1 + self.instance = 0 + self.name = (name + if name + else 'convbnrelu{0}'.format(ConvBnRelu.global_count)) + self.conv = lbann.modules.Convolution2dModule(out_channels, + kernel_size, + stride=stride, + padding=padding, + 
bias=False, + name=self.name+'_conv') + self.bn = BatchNormModule(statistics_group_size=statistics_group_size, + name=self.name+'_bn') + + def forward(self, x): + self.instance += 1 + x = self.conv(x) + x = self.bn(x) + return lbann.Relu(x, name='{0}_relu_instance{1}'.format(self.name, self.instance)) + +class FcBnRelu(lbann.modules.Module): + + global_count = 0 # Static counter, used for default names + + def __init__(self, + size, + statistics_group_size=1, + name=None, + data_layout='data_parallel'): + super().__init__() + FcBnRelu.global_count += 1 + self.instance = 0 + self.name = (name + if name + else 'fcbnrelu{0}'.format(FcBnRelu.global_count)) + self.data_layout = data_layout + self.fc = lbann.modules.FullyConnectedModule(size, + bias=False, + name=self.name+'_fc', + data_layout=self.data_layout) + + # Weights for batchnorm + scalebias_vals = [1.0] * size + [0.0] * size + self.bn_weights = [ + lbann.Weights( + name='{0}_bn_running_mean'.format(self.name), + initializer=lbann.ConstantInitializer(value=0.0)), + lbann.Weights( + name='{0}_bn_running_var'.format(self.name), + initializer=lbann.ConstantInitializer(value=1.0)), + lbann.Weights( + name='{0}_bn_scalebias'.format(self.name), + initializer=lbann.ValueInitializer(values=' '.join([str(x) for x in scalebias_vals])))] + + def forward(self, x): + self.instance += 1 + x = self.fc(x) + x = lbann.EntrywiseBatchNormalization(x, + weights=[self.bn_weights[0], self.bn_weights[1]], + decay=0.9, + epsilon=1e-5, + name='{0}_bn_instance{1}'.format(self.name, self.instance), + data_layout=self.data_layout) + x = lbann.EntrywiseScaleBias(x, + weights=self.bn_weights[2], + name='{0}_bn_scalebias_instance{1}'.format(self.name, self.instance), + data_layout=self.data_layout) + return lbann.Relu(x, + name='{0}_relu_instance{1}'.format(self.name, self.instance), + data_layout=self.data_layout) + +class AlexNetCNN(lbann.modules.Module): + """AlexNet CNN with batch norm. + + FC network at end of AlexNet is not included. 
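+
+    Stack (each conv followed by batch norm and ReLU): 96@11x11 stride 4,
+    max-pool 3/2, 256@3x3, max-pool 3/2, 384@3x3, 384@3x3, 256@3x3, max-pool 3/2.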
+ + """ + + def __init__(self, bn_statistics_group_size=1): + self.name = 'alexnet' + self.conv1 = ConvBnRelu(96, 11, + stride=4, + padding=5, + statistics_group_size=bn_statistics_group_size, + name='{0}_conv1'.format(self.name)) + self.conv2 = ConvBnRelu(256, 3, + padding=1, + statistics_group_size=bn_statistics_group_size, + name='{0}_conv2'.format(self.name)) + self.conv3 = ConvBnRelu(384, 3, + padding=1, + statistics_group_size=bn_statistics_group_size, + name='{0}_conv3'.format(self.name)) + self.conv4 = ConvBnRelu(384, 3, + padding=1, + statistics_group_size=bn_statistics_group_size, + name='{0}_conv4'.format(self.name)) + self.conv5 = ConvBnRelu(256, 3, + padding=1, + statistics_group_size=bn_statistics_group_size, + name='{0}_conv5'.format(self.name)) + + def forward(self, x): + x = self.conv1(x) + x = lbann.Pooling(x, num_dims=2, has_vectors=False, + pool_dims_i=3, pool_pads_i=0, pool_strides_i=2, + pool_mode='max') + x = self.conv2(x) + x = lbann.Pooling(x, num_dims=2, has_vectors=False, + pool_dims_i=3, pool_pads_i=0, pool_strides_i=2, + pool_mode='max') + x = self.conv3(x) + x = self.conv4(x) + x = self.conv5(x) + x = lbann.Pooling(x, num_dims=2, has_vectors=False, + pool_dims_i=3, pool_pads_i=0, pool_strides_i=2, + pool_mode='max') + return x + +class ResNet(lbann.modules.Module): + + def __init__(self, bn_statistics_group_size=1): + self.name = 'resnet' + self.cnn = resnet.ResNet34(bn_statistics_group_size=bn_statistics_group_size, + name=self.name) + + def forward(self, x): + x = self.cnn(x) + x = lbann.ChannelwiseMean(x) + return x diff --git a/applications/selfsupervised/patch_generator/__init__.py b/applications/selfsupervised/patch_generator/__init__.py new file mode 100644 index 00000000000..b8e81340273 --- /dev/null +++ b/applications/selfsupervised/patch_generator/__init__.py @@ -0,0 +1,136 @@ +import functools +import operator +import os.path +import random +import sys +import cv2 +import numpy as np +from .extract_patches import extract_patches +from .patterns import patterns_2patch, patterns_3patch, patterns_4patch, patterns_5patch +from .chroma_blur import chroma_blur + +# Data paths +label_file = '/p/lscratchh/brainusr/ILSVRC2012/labels/train.txt' +data_dir = '/p/lscratchh/brainusr/ILSVRC2012/original/train' + +# Read label files +samples = [] +with open(label_file) as f: + for line in f: + line = line.split(' ') + samples.append((line[0], int(line[1]))) + +# Get sample function +def get_sample_2patch(index): + return get_sample(index, 2) +def get_sample_3patch(index): + return get_sample(index, 3) +def get_sample_4patch(index): + return get_sample(index, 4) +def get_sample_5patch(index): + return get_sample(index, 5) +def get_sample(index, num_patches): + """Generate data sample. + + Extract patches and apply preprocessing tricks. 
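+
+    Steps: crop the image to a centered square, extract patches for a randomly
+    chosen pattern, apply a random 90-degree rotation (folded into the label),
+    chroma-blur, convert to normalized CHW tensors, zero everything outside a
+    random aperture in every patch but the first, and append a one-hot label
+    vector to the flattened patches.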
+ """ + + # Read image from file + file_name, _ = samples[index] + file_name = os.path.join(data_dir, file_name) + img = cv2.imdecode(np.fromfile(file_name, dtype=np.uint8), + cv2.IMREAD_COLOR) + + # Crop to get square image + size = min(img.shape[0], img.shape[1]) + y = (img.shape[0] - size) // 2 + x = (img.shape[1] - size) // 2 + img = img[y:y+size, x:x+size, :] + + # Extract patches + patterns = None + if num_patches == 2: + patterns = patterns_2patch + if num_patches == 3: + patterns = patterns_3patch + if num_patches == 4: + patterns = patterns_4patch + if num_patches == 5: + patterns = patterns_5patch + patches, label = extract_patches(img, patterns) + + # Randomly rotate patches + rotate_type = random.randint(0, 3) + for i, patch in enumerate(patches): + patch = np.rot90(patch, rotate_type, axes=(0,1)) + patches[i] = patch + label = label + rotate_type * len(patterns) + + # Convert patch to float32 + for i, patch in enumerate(patches): + if patch.dtype == np.uint8: + patches[i] = patch.astype(np.float32) / 255 + + # Chroma blur + for i, patch in enumerate(patches): + patches[i] = chroma_blur(patch) + + # Transform to CHW format and normalize + for i, patch in enumerate(patches): + patch = np.transpose(patch, axes=(2, 0, 1)) + means = np.array([0.406, 0.456, 0.485]).reshape((3,1,1)) + stdevs = np.array([0.225, 0.224, 0.229]).reshape((3,1,1)) + patch -= means + patch /= stdevs + patches[i] = patch + + # Random aperture + for i, patch in enumerate(patches): + if i == 0: + continue + size = random.randint(64, 96) + y = random.randint(0, 96-size) + x = random.randint(0, 96-size) + new_patch = np.zeros((3, 96, 96), dtype=np.float32) + new_patch[:, y:y+size, x:x+size] = patch[:, y:y+size, x:x+size] + patches[i] = new_patch + + # Construct one-hot label vector + label_vec = np.zeros(num_labels(num_patches), dtype=np.float32) + label_vec[label] = 1 + + # Return flattened data tensors + flat_data = [] + for patch in patches: + flat_data.append(patch.reshape(-1)) + flat_data.append(label_vec) + return np.concatenate(flat_data) + +# Get sample dims functions +patch_dims = (3, 96, 96) +def num_labels(num_patches): + num_patterns = 0 + if num_patches == 2: + num_patterns = len(patterns_2patch) + if num_patches == 3: + num_patterns = len(patterns_3patch) + if num_patches == 4: + num_patterns = len(patterns_4patch) + if num_patches == 5: + num_patterns = len(patterns_5patch) + return 4 * num_patterns +def sample_dims(num_patches): + patch_size = functools.reduce(operator.mul, patch_dims) + return (num_patches*patch_size + num_labels(num_patches),) +def sample_dims_2patch(): + return sample_dims(2) +def sample_dims_3patch(): + return sample_dims(3) +def sample_dims_4patch(): + return sample_dims(4) +def sample_dims_5patch(): + return sample_dims(5) + +# Get num samples function +def num_samples(): + return len(samples) diff --git a/applications/selfsupervised/patch_generator/chroma_blur.py b/applications/selfsupervised/patch_generator/chroma_blur.py new file mode 100644 index 00000000000..ee9e4a57406 --- /dev/null +++ b/applications/selfsupervised/patch_generator/chroma_blur.py @@ -0,0 +1,16 @@ +import numpy as np +import scipy.ndimage.filters +import cv2 + +def chroma_blur(img): + """Blur chroma channels to hide chromatic aberration. + + Convert to CIE Lab format and apply box filter to a and b + channels. 
+ + """ + img = cv2.cvtColor(img, cv2.COLOR_BGR2Lab) + img[:,:,1] = scipy.ndimage.filters.uniform_filter(img[:,:,1], 13) + img[:,:,2] = scipy.ndimage.filters.uniform_filter(img[:,:,2], 13) + img = cv2.cvtColor(img, cv2.COLOR_Lab2BGR) + return img diff --git a/applications/selfsupervised/patch_generator/extract_patches.py b/applications/selfsupervised/patch_generator/extract_patches.py new file mode 100644 index 00000000000..e24733ae23f --- /dev/null +++ b/applications/selfsupervised/patch_generator/extract_patches.py @@ -0,0 +1,116 @@ +import enum +import math +import random +import cv2 +import numpy as np + +# ---------------------------------------------- +# Patch type specification +# ---------------------------------------------- +# Note: Sizes and positions are in pixels. + +class PatchType(enum.Enum): + _3X3 = 1 + _2X2 = 2 + OVERLAP = 3 + +# 3x3-type patches +_3x3_patch_pos = ((0.0, 0.0), (0.0, 137/384), (0.0, 274/384), + (137/384, 0.0), (137/384, 137/384), (137/384, 274/384), + (274/384, 0.0), (274/384, 137/384), (274/384, 274/384)) +_3x3_patch_size = 110/384 + +# 2x2-type patches +_2x2_patch_pos = ((0, 0), (0, 146/256), + (146/256, 0), (146/256, 146/256)) +_2x2_patch_size = 110/256 + +# Overlap-type patches +overlap_patch_pos = ((0, 0), (0, 86/196), + (86/196, 0), (86/196, 86/196)) +overlap_patch_size = 110/196 + +# ---------------------------------------------- +# Patch extraction +# ---------------------------------------------- + +def extract_patch(img, patch_type, index, zoom, jitter): + """Extract a patch from image and resize. + + Args: + img (ndarry): Image in HWC format. + patch_type (PatchType): Desired patch type. + index (int): Patch index. + zoom (float): Zoom factor. + jitter ((float, float)): Jitter positions, normalized in + [0,1). + + Returns: + ndarray: Patch in HWC format. + + """ + + # Get patch position + if patch_type == PatchType._3X3: + posy = _3x3_patch_pos[index][0] + posx = _3x3_patch_pos[index][1] + patch_size = _3x3_patch_size + if patch_type == PatchType._2X2: + posy = _2x2_patch_pos[index][0] + posx = _2x2_patch_pos[index][1] + patch_size = _2x2_patch_size + if patch_type == PatchType.OVERLAP: + posy = overlap_patch_pos[index][0] + posx = overlap_patch_pos[index][1] + patch_size = overlap_patch_size + + # Apply zoom and jitter to patch position + posy += (1-1/zoom) * patch_size * jitter[0] + posx += (1-1/zoom) * patch_size * jitter[1] + patch_size /= zoom + + # Identify patch pixels + img_size = img.shape[0] + y0 = math.floor(posy * img_size) + y1 = math.ceil((posy + patch_size) * img_size) + x0 = math.floor(posx * img_size) + x1 = math.ceil((posx + patch_size) * img_size) + y0 = max(0, min(img_size-1, y0)) + y1 = max(1, min(img_size, y1)) + x0 = max(0, min(img_size-1, x0)) + x1 = max(1, min(img_size, x1)) + + # Extract patch from image + interp_methods = (cv2.INTER_LINEAR, cv2.INTER_AREA, + cv2.INTER_CUBIC, cv2.INTER_LANCZOS4) + patch = cv2.resize(img[y0:y1, x0:x1, :], + (96, 96), + interpolation=random.choice(interp_methods)) + + # Randomly apply horizontal flip + if random.choice([True, False]): + patch = np.fliplr(patch) + + return patch + +def extract_patches(img, patterns): + """Extract patches from image. + + Args: + img (ndarry): Image in HWC format. + patterns (list of (list of (PatchType, int))): Patch patterns. + See patterns.py. + + Returns: + list of ndarray: Patches in HWC format. + int: Patch pattern label. 
+ + """ + + label = random.randint(0, len(patterns)-1) + zoom = random.uniform(1, 128/96) + jitter = (random.random(), random.random()) + patches = [extract_patch(img, p[0], p[1], zoom, jitter) + for p in patterns[label]] + random.shuffle(patches) + return patches, label diff --git a/applications/selfsupervised/patch_generator/patterns.py b/applications/selfsupervised/patch_generator/patterns.py new file mode 100644 index 00000000000..6a65223c6d6 --- /dev/null +++ b/applications/selfsupervised/patch_generator/patterns.py @@ -0,0 +1,147 @@ +from .extract_patches import PatchType +_3X3 = PatchType._3X3 +_2X2 = PatchType._2X2 +OVERLAP = PatchType.OVERLAP + +# 2-patch configurations +# See: Carl Doersch, Abhinav Gupta, and Alexei A. Efros. "Unsupervised +# visual representation learning by context prediction." In +# Proceedings of the IEEE International Conference on Computer +# Vision, pp. 1422-1430. 2015. +patterns_2patch = ( + ((_3X3, 4), (_3X3, 0)), + ((_3X3, 4), (_3X3, 1)), + ((_3X3, 4), (_3X3, 2)), + ((_3X3, 4), (_3X3, 5)), + ((_3X3, 4), (_3X3, 8)), + ((_3X3, 4), (_3X3, 7)), + ((_3X3, 4), (_3X3, 6)), + ((_3X3, 4), (_3X3, 3)) +) + +# 3-patch configurations +# See: T. Nathan Mundhenk, Daniel Ho, and Barry Y. Chen. "Improvements +# to Context Based Self-Supervised Learning." In CVPR, pp. +# 9339-9348. 2018. +patterns_3patch = ( + + # Line + ((_3X3, 4), (_3X3, 0), (_3X3, 8)), + ((_3X3, 4), (_3X3, 1), (_3X3, 7)), + ((_3X3, 4), (_3X3, 2), (_3X3, 6)), + ((_3X3, 4), (_3X3, 5), (_3X3, 3)), + + # L-shape + ((_2X2, 0), (_2X2, 1), (_2X2, 3)), + ((_2X2, 1), (_2X2, 3), (_2X2, 2)), + ((_2X2, 3), (_2X2, 2), (_2X2, 0)), + ((_2X2, 2), (_2X2, 0), (_2X2, 1)), + + # Hybrid scale patches + ((OVERLAP, 0), (_3X3, 2), (_3X3, 5)), + ((OVERLAP, 0), (_3X3, 6), (_3X3, 7)), + ((OVERLAP, 1), (_3X3, 8), (_3X3, 7)), + ((OVERLAP, 1), (_3X3, 0), (_3X3, 3)), + ((OVERLAP, 3), (_3X3, 6), (_3X3, 3)), + ((OVERLAP, 3), (_3X3, 2), (_3X3, 1)), + ((OVERLAP, 2), (_3X3, 0), (_3X3, 1)), + ((OVERLAP, 2), (_3X3, 8), (_3X3, 5)) + +) + +# 4-patch configurations +patterns_4patch = ( + + # T-shape + ((_3X3, 4), (_3X3, 1), (_3X3, 5), (_3X3, 7)), + ((_3X3, 4), (_3X3, 5), (_3X3, 7), (_3X3, 3)), + ((_3X3, 4), (_3X3, 7), (_3X3, 3), (_3X3, 1)), + ((_3X3, 4), (_3X3, 3), (_3X3, 1), (_3X3, 5)), + + # Z-shape + ((_3X3, 4), (_3X3, 2), (_3X3, 5), (_3X3, 7)), + ((_3X3, 4), (_3X3, 8), (_3X3, 7), (_3X3, 3)), + ((_3X3, 4), (_3X3, 6), (_3X3, 3), (_3X3, 1)), + ((_3X3, 4), (_3X3, 0), (_3X3, 1), (_3X3, 5)), + ((_3X3, 4), (_3X3, 0), (_3X3, 3), (_3X3, 7)), + ((_3X3, 4), (_3X3, 2), (_3X3, 1), (_3X3, 3)), + ((_3X3, 4), (_3X3, 8), (_3X3, 5), (_3X3, 1)), + ((_3X3, 4), (_3X3, 6), (_3X3, 7), (_3X3, 5)), + + # L-shape + ((_3X3, 4), (_3X3, 2), (_3X3, 1), (_3X3, 7)), + ((_3X3, 4), (_3X3, 8), (_3X3, 5), (_3X3, 3)), + ((_3X3, 4), (_3X3, 6), (_3X3, 7), (_3X3, 1)), + ((_3X3, 4), (_3X3, 0), (_3X3, 3), (_3X3, 5)), + ((_3X3, 4), (_3X3, 0), (_3X3, 1), (_3X3, 7)), + ((_3X3, 4), (_3X3, 2), (_3X3, 5), (_3X3, 3)), + ((_3X3, 4), (_3X3, 8), (_3X3, 7), (_3X3, 1)), + ((_3X3, 4), (_3X3, 6), (_3X3, 3), (_3X3, 5)), + + # Square + ((_2X2, 0), (_2X2, 1), (_2X2, 3), (_2X2, 2)), + + # Hybrid scale + ((OVERLAP, 0), (_3X3, 2), (_3X3, 5), (_3X3, 8)), + ((OVERLAP, 1), (_3X3, 8), (_3X3, 7), (_3X3, 6)), + ((OVERLAP, 3), (_3X3, 6), (_3X3, 3), (_3X3, 0)), + ((OVERLAP, 2), (_3X3, 0), (_3X3, 1), (_3X3, 2)), + ((OVERLAP, 0), (_3X3, 6), (_3X3, 7), (_3X3, 8)), + ((OVERLAP, 1), (_3X3, 0), (_3X3, 3), (_3X3, 6)), + ((OVERLAP, 3), (_3X3, 2), (_3X3, 1), (_3X3, 0)), + ((OVERLAP, 2), (_3X3, 8), (_3X3, 5), (_3X3, 
2)), + ((OVERLAP, 0), (_3X3, 5), (_3X3, 8), (_3X3, 7)), + ((OVERLAP, 1), (_3X3, 7), (_3X3, 6), (_3X3, 3)), + ((OVERLAP, 3), (_3X3, 3), (_3X3, 0), (_3X3, 1)), + ((OVERLAP, 2), (_3X3, 1), (_3X3, 2), (_3X3, 5)), + +) + +# 5-patch configurations +patterns_5patch = ( + + # Cross + ((_3X3, 4), (_3X3, 1), (_3X3, 5), (_3X3, 7), (_3X3, 3)), + + # X-shape + ((_3X3, 4), (_3X3, 0), (_3X3, 2), (_3X3, 8), (_3X3, 6)), + + # T-shape + ((_3X3, 4), (_3X3, 0), (_3X3, 1), (_3X3, 2), (_3X3, 7)), + ((_3X3, 4), (_3X3, 2), (_3X3, 5), (_3X3, 8), (_3X3, 3)), + ((_3X3, 4), (_3X3, 8), (_3X3, 7), (_3X3, 6), (_3X3, 1)), + ((_3X3, 4), (_3X3, 6), (_3X3, 3), (_3X3, 0), (_3X3, 5)), + + # Z-shape + ((_3X3, 4), (_3X3, 0), (_3X3, 1), (_3X3, 7), (_3X3, 8)), + ((_3X3, 4), (_3X3, 2), (_3X3, 5), (_3X3, 3), (_3X3, 6)), + ((_3X3, 4), (_3X3, 8), (_3X3, 7), (_3X3, 1), (_3X3, 0)), + ((_3X3, 4), (_3X3, 6), (_3X3, 3), (_3X3, 5), (_3X3, 2)), + ((_3X3, 4), (_3X3, 0), (_3X3, 3), (_3X3, 5), (_3X3, 8)), + ((_3X3, 4), (_3X3, 2), (_3X3, 1), (_3X3, 7), (_3X3, 6)), + ((_3X3, 4), (_3X3, 8), (_3X3, 5), (_3X3, 3), (_3X3, 0)), + ((_3X3, 4), (_3X3, 6), (_3X3, 7), (_3X3, 1), (_3X3, 2)), + + # U-shape + ((_3X3, 4), (_3X3, 0), (_3X3, 3), (_3X3, 5), (_3X3, 2)), + ((_3X3, 4), (_3X3, 2), (_3X3, 1), (_3X3, 7), (_3X3, 8)), + ((_3X3, 4), (_3X3, 8), (_3X3, 5), (_3X3, 3), (_3X3, 6)), + ((_3X3, 4), (_3X3, 6), (_3X3, 7), (_3X3, 1), (_3X3, 0)), + + # V-shape + ((_3X3, 0), (_3X3, 4), (_3X3, 8), (_3X3, 7), (_3X3, 6)), + ((_3X3, 2), (_3X3, 4), (_3X3, 6), (_3X3, 3), (_3X3, 0)), + ((_3X3, 8), (_3X3, 4), (_3X3, 0), (_3X3, 1), (_3X3, 2)), + ((_3X3, 6), (_3X3, 4), (_3X3, 2), (_3X3, 5), (_3X3, 8)), + ((_3X3, 0), (_3X3, 4), (_3X3, 8), (_3X3, 5), (_3X3, 2)), + ((_3X3, 2), (_3X3, 4), (_3X3, 6), (_3X3, 7), (_3X3, 8)), + ((_3X3, 8), (_3X3, 4), (_3X3, 0), (_3X3, 3), (_3X3, 6)), + ((_3X3, 6), (_3X3, 4), (_3X3, 2), (_3X3, 1), (_3X3, 0)), + + # Hybrid scale + ((OVERLAP, 0), (_3X3, 2), (_3X3, 5), (_3X3, 6), (_3X3, 7)), + ((OVERLAP, 1), (_3X3, 8), (_3X3, 7), (_3X3, 0), (_3X3, 3)), + ((OVERLAP, 3), (_3X3, 6), (_3X3, 3), (_3X3, 2), (_3X3, 1)), + ((OVERLAP, 2), (_3X3, 0), (_3X3, 1), (_3X3, 8), (_3X3, 5)) + +) diff --git a/applications/selfsupervised/pretrain_siamese.py b/applications/selfsupervised/pretrain_siamese.py new file mode 100644 index 00000000000..a818f855126 --- /dev/null +++ b/applications/selfsupervised/pretrain_siamese.py @@ -0,0 +1,186 @@ +#!/usr/bin/env python3 +import functools +import operator +import os.path +import google.protobuf.text_format as txtf +import lbann +import modules +from util import str_list +import patch_generator + +def setup(num_patches=3, + mini_batch_size=512, + num_epochs=75, + learning_rate=0.005, + bn_statistics_group_size=2, + fc_data_layout='model_parallel', + warmup=True, + checkpoint_interval=None): + + # Data dimensions + patch_dims = patch_generator.patch_dims + num_labels = patch_generator.num_labels(num_patches) + + # Extract tensors from data sample + input = lbann.Input() + slice_points = [0] + for _ in range(num_patches): + patch_size = functools.reduce(operator.mul, patch_dims) + slice_points.append(slice_points[-1] + patch_size) + slice_points.append(slice_points[-1] + num_labels) + sample = lbann.Slice(input, slice_points=str_list(slice_points)) + patches = [lbann.Reshape(sample, dims=str_list(patch_dims)) + for _ in range(num_patches)] + labels = lbann.Identity(sample) + + # Siamese network + head_cnn = modules.ResNet(bn_statistics_group_size=bn_statistics_group_size) + heads = [head_cnn(patch) for patch in patches] + 
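# Note: head_cnn is a single module instance, so all Siamese heads share the same convolution and batch norm weights across patches. +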
heads_concat = lbann.Concatenation(heads) + + # Classification network + class_fc1 = modules.FcBnRelu(4096, + statistics_group_size=bn_statistics_group_size, + name='siamese_class_fc1', + data_layout=fc_data_layout) + class_fc2 = modules.FcBnRelu(4096, + statistics_group_size=bn_statistics_group_size, + name='siamese_class_fc2', + data_layout=fc_data_layout) + class_fc3 = lbann.modules.FullyConnectedModule(num_labels, + activation=lbann.Softmax, + name='siamese_class_fc3', + data_layout=fc_data_layout) + x = class_fc1(heads_concat) + x = class_fc2(x) + probs = class_fc3(x) + + # Setup objective function + cross_entropy = lbann.CrossEntropy([probs, labels]) + l2_reg_weights = set() + for l in lbann.traverse_layer_graph(input): + if type(l) == lbann.Convolution or type(l) == lbann.FullyConnected: + l2_reg_weights.update(l.weights) + l2_reg = lbann.L2WeightRegularization(weights=l2_reg_weights, scale=0.0002) + obj = lbann.ObjectiveFunction([cross_entropy, l2_reg]) + + # Setup model + metrics = [lbann.Metric(lbann.CategoricalAccuracy([probs, labels]), + name='accuracy', unit='%')] + callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer()] + if checkpoint_interval: + callbacks.append( + lbann.CallbackCheckpoint( + checkpoint_dir='ckpt', + checkpoint_epochs=5 + ) + ) + + # Learning rate schedules + if warmup: + callbacks.append( + lbann.CallbackLinearGrowthLearningRate( + target=learning_rate * mini_batch_size / 128, + num_epochs=5 + ) + ) + callbacks.append( + lbann.CallbackDropFixedLearningRate( + drop_epoch=list(range(0, 100, 15)), amt=0.25) + ) + + # Construct model + model = lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(input), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + + # Setup optimizer + opt = lbann.SGD(learn_rate=learning_rate, momentum=0.9) + # opt = lbann.Adam(learn_rate=learning_rate, beta1=0.9, beta2=0.999, eps=1e-8) + + # Setup data reader + data_reader = make_data_reader(num_patches) + + # Return experiment objects + return model, data_reader, opt + +def make_data_reader(num_patches): + message = lbann.reader_pb2.DataReader() + data_reader = message.reader.add() + data_reader.name = 'python' + data_reader.role = 'train' + data_reader.shuffle = True + data_reader.percent_of_data_to_use = 1.0 + data_reader.python.module = 'patch_generator' + data_reader.python.module_dir = os.path.dirname(os.path.realpath(__file__)) + data_reader.python.num_samples_function = 'num_samples' + if num_patches == 2: + data_reader.python.sample_function = 'get_sample_2patch' + data_reader.python.sample_dims_function = 'sample_dims_2patch' + if num_patches == 3: + data_reader.python.sample_function = 'get_sample_3patch' + data_reader.python.sample_dims_function = 'sample_dims_3patch' + if num_patches == 4: + data_reader.python.sample_function = 'get_sample_4patch' + data_reader.python.sample_dims_function = 'sample_dims_4patch' + if num_patches == 5: + data_reader.python.sample_function = 'get_sample_5patch' + data_reader.python.sample_dims_function = 'sample_dims_5patch' + return message + +if __name__ == "__main__": + import argparse + import lbann.contrib.args + import lbann.contrib.launcher + + # Command-line arguments + parser = argparse.ArgumentParser() + parser.add_argument( + '--job-name', action='store', default='lbann_siamese', type=str, + help='scheduler job name (default: lbann_siamese)', metavar='NAME') + parser.add_argument( + '--num-patches', action='store', default=3, type=int, + help='number of patches and Siamese heads (default: 3)', 
metavar='NUM') + lbann.contrib.args.add_scheduler_arguments(parser) + parser.add_argument( + '--mini-batch-size', action='store', default=512, type=int, + help='mini-batch size (default: 512)', metavar='NUM') + parser.add_argument( + '--num-epochs', action='store', default=75, type=int, + help='number of epochs (default: 75)', metavar='NUM') + parser.add_argument( + '--learning-rate', action='store', default=0.005, type=float, + help='learning rate (default: 0.005)', metavar='LR') + parser.add_argument( + '--bn-statistics-group-size', action='store', default=2, type=int, + help=('group size for batch norm statistics (default: 2)')) + parser.add_argument( + '--fc-data-layout', action='store', default='model_parallel', type=str, + help=('data layout for fully-connected layers ' + '(default: "model_parallel")')) + parser.add_argument( + '--warmup', action='store', default=True, type=bool, + help='use learning rate warmup (default: True)') + args = parser.parse_args() + + # Setup experiment + trainer = lbann.Trainer(mini_batch_size=args.mini_batch_size) + model, data_reader, opt = setup( + num_patches=args.num_patches, + mini_batch_size=args.mini_batch_size, + num_epochs=args.num_epochs, + learning_rate=args.learning_rate, + bn_statistics_group_size=args.bn_statistics_group_size, + fc_data_layout=args.fc_data_layout, + warmup=args.warmup, + ) + + # Run experiment + kwargs = lbann.contrib.args.get_scheduler_kwargs(args) + lbann.contrib.launcher.run( + trainer, model, data_reader, opt, + job_name = args.job_name, + **kwargs, + ) diff --git a/applications/selfsupervised/resnet.py b/applications/selfsupervised/resnet.py new file mode 100644 index 00000000000..2ab7911f53e --- /dev/null +++ b/applications/selfsupervised/resnet.py @@ -0,0 +1,498 @@ +"""Copy-pasted from lbann.models.resnet.""" +import lbann +import lbann.modules + +# ============================================== +# Helper modules +# ============================================== + +class ConvBNRelu(lbann.modules.Module): + """Convolution -> Batch normalization -> ReLU + + Basic unit for ResNets. Assumes image data in NCHW format. + + """ + + def __init__(self, out_channels, kernel_size, stride, padding, + bn_zero_init, bn_statistics_group_size, + relu, name): + """Initialize ConvBNRelu module. + + Args: + out_channels (int): Number of output channels, i.e. number + of convolution filters. + kernel_size (int): Size of convolution kernel. + stride (int): Convolution stride. + padding (int): Convolution padding. + bn_zero_init (bool): Zero-initialize batch normalization + scale. + bn_statistics_group_size (int): Group size for aggregating + batch normalization statistics. + relu (bool): Apply ReLU activation. + name (str): Module name. 
+ + """ + super().__init__() + self.name = name + self.instance = 0 + + # Initialize convolution + self.conv = lbann.modules.Convolution2dModule( + out_channels, kernel_size, + stride=stride, padding=padding, + bias=False, + name=self.name + '_conv') + + # Initialize batch normalization + bn_scale_init = 0.0 if bn_zero_init else 1.0 + bn_scale = lbann.Weights( + initializer=lbann.ConstantInitializer(value=bn_scale_init), + name=self.name + '_bn_scale') + bn_bias = lbann.Weights( + initializer=lbann.ConstantInitializer(value=0.0), + name=self.name + '_bn_bias') + self.bn_weights = [bn_scale, bn_bias] + self.bn_statistics_group_size = bn_statistics_group_size + + # Initialize ReLU + self.relu = relu + + def forward(self, x): + self.instance += 1 + conv = self.conv(x) + bn = lbann.BatchNormalization( + conv, weights=self.bn_weights, + statistics_group_size=(-1 if self.bn_statistics_group_size == 0 + else self.bn_statistics_group_size), + name='{0}_bn_instance{1}'.format(self.name,self.instance)) + if self.relu: + return lbann.Relu( + bn, name='{0}_relu_instance{1}'.format(self.name,self.instance)) + else: + return bn + +class BasicBlock(lbann.modules.Module): + """Residual block without bottlenecking. + + The number of output channels is the same as the number of + internal channels. Assumes image data in NCHW format. This is the + residual block used in ResNet-{18,34}. + + """ + + def __init__(self, in_channels, mid_channels, + downsample, zero_init_residual, + bn_statistics_group_size, name, width=1): + """Initialize residual block. + + Args: + in_channels (int): Number of input channels. + mid_channels (int): Number of channels in residual branch. + downsample (bool): Perform spatial downsampling (by a + factor of 2 in each spatial dimension). + zero_init_residual (bool): Zero-initialize the scale in + the final batch normalization in the residual branch. + bn_statistics_group_size (int): Group size for aggregating + batch normalization statistics. + name (str): Module name. + width (float, optional): Width growth factor for 3x3 + convolutions. + + """ + super().__init__() + self.name = name + self.instance = 0 + mid_channels = int(mid_channels * width) + self.out_channels = mid_channels + + # Skip connection + if downsample: + self.branch1 = ConvBNRelu(self.out_channels, 1, 2, 0, + False, bn_statistics_group_size, + False, self.name + '_branch1') + elif in_channels != self.out_channels: + self.branch1 = ConvBNRelu(self.out_channels, 1, 1, 0, + False, bn_statistics_group_size, + False, self.name + '_branch1') + else: + self.branch1 = None + + # Residual branch + self.branch2a = ConvBNRelu(mid_channels, 3, + (2 if downsample else 1), 1, + False, bn_statistics_group_size, + True, self.name + '_branch2a') + self.branch2b = ConvBNRelu(self.out_channels, 3, 1, 1, + zero_init_residual, + bn_statistics_group_size, + False, self.name + '_branch2b') + + def forward(self, x): + self.instance += 1 + y1 = self.branch1(x) if self.branch1 else x + y2 = self.branch2b(self.branch2a(x)) + z = lbann.Add([y1, y2], + name='{0}_sum_instance{1}'.format(self.name,self.instance)) + return lbann.Relu(z, name='{0}_relu_instance{1}'.format(self.name,self.instance)) + +class BottleneckBlock(lbann.modules.Module): + """Residual block with bottlenecking. + + The number of output channels is four times the number of internal + channels. Assumes image data in NCHW format. This is the residual + block used in ResNet-{50,101,152}. 
+ + """ + + def __init__(self, in_channels, mid_channels, + downsample, zero_init_residual, + bn_statistics_group_size, name, width=1): + """Initialize residual block. + + Args: + in_channels (int): Number of input channels. + mid_channels (int): Number of channels in residual branch. + downsample (bool): Perform spatial downsampling (by a + factor of 2 in each spatial dimension). + zero_init_residual (bool): Zero-initialize the scale in + the final batch normalization in the residual branch. + bn_statistics_group_size (int): Group size for aggregating + batch normalization statistics. + name (str): Module name. + width (float, optional): Width growth factor for 3x3 + convolutions. + + """ + super().__init__() + self.name = name + self.instance = 0 + self.out_channels = 4 * mid_channels + # Width factor does not grow the output channel size. + mid_channels = int(mid_channels * width) + + # Skip connection + if downsample: + self.branch1 = ConvBNRelu(self.out_channels, 1, 2, 0, + False, bn_statistics_group_size, + False, self.name + '_branch1') + elif in_channels != self.out_channels: + self.branch1 = ConvBNRelu(self.out_channels, 1, 1, 0, + False, bn_statistics_group_size, + False, self.name + '_branch1') + else: + self.branch1 = None + + # Residual branch + self.branch2a = ConvBNRelu(mid_channels, 1, 1, 0, + False, bn_statistics_group_size, + True, self.name + '_branch2a') + self.branch2b = ConvBNRelu(mid_channels, 3, + (2 if downsample else 1), 1, + False, bn_statistics_group_size, + True, self.name + '_branch2b') + self.branch2c = ConvBNRelu(self.out_channels, 1, 1, 0, + zero_init_residual, + bn_statistics_group_size, + False, self.name + '_branch2c') + + def forward(self, x): + self.instance += 1 + y1 = self.branch1(x) if self.branch1 else x + y2 = self.branch2c(self.branch2b(self.branch2a(x))) + z = lbann.Add([y1, y2], + name='{0}_sum_instance{1}'.format(self.name,self.instance)) + return lbann.Relu(z, name='{0}_relu_instance{1}'.format(self.name,self.instance)) + +# ============================================== +# ResNet modules +# ============================================== + +class ResNet(lbann.modules.Module): + """Residual neural network. + + A ResNet is comprised of residual blocks, which are small + convolutional networks with a skip connection. These blocks are + grouped into "layers" (this is a horribly overloaded term, but we + are following the common usage). At the first block in each layer + (except the first), the spatial dimensions are all downsampled by + a factor of 2. + + This does not include the fully-connected network that is commonly + applied following the convolutional network. Input data is assumed + to be image data in NCHW format. + + See: + Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. "Deep + residual learning for image recognition." In Proceedings of + the IEEE Conference on Computer Vision and Pattern + Recognition, pp. 770-778. 2016. + + """ + + + def __init__(self, block, + layer_sizes, layer_channels, + zero_init_residual, bn_statistics_group_size, + name, width=1): + """Initialize ResNet. + + Args: + block (type): Residual block type, which should be a + `lbann.modules.Module`. + layer_sizes (`Iterable` containing `int`s): Number of + blocks in each ResNet layer. + layer_channels (`Iterable` containing `int`s): Number of + internal channels in each ResNet layer. + zero_init_residual (bool): Whether to initialize the final + batch normalization in residual branches with zeros. 
+ bn_statistics_group_size (int): Group size for aggregating + batch normalization statistics. + name (str): Module name. + width (float, optional): Width growth factor. + + """ + super().__init__() + self.name = name + self.instance = 0 + self.conv1 = ConvBNRelu(layer_channels[0], 7, 2, 3, + False, bn_statistics_group_size, + True, self.name + '_conv1') + self.blocks = [] + for layer in range(len(layer_sizes)): + mid_channels = layer_channels[layer] + for i in range(layer_sizes[layer]): + in_channels = (self.blocks[-1].out_channels + if self.blocks + else mid_channels) + downsample = (i == 0 and layer > 0) + b = block(in_channels, mid_channels, + downsample, zero_init_residual, + bn_statistics_group_size, + '{0}_layer{1}_block{2}'.format(self.name, layer, i), + width=width) + self.blocks.append(b) + + def forward(self, x): + self.instance += 1 + x = self.conv1(x) + x = lbann.Pooling(x, num_dims=2, has_vectors=False, + pool_dims_i=3, pool_pads_i=1, pool_strides_i=2, + pool_mode='max', + name='{0}_pool1_instance{1}'.format(self.name,self.instance)) + for b in self.blocks: + x = b(x) + return x + +class ResNet18(ResNet): + """ResNet-18 neural network. + + This does not include the fully-connected network that is commonly + applied following the convolutional network. Input data is assumed + to be image data in NCHW format. + + See: + Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. "Deep + residual learning for image recognition." In Proceedings of + the IEEE Conference on Computer Vision and Pattern + Recognition, pp. 770-778. 2016. + + """ + + global_count = 0 # Static counter, used for default names + + def __init__(self, + zero_init_residual=True, + bn_statistics_group_size=1, + name=None, + width=1): + """Initialize ResNet-18. + + Args: + zero_init_residual (bool, optional): Whether to initialize + the final batch normalization in residual branches + with zeros. + bn_statistics_group_size (str, optional): Group size for + aggregating batch normalization statistics. + name (str, optional): Module name + (default: 'resnet18_module') + width (float, optional): Width growth factor. + + """ + ResNet18.global_count += 1 + if name is None: + name = 'resnet18_module{0}'.format(ResNet18.global_count) + super().__init__(BasicBlock, + (2,2,2,2), (64,128,256,512), + zero_init_residual, bn_statistics_group_size, + name, width=width) + +class ResNet34(ResNet): + """ResNet-34 neural network. + + This does not include the fully-connected network that is commonly + applied following the convolutional network. Input data is assumed + to be image data in NCHW format. + + See: + Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. "Deep + residual learning for image recognition." In Proceedings of + the IEEE Conference on Computer Vision and Pattern + Recognition, pp. 770-778. 2016. + + """ + + global_count = 0 # Static counter, used for default names + + def __init__(self, + zero_init_residual=True, + bn_statistics_group_size=1, + name=None, + width=1): + """Initialize ResNet-34. + + Args: + zero_init_residual (bool, optional): Whether to initialize + the final batch normalization in residual branches + with zeros. + bn_statistics_group_size (str, optional): Group size for + aggregating batch normalization statistics. + name (str, optional): Module name + (default: 'resnet34_module') + width (float, optional): Width growth factor. 
+ + """ + ResNet34.global_count += 1 + if name is None: + name = 'resnet34_module{0}'.format(ResNet34.global_count) + super().__init__(BasicBlock, + (3,4,6,3), (64,128,256,512), + zero_init_residual, bn_statistics_group_size, + name, width=width) + +class ResNet50(ResNet): + """ResNet-50 neural network. + + This does not include the fully-connected network that is commonly + applied following the convolutional network. Input data is assumed + to be image data in NCHW format. + + See: + Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. "Deep + residual learning for image recognition." In Proceedings of + the IEEE Conference on Computer Vision and Pattern + Recognition, pp. 770-778. 2016. + + """ + + global_count = 0 # Static counter, used for default names + + def __init__(self, + zero_init_residual=True, + bn_statistics_group_size=1, + name=None, + width=1): + """Initialize ResNet-50. + + Args: + zero_init_residual (bool, optional): Whether to initialize + the final batch normalization in residual branches + with zeros. + bn_statistics_group_size (str, optional): Group size for + aggregating batch normalization statistics. + name (str, optional): Module name + (default: 'resnet50_module') + width (float, optional): Width growth factor. + + """ + ResNet50.global_count += 1 + if name is None: + name = 'resnet50_module{0}'.format(ResNet50.global_count) + super().__init__(BottleneckBlock, + (3,4,6,3), (64,128,256,512), + zero_init_residual, bn_statistics_group_size, + name, width=width) + +class ResNet101(ResNet): + """ResNet-101 neural network. + + This does not include the fully-connected network that is commonly + applied following the convolutional network. Input data is assumed + to be image data in NCHW format. + + See: + Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. "Deep + residual learning for image recognition." In Proceedings of + the IEEE Conference on Computer Vision and Pattern + Recognition, pp. 770-778. 2016. + + """ + + global_count = 0 # Static counter, used for default names + + def __init__(self, + zero_init_residual=True, + bn_statistics_group_size=1, + name=None, width=1): + """Initialize ResNet-101. + + Args: + zero_init_residual (bool, optional): Whether to initialize + the final batch normalization in residual branches + with zeros. + bn_statistics_group_size (str, optional): Group size for + aggregating batch normalization statistics. + name (str, optional): Module name + (default: 'resnet101_module') + width (float, optional): Width growth factor. + + """ + ResNet101.global_count += 1 + if name is None: + name = 'resnet101_module{0}'.format(ResNet101.global_count) + super().__init__(BottleneckBlock, + (3,4,23,3), (64,128,256,512), + zero_init_residual, bn_statistics_group_size, + name, width=width) + +class ResNet152(ResNet): + """ResNet-152 neural network. + + This does not include the fully-connected network that is commonly + applied following the convolutional network. Input data is assumed + to be image data in NCHW format. + + See: + Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. "Deep + residual learning for image recognition." In Proceedings of + the IEEE Conference on Computer Vision and Pattern + Recognition, pp. 770-778. 2016. + + """ + + global_count = 0 # Static counter, used for default names + + def __init__(self, + zero_init_residual=True, + bn_statistics_group_size=1, + name=None, + width=1): + """Initialize ResNet-152. 
+ + Args: + zero_init_residual (bool, optional): Whether to initialize + the final batch normalization in residual branches + with zeros. + bn_statistics_group_size (int, optional): Group size for + aggregating batch normalization statistics. + name (str, optional): Module name + (default: 'resnet152_module') + width (float, optional): Width growth factor. + + """ + ResNet152.global_count += 1 + if name is None: + name = 'resnet152_module{0}'.format(ResNet152.global_count) + super().__init__(BottleneckBlock, + (3,8,36,3), (64,128,256,512), + zero_init_residual, bn_statistics_group_size, + name, width=width) diff --git a/applications/selfsupervised/util.py b/applications/selfsupervised/util.py new file mode 100644 index 00000000000..7b4f16dcd4e --- /dev/null +++ b/applications/selfsupervised/util.py @@ -0,0 +1,27 @@ +import os +import os.path +import datetime + +def str_list(l): + """Convert iterable to a space-separated string.""" + return ' '.join([str(i) for i in l]) + +def make_experiment_dir(job_name=None): + if job_name is None: + job_name = 'lbann_siamese' + if 'LBANN_EXPERIMENT_DIR' in os.environ: + experiment_dir = os.environ['LBANN_EXPERIMENT_DIR'] + else: + experiment_dir = os.getcwd() + timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + experiment_dir = os.path.join(experiment_dir, + '{}_{}'.format(timestamp, job_name)) + i = 1 + while os.path.lexists(experiment_dir): + i += 1 + experiment_dir = os.path.join( + os.path.dirname(experiment_dir), + '{}_{}_{}'.format(timestamp, job_name, i)) + experiment_dir = os.path.abspath(experiment_dir) + os.makedirs(experiment_dir, exist_ok=True) + return experiment_dir diff --git a/applications/vision/README.md b/applications/vision/README.md new file mode 100644 index 00000000000..fc05737bf67 --- /dev/null +++ b/applications/vision/README.md @@ -0,0 +1,21 @@ +# Example models for computer vision + +This directory contains LBANN implementations of widely-used vision +models. They are intended to validate and benchmark LBANN's vision +functionality, and are also suitable as pedagogical tools for using +LBANN. + +## LeNet + +`lenet.py` trains a LeNet model on MNIST data. It is a simple script +intended to demonstrate LBANN's Python API. It calls helper functions +in `data/mnist/__init__.py` to download MNIST data and construct MNIST +data readers. + +## ImageNet models + +`alexnet.py`, `resnet.py`, and `densenet.py` are primarily used for +performance benchmarks and scaling studies. They use LLNL-specific +features, and the helper functions in `data/imagenet/__init__.py` +assume that the user is on an LLNL LC system and belongs to the +`brainusr` group. diff --git a/applications/vision/alexnet.py b/applications/vision/alexnet.py new file mode 100644 index 00000000000..c836e90adbd --- /dev/null +++ b/applications/vision/alexnet.py @@ -0,0 +1,81 @@ +import argparse +import lbann +import lbann.models +import lbann.contrib.args +import lbann.contrib.launcher +import data.imagenet + +# Command-line arguments +desc = ('Construct and run AlexNet on ImageNet-1K data.
' + 'Running the experiment is only supported on LC systems.') +parser = argparse.ArgumentParser(description=desc) +lbann.contrib.args.add_scheduler_arguments(parser) +parser.add_argument( + '--job-name', action='store', default='lbann_alexnet', type=str, + help='scheduler job name (default: lbann_alexnet)') +parser.add_argument( + '--mini-batch-size', action='store', default=256, type=int, + help='mini-batch size (default: 256)', metavar='NUM') +parser.add_argument( + '--num-epochs', action='store', default=100, type=int, + help='number of epochs (default: 100)', metavar='NUM') +parser.add_argument( + '--num-classes', action='store', default=1000, type=int, + help='number of ImageNet classes (default: 1000)', metavar='NUM') +lbann.contrib.args.add_optimizer_arguments(parser) +parser.add_argument( + '--setup_only', action='store_true', + help='setup LBANN experiment without running it') +args = parser.parse_args() + +# Due to a data reader limitation, the actual model realization must be +# hardcoded to 1000 labels for ImageNet. +imagenet_labels = 1000 + +# Construct layer graph +input_ = lbann.Input() +images = lbann.Identity(input_) +labels = lbann.Identity(input_) +preds = lbann.models.AlexNet(imagenet_labels)(images) +probs = lbann.Softmax(preds) +cross_entropy = lbann.CrossEntropy(probs, labels) +top1 = lbann.CategoricalAccuracy(probs, labels) +top5 = lbann.TopKCategoricalAccuracy(probs, labels, k=5) +layers = list(lbann.traverse_layer_graph(input_)) + +# Setup objective function +weights = set() +for l in layers: + weights.update(l.weights) +l2_reg = lbann.L2WeightRegularization(weights=weights, scale=5e-4) +obj = lbann.ObjectiveFunction([cross_entropy, l2_reg]) + +# Setup model +metrics = [lbann.Metric(top1, name='top-1 accuracy', unit='%'), + lbann.Metric(top5, name='top-5 accuracy', unit='%')] +callbacks = [lbann.CallbackPrint(), + lbann.CallbackTimer(), + lbann.CallbackDropFixedLearningRate( + drop_epoch=[20,40,60], amt=0.1)] +model = lbann.Model(args.num_epochs, + layers=layers, + weights=weights, + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +# Setup optimizer +opt = lbann.contrib.args.create_optimizer(args) + +# Setup data reader +data_reader = data.imagenet.make_data_reader(num_classes=args.num_classes) + +# Setup trainer +trainer = lbann.Trainer(mini_batch_size=args.mini_batch_size) + +# Run experiment +kwargs = lbann.contrib.args.get_scheduler_kwargs(args) +lbann.contrib.launcher.run(trainer, model, data_reader, opt, + job_name=args.job_name, + setup_only=args.setup_only, + **kwargs) diff --git a/applications/vision/data/__init__.py b/applications/vision/data/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/applications/vision/data/cifar10/__init__.py b/applications/vision/data/cifar10/__init__.py new file mode 100644 index 00000000000..9fa71d684ca --- /dev/null +++ b/applications/vision/data/cifar10/__init__.py @@ -0,0 +1,28 @@ +import os +import os.path + +import google.protobuf.text_format +import lbann +import lbann.contrib.lc.paths + +def make_data_reader(num_classes=10): + + # Load Protobuf message from file + current_dir = os.path.dirname(os.path.realpath(__file__)) + protobuf_file = os.path.join(current_dir, 'data_reader.prototext') + message = lbann.lbann_pb2.LbannPB() + with open(protobuf_file, 'r') as f: + google.protobuf.text_format.Merge(f.read(), message) + message = message.data_reader + + # Check if data paths are accessible + data_dir = lbann.contrib.lc.paths.cifar10_dir() + + if not 
os.path.isdir(data_dir): + raise FileNotFoundError('could not access {}'.format(data_dir)) + + # Set paths + message.reader[0].data_filedir = data_dir + message.reader[1].data_filedir = data_dir + + return message diff --git a/applications/vision/data/cifar10/data_reader.prototext b/applications/vision/data/cifar10/data_reader.prototext new file mode 100644 index 00000000000..2867c622231 --- /dev/null +++ b/applications/vision/data/cifar10/data_reader.prototext @@ -0,0 +1,43 @@ +data_reader { + reader { + name: "cifar10" + role: "train" + shuffle: true + data_filedir: "path/to/cifar10/data" + validation_percent: 0.1 + absolute_sample_count: 0 + percent_of_data_to_use: 1.0 + + transforms { + horizontal_flip { + p: 0.5 + } + } + transforms { + normalize_to_lbann_layout { + means: "0.44653 0.48216 0.4914" + stddevs: "0.26159 0.24349 0.24703" + } + } + } + reader { + name: "cifar10" + role: "test" + shuffle: true + data_filedir: "path/to/cifar10/data" + absolute_sample_count: 0 + percent_of_data_to_use: 1.0 + + transforms { + horizontal_flip { + p: 0.5 + } + } + transforms { + normalize_to_lbann_layout { + means: "0.44653 0.48216 0.4914" + stddevs: "0.26159 0.24349 0.24703" + } + } + } +} diff --git a/applications/vision/data/imagenet/__init__.py b/applications/vision/data/imagenet/__init__.py new file mode 100644 index 00000000000..dc3d46552c4 --- /dev/null +++ b/applications/vision/data/imagenet/__init__.py @@ -0,0 +1,56 @@ +import os +import os.path + +import google.protobuf.text_format +import lbann +import lbann.contrib.launcher + +def make_data_reader(num_classes=1000): + + # Load Protobuf message from file + current_dir = os.path.dirname(os.path.realpath(__file__)) + protobuf_file = os.path.join(current_dir, 'data_reader.prototext') + message = lbann.lbann_pb2.LbannPB() + with open(protobuf_file, 'r') as f: + google.protobuf.text_format.Merge(f.read(), message) + message = message.data_reader + + # Paths to ImageNet data + # Note: Paths are only known for some compute centers + compute_center = lbann.contrib.launcher.compute_center() + if compute_center == 'lc': + from lbann.contrib.lc.paths import imagenet_dir, imagenet_labels + train_data_dir = imagenet_dir(data_set='train', + num_classes=num_classes) + train_label_file = imagenet_labels(data_set='train', + num_classes=num_classes) + test_data_dir = imagenet_dir(data_set='val', + num_classes=num_classes) + test_label_file = imagenet_labels(data_set='val', + num_classes=num_classes) + elif compute_center == 'nersc': + from lbann.contrib.nersc.paths import imagenet_dir, imagenet_labels + train_data_dir = imagenet_dir(data_set='train') + train_label_file = imagenet_labels(data_set='train') + test_data_dir = imagenet_dir(data_set='val') + test_label_file = imagenet_labels(data_set='val') + else: + raise RuntimeError(f'ImageNet data paths are unknown for current compute center ({compute_center})') + + # Check that data paths are accessible + if not os.path.isdir(train_data_dir): + raise FileNotFoundError('could not access {}'.format(train_data_dir)) + if not os.path.isfile(train_label_file): + raise FileNotFoundError('could not access {}'.format(train_label_file)) + if not os.path.isdir(test_data_dir): + raise FileNotFoundError('could not access {}'.format(test_data_dir)) + if not os.path.isfile(test_label_file): + raise FileNotFoundError('could not access {}'.format(test_label_file)) + + # Set paths + message.reader[0].data_filedir = train_data_dir + message.reader[0].data_filename = train_label_file + message.reader[1].data_filedir = 
test_data_dir + message.reader[1].data_filename = test_label_file + + return message diff --git a/applications/vision/data/imagenet/data_reader.prototext b/applications/vision/data/imagenet/data_reader.prototext new file mode 100644 index 00000000000..3810e28046c --- /dev/null +++ b/applications/vision/data/imagenet/data_reader.prototext @@ -0,0 +1,61 @@ +data_reader { + reader { + name: "imagenet" + role: "train" + shuffle: true + data_filedir: "path/to/ILSVRC2012/train" + data_filename: "path/to/ILSVRC2012/labels/train.txt" + validation_percent: 0.0 + percent_of_data_to_use: 1.0 + num_labels: 1000 + + transforms { + random_resized_crop { + height: 224 + width: 224 + } + } + transforms { + horizontal_flip { + p: 0.5 + } + } + transforms { + colorize {} + } + transforms { + normalize_to_lbann_layout { + means: "0.406 0.456 0.485" + stddevs: "0.225 0.224 0.229" + } + } + } + + reader { + name: "imagenet" + role: "validate" + shuffle: true + data_filedir: "path/to/ILSVRC2012/val" + data_filename: "path/to/ILSVRC2012/labels/val.txt" + percent_of_data_to_use: 1.0 + num_labels: 1000 + + transforms { + resized_center_crop { + height: 256 + width: 256 + crop_height: 224 + crop_width: 224 + } + } + transforms { + colorize {} + } + transforms { + normalize_to_lbann_layout { + means: "0.406 0.456 0.485" + stddevs: "0.225 0.224 0.229" + } + } + } +} diff --git a/applications/vision/data/mnist/.gitignore b/applications/vision/data/mnist/.gitignore new file mode 100644 index 00000000000..10c191aa77f --- /dev/null +++ b/applications/vision/data/mnist/.gitignore @@ -0,0 +1,5 @@ +*.gz +train-images-idx3-ubyte +train-labels-idx1-ubyte +t10k-images-idx3-ubyte +t10k-labels-idx1-ubyte diff --git a/applications/vision/data/mnist/__init__.py b/applications/vision/data/mnist/__init__.py new file mode 100644 index 00000000000..271ccf0f61e --- /dev/null +++ b/applications/vision/data/mnist/__init__.py @@ -0,0 +1,59 @@ +import gzip +import os +import os.path +import urllib.request + +import google.protobuf.text_format +import lbann + +# Paths +data_dir = os.path.dirname(os.path.realpath(__file__)) + +def download_data(): + """Download MNIST data files, if needed. + + Data files are downloaded from http://yann.lecun.com/exdb/mnist/ + and uncompressed. Does nothing if the files already exist. + + """ + + # MNIST data files and associated URLs + urls = { + 'train-images-idx3-ubyte': 'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz', + 'train-labels-idx1-ubyte': 'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz', + 't10k-images-idx3-ubyte': 'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz', + 't10k-labels-idx1-ubyte': 'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz', + } + + # Download and uncompress MNIST data files, if needed + for data_file, url in urls.items(): + data_file = os.path.join(data_dir, data_file) + compressed_file = data_file + '.gz' + if not os.path.isfile(data_file): + urllib.request.urlretrieve(url, filename=compressed_file) + with gzip.open(compressed_file, 'rb') as in_file: + with open(data_file, 'wb') as out_file: + out_file.write(in_file.read()) + +def make_data_reader(): + """Make Protobuf message for MNIST data reader. + + MNIST data is downloaded if needed. 
+ + """ + + # Download MNIST data files + download_data() + + # Load Protobuf message from file + protobuf_file = os.path.join(data_dir, 'data_reader.prototext') + message = lbann.lbann_pb2.LbannPB() + with open(protobuf_file, 'r') as f: + google.protobuf.text_format.Merge(f.read(), message) + message = message.data_reader + + # Set paths + for reader in message.reader: + reader.data_filedir = data_dir + + return message diff --git a/applications/vision/data/mnist/data_reader.prototext b/applications/vision/data/mnist/data_reader.prototext new file mode 100644 index 00000000000..61c3b32cf42 --- /dev/null +++ b/applications/vision/data/mnist/data_reader.prototext @@ -0,0 +1,30 @@ +data_reader { + reader { + name: "mnist" + role: "train" + shuffle: true + data_filedir: "lbann/applications/vision/data/mnist" + data_filename: "train-images-idx3-ubyte" + label_filename: "train-labels-idx1-ubyte" + validation_percent: 0.1 + percent_of_data_to_use: 1.0 + transforms { + scale { + scale: 0.003921568627 # 1/255 + } + } + } + reader { + name: "mnist" + role: "test" + data_filedir: "lbann/applications/vision/data/mnist" + data_filename: "t10k-images-idx3-ubyte" + label_filename: "t10k-labels-idx1-ubyte" + percent_of_data_to_use: 1.0 + transforms { + scale { + scale: 0.003921568627 # 1/255 + } + } + } +} diff --git a/applications/vision/densenet.py b/applications/vision/densenet.py new file mode 100644 index 00000000000..7e70c4d23a0 --- /dev/null +++ b/applications/vision/densenet.py @@ -0,0 +1,470 @@ +import argparse +import lbann +import lbann.contrib.args +import lbann.contrib.launcher +import data.imagenet + +LOG = True + + +def log(string): + if LOG: + print(string) + + +# DenseNet ##################################################################### +# See src/proto/lbann.proto for possible functions to call. +# See PyTorch DenseNet: +# https://github.com/pytorch/vision/blob/master/torchvision/models/densenet.py +# See "Densely Connected Convolutional Networks" by Huang et. al p.4 +def densenet(statistics_group_size, + version, + cumulative_layer_num, + images_node + ): + if version == 121: + growth_rate = 32 # k in the paper + layers_per_block = (6, 12, 24, 16) + num_initial_features = 64 + elif version == 161: + growth_rate = 48 # k in the paper + layers_per_block = (96, 48, 36, 24) + num_initial_features = 96 + else: + raise Exception('Invalid version={v}.'.format(v=version)) + batch_norm_size = 4 + + parent_node, cumulative_layer_num = initial_layer( + statistics_group_size, + cumulative_layer_num, images_node, + num_initial_features) + num_features = num_initial_features + # Start counting dense blocks at 1. + for current_block_num, num_layers in enumerate(layers_per_block, 1): + parent_nodes, cumulative_layer_num = dense_block( + statistics_group_size, + cumulative_layer_num, + parent_node, + batch_norm_size=batch_norm_size, + current_block_num=current_block_num, + growth_rate=growth_rate, + num_layers=num_layers, + num_initial_channels=num_initial_features + ) + # num_features += num_layers * growth_rate + for node in parent_nodes[1:]: + num_features += node.num_output_channels + parent_node = lbann.Concatenation(parent_nodes) + cumulative_layer_num += 1 + log('densenet Concatenation. 
cumulative_layer_num={n}'.format( + b=current_block_num, n=cumulative_layer_num)) + if current_block_num != len(layers_per_block): + parent_node, cumulative_layer_num = transition_layer( + statistics_group_size, + current_block_num, + cumulative_layer_num, + parent_node, + # In Python 3, this is integer division. + num_output_channels=num_features//2, + ) + num_features //= 2 + + batch_normalization_node = standard_batchnorm(statistics_group_size, + parent_node) + cumulative_layer_num += 1 + log('densenet BatchNormalization. cumulative_layer_num={n}'.format( + b=current_block_num, n=cumulative_layer_num)) + + relu_node = lbann.Relu(batch_normalization_node) + cumulative_layer_num += 1 + log('densenet Relu. cumulative_layer_num={n}'.format( + b=current_block_num, n=cumulative_layer_num)) + + probs = classification_layer( + cumulative_layer_num, + relu_node + ) + return probs + + +def initial_layer(statistics_group_size, + cumulative_layer_num, + images_node, + num_initial_channels + ): + # 7x7 conv, stride 2 + convolution_node = lbann.Convolution( + images_node, + conv_dims_i=7, + conv_pads_i=3, + conv_strides_i=2, + has_bias=False, + num_dims=2, + num_output_channels=num_initial_channels + ) + cumulative_layer_num += 1 + log('initial_layer Convolution. cumulative_layer_num={n}'.format( + n=cumulative_layer_num)) + + batch_normalization_node = standard_batchnorm(statistics_group_size, + convolution_node) + cumulative_layer_num += 1 + log('initial_layer BatchNormalization. cumulative_layer_num={n}'.format( + n=cumulative_layer_num)) + + relu_node = lbann.Relu(batch_normalization_node) + cumulative_layer_num += 1 + log('initial_layer Relu. cumulative_layer_num={n}'.format( + n=cumulative_layer_num)) + + # 3x3 max pool, stride 2 + pooling_node = lbann.Pooling( + relu_node, + num_dims=2, + pool_dims_i=3, + pool_mode='max', + pool_pads_i=1, + pool_strides_i=2 + ) + cumulative_layer_num += 1 + log('initial_layer Pooling. cumulative_layer_num={n}'.format( + n=cumulative_layer_num)) + + return pooling_node, cumulative_layer_num + + +def standard_batchnorm(statistics_group_size, parent_node): + return lbann.BatchNormalization( + parent_node, + bias_init=0.0, + decay=0.9, + epsilon=1e-5, + scale_init=1.0, + statistics_group_size=statistics_group_size + ) + + +def dense_block(statistics_group_size, + cumulative_layer_num, + parent_node, + batch_norm_size, + current_block_num, + growth_rate, + num_layers, + num_initial_channels + ): + parent_nodes = [parent_node] + # Start counting dense layers at 1. + for current_layer_num in range(1, num_layers + 1): + # channels from before block + (each dense layer has k=growth_rate channels) + num_input_channels = num_initial_channels + (current_layer_num - 1) * growth_rate + print('num_input_channels={c}'.format(c=num_input_channels)) + parent_node, cumulative_layer_num = dense_layer( + statistics_group_size, + current_block_num, + current_layer_num, + cumulative_layer_num, + parent_nodes, + batch_norm_size=batch_norm_size, + growth_rate=growth_rate + ) + parent_nodes.append(parent_node) + return parent_nodes, cumulative_layer_num + + +def dense_layer(statistics_group_size, + current_block_num, + current_layer_num, + cumulative_layer_num, + parent_nodes, + batch_norm_size, + growth_rate + ): + concatenation_node = lbann.Concatenation(parent_nodes) + cumulative_layer_num += 1 + log('dense_block={b} dense_layer={l} Concatenation. 
cumulative_layer_num={n}'.format( + b=current_block_num, l=current_layer_num, n=cumulative_layer_num)) + conv_block_1_node, cumulative_layer_num = conv_block( + statistics_group_size, + current_block_num, + current_layer_num, + cumulative_layer_num, + concatenation_node, + conv_dims_i=1, + conv_pads_i=0, + num_output_channels=batch_norm_size * growth_rate + ) + conv_block_2_node, cumulative_layer_num = conv_block( + statistics_group_size, + current_block_num, + current_layer_num, + cumulative_layer_num, + conv_block_1_node, + conv_dims_i=3, + conv_pads_i=1, + num_output_channels=growth_rate + ) + return conv_block_2_node, cumulative_layer_num + + +def conv_block(statistics_group_size, + current_block_num, + current_layer_num, + cumulative_layer_num, + parent_node, + conv_dims_i, + conv_pads_i, + num_output_channels + ): + batch_normalization_node = standard_batchnorm(statistics_group_size, + parent_node) + cumulative_layer_num += 1 + log('dense_block={b} dense_layer={l} BatchNormalization. cumulative_layer_num={n}'.format( + b=current_block_num, l=current_layer_num, n=cumulative_layer_num)) + + relu_node = lbann.Relu(batch_normalization_node) + cumulative_layer_num += 1 + log( + 'dense_block={b} dense_layer={l} Relu. cumulative_layer_num={n}'.format( + b=current_block_num, l=current_layer_num, n=cumulative_layer_num)) + + convolution_node = lbann.Convolution( + relu_node, + conv_dims_i=conv_dims_i, + conv_pads_i=conv_pads_i, + conv_strides_i=1, + has_bias=False, + num_dims=2, + num_output_channels=num_output_channels + ) + cumulative_layer_num += 1 + log('dense_block={b} dense_layer={l} Convolution. cumulative_layer_num={n}'.format( + b=current_block_num, l=current_layer_num, n=cumulative_layer_num)) + + return convolution_node, cumulative_layer_num + + +def transition_layer(statistics_group_size, + current_block_num, + cumulative_layer_num, + parent_node, + num_output_channels + ): + batch_normalization_node = standard_batchnorm(statistics_group_size, + parent_node) + cumulative_layer_num += 1 + log('dense_block={b} > transition_layer BatchNormalization. cumulative_layer_num={n}'.format( + b=current_block_num, n=cumulative_layer_num)) + + relu_node = lbann.Relu(batch_normalization_node) + cumulative_layer_num += 1 + log('dense_block={b} > transition_layer Relu. cumulative_layer_num={n}'.format( + b=current_block_num, n=cumulative_layer_num)) + + convolution_node = lbann.Convolution( + relu_node, + conv_dims_i=1, + conv_pads_i=0, + conv_strides_i=1, + has_bias=False, + num_dims=2, + num_output_channels=num_output_channels + ) + cumulative_layer_num += 1 + log('dense_block={b} > transition_layer Convolution. cumulative_layer_num={n}'.format( + b=current_block_num, n=cumulative_layer_num)) + + # 2x2 average pool, stride 2 + pooling_node = lbann.Pooling( + convolution_node, + num_dims=2, + pool_dims_i=2, + pool_mode='average', + pool_pads_i=0, + pool_strides_i=2 + ) + cumulative_layer_num += 1 + log('dense_block={b} > transition_layer Pooling. cumulative_layer_num={n}'.format( + b=current_block_num, n=cumulative_layer_num)) + + return pooling_node, cumulative_layer_num + + +def classification_layer(cumulative_layer_num, + parent_node): + # 7x7 global average pool + pooling_node = lbann.Pooling( + parent_node, + num_dims=2, + pool_dims_i=7, + pool_mode='average', + pool_pads_i=1, + pool_strides_i=1 + ) + cumulative_layer_num += 1 + log('classification_layer Pooling. 
cumulative_layer_num={n}'.format( + n=cumulative_layer_num)) + + fully_connected_node = lbann.FullyConnected( + pooling_node, + num_neurons=1000, + has_bias=False + ) + cumulative_layer_num += 1 + log('classification_layer FullyConnected. cumulative_layer_num={n}'.format( + n=cumulative_layer_num)) + + probabilities = lbann.Softmax(fully_connected_node) + return probabilities + + +# Helpful Functions ############################################################ +def get_args(): + desc = ('Construct and run DenseNet on ImageNet data. ' + 'Running the experiment is only supported on LC systems.') + parser = argparse.ArgumentParser(description=desc) + lbann.contrib.args.add_scheduler_arguments(parser) + parser.add_argument( + '--job-name', action='store', default='lbann_densenet', type=str, + help='scheduler job name (default: lbann_densenet)') + parser.add_argument( + '--mini-batch-size', action='store', default=256, type=int, + help='mini-batch size (default: 256)', metavar='NUM') + parser.add_argument( + '--num-epochs', action='store', default=90, type=int, + help='number of epochs (default: 90)', metavar='NUM') + parser.add_argument( + '--num-classes', action='store', default=1000, type=int, + help='number of ImageNet classes (default: 1000)', metavar='NUM') + lbann.contrib.args.add_optimizer_arguments( + parser, + default_optimizer='sgd', + default_learning_rate=0.1 + ) + parser.add_argument( + '--setup_only', action='store_true', + help='do not run experiment (e.g. if only the prototext is desired)') + args = parser.parse_args() + return args + + +def construct_layer_graph( + statistics_group_size, + version, + cumulative_layer_num, + input_node): + # Input data + images_node = lbann.Identity(input_node) + cumulative_layer_num += 1 + log('Identity. cumulative_layer_num={n}'.format(n=cumulative_layer_num)) + + # Use input_node, not images_node. + image_labels_node = lbann.Identity(input_node) + cumulative_layer_num += 1 + log('Identity. cumulative_layer_num={n}'.format(n=cumulative_layer_num)) + + # Use images_node, not image_labels_node. 
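+ # densenet() returns the Softmax probabilities node; the labels node is + # returned separately so the caller can build the objective function and + # metrics. For version 121 the four dense blocks have 6/12/24/16 layers + # with growth rate 32, giving 256/512/1024/1024 feature maps at the block + # outputs (each transition layer then halves the channel count).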
+ probabilities = densenet(statistics_group_size, version, + cumulative_layer_num, images_node) + + return probabilities, image_labels_node + + +def set_up_experiment(args, + input_, + probs, + labels): + # Set up objective function + cross_entropy = lbann.CrossEntropy([probs, labels]) + layers = list(lbann.traverse_layer_graph(input_)) + l2_reg_weights = set() + for l in layers: + if type(l) == lbann.Convolution or type(l) == lbann.FullyConnected: + l2_reg_weights.update(l.weights) + # scale = weight decay + l2_reg = lbann.L2WeightRegularization(weights=l2_reg_weights, scale=1e-4) + objective_function = lbann.ObjectiveFunction([cross_entropy, l2_reg]) + + # Set up model + top1 = lbann.CategoricalAccuracy([probs, labels]) + top5 = lbann.TopKCategoricalAccuracy([probs, labels], k=5) + metrics = [lbann.Metric(top1, name='top-1 accuracy', unit='%'), + lbann.Metric(top5, name='top-5 accuracy', unit='%')] + callbacks = [lbann.CallbackPrint(), + lbann.CallbackTimer(), + lbann.CallbackDropFixedLearningRate( + drop_epoch=[30, 60], amt=0.1)] + model = lbann.Model(args.num_epochs, + layers=layers, + objective_function=objective_function, + metrics=metrics, + callbacks=callbacks) + + # Set up data reader + data_reader = data.imagenet.make_data_reader(num_classes=args.num_classes) + + # Set up optimizer + if args.optimizer == 'sgd': + print('Creating sgd optimizer') + optimizer = lbann.optimizer.SGD( + learn_rate=args.optimizer_learning_rate, + momentum=0.9, + nesterov=True + ) + else: + optimizer = lbann.contrib.args.create_optimizer(args) + + # Setup trainer + trainer = lbann.Trainer(mini_batch_size=args.mini_batch_size) + + return trainer, model, data_reader, optimizer + + +def run_experiment(args, + trainer, + model, + data_reader, + optimizer): + # Note: Use `lbann.run` instead for non-LC systems. + kwargs = lbann.contrib.args.get_scheduler_kwargs(args) + lbann.contrib.launcher.run(trainer, model, data_reader, optimizer, + job_name=args.job_name, + **kwargs) + + +# Main function ################################################################ +def main(): + # ---------------------------------- + # Command-line arguments + # ---------------------------------- + + args = get_args() + + # ---------------------------------- + # Construct layer graph + # ---------------------------------- + + input_node = lbann.Input() + # Start counting cumulative layers at 1. + cumulative_layer_num = 1 + log('Input. 
cumulative_layer_num={n}'.format(n=cumulative_layer_num)) + (probs, labels) = construct_layer_graph( + args.procs_per_node, + 121, cumulative_layer_num, input_node) + + # ---------------------------------- + # Setup experiment + # ---------------------------------- + + (trainer, model, data_reader_proto, optimizer) = set_up_experiment( + args, input_node, probs, labels) + + # ---------------------------------- + # Run experiment + # ---------------------------------- + + run_experiment(args, trainer, model, data_reader_proto, optimizer) + + +if __name__ == '__main__': + main() diff --git a/applications/vision/lenet.py b/applications/vision/lenet.py new file mode 100644 index 00000000000..47e6819edfe --- /dev/null +++ b/applications/vision/lenet.py @@ -0,0 +1,98 @@ +import argparse +import lbann +import data.mnist +import lbann.contrib.args +import lbann.contrib.launcher + +# ---------------------------------- +# Command-line arguments +# ---------------------------------- + +desc = ('Train LeNet on MNIST data using LBANN.') +parser = argparse.ArgumentParser(description=desc) +lbann.contrib.args.add_scheduler_arguments(parser) +parser.add_argument( + '--job-name', action='store', default='lbann_lenet', type=str, + help='scheduler job name (default: lbann_lenet)') +args = parser.parse_args() + +# ---------------------------------- +# Construct layer graph +# ---------------------------------- + +# Input data +input_ = lbann.Input() +images = lbann.Identity(input_) +labels = lbann.Identity(input_) + +# LeNet +x = lbann.Convolution(images, + num_dims = 2, + num_output_channels = 6, + num_groups = 1, + conv_dims_i = 5, + conv_strides_i = 1, + conv_dilations_i = 1, + has_bias = True) +x = lbann.Relu(x) +x = lbann.Pooling(x, + num_dims = 2, + pool_dims_i = 2, + pool_strides_i = 2, + pool_mode = "max") +x = lbann.Convolution(x, + num_dims = 2, + num_output_channels = 16, + num_groups = 1, + conv_dims_i = 5, + conv_strides_i = 1, + conv_dilations_i = 1, + has_bias = True) +x = lbann.Relu(x) +x = lbann.Pooling(x, + num_dims = 2, + pool_dims_i = 2, + pool_strides_i = 2, + pool_mode = "max") +x = lbann.FullyConnected(x, num_neurons = 120, has_bias = True) +x = lbann.Relu(x) +x = lbann.FullyConnected(x, num_neurons = 84, has_bias = True) +x = lbann.Relu(x) +x = lbann.FullyConnected(x, num_neurons = 10, has_bias = True) +probs = lbann.Softmax(x) + +# Loss function and accuracy +loss = lbann.CrossEntropy(probs, labels) +acc = lbann.CategoricalAccuracy(probs, labels) + +# ---------------------------------- +# Setup experiment +# ---------------------------------- + +# Setup model +mini_batch_size = 64 +num_epochs = 20 +model = lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(input_), + objective_function=loss, + metrics=[lbann.Metric(acc, name='accuracy', unit='%')], + callbacks=[lbann.CallbackPrintModelDescription(), + lbann.CallbackPrint(), + lbann.CallbackTimer()]) + +# Setup optimizer +opt = lbann.SGD(learn_rate=0.01, momentum=0.9) + +# Setup data reader +data_reader = data.mnist.make_data_reader() + +# Setup trainer +trainer = lbann.Trainer(mini_batch_size=mini_batch_size) + +# ---------------------------------- +# Run experiment +# ---------------------------------- +kwargs = lbann.contrib.args.get_scheduler_kwargs(args) +lbann.contrib.launcher.run(trainer, model, data_reader, opt, + job_name=args.job_name, + **kwargs) diff --git a/applications/vision/resnet.py b/applications/vision/resnet.py new file mode 100644 index 00000000000..3181d29f936 --- /dev/null +++ 
b/applications/vision/resnet.py @@ -0,0 +1,158 @@ +import argparse +import lbann +import lbann.models +import lbann.models.resnet +import lbann.contrib.args +import lbann.contrib.models.wide_resnet +import lbann.contrib.launcher +import data.imagenet + +# Command-line arguments +desc = ('Construct and run ResNet on ImageNet-1K data. ' + 'Running the experiment is only supported on LC systems.') +parser = argparse.ArgumentParser(description=desc) +lbann.contrib.args.add_scheduler_arguments(parser) +parser.add_argument( + '--job-name', action='store', default='lbann_resnet', type=str, + help='scheduler job name (default: lbann_resnet)') +parser.add_argument( + '--resnet', action='store', default=50, type=int, + choices=(18, 34, 50, 101, 152), + help='ResNet variant (default: 50)') +parser.add_argument( + '--width', action='store', default=1, type=float, + help='Wide ResNet width factor (default: 1)') +parser.add_argument( + '--block-type', action='store', default=None, type=str, + choices=('basic', 'bottleneck'), + help='ResNet block type') +parser.add_argument( + '--blocks', action='store', default=None, type=str, + help='ResNet block counts (comma-separated list)') +parser.add_argument( + '--block-channels', action='store', default=None, type=str, + help='Internal channels in each ResNet block (comma-separated list)') +parser.add_argument( + '--bn-statistics-group-size', action='store', default=1, type=int, + help=('Group size for aggregating batch normalization statistics ' + '(default: 1)')) +parser.add_argument( + '--warmup', action='store_true', help='use a linear warmup') +parser.add_argument( + '--mini-batch-size', action='store', default=256, type=int, + help='mini-batch size (default: 256)', metavar='NUM') +parser.add_argument( + '--num-epochs', action='store', default=90, type=int, + help='number of epochs (default: 90)', metavar='NUM') +parser.add_argument( + '--num-classes', action='store', default=1000, type=int, + help='number of ImageNet classes (default: 1000)', metavar='NUM') +parser.add_argument( + '--random-seed', action='store', default=0, type=int, + help='random seed for LBANN RNGs', metavar='NUM') +lbann.contrib.args.add_optimizer_arguments(parser, default_learning_rate=0.1) +args = parser.parse_args() + +# Due to a data reader limitation, the actual model realization must be +# hardcoded to 1000 labels for ImageNet. +imagenet_labels = 1000 + +# Choose ResNet variant +resnet_variant_dict = {18: lbann.models.ResNet18, + 34: lbann.models.ResNet34, + 50: lbann.models.ResNet50, + 101: lbann.models.ResNet101, + 152: lbann.models.ResNet152} +wide_resnet_variant_dict = {50: lbann.contrib.models.wide_resnet.WideResNet50_2} +block_variant_dict = { + 'basic': lbann.models.resnet.BasicBlock, + 'bottleneck': lbann.models.resnet.BottleneckBlock +} + +if (any([args.block_type, args.blocks, args.block_channels]) + and not all([args.block_type, args.blocks, args.block_channels])): + raise RuntimeError('Must specify all of --block-type, --blocks, --block-channels') +if args.block_type and args.blocks and args.block_channels: + # Build custom ResNet. + resnet = lbann.models.ResNet( + block_variant_dict[args.block_type], + imagenet_labels, + list(map(int, args.blocks.split(','))), + list(map(int, args.block_channels.split(','))), + zero_init_residual=True, + bn_statistics_group_size=args.bn_statistics_group_size, + name='custom_resnet', + width=args.width) +elif args.width == 1: + # Vanilla ResNet. 
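+    # (With the argument defaults this branch builds, e.g.,
+    # lbann.models.ResNet50(1000, bn_statistics_group_size=1). The custom path
+    # above instead takes comma-separated lists; one hypothetical invocation:
+    # --block-type bottleneck --blocks 3,4,6,3 --block-channels 64,128,256,512.)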
+ resnet = resnet_variant_dict[args.resnet]( + imagenet_labels, + bn_statistics_group_size=args.bn_statistics_group_size) +elif args.width == 2 and args.resnet == 50: + # Use pre-defined WRN-50-2. + resnet = wide_resnet_variant_dict[args.resnet]( + imagenet_labels, + bn_statistics_group_size=args.bn_statistics_group_size) +else: + # Some other Wide ResNet. + resnet = resnet_variant_dict[args.resnet]( + imagenet_labels, + bn_statistics_group_size=args.bn_statistics_group_size, + width=args.width) + +# Construct layer graph +input_ = lbann.Input() +images = lbann.Identity(input_) +labels = lbann.Identity(input_) +preds = resnet(images) +probs = lbann.Softmax(preds) +cross_entropy = lbann.CrossEntropy(probs, labels) +top1 = lbann.CategoricalAccuracy(probs, labels) +top5 = lbann.TopKCategoricalAccuracy(probs, labels, k=5) +layers = list(lbann.traverse_layer_graph(input_)) + +# Setup tensor core operations (just to demonstrate enum usage) +tensor_ops_mode = lbann.ConvTensorOpsMode.NO_TENSOR_OPS +for l in layers: + if type(l) == lbann.Convolution: + l.conv_tensor_op_mode=tensor_ops_mode + +# Setup objective function +l2_reg_weights = set() +for l in layers: + if type(l) == lbann.Convolution or type(l) == lbann.FullyConnected: + l2_reg_weights.update(l.weights) +l2_reg = lbann.L2WeightRegularization(weights=l2_reg_weights, scale=1e-4) +obj = lbann.ObjectiveFunction([cross_entropy, l2_reg]) + +# Setup model +metrics = [lbann.Metric(top1, name='top-1 accuracy', unit='%'), + lbann.Metric(top5, name='top-5 accuracy', unit='%')] +callbacks = [lbann.CallbackPrint(), + lbann.CallbackTimer(), + lbann.CallbackDropFixedLearningRate( + drop_epoch=[30, 60, 80], amt=0.1)] +if args.warmup: + callbacks.append( + lbann.CallbackLinearGrowthLearningRate( + target=0.1 * args.mini_batch_size / 256, num_epochs=5)) +model = lbann.Model(args.num_epochs, + layers=layers, + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +# Setup optimizer +opt = lbann.contrib.args.create_optimizer(args) + +# Setup data reader +data_reader = data.imagenet.make_data_reader(num_classes=args.num_classes) + +# Setup trainer +trainer = lbann.Trainer(mini_batch_size=args.mini_batch_size, random_seed=args.random_seed) + +# Run experiment +kwargs = lbann.contrib.args.get_scheduler_kwargs(args) +lbann.contrib.launcher.run(trainer, model, data_reader, opt, + job_name=args.job_name, + **kwargs) diff --git a/applications/vision/summarizing_images/autoencoder_conv_summarize.py b/applications/vision/summarizing_images/autoencoder_conv_summarize.py new file mode 100644 index 00000000000..a1e2f67f751 --- /dev/null +++ b/applications/vision/summarizing_images/autoencoder_conv_summarize.py @@ -0,0 +1,281 @@ +################################################################################ +# Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +# Produced at the Lawrence Livermore National Laboratory. +# Written by the LBANN Research Team (B. Van Essen, et al.) listed in +# the CONTRIBUTORS file. +# +# LLNL-CODE-697807. +# All rights reserved. +# +# This file is part of LBANN: Livermore Big Artificial Neural Network +# Toolkit. For details, see http://software.llnl.gov/LBANN or +# https://github.com/LLNL/LBANN. +# +# Licensed under the Apache License, Version 2.0 (the "Licensee"); you +# may not use this file except in compliance with the License. 
You may +# obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# autoencoder_conv_summarize.py - A simple autoencoder for image data +# (Supports CIFAR-10 or Imagenet 1K) +# +# This example demonstrates the use of the image summarizer in +# autoencoder mode. +# +################################################################################ + +import os.path +import sys +import argparse +import lbann +import lbann.models +import lbann.contrib.args +import lbann.contrib.models.wide_resnet +import lbann.contrib.launcher + +# Get relative path to data +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'data')) +import cifar10 +import imagenet + +# Command-line arguments +desc = ('Construct and run a convolutional autoencoder on CIFAR-10 or ImageNet-1K data. ' + 'Running the experiment is only supported on LC systems.') +parser = argparse.ArgumentParser(description=desc) +lbann.contrib.args.add_scheduler_arguments(parser) +parser.add_argument( + '--job-name', action='store', default='lbann_image_ae', type=str, + help='scheduler job name (default: lbann_image_ae)') +parser.add_argument( + '--mini-batch-size', action='store', default=256, type=int, + help='mini-batch size (default: 256)', metavar='NUM') +parser.add_argument( + '--num-epochs', action='store', default=90, type=int, + help='number of epochs (default: 90)', metavar='NUM') +parser.add_argument( + '--num-classes', action='store', default=1000, type=int, + help='number of ImageNet classes (default: 1000)', metavar='NUM') +parser.add_argument( + '--random-seed', action='store', default=0, type=int, + help='random seed for LBANN RNGs', metavar='NUM') +parser.add_argument( + '--dataset', action='store', default='imagenet', type=str, + help='dataset to use; \"cifar10\" or \"imagenet\"') +parser.add_argument( + '--data-reader-percent', action='store', + default=1.0, type=float, + help='the percent of the data to use (default: 1.0)', metavar='NUM') +lbann.contrib.args.add_optimizer_arguments(parser, default_learning_rate=0.1) +args = parser.parse_args() + +# Due to a data reader limitation, the actual model realization must be +# hardcoded to 1000 labels for ImageNet; 10 for CIFAR10. +dataset = args.dataset +if dataset == 'imagenet': + num_labels=1000 +elif dataset == 'cifar10': + num_labels=10 +else: + print("Dataset must be cifar10 or imagenet. 
Try again.") + exit() + +# Construct layer graph +input_ = lbann.Input(name='input') +image = lbann.Identity(input_, name='images') +dummy = lbann.Dummy(input_, name='labels') + +# Encoder + +conv1 = lbann.Convolution(image, + name="conv1", + num_dims=2, + num_output_channels=16, + conv_dims='3 3', + conv_pads='0 0', + conv_strides='1 1', + has_bias=True, + has_vectors=True) + +relu1 = lbann.Relu(conv1, name="relu1") + +pool1 = lbann.Pooling(relu1, + name="pool1", + num_dims=2, + pool_dims='2 2', + pool_pads='0 0', + pool_strides='1 1', + pool_mode="max", + has_vectors=True) + + +conv2 = lbann.Convolution(pool1, + name="conv2", + num_dims=2, + num_output_channels=8, + conv_dims='3 3', + conv_pads='0 0', + conv_strides='1 1', + has_bias=True, + has_vectors=True) + +relu2 = lbann.Relu(conv2, name="relu2") + +pool2 = lbann.Pooling(relu2, + name="pool2", + num_dims=2, + pool_dims='2 2', + pool_pads='0 0', + pool_strides='1 1', + pool_mode="max", + has_vectors=True) + +conv3 = lbann.Convolution(pool2, + name="conv3", + num_dims=2, + num_output_channels=8, + conv_dims='3 3', + conv_pads='0 0', + conv_strides='1 1', + has_bias=True, + has_vectors=True) + +relu3 = lbann.Relu(conv3, name="relu3") + +pool3 = lbann.Pooling(relu3, + name="pool3", + num_dims=2, + pool_dims='2 2', + pool_pads='0 0', + pool_strides='1 1', + pool_mode="max", + has_vectors=True) + +unpool3 = lbann.Unpooling(pool3, + name="unpool3", + num_dims=2, + pooling_layer=pool3.name) + +deconv3 = lbann.Deconvolution(unpool3, + name="deconv3", + num_dims=2, + num_output_channels=8, + conv_dims='3 3', + conv_pads='0 0', + conv_strides='1 1', + has_bias=True, + has_vectors=True) + +relu4 = lbann.Relu(deconv3, name="relu4") + +unpool2 = lbann.Unpooling(relu4, + name="unpool2", + num_dims=2, + pooling_layer=pool2.name) + +deconv2 = lbann.Deconvolution(unpool2, + name="deconv2", + num_dims=2, + num_output_channels=16, + conv_dims='3 3', + conv_pads='0 0', + conv_strides='1 1', + has_bias=True, + has_vectors=True) + +relu5 = lbann.Relu(deconv2, name="relu5") + +unpool1 = lbann.Unpooling(relu5, + name="unpool1", + num_dims=2, + pooling_layer=pool1.name) + +deconv1 = lbann.Deconvolution(unpool1, + name="deconv1", + num_dims=2, + num_output_channels=3, + conv_dims='3 3', + conv_pads='0 0', + conv_strides='1 1', + has_bias=True, + has_vectors=True) + +relu6 = lbann.Relu(deconv1, name="relu6") + +decode1 = lbann.FullyConnected(relu6, + name="decode1", + hint_layer=image, + has_bias=True) + +reconstruction = lbann.Sigmoid(decode1, + name="reconstruction") + + +# Reconstruction +mean_squared_error = lbann.MeanSquaredError([reconstruction, image], + name="mean_squared_error") + +layer_term = lbann.LayerTerm(mean_squared_error) +scale_factor = lbann.L2WeightRegularization(scale=0.0005) +obj = lbann.ObjectiveFunction([layer_term, scale_factor]) + +metrics = [lbann.Metric(mean_squared_error, name=mean_squared_error.name)] + +img_strategy = lbann.TrackSampleIDsStrategy( + input_layer_name=input_.name, + num_tracked_images=20) + +summarize_images = lbann.CallbackSummarizeImages( + selection_strategy=img_strategy, + image_source_layer_name=reconstruction.name, + epoch_interval=10) + +# Dump original image from input layer one time (high epoch interval) +summarize_input_layer = lbann.CallbackSummarizeImages( + selection_strategy=img_strategy, + image_source_layer_name=input_.name, + epoch_interval=10000) + +callbacks = [lbann.CallbackPrint(), + lbann.CallbackTimer(), + summarize_images, + summarize_input_layer] + +layer_list = 
list(lbann.traverse_layer_graph(input_)) +model = lbann.Model( + args.num_epochs, + layers=layer_list, + objective_function=obj, + metrics=metrics, + callbacks=callbacks, + summary_dir=".") + +# Setup optimizer +opt = lbann.contrib.args.create_optimizer(args) + +# Setup data reader +num_classes=min(args.num_classes, num_labels) + +if dataset == "cifar10": + data_reader = cifar10.make_data_reader(num_classes=num_classes) +else: + data_reader = imagenet.make_data_reader(num_classes=num_classes) + +# Setup trainer +trainer = lbann.Trainer(random_seed=args.random_seed, mini_batch_size=args.mini_batch_size) + +# Run experiment +kwargs = lbann.contrib.args.get_scheduler_kwargs(args) +kwargs['lbann_args'] = '--data_reader_percent='+str(args.data_reader_percent)+' --disable_cuda=1' + +lbann.contrib.launcher.run(trainer, model, data_reader, opt, + job_name=args.job_name, + **kwargs) diff --git a/applications/vision/summarizing_images/autoencoder_summarize.py b/applications/vision/summarizing_images/autoencoder_summarize.py new file mode 100644 index 00000000000..ba967e4f031 --- /dev/null +++ b/applications/vision/summarizing_images/autoencoder_summarize.py @@ -0,0 +1,192 @@ +################################################################################ +# Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +# Produced at the Lawrence Livermore National Laboratory. +# Written by the LBANN Research Team (B. Van Essen, et al.) listed in +# the CONTRIBUTORS file. +# +# LLNL-CODE-697807. +# All rights reserved. +# +# This file is part of LBANN: Livermore Big Artificial Neural Network +# Toolkit. For details, see http://software.llnl.gov/LBANN or +# https://github.com/LLNL/LBANN. +# +# Licensed under the Apache License, Version 2.0 (the "Licensee"); you +# may not use this file except in compliance with the License. You may +# obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the license. +# +# autoencoder_summarize.py - A simple autoencoder for image data +# (Supports CIFAR-10 or Imagenet 1K) +# +# This example demonstrates the use of the image summarizer in +# autoencoder mode. +# +################################################################################ + +import os.path +import sys +import argparse +import lbann +import lbann.models +import lbann.contrib.args +import lbann.contrib.models.wide_resnet +import lbann.contrib.launcher + +# Get relative path to data +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'data')) +import cifar10 +import imagenet + +# Command-line arguments +desc = ('Construct and run ResNet on ImageNet-1K data. 
' + 'Running the experiment is only supported on LC systems.') +parser = argparse.ArgumentParser(description=desc) +lbann.contrib.args.add_scheduler_arguments(parser) +parser.add_argument( + '--job-name', action='store', default='lbann_image_ae', type=str, + help='scheduler job name (default: lbann_resnet)') +parser.add_argument( + '--width', action='store', default=1, type=float, + help='Wide ResNet width factor (default: 1)') +parser.add_argument( + '--bn-statistics-group-size', action='store', default=1, type=int, + help=('Group size for aggregating batch normalization statistics ' + '(default: 1)')) +parser.add_argument( + '--warmup', action='store_true', help='use a linear warmup') +parser.add_argument( + '--mini-batch-size', action='store', default=256, type=int, + help='mini-batch size (default: 256)', metavar='NUM') +parser.add_argument( + '--num-epochs', action='store', default=90, type=int, + help='number of epochs (default: 90)', metavar='NUM') +parser.add_argument( + '--num-classes', action='store', default=1000, type=int, + help='number of ImageNet classes (default: 1000)', metavar='NUM') +parser.add_argument( + '--random-seed', action='store', default=0, type=int, + help='random seed for LBANN RNGs', metavar='NUM') +parser.add_argument( + '--dataset', action='store', default='imagenet', type=str, + help='dataset to use; \"cifar10\" or \"imagenet\"') +parser.add_argument( + '--data-reader-percent', action='store', + default=1.0, type=float, + help='the percent of the data to use (default: 1.0)', metavar='NUM') +lbann.contrib.args.add_optimizer_arguments(parser, default_learning_rate=0.1) +args = parser.parse_args() + +# Due to a data reader limitation, the actual model realization must be +# hardcoded to 1000 labels for ImageNet; 10 for CIFAR10. +dataset = args.dataset; +if dataset == 'imagenet': + num_labels=1000 +elif dataset == 'cifar10': + num_labels=10 +else: + print("Dataset must be cifar10 or imagenet. 
Try again.") + exit() + +# Construct layer graph +input_ = lbann.Input(name='input') +image = lbann.Identity(input_, name='images') +dummy = lbann.Dummy(input_, name='labels') + +# Encoder +encode1 = lbann.FullyConnected(image, + name="encode1", + data_layout="model_parallel", + num_neurons=1000, + has_bias=True) + +relu1 = lbann.Relu(encode1, name="relu1", data_layout="model_parallel") + +dropout1 = lbann.Dropout(relu1, + name="dropout1", + data_layout="model_parallel", + keep_prob=0.8) + +decode1 = lbann.FullyConnected(dropout1, + name="decode1", + data_layout="model_parallel", + hint_layer=image, + has_bias=True) + +reconstruction = lbann.Sigmoid(decode1, + name="reconstruction", + data_layout="model_parallel") + +dropout2 = lbann.Dropout(reconstruction, + name="dropout2", + data_layout="model_parallel", + keep_prob=0.8) + + +# Reconstruction +mean_squared_error = lbann.MeanSquaredError([dropout2, image], + name="mean_squared_error") + +layer_term = lbann.LayerTerm(mean_squared_error) +obj = lbann.ObjectiveFunction(layer_term) + +metrics = [lbann.Metric(mean_squared_error, name=mean_squared_error.name)] + +img_strategy = lbann.TrackSampleIDsStrategy( + input_layer_name=input_.name, + num_tracked_images=10) + +summarize_images = lbann.CallbackSummarizeImages( + selection_strategy=img_strategy, + image_source_layer_name=reconstruction.name, + epoch_interval=1) + +# Dump original image from input layer one time +summarize_input_layer = lbann.CallbackSummarizeImages( + selection_strategy=img_strategy, + image_source_layer_name=input_.name, + epoch_interval=10000) + +callbacks = [lbann.CallbackPrint(), + lbann.CallbackTimer(), + summarize_input_layer, + summarize_images] + +layer_list = list(lbann.traverse_layer_graph(input_)) +model = lbann.Model(args.num_epochs, + layers=layer_list, + objective_function=obj, + metrics=metrics, + callbacks=callbacks, + summary_dir=".") + +# Setup optimizer +opt = lbann.contrib.args.create_optimizer(args) + +# Setup data reader +num_classes=min(args.num_classes, num_labels) + +if dataset == "cifar10": + data_reader = cifar10.make_data_reader(num_classes=num_classes) +else: + data_reader = imagenet.make_data_reader(num_classes=num_classes) + +# Setup trainer +trainer = lbann.Trainer(random_seed=args.random_seed, mini_batch_size=args.mini_batch_size) + +# Run experiment +kwargs = lbann.contrib.args.get_scheduler_kwargs(args) +kwargs['lbann_args'] = '--data_reader_percent='+str(args.data_reader_percent) + +lbann.contrib.launcher.run(trainer, model, data_reader, opt, + job_name=args.job_name, + **kwargs) diff --git a/applications/vision/summarizing_images/resnet_summarize.py b/applications/vision/summarizing_images/resnet_summarize.py new file mode 100644 index 00000000000..1690dcc1f22 --- /dev/null +++ b/applications/vision/summarizing_images/resnet_summarize.py @@ -0,0 +1,226 @@ +################################################################################ +# Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +# Produced at the Lawrence Livermore National Laboratory. +# Written by the LBANN Research Team (B. Van Essen, et al.) listed in +# the CONTRIBUTORS file. +# +# LLNL-CODE-697807. +# All rights reserved. +# +# This file is part of LBANN: Livermore Big Artificial Neural Network +# Toolkit. For details, see http://software.llnl.gov/LBANN or +# https://github.com/LLNL/LBANN. +# +# Licensed under the Apache License, Version 2.0 (the "Licensee"); you +# may not use this file except in compliance with the License. 
You may +# obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the license. +# +# resnet_summarize.py - A simple residual learning model for image data +# (Supports CIFAR-10 or Imagenet 1K) +# +# This example demonstrates the use of the image summarizer in +# categorical accuracy mode. +# +################################################################################ + +import os.path +import sys +import argparse +import lbann +import lbann.models +import lbann.models.resnet +import lbann.contrib.args +import lbann.contrib.models.wide_resnet +import lbann.contrib.launcher + +# Get relative path to data +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'data')) +import cifar10 +import imagenet + +# Command-line arguments +desc = ('Construct and run ResNet on ImageNet-1K data. ' + 'Running the experiment is only supported on LC systems.') +parser = argparse.ArgumentParser(description=desc) +lbann.contrib.args.add_scheduler_arguments(parser) +parser.add_argument( + '--job-name', action='store', default='lbann_resnet', type=str, + help='scheduler job name (default: lbann_resnet)') +parser.add_argument( + '--resnet', action='store', default=50, type=int, + choices=(18, 34, 50, 101, 152), + help='ResNet variant (default: 50)') +parser.add_argument( + '--width', action='store', default=1, type=float, + help='Wide ResNet width factor (default: 1)') +parser.add_argument( + '--block-type', action='store', default=None, type=str, + choices=('basic', 'bottleneck'), + help='ResNet block type') +parser.add_argument( + '--blocks', action='store', default=None, type=str, + help='ResNet block counts (comma-separated list)') +parser.add_argument( + '--block-channels', action='store', default=None, type=str, + help='Internal channels in each ResNet block (comma-separated list)') +parser.add_argument( + '--bn-statistics-group-size', action='store', default=1, type=int, + help=('Group size for aggregating batch normalization statistics ' + '(default: 1)')) +parser.add_argument( + '--warmup', action='store_true', help='use a linear warmup') +parser.add_argument( + '--mini-batch-size', action='store', default=256, type=int, + help='mini-batch size (default: 256)', metavar='NUM') +parser.add_argument( + '--num-epochs', action='store', default=90, type=int, + help='number of epochs (default: 90)', metavar='NUM') +parser.add_argument( + '--num-classes', action='store', default=1000, type=int, + help='number of ImageNet classes (default: 1000)', metavar='NUM') +parser.add_argument( + '--random-seed', action='store', default=0, type=int, + help='random seed for LBANN RNGs', metavar='NUM') +parser.add_argument( + '--dataset', action='store', default='imagenet', type=str, + help='dataset to use; \"cifar10\" or \"imagenet\"') +parser.add_argument( + '--data-reader-percent', action='store', + default=1.0, type=float, + help='the percent of the data to use (default: 1.0)', metavar='NUM') +lbann.contrib.args.add_optimizer_arguments(parser, default_learning_rate=0.1) +args = parser.parse_args() + +# Due to a data reader limitation, the actual model realization must be +# hardcoded to 1000 
labels for ImageNet; 10 for CIFAR10. +dataset = args.dataset; +if dataset == 'imagenet': + num_labels=1000 +elif dataset == 'cifar10': + num_labels=10 +else: + print("Dataset must be cifar10 or imagenet. Try again.") + exit() + +# Choose ResNet variant +resnet_variant_dict = {18: lbann.models.ResNet18, + 34: lbann.models.ResNet34, + 50: lbann.models.ResNet50, + 101: lbann.models.ResNet101, + 152: lbann.models.ResNet152} +wide_resnet_variant_dict = {50: lbann.contrib.models.wide_resnet.WideResNet50_2} +block_variant_dict = { + 'basic': lbann.models.resnet.BasicBlock, + 'bottleneck': lbann.models.resnet.BottleneckBlock +} + +if (any([args.block_type, args.blocks, args.block_channels]) + and not all([args.block_type, args.blocks, args.block_channels])): + raise RuntimeError('Must specify all of --block-type, --blocks, --block-channels') +if args.block_type and args.blocks and args.block_channels: + # Build custom ResNet. + resnet = lbann.models.ResNet( + block_variant_dict[args.block_type], + num_labels, + list(map(int, args.blocks.split(','))), + list(map(int, args.block_channels.split(','))), + zero_init_residual=True, + bn_statistics_group_size=args.bn_statistics_group_size, + name='custom_resnet', + width=args.width) +elif args.width == 1: + # Vanilla ResNet. + resnet = resnet_variant_dict[args.resnet]( + num_labels, + bn_statistics_group_size=args.bn_statistics_group_size) +elif args.width == 2 and args.resnet == 50: + # Use pre-defined WRN-50-2. + resnet = wide_resnet_variant_dict[args.resnet]( + num_labels, + bn_statistics_group_size=args.bn_statistics_group_size) +else: + # Some other Wide ResNet. + resnet = resnet_variant_dict[args.resnet]( + num_labels, + bn_statistics_group_size=args.bn_statistics_group_size, + width=args.width) + +# Construct layer graph +input_ = lbann.Input(name='input') +images = lbann.Identity(input_, name='images') +labels = lbann.Identity(input_, name='labels') +preds = resnet(images) +probs = lbann.Softmax(preds) +cross_entropy = lbann.CrossEntropy(probs, labels) +top1 = lbann.CategoricalAccuracy(probs, labels, name='louise') +top5 = lbann.TopKCategoricalAccuracy(probs, labels, k=5) +layer_list = list(lbann.traverse_layer_graph(input_)) + +# Setup objective function +l2_reg_weights = set() +for l in layer_list: + if type(l) == lbann.Convolution or type(l) == lbann.FullyConnected: + l2_reg_weights.update(l.weights) +l2_reg = lbann.L2WeightRegularization(weights=l2_reg_weights, scale=1e-4) +obj = lbann.ObjectiveFunction([cross_entropy, l2_reg]) + +# Setup model +metrics = [lbann.Metric(top1, name='top-1 accuracy', unit='%'), + lbann.Metric(top5, name='top-5 accuracy', unit='%')] + +img_strategy = lbann.CategoricalAccuracyStrategy( + accuracy_layer_name=top1.name, + match_type=lbann.CategoricalAccuracyStrategy.MatchType.NOMATCH, + num_images_per_epoch=10) + +summarize_images = lbann.CallbackSummarizeImages( + selection_strategy=img_strategy, + image_source_layer_name=images.name, + epoch_interval=5) + +callbacks = [lbann.CallbackPrint(), + lbann.CallbackTimer(), + lbann.CallbackDropFixedLearningRate( + drop_epoch=[30, 60, 80], amt=0.1), + summarize_images] +if args.warmup: + callbacks.append( + lbann.CallbackLinearGrowthLearningRate( + target=0.1 * args.mini_batch_size / 256, num_epochs=5)) +model = lbann.Model(args.num_epochs, + layers=layer_list, + objective_function=obj, + metrics=metrics, + callbacks=callbacks, + summary_dir=".") + +# Setup optimizer +opt = lbann.contrib.args.create_optimizer(args) + +# Setup data reader +num_classes=min(args.num_classes, 
num_labels) + +if dataset == "cifar10": + data_reader = cifar10.make_data_reader(num_classes=num_classes) +else: + data_reader = imagenet.make_data_reader(num_classes=num_classes) + +# Setup trainer +trainer = lbann.Trainer(random_seed=args.random_seed, mini_batch_size=args.mini_batch_size) + +# Run experiment +kwargs = lbann.contrib.args.get_scheduler_kwargs(args) +kwargs['lbann_args'] = '--data_reader_percent='+str(args.data_reader_percent) +lbann.contrib.launcher.run(trainer, model, data_reader, opt, + job_name=args.job_name, + **kwargs) diff --git a/bamboo/README.md b/bamboo/README.md index c317c496379..ccb1813e878 100644 --- a/bamboo/README.md +++ b/bamboo/README.md @@ -1,93 +1,3 @@ -# LBANN CI - -Bamboo is the continuous integration (CI) framework we use. A Bamboo plan consists of stages (which run sequentially), which consist of jobs (which run in parallel), which consist of tasks (which run sequentially). - -The LBANN build project has many plans. Two plans run off of [`LLNL/lbann/develop`](https://github.com/LLNL/lbann/tree/develop "https://github.com/LLNL/lbann/tree/develop") - Nightly Develop and Weekly Develop. Nightly Develop runs every night (except Saturday) at midnight. Weekly Develop runs every Saturday at midnight. The other plans in the build project are for each individual LBANN developer's fork of LBANN. - -All plans run off the latest *pushed* commits to the repository. That means if you have local commits that you have not pushed to your fork, these commits will *not* be tested by Bamboo. If you have pushed commits to your fork but have not merged your branch into the main repository's `develop`, your commits will be tested on your individual plan, but not on Nightly Develop or Weekly Develop. - -## Plan Configuration -Each plan is identical (except Weekly Develop, which will be explained below). The plans consist of a single stage `Tests`. The stage consists of three jobs - `ppc64le_gpu`, `x86_cpu`, and `x86_gpu`. Each of these three jobs can run in parallel. They consist of an identical list of tasks: -1. Checkout Default Repository (checkout the repository) -2. Remove Generated Files (each build creates a large number of files. We may look at these files between builds, so we cannot delete them at the end of a build. So, instead we delete them before doing any real work in the next build. This also ensures the generated files came from the latest build and not a previous build). -3. Compiler Tests (run tests in `bamboo/compiler_tests`) -4. Integration Tests (run tests in `bamboo/integration_tests`) -5. Unit Tests (run tests in `bamboo/unit_tests`) -6. JUnit Parser (this allows Bamboo to render test results in a nice UI) - -The three testing tasks differ somewhat between jobs. However, they all execute some variant of `python -m pytest -s --junitxml=results.xml`, which will run all the pytests in the job's associated directory. - -Weekly Develop adds the `--weekly` option (`python -m pytest -s --weekly --junitxml=results.xml`). Many (mostly longer-running) tests are set to not run unless this option is on. Weekly Develop runs a superset of the tests that Nightly Develop runs. - -## Directory Structure - -`bamboo/compiler_tests`, `bamboo/integration_tests`, `bamboo/unit_tests` each have a `conftest.py` that pytest requires. They also contain one or more python files. Each of these files have a number of tests to run. - -## Writing Your Own Tests - -A side effect of our Bamboo setup is that tests must be written using pytest. 
Test files must begin with `test_` to be recognized by pytest. Individual test methods must also begin with `test_`. Test methods should use the `assert` keyword. A test will only fail if the assertion turns out to be false. Not putting an assertion will automatically cause the test to pass. - -How then to test non-Python code? You can just wrap your test with Python. A test can be as simple as asserting the output of a shell command is 0. The output of a command can be found using Python's `os.system()`. - -## Running Tests On Your Individual Plan - -Unlike Nightly Develop, the individual plans are triggered to run by polling your fork for commits. They do not run nightly. If you push new commits to your fork, a new build should start automatically. You can also manually start a build by navigating to your individual plan and clicking Run > Run Plan. Once again, keep in mind that the tests will run off what has been pushed to your GitHub fork of LBANN and not your local copy of the LBANN repository. - -## Navigating Bamboo - -From the [LBANN Project Summary](https://lc.llnl.gov/bamboo/browse/LBANN "https://lc.llnl.gov/bamboo/browse/LBANN"), click on a build project. From there, click on a build (builds are listed under "Recent History" and can also be accessed from the pass/fail marks in the top right, to the left of the "Run" button). This will bring you to a certain build's page. The most relevant tabs are "Tests" and "Logs". It is recommended to look at failures first in the "Tests" tab, as the build logs can be difficult to parse through. The build's "Tests" tab shows "New test failures", "Existing test failures", "Fixed tests", and "Skipped Tests". - -From the build's page, you can also click on individual jobs, which have the same tabs. The "Tests" tabs of the individual jobs have two sub-tabs, "Failed tests" and "Successful tests". They do not display skipped tests. The Bamboo agent that ran the job can be found by looking at the "Agent" field under the "Job Summary" tab. Alternatively, you can determine the agent from one of the first lines in the build logs: `Build working directory is /usr/workspace/wsb/lbannusr/bamboo//xml-data/build-dir/`. - -Some build logs can be very large (e.g. over 100,000 lines). Beyond about 5,000 lines it is a good idea to download a log instead of viewing it in the browser. Beyond about 10,000 lines, some text editors may experience slowness. At this point it is good to split up the files with `split -l 10000 `, which creates files of the form `x*` and of length 10,000. You can then run a command such as `grep -in "Errors for:" x*` to find which files have reported errors. After you are done, you can remove the files with `rm x*`. Note that the original log file is not modified by any of these steps. - -As an alternative to splitting the file, errors can be searched for with `grep -in -A "Errors for:" `. - -## Bamboo Agent Properties - -Bamboo agent properties are used to specify requirements for each job. 
- -| Agents (jobs) | `agent_owner` | `architecture` | `cluster` | `gpu_architecture` | `sys_type` | -| --- | --- | --- | --- | --- | --- | -| Catalyst Agents (x86_cpu) | `lbannusr` | `x86_64` | `catalyst` | `none` | `toss_3_x86_64_ib` | -| Pascal Agents (x86_gpu_pascal) | `lbannusr` | `x86_64` | `pascal` | `pascal` | `chaos_6_x86_64_ib` | -| Quartz Agents (x86_cpu) | `lbannusr` | `x86_64` | `quartz` | `none` | `toss_3_x86_64_ib` | -| Ray Agents (ppc64le_gpu) | `lbannusr` | `ppc64_le` | `ray` | `pascal` | `blueos_3_ppc64le_ib` | -| Surface Agents (x86_gpu) | `lbannusr` | `x86_64` | `surface` | `kepler` | `chaos_5_x86_64_ib` | - -Currently, `agent_owner`, `architecture`, and `gpu_architecture` are used to determine agents to run a job. - -# Running Tests From The Command Line - -Navigate to `bamboo/compiler_tests`, `bamboo/integration_tests`, or `bamboo/unit_tests`. - -To run all the tests in a subdirectory: `python -m pytest -s --weekly`. Note that running all tests can take a substantial amount of time. - -To run the tests that Nightly Develop or the individual plans run in a subdirectory: `python -m pytest -s`. - -To run a specific test file: `python -m pytest -s .py`. - -To run a specific test: `python -m pytest -s .py -k ''`. - -Most integration and unit tests allow for running a test with a different executable. The convention is to have a similarly structured test replacing `_` with `_exe`. These tests are set to be skipped in Bamboo, but can be run locally. There should be a line above the test that gives the command to run the test locally, likely in the following form: `python -m pytest -s .py -k '' --exe=`. - -At this time, there is no way to run all the `_exe` tests in a subdirectory and only those. - -# Helpful Files - -First, run `sudo lbannusr`. - -To look at output and error from previous builds: `cd /usr/workspace/wsb/lbannusr/bamboo//xml-data/build-dir//bamboo//` - -To look at archived results from previous builds: `cd /usr/workspace/wsb/lbannusr/archives/` - -To look at Bamboo agent properties: `cat /usr/global/tools/bamboo/agents/lbannusr//bin/bamboo-capabilities.properties` - -You can copy these files over to your own machine as follows: -- `sudo lbannusr` -- `give ` -- `exit` - to go back to your own LC account, not `lbannusr`'s. -- `take lbannusr` - now the file exists on your LC account, but not yet on your own machine. - -From your own machine, not a ssh terminal: -- `scp @.llnl.gov: .` +Refer to `lbann/docs/continuous_integration.rst` +or "LBANN CI" on the [LBANN docs](http://software.llnl.gov/lbann/) - +specifically [LBANN CI docs](https://lbann.readthedocs.io/en/latest/continuous_integration.html). diff --git a/bamboo/allocate_and_run.sh b/bamboo/allocate_and_run.sh new file mode 100755 index 00000000000..2cd798d0e76 --- /dev/null +++ b/bamboo/allocate_and_run.sh @@ -0,0 +1,93 @@ +#!/bin/bash -l + +CLUSTER=$(hostname | sed 's/\([a-zA-Z][a-zA-Z]*\)[0-9]*/\1/g') + +echo "allocate_and_run.sh CLUSTER=${CLUSTER}" + +export PYTHONPATH=${HOME}/.local/lib/python3.7/site-packages:${PYTHONPATH} + +WEEKLY=0 +while :; do + case ${1} in + --weekly) + # Run all tests. This is a weekly build. 
+ echo "Setting WEEKLY in allocate_and_run.sh" + WEEKLY=1 + ;; + -?*) + # Unknown option + echo "Unknown option (${1})" >&2 + exit 1 + ;; + *) + # Break loop if there are no more options + break + esac + shift +done + +echo "allocate_and_run.sh WEEKLY=${WEEKLY}" + +if [ "${CLUSTER}" = 'pascal' ]; then + export MV2_USE_CUDA=1 +fi + +ALLOCATION_TIME_LIMIT_NIGHTLY=45 +ALLOCATION_TIME_LIMIT_WEEKLY=90 + +if [ "${CLUSTER}" = 'lassen' ]; then + ALLOCATION_TIME_LIMIT_NIGHTLY=90 + ALLOCATION_TIME_LIMIT_WEEKLY=120 + if [ ${WEEKLY} -ne 0 ]; then + timeout -k 5 24h bsub -G guests -Is -q pbatch -nnodes 4 -W ${ALLOCATION_TIME_LIMIT_WEEKLY} ./run.sh --weekly + else + timeout -k 5 24h bsub -G guests -Is -q pbatch -nnodes 2 -W ${ALLOCATION_TIME_LIMIT_NIGHTLY} ./run.sh + fi +elif [ "${CLUSTER}" = 'ray' ]; then + if [ ${WEEKLY} -ne 0 ]; then + timeout -k 5 24h bsub -Is -q pbatch -nnodes 4 -W ${ALLOCATION_TIME_LIMIT_WEEKLY} ./run.sh --weekly + else + timeout -k 5 24h bsub -Is -q pbatch -nnodes 2 -W ${ALLOCATION_TIME_LIMIT_NIGHTLY} ./run.sh + fi +elif [ "${CLUSTER}" = 'corona' ]; then + if [ ${WEEKLY} -ne 0 ]; then + ALLOCATION_TIME_LIMIT_WEEKLY=960 + timeout -k 5 24h salloc -N4 --partition=mi60 -t ${ALLOCATION_TIME_LIMIT_WEEKLY} ./run.sh --weekly + else + ALLOCATION_TIME_LIMIT_NIGHTLY=90 # Start with 1.5 hrs; may adjust for CPU clusters + if [[ $(mjstat -c | awk 'match($1, "mi60") && NF < 7 { print $5 }') -ne "0" ]]; + then + timeout -k 5 24h salloc -N2 --partition=mi60 -t ${ALLOCATION_TIME_LIMIT_NIGHTLY} ./run.sh + else + echo "Partition \"mi60\" on cluster \"${CLUSTER}\" appears to be down." + echo "Trying \"mi25\"." + timeout -k 5 24h salloc -N2 --partition=mi25 -t ${ALLOCATION_TIME_LIMIT_NIGHTLY} ./run.sh + fi + fi +elif [ "${CLUSTER}" = 'pascal' ]; then + if [ ${WEEKLY} -ne 0 ]; then + timeout -k 5 24h salloc -N4 --partition=pbatch -t ${ALLOCATION_TIME_LIMIT_WEEKLY} ./run.sh --weekly + else + if [[ $(mjstat -c | awk 'match($1, "pbatch") && NF < 7 { print $5 }') -ne "0" ]]; + then + timeout -k 5 24h salloc -N2 --partition=pbatch -t ${ALLOCATION_TIME_LIMIT_NIGHTLY} ./run.sh + else + echo "Partition \"pbatch\" on cluster \"${CLUSTER}\" appears to be down." + fi + fi +elif [ "${CLUSTER}" = 'catalyst' ]; then + if [ ${WEEKLY} -ne 0 ]; then + ALLOCATION_TIME_LIMIT_WEEKLY=960 + timeout -k 5 24h salloc -N4 --partition=pbatch -t ${ALLOCATION_TIME_LIMIT_WEEKLY} ./run.sh --weekly + else + ALLOCATION_TIME_LIMIT_NIGHTLY=90 # Start with 1.5 hrs; may adjust for CPU clusters + if [[ $(mjstat -c | awk 'match($1, "pbatch") && NF < 7 { print $5 }') -ne "0" ]]; + then + timeout -k 5 24h salloc -N2 --partition=pbatch -t ${ALLOCATION_TIME_LIMIT_NIGHTLY} ./run.sh + else + echo "Partition \"pbatch\" on cluster \"${CLUSTER}\" appears to be down." + fi + fi +else + echo "allocate_and_run.sh. 
Unsupported cluster CLUSTER=${CLUSTER}" +fi diff --git a/bamboo/clean.sh b/bamboo/clean.sh index 254930cb247..03b7826cf2a 100755 --- a/bamboo/clean.sh +++ b/bamboo/clean.sh @@ -6,10 +6,9 @@ LBANN_DIR=$(git rev-parse --show-toplevel) # Compiler Tests rm -f ${LBANN_DIR}/bamboo/compiler_tests/*.pyc rm -rf ${LBANN_DIR}/bamboo/compiler_tests/__pycache__ -rm -rf ${LBANN_DIR}/bamboo/compiler_tests/builds/*_debug -rm -rf ${LBANN_DIR}/bamboo/compiler_tests/builds/*_rel -rm -f ${LBANN_DIR}/bamboo/compiler_tests/error/*.txt -rm -f ${LBANN_DIR}/bamboo/compiler_tests/output/*.txt +rm -rf ${LBANN_DIR}/bamboo/compiler_tests/builds/* +rm -f ${LBANN_DIR}/bamboo/compiler_tests/error/* +rm -f ${LBANN_DIR}/bamboo/compiler_tests/output/* # Integration Tests rm -f ${LBANN_DIR}/bamboo/integration_tests/*.pgm @@ -17,13 +16,15 @@ rm -f ${LBANN_DIR}/bamboo/integration_tests/*.prototext* rm -f ${LBANN_DIR}/bamboo/integration_tests/*.pyc rm -rf ${LBANN_DIR}/bamboo/integration_tests/__pycache__ rm -f ${LBANN_DIR}/bamboo/integration_tests/*.tfevents.* -rm -f ${LBANN_DIR}/bamboo/integration_tests/error/*.txt -rm -f ${LBANN_DIR}/bamboo/integration_tests/output/*.txt +rm -rf ${LBANN_DIR}/bamboo/integration_tests/experiments/* # Unit Tests +rm -rf ${LBANN_DIR}/bamboo/unit_tests/ckpt* +rm -rf ${LBANN_DIR}/bamboo/unit_tests/lbann2_* rm -f ${LBANN_DIR}/bamboo/unit_tests/*.prototext* rm -f ${LBANN_DIR}/bamboo/unit_tests/*.pyc rm -rf ${LBANN_DIR}/bamboo/unit_tests/__pycache__ rm -f ${LBANN_DIR}/bamboo/unit_tests/*.tfevents.* -rm -f ${LBANN_DIR}/bamboo/unit_tests/error/*.txt -rm -f ${LBANN_DIR}/bamboo/unit_tests/output/*.txt +rm -f ${LBANN_DIR}/bamboo/unit_tests/error/* +rm -f ${LBANN_DIR}/bamboo/unit_tests/output/* +rm -rf ${LBANN_DIR}/bamboo/unit_tests/experiments/* diff --git a/bamboo/common_python/data/__init__.py b/bamboo/common_python/data/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/bamboo/common_python/data/imagenet/__init__.py b/bamboo/common_python/data/imagenet/__init__.py new file mode 100644 index 00000000000..3ab346ec2dd --- /dev/null +++ b/bamboo/common_python/data/imagenet/__init__.py @@ -0,0 +1,44 @@ +import os +import os.path + +import google.protobuf.text_format + +def make_data_reader(lbann, num_classes=1000): + + # TODO (tym): Figure out how to switch between LBANN builds. See + # GitHub Issue #1289. 
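+    # (The caller passes in its own imported `lbann` module so this helper works
+    # with whichever LBANN build the test harness has on its path. The code below
+    # loads the prototext message next to this file, checks that the ImageNet
+    # data directories and label files are accessible, patches those paths into
+    # the train and validate readers, and returns the message. A minimal usage
+    # sketch, assuming an importable LBANN build and a trainer, model, and
+    # optimizer built elsewhere (those names are placeholders here):
+    #
+    #     import lbann
+    #     import lbann.contrib.launcher
+    #     reader = make_data_reader(lbann, num_classes=10)
+    #     lbann.contrib.launcher.run(trainer, model, reader, opt)
+    #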
+ import lbann.contrib.lc.paths + + # Load Protobuf message from file + current_dir = os.path.dirname(os.path.realpath(__file__)) + protobuf_file = os.path.join(current_dir, 'data_reader.prototext') + message = lbann.lbann_pb2.LbannPB() + with open(protobuf_file, 'r') as f: + google.protobuf.text_format.Merge(f.read(), message) + message = message.data_reader + + # Check if data paths are accessible + train_data_dir = lbann.contrib.lc.paths.imagenet_dir(data_set='train', + num_classes=num_classes) + train_label_file = lbann.contrib.lc.paths.imagenet_labels(data_set='train', + num_classes=num_classes) + test_data_dir = lbann.contrib.lc.paths.imagenet_dir(data_set='val', + num_classes=num_classes) + test_label_file = lbann.contrib.lc.paths.imagenet_labels(data_set='val', + num_classes=num_classes) + if not os.path.isdir(train_data_dir): + raise FileNotFoundError('could not access {}'.format(train_data_dir)) + if not os.path.isfile(train_label_file): + raise FileNotFoundError('could not access {}'.format(train_label_file)) + if not os.path.isdir(test_data_dir): + raise FileNotFoundError('could not access {}'.format(test_data_dir)) + if not os.path.isfile(test_label_file): + raise FileNotFoundError('could not access {}'.format(test_label_file)) + + # Set paths + message.reader[0].data_filedir = train_data_dir + message.reader[0].data_filename = train_label_file + message.reader[1].data_filedir = test_data_dir + message.reader[1].data_filename = test_label_file + + return message diff --git a/bamboo/common_python/data/imagenet/data_reader.prototext b/bamboo/common_python/data/imagenet/data_reader.prototext new file mode 100644 index 00000000000..3f4e0270f3f --- /dev/null +++ b/bamboo/common_python/data/imagenet/data_reader.prototext @@ -0,0 +1,60 @@ +data_reader { + reader { + name: "imagenet" + role: "train" + shuffle: true + data_filedir: "path/to/ILSVRC2012/train" + data_filename: "path/to/ILSVRC2012/labels/train.txt" + validation_percent: 0.0 + percent_of_data_to_use: 1.0 + num_labels: 1000 + + transforms { + random_resized_crop { + height: 224 + width: 224 + } + } + transforms { + horizontal_flip { + p: 0.5 + } + } + transforms { + colorize {} + } + transforms { + normalize_to_lbann_layout { + means: "0.406 0.456 0.485" + stddevs: "0.225 0.224 0.229" + } + } + } + + reader { + name: "imagenet" + role: "validate" + data_filedir: "path/to/ILSVRC2012/val" + data_filename: "path/to/ILSVRC2012/labels/val.txt" + percent_of_data_to_use: 1.0 + num_labels: 1000 + + transforms { + resized_center_crop { + height: 256 + width: 256 + crop_height: 224 + crop_width: 224 + } + } + transforms { + colorize {} + } + transforms { + normalize_to_lbann_layout { + means: "0.406 0.456 0.485" + stddevs: "0.225 0.224 0.229" + } + } + } +} diff --git a/bamboo/common_python/data/mnist/__init__.py b/bamboo/common_python/data/mnist/__init__.py new file mode 100644 index 00000000000..3c4546011cd --- /dev/null +++ b/bamboo/common_python/data/mnist/__init__.py @@ -0,0 +1,34 @@ +import gzip +import os +import os.path +import urllib.request + +import google.protobuf.text_format + +def make_data_reader(lbann): + """Make Protobuf message for MNIST data reader. + + MNIST data is downloaded if needed. + + """ + + # TODO (tym): Figure out how to switch between LBANN builds. See + # GitHub Issue #1289. 
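+    # (As with the ImageNet reader above, the caller supplies its imported
+    # `lbann` module; the reader paths in the prototext are then pointed at the
+    # LC MNIST directory via lbann.contrib.lc.paths.mnist_dir().)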
+ import lbann.contrib.lc.paths + + # Load data readers from prototext + current_dir = os.path.dirname(os.path.realpath(__file__)) + # Load Protobuf message from file + protobuf_file = os.path.join(current_dir, + 'data_reader.prototext') + + message = lbann.lbann_pb2.LbannPB() + with open(protobuf_file, 'r') as f: + google.protobuf.text_format.Merge(f.read(), message) + message = message.data_reader + + # Set paths + for reader in message.reader: + reader.data_filedir = lbann.contrib.lc.paths.mnist_dir() + + return message diff --git a/bamboo/common_python/data/mnist/data_reader.prototext b/bamboo/common_python/data/mnist/data_reader.prototext new file mode 100644 index 00000000000..61c3b32cf42 --- /dev/null +++ b/bamboo/common_python/data/mnist/data_reader.prototext @@ -0,0 +1,30 @@ +data_reader { + reader { + name: "mnist" + role: "train" + shuffle: true + data_filedir: "lbann/applications/vision/data/mnist" + data_filename: "train-images-idx3-ubyte" + label_filename: "train-labels-idx1-ubyte" + validation_percent: 0.1 + percent_of_data_to_use: 1.0 + transforms { + scale { + scale: 0.003921568627 # 1/255 + } + } + } + reader { + name: "mnist" + role: "test" + data_filedir: "lbann/applications/vision/data/mnist" + data_filename: "t10k-images-idx3-ubyte" + label_filename: "t10k-labels-idx1-ubyte" + percent_of_data_to_use: 1.0 + transforms { + scale { + scale: 0.003921568627 # 1/255 + } + } + } +} diff --git a/bamboo/common_python/test_tools.py b/bamboo/common_python/test_tools.py index 6cafbb39bd6..2146ba05b3b 100644 --- a/bamboo/common_python/test_tools.py +++ b/bamboo/common_python/test_tools.py @@ -1,161 +1,267 @@ import pytest import tools -# This test isn't in a directory to be run from Bamboo + +# This test file isn't in a directory to be run from Bamboo # Run locally with python -m pytest -s +d = dict( + executable='exe', + num_nodes=20, + partition='pdebug', + time_limit=30, + num_processes=40, + dir_name='dir', + data_filedir_default='lscratchh/filedir', + data_reader_name='mnist', + data_reader_percent=0.10, + exit_after_setup=True, + mini_batch_size=15, + model_folder='models/folder', + model_name='lenet', + num_epochs=7, + optimizer_name='adagrad', + processes_per_model=10, + extra_lbann_flags={'print_affinity': None}, + output_file_name='output_file', + error_file_name='error_file', + check_executable_existence=False) + + def test_command_catalyst(): - actual = tools.get_command(cluster='catalyst', executable='exe', num_nodes=20, partition='pdebug', time_limit=30, num_processes=40, dir_name='dir', data_filedir_default='lscratchh/filedir', data_reader_name='mnist', data_reader_percent=0.10, exit_after_setup=True, mini_batch_size=15, model_folder='models/folder', model_name='lenet', num_epochs=7, optimizer_name='adagrad', processes_per_model=10, output_file_name='output_file', error_file_name='error_file', check_executable_existence=False) - expected = 'salloc --nodes=20 --partition=pdebug --time=30 srun --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file' + actual = tools.get_command(cluster='catalyst', **d) + expected = 'salloc --nodes=20 --partition=pdebug --time=30 srun --mpibind=off --time=30 --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext 
--data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 --print_affinity > output_file 2> error_file' assert actual == expected -def test_command_pascal(): - actual = tools.get_command(cluster='pascal', executable='exe', num_nodes=20, partition='pdebug', time_limit=30, num_processes=40, dir_name='dir', data_filedir_default='lscratchh/filedir', data_reader_name='mnist', data_reader_percent=0.10, exit_after_setup=True, mini_batch_size=15, model_folder='models/folder', model_name='lenet', num_epochs=7, optimizer_name='adagrad', processes_per_model=10, output_file_name='output_file', error_file_name='error_file', check_executable_existence=False) - expected = 'salloc --nodes=20 --partition=pdebug --time=30 srun --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file' + +def test_command_corona(): + actual = tools.get_command(cluster='corona', **d) + expected = 'salloc --nodes=20 --partition=pdebug --time=30 srun --mpibind=off --time=30 --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 --print_affinity > output_file 2> error_file' assert actual == expected -def test_command_quartz(): - actual = tools.get_command(cluster='quartz', executable='exe', num_nodes=20, partition='pdebug', time_limit=30, num_processes=40, dir_name='dir', data_filedir_default='lscratchh/filedir', data_reader_name='mnist', data_reader_percent=0.10, exit_after_setup=True, mini_batch_size=15, model_folder='models/folder', model_name='lenet', num_epochs=7, optimizer_name='adagrad', processes_per_model=10, output_file_name='output_file', error_file_name='error_file', check_executable_existence=False) - expected = 'salloc --nodes=20 --partition=pdebug --time=30 srun --ntasks=40 exe --data_filedir=lscratchh/filedir --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file' + +def test_command_lassen(): + actual = tools.get_command(cluster='lassen', **d) + expected = 'bsub -G guests -Is -q pdebug -nnodes 20 -W 30 jsrun -b "packed:10" -c 40 -g 4 -d packed -n 16 -r 1 -a 4 exe --data_filedir=gpfs1/filedir --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 --print_affinity > output_file 2> error_file' assert actual == expected - -def test_command_surface(): - actual = tools.get_command(cluster='surface', executable='exe', num_nodes=20, partition='pdebug', time_limit=30, num_processes=40, dir_name='dir', data_filedir_default='lscratchh/filedir', 
data_reader_name='mnist', data_reader_percent=0.10, exit_after_setup=True, mini_batch_size=15, model_folder='models/folder', model_name='lenet', num_epochs=7, optimizer_name='adagrad', processes_per_model=10, output_file_name='output_file', error_file_name='error_file', check_executable_existence=False) - expected = 'salloc --nodes=20 --partition=pbatch --time=30 srun --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file' + + +def test_command_pascal(): + actual = tools.get_command(cluster='pascal', **d) + expected = 'salloc --nodes=20 --partition=pbatch --time=30 srun --mpibind=off --time=30 --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 --print_affinity > output_file 2> error_file' assert actual == expected + def test_command_ray(): - actual = tools.get_command(cluster='ray', executable='exe', num_nodes=20, partition='pdebug', time_limit=30, num_processes=40, dir_name='dir', data_filedir_default='lscratchh/filedir', data_reader_name='mnist', data_reader_percent=0.10, exit_after_setup=True, mini_batch_size=15, model_folder='models/folder', model_name='lenet', num_epochs=7, optimizer_name='adagrad', processes_per_model=10, output_file_name='output_file', error_file_name='error_file', check_executable_existence=False) - expected = 'bsub -x -G guests -Is -n 40 -q pdebug -R "span[ptile=2]" -W 30 mpirun -np 40 -N 2 exe --data_filedir=gscratchr/filedir --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file' + actual = tools.get_command(cluster='ray', **d) + expected = 'bsub -x -G guests -Is -n 40 -q pdebug -R "span[ptile=2]" -W 30 mpirun --timeout=30 -np 40 -N 2 exe --data_filedir=gscratchr/filedir --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 --print_affinity > output_file 2> error_file' assert actual == expected + # Test error cases ############################################################ -def test_blacklisted_substrings(): + +def test_blacklisted_substrings_1(): try: - tools.get_command('ray', 'exe', partition=';', optimizer_path='--model=new_model', check_executable_existence=False) + tools.get_command('ray', 'exe', partition=';', + optimizer_path='--model=new_model', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid character(s): ; contains ; , --model=new_model contains --' assert actual == expected + +def test_blacklisted_substrings_2(): + try: + tools.get_command('ray', 'exe', partition='pdebug', + extra_lbann_flags={'--bad_key': 5}, + check_executable_existence=False) + assert False + except Exception as 
e: + actual = str(e) + expected = 'Invalid character(s): --bad_key contains --' + assert actual == expected + + +def test_blacklisted_substrings_3(): + try: + tools.get_command('ray', 'exe', partition='pdebug', + extra_lbann_flags={'key': '--bad_value'}, + check_executable_existence=False) + assert False + except Exception as e: + actual = str(e) + expected = 'Invalid character(s): --bad_value contains --' + assert actual == expected + + def test_unsupported_cluster(): try: - tools.get_command('quartz', 'exe', check_executable_existence=False) + tools.get_command('q', 'exe', check_executable_existence=False) + assert False except Exception as e: actual = str(e) - expected = 'Unsupported Cluster: quartz' + expected = 'Unsupported Cluster: q' assert actual == expected + def test_bad_model_1(): try: - tools.get_command('ray', 'exe', dir_name='dir', model_folder='folder', model_name='name', model_path='path', check_executable_existence=False) + tools.get_command('ray', 'exe', dir_name='dir', model_folder='folder', + model_name='name', model_path='path', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: model_path is set but so is at least one of model folder and model_name' assert actual == expected + def test_bad_model_2(): try: - tools.get_command('ray', 'exe', dir_name='dir', model_folder='folder', model_path='path', check_executable_existence=False) + tools.get_command('ray', 'exe', dir_name='dir', model_folder='folder', + model_path='path', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: model_path is set but so is at least one of model folder and model_name' assert actual == expected + def test_bad_model_3(): try: - tools.get_command('ray', 'exe', dir_name='dir', model_name='name', model_path='path', check_executable_existence=False) + tools.get_command('ray', 'exe', dir_name='dir', model_name='name', + model_path='path', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: model_path is set but so is at least one of model folder and model_name' assert actual == expected + def test_bad_model_4(): try: - tools.get_command('ray', 'exe', dir_name='dir', model_folder='folder', check_executable_existence=False) + tools.get_command('ray', 'exe', dir_name='dir', model_folder='folder', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: model_folder set but not model_name.' assert actual == expected + def test_bad_model_5(): try: - tools.get_command('ray', 'exe', dir_name='dir', model_name='name', check_executable_existence=False) + tools.get_command('ray', 'exe', dir_name='dir', model_name='name', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: model_name set but not model_folder.' assert actual == expected + def test_bad_data_reader(): try: - tools.get_command('catalyst', 'exe', dir_name='dir', data_reader_name='name', data_reader_path='path', check_executable_existence=False) + tools.get_command('catalyst', 'exe', dir_name='dir', + data_reader_name='name', data_reader_path='path', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_reader_path is set but so is data_reader_name , data_reader_name or data_reader_path is set but not data_filedir_default. 
If a data reader is provided, the default filedir must be set. This allows for determining what the filedir should be on each cluster. Alternatively, some or all of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] can be set.' assert actual == expected + def test_bad_optimizer(): try: - tools.get_command('ray', 'exe', dir_name='dir', optimizer_name='name', optimizer_path='path', check_executable_existence=False) + tools.get_command('ray', 'exe', dir_name='dir', optimizer_name='name', + optimizer_path='path', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: optimizer_path is set but so is optimizer_name' assert actual == expected + def test_bad_dir_name_1(): try: - tools.get_command('ray', 'exe', dir_name='dir', check_executable_existence=False) + tools.get_command('ray', 'exe', dir_name='dir', + check_executable_existence=False) + assert False except Exception as e: - actual = str(e) - expected = 'Invalid Usage: dir_name set but none of model_folder, model_name, data_reader_name, optimizer_name are.' + actual = str(e) + expected = 'Invalid Usage: dir_name set but none of model_folder, model_name, data_reader_name, optimizer_name are.' assert actual == expected + def test_bad_dir_name_2(): try: - tools.get_command('ray', 'exe', model_folder='folder', check_executable_existence=False) + tools.get_command('ray', 'exe', model_folder='folder', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: dir_name is not set but at least one of model_folder, model_name, data_reader_name, optimizer_name is.' assert actual == expected + def test_bad_dir_name_3(): try: - tools.get_command('ray', 'exe', model_name='name', check_executable_existence=False) + tools.get_command('ray', 'exe', model_name='name', + check_executable_existence=False) + assert False except Exception as e: - actual = str(e) - expected = 'Invalid Usage: dir_name is not set but at least one of model_folder, model_name, data_reader_name, optimizer_name is.' + actual = str(e) + expected = 'Invalid Usage: dir_name is not set but at least one of model_folder, model_name, data_reader_name, optimizer_name is.' assert actual == expected + def test_bad_dir_name_4(): try: - tools.get_command('catalyst', 'exe', data_reader_name='name', check_executable_existence=False) + tools.get_command('catalyst', 'exe', data_reader_name='name', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: dir_name is not set but at least one of model_folder, model_name, data_reader_name, optimizer_name is. , data_reader_name or data_reader_path is set but not data_filedir_default. If a data reader is provided, the default filedir must be set. This allows for determining what the filedir should be on each cluster. Alternatively, some or all of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] can be set.' assert actual == expected + def test_bad_dir_name_5(): try: - tools.get_command('ray', 'exe', optimizer_name='name', check_executable_existence=False) + tools.get_command('ray', 'exe', optimizer_name='name', + check_executable_existence=False) + assert False except Exception as e: - actual = str(e) - expected = 'Invalid Usage: dir_name is not set but at least one of model_folder, model_name, data_reader_name, optimizer_name is.' 
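These error-case tests all follow the same try/except pattern, with the added `assert False` guaranteeing a failure when `tools.get_command` does not raise. The same check can be written more compactly with `pytest.raises`; a minimal sketch, reusing the 'Unsupported Cluster' message asserted above (illustrative only, not part of this patch):

    import re
    import pytest
    import tools

    def test_unsupported_cluster_with_raises():
        # pytest.raises fails the test automatically when no exception is
        # thrown, which replaces the explicit `assert False` bookkeeping.
        with pytest.raises(Exception,
                           match=re.escape('Unsupported Cluster: q')):
            tools.get_command('q', 'exe', check_executable_existence=False)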
+ actual = str(e) + expected = 'Invalid Usage: dir_name is not set but at least one of model_folder, model_name, data_reader_name, optimizer_name is.' assert actual == expected + def test_bad_data_filedir_1(): try: - tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', data_filedir_default='filedir', data_filedir_train_default='a', + tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', + data_filedir_default='filedir', + data_filedir_train_default='a', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_fildir_default set but so is at least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default]' assert actual == expected + def test_bad_data_filedir_2(): try: - tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', data_filedir_default='filedir', data_filename_train_default='b', + tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', + data_filedir_default='filedir', + data_filename_train_default='b', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_fildir_default set but so is at least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default]' @@ -164,33 +270,50 @@ def test_bad_data_filedir_2(): def test_bad_data_filedir_3(): try: - tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', data_filedir_default='filedir', data_filedir_test_default='c', + tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', + data_filedir_default='filedir', + data_filedir_test_default='c', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_fildir_default set but so is at least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default]' assert actual == expected + def test_bad_data_filedir_4(): try: - tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', data_filedir_default='filedir', data_filename_test_default='d', + tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', + data_filedir_default='filedir', + data_filename_test_default='d', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_fildir_default set but so is at least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default]' assert actual == expected + def test_bad_data_filedir_5(): try: - tools.get_command('ray', 'exe', data_reader_path='path', data_filedir_default='filedir', data_filedir_train_default='e', check_executable_existence=False) + tools.get_command('ray', 'exe', data_reader_path='path', + data_filedir_default='filedir', + data_filedir_train_default='e', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_fildir_default set but so is at least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default]' assert actual == expected + def test_bad_data_filedir_6(): try: - tools.get_command('ray', 'exe', data_reader_path='path', data_filedir_default='filedir', data_filename_train_default='f', check_executable_existence=False) + tools.get_command('ray', 
'exe', data_reader_path='path', + data_filedir_default='filedir', + data_filename_train_default='f', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_fildir_default set but so is at least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default]' @@ -199,47 +322,68 @@ def test_bad_data_filedir_6(): def test_bad_data_filedir_7(): try: - tools.get_command('ray', 'exe', data_reader_path='path', data_filedir_default='filedir', data_filedir_test_default='g', check_executable_existence=False) + tools.get_command('ray', 'exe', data_reader_path='path', + data_filedir_default='filedir', + data_filedir_test_default='g', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_fildir_default set but so is at least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default]' assert actual == expected + def test_bad_data_filedir_8(): try: - tools.get_command('ray', 'exe', data_reader_path='path', data_filedir_default='filedir', data_filename_test_default='h', check_executable_existence=False) + tools.get_command('ray', 'exe', data_reader_path='path', + data_filedir_default='filedir', + data_filename_test_default='h', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_fildir_default set but so is at least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default]' assert actual == expected + def test_bad_data_filedir_9(): try: - tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', check_executable_existence=False) + tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_reader_name or data_reader_path is set but not data_filedir_default. If a data reader is provided, the default filedir must be set. This allows for determining what the filedir should be on each cluster. Alternatively, some or all of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] can be set.' assert actual == expected + def test_bad_data_filedir_10(): try: - tools.get_command('ray', 'exe', data_reader_path='path', check_executable_existence=False) + tools.get_command('ray', 'exe', data_reader_path='path', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_reader_name or data_reader_path is set but not data_filedir_default. If a data reader is provided, the default filedir must be set. This allows for determining what the filedir should be on each cluster. Alternatively, some or all of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] can be set.' assert actual == expected + def test_bad_data_filedir_11(): try: - tools.get_command('ray', 'exe', data_filedir_default='filedir', check_executable_existence=False) + tools.get_command('ray', 'exe', data_filedir_default='filedir', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_filedir_default set but neither data_reader_name or data_reader_path are.' 
- assert actual == expected + assert actual == expected + def test_bad_data_filedir_12(): try: - tools.get_command('ray', 'exe', data_filedir_train_default='a', check_executable_existence=False) + tools.get_command('ray', 'exe', data_filedir_train_default='a', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: At least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] is set, but neither data_reader_name or data_reader_path are.' @@ -248,7 +392,9 @@ def test_bad_data_filedir_12(): def test_bad_data_filedir_13(): try: - tools.get_command('ray', 'exe', data_filename_train_default='b', check_executable_existence=False) + tools.get_command('ray', 'exe', data_filename_train_default='b', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: At least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] is set, but neither data_reader_name or data_reader_path are.' @@ -257,7 +403,9 @@ def test_bad_data_filedir_13(): def test_bad_data_filedir_14(): try: - tools.get_command('ray', 'exe', data_filedir_test_default='c', check_executable_existence=False) + tools.get_command('ray', 'exe', data_filedir_test_default='c', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: At least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] is set, but neither data_reader_name or data_reader_path are.' @@ -266,8 +414,48 @@ def test_bad_data_filedir_14(): def test_bad_data_filedir_15(): try: - tools.get_command('ray', 'exe', data_filename_test_default='e', check_executable_existence=False) + tools.get_command('ray', 'exe', data_filename_test_default='e', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: At least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] is set, but neither data_reader_name or data_reader_path are.' assert actual == expected + + +def test_bad_extra_lbann_flags_invalid_flag(): + try: + tools.get_command('ray', 'exe', partition='pdebug', + extra_lbann_flags={'invalid_flag': 'value'}, + check_executable_existence=False) + assert False + except Exception as e: + actual = str(e) + expected = ("Invalid Usage: extra_lbann_flags includes invalid" + " flag=invalid_flag. Flags must" + " be in ['hydrogen_block_size', 'procs_per_trainer'," + " 'num_parallel_readers', 'num_io_threads', 'serialize_io'," + " 'disable_background_io_activity', 'disable_cuda'," + " 'random_seed', 'objective_function', 'data_layout'," + " 'print_affinity', 'use_data_store', 'preload_data_store'," + " 'super_node', 'write_sample_list', 'ltfb_verbose'," + " 'ckpt_dir', 'index_list_train', 'index_list_test'," + " 'label_filename_train', 'label_filename_test'," + " 'share_testing_data_readers', 'image_dir', 'no_im_comm']." + ) + assert actual == expected + + +def test_bad_extra_lbann_flags_not_a_dict(): + try: + tools.get_command('ray', 'exe', partition='pdebug', + extra_lbann_flags='invalid_flag', + check_executable_existence=False) + assert False + except Exception as e: + actual = str(e) + expected = ( + 'Invalid Usage: extra_lbann_flags must be a dict e.g. `{flag :' + ' None, flag: 4}`. 
Use `None` if a flag has no value attached ' + 'to it.') + assert actual == expected diff --git a/bamboo/common_python/tools.py b/bamboo/common_python/tools.py index 7110ddc9a67..27f15772254 100644 --- a/bamboo/common_python/tools.py +++ b/bamboo/common_python/tools.py @@ -1,22 +1,34 @@ +import collections.abc +import copy +import math +import os +import re +import sys +import numpy as np import pytest -import math, os, re - +import shutil +import subprocess +from filecmp import cmp def check_list(substrings, strings): errors = [] for string in strings: for substring in substrings: - if (string != None) and (substring in string): + if (string is not None) and (isinstance(string, str)) and (substring in string): errors.append('%s contains %s' % (string, substring)) return errors def get_command(cluster, executable, + # Allocation/Run Parameters num_nodes=None, + num_processes=None, partition=None, time_limit=None, - num_processes=None, + # LBANN Parameters + ckpt_dir=None, + disable_cuda=None, dir_name=None, data_filedir_default=None, data_filedir_train_default=None, @@ -27,6 +39,7 @@ def get_command(cluster, data_reader_path=None, data_reader_percent=None, exit_after_setup=False, + metadata=None, mini_batch_size=None, model_folder=None, model_name=None, @@ -35,39 +48,64 @@ def get_command(cluster, optimizer_name=None, optimizer_path=None, processes_per_model=None, - ckpt_dir=None, - output_file_name=None, + restart_dir=None, + extra_lbann_flags=None, + # Error/Output Redirect error_file_name=None, - return_tuple=False, + output_file_name=None, + # Misc. Parameters check_executable_existence=True, - skip_no_exe=True): + return_tuple=False, + skip_no_exe=True, + weekly=False): # Check parameters for black-listed characters like semi-colons that # would terminate the command and allow for an extra command blacklist = [';', '--'] - strings = [partition, dir_name, data_filedir_default, - data_filedir_train_default, - data_filename_train_default, data_filedir_test_default, - data_filename_test_default, data_reader_name, data_reader_path, - model_folder, model_name, model_path, optimizer_name, - optimizer_path, output_file_name, error_file_name] + strings = [ + cluster, executable, + # Allocation/Run Parameters + num_nodes, num_processes, partition, time_limit, + # LBANN Parameters + ckpt_dir, dir_name, data_filedir_default, data_filedir_train_default, + data_filename_train_default, data_filedir_test_default, + data_filename_test_default, data_reader_name, data_reader_path, + data_reader_percent, exit_after_setup, metadata, mini_batch_size, + model_folder, model_name, model_path, num_epochs, optimizer_name, + optimizer_path, processes_per_model, restart_dir, + # Error/Output Redirect + error_file_name, output_file_name, + # Misc. Parameters + check_executable_existence, return_tuple, skip_no_exe, weekly + ] + lbann_errors = [] + if extra_lbann_flags is not None: + if not isinstance(extra_lbann_flags, dict): + lbann_errors.append( + ('extra_lbann_flags must be a dict e.g. `{flag :' + ' None, flag: 4}`. Use `None` if a flag has no value attached ' + 'to it.')) + else: + strings += list(extra_lbann_flags.keys()) + strings += list(extra_lbann_flags.values()) invalid_character_errors = check_list(blacklist, strings) if invalid_character_errors != []: raise Exception('Invalid character(s): %s' % ' , '.join( invalid_character_errors)) - # Never give lbannusr an allocation for over 12 hours though. - strict_time_limit = 60*6 # 6 hours. 
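The new time-limit handling just below replaces this hard cap: when no limit is given, `weekly` picks between a 35-minute nightly default and the 6-hour maximum, and any requested limit is still capped at 6 hours. A standalone restatement of that logic (the helper name is illustrative, not part of tools.py):

    DEFAULT_TIME = 35   # minutes, nightly default
    MAX_TIME = 360      # minutes; never hold an allocation for more than 6 hours

    def resolve_time_limit(time_limit=None, weekly=False):
        # Mirrors the branch added to get_command().
        if time_limit is None:
            time_limit = MAX_TIME if weekly else DEFAULT_TIME
        return min(time_limit, MAX_TIME)

    assert resolve_time_limit() == 35
    assert resolve_time_limit(weekly=True) == 360
    assert resolve_time_limit(time_limit=1000) == 360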
- if time_limit > strict_time_limit: - time_limit = strict_time_limit - - # Check executable existence - if check_executable_existence: - process_executable_existence(executable, skip_no_exe) + DEFAULT_TIME = 35 + MAX_TIME = 360 # 6 hours. + if time_limit is None: + if weekly: + time_limit = MAX_TIME + else: + time_limit = DEFAULT_TIME + if time_limit > MAX_TIME: + time_limit = MAX_TIME # Determine scheduler - if cluster in ['catalyst', 'pascal', 'quartz', 'surface']: + if cluster in ['catalyst', 'corona', 'pascal']: scheduler = 'slurm' - elif cluster == 'ray': + elif cluster in ['lassen', 'ray']: scheduler = 'lsf' else: raise Exception('Unsupported Cluster: %s' % cluster) @@ -77,9 +115,9 @@ def get_command(cluster, if scheduler == 'slurm': # Create allocate command command_allocate = '' - # Allocate a node if we don't have one already - # Running the tests manually allows for already having a node allocated - if os.getenv('SLURM_JOB_NUM_NODES') == None: + # Allocate nodes only if we don't already have an allocation. + if os.getenv('SLURM_JOB_NUM_NODES') is None: + print('Allocating slurm nodes.') command_allocate = 'salloc' option_num_nodes = '' option_partition = '' @@ -91,8 +129,8 @@ def get_command(cluster, # maxnodes. option_num_nodes = ' --nodes=%d' % num_nodes if partition is not None: - # Surface does not have pdebug, so switch to pbatch - if (cluster in ['surface', 'pascal']) and \ + # If cluster doesn't have pdebug switch to pbatch. + if (cluster in ['pascal']) and \ (partition == 'pdebug'): partition = 'pbatch' # --partition => Request a specific partition for the resource @@ -106,12 +144,16 @@ def get_command(cluster, command_allocate = '%s%s%s%s' % ( command_allocate, option_num_nodes, option_partition, option_time_limit) + else: + print('slurm nodes already allocated.') # Create run command if command_allocate == '': - command_run = 'srun --mpibind=off' + space = '' else: - command_run = ' srun --mpibind=off' + space = ' ' + command_run = '{s}srun --mpibind=off --time={t}'.format( + s=space, t=time_limit) option_num_processes = '' if num_processes is not None: # --ntasks => Specify the number of tasks to run. @@ -122,24 +164,29 @@ def get_command(cluster, elif scheduler == 'lsf': # Create allocate command command_allocate = '' - # Allocate a node if we don't have one already - # Running the tests manually allows for already having a node allocated - if os.getenv('LSB_HOSTS') is None: + # Allocate nodes only if we don't already have an allocation. + if (os.getenv('LSB_HOSTS') is None) and (os.getenv('LSB_JOBID') is None): + print('Allocating lsf nodes.') command_allocate = 'bsub' - # x => Puts the host running your job into exclusive execution - # mode. - option_exclusive = ' -x' + option_exclusive = '' + if cluster != 'lassen': + # x => Puts the host running your job into exclusive execution + # mode. + option_exclusive = ' -x' # G=> For fairshare scheduling. Associates the job with the # specified group. option_group = ' -G guests' # Is => Submits an interactive job and creates a pseudo-terminal # with shell mode when the job starts. option_interactive = ' -Is' + option_num_nodes = '' option_num_processes = '' option_partition = '' option_processes_per_node = '' option_time_limit = '' - if num_processes is not None: + if cluster == 'lassen': + option_num_nodes = ' -nnodes {n}'.format(n=num_nodes) + elif num_processes is not None: # n => Submits a parallel job and specifies the number of # tasks in the job. 
option_num_processes = ' -n %d' % num_processes @@ -147,7 +194,7 @@ def get_command(cluster, # R => Runs the job on a host that meets the specified # resource requirements. option_processes_per_node = ' -R "span[ptile=%d]"' % int( - math.ceil(float(num_processes)/num_nodes)) + math.ceil(float(num_processes) / num_nodes)) if partition is not None: # q => Submits the job to one of the specified queues. option_partition = ' -q %s' % partition @@ -158,32 +205,70 @@ def get_command(cluster, time_limit = max_ray_time # W => Sets the runtime limit of the job. option_time_limit = ' -W %d' % time_limit - command_allocate = '%s%s%s%s%s%s%s%s' % ( + command_allocate = '%s%s%s%s%s%s%s%s%s' % ( command_allocate, option_exclusive, option_group, option_interactive, option_num_processes, option_partition, - option_processes_per_node, option_time_limit) + option_num_nodes, option_processes_per_node, option_time_limit) + else: + print('lsf nodes already allocated.') # Create run command if command_allocate == '': - command_run = 'mpirun' + space = '' else: - command_run = ' mpirun' + space = ' ' + if cluster == 'lassen': + # Cannot specify time limit for jsrun. + command_run = '{s}jsrun'.format(s=space) + else: + command_run = '{s}mpirun --timeout {t}'.format(s=space, t=time_limit*60) + option_bind = '' + option_cpu_per_resource = '' + option_gpu_per_resource = '' + option_launch_distribution = '' option_num_processes = '' option_processes_per_node = '' + option_resources_per_host = '' + option_tasks_per_resource = '' if num_processes is not None: - # -np => Run this many copies of the program on the given nodes. - option_num_processes = ' -np %d' % num_processes - if (num_nodes is not None) and (num_nodes != 0): - option_processes_per_node = ' -N %d' % int( - math.ceil(float(num_processes)/num_nodes)) - command_run = '%s%s%s' % ( - command_run, option_num_processes, option_processes_per_node) + if cluster == 'lassen': + option_bind = ' -b "packed:10"' + option_cpu_per_resource = ' -c 40' + option_gpu_per_resource = ' -g 4' + option_launch_distribution = ' -d packed' + # Avoid `nrs (32) should not be greater than rs_per_host (1) * number of servers available (16).` + if num_nodes is None: + num_nodes = 1 + # The "option_num_processes" is a misnomer for the LSF case. Rather than + # changing the rest of the code, set it to be the number of nodes. Within + # JSRUN, the correct number of processes will be obtained when combined + # with "option_tasks_per_resource". + option_num_processes = ' -n {n}'.format(n=num_nodes) + option_resources_per_host = ' -r 1' + option_tasks_per_resource = ' -a %d' % (num_processes/num_nodes) + if (num_processes%num_nodes) is not 0: + raise Exception('num_processes %s, is not divisible by num_nodes %d' + % (num_processes, num_nodes)) + + else: + # -np => Run this many copies of the program on the given nodes. 
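On Lassen, jsrun is given one resource set per node (`-r 1`), `-n` set to the node count, and `-a` set to the ranks per node, so `num_processes` must divide evenly by `num_nodes`. A small standalone sketch of that calculation (hypothetical helper, not part of tools.py); note that integer equality is conventionally tested with `!=` rather than `is not`:

    def jsrun_tasks_per_node(num_processes, num_nodes):
        # One resource set per node, so each must hold an equal share of ranks.
        tasks, remainder = divmod(num_processes, num_nodes)
        if remainder != 0:
            raise ValueError('num_processes %d is not divisible by num_nodes %d'
                             % (num_processes, num_nodes))
        return tasks

    # e.g. 40 ranks spread over 4 nodes -> '-r 1 -a 10'
    assert jsrun_tasks_per_node(40, 4) == 10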
+ option_num_processes = ' -np %d' % num_processes + if (num_nodes is not None) and (num_nodes != 0): + processes_per_node = int( + math.ceil(float(num_processes)/num_nodes)) + option_processes_per_node = ' -N %d' % processes_per_node + command_run = '%s%s%s%s%s%s%s%s%s' % ( + command_run, option_bind, option_cpu_per_resource, + option_gpu_per_resource, option_launch_distribution, + option_num_processes, option_processes_per_node, + option_resources_per_host, option_tasks_per_resource) else: raise Exception('Unsupported Scheduler %s' % scheduler) # Create LBANN command option_ckpt_dir = '' + option_disable_cuda = '' option_data_filedir = '' option_data_filedir_train = '' option_data_filename_train = '' @@ -192,12 +277,13 @@ def get_command(cluster, option_data_reader = '' option_data_reader_percent = '' option_exit_after_setup = '' + option_metadata = '' option_mini_batch_size = '' option_model = '' option_num_epochs = '' option_optimizer = '' option_processes_per_model = '' - lbann_errors = [] + option_restart_dir = '' if model_path is not None: # If model_folder and/or model_name are set, an exception will be # raised later. @@ -206,8 +292,8 @@ def get_command(cluster, # If data_reader_name is set, an exception will be raised later. option_data_reader = ' --reader=%s' % data_reader_path if optimizer_path is not None: - # If optimizer_name is set, an exception will be raised later. - option_optimizer_name = ' --optimizer=%s' % optimizer_path + # If optimizer_name is also set, an exception will be raised later. + option_optimizer = ' --optimizer=%s' % optimizer_path if dir_name is not None: if model_path is not None: if (model_folder is not None) or (model_name is not None): @@ -251,27 +337,40 @@ def get_command(cluster, # Determine data file paths # If there is no regex match, then re.sub keeps the original string if data_filedir_default is not None: - if cluster in ['catalyst', 'pascal', 'surface']: + if cluster in ['catalyst', 'corona', 'pascal',]: # option_data_filedir = data_filedir_default # lscratchh, presumably pass # No need to pass in a parameter - elif cluster == 'quartz': + elif cluster == 'lassen': option_data_filedir = ' --data_filedir=%s' % re.sub( - '[a-z]scratch[a-z]', 'lscratchh', data_filedir_default) + '[a-z]scratch[a-z]', 'gpfs1', data_filedir_default) elif cluster == 'ray': option_data_filedir = ' --data_filedir=%s' % re.sub( '[a-z]scratch[a-z]', 'gscratchr', data_filedir_default) - elif None not in data_file_parameters: - if cluster in ['catalyst', 'pascal', 'surface']: + elif not data_file_parameters == [None, None, None, None]: + # Any of the data_file_parameters has a non-None value. 
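The cluster-specific path handling below rewrites the default `*scratch*` file-system prefix with a single regex substitution; a quick illustration using the default value from the command tests above (the pattern and replacements are taken from the code, the asserts are just examples):

    import re

    # On Lassen the default LC scratch prefix becomes gpfs1; on Ray it becomes
    # gscratchr. A path with no scratch prefix is passed through unchanged.
    assert re.sub('[a-z]scratch[a-z]', 'gpfs1',
                  'lscratchh/filedir') == 'gpfs1/filedir'
    assert re.sub('[a-z]scratch[a-z]', 'gscratchr',
                  'lscratchh/filedir') == 'gscratchr/filedir'
    assert re.sub('[a-z]scratch[a-z]', 'gpfs1',
                  '/p/gpfs1/already/there') == '/p/gpfs1/already/there'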
+ if cluster in ['catalyst', 'corona', 'pascal']: # option_data_filedir_train = data_filedir_train_default # option_data_filename_train = data_filename_train_default # option_data_filedir_test = data_filedir_test_default # option_data_filename_train = data_filename_test_default - pass # No need to pass in a parameter - elif cluster == 'quartz': - option_data_filedir_train = ' --data_filedir_train=%s' % re.sub('[a-z]scratch[a-z]', 'lscratchh', data_filedir_train_default) - option_data_filename_train = ' --data_filename_train=%s' % re.sub('[a-z]scratch[a-z]', 'lscratchh', data_filename_train_default) - option_data_filedir_test = ' --data_filedir_test=%s' % re.sub('[a-z]scratch[a-z]', 'lscratchh', data_filedir_test_default) - option_data_filename_train = ' --data_filename_test=%s' % re.sub('[a-z]scratch[a-z]', 'lscratchh', data_filename_test_default) + pass # No need to pass in a parameter + elif cluster == 'lassen': + if data_filedir_train_default is not None: + option_data_filedir_train = ' --data_filedir_train=%s' % re.sub('[a-z]scratch[a-z]', 'gpfs1', data_filedir_train_default) + if data_filename_train_default is not None: + filename_train = re.sub( + '[a-z]scratch[a-z]', 'gpfs1', data_filename_train_default) + filename_train = re.sub( + 'labels', 'original/labels', filename_train) + option_data_filename_train = ' --data_filename_train=%s' % filename_train + if data_filedir_test_default is not None: + option_data_filedir_test = ' --data_filedir_test=%s' % re.sub('[a-z]scratch[a-z]', 'gpfs1', data_filedir_test_default) + if data_filename_test_default is not None: + filename_test = re.sub( + '[a-z]scratch[a-z]', 'gpfs1', data_filename_test_default) + filename_test = re.sub( + 'labels', 'original/labels', filename_test) + option_data_filename_test = ' --data_filename_test=%s' % filename_test elif cluster == 'ray': option_data_filedir_train = ' --data_filedir_train=%s' % re.sub('[a-z]scratch[a-z]', 'gscratchr', data_filedir_train_default) option_data_filename_train = ' --data_filename_train=%s' % re.sub('[a-z]scratch[a-z]', 'gscratchr', data_filename_train_default) @@ -290,22 +389,23 @@ def get_command(cluster, else: # if None in data_file_parameters: # If any are None if data_file_parameters == [None, None, None, None]: # If all are None - lbann_errors.append( - ('data_reader_name or data_reader_path is set but not' - ' data_filedir_default. If a data reader is provided,' - ' the default filedir must be set. This allows for' - ' determining what the filedir should be on each' - ' cluster. Alternatively, some or all of' - ' [data_filedir_train_default, data_filename_train' - '_default, data_filedir_test_default, data_filename' - '_test_default] can be set.')) + if data_reader_name != 'synthetic': + lbann_errors.append( + ('data_reader_name or data_reader_path is set but not' + ' data_filedir_default. If a data reader is provided,' + ' the default filedir must be set. This allows for' + ' determining what the filedir should be on each' + ' cluster. 
Alternatively, some or all of' + ' [data_filedir_train_default, data_filename_train' + '_default, data_filedir_test_default, data_filename' + '_test_default] can be set.')) # else: no data_file parameters are set else: if data_filedir_default is not None: lbann_errors.append( ('data_filedir_default set but neither data_reader_name' ' or data_reader_path are.')) - elif filter(lambda x: x is not None, data_file_parameters) != []: + elif list(filter(lambda x: x is not None, data_file_parameters)) != []: # If the list of non-None data_file parameters is not empty lbann_errors.append( ('At least one of [data_filedir_train_default, data_filename' @@ -313,10 +413,30 @@ def get_command(cluster, '_test_default] is set, but neither data_reader_name or' ' data_reader_path are.')) # else: no conflicts - if data_reader_percent is not None: - option_data_reader_percent = ' --data_reader_percent=%f' % data_reader_percent + if data_reader_percent != "prototext": + if data_reader_percent is not None: + + # If data_reader_percent is not None, then it will override `weekly`. + # If it is None however, we choose its value based on `weekly`. + try: + data_reader_percent = float(data_reader_percent) + + except ValueError: + lbann_errors.append( + 'data_reader_percent={d} is not a float.'.format( + d=data_reader_percent)) + elif weekly: + data_reader_percent = 1.00 + else: + # Nightly + data_reader_percent = 0.10 + option_data_reader_percent = ' --data_reader_percent={d}'.format( + d=data_reader_percent) + # else: use the data reader's value if exit_after_setup: option_exit_after_setup = ' --exit_after_setup' + if metadata is not None: + option_metadata = ' --metadata={d}/{m}'.format(d=dir_name, m=metadata) if mini_batch_size is not None: option_mini_batch_size = ' --mini_batch_size=%d' % mini_batch_size if num_epochs is not None: @@ -325,17 +445,91 @@ def get_command(cluster, option_processes_per_model = ' --procs_per_model=%d' % processes_per_model if ckpt_dir is not None: option_ckpt_dir = ' --ckpt_dir=%s' % ckpt_dir + if restart_dir is not None: + option_restart_dir = ' --restart_dir=%s' % restart_dir + if disable_cuda is not None: + option_disable_cuda = ' --disable_cuda=%d' % int(bool(disable_cuda)) + extra_options = '' + if extra_lbann_flags is not None: + # If extra_lbann_flags is not a dict, then we have already appended + # this error to lbann_errors. + if isinstance(extra_lbann_flags, dict): + # See `lbann --help` or src/proto/proto_common.cpp + # Commented out flags already have their own parameters + # in this function. 
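Flags from `extra_lbann_flags` are validated against the allowed list below and then appended to the command line, with a value of `None` emitting a bare flag. A tiny illustration of the resulting option string (flag names taken from the allowed list; the values are made up):

    extra_lbann_flags = {'print_affinity': None, 'random_seed': 7}
    extra_options = ''
    for flag, value in sorted(extra_lbann_flags.items()):
        if value is not None:
            extra_options += ' --{f}={v}'.format(f=flag, v=value)
        else:
            extra_options += ' --{f}'.format(f=flag)
    assert extra_options == ' --print_affinity --random_seed=7'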
+ allowed_flags = [ + # 'model', + # 'optimizer', + # 'reader', + # 'metadata', + + # General: + # 'mini_batch_size', + # 'num_epochs', + 'hydrogen_block_size', + 'procs_per_trainer', + 'num_parallel_readers', + 'num_io_threads', + 'serialize_io', + 'disable_background_io_activity', + #'disable_cuda', + 'random_seed', + 'objective_function', + 'data_layout', + 'print_affinity', + 'use_data_store', + 'preload_data_store', + 'super_node', + 'write_sample_list', + 'ltfb_verbose', + 'ckpt_dir', + #'restart_dir', + 'restart_dir_is_fullpath', + + # DataReaders: + # 'data_filedir', + # 'data_filedir_train', + # 'data_filedir_test', + # 'data_filename_train', + # 'data_filename_test', + 'index_list_train', + 'index_list_test', + 'label_filename_train', + 'label_filename_test', + # 'data_reader_percent', + 'share_testing_data_readers', + + # Callbacks: + 'image_dir', + 'no_im_comm', + + # Not listed by `lbann --help`: + # 'exit_after_setup', + # 'procs_per_model' + ] + for flag, value in sorted(extra_lbann_flags.items()): + if flag in allowed_flags: + if value is not None: + extra_options += ' --{f}={v}'.format(f=flag, v=value) + else: + extra_options += ' --{f}'.format(f=flag) + else: + s = ('extra_lbann_flags includes invalid flag={f}.' + ' Flags must be in {flags}.').format( + f=flag, flags=allowed_flags) + lbann_errors.append(s) if lbann_errors != []: print('lbann_errors={lbann_errors}.'.format(lbann_errors=lbann_errors)) raise Exception('Invalid Usage: ' + ' , '.join(lbann_errors)) - command_lbann = '%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s' % ( - executable, option_ckpt_dir, option_data_filedir, + command_lbann = '%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s' % ( + executable, option_ckpt_dir, option_disable_cuda, + option_data_filedir, option_data_filedir_train, option_data_filename_train, option_data_filedir_test, option_data_filename_test, option_data_reader, option_data_reader_percent, - option_exit_after_setup, option_mini_batch_size, + option_exit_after_setup, option_metadata, option_mini_batch_size, option_model, option_num_epochs, option_optimizer, - option_processes_per_model) + option_processes_per_model, option_restart_dir, extra_options) # Create redirect command command_output = '' @@ -357,28 +551,27 @@ def get_command(cluster, return command_string -def process_executable_existence(executable, skip_no_exe=True): - executable_exists = os.path.exists(executable) - if not executable_exists: - error_string = 'Executable does not exist: %s' % executable - if skip_no_exe: - pytest.skip(error_string) - else: - raise Exception(error_string) +def process_executable(name, compiler_name, executables): + if compiler_name not in executables: + e = '{n}: default_exes[{c}] does not exist'.format( + n=name, c=compiler_name) + print('Skip - ' + e) + import pytest + pytest.skip(e) + executable_path = executables[compiler_name] + print('{n}: executable_path={e}'.format(n=name, e=executable_path)) def get_spack_exes(default_dirname, cluster): exes = {} - exes['clang4'] = '%s/bamboo/compiler_tests/builds/%s_clang-4.0.0_rel/build/model_zoo/lbann' % (default_dirname, cluster) - exes['gcc4'] = '%s/bamboo/compiler_tests/builds/%s_gcc-4.9.3_rel/build/model_zoo/lbann' % (default_dirname, cluster) + exes['clang6'] = '%s/bamboo/compiler_tests/builds/%s_clang-6.0.0_rel/build/model_zoo/lbann' % (default_dirname, cluster) exes['gcc7'] = '%s/bamboo/compiler_tests/builds/%s_gcc-7.1.0_rel/build/model_zoo/lbann' % (default_dirname, cluster) - exes['intel18'] = 
'%s/bamboo/compiler_tests/builds/%s_intel-18.0.0_rel/build/model_zoo/lbann' % (default_dirname, cluster) + exes['intel19'] = '%s/bamboo/compiler_tests/builds/%s_intel-19.0.0_rel/build/model_zoo/lbann' % (default_dirname, cluster) - exes['clang4_debug'] = '%s/bamboo/compiler_tests/builds/%s_clang-4.0.0_debug/build/model_zoo/lbann' % (default_dirname, cluster) - exes['gcc4_debug'] = '%s/bamboo/compiler_tests/builds/%s_gcc-4.9.3_debug/build/model_zoo/lbann' % (default_dirname, cluster) + exes['clang6_debug'] = '%s/bamboo/compiler_tests/builds/%s_clang-6.0.0_debug/build/model_zoo/lbann' % (default_dirname, cluster) exes['gcc7_debug'] = '%s/bamboo/compiler_tests/builds/%s_gcc-7.1.0_debug/build/model_zoo/lbann' % (default_dirname, cluster) - exes['intel18_debug'] = '%s/bamboo/compiler_tests/builds/%s_intel-18.0.0_debug/build/model_zoo/lbann' % (default_dirname, cluster) + exes['intel19_debug'] = '%s/bamboo/compiler_tests/builds/%s_intel-19.0.0_debug/build/model_zoo/lbann' % (default_dirname, cluster) return exes @@ -386,38 +579,411 @@ def get_spack_exes(default_dirname, cluster): def get_default_exes(default_dirname, cluster): exes = get_spack_exes(default_dirname, cluster) # Use build script as a backup if the Spack build doesn't work. - if not os.path.exists(exes['clang4']): - exes['clang4'] = '%s/build/clang.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) + if not os.path.exists(exes['clang6']): + exes['clang6'] = '%s/build/clang.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) if not os.path.exists(exes['gcc7']): exes['gcc7'] = '%s/build/gnu.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) - if not os.path.exists(exes['intel18']): - exes['intel18'] = '%s/build/intel.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) + if not os.path.exists(exes['intel19']): + exes['intel19'] = '%s/build/intel.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) - if not os.path.exists(exes['clang4_debug']): - exes['clang4_debug'] = '%s/build/clang.Debug.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) + if not os.path.exists(exes['clang6_debug']): + exes['clang6_debug'] = '%s/build/clang.Debug.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) if not os.path.exists(exes['gcc7_debug']): exes['gcc7_debug'] = '%s/build/gnu.Debug.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) - if not os.path.exists(exes['intel18_debug']): - exes['intel18_debug'] = '%s/build/intel.Debug.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) + if not os.path.exists(exes['intel19_debug']): + exes['intel19_debug'] = '%s/build/intel.Debug.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) default_exes = {} default_exes['default'] = '%s/build/gnu.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) - if cluster in ['catalyst', 'quartz', 'pascal']: - # x86_cpu - catalyst, quartz + if cluster in ['catalyst', 'corona', 'lassen', 'pascal', 'ray']: + # Define all compilers. 
+ # x86_cpu - catalyst # x86_gpu_pascal - pascal - default_exes['clang4'] = exes['clang4'] - default_exes['gcc4'] = exes['gcc4'] + # ppc64le_gpu_lassen - lassen + default_exes['clang6'] = exes['clang6'] default_exes['gcc7'] = exes['gcc7'] - default_exes['intel18'] = exes['intel18'] + default_exes['intel19'] = exes['intel19'] - default_exes['clang4_debug'] = exes['clang4_debug'] - default_exes['gcc4_debug'] = exes['gcc4_debug'] + default_exes['clang6_debug'] = exes['clang6_debug'] default_exes['gcc7_debug'] = exes['gcc7_debug'] - default_exes['intel18_debug'] = exes['intel18_debug'] - elif cluster in ['surface']: - # x86_gpu - surface - default_exes['gcc4'] = exes['gcc4'] - default_exes['gcc4_debug'] = exes['gcc4_debug'] + default_exes['intel19_debug'] = exes['intel19_debug'] + print('default_exes={d}'.format(d=default_exes)) return default_exes + + +def get_error_line(error_file_name): + with open(error_file_name, 'r') as error_file: + error_line = '' + previous_line = '' + for line in error_file: + if ('ERROR' in line) or ('LBANN error' in line) or \ + ('Error:' in line) or \ + ('Expired or invalid job' in line) or \ + ('Segmentation fault (core dumped)' in line) or \ + ('Relinquishing job allocation' in line): + error_line = line + break + elif ('Stack trace:' in line) or \ + ('Error is not recoverable: exiting now' in line): + error_line = previous_line + break + else: + previous_line = line + return error_line + + +def assert_success(return_code, error_file_name): + if return_code != 0: + error_line = get_error_line(error_file_name) + raise AssertionError( + 'return_code={rc}\n{el}\nSee {efn}'.format( + rc=return_code, el=error_line, efn=error_file_name)) + + +def assert_failure(return_code, expected_error, error_file_name): + if return_code == 0: + raise AssertionError( + 'return_code={rc}\nSuccess when expecting failure.\nSee {efn}'.format( + rc=return_code, efn=error_file_name)) + with open(error_file_name, 'r') as error_file: + for line in error_file: + if expected_error in line: + return True + # If we're at this point, then we know the test did not succeed, + # but we didn't get the expected error. + actual_error = get_error_line(error_file_name) + raise AssertionError( + 'return_code={rc}\nFailed with error different than expected.\nactual_error={ae}\nexpected_error={ee}\nSee {efn}'.format( + rc=return_code, ae=actual_error, ee=expected_error, + efn=error_file_name)) + + +def create_tests(setup_func, + test_file, + test_name_base=None, + **kwargs): + """Create functions that can interact with PyTest + + This function creates tests that involve running an LBANN + experiment with the Python frontend. `setup_func` should be a + function that takes in the LBANN Python module and outputs objects + for an LBANN experiment. A test succeeds if LBANN runs and exits + with an exit code of 0, and fails otherwise. + + PyTest detects tests by loading in a Python script and looking for + functions prefixed with 'test_'. After you call this function + within a script to generate test functions, make sure to add the + test functions to the script's scope. For example: + + _test_funcs = tools.create_tests(setup_func, __file__) + for t in _test_funcs: + globals()[t.__name__] = t + + Args: + setup_func (function): Sets up an LBANN experiment using the + Python frontend. It takes in the LBANN Python module as + input and returns a `(lbann.Trainer, lbann.Model, + lbann.reader_pb2.DataReader, lbann.Optimizer)`. + test_file (str): Python script being run by PyTest. In most + cases, use `__file__`. 
+ test_name (str, optional): Descriptive name (default: test + file name with '.py' removed). + **kwargs: Keyword arguments to pass into + `lbann.contrib.launcher.run`. + + Returns: + Iterable of function: Tests that can interact with PyTest. + Each function returns a dict containing log files and + other output data. + + """ + + # Make sure test name is valid + test_file = os.path.realpath(test_file) + if not test_name_base: + # Create test name by removing '.py' from file name + test_name_base = os.path.splitext(os.path.basename(test_file))[0] + if not re.match('^test_.', test_name_base): + # Make sure test name is prefixed with 'test_' + test_name_base = 'test_' + test_name_base + + def test_func(cluster, executables, dir_name, compiler_name): + """Function that can interact with PyTest. + + Returns a dict containing log files and other output data. + + """ + process_executable(test_name_base, compiler_name, executables) + test_name = '{}_{}'.format(test_name_base, compiler_name) + + # Load LBANN Python frontend + build_names = { + 'clang6': 'clang.Release.{}.llnl.gov'.format(cluster), + 'clang6_debug': 'clang.Debug.{}.llnl.gov'.format(cluster), + 'gcc7': 'gnu.Release.{}.llnl.gov'.format(cluster), + 'gcc7_debug': 'gnu.Debug.{}.llnl.gov'.format(cluster), + 'intel19': 'intel.Release.{}.llnl.gov'.format(cluster), + 'intel19_debug': 'intel.Debug.{}.llnl.gov'.format(cluster), + } + python_frontend_path = os.path.join(dir_name, + 'build', + build_names[compiler_name], + 'install', + 'lib', + 'python3.7', + 'site-packages') + sys.path.append(python_frontend_path) + import lbann + import lbann.contrib.launcher + + # Setup LBANN experiment + trainer, model, data_reader, optimizer = setup_func(lbann) + + # Configure kwargs to LBANN launcher + _kwargs = copy.deepcopy(kwargs) + if 'work_dir' not in _kwargs: + _kwargs['work_dir'] = os.path.join(os.path.dirname(test_file), + 'experiments', + test_name) + + # If the user provided a suffix for the work directory, append it + if 'work_subdir' in _kwargs: + _kwargs['work_dir'] = os.path.join(_kwargs['work_dir'], _kwargs['work_subdir']) + del _kwargs['work_subdir'] + + # Delete the work directory + if os.path.isdir(_kwargs['work_dir']): + shutil.rmtree(_kwargs['work_dir']) + + if 'job_name' not in _kwargs: + _kwargs['job_name'] = f'lbann_{test_name}' + if 'overwrite_script' not in _kwargs: + _kwargs['overwrite_script'] = True + + # Run LBANN + work_dir = _kwargs['work_dir'] + stdout_log_file = os.path.join(work_dir, 'out.log') + stderr_log_file = os.path.join(work_dir, 'err.log') + return_code = lbann.contrib.launcher.run( + trainer=trainer, + model=model, + data_reader=data_reader, + optimizer=optimizer, + **_kwargs, + ) + assert_success(return_code, stderr_log_file) + return { + 'return_code': return_code, + 'work_dir': work_dir, + 'stdout_log_file': stdout_log_file, + 'stderr_log_file': stderr_log_file, + } + + # Specific test functions for different build configurations + def test_func_clang6(cluster, exes, dirname): + return test_func(cluster, exes, dirname, 'clang6') + def test_func_gcc7(cluster, exes, dirname): + return test_func(cluster, exes, dirname, 'gcc7') + def test_func_intel19(cluster, exes, dirname): + return test_func(cluster, exes, dirname, 'intel19') + test_func_clang6.__name__ = '{}_clang6'.format(test_name_base) + test_func_gcc7.__name__ = '{}_gcc7'.format(test_name_base) + test_func_intel19.__name__ = '{}_intel19'.format(test_name_base) + + return ( + test_func_gcc7, + test_func_clang6, + test_func_intel19, + ) + + +def 
create_python_data_reader(lbann, + file_name, + sample_function_name, + num_samples_function_name, + sample_dims_function_name, + execution_mode): + """Create protobuf message for Python data reader + + A Python data reader gets data by importing a Python module and + calling functions in its scope. + + Args: + lbann (module): Module for LBANN Python frontend. + file_name (str): Python file. + sample_function_name (str): Function to get a data sample. It + takes one integer argument for the sample index and + returns an `Iterator` of `float`s. + sample_dims_function_name (str): Function to get dimensions of + a data sample. It takes no arguments and returns a + `(int,)`. + num_samples_function_name (str): Function to get number of + data samples in data set. It takes no arguments and + returns an `int`. + execution_mode (str): 'train', 'validation', or 'test' + + """ + + # Extract paths + file_name = os.path.realpath(file_name) + dir_name = os.path.dirname(file_name) + module_name = os.path.splitext(os.path.basename(file_name))[0] + + # Construct protobuf message for data reader + reader = lbann.reader_pb2.Reader() + reader.name = 'python' + reader.role = execution_mode + reader.shuffle = False + reader.percent_of_data_to_use = 1.0 + reader.python.module = module_name + reader.python.module_dir = dir_name + reader.python.sample_function = sample_function_name + reader.python.num_samples_function = num_samples_function_name + reader.python.sample_dims_function = sample_dims_function_name + + return reader + + +def numpy_l2norm2(x): + """Square of L2 norm, computed with NumPy + + The computation is performed with 64-bit floats. + + """ + if x.dtype is not np.float64: + x = x.astype(np.float64) + x = x.reshape(-1) + return np.inner(x, x) + + +def make_iterable(obj): + """Convert to an iterable object + + Simply returns `obj` if it is alredy iterable. Otherwise returns a + 1-tuple containing `obj`. + + """ + if isinstance(obj, collections.abc.Iterable) and not isinstance(obj, str): + return obj + else: + return (obj,) + + +def str_list(it): + """Convert an iterable object to a space-separated string""" + return ' '.join([str(i) for i in make_iterable(it)]) + +# Define evaluation function +def collect_metrics_from_log_func(log_file, key): + metrics = [] + with open(log_file) as f: + for line in f: + match = re.search(key + ' : ([0-9.]+)', line) + if match: + metrics.append(float(match.group(1))) + return metrics + +def compare_metrics(baseline_metrics, test_metrics): + assert len(baseline_metrics) == len(test_metrics), \ + 'baseline and test experiments did not run for same number of epochs' + for i in range(len(baseline_metrics)): + x = baseline_metrics[i] + xhat = test_metrics[i] + assert x == xhat, \ + 'found discrepancy in metrics for baseline {b} and test {t}'.format(b=x, t=xhat) + + +# Perform a diff across a directoy where not all of the subdirectories will exist in +# the test directory. Return a list of unchecked subdirectories, the running error code +# and the list of failed directories +def multidir_diff(baseline, test, fileList): + tmpList = [] + err_msg = "" + err = 0 + # Iterate over the list of filepaths & remove each file. 
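For the small helpers defined above: `numpy_l2norm2` flattens its argument, promotes it to float64, and returns the squared L2 norm, while `str_list` joins an iterable into a space-separated string and treats bare strings as scalars. A quick check (values chosen purely for illustration):

    import numpy as np
    import tools

    x = np.array([[3.0], [4.0]], dtype=np.float32)
    # ||(3, 4)||^2 = 9 + 16 = 25, computed in double precision.
    assert tools.numpy_l2norm2(x) == 25.0

    assert tools.str_list([1, 2, 3]) == '1 2 3'
    assert tools.str_list('abc') == 'abc'  # strings are not iterated element-wise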
+ for filePath in fileList: + d = os.path.basename(filePath) + t = os.path.basename(os.path.dirname(filePath)) + c = os.path.join(test, t, d) + if os.path.exists(c): + ret = subprocess.run('diff -rq {baseline} {test}'.format( + baseline=filePath, test=c), capture_output=True, shell=True, text=True) + if ret.returncode != 0: + err_msg += 'diff -rq {baseline} {test} failed {dt}\n'.format( + dt=ret.returncode, baseline=filePath, test=c) + err_msg += ret.stdout + err += ret.returncode + else: + tmpList.append(filePath) + + return tmpList, err, err_msg + +# Perform a line by line difference of an xml file and look for any floating point values +# For each floating point value, check to see if it is close-enough and log a warning if it +# is within a threshhold. +def approx_diff_xml_files(file1, file2, rel_tol): + f1 = open(file1, 'r') + f2 = open(file2, 'r') + files_differ = False + diff_list = [] + near_diff_list = [] + for l1 in f1: + l2 = next(f2) + if l1 != l2: + try: + v1 = float(re.sub(r'\s*<\w*>(\S*)<\/\w*>\s*', r'\1', l1)) + v2 = float(re.sub(r'\s*<\w*>(\S*)<\/\w*>\s*', r'\1', l2)) + close = math.isclose(v1, v2, rel_tol=rel_tol, abs_tol=0.0) + if not close: + err = ('lines: %s and %s differ: %.13f != %.13f (+/- %.1e)' % (l1.rstrip(), l2.rstrip(), v1, v2, rel_tol)) + diff_list.append(err) + files_differ = True + else: + warn = ('lines: %s and %s are close: %.13f ~= %.13f (+/- %.1e)' % (l1.rstrip(), l2.rstrip(), v1, v2, rel_tol)) + near_diff_list.append(warn) + except ValueError: + # Non-numerical diff. + err = ('lines: %s and %s differ' % (l1.rstrip(), l2.rstrip())) + diff_list.append(err) + files_differ = True + return files_differ, diff_list, near_diff_list + +# Given a recursive python diff from dircmp, perform a recursive exploration of any files +# with differences. 
+# Given a recursive python diff from dircmp, perform a recursive exploration of any files
+# with differences. For files with differences, check any XML files for approximate
+# equivalence, since small differences can appear in some of the recorded floating-point values.
+def print_diff_files(dcmp):
+    any_files_differ = False
+    all_diffs = []
+    all_warns = []
+    for name in dcmp.diff_files:
+        from pprint import pprint
+        err = f'Files {os.path.join(dcmp.left, name)} and {os.path.join(dcmp.right, name)} differ'
+        if re.search('.xml', name):
+            files_differ, diff_list, warn_list = approx_diff_xml_files(
+                os.path.join(dcmp.left, name), os.path.join(dcmp.right, name), 1e-6)
+            if files_differ:
+                any_files_differ = True
+                all_diffs.append(err)
+                for d in diff_list:
+                    all_diffs.append(d)
+            if len(warn_list) > 0:
+                warn = f'Files {os.path.join(dcmp.left, name)} and {os.path.join(dcmp.right, name)} have a near difference'
+                all_warns.append(warn)
+                for w in warn_list:
+                    all_warns.append(w)
+        else:
+            any_files_differ = True
+            all_diffs.append(err)
+
+    for sub_dcmp in dcmp.subdirs.values():
+        files_differ, diff_list, warn_list = print_diff_files(sub_dcmp)
+        if files_differ:
+            any_files_differ = True
+        for d in diff_list:
+            all_diffs.append(d)
+        for d in warn_list:
+            all_warns.append(d)
+
+    return any_files_differ, all_diffs, all_warns
diff --git a/bamboo/compiler_tests/build_script.sh b/bamboo/compiler_tests/build_script.sh
index 07a19172f26..1ecdc393b57 100755
--- a/bamboo/compiler_tests/build_script.sh
+++ b/bamboo/compiler_tests/build_script.sh
@@ -1,7 +1,131 @@
-CLUSTER=$(hostname | sed 's/\([a-zA-Z][a-zA-Z]*\)[0-9]*/\1/g')
-if [ "${CLUSTER}" != 'surface' ]; then
-    source /usr/share/lmod/lmod/init/bash
-    source /etc/profile.d/00-modulepath.sh
-fi
+#!/bin/bash
+
+source /usr/share/lmod/lmod/init/bash
+source /etc/profile.d/00-modulepath.sh
+
 LBANN_DIR=$(git rev-parse --show-toplevel)
-${LBANN_DIR}/scripts/build_lbann_lc.sh --with-conduit
+CLUSTER=$(hostname | sed 's/[0-9]*//g')
+USER=$(whoami)
+WORKSPACE_DIR=$(ls --color=no -d /usr/workspace/ws*/${USER})
+COMMON_DEPENDENCY_DIR=${WORKSPACE_DIR}/stable_dependencies
+DEPENDENCY_DIR_BASE=${COMMON_DEPENDENCY_DIR}/${CLUSTER}
+
+# For this script, we only care about GCC.
+LATEST_GCC=$(ls -1 ${DEPENDENCY_DIR_BASE} | grep gcc | tail -n1)
+COMPILER_DIR=${DEPENDENCY_DIR_BASE}/${LATEST_GCC}
+
+# For now, there's only one MPI library. The pipe to tail ensures that
+# we just pick one thing, just in case.
+MPI_LIBRARY=$(ls -1 --color=no ${COMPILER_DIR} | tail -n1)
+MPI_DIR=${COMPILER_DIR}/${MPI_LIBRARY}
+
+# All the dependencies are installed at the MPI level (even though
+# most are MPI-independent).
+DEPENDENCY_DIR=${MPI_DIR}
+
+export CMAKE_PREFIX_PATH=${COMMON_DEPENDENCY_DIR}/catch2:${COMMON_DEPENDENCY_DIR}/cereal:${COMMON_DEPENDENCY_DIR}/clara:${COMMON_DEPENDENCY_DIR}/cub:${COMMON_DEPENDENCY_DIR}/half:${DEPENDENCY_DIR}/aluminum:${DEPENDENCY_DIR}/cnpy:${DEPENDENCY_DIR}/conduit:${DEPENDENCY_DIR}/hdf5:${DEPENDENCY_DIR}/hydrogen:${DEPENDENCY_DIR}/jpeg-turbo:${DEPENDENCY_DIR}/nccl:${DEPENDENCY_DIR}/openblas:${DEPENDENCY_DIR}/opencv:${DEPENDENCY_DIR}/protobuf:${CMAKE_PREFIX_PATH}
+
+if [ -e ${DEPENDENCY_DIR} ];
+then
+    SAVELIST_NAME=$(echo ${CLUSTER}_${LATEST_GCC}_${MPI_LIBRARY} | sed -e 's/\./x/g')
+
+    if ml -t savelist |& grep ${SAVELIST_NAME} > /dev/null 2>&1
+    then
+        ml restore ${SAVELIST_NAME}
+    else
+        # Compilers are easy...
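+        # Turn a directory name like gcc-7.1.0 into the corresponding module name, e.g. gcc/7.1.0.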
+ COMPILER_MODULE=$(echo ${LATEST_GCC} | sed -e 's|-|/|g') + + if [[ ${MPI_LIBRARY} =~ ^spectrum-mpi-.*$ ]] + then + MPI_MODULE=$(echo ${MPI_LIBRARY} | sed -e 's|spectrum-mpi-|spectrum-mpi/|g') + else + MPI_MODULE=$(echo ${MPI_LIBRARY} | sed -e 's|-|/|g') + fi + + # Use the latest CUDA 10, since it's compatible with other + # CUDA 10.* libraries + CUDA_MODULE=$(ml --terse avail cuda |& sed -n '/\/10\./p' | tail -n1) + + # Load up the appropriate modules + module load ${COMPILER_MODULE} ${MPI_MODULE} ${CUDA_MODULE} cmake/3.14.5 + ml save ${SAVELIST_NAME} + fi + + BRAIN_DIR=/usr/workspace/wsb/brain + + # CUDA-y things (Use the newest) + ARCH=$(uname -i) + export NCCL_DIR=$(ls -d --color=no ${BRAIN_DIR}/nccl2/*cuda10*${ARCH} | tail -n1) + # Right now, we only support cuDNN 7 versions. + export CUDNN_DIR=$(find ${BRAIN_DIR}/cudnn -maxdepth 2 -type d | grep "cudnn-7.*/cuda-10.*_${ARCH}" | sort -r | head -1) + + # Unit testing framework + export CLARA_DIR=${WORKSPACE_DIR}/stable_dependencies/clara + export CATCH2_DIR=${WORKSPACE_DIR}/stable_dependencies/catch2 + + # Add Ninja support + export PATH=${DEPENDENCY_DIR_BASE}/ninja/bin:${PATH} + + # Setup paths to match the build_lbann_lc.sh script (ugh) + BUILD_DIR_BASE=${LBANN_DIR}/build/gnu.Release.${CLUSTER}.llnl.gov + BUILD_DIR=${BUILD_DIR_BASE}/lbann/build + INSTALL_DIR=${BUILD_DIR_BASE}/install + + # Setup a path for Catch2 to use + CATCH2_OUTPUT_DIR=${LBANN_DIR}/bamboo/compiler_tests + rm -f ${CATCH2_OUTPUT_DIR}/*.xml + + # Decide if CUDA should be used. + if [[ "${CLUSTER}" =~ ^(pascal|lassen|ray)$ ]]; + then + USE_CUDA=ON + else + USE_CUDA=OFF + fi + + # Cleanup + [[ -e ${BUILD_DIR_BASE} ]] && rm -rf ${BUILD_DIR_BASE} + mkdir -p ${BUILD_DIR} && cd ${BUILD_DIR} + + # Hack to be nice to others. + if [[ "${CLUSTER}" =~ ^(lassen|ray)$ ]]; + then + LAUNCH_CMD="lrun -1" + NHOSTS=$(expr $(printenv LSB_HOSTS | wc -w) - 1) + NNODES=$(expr ${NHOSTS} / 40) + PARALLEL_LAUNCH_CMD="jsrun -n${NNODES} -r1 -a4 -c40 -g4 -d packed -b packed:10 " + else + unset LAUNCH_CMD + PARALLEL_LAUNCH_CMD="srun --mpibind=off -N${SLURM_NNODES} --ntasks-per-node=2 " + fi + + cmake \ + -GNinja \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=${INSTALL_DIR} \ + \ + -DCMAKE_CXX_COMPILER=$(which g++) \ + -DCMAKE_CXX_FLAGS="-DLBANN_SET_EL_RNG -g" \ + -DCMAKE_CUDA_COMPILER=$(which nvcc) \ + -DCMAKE_CUDA_HOST_COMPILER=$(which g++) \ + \ + -DCMAKE_CXX_STANDARD=14 \ + -DCMAKE_CUDA_STANDARD=14 \ + \ + -DLBANN_DATATYPE=float \ + -DLBANN_DETERMINISTIC=ON \ + -DLBANN_WARNINGS_AS_ERRORS=ON \ + -DLBANN_WITH_CONDUIT=ON \ + -DLBANN_WITH_CUDA=ON \ + -DLBANN_WITH_NVPROF=OFF \ + -DLBANN_WITH_TBINF=ON \ + -DLBANN_WITH_UNIT_TESTING=ON \ + -DLBANN_WITH_VTUNE=OFF \ + \ + -Dprotobuf_MODULE_COMPATIBLE=ON \ + \ + ${LBANN_DIR} && ${LAUNCH_CMD} ninja && ${LAUNCH_CMD} ninja install && ${LAUNCH_CMD} ./unit_test/seq-catch-tests -r junit -o ${CATCH2_OUTPUT_DIR}/seq_catch_tests_output-${CLUSTER}.xml ; ${PARALLEL_LAUNCH_CMD} ./unit_test/mpi-catch-tests -r junit -o "${CATCH2_OUTPUT_DIR}/mpi_catch_tests_output-${CLUSTER}-rank=%r-size=%s.xml" +else + ${LBANN_DIR}/scripts/build_lbann_lc.sh --with-conduit +fi diff --git a/bamboo/compiler_tests/build_script_specific.sh b/bamboo/compiler_tests/build_script_specific.sh index 975d58ac4a1..49833de8b1e 100755 --- a/bamboo/compiler_tests/build_script_specific.sh +++ b/bamboo/compiler_tests/build_script_specific.sh @@ -2,10 +2,8 @@ set -e CLUSTER=$(hostname | sed 's/\([a-zA-Z][a-zA-Z]*\)[0-9]*/\1/g') LBANN_DIR=$(git rev-parse --show-toplevel) DEBUG='' -if [ 
"${CLUSTER}" != 'surface' ]; then - source /usr/share/lmod/lmod/init/bash - source /etc/profile.d/00-modulepath.sh -fi +source /usr/share/lmod/lmod/init/bash +source /etc/profile.d/00-modulepath.sh while :; do case ${1} in @@ -32,22 +30,18 @@ while :; do shift done -if [ "${COMPILER}" == 'clang4' ]; then - module load clang/4.0.0 +if [ "${COMPILER}" == 'clang6' ]; then + module load clang/6.0.0 ${LBANN_DIR}/scripts/build_lbann_lc.sh --compiler clang ${DEBUG} --reconfigure --with-conduit fi -if [ "${COMPILER}" == 'intel18' ]; then - module load intel/18.0.0 - ${LBANN_DIR}/scripts/build_lbann_lc.sh --compiler intel ${DEBUG} --reconfigure --with-conduit -fi - -if [ "${COMPILER}" == 'gcc4' ]; then - module load gcc/4.9.3 - ${LBANN_DIR}/scripts/build_lbann_lc.sh --compiler gnu ${DEBUG} --reconfigure --with-conduit -fi if [ "${COMPILER}" == 'gcc7' ]; then module load gcc/7.1.0 ${LBANN_DIR}/scripts/build_lbann_lc.sh --compiler gnu ${DEBUG} --reconfigure --with-conduit fi + +if [ "${COMPILER}" == 'intel19' ]; then + module load intel/19.0.0 + ${LBANN_DIR}/scripts/build_lbann_lc.sh --compiler intel ${DEBUG} --reconfigure --with-conduit +fi diff --git a/bamboo/compiler_tests/builds/.gitignore b/bamboo/compiler_tests/builds/.gitignore new file mode 100644 index 00000000000..d6b7ef32c84 --- /dev/null +++ b/bamboo/compiler_tests/builds/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/bamboo/compiler_tests/builds/README.md b/bamboo/compiler_tests/builds/README.md deleted file mode 100644 index 1962c6506d6..00000000000 --- a/bamboo/compiler_tests/builds/README.md +++ /dev/null @@ -1 +0,0 @@ -Subdirectory for build directories diff --git a/bamboo/compiler_tests/conftest.py b/bamboo/compiler_tests/conftest.py index 238b812e638..ccffb182a73 100644 --- a/bamboo/compiler_tests/conftest.py +++ b/bamboo/compiler_tests/conftest.py @@ -4,13 +4,13 @@ def pytest_addoption(parser): cluster = re.sub('[0-9]+', '', subprocess.check_output( - 'hostname'.split()).strip()) + 'hostname'.split()).decode('utf-8').strip()) default_dirname = subprocess.check_output( - 'git rev-parse --show-toplevel'.split()).strip() + 'git rev-parse --show-toplevel'.split()).decode('utf-8').strip() parser.addoption('--cluster', action='store', default=cluster, help='--cluster= to specify the cluster being run on, for the purpose of determing which commands to use. 
Default the current cluster') parser.addoption('--dirname', action='store', default=default_dirname, - help='--dirname specifies the top-level directory') + help='--dirname= specifies the top-level directory') @pytest.fixture diff --git a/bamboo/compiler_tests/error/.gitignore b/bamboo/compiler_tests/error/.gitignore new file mode 100644 index 00000000000..d6b7ef32c84 --- /dev/null +++ b/bamboo/compiler_tests/error/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/bamboo/compiler_tests/error/README.md b/bamboo/compiler_tests/error/README.md deleted file mode 100644 index 78712c2962b..00000000000 --- a/bamboo/compiler_tests/error/README.md +++ /dev/null @@ -1 +0,0 @@ -Subdirectory for test error diff --git a/bamboo/compiler_tests/output/.gitignore b/bamboo/compiler_tests/output/.gitignore new file mode 100644 index 00000000000..d6b7ef32c84 --- /dev/null +++ b/bamboo/compiler_tests/output/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/bamboo/compiler_tests/output/README.md b/bamboo/compiler_tests/output/README.md deleted file mode 100644 index 308358e3777..00000000000 --- a/bamboo/compiler_tests/output/README.md +++ /dev/null @@ -1 +0,0 @@ -Subdirectory for test output diff --git a/bamboo/compiler_tests/test_compiler.py b/bamboo/compiler_tests/test_compiler.py index 5682d11f3af..212dcf7f8cc 100644 --- a/bamboo/compiler_tests/test_compiler.py +++ b/bamboo/compiler_tests/test_compiler.py @@ -1,81 +1,41 @@ -# import sys -# sys.path.insert(0, '../common_python') -# import tools +import sys +sys.path.insert(0, '../common_python') +import tools import pytest import os, re, subprocess def test_compiler_build_script(cluster, dirname): - if cluster in ['pascal']: - output_file_name = '%s/bamboo/compiler_tests/output/build_script_output.txt' % (dirname) - error_file_name = '%s/bamboo/compiler_tests/error/build_script_error.txt' % (dirname) - command = '%s/bamboo/compiler_tests/build_script.sh > %s 2> %s' % ( - dirname, output_file_name, error_file_name) - return_code = os.system(command) - if return_code != 0: - output_file = open(output_file_name, 'r') - for line in output_file: - print('%s: %s' % (output_file_name, line)) - error_file = open(error_file_name, 'r') - for line in error_file: - print('%s: %s' % (error_file_name, line)) - assert return_code == 0 - else: + if cluster not in ['catalyst', 'corona', 'lassen', 'pascal', 'ray']: e = 'test_compiler_build_script: Unsupported Cluster %s' % cluster print('Skip - ' + e) pytest.skip(e) + output_file_name = '%s/bamboo/compiler_tests/output/build_script_output.txt' % (dirname) + error_file_name = '%s/bamboo/compiler_tests/error/build_script_error.txt' % (dirname) + command = '%s/bamboo/compiler_tests/build_script.sh > %s 2> %s' % ( + dirname, output_file_name, error_file_name) + return_code = os.system(command) + tools.assert_success(return_code, error_file_name) -def test_compiler_clang4_release(cluster, dirname): - try: - skeleton_clang4(cluster, dirname, False) - except AssertionError as e: - print(e) - build_script(cluster, dirname, 'clang4', False) - path = '%s/bamboo/compiler_tests/builds/%s_clang-4.0.0_rel/build/model_zoo/lbann' % (dirname, cluster) +def test_compiler_clang6_release(cluster, dirname): + skeleton_clang6(cluster, dirname, False) + path = '%s/bamboo/compiler_tests/builds/%s_clang-6.0.0_rel/build/model_zoo/lbann' % (dirname, cluster) if not os.path.exists(path): path = '%s/build/clang.Release.%s.llnl.gov/install/bin/lbann' % (dirname, cluster) assert os.path.exists(path) -def test_compiler_clang4_debug(cluster, 
dirname): - try: - skeleton_clang4(cluster, dirname, True) - except AssertionError as e: - print(e) - build_script(cluster, dirname, 'clang4', True) - path = '%s/bamboo/compiler_tests/builds/%s_clang-4.0.0_debug/build/model_zoo/lbann' % (dirname, cluster) +def test_compiler_clang6_debug(cluster, dirname): + skeleton_clang6(cluster, dirname, True) + path = '%s/bamboo/compiler_tests/builds/%s_clang-6.0.0_debug/build/model_zoo/lbann' % (dirname, cluster) if not os.path.exists(path): path = '%s/build/clang.Debug.%s.llnl.gov/install/bin/lbann' % (dirname, cluster) assert os.path.exists(path) -def test_compiler_gcc4_release(cluster, dirname): - try: - skeleton_gcc4(cluster, dirname, False) - except AssertionError as e: - print(e) - build_script(cluster, dirname, 'gcc4', False) - path = '%s/bamboo/compiler_tests/builds/%s_gcc-4.9.3_rel/build/model_zoo/lbann' % (dirname, cluster) - assert os.path.exists(path) - - -def test_compiler_gcc4_debug(cluster, dirname): - try: - skeleton_gcc4(cluster, dirname, True) - except AssertionError as e: - print(e) - build_script(cluster, dirname, 'gcc4', True) - path = '%s/bamboo/compiler_tests/builds/%s_gcc-4.9.3_debug/build/model_zoo/lbann' % (dirname, cluster) - assert os.path.exists(path) - - def test_compiler_gcc7_release(cluster, dirname): - try: - skeleton_gcc7(cluster, dirname, False) - except AssertionError as e: - print(e) - build_script(cluster, dirname, 'gcc7', False) + skeleton_gcc7(cluster, dirname, False) path = '%s/bamboo/compiler_tests/builds/%s_gcc-7.1.0_rel/build/model_zoo/lbann' % (dirname, cluster) if not os.path.exists(path): path = '%s/build/gnu.Release.%s.llnl.gov/install/bin/lbann' % (dirname, cluster) @@ -83,87 +43,69 @@ def test_compiler_gcc7_release(cluster, dirname): def test_compiler_gcc7_debug(cluster, dirname): - try: - skeleton_gcc7(cluster, dirname, True) - except AssertionError as e: - print(e) - build_script(cluster, dirname, 'gcc7', True) + skeleton_gcc7(cluster, dirname, True) path = '%s/bamboo/compiler_tests/builds/%s_gcc-7.1.0_debug/build/model_zoo/lbann' % (dirname, cluster) if not os.path.exists(path): path = '%s/build/gnu.Debug.%s.llnl.gov/install/bin/lbann' % (dirname, cluster) assert os.path.exists(path) -def test_compiler_intel18_release(cluster, dirname): - try: - skeleton_intel18(cluster, dirname, False) - except AssertionError as e: - print(e) - build_script(cluster, dirname, 'intel18', False) - path = '%s/bamboo/compiler_tests/builds/%s_intel-18.0.0_rel/build/model_zoo/lbann' % (dirname, cluster) +def test_compiler_intel19_release(cluster, dirname): + skeleton_intel19(cluster, dirname, False) + path = '%s/bamboo/compiler_tests/builds/%s_intel-19.0.0_rel/build/model_zoo/lbann' % (dirname, cluster) if not os.path.exists(path): path = '%s/build/intel.Release.%s.llnl.gov/install/bin/lbann' % (dirname, cluster) assert os.path.exists(path) -def test_compiler_intel18_debug(cluster, dirname): - try: - skeleton_intel18(cluster, dirname, True) - except AssertionError as e: - print(e) - build_script(cluster, dirname, 'intel18', True) - path = '%s/bamboo/compiler_tests/builds/%s_intel-18.0.0_debug/build/model_zoo/lbann' % (dirname, cluster) +def test_compiler_intel19_debug(cluster, dirname): + skeleton_intel19(cluster, dirname, True) + path = '%s/bamboo/compiler_tests/builds/%s_intel-19.0.0_debug/build/model_zoo/lbann' % (dirname, cluster) if not os.path.exists(path): path = '%s/build/intel.Debug.%s.llnl.gov/install/bin/lbann' % (dirname, cluster) assert os.path.exists(path) -def skeleton_clang4(cluster, dir_name, debug, 
should_log=False): - if cluster in ['catalyst', 'quartz']: - spack_skeleton(dir_name, 'clang@4.0.0', 'mvapich2@2.2', debug, should_log) - build_skeleton(dir_name, 'clang@4.0.0', debug, should_log) - else: - e = 'skeleton_clang4: Unsupported Cluster %s' % cluster - print('Skip - ' + e) - pytest.skip(e) - - -def skeleton_gcc4(cluster, dir_name, debug, should_log=False): - if cluster in ['quartz']: # Taking out 'catalyst' - mpi = 'mvapich2@2.2' - elif cluster in ['surface']: # Taking out 'pascal' - mpi = 'mvapich2@2.2+cuda' - elif cluster == 'ray': - mpi = 'spectrum-mpi@2018.04.27' - else: - e = 'skeleton_gcc4: Unsupported Cluster %s' % cluster +def skeleton_clang6(cluster, dir_name, debug): + if cluster not in []: + e = 'skeleton_clang6: Unsupported Cluster %s' % cluster print('Skip - ' + e) pytest.skip(e) - spack_skeleton(dir_name, 'gcc@4.9.3', mpi, debug, should_log) - build_skeleton(dir_name, 'gcc@4.9.3', debug, should_log) + try: + spack_skeleton(dir_name, 'clang@6.0.0', 'mvapich2@2.2', debug) + build_skeleton(dir_name, 'clang@6.0.0', debug) + except AssertionError as e: + print(e) + build_script(cluster, dir_name, 'clang6', debug) -def skeleton_gcc7(cluster, dir_name, debug, should_log=False): - if cluster in ['catalyst', 'quartz']: - spack_skeleton(dir_name, 'gcc@7.1.0', 'mvapich2@2.2', debug, should_log) - build_skeleton(dir_name, 'gcc@7.1.0', debug, should_log) - else: +def skeleton_gcc7(cluster, dir_name, debug): + if cluster not in []: e = 'skeleton_gcc7: Unsupported Cluster %s' % cluster print('Skip - ' + e) pytest.skip(e) + try: + spack_skeleton(dir_name, 'gcc@7.1.0', 'mvapich2@2.2', debug) + build_skeleton(dir_name, 'gcc@7.1.0', debug) + except AssertionError as e: + print(e) + build_script(cluster, dir_name, 'gcc7', debug) -def skeleton_intel18(cluster, dir_name, debug, should_log=False): - if cluster in ['quartz']: # Taking out 'catalyst' - spack_skeleton(dir_name, 'intel@18.0.0', 'mvapich2@2.2', debug, should_log) - build_skeleton(dir_name, 'intel@18.0.0', debug, should_log) - else: - e = 'skeleton_intel18: Unsupported Cluster %s' % cluster +def skeleton_intel19(cluster, dir_name, debug): + if cluster not in []: # Taking out 'catalyst' + e = 'skeleton_intel19: Unsupported Cluster %s' % cluster print('Skip - ' + e) pytest.skip(e) + try: + spack_skeleton(dir_name, 'intel@19.0.0', 'mvapich2@2.2', debug) + build_skeleton(dir_name, 'intel@19.0.0', debug) + except AssertionError as e: + print(e) + build_script(cluster, dir_name, 'intel19', debug) -def spack_skeleton(dir_name, compiler, mpi_lib, debug, should_log): +def spack_skeleton(dir_name, compiler, mpi_lib, debug): compiler_underscored = re.sub('[@\.]', '_', compiler) if debug: build_type = 'debug' @@ -179,17 +121,10 @@ def spack_skeleton(dir_name, compiler, mpi_lib, debug, should_log): dir_name, compiler, mpi_lib, debug_flag, output_file_name, error_file_name) return_code = os.system(command) os.chdir('..') - if should_log or (return_code != 0): - output_file = open(output_file_name, 'r') - for line in output_file: - print('%s: %s' % (output_file_name, line)) - error_file = open(error_file_name, 'r') - for line in error_file: - print('%s: %s' % (error_file_name, line)) - assert return_code == 0 + tools.assert_success(return_code, error_file_name) -def build_skeleton(dir_name, compiler, debug, should_log): +def build_skeleton(dir_name, compiler, debug): compiler_underscored = re.sub('[@\.]', '_', compiler) if debug: build_type = 'debug' @@ -199,31 +134,22 @@ def build_skeleton(dir_name, compiler, debug, should_log): 
error_file_name = '%s/bamboo/compiler_tests/error/%s_%s_build_error.txt' % (dir_name, compiler_underscored, build_type) compiler = compiler.replace('@', '-') #mpi_lib = mpi_lib.replace('@', '-') - cluster = re.sub('[0-9]+', '', subprocess.check_output('hostname'.split()).strip()) + cluster = re.sub('[0-9]+', '', subprocess.check_output('hostname'.split()).decode('utf-8').strip()) # For reference: # Commenting out for now. These additions to path name will likely return # one day, so I am not removing them entirely. - # x86_64 <=> catalyst, pascal, quartz, surface + # x86_64 <=> catalyst, pascal # ppc64le <=> ray - #architecture = subprocess.check_output('uname -m'.split()).strip() + #architecture = subprocess.check_output('uname -m'.split()).decode('utf-8').strip() #if cluster == 'ray': # architecture += '_gpu_cuda-9.2.64_cudnn-7.0' #elif cluster == 'pascal': # architecture += '_gpu_cuda-9.1.85_cudnn-7.1' - #elif cluster == 'surface': - # architecture += '_gpu' os.chdir('%s/bamboo/compiler_tests/builds/%s_%s_%s/build' % (dir_name, cluster, compiler, build_type)) command = 'make -j all > %s 2> %s' % (output_file_name, error_file_name) return_code = os.system(command) os.chdir('../..') - if should_log or (return_code != 0): - output_file = open(output_file_name, 'r') - for line in output_file: - print('%s: %s' % (output_file_name, line)) - error_file = open(error_file_name, 'r') - for line in error_file: - print('%s: %s' % (error_file_name, line)) - assert return_code == 0 + tools.assert_success(return_code, error_file_name) def build_script(cluster, dirname, compiler, debug): @@ -240,11 +166,4 @@ def build_script(cluster, dirname, compiler, debug): error_file_name = '%s/bamboo/compiler_tests/error/%s_%s_%s_build_script_error.txt' % (dirname, cluster, compiler, build) command = '%s/bamboo/compiler_tests/build_script_specific.sh --compiler %s %s> %s 2> %s' % (dirname, compiler, debug_flag, output_file_name, error_file_name) return_code = os.system(command) - if return_code != 0: - output_file = open(output_file_name, 'r') - for line in output_file: - print('%s: %s' % (output_file_name, line)) - error_file = open(error_file_name, 'r') - for line in error_file: - print('%s: %s' % (error_file_name, line)) - assert return_code == 0 + tools.assert_success(return_code, error_file_name) diff --git a/bamboo/integration_tests/common_code.py b/bamboo/integration_tests/common_code.py deleted file mode 100644 index 0d0a4dda68e..00000000000 --- a/bamboo/integration_tests/common_code.py +++ /dev/null @@ -1,218 +0,0 @@ -import sys -sys.path.insert(0, '../common_python') -import tools -import collections, csv, os, pprint, re, time - - -# Set up the command ########################################################## -def get_command(cluster, dir_name, model_folder, model_name, executable, - output_file_name, error_file_name, compiler_name, weekly=False): - if model_name in ['alexnet', 'conv_autoencoder_imagenet']: - data_reader_percent = 0.01 - if weekly: - data_reader_percent = 0.10 - command = tools.get_command( - cluster=cluster, executable=executable, num_nodes=16, - partition='pbatch', time_limit=600, num_processes=32, - dir_name=dir_name, - data_filedir_train_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/original/train/', - data_filename_train_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/train.txt', - data_filedir_test_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/original/val/', - data_filename_test_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/val.txt', - 
data_reader_name='imagenet', data_reader_percent=data_reader_percent, - model_folder=model_folder, model_name=model_name, num_epochs=20, - optimizer_name='adagrad', output_file_name=output_file_name, - error_file_name=error_file_name) - elif model_name in ['conv_autoencoder_mnist', 'lenet_mnist']: - if (model_name == 'lenet_mnist') and \ - (compiler_name in ['clang4', 'intel18']): - partition = 'pbatch' - time_limit = 600 - else: - partition = 'pdebug' - time_limit = 30 - if (cluster == 'ray') and (model_name == 'conv_autoencoder_mnist'): - num_processes = 20 - else: - num_processes = 2 - command = tools.get_command( - cluster=cluster, executable=executable, num_nodes=1, - partition=partition, time_limit=time_limit, - num_processes=num_processes, dir_name=dir_name, - data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', model_folder=model_folder, - model_name=model_name, num_epochs=5, optimizer_name='adagrad', - output_file_name=output_file_name, error_file_name=error_file_name) - else: - raise Exception('Invalid model: %s' % model_name) - return command - -# Run LBANN ################################################################### - - -def run_lbann(command, model_name, output_file_name, error_file_name, - should_log=False): - print('About to run: %s' % command) - print('%s began waiting in the queue at ' % model_name + - time.strftime('%H:%M:%S', time.localtime())) - output_value = os.system(command) - print('%s finished at ' % model_name + - time.strftime('%H:%M:%S', time.localtime())) - lbann_exceptions = [] - timed_out = False - if should_log or (output_value != 0): - output_file = open(output_file_name, 'r') - for line in output_file: - print('%s: %s' % (output_file_name, line)) - is_match = re.search( - 'This lbann_exception is about to be thrown:(.*)', line) - if is_match: - lbann_exceptions.append(is_match.group(1)) - is_match = re.search('CANCELLED AT (.*) DUE TO TIME LIMIT', line) - if is_match: - timed_out = True - error_file = open(error_file_name, 'r') - for line in error_file: - print('%s: %s' % (error_file_name, line)) - is_match = re.search('LBANN error on (.*)', line) - if is_match: - lbann_exceptions.append(is_match.group(1)) - if output_value != 0: - error_string = ('Model %s crashed with output_value=%d, timed_out=%s,' - ' and lbann exceptions=%s. Command was: %s') % ( - model_name, output_value, str(timed_out), - str(collections.Counter(lbann_exceptions)), command) - raise Exception(error_string) - return output_value - -# Extract data from output #################################################### - - -def populate_data_dict_epoch(regex, line, data_field, data_fields, data_dict, - model_id): - is_match = re.search(regex, line) - if is_match and (data_field in data_fields): - if model_id not in data_dict[data_field].keys(): - data_dict[data_field][model_id] = {} - epoch_id = is_match.group(1) - value = float(is_match.group(2)) - data_dict[data_field][model_id][epoch_id] = value - - -def populate_data_dict_overall(regex, line, data_field, data_fields, data_dict, - model_id): - is_match = re.search(regex, line) - if is_match and (data_field in data_fields): - if model_id not in data_dict[data_field].keys(): - data_dict[data_field][model_id] = {} - value = float(is_match.group(1)) - data_dict[data_field][model_id]['overall'] = value - - -# data_dict[data_field][model_id][epoch_id] = float -# data_fields is the list or set of data we're interested in. 
-def extract_data(output_file_name, data_fields, should_log): - output_file = open(output_file_name, 'r') - data_dict = {} - for data_field in data_fields: - data_dict[data_field] = {} - - for line in output_file: - if should_log: - print('extract_data: %s: %s' % (output_file_name, line)) - - # Check if line is reporting model results - is_model = re.search('^Model ([0-9]+)', line) - if not is_model: - is_model = re.search('^model([0-9]+)', line) - if is_model: - print('extract_data: is_model={is_model}'.format(is_model=is_model)) - model_id = is_model.group(1) - - regex = 'training epoch ([0-9]+) objective function : ([0-9.]+)' - data_field = 'training_objective_function' - populate_data_dict_epoch(regex, line, data_field, data_fields, - data_dict, model_id) - - regex = 'training epoch ([0-9]+) run time : ([0-9.]+)' - data_field = 'training_run_time' - populate_data_dict_epoch(regex, line, data_field, data_fields, - data_dict, model_id) - - regex = 'training epoch ([0-9]+) mini-batch time statistics : ([0-9.]+)s mean, ([0-9.]+)s max, ([0-9.]+)s min, ([0-9.]+)s stdev' - is_match = re.search(regex, line) - if is_match: - print('extract_data: is_mini-batch time statistics={is_match}'.format( - is_match=is_match)) - epoch_id = is_match.group(1) - mean_value = float(is_match.group(2)) - max_value = float(is_match.group(3)) - min_value = float(is_match.group(4)) - stdev_value = float(is_match.group(5)) - data_field = 'training_mean' - if data_field in data_fields: - if model_id not in data_dict[data_field].keys(): - data_dict[data_field][model_id] = {} - print('extract_data: mean_value={mv}'.format(mv=mean_value)) - data_dict[data_field][model_id][epoch_id] = mean_value - data_field = 'training_max' - if data_field in data_fields: - if model_id not in data_dict[data_field].keys(): - data_dict[data_field][model_id] = {} - print('extract_data: max_value={mv}'.format(mv=max_value)) - data_dict[data_field][model_id][epoch_id] = max_value - data_field = 'training_min' - if data_field in data_fields: - if model_id not in data_dict[data_field].keys(): - data_dict[data_field][model_id] = {} - print('extract_data: min_value={mv}'.format(mv=min_value)) - data_dict[data_field][model_id][epoch_id] = min_value - data_field = 'training_stdev' - if data_field in data_fields: - if model_id not in data_dict[data_field].keys(): - data_dict[data_field][model_id] = {} - print('extract_data: stdev={sv}'.format(sv=stdev_value)) - data_dict[data_field][model_id][epoch_id] = stdev_value - - regex = 'test categorical accuracy : ([0-9.]+)' - data_field = 'test_accuracy' - populate_data_dict_overall(regex, line, data_field, data_fields, - data_dict, model_id) - output_file.close() - if should_log: - print('extract_data: Extracted Data below:') - pprint.pprint(data_dict) - return data_dict - -# Skeleton #################################################################### - - -def skeleton(cluster, dir_name, executable, model_folder, model_name, - data_fields, should_log, compiler_name=None, weekly=False): - if compiler_name is None: - output_file_name = '%s/bamboo/integration_tests/output/%s_output.txt' % (dir_name, model_name) - error_file_name = '%s/bamboo/integration_tests/error/%s_error.txt' % (dir_name, model_name) - else: - output_file_name = '%s/bamboo/integration_tests/output/%s_%s_output.txt' % (dir_name, model_name, compiler_name) - error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' % (dir_name, model_name, compiler_name) - command = get_command( - cluster, dir_name, model_folder, model_name, 
executable, - output_file_name, error_file_name, compiler_name, weekly=weekly) - run_lbann(command, model_name, output_file_name, - error_file_name, should_log) # Don't need return value - return extract_data(output_file_name, data_fields, should_log) - -# Misc. functions ############################################################ - - -# csv_dict[row_header][column_header] = float -def csv_to_dict(csv_path): - with open(csv_path, 'r') as csv_file: - reader = csv.reader(csv_file, skipinitialspace=True) - column_headers = reader.next() - values = {} - for row in reader: - row_header = row[0] - values[row_header] = dict( - zip(column_headers[1:], map(float, row[1:]))) - return values diff --git a/bamboo/integration_tests/conftest.py b/bamboo/integration_tests/conftest.py index da2ffc127be..9487cdf242e 100644 --- a/bamboo/integration_tests/conftest.py +++ b/bamboo/integration_tests/conftest.py @@ -6,9 +6,9 @@ def pytest_addoption(parser): cluster = re.sub('[0-9]+', '', subprocess.check_output( - 'hostname'.split()).strip()) + 'hostname'.split()).decode('utf-8').strip()) default_dirname = subprocess.check_output( - 'git rev-parse --show-toplevel'.split()).strip() + 'git rev-parse --show-toplevel'.split()).decode('utf-8').strip() default_exes = tools.get_default_exes(default_dirname, cluster) parser.addoption('--cluster', action='store', default=cluster, @@ -17,12 +17,8 @@ def pytest_addoption(parser): help='--dirname= to specify the top-level directory. Default directory of build_lbann_lc executable') parser.addoption('--exes', action='store', default=default_exes, help='--exes={compiler_name: path}') - parser.addoption('--log', action='store', default=0, - help='--log=1 to keep trimmed accuracy files. Default (--log=0) removes files') parser.addoption('--weekly', action='store_true', default=False, - help='--weekly specifies that the test should ONLY be run weekly, not nightly') - # For local testing only - parser.addoption('--exe', action='store', help='--exe=') + help='--weekly specifies that the test should ONLY be run weekly, not nightly. 
Default False') @pytest.fixture @@ -30,11 +26,6 @@ def cluster(request): return request.config.getoption('--cluster') -@pytest.fixture -def debug(request): - return request.config.getoption('--debug') - - @pytest.fixture def dirname(request): return request.config.getoption('--dirname') @@ -48,8 +39,3 @@ def exes(request): @pytest.fixture def weekly(request): return request.config.getoption('--weekly') - - -@pytest.fixture -def exe(request): - return request.config.getoption('--exe') diff --git a/bamboo/integration_tests/error/README.md b/bamboo/integration_tests/error/README.md deleted file mode 100644 index 78712c2962b..00000000000 --- a/bamboo/integration_tests/error/README.md +++ /dev/null @@ -1 +0,0 @@ -Subdirectory for test error diff --git a/bamboo/integration_tests/expected_values/catalyst/clang4/expected_conv_autoencoder_imagenet_objective_functions.csv b/bamboo/integration_tests/expected_values/catalyst/clang4/expected_conv_autoencoder_imagenet_objective_functions.csv deleted file mode 100644 index 003794fd557..00000000000 --- a/bamboo/integration_tests/expected_values/catalyst/clang4/expected_conv_autoencoder_imagenet_objective_functions.csv +++ /dev/null @@ -1,21 +0,0 @@ -Epoch_number, training_objective_function_nightly, training_objective_function_weekly -0, 0.675652, 0.608574 -1, 0.590008, 0.590008 -2, 0.587484, 0.587484 -3, 0.586305, 0.586305 -4, 0.585585, 0.585585 -5, 0.585036, 0.585036 -6, 0.584688, 0.584688 -7, 0.584348, 0.584348 -8, 0.584041, 0.584041 -9, 0.583865, 0.583865 -10, 0.583665, 0.583665 -11, 0.583521, 0.583521 -12, 0.583303, 0.583303 -13, 0.58328, 0.58328 -14, 0.5832, 0.5832 -15, 0.583134, 0.583134 -16, 0.583052, 0.583052 -17, 0.583039, 0.583039 -18, 0.582954, 0.582954 -19, 0.582936, 0.582936 diff --git a/bamboo/integration_tests/expected_values/catalyst/clang4/expected_conv_autoencoder_mnist_objective_functions.csv b/bamboo/integration_tests/expected_values/catalyst/clang4/expected_conv_autoencoder_mnist_objective_functions.csv deleted file mode 100644 index 80c12b2b0ed..00000000000 --- a/bamboo/integration_tests/expected_values/catalyst/clang4/expected_conv_autoencoder_mnist_objective_functions.csv +++ /dev/null @@ -1,6 +0,0 @@ -Epoch_number, training_objective_function -0, 0.207480 -1, 0.194710 -2, 0.193224 -3, 0.192867 -4, 0.192758 diff --git a/bamboo/integration_tests/expected_values/catalyst/clang4/expected_performance.csv b/bamboo/integration_tests/expected_values/catalyst/clang4/expected_performance.csv deleted file mode 100644 index 32551e8e70b..00000000000 --- a/bamboo/integration_tests/expected_values/catalyst/clang4/expected_performance.csv +++ /dev/null @@ -1,5 +0,0 @@ -Model_name, training_run_time, training_mean, training_max, training_min, training_stdev, test_accuracy -alexnet_nightly, 56.00, 1.20, 5.00, 0.80, 0.40, 0.00 -alexnet_weekly, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 -cache_alexnet, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 -lenet_mnist, 88.00, 0.12, 0.40, 0.10, 0.09, 98.40 diff --git a/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv b/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv deleted file mode 100644 index 003794fd557..00000000000 --- a/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv +++ /dev/null @@ -1,21 +0,0 @@ -Epoch_number, training_objective_function_nightly, training_objective_function_weekly -0, 0.675652, 0.608574 -1, 0.590008, 
0.590008 -2, 0.587484, 0.587484 -3, 0.586305, 0.586305 -4, 0.585585, 0.585585 -5, 0.585036, 0.585036 -6, 0.584688, 0.584688 -7, 0.584348, 0.584348 -8, 0.584041, 0.584041 -9, 0.583865, 0.583865 -10, 0.583665, 0.583665 -11, 0.583521, 0.583521 -12, 0.583303, 0.583303 -13, 0.58328, 0.58328 -14, 0.5832, 0.5832 -15, 0.583134, 0.583134 -16, 0.583052, 0.583052 -17, 0.583039, 0.583039 -18, 0.582954, 0.582954 -19, 0.582936, 0.582936 diff --git a/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv b/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv deleted file mode 100644 index 8bcf25bb71d..00000000000 --- a/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv +++ /dev/null @@ -1,6 +0,0 @@ -Epoch_number, training_objective_function -0, 0.207514 -1, 0.194710 -2, 0.193221 -3, 0.192864 -4, 0.192755 diff --git a/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_performance.csv b/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_performance.csv deleted file mode 100644 index d3ac7caa6b4..00000000000 --- a/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_performance.csv +++ /dev/null @@ -1,5 +0,0 @@ -Model_name, training_run_time, training_mean, training_max, training_min, training_stdev, test_accuracy -alexnet_nightly, 57.00, 1.11, 4.80, 0.37, 1.20, 0.00 -alexnet_weekly, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 -cache_alexnet, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 -lenet_mnist, 64.00, 0.10, 0.40, 0.08, 0.04, 98.92 diff --git a/bamboo/integration_tests/expected_values/pascal/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv b/bamboo/integration_tests/expected_values/pascal/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv deleted file mode 100644 index 003794fd557..00000000000 --- a/bamboo/integration_tests/expected_values/pascal/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv +++ /dev/null @@ -1,21 +0,0 @@ -Epoch_number, training_objective_function_nightly, training_objective_function_weekly -0, 0.675652, 0.608574 -1, 0.590008, 0.590008 -2, 0.587484, 0.587484 -3, 0.586305, 0.586305 -4, 0.585585, 0.585585 -5, 0.585036, 0.585036 -6, 0.584688, 0.584688 -7, 0.584348, 0.584348 -8, 0.584041, 0.584041 -9, 0.583865, 0.583865 -10, 0.583665, 0.583665 -11, 0.583521, 0.583521 -12, 0.583303, 0.583303 -13, 0.58328, 0.58328 -14, 0.5832, 0.5832 -15, 0.583134, 0.583134 -16, 0.583052, 0.583052 -17, 0.583039, 0.583039 -18, 0.582954, 0.582954 -19, 0.582936, 0.582936 diff --git a/bamboo/integration_tests/expected_values/pascal/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv b/bamboo/integration_tests/expected_values/pascal/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv deleted file mode 100644 index 8bcf25bb71d..00000000000 --- a/bamboo/integration_tests/expected_values/pascal/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv +++ /dev/null @@ -1,6 +0,0 @@ -Epoch_number, training_objective_function -0, 0.207514 -1, 0.194710 -2, 0.193221 -3, 0.192864 -4, 0.192755 diff --git a/bamboo/integration_tests/expected_values/pascal/gcc7/expected_performance.csv b/bamboo/integration_tests/expected_values/pascal/gcc7/expected_performance.csv deleted file mode 100644 index cca3451efd2..00000000000 --- a/bamboo/integration_tests/expected_values/pascal/gcc7/expected_performance.csv +++ /dev/null @@ -1,5 +0,0 @@ -Model_name, training_run_time, 
training_mean, training_max, training_min, training_stdev, test_accuracy -alexnet_nightly, 51.00, 1.20, 4.00, 0.50, 0.40, 0.17 -alexnet_weekly, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 -cache_alexnet, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 -lenet_mnist, 9.00, 0.01, 6.00, 0.01, 0.40, 98.40 diff --git a/bamboo/integration_tests/experiments/.gitignore b/bamboo/integration_tests/experiments/.gitignore new file mode 100644 index 00000000000..d6b7ef32c84 --- /dev/null +++ b/bamboo/integration_tests/experiments/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/bamboo/integration_tests/full_alexnet.sh b/bamboo/integration_tests/full_alexnet.sh deleted file mode 100644 index ff1b5cf1c76..00000000000 --- a/bamboo/integration_tests/full_alexnet.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -module load mpifileutils - -# Clear SSDs -srun --wait=0 --clear-ssd hostname > /dev/null - -# Cache dataset -echo "Caching dataset..." -[ -e /l/ssd/lbannusr/datasets-resized/ILSVRC2012/train_resized.tar ] || \ - srun --nodes=128 --ntasks-per-node=2 dbcast /p/lscratchh/brainusr/datasets/ILSVRC2012/original/train_resized.tar /l/ssd/lbannusr/datasets-resized/ILSVRC2012/train_resized.tar > /dev/null -[ -d /l/ssd/lbannusr/datasets-resized/ILSVRC2012/train ] || \ - srun --nodes=128 --ntasks-per-node=1 tar xf /l/ssd/lbannusr/datasets-resized/ILSVRC2012/train_resized.tar -C /l/ssd/lbannusr/datasets-resized/ILSVRC2012 -[ -e /l/ssd/lbannusr/datasets-resized/ILSVRC2012/val_resized.tar ] || \ - srun --nodes=128 --ntasks-per-node=2 dbcast /p/lscratchh/brainusr/datasets/ILSVRC2012/original/val_resized.tar /l/ssd/lbannusr/datasets-resized/ILSVRC2012/val_resized.tar > /dev/null -[ -d /l/ssd/lbannusr/datasets-resized/ILSVRC2012/val ] || \ - srun --nodes=128 --ntasks-per-node=1 tar xf /l/ssd/lbannusr/datasets-resized/ILSVRC2012/val_resized.tar -C /l/ssd/lbannusr/datasets-resized/ILSVRC2012 -[ -e /l/ssd/lbannusr/datasets-resized/ILSVRC2012/labels.tar ] || \ - srun --nodes=128 --ntasks-per-node=2 dbcast /p/lscratchh/brainusr/datasets/ILSVRC2012/original/labels.tar /l/ssd/lbannusr/datasets-resized/ILSVRC2012/labels.tar > /dev/null -[ -e /l/ssd/lbannusr/datasets-resized/ILSVRC2012/labels/train.txt ] || \ - srun --nodes=128 --ntasks-per-node=1 tar xf /l/ssd/lbannusr/datasets-resized/ILSVRC2012/labels.tar -C /l/ssd/lbannusr/datasets-resized/ILSVRC2012 -wait -echo "Done caching dataset..." 
- -LBANN_DIR=$(git rev-parse --show-toplevel) -CLUSTER=$(hostname | sed 's/\([a-zA-Z][a-zA-Z]*\)[0-9]*/\1/g') - -# Experiment -srun --nodes=128 --ntasks-per-node=2 ${LBANN_DIR}/bamboo/compiler_tests/builds/catalyst_gcc-4.9.3_x86_64_mvapich2-2.2_openblas_rel/build/model_zoo/lbann --model=${LBANN_DIR}/model_zoo/models/alexnet/model_alexnet.prototext --optimizer=${LBANN_DIR}/model_zoo/optimizers/opt_sgd.prototext --reader=${LBANN_DIR}/model_zoo/data_readers/data_reader_imagenet.prototext --data_filedir_train=/l/ssd/lbannusr/datasets-resized/ILSVRC2012/train/ --data_filename_train=/l/ssd/lbannusr/datasets-resized/ILSVRC2012/labels/train.txt --data_filedir_test=/l/ssd/lbannusr/datasets-resized/ILSVRC2012/val/ --data_filename_test=/l/ssd/lbannusr/datasets-resized/ILSVRC2012/labels/val.txt diff --git a/bamboo/integration_tests/output/README.md b/bamboo/integration_tests/output/README.md deleted file mode 100644 index 308358e3777..00000000000 --- a/bamboo/integration_tests/output/README.md +++ /dev/null @@ -1 +0,0 @@ -Subdirectory for test output diff --git a/bamboo/integration_tests/test_integration_alexnet.py b/bamboo/integration_tests/test_integration_alexnet.py new file mode 100644 index 00000000000..576b2852204 --- /dev/null +++ b/bamboo/integration_tests/test_integration_alexnet.py @@ -0,0 +1,190 @@ +import functools +import operator +import os +import os.path +import re +import sys +import numpy as np +import google.protobuf.text_format +import pytest + +# Local files +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools +import data.imagenet + +# ============================================== +# Options +# ============================================== + +# Training options +num_epochs = 5 +mini_batch_size = 256 +num_nodes = 4 +imagenet_fraction = 0.280994 # Train with 360K out of 1.28M samples + +# Top-5 classification accuracy (percent) +expected_train_accuracy_range = (9, 15) +expected_test_accuracy_range = (15, 24) + +# Average mini-batch time (in sec) for each LC system +# Note that run times are with LBANN_DETERMINISTIC set +# Commented out times are prior to thread safe RNGs +expected_mini_batch_times = { + 'pascal': 0.154, # 0.100, + 'lassen': 0.050, + 'ray': 0.075, +} + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer(mini_batch_size=mini_batch_size) + model = construct_model(lbann) + # Setup data reader + data_reader = data.imagenet.make_data_reader(lbann, num_classes=1000) + # We train on a subset of ImageNet + data_reader.reader[0].percent_of_data_to_use = imagenet_fraction + # Only evaluate on ImageNet validation set at end of training + data_reader.reader[1].role = 'test' + + optimizer = lbann.SGD(learn_rate=0.01, momentum=0.9) + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # TODO (tym): Figure out how to switch between LBANN builds. See + # GitHub Issue #1289. 
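+    # lbann.models provides the reference AlexNet implementation used below.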
+ import lbann.models + + # Layer graph + input_ = lbann.Input() + images = lbann.Identity(input_) + labels = lbann.Identity(input_) + x = lbann.models.AlexNet(1000)(images) + probs = lbann.Softmax(x) + cross_entropy = lbann.CrossEntropy(probs, labels) + top5 = lbann.TopKCategoricalAccuracy(probs, labels, k=5) + layers = list(lbann.traverse_layer_graph(x)) + + # Setup objective function + l2_reg_weights = set() + for l in layers: + if type(l) == lbann.Convolution or type(l) == lbann.FullyConnected: + l2_reg_weights.update(l.weights) + l2_reg = lbann.L2WeightRegularization(weights=l2_reg_weights, scale=1e-4) + obj = lbann.ObjectiveFunction([cross_entropy, l2_reg]) + + # Objects for LBANN model + callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer()] + metrics = [lbann.Metric(top5, name='top-5 accuracy', unit='%')] + + # Construct model + return lbann.Model(num_epochs, + layers=layers, + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +# ============================================== +# Setup PyTest +# ============================================== + +def augment_test_func(test_func): + """Augment test function to parse log files. + + `tools.create_tests` creates functions that run an LBANN + experiment. This function creates augmented functions that parse + the log files after LBANN finishes running, e.g. to check metrics + or runtimes. + + Note: The naive approach is to define the augmented test functions + in a loop. However, Python closures are late binding. In other + words, the function would be overwritten every time we define it. + We get around this overwriting problem by defining the augmented + function in the local scope of another function. + + Args: + test_func (function): Test function created by + `tools.create_tests`. + + Returns: + function: Test that can interact with PyTest. 
+ + """ + test_name = test_func.__name__ + + # Define test function + def func(cluster, exes, dirname, weekly): + + # Skip test with nightly builds and on CPU systems + if not weekly: + pytest.skip('only run {} with weekly builds'.format(test_name)) + if cluster in ('catalyst', 'corona'): + pytest.skip('only run {} on GPU systems'.format(test_name)) + + # Run LBANN experiment + experiment_output = test_func(cluster, exes, dirname) + + # Parse LBANN log file + train_accuracy = None + test_accuracy = None + mini_batch_times = [] + with open(experiment_output['stdout_log_file']) as f: + for line in f: + match = re.search('training epoch [0-9]+ top-5 accuracy : ([0-9.]+)%', line) + if match: + train_accuracy = float(match.group(1)) + match = re.search('test top-5 accuracy : ([0-9.]+)%', line) + if match: + test_accuracy = float(match.group(1)) + match = re.search('training epoch [0-9]+ mini-batch time statistics : ([0-9.]+)s mean', line) + if match: + mini_batch_times.append(float(match.group(1))) + + # Check if training accuracy is within expected range + assert (expected_train_accuracy_range[0] + < train_accuracy + < expected_train_accuracy_range[1]), \ + 'train accuracy is outside expected range' + + # Check if testing accuracy is within expected range + assert (expected_test_accuracy_range[0] + < test_accuracy + < expected_test_accuracy_range[1]), \ + 'test accuracy is outside expected range' + + # Check if mini-batch time is within expected range + # Note: Skip first epoch since its runtime is usually an outlier + mini_batch_times = mini_batch_times[1:] + mini_batch_time = sum(mini_batch_times) / len(mini_batch_times) + assert (0.75 * expected_mini_batch_times[cluster] + < mini_batch_time + < 1.25 * expected_mini_batch_times[cluster]), \ + 'average mini-batch time is outside expected range' + + # Return test function from factory function + func.__name__ = test_name + return func + +# Create test functions that can interact with PyTest +for _test_func in tools.create_tests(setup_experiment, + __file__, + nodes=num_nodes): + globals()[_test_func.__name__] = augment_test_func(_test_func) diff --git a/bamboo/integration_tests/test_integration_autoencoders.py b/bamboo/integration_tests/test_integration_autoencoders.py deleted file mode 100644 index 5f021ce6f53..00000000000 --- a/bamboo/integration_tests/test_integration_autoencoders.py +++ /dev/null @@ -1,102 +0,0 @@ -import pytest -import common_code - - -def error_if(f, f_symbol, data_field, actual_values, expected_values, - model_name, errors, all_values, frequency_str): - d = actual_values[data_field] - for model_id in sorted(d.keys()): - for epoch_id in sorted(d[model_id].keys()): - actual_value = d[model_id][epoch_id] - expected_value = expected_values[epoch_id][data_field + frequency_str] - - if actual_value is None: - errors.append('d[%s][%s] == None' % (model_id, epoch_id)) - if expected_value is None: - errors.append('d[%s]([%s] == None' % (model_id, epoch_id)) - - if f(actual_value, expected_value): - errors.append('%f %s %f %s Model %s Epoch %s %s' % ( - actual_value, f_symbol, expected_value, model_name, model_id, - epoch_id, data_field)) - all_values.append('%f %s Model %s Epoch %s %s' % ( - actual_value, model_name, model_id, epoch_id, data_field)) - - -def run_tests(actual_objective_functions, model_name, dir_name, cluster, - should_log, compiler_name, frequency_str=''): - expected_objective_functions = common_code.csv_to_dict( - '%s/bamboo/integration_tests/expected_values/%s/%s/expected_%s_objective_functions.csv' % 
(dir_name, cluster, compiler_name, model_name)) - errors = [] - all_values = [] - tolerance = 0.05 - # Are we within tolerance * expected_value? - outside_tolerance = lambda x, y: abs(x - y) > abs(tolerance * y) - error_if(outside_tolerance, '!=', 'training_objective_function', - actual_objective_functions, expected_objective_functions, - model_name, errors, all_values, frequency_str) - - print('Errors for: %s %s (%d)' % (model_name, compiler_name, len(errors))) - for error in errors: - print(error) - if should_log: - print('All values for: %s %s (%d)' % (model_name, compiler_name, - len(all_values))) - for value in all_values: - print(value) - assert errors == [] - -DATA_FIELDS = [ - 'training_objective_function' -] - - -def skeleton_autoencoder_imagenet(cluster, dir_name, executables, compiler_name, - weekly): - if cluster in ['surface', 'pascal']: - e = 'skeleton_autoencoder_imagenet: does not run on GPU' - print('Skip - ' + e) - pytest.skip(e) - if compiler_name not in executables: - e = 'skeleton_autoencoder_imagenet: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - model_folder = 'models/autoencoder_imagenet' - model_name = 'conv_autoencoder_imagenet' - should_log = False - actual_objective_functions = common_code.skeleton( - cluster, dir_name, executables[compiler_name], model_folder, model_name, - DATA_FIELDS, should_log, compiler_name=compiler_name, weekly=weekly) - frequency_str = '_nightly' - if weekly: - frequency_str = '_weekly' - run_tests(actual_objective_functions, model_name, dir_name, cluster, - should_log, compiler_name, frequency_str) - - -def test_integration_autoencoder_imagenet_clang4(cluster, dirname, exes, - weekly): - skeleton_autoencoder_imagenet(cluster, dirname, exes, 'clang4', weekly) - - -def test_integration_autoencoder_imagenet_gcc4(cluster, dirname, exes, weekly): - skeleton_autoencoder_imagenet(cluster, dirname, exes, 'gcc4', weekly) - - -def test_integration_autoencoder_imagenet_gcc7(cluster, dirname, exes, weekly): - skeleton_autoencoder_imagenet(cluster, dirname, exes, 'gcc7', weekly) - - -def test_integration_autoencoder_imagenet_intel18(cluster, dirname, exes, - weekly): - skeleton_autoencoder_imagenet(cluster, dirname, exes, 'intel18', weekly) - - -# Run with python -m pytest -s test_integration_autoencoder.py -k 'test_integration_autoencoder_imagenet_exe' --exe= -def test_integration_autoencoder_imagenet_exe(cluster, dirname, exe): - if exe is None: - e = 'test_integration_autoencoder_imagenet_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip() - exes = {'exe': exe} - skeleton_autoencoder_imagenet(cluster, dirname, exes, 'exe', True) diff --git a/bamboo/integration_tests/test_integration_debug.py b/bamboo/integration_tests/test_integration_debug.py deleted file mode 100644 index c205dffb24c..00000000000 --- a/bamboo/integration_tests/test_integration_debug.py +++ /dev/null @@ -1,112 +0,0 @@ -import sys -sys.path.insert(0, '../common_python') -import tools -import pytest -import common_code - - -def skeleton_mnist_debug(cluster, dir_name, executables, compiler_name, weekly, - debug, should_log=False): - # If weekly or debug are true, then run the test. 
- if (not weekly) and (not debug): - e = 'skeleton_mnist_debug: Not doing weekly or debug testing' - print('Skip - ' + e) - pytest.skip(e) - if compiler_name not in executables: - e = 'skeleton_mnist_debug: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - model_name = 'lenet_mnist' - output_file_name = '%s/bamboo/integration_tests/output/%s_%s_output.txt' %(dir_name, model_name, compiler_name) - error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' %(dir_name, model_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - partition='pbatch', time_limit=100, dir_name=dir_name, - data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', model_folder='models/' + model_name, - model_name=model_name, num_epochs=5, optimizer_name='adagrad', - output_file_name=output_file_name, error_file_name=error_file_name) - output_value = common_code.run_lbann(command, model_name, output_file_name, error_file_name) - assert output_value == 0 - - -def skeleton_cifar_debug(cluster, dir_name, executables, compiler_name, weekly, - debug, should_log=False): - # If weekly or debug are true, then run the test. - if (not weekly) and (not debug): - e = 'skeleton_cifar_debug: Not doing weekly or debug testing' - print('Skip - ' + e) - pytest.skip(e) - if cluster == 'ray': - e = 'skeleton_cifar_debug: cifar not operational on Ray' - print('Skip - ' + e) - pytest.skip(e) - if compiler_name not in executables: - e = 'skeleton_cifar_debug: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - model_name = 'autoencoder_cifar10' - output_file_name = '%s/bamboo/integration_tests/output/%s_%s_output.txt' %(dir_name, model_name, compiler_name) - error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' %(dir_name, model_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - partition='pbatch', time_limit=100, dir_name=dir_name, - data_filename_train_default='/p/lscratchh/brainusr/datasets/cifar10-bin/data_all.bin', - data_filename_test_default='/p/lscratchh/brainusr/datasets/cifar10-bin/test_batch.bin', - data_reader_name='cifar10', data_reader_percent=0.01, model_folder='models/' + model_name, - model_name='conv_' + model_name, num_epochs=5, optimizer_name='adagrad', - output_file_name=output_file_name, error_file_name=error_file_name) - output_value = common_code.run_lbann(command, model_name, output_file_name, error_file_name) - assert output_value == 0 - - -def test_integration_mnist_clang4_debug(cluster, dirname, exes, weekly, debug): - skeleton_mnist_debug(cluster, dirname, exes, 'clang4_debug', weekly, debug) - - -def test_integration_cifar_clang4_debug(cluster, dirname, exes, weekly, debug): - skeleton_cifar_debug(cluster, dirname, exes, 'clang4_debug', weekly, debug) - - -def test_integration_mnist_gcc4_debug(cluster, dirname, exes, weekly, debug): - skeleton_mnist_debug(cluster, dirname, exes, 'gcc4_debug', weekly, debug) - - -def test_integration_cifar_gcc4_debug(cluster, dirname, exes, weekly, debug): - skeleton_cifar_debug(cluster, dirname, exes, 'gcc4_debug', weekly, debug) - - -def test_integration_mnist_gcc7_debug(cluster, dirname, exes, weekly, debug): - skeleton_mnist_debug(cluster, dirname, exes, 'gcc7_debug', weekly, debug) - - -def test_integration_cifar_gcc7_debug(cluster, dirname, exes, weekly, debug): - skeleton_cifar_debug(cluster, 
dirname, exes, 'gcc7_debug', weekly, debug) - - -def test_integration_mnist_intel18_debug(cluster, dirname, exes, weekly, debug): - skeleton_mnist_debug(cluster, dirname, exes, 'intel18_debug', weekly, debug) - - -def test_integration_cifar_intel18_debug(cluster, dirname, exes, weekly, debug): - skeleton_cifar_debug(cluster, dirname, exes, 'intel18_debug', weekly, debug) - - -# Run with python -m pytest -s test_integration_debug.py -k 'test_integration_mnist_exe' --exe= -def test_integration_mnist_exe(cluster, dirname, exe): - if exe is None: - e = 'test_integration_mnist_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_mnist_debug(cluster, dirname, exes, 'exe', True, True) - - -# Run with python -m pytest -s test_integration_debug.py -k 'test_integration_cifar_exe' --exe= -def test_integration_cifar_exe(cluster, dirname, exe): - if exe == None: - e = 'test_integration_cifar_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_cifar_debug(cluster, dirname, exes, 'exe', True, True) diff --git a/bamboo/integration_tests/test_integration_lenet.py b/bamboo/integration_tests/test_integration_lenet.py new file mode 100644 index 00000000000..3abc4a02387 --- /dev/null +++ b/bamboo/integration_tests/test_integration_lenet.py @@ -0,0 +1,174 @@ +import functools +import operator +import os +import os.path +import re +import sys +import numpy as np +import google.protobuf.text_format +import pytest + +# Local files +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools +import data.mnist + +# ============================================== +# Options +# ============================================== + +# Training options +num_epochs = 5 +mini_batch_size = 64 +num_nodes = 2 + +# Classification accuracy (percent) +expected_train_accuracy_range = (98.75, 99.25) +expected_test_accuracy_range = (98, 99) + +# Average mini-batch time (in sec) for each LC system +# Note that run times are with LBANN_DETERMINISTIC set +# Commented out times are prior to thread safe RNGs +expected_mini_batch_times = { + 'pascal': 0.0014, # 0.0013, + 'catalyst': 0.0073, # 0.0055, + 'lassen': 0.0022, + 'ray': 0.0025, + 'corona': 0.0117, # 0.0075, +} + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer(mini_batch_size=mini_batch_size) + model = construct_model(lbann) + + data_reader = data.mnist.make_data_reader(lbann) + # No validation set + data_reader.reader[0].validation_percent = 0 + + optimizer = lbann.SGD(learn_rate=0.01, momentum=0.9) + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # TODO (tym): Figure out how to switch between LBANN builds. See + # GitHub Issue #1289. 
+ import lbann.models + + # Layer graph + input_ = lbann.Input() + images = lbann.Identity(input_) + labels = lbann.Identity(input_) + x = lbann.models.LeNet(10)(images) + probs = lbann.Softmax(x) + loss = lbann.CrossEntropy(probs, labels) + acc = lbann.CategoricalAccuracy(probs, labels) + + # Objects for LBANN model + callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer()] + metrics = [lbann.Metric(acc, name='accuracy', unit='%')] + + # Construct model + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(input_), + objective_function=loss, + metrics=metrics, + callbacks=callbacks) + +# ============================================== +# Setup PyTest +# ============================================== + +def augment_test_func(test_func): + """Augment test function to parse log files. + + `tools.create_tests` creates functions that run an LBANN + experiment. This function creates augmented functions that parse + the log files after LBANN finishes running, e.g. to check metrics + or runtimes. + + Note: The naive approach is to define the augmented test functions + in a loop. However, Python closures are late binding. In other + words, the function would be overwritten every time we define it. + We get around this overwriting problem by defining the augmented + function in the local scope of another function. + + Args: + test_func (function): Test function created by + `tools.create_tests`. + + Returns: + function: Test that can interact with PyTest. + + """ + test_name = test_func.__name__ + + # Define test function + def func(cluster, exes, dirname): + + # Run LBANN experiment + experiment_output = test_func(cluster, exes, dirname) + + # Parse LBANN log file + train_accuracy = None + test_accuracy = None + mini_batch_times = [] + with open(experiment_output['stdout_log_file']) as f: + for line in f: + match = re.search('training epoch [0-9]+ accuracy : ([0-9.]+)%', line) + if match: + train_accuracy = float(match.group(1)) + match = re.search('test accuracy : ([0-9.]+)%', line) + if match: + test_accuracy = float(match.group(1)) + match = re.search('training epoch [0-9]+ mini-batch time statistics : ([0-9.]+)s mean', line) + if match: + mini_batch_times.append(float(match.group(1))) + + # Check if training accuracy is within expected range + assert (expected_train_accuracy_range[0] + < train_accuracy + < expected_train_accuracy_range[1]), \ + 'train accuracy is outside expected range' + + # Check if testing accuracy is within expected range + assert (expected_test_accuracy_range[0] + < test_accuracy + < expected_test_accuracy_range[1]), \ + 'test accuracy is outside expected range' + + # Check if mini-batch time is within expected range + # Note: Skip first epoch since its runtime is usually an outlier + mini_batch_times = mini_batch_times[1:] + mini_batch_time = sum(mini_batch_times) / len(mini_batch_times) + assert (0.75 * expected_mini_batch_times[cluster] + < mini_batch_time + < 1.25 * expected_mini_batch_times[cluster]), \ + 'average mini-batch time is outside expected range' + + # Return test function from factory function + func.__name__ = test_name + return func + +# Create test functions that can interact with PyTest +for _test_func in tools.create_tests(setup_experiment, + __file__, + nodes=num_nodes): + globals()[_test_func.__name__] = augment_test_func(_test_func) diff --git a/bamboo/integration_tests/test_integration_performance.py b/bamboo/integration_tests/test_integration_performance.py deleted file mode 100644 index a171184ba5e..00000000000 --- 
a/bamboo/integration_tests/test_integration_performance.py +++ /dev/null @@ -1,252 +0,0 @@ -import pytest -import operator, os -import common_code - - -def error_if(f, f_symbol, data_field, actual_values, expected_values, - model_name, errors, all_values, frequency_str): - d = actual_values[data_field] - if f_symbol == '<': - # Every time a value is smaller, update archive_value - archive_value = float('inf') - elif f_symbol == '>': - # Every time a value is greater, update archive_value - archive_value = float('-inf') - else: - raise Exception('Invalid Function Symbol %s' % f_symbol) - for model_id in sorted(d.keys()): - for epoch_id in sorted(d[model_id].keys()): - actual_value = d[model_id][epoch_id] - expected_value = expected_values[model_name + frequency_str][data_field] - - if actual_value is None: - errors.append('actual_value: d[%s][%s] is None' % (model_id, epoch_id)) - else: - print('actual_value={av}'.format(av=actual_value)) - if expected_value is None: - errors.append( - 'expected_value: d[%s]([%s] is None' % (model_id, epoch_id)) - else: - print('expected_value={ev}'.format(ev=expected_value)) - - if (actual_value is not None) and (expected_value is not None): - if f(actual_value, expected_value): - errors.append('%f %s %f %s Model %s Epoch %s %s' % ( - actual_value, f_symbol, expected_value, model_name, model_id, - epoch_id, data_field)) - all_values.append('%f %s Model %s Epoch %s %s' % ( - actual_value, model_name, model_id, epoch_id, data_field)) - - if f(actual_value, archive_value): - archive_value = actual_value - else: - print('archiving: either actual_value or expected_value is None.') - return archive_value - - -def run_tests(actual_performance, model_name, dir_name, should_log, - compiler_name, cluster, frequency_str=''): - expected_performance = common_code.csv_to_dict( - '%s/bamboo/integration_tests/expected_values/%s/%s/expected_performance.csv' % (dir_name, cluster, compiler_name)) - errors = [] - all_values = [] - greater_than = operator.gt - less_than = operator.lt - max_run_time = error_if(greater_than, '>', 'training_run_time', actual_performance, expected_performance, model_name, errors, all_values, frequency_str) - max_mean = error_if(greater_than, '>', 'training_mean', actual_performance, expected_performance, model_name, errors, all_values, frequency_str) - max_max = error_if(greater_than, '>', 'training_max', actual_performance, expected_performance, model_name, errors, all_values, frequency_str) - max_min = error_if(greater_than, '>', 'training_min', actual_performance, expected_performance, model_name, errors, all_values, frequency_str) - max_stdev = error_if(greater_than, '>', 'training_stdev', actual_performance, expected_performance, model_name, errors, all_values, frequency_str) - min_accuracy = error_if(less_than, '<', 'test_accuracy', actual_performance, expected_performance, model_name, errors, all_values, frequency_str) - - archival_string = '%s, %f, %f, %f, %f, %f, %f\n' % ( - os.environ['bamboo_buildNumber'], max_run_time, max_mean, max_max, max_min, - max_stdev, min_accuracy) - print('archival_string: ' + archival_string) - if os.environ['LOGNAME'] == 'lbannusr': - key = 'bamboo_planKey' - if key in os.environ: - plan = os.environ[key] - if plan in ['LBANN-NIGHTD', 'LBANN-WD']: - archive_file = '/usr/workspace/wsb/lbannusr/archives/%s/%s/%s/performance_%s.txt' % (plan, cluster, compiler_name, model_name) - print('Archive file: ' + archive_file) - with open(archive_file, 'a') as archive: - print('Archiving to file.') - 
archive.write(archival_string) - else: - print('The plan %s does not have archiving activated' % plan) - else: - print('%s is not in os.environ' % key) - else: - print('os.environ["LOGNAME"]=%s' % os.environ['LOGNAME']) - - print('Errors for: %s %s (%d)' % (model_name, compiler_name, len(errors))) - for error in errors: - print(error) - if should_log: - print('All values for: %s %s (%d)' % ( - model_name, compiler_name, len(all_values))) - for value in all_values: - print(value) - assert errors == [] - -DATA_FIELDS = [ - 'training_run_time', - 'training_mean', - 'training_max', - 'training_min', - 'training_stdev', - 'test_accuracy' -] - - -def skeleton_performance_lenet_mnist(cluster, dir_name, executables, - compiler_name): - if compiler_name not in executables: - e = 'skeleton_performance_lenet_mnist: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - executable = executables[compiler_name] - model_name = 'lenet_mnist' - model_folder = 'models/' + model_name - should_log = True - actual_performance = common_code.skeleton( - cluster, dir_name, executable, model_folder, model_name, DATA_FIELDS, - should_log, compiler_name=compiler_name) - run_tests(actual_performance, model_name, dir_name, should_log, - compiler_name, cluster) - - -def skeleton_performance_alexnet(cluster, dir_name, executables, compiler_name, - weekly): - if compiler_name not in executables: - e = 'skeleton_performance_alexnet: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - executable = executables[compiler_name] - model_name = 'alexnet' - model_folder = 'models/' + model_name - should_log = True - actual_performance = common_code.skeleton( - cluster, dir_name, executable, model_folder, model_name, DATA_FIELDS, - should_log, compiler_name=compiler_name, weekly=weekly) - frequency_str = '_nightly' - if weekly: - frequency_str = '_weekly' - run_tests(actual_performance, model_name, dir_name, should_log, - compiler_name, cluster, frequency_str) - - -def skeleton_performance_full_alexnet(cluster, dir_name, executables, - compiler_name, weekly): - if not weekly: - e = 'skeleton_performance_full_alexnet: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - if compiler_name not in executables: - e = 'skeleton_performance_full_alexnet: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - executable = executables[compiler_name] - if not os.path.exists(executable): - pytest.skip('Executable does not exist: %s' % executable) - model_name = 'full_alexnet' - should_log = True - output_file_name = '%s/bamboo/integration_tests/output/%s_%s_output.txt' %(dir_name, model_name, compiler_name) - error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' %(dir_name, model_name, compiler_name) - if cluster in ['catalyst', 'surface']: - command = 'salloc %s/bamboo/integration_tests/%s.sh > %s' % (dir_name, model_name, output_file_name) - elif cluster == 'ray': - e = 'skeleton_performance_full_alexnet: Ray is unsupported for skeleton_performance_full_alexnet' - print('Skip - ' + e) - pytest.skip(e) - else: - raise Exception('Unsupported Cluster %s' % cluster) - common_code.run_lbann(command, model_name, output_file_name, error_file_name, - should_log) # Don't need return value - actual_performance = common_code.extract_data(output_file_name, DATA_FIELDS, - should_log) - run_tests(actual_performance, model_name, dir_name, should_log, compiler_name, - cluster) - - -def 
test_integration_performance_lenet_mnist_clang4(cluster, dirname, exes): - skeleton_performance_lenet_mnist(cluster, dirname, exes, 'clang4') - - -def test_integration_performance_alexnet_clang4(cluster, dirname, exes, weekly): - skeleton_performance_alexnet(cluster, dirname, exes, 'clang4', weekly) - - -def test_integration_performance_full_alexnet_clang4(cluster, dirname, exes, - weekly): - skeleton_performance_full_alexnet(cluster, dirname, exes, 'clang4', weekly) - - -def test_integration_performance_lenet_mnist_gcc4(cluster, dirname, exes): - skeleton_performance_lenet_mnist(cluster, dirname, exes, 'gcc4') - - -def test_integration_performance_alexnet_gcc4(cluster, dirname, exes, weekly): - skeleton_performance_alexnet(cluster, dirname, exes, 'gcc4', weekly) - - -def test_integration_performance_full_alexnet_gcc4(cluster, dirname, exes, weekly): - skeleton_performance_full_alexnet(cluster, dirname, exes, 'gcc4', weekly) - - -def test_integration_performance_lenet_mnist_gcc7(cluster, dirname, exes): - skeleton_performance_lenet_mnist(cluster, dirname, exes, 'gcc7') - - -def test_integration_performance_alexnet_gcc7(cluster, dirname, exes, weekly): - skeleton_performance_alexnet(cluster, dirname, exes, 'gcc7', weekly) - - -def test_integration_performance_full_alexnet_gcc7(cluster, dirname, exes, - weekly): - skeleton_performance_full_alexnet(cluster, dirname, exes, 'gcc7', weekly) - - -def test_integration_performance_lenet_mnist_intel18(cluster, dirname, exes): - skeleton_performance_lenet_mnist(cluster, dirname, exes, 'intel18') - - -def test_integration_performance_alexnet_intel18(cluster, dirname, exes, - weekly): - skeleton_performance_alexnet(cluster, dirname, exes, 'intel18', weekly) - - -def test_integration_performance_full_alexnet_intel18(cluster, dirname, exes, - weekly): - skeleton_performance_full_alexnet(cluster, dirname, exes, 'intel18', weekly) - - -# Run with python -m pytest -s test_integration_performance.py -k 'test_integration_performance_lenet_mnist_exe' --exe= -def test_integration_performance_lenet_mnist_exe(cluster, dirname, exe): - if exe is None: - e = 'test_integration_performance_lenet_mnist_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_performance_lenet_mnist(cluster, dirname, exes, 'exe') - - -# Run with python -m pytest -s test_integration_performance.py -k 'test_integration_performance_alexnet_exe' --exe= -def test_integration_performance_alexnet_exe(cluster, dirname, exe): - if exe is None: - e = 'stest_integration_performance_alexnet_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_performance_alexnet(cluster, dirname, exes, 'exe', True) - - -# Run with python -m pytest -s test_integration_performance.py -k 'test_integration_performance_full_alexnet_exe' --exe= -def test_integration_performance_full_alexnet_exe(cluster, dirname, exe): - if exe is None: - e = 'test_integration_performance_full_alexnet_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_performance_full_alexnet(cluster, dirname, exes, 'exe', True) diff --git a/bamboo/integration_tests/test_integration_resnet50.py b/bamboo/integration_tests/test_integration_resnet50.py new file mode 100644 index 00000000000..360e3fb20e1 --- /dev/null +++ b/bamboo/integration_tests/test_integration_resnet50.py @@ -0,0 +1,188 @@ +import functools +import operator +import os +import os.path +import re +import sys +import numpy as np +import 
google.protobuf.text_format +import pytest + +# Local files +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools +import data.imagenet + +# ============================================== +# Options +# ============================================== + +# Training options +num_epochs = 5 +mini_batch_size = 256 +num_nodes = 4 +imagenet_fraction = 0.280994 # Train with 360K out of 1.28M samples + +# Top-5 classification accuracy (percent) +expected_train_accuracy_range = (45, 50) +expected_test_accuracy_range = (40, 55) + +# Average mini-batch time (in sec) for each LC system +expected_mini_batch_times = { + 'pascal': 0.25, + 'lassen': 0.10, + 'ray': 0.15, +} + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer(mini_batch_size=mini_batch_size) + model = construct_model(lbann) + # Setup data reader + data_reader = data.imagenet.make_data_reader(lbann, num_classes=1000) + # We train on a subset of ImageNet + data_reader.reader[0].percent_of_data_to_use = imagenet_fraction + # Only evaluate on ImageNet validation set at end of training + data_reader.reader[1].role = 'test' + + optimizer = lbann.SGD(learn_rate=0.1, momentum=0.9) + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # TODO (tym): Figure out how to switch between LBANN builds. See + # GitHub Issue #1289. + import lbann.models + + # Layer graph + input_ = lbann.Input() + images = lbann.Identity(input_) + labels = lbann.Identity(input_) + x = lbann.models.ResNet50(1000, bn_statistics_group_size=-1)(images) + probs = lbann.Softmax(x) + cross_entropy = lbann.CrossEntropy(probs, labels) + top5 = lbann.TopKCategoricalAccuracy(probs, labels, k=5) + layers = list(lbann.traverse_layer_graph(x)) + + # Setup objective function + l2_reg_weights = set() + for l in layers: + if type(l) == lbann.Convolution or type(l) == lbann.FullyConnected: + l2_reg_weights.update(l.weights) + l2_reg = lbann.L2WeightRegularization(weights=l2_reg_weights, scale=1e-4) + obj = lbann.ObjectiveFunction([cross_entropy, l2_reg]) + + # Objects for LBANN model + callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer()] + metrics = [lbann.Metric(top5, name='top-5 accuracy', unit='%')] + + # Construct model + return lbann.Model(num_epochs, + layers=layers, + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +# ============================================== +# Setup PyTest +# ============================================== + +def augment_test_func(test_func): + """Augment test function to parse log files. + + `tools.create_tests` creates functions that run an LBANN + experiment. This function creates augmented functions that parse + the log files after LBANN finishes running, e.g. to check metrics + or runtimes. + + Note: The naive approach is to define the augmented test functions + in a loop. However, Python closures are late binding. In other + words, the function would be overwritten every time we define it. + We get around this overwriting problem by defining the augmented + function in the local scope of another function. 
+ + Args: + test_func (function): Test function created by + `tools.create_tests`. + + Returns: + function: Test that can interact with PyTest. + + """ + test_name = test_func.__name__ + + # Define test function + def func(cluster, exes, dirname, weekly): + + # Skip test with nightly builds and on CPU systems + if not weekly: + pytest.skip('only run {} with weekly builds'.format(test_name)) + if cluster in ('catalyst', 'corona'): + pytest.skip('only run {} on GPU systems'.format(test_name)) + + # Run LBANN experiment + experiment_output = test_func(cluster, exes, dirname) + + # Parse LBANN log file + train_accuracy = None + test_accuracy = None + mini_batch_times = [] + with open(experiment_output['stdout_log_file']) as f: + for line in f: + match = re.search('training epoch [0-9]+ top-5 accuracy : ([0-9.]+)%', line) + if match: + train_accuracy = float(match.group(1)) + match = re.search('test top-5 accuracy : ([0-9.]+)%', line) + if match: + test_accuracy = float(match.group(1)) + match = re.search('training epoch [0-9]+ mini-batch time statistics : ([0-9.]+)s mean', line) + if match: + mini_batch_times.append(float(match.group(1))) + + # Check if training accuracy is within expected range + assert (expected_train_accuracy_range[0] + < train_accuracy + < expected_train_accuracy_range[1]), \ + 'train accuracy is outside expected range' + + # Check if testing accuracy is within expected range + assert (expected_test_accuracy_range[0] + < test_accuracy + < expected_test_accuracy_range[1]), \ + 'test accuracy is outside expected range' + + # Check if mini-batch time is within expected range + # Note: Skip first epoch since its runtime is usually an outlier + mini_batch_times = mini_batch_times[1:] + mini_batch_time = sum(mini_batch_times) / len(mini_batch_times) + assert (0.75 * expected_mini_batch_times[cluster] + < mini_batch_time + < 1.25 * expected_mini_batch_times[cluster]), \ + 'average mini-batch time is outside expected range' + + # Return test function from factory function + func.__name__ = test_name + return func + +# Create test functions that can interact with PyTest +for _test_func in tools.create_tests(setup_experiment, + __file__, + nodes=num_nodes): + globals()[_test_func.__name__] = augment_test_func(_test_func) diff --git a/bamboo/local_test.cmd b/bamboo/local_test.cmd new file mode 100644 index 00000000000..aa17ec3101b --- /dev/null +++ b/bamboo/local_test.cmd @@ -0,0 +1,11 @@ +#!/bin/bash +#SBATCH --nodes 16 +#SBATCH --partition pbatch +#SBATCH --time 1440 + +# Update "--time" above to increase/decrease allocation time. +# Update "executable" with your executable. +# Use "data-reader-percent" to specify data reader percent. Note that `data-reader-percent=1.0` means 100%, not 1%. +# Use "--integration-tests" to only run integration tests. +# Use "--unit-tests" to only run unit tests. +./local_test.sh --executable "../build/gnu.Release.pascal.llnl.gov/install/bin/lbann" --data-reader-percent 0.001 --unit-tests diff --git a/bamboo/local_test.sh b/bamboo/local_test.sh new file mode 100755 index 00000000000..051c1931c8c --- /dev/null +++ b/bamboo/local_test.sh @@ -0,0 +1,120 @@ +#!/bin/bash -l + +# Local testing (i.e. 
not with Bamboo) + +################################################################ +# Help message +################################################################ + +function help_message { + local SCRIPT=$(basename ${0}) + local N=$(tput sgr0) # Normal text + local C=$(tput setf 4) # Colored text + cat << EOF +Run integration and unit tests locally, outside Bamboo. +Usage: ./${SCRIPT} [options] +Options: + ${C}--help${N} Display this help message and exit. + ${C}--data-reader-percent${N} Specify data reader percent. Note that `data-reader-percent=1.0` means 100%, not 1%. + ${C}--executable${N} Specify executable to be used. Required field. + ${C}--integration-tests${N} Specify that only integration tests should be run. + ${C}--unit-tests${N} Specify that only unit tests should be run. +EOF +} + +################################################################ +# Parse command-line arguments +################################################################ + +DATA_READER_PERCENT=0.001 +EXECUTABLE= +INTEGRATION_TESTS=1 +UNIT_TESTS=1 +while :; do + case ${1} in + -h|--help) + # Help message + help_message + exit 0 + ;; + -d|--data-reader-percent) + # Set data reader percent. + # -n: check if string has non-zero length. + if [ -n "${2}" ]; then + DATA_READER_PERCENT=${2} + shift + else + echo "\"${1}\" option requires a non-empty option argument" >&2 + help_message + exit 1 + fi + ;; + -e|--executable) + # Set executable. + # -n: check if string has non-zero length. + if [ -n "${2}" ]; then + EXECUTABLE=${2} + shift + else + echo "\"${1}\" option requires a non-empty option argument" >&2 + help_message + exit 1 + fi + ;; + -i|--integration-tests) + # Run only integration tests + UNIT_TESTS=0 + ;; + -u|--unit-tests) + # Run only unit tests + INTEGRATION_TESTS=0 + ;; + -?*) + # Unknown option + echo "Unknown option (${1})" >&2 + exit 1 + ;; + *) + # Break loop if there are no more options + break + esac + shift +done + +# -z: check if string has zero length. +if [ -z ${EXECUTABLE} ]; then + echo "Executable must be set." + help_message + exit 1 +fi + +################################################################ +# Run tests +################################################################ + +# Assume user already has an executable (i.e. no need for compiler tests). +# Assume user already has 16 nodes allocated on a cluster. + +echo "EXECUTABLE=${EXECUTABLE}" +echo "INTEGRATION_TESTS=${INTEGRATION_TESTS}" +echo "UNIT_TESTS=${UNIT_TESTS}" +PYTHON=python3 + +echo "Task: Cleaning" +./clean.sh + +echo "Task: Integration Tests" +cd integration_tests +if [ ${INTEGRATION_TESTS} -ne 0 ]; then + $PYTHON -m pytest -s -vv --durations=0 --exe=${EXECUTABLE} +fi +cd .. + +echo "Task: Unit Tests" +cd unit_tests +if [ ${UNIT_TESTS} -ne 0 ]; then + $PYTHON -m pytest -s -vv --durations=0 --exe=${EXECUTABLE} --data-reader-percent=${DATA_READER_PERCENT} +fi +cd .. + +echo "Task: Finished" diff --git a/bamboo/run.sh b/bamboo/run.sh new file mode 100755 index 00000000000..aef17792f88 --- /dev/null +++ b/bamboo/run.sh @@ -0,0 +1,55 @@ +#!/bin/bash -l + +CLUSTER=$(hostname | sed 's/\([a-zA-Z][a-zA-Z]*\)[0-9]*/\1/g') + +echo "run.sh CLUSTER=" +echo $CLUSTER + +PYTHON=python3 + +WEEKLY=0 +while :; do + case ${1} in + --weekly) + # Run all tests. This is a weekly build. 
+ echo "Setting WEEKLY in run.sh" + WEEKLY=1 + ;; + -?*) + # Unknown option + echo "Unknown option (${1})" >&2 + exit 1 + ;; + *) + # Break loop if there are no more options + break + esac + shift +done + +echo "run.sh WEEKLY=" +echo $WEEKLY + +echo "Task: Cleaning" +./clean.sh + +echo "Task: Compiler Tests" +cd compiler_tests +$PYTHON -m pytest -s -vv --durations=0 --junitxml=results.xml +cd .. + +echo "Task: Integration Tests" +cd integration_tests +if [ ${WEEKLY} -ne 0 ]; then + $PYTHON -m pytest -s -vv -k "gcc7" --durations=0 --weekly --junitxml=results.xml +else + $PYTHON -m pytest -s -vv -k "gcc7" --durations=0 --junitxml=results.xml +fi +cd .. + +echo "Task: Unit Tests" +cd unit_tests +OMP_NUM_THREADS=10 $PYTHON -m pytest -s -vv -k "gcc7" --durations=0 --junitxml=results.xml +cd .. + +echo "Task: Finished" diff --git a/bamboo/unit_tests/.gitignore b/bamboo/unit_tests/.gitignore index 16d3c4dbbfe..0cc4de789bf 100644 --- a/bamboo/unit_tests/.gitignore +++ b/bamboo/unit_tests/.gitignore @@ -1 +1,2 @@ .cache +*.prototext diff --git a/bamboo/unit_tests/conftest.py b/bamboo/unit_tests/conftest.py index eda975da95a..cf646ad1e04 100644 --- a/bamboo/unit_tests/conftest.py +++ b/bamboo/unit_tests/conftest.py @@ -3,34 +3,54 @@ import tools import pytest, re, subprocess + def pytest_addoption(parser): cluster = re.sub('[0-9]+', '', subprocess.check_output( - 'hostname'.split()).strip()) + 'hostname'.split()).decode('utf-8').strip()) default_dirname = subprocess.check_output( - 'git rev-parse --show-toplevel'.split()).strip() + 'git rev-parse --show-toplevel'.split()).decode('utf-8').strip() default_exes = tools.get_default_exes(default_dirname, cluster) parser.addoption('--cluster', action='store', default=cluster, help='--cluster= to specify the cluster being run on, for the purpose of determing which commands to use. Default the current cluster') parser.addoption('--dirname', action='store', default=default_dirname, - help='--dirname specifies the top-level directory') + help='--dirname= specifies the top-level directory') parser.addoption('--exes', action='store', default=default_exes, help='--exes={compiler_name: path}') + parser.addoption('--weekly', action='store_true', default=False, + help='--weekly specifies that the test should ONLY be run weekly, not nightly. Default False') # For local testing only - parser.addoption('--exe', action='store', help='--exe=') + parser.addoption('--data-reader-percent', action='store', default=None, + help='--data-reader-percent=. Default None. 
Note that 1.0 is 100%.') + parser.addoption('--exe', action='store', + help='--exe=') + @pytest.fixture def cluster(request): return request.config.getoption('--cluster') + @pytest.fixture def dirname(request): return request.config.getoption('--dirname') + @pytest.fixture def exes(request): return request.config.getoption('--exes') + +@pytest.fixture +def weekly(request): + return request.config.getoption('--weekly') + + +@pytest.fixture +def data_reader_percent(request): + return request.config.getoption('--data-reader-percent') + + @pytest.fixture def exe(request): return request.config.getoption('--exe') diff --git a/bamboo/unit_tests/error/.gitignore b/bamboo/unit_tests/error/.gitignore index 7c9d611b592..d6b7ef32c84 100644 --- a/bamboo/unit_tests/error/.gitignore +++ b/bamboo/unit_tests/error/.gitignore @@ -1,3 +1,2 @@ * !.gitignore -!README.md diff --git a/bamboo/unit_tests/error/README.md b/bamboo/unit_tests/error/README.md deleted file mode 100644 index 78712c2962b..00000000000 --- a/bamboo/unit_tests/error/README.md +++ /dev/null @@ -1 +0,0 @@ -Subdirectory for test error diff --git a/bamboo/unit_tests/experiments/.gitignore b/bamboo/unit_tests/experiments/.gitignore new file mode 100644 index 00000000000..d6b7ef32c84 --- /dev/null +++ b/bamboo/unit_tests/experiments/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/bamboo/unit_tests/output/.gitignore b/bamboo/unit_tests/output/.gitignore index 7c9d611b592..d6b7ef32c84 100644 --- a/bamboo/unit_tests/output/.gitignore +++ b/bamboo/unit_tests/output/.gitignore @@ -1,3 +1,2 @@ * !.gitignore -!README.md diff --git a/bamboo/unit_tests/output/README.md b/bamboo/unit_tests/output/README.md deleted file mode 100644 index 308358e3777..00000000000 --- a/bamboo/unit_tests/output/README.md +++ /dev/null @@ -1 +0,0 @@ -Subdirectory for test output diff --git a/bamboo/unit_tests/prototext/data_reader_mnist.prototext b/bamboo/unit_tests/prototext/data_reader_mnist.prototext deleted file mode 100644 index 9d2e2663202..00000000000 --- a/bamboo/unit_tests/prototext/data_reader_mnist.prototext +++ /dev/null @@ -1,64 +0,0 @@ -data_reader { - reader { - name: "mnist" - role: "train" - shuffle: true - data_filedir: "/p/lscratchh/brainusr/datasets/MNIST" - data_filename: "train-images-idx3-ubyte" - label_filename: "train-labels-idx1-ubyte" - validation_percent: 0.1 - absolute_sample_count: 0 - percent_of_data_to_use: 1.0 - image_preprocessor { - normalizer { - scale: true - subtract_mean: false - unit_variance: false - z_score: false - } - augmenter { - horizontal_flip: false - vertical_flip: false - rotation: 0 - horizontal_shift: 0 - vertical_shift: 0 - shear_range: 0 - } - noiser { - disable: true - factor: 0.0 - } - } - } - reader { - name: "mnist" - role: "test" - shuffle: true - data_filedir: "/p/lscratchh/brainusr/datasets/MNIST" - data_filename: "t10k-images-idx3-ubyte" - label_filename: "t10k-labels-idx1-ubyte" - validation_percent: 1.0 - absolute_sample_count: 0 - percent_of_data_to_use: 1.0 - image_preprocessor { - normalizer { - scale: true - subtract_mean: false - unit_variance: false - z_score: false - } - augmenter { - horizontal_flip: false - vertical_flip: false - rotation: 0 - horizontal_shift: 0 - vertical_shift: 0 - shear_range: 0 - } - noiser { - disable: true - factor: 0.0 - } - } - } -} diff --git a/bamboo/unit_tests/prototext/model_mnist_simple_1.prototext b/bamboo/unit_tests/prototext/model_mnist_simple_1.prototext deleted file mode 100644 index 77a1c7ed256..00000000000 --- 
a/bamboo/unit_tests/prototext/model_mnist_simple_1.prototext +++ /dev/null @@ -1,122 +0,0 @@ -model { - data_layout: "data_parallel" - mini_batch_size: 64 - block_size: 256 - num_epochs: 3 - num_parallel_readers: 0 - procs_per_trainer: 0 - - ################################################### - # Objective function - ################################################### - - objective_function { - layer_term { layer: "cross_entropy" } - l2_weight_regularization { - scale_factor: 1e-4 - } - } - - ################################################### - # Metrics - ################################################### - - metric { - layer_metric { - name: "categorical accuracy" - layer: "accuracy" - unit: "%" - } - } - - ################################################### - # Callbacks - ################################################### - - callback { print {} } - callback { timer {} } - callback { - summary { - dir: "." - mat_interval: 25 - } - } - callback { - adaptive_learning_rate { - patience: 4 - amt: 0.1 - } - } - - ################################################### - # Layers - ################################################### - - layer { - name: "data" - children: "image label" - data_layout: "data_parallel" - input {} - } - layer { - parents: "data" - name: "image" - data_layout: "data_parallel" - split {} - } - layer { - parents: "data" - name: "label" - data_layout: "data_parallel" - split {} - } - - layer { - parents: "image" - name: "ip1" - data_layout: "model_parallel" - fully_connected { - num_neurons: 500 - has_bias: true - } - } - - layer { - parents: "ip1" - name: "relu1" - data_layout: "model_parallel" - relu {} - } - - layer { - parents: "relu1" - name: "ip2" - data_layout: "model_parallel" - fully_connected { - num_neurons: 10 - has_bias: true - } - } - - layer { - parents: "ip2" - name: "prob" - data_layout: "data_parallel" - softmax {} - } - - layer { - parents: "prob label" - name: "cross_entropy" - data_layout: "data_parallel" - cross_entropy {} - } - - layer { - parents: "prob label" - name: "accuracy" - data_layout: "data_parallel" - categorical_accuracy {} - } - -} diff --git a/bamboo/unit_tests/prototext/model_mnist_simple_2.prototext b/bamboo/unit_tests/prototext/model_mnist_simple_2.prototext deleted file mode 100644 index c89c171566f..00000000000 --- a/bamboo/unit_tests/prototext/model_mnist_simple_2.prototext +++ /dev/null @@ -1,138 +0,0 @@ -model { - data_layout: "data_parallel" - mini_batch_size: 64 - block_size: 256 - num_epochs: 3 - num_parallel_readers: 0 - procs_per_trainer: 0 - - ################################################### - # Objective function - ################################################### - - objective_function { - layer_term { layer: "cross_entropy" } - l2_weight_regularization { - scale_factor: 1e-4 - } - } - - ################################################### - # Metrics - ################################################### - - metric { - layer_metric { - name: "categorical accuracy" - layer: "accuracy" - unit: "%" - } - } - - ################################################### - # Callbacks - ################################################### - - callback { print {} } - callback { timer {} } - callback { - summary { - dir: "." 
- mat_interval: 25 - } - } - callback { - adaptive_learning_rate { - patience: 4 - amt: 0.1 - } - } - - ################################################### - # Layers - ################################################### - - layer { - name: "data" - children: "image label" - data_layout: "data_parallel" - input {} - } - layer { - parents: "data" - name: "image" - data_layout: "data_parallel" - split {} - } - layer { - parents: "data" - name: "label" - data_layout: "data_parallel" - split {} - } - - layer { - parents: "image" - name: "ip1" - data_layout: "model_parallel" - fully_connected { - num_neurons: 500 - has_bias: true - } - } - - layer { - parents: "ip1" - name: "relu1" - data_layout: "model_parallel" - relu {} - } - - layer { - parents: "relu1" - name: "ip3" - data_layout: "model_parallel" - fully_connected { - num_neurons: 500 - has_bias: true - } - } - - layer { - parents: "ip3" - name: "relu3" - data_layout: "model_parallel" - relu {} - } - layer { - parents: "relu3" - name: "ip2" - data_layout: "model_parallel" - fully_connected { - num_neurons: 10 - has_bias: true - } - } - - layer { - parents: "ip2" - name: "prob" - data_layout: "data_parallel" - softmax {} - } - - layer { - parents: "prob label" - name: "cross_entropy" - data_layout: "data_parallel" - cross_entropy {} - } - - layer { - parents: "prob label" - name: "accuracy" - data_layout: "data_parallel" - categorical_accuracy {} - } - -} diff --git a/bamboo/unit_tests/prototext/opt_sgd.prototext b/bamboo/unit_tests/prototext/opt_sgd.prototext deleted file mode 100644 index 8d066780476..00000000000 --- a/bamboo/unit_tests/prototext/opt_sgd.prototext +++ /dev/null @@ -1,7 +0,0 @@ -optimizer { - sgd { - learn_rate: 0.01 - momentum: 0.9 - nesterov: false - } -} diff --git a/bamboo/unit_tests/test_unit_callback_set_weights_value.py b/bamboo/unit_tests/test_unit_callback_set_weights_value.py new file mode 100644 index 00000000000..97ae2b72dad --- /dev/null +++ b/bamboo/unit_tests/test_unit_callback_set_weights_value.py @@ -0,0 +1,158 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(20200526) +_samples = np.random.uniform(size=13).astype(np.float32) + +# Sample access functions +def get_sample(index): + return (_samples[index],) +def num_samples(): + return len(_samples) +def sample_dims(): + return (1,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer(mini_batch_size=1) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # LBANN implementation + weights_values = np.random.uniform(size=num_samples()).astype(np.float32) + w = lbann.Weights(optimizer=None, + initializer=lbann.ConstantInitializer(value=1234.5)) + for step, val in enumerate(weights_values): + callbacks.append( + lbann.CallbackSetWeightsValue(weights=w.name, value=val, step=step) + ) + x_lbann = lbann.Identity(lbann.Input()) + x = x_lbann + y = lbann.WeightsLayer(weights=w, dims='1') + z = lbann.Multiply(x, y) + metrics.append(lbann.Metric(z, name='value')) + + # Numpy implementation of training + vals = [] + for step, val in enumerate(weights_values): + x = np.float64(get_sample(step)[0]) + y = np.float64(val) + z = x * y + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='train')) + + # Numpy implementation of testing + vals = [] + for i in range(num_samples()): + x = np.float64(get_sample(i)[0]) + y = np.float64(weights_values[-1]) + z = x * y + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # Construct model + return lbann.Model(epochs=1, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +### @todo Run on >1 proc when https://github.com/LLNL/lbann/issues/1548 is resolved +for test in tools.create_tests(setup_experiment, __file__, procs_per_node=1, nodes=1): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_check_proto_models.py b/bamboo/unit_tests/test_unit_check_proto_models.py index 353fca3143a..2921931c86c 100644 --- a/bamboo/unit_tests/test_unit_check_proto_models.py +++ b/bamboo/unit_tests/test_unit_check_proto_models.py @@ -5,7 +5,8 @@ import os -def skeleton_models(cluster, dir_name, executables, compiler_name): +def skeleton_models(cluster, dir_name, executables, compiler_name, + weekly, data_reader_percent): if compiler_name not in executables: e = 'skeleton_models: default_exes[%s] does not exist' % compiler_name print('Skip - ' + e) @@ -37,23 +38,6 @@ def skeleton_models(cluster, dir_name, executables, compiler_name): data_filedir_default = '/p/lscratchh/brainusr/datasets/MNIST' data_reader_path = '%s/model_zoo/models/gan/mnist/discriminator_data.prototext' % (dir_name) data_reader_name = None - elif 'triplet' in file_name: - # Disabling triplet test. - print('Skipping triplet tests.') - continue - data_filedir_train_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/patches_84h_110x110_13x13-blur-ab_compact/' - data_filename_train_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/patches_84h_110x110_13x13-blur-ab_compact/train/train_list_8h.nfl.npz' - data_filedir_test_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/patches_84h_110x110_13x13-blur-ab_compact/' - data_filename_test_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/patches_84h_110x110_13x13-blur-ab_compact/val/val_list_8h.nfl.npz' - data_reader_path = '%s/model_zoo/models/siamese/triplet/data_reader_triplet.prototext' % (dir_name) - data_reader_name = None - elif 'siamese_alexnet' in file_name: - data_filedir_train_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/original/train/' - data_filename_train_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/train.txt' - data_filedir_test_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/original/val/' - data_filename_test_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/val.txt' - data_reader_path = '%s/model_zoo/models/siamese/siamese_alexnet/data_reader_imagenet_patches.prototext' % (dir_name) - data_reader_name = None elif 'net' in file_name: data_filedir_train_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/original/train/' data_filename_train_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/train.txt' @@ -65,6 +49,8 @@ def skeleton_models(cluster, dir_name, executables, compiler_name): time_limit = 3 if 'resnet50' in file_name: node_count = 8 + if not weekly: + continue # This is too many nodes for nightly. 
elif 'cifar' in file_name: data_filename_train_default = '/p/lscratchh/brainusr/datasets/cifar10-bin/data_all.bin' data_filename_test_default = '/p/lscratchh/brainusr/datasets/cifar10-bin/test_batch.bin' @@ -98,10 +84,11 @@ def skeleton_models(cluster, dir_name, executables, compiler_name): data_filename_test_default=data_filename_test_default, data_reader_name=data_reader_name, data_reader_path=data_reader_path, + data_reader_percent=data_reader_percent, exit_after_setup=True, model_path=model_path, optimizer_name=opt, output_file_name=output_file_name, - error_file_name=error_file_name) + error_file_name=error_file_name, weekly=weekly) if os.system(cmd) != 0: print("Error detected in " + model_path) #defective_models.append(file_name) @@ -115,30 +102,29 @@ def skeleton_models(cluster, dir_name, executables, compiler_name): print('Errors for: The following models exited with errors %s' % compiler_name) for model in defective_models: print(model) - assert num_defective == 0 - - -def test_unit_models_clang4(cluster, dirname, exes): - skeleton_models(cluster, dirname, exes, 'clang4') + if num_defective != 0: + raise AssertionError( + 'num_defective={nd}\nDefective models:\n{dms}'.format( + nd=num_defective, dms=defective_models)) -def test_unit_models_gcc4(cluster, dirname, exes): - skeleton_models(cluster, dirname, exes, 'gcc4') +def test_unit_models_clang6(cluster, dirname, exes, weekly, data_reader_percent): + skeleton_models(cluster, dirname, exes, 'clang6', weekly, data_reader_percent) -def test_unit_models_gcc7(cluster, dirname, exes): - skeleton_models(cluster, exes, dirname, 'gcc7') +def test_unit_models_gcc7(cluster, dirname, exes, weekly, data_reader_percent): + skeleton_models(cluster, exes, dirname, 'gcc7', weekly, data_reader_percent) -def test_unit_models_intel18(cluster, dirname, exes): - skeleton_models(cluster, dirname, exes, 'intel18') +def test_unit_models_intel19(cluster, dirname, exes, weekly, data_reader_percent): + skeleton_models(cluster, dirname, exes, 'intel19', weekly, data_reader_percent) -# Run with python -m pytest -s test_unit_check_proto_models.py -k 'test_unit_models_exe' --exe= -def test_unit_models_exe(cluster, dirname, exe): +# Run with python3 -m pytest -s test_unit_check_proto_models.py -k 'test_unit_models_exe' --exe= +def test_unit_models_exe(cluster, dirname, exe, weekly, data_reader_percent): if exe is None: e = 'test_unit_models_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe' : exe} - skeleton_models(cluster, dirname, exes, 'exe') + skeleton_models(cluster, dirname, exes, 'exe', weekly, data_reader_percent) diff --git a/bamboo/unit_tests/test_unit_checkpoint.py b/bamboo/unit_tests/test_unit_checkpoint.py index 25ea6614e3b..7c3a36028ae 100644 --- a/bamboo/unit_tests/test_unit_checkpoint.py +++ b/bamboo/unit_tests/test_unit_checkpoint.py @@ -3,46 +3,48 @@ import tools import pytest import os - +from filecmp import dircmp def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, - compiler_name): + compiler_name, weekly, data_reader_percent): if compiler_name not in executables: e = 'skeleton_checkpoint_lenet_shared: default_exes[%s] does not exist' % compiler_name print('Skip - ' + e) pytest.skip(e) exe = executables[compiler_name] - + # Handle data + if data_reader_percent is None: + data_reader_percent = 0.01 # No checkpointing, printing weights to files. 
output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_shared_no_checkpoint_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_shared_no_checkpoint_%s_error.txt' % (dir_name, compiler_name) + os.system('rm -rf ckpt_lenet_shared && mkdir ckpt_lenet_shared') + no_ckpt_dir = 'ckpt_lenet_shared/no_ckpt_{c}'.format(c=compiler_name) command = tools.get_command( cluster=cluster, executable=exe, num_nodes=1, num_processes=2, dir_name=dir_name, data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', model_folder='tests', + data_reader_name='mnist', data_reader_percent=data_reader_percent, + ckpt_dir=no_ckpt_dir, model_folder='tests', model_name='lenet_mnist_ckpt', num_epochs=2, optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) return_code_nockpt = os.system(command) - if return_code_nockpt != 0: - sys.stderr.write('LeNet (no checkpoint) execution failed, exiting with error') - sys.exit(1) - os.system('mv ckpt ckpt_baseline') + tools.assert_success(return_code_nockpt, error_file_name) # Run to checkpoint, printing weights to files. output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_shared_checkpoint_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_shared_checkpoint_%s_error.txt' % (dir_name, compiler_name) + ckpt_dir = 'ckpt_lenet_shared/ckpt_{c}'.format(c=compiler_name) command = tools.get_command( cluster=cluster, executable=exe, num_nodes=1, num_processes=2, dir_name=dir_name, data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', model_folder='tests', + data_reader_name='mnist', data_reader_percent=data_reader_percent, + ckpt_dir=ckpt_dir, model_folder='tests', model_name='lenet_mnist_ckpt', num_epochs=1, optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) return_code_ckpt_1 = os.system(command) - if return_code_ckpt_1 != 0: - sys.stderr.write('LeNet (checkpoint) execution failed, exiting with error') - sys.exit(1) + tools.assert_success(return_code_ckpt_1, error_file_name) # Pick up from checkpoint, printing weights to files. 
output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_shared_restart_%s_output.txt' % (dir_name, compiler_name) @@ -51,104 +53,153 @@ def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, cluster=cluster, executable=exe, num_nodes=1, num_processes=2, dir_name=dir_name, data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', model_folder='tests', + data_reader_name='mnist', data_reader_percent=data_reader_percent, + ckpt_dir=ckpt_dir, model_folder='tests', model_name='lenet_mnist_ckpt', num_epochs=2, optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) return_code_ckpt_2 = os.system(command) - if return_code_ckpt_2 != 0: - sys.stderr.write('LeNet execution (restart from checkpoint) failed, exiting with error') - sys.exit(1) + tools.assert_success(return_code_ckpt_2, error_file_name) - diff_test = os.system('diff -rq ckpt ckpt_baseline') - os.system('rm -rf ckpt*') - assert diff_test == 0 + dcmp = dircmp(ckpt_dir, no_ckpt_dir) + fail, diffs, warns = tools.print_diff_files(dcmp) + for w in warns: + print(w) + + if fail: + print() + for d in diffs: + print(d) + path_prefix = '{d}/bamboo/unit_tests'.format(d=dir_name) + raise AssertionError( + 'Compare {ncd} and {cd} in {p}'.format( + ncd=no_ckpt_dir, cd=ckpt_dir, p=path_prefix)) def skeleton_checkpoint_lenet_distributed(cluster, executables, dir_name, - compiler_name): - if compiler_name not in executables: - e = 'skeleton_checkpoint_lenet_distributed: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - exe = executables[compiler_name] - - # No checkpointing, printing weights to files. - output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_distributed_no_checkpoint_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_distributed_no_checkpoint_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=exe, num_nodes=1, num_processes=2, - dir_name=dir_name, - data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', model_folder='tests', - model_name='lenet_mnist_dist_ckpt', num_epochs=2, optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code_nockpt = os.system(command) - if return_code_nockpt != 0: - sys.stderr.write('LeNet (no checkpoint) execution failed, exiting with error') - sys.exit(1) - os.system('mv ckpt ckpt_baseline') - - # Run to checkpoint, printing weights to files. - output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_distributed_checkpoint_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_distributed_checkpoint_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=exe, num_nodes=1, num_processes=2, - dir_name=dir_name, - data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', model_folder='tests', - model_name='lenet_mnist_dist_ckpt', num_epochs=1, optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code_ckpt_1 = os.system(command) - if return_code_ckpt_1 != 0: - sys.stderr.write('LeNet (checkpoint) execution failed, exiting with error') - sys.exit(1) - - # Pick up from checkpoint, printing weights to files. 
- output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_distributed_restart_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_distributed_restart_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=exe, num_nodes=1, num_processes=2, - dir_name=dir_name, - data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', model_folder='tests', - model_name='lenet_mnist_dist_ckpt', num_epochs=2, optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code_ckpt_2 = os.system(command) - if return_code_ckpt_2 != 0: - sys.stderr.write('LeNet execution (restart from checkpoint) failed, exiting with error') - sys.exit(1) - - diff_test = os.system('diff -rq ckpt ckpt_baseline') - os.system('rm -rf ckpt*') - assert diff_test == 0 - - -def test_unit_checkpoint_lenet_clang4(cluster, exes, dirname): - skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'clang4') - skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'clang4') - - -def test_unit_checkpoint_lenet_gcc4(cluster, exes, dirname): - skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'gcc4') - skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'gcc4') - - -def test_unit_checkpoint_lenet_gcc7(cluster, exes, dirname): - skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'gcc7') - skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'gcc7') - - -def test_unit_checkpoint_lenet_intel18(cluster, exes, dirname): - skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'intel18') - skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'intel18') - - -# Run with python -m pytest -s test_unit_checkpoint.py -k 'test_unit_checkpoint_lenet_exe' --exe= -def test_unit_checkpoint_lenet_exe(cluster, dirname, exe): + compiler_name, + weekly, data_reader_percent): + if compiler_name not in executables: + e = 'skeleton_checkpoint_lenet_distributed: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) + exe = executables[compiler_name] + # Handle data + if data_reader_percent is None: + data_reader_percent = 0.01 + + # No checkpointing, printing weights to files. + output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_distributed_no_checkpoint_%s_output.txt' % (dir_name, compiler_name) + error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_distributed_no_checkpoint_%s_error.txt' % (dir_name, compiler_name) + os.system('rm -rf ckpt_lenet_distributed && mkdir ckpt_lenet_distributed') + no_ckpt_dir = 'ckpt_lenet_distributed/no_ckpt_{c}'.format(c=compiler_name) + command = tools.get_command( + cluster=cluster, executable=exe, num_nodes=1, num_processes=2, + dir_name=dir_name, + data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', + data_reader_name='mnist', data_reader_percent=data_reader_percent, + ckpt_dir=no_ckpt_dir, model_folder='tests', + model_name='lenet_mnist_dist_ckpt', num_epochs=2, optimizer_name='sgd', + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) + return_code_nockpt = os.system(command) + tools.assert_success(return_code_nockpt, error_file_name) + + # Run to checkpoint, printing weights to files. 
+ output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_distributed_checkpoint_%s_output.txt' % (dir_name, compiler_name) + error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_distributed_checkpoint_%s_error.txt' % (dir_name, compiler_name) + ckpt_dir = 'ckpt_lenet_distributed/ckpt_{c}'.format(c=compiler_name) + command = tools.get_command( + cluster=cluster, executable=exe, num_nodes=1, num_processes=2, + dir_name=dir_name, + data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', + data_reader_name='mnist', data_reader_percent=data_reader_percent, + ckpt_dir=ckpt_dir, model_folder='tests', + model_name='lenet_mnist_dist_ckpt', num_epochs=1, optimizer_name='sgd', + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) + return_code_ckpt_1 = os.system(command) + tools.assert_success(return_code_ckpt_1, error_file_name) + + # Pick up from checkpoint, printing weights to files. + output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_distributed_restart_%s_output.txt' % (dir_name, compiler_name) + error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_distributed_restart_%s_error.txt' % (dir_name, compiler_name) + command = tools.get_command( + cluster=cluster, executable=exe, num_nodes=1, num_processes=2, + dir_name=dir_name, + data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', + data_reader_name='mnist', data_reader_percent=data_reader_percent, + ckpt_dir=ckpt_dir, model_folder='tests', + model_name='lenet_mnist_dist_ckpt', num_epochs=2, optimizer_name='sgd', + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) + return_code_ckpt_2 = os.system(command) + tools.assert_success(return_code_ckpt_2, error_file_name) + + dcmp = dircmp(ckpt_dir, no_ckpt_dir) + fail, diffs, warns = tools.print_diff_files(dcmp) + for w in warns: + print(w) + + if fail: + print() + for d in diffs: + print(d) + path_prefix = '{d}/bamboo/unit_tests'.format(d=dir_name) + raise AssertionError( + 'Compare {ncd} and {cd} in {p}'.format( + ncd=no_ckpt_dir, cd=ckpt_dir, p=path_prefix)) + + +def test_unit_checkpoint_lenet_shared_clang6(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'clang6', + weekly, data_reader_percent) + + +def test_unit_checkpoint_lenet_distributed_clang6(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'clang6', + weekly, data_reader_percent) + + +def test_unit_checkpoint_lenet_shared_gcc7(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'gcc7', + weekly, data_reader_percent) + + +def test_unit_checkpoint_lenet_distributed_gcc7(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'gcc7', + weekly, data_reader_percent) + + +def test_unit_checkpoint_lenet_shared_intel19(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'intel19', + weekly, data_reader_percent) + + +def test_unit_checkpoint_lenet_distributed_intel19(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'intel19', + weekly, data_reader_percent) + + +# Run with python3 -m pytest -s test_unit_checkpoint.py -k 'test_unit_checkpoint_lenet_shared_exe' --exe= +def test_unit_checkpoint_lenet_shared_exe(cluster, dirname, exe, + weekly, 
data_reader_percent): + if exe is None: + e = 'test_unit_checkpoint_lenet_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} + skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'exe', + weekly, data_reader_percent) + + +# Run with python3 -m pytest -s test_unit_checkpoint.py -k 'test_unit_checkpoint_lenet_distributed_exe' --exe= +def test_unit_checkpoint_lenet_distributed_exe(cluster, dirname, exe, weekly, data_reader_percent): if exe is None: e = 'test_unit_checkpoint_lenet_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe': exe} - skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'exe') - skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'exe') + skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'exe', weekly, data_reader_percent) diff --git a/bamboo/unit_tests/test_unit_checkpoint_lenet.py b/bamboo/unit_tests/test_unit_checkpoint_lenet.py new file mode 100644 index 00000000000..b052056c654 --- /dev/null +++ b/bamboo/unit_tests/test_unit_checkpoint_lenet.py @@ -0,0 +1,261 @@ +import os.path +import re +import sys +import math +import numpy as np +import google.protobuf.text_format +import pytest +import glob + +# Local files +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Options +# ============================================== + +# Training options +num_epochs = 4 +num_ckpt_epochs = int(float(num_epochs)/2) +mini_batch_size = 64 +num_nodes = 1 +lenet_fraction = 0.1 +random_seed = 20191206 + +test_name_base='test_unit_checkpoint_lenet' +checkpoint_dir='ckpt' + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer(mini_batch_size=mini_batch_size, + random_seed=random_seed) + + # Checkpoint after every epoch + trainer.callbacks = [ + lbann.CallbackCheckpoint( + checkpoint_dir=checkpoint_dir, + checkpoint_epochs=1, + checkpoint_steps=845 + ) + ] + + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.SGD(learn_rate=0.01, momentum=0.9) + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # TODO (tym): Figure out how to switch between LBANN builds. See + # GitHub Issue #1289. 
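+    # lbann.models provides the prebuilt LeNet used below. It is imported
+    # inside the function rather than at module scope, presumably so this
+    # file can still be collected by PyTest when the LBANN Python package is
+    # not importable (see the TODO above).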
+ import lbann.models + + # Manually override the global count so that each model is named the same + lbann.models.LeNet.global_count = 0 + # Layer graph + input_ = lbann.Input() + images = lbann.Identity(input_) + labels = lbann.Identity(input_) + x = lbann.models.LeNet(10)(images) + probs = lbann.Softmax(x) + loss = lbann.CrossEntropy(probs, labels) + acc = lbann.CategoricalAccuracy(probs, labels) + + # Make sure all layers are on CPU + for layer in lbann.traverse_layer_graph(input_): + layer.device = 'cpu' + + # Objects for LBANN model + callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer()] + metrics = [lbann.Metric(acc, name='accuracy', unit='%')] + + # Construct model + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(input_), + objective_function=loss, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # TODO (tym): Figure out how to switch between LBANN builds. See + # GitHub Issue #1289. + import lbann.contrib.lc.paths + + # Load data readers from prototext + dirname = os.path.dirname + lbann_dir = dirname(dirname(dirname(os.path.realpath(__file__)))) + pb_file = os.path.join(lbann_dir, + 'model_zoo', + 'data_readers', + 'data_reader_mnist.prototext') + message = lbann.lbann_pb2.LbannPB() + with open(pb_file, 'r') as f: + google.protobuf.text_format.Merge(f.read(), message) + message = message.data_reader + + # Set location of MNIST data + for reader in message.reader: + reader.data_filedir = lbann.contrib.lc.paths.mnist_dir() + reader.percent_of_data_to_use = lenet_fraction + + + # Validation set + message.reader[0].validation_percent = 0.1 + + return message + +# ============================================== +# Setup PyTest +# ============================================== + +def create_test_func(test_func): + """Augment test function to cascade multiple tests and parse results. + + `tools.create_tests` creates functions that run an LBANN + experiment. This function creates augmented functions that parse + the log files after LBANN finishes running, e.g. to check metrics + or runtimes. + + Note: The naive approach is to define the augmented test functions + in a loop. However, Python closures are late binding. In other + words, the function would be overwritten every time we define it. + We get around this overwriting problem by defining the augmented + function in the local scope of another function. + + Args: + test_func (function): Test function created by + `tools.create_tests`. + + Returns: + function: Test that can interact with PyTest. 
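+
+    A minimal sketch of the late-binding pitfall described above (illustrative
+    only, not part of the test):
+
+        >>> funcs = [lambda: i for i in range(3)]
+        >>> [f() for f in funcs]
+        [2, 2, 2]
+
+    Defining the augmented function inside this factory binds `test_func` when
+    the factory is called, so each generated test keeps its own value.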
+ + """ + test_name = test_func.__name__ + + # Define test function + def func(cluster, exes, dirname, weekly): + + # Run LBANN experiment baseline + print('\n################################################################################') + print('Running baseline model') + print('################################################################################\n') + baseline_test_output = test_func(cluster, exes, dirname) + baseline_training_metrics = tools.collect_metrics_from_log_func(baseline_test_output['stdout_log_file'], 'training epoch [0-9]+ objective function') + baseline_validation_metrics = tools.collect_metrics_from_log_func(baseline_test_output['stdout_log_file'], 'validation objective function') + baseline_test_metrics = tools.collect_metrics_from_log_func(baseline_test_output['stdout_log_file'], 'test objective function') + + # Run LBANN model to checkpoint + print('\n################################################################################') + print('Running initial model to checkpoint') + print('################################################################################\n') + test_func_checkpoint = tools.create_tests( + setup_experiment, + __file__, + test_name_base=test_name_base, + nodes=num_nodes, + work_subdir='checkpoint', + lbann_args=['--disable_cuda=True' + ' --num_epochs='+str(num_ckpt_epochs)], + ) + + checkpoint_test_output = test_func_checkpoint[0](cluster, exes, dirname) + checkpoint_training_metrics = tools.collect_metrics_from_log_func(checkpoint_test_output['stdout_log_file'], 'training epoch [0-9]+ objective function') + checkpoint_validation_metrics = tools.collect_metrics_from_log_func(checkpoint_test_output['stdout_log_file'], 'validation objective function') + checkpoint_test_metrics = tools.collect_metrics_from_log_func(checkpoint_test_output['stdout_log_file'], 'test objective function') + + print('\n################################################################################') + print('Running restarted model to completion') + print('################################################################################\n') + test_func_restart = tools.create_tests( + setup_experiment, + __file__, + test_name_base=test_name_base, + nodes=num_nodes, + work_subdir='restart', + lbann_args=['--disable_cuda=True' + + ' --restart_dir=' + + os.path.join(checkpoint_test_output['work_dir'], checkpoint_dir) + + ' --num_epochs='+str(num_epochs)], + ) + + # Restart LBANN model and run to completion + restart_test_output = test_func_restart[0](cluster, exes, dirname) + restart_training_metrics = tools.collect_metrics_from_log_func(restart_test_output['stdout_log_file'], 'training epoch [0-9]+ objective function') + restart_validation_metrics = tools.collect_metrics_from_log_func(restart_test_output['stdout_log_file'], 'validation objective function') + restart_test_metrics = tools.collect_metrics_from_log_func(restart_test_output['stdout_log_file'], 'test objective function') + + print('\n################################################################################') + print('Comparing results of models') + print('################################################################################\n') + + # Check if metrics are same in baseline and test experiments + # Note: "Print statistics" callback will print up to 6 digits + # of metric values. 
+ + # Comparing training objective functions + tools.compare_metrics(baseline_training_metrics, checkpoint_training_metrics + restart_training_metrics) + # Comparing validation objective functions + tools.compare_metrics(baseline_validation_metrics, checkpoint_validation_metrics + restart_validation_metrics) + # Comparing test objective functions + tools.compare_metrics(baseline_test_metrics, restart_test_metrics) + + baseline_ckpt=os.path.join(baseline_test_output['work_dir'], checkpoint_dir) + checkpoint_ckpt=os.path.join(checkpoint_test_output['work_dir'], checkpoint_dir) + restart_ckpt=os.path.join(restart_test_output['work_dir'], checkpoint_dir) + + err = 0 + err_dirs = '' + fileList = glob.glob('{base}/trainer0/*'.format(base=baseline_ckpt)) + fileList, tmp_err, tmp_err_str = tools.multidir_diff(baseline_ckpt, restart_ckpt, fileList) + err += tmp_err + err_dirs += tmp_err_str + fileList, tmp_err, tmp_err_str = tools.multidir_diff(baseline_ckpt, checkpoint_ckpt, fileList) + err += tmp_err + err_dirs += tmp_err_str + + err_msg = "\nUnmatched checkpoints:\n" + for f in fileList: + err_msg += f + "\n" + assert len(fileList) == 0, \ + 'Extra checkpoint data in baseline directory: ' + err_msg + assert err == 0, err_dirs + + # Return test function from factory function + func.__name__ = test_name + return func + +# Create test functions that can interact with PyTest +for _test_func in tools.create_tests(setup_experiment, + __file__, + test_name_base=test_name_base, + nodes=num_nodes, + work_subdir='baseline', + lbann_args=['--disable_cuda=True']): + globals()[_test_func.__name__] = create_test_func(_test_func) diff --git a/bamboo/unit_tests/test_unit_datareader_python.py b/bamboo/unit_tests/test_unit_datareader_python.py new file mode 100644 index 00000000000..618e5a8f77e --- /dev/null +++ b/bamboo/unit_tests/test_unit_datareader_python.py @@ -0,0 +1,130 @@ +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(20190708) +_num_samples = 29 +_sample_size = 7 +_samples = np.random.normal(size=(_num_samples,_sample_size)) +_samples = _samples.astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 4 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. 
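+
+    The model applies L2Norm2 to each input sample and uses
+    CallbackCheckMetric to verify the mean metric value against a NumPy
+    reference computed from the same samples.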
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Layer graph + x = lbann.Input() + y = lbann.L2Norm2(x) + layers = list(lbann.traverse_layer_graph(x)) + metric = lbann.Metric(y, name='obj') + callbacks = [] + + # Compute expected value with NumPy + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = tools.numpy_l2norm2(x) + vals.append(y) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metric.name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # Construct model + num_epochs = 0 + return lbann.Model(num_epochs, + layers=layers, + metrics=[metric], + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_datastore_imagenet.py b/bamboo/unit_tests/test_unit_datastore_imagenet.py new file mode 100644 index 00000000000..69519ed26fc --- /dev/null +++ b/bamboo/unit_tests/test_unit_datastore_imagenet.py @@ -0,0 +1,324 @@ +import os.path +import re +import sys +import math +import numpy as np +import pytest + +# Local files +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Options +# ============================================== + +# Training options +num_epochs = 5 +mini_batch_size = 256 +num_nodes = 2 +imagenet_fraction = 0.0031971 # Train with 4096 out of 1.28M samples +validation_percent = 0.1 +random_seed = 20191206 + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer(mini_batch_size=mini_batch_size, random_seed=random_seed) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.SGD(learn_rate=0.01, momentum=0.9) + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # TODO (tym): Figure out how to switch between LBANN builds. See + # GitHub Issue #1289. 
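+    # The metric below scales each sample's squared L2 norm by the square
+    # root of its mini-batch index, which presumably makes it sensitive to
+    # sample ordering so that data-store runs can be checked against the
+    # baseline run.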
+ import lbann.models + + # Layer graph + input_ = lbann.Input() + x = lbann.Identity(input_) + y = lbann.L2Norm2(x) + z = lbann.Multiply(y, lbann.Sqrt(lbann.MiniBatchIndex())) + + # Make sure all layers are on CPU + for layer in lbann.traverse_layer_graph(input_): + layer.device = 'cpu' + + # Objects for LBANN model + callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer()] + metrics = [lbann.Metric(z, name='metric')] + + # Construct model + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(input_), + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for ImageNet data reader. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # TODO (tym): Figure out how to switch between LBANN builds. See + # GitHub Issue #1289. + import lbann.contrib.lc.paths + + # Construct data reader + message = lbann.reader_pb2.DataReader() + reader = message.reader.add() + + # Configure data reader + reader.name = 'imagenet' + reader.role = 'train' + reader.shuffle = False + reader.data_filedir = lbann.contrib.lc.paths.imagenet_dir(data_set='train') + reader.data_filename = lbann.contrib.lc.paths.imagenet_labels(data_set='train') + reader.percent_of_data_to_use = imagenet_fraction + reader.validation_percent = validation_percent + reader.num_labels = 1000 + reader.shuffle = True + + # Configure transforms + # Note: The image just resized to 32x32 + resize = reader.transforms.add().resize + resize.SetInParent() + resize.height = 32 + resize.width = 32 + colorize = reader.transforms.add().colorize + colorize.SetInParent() + normalize = reader.transforms.add().to_lbann_layout + normalize.SetInParent() + + return message + +# ============================================== +# Setup PyTest +# ============================================== +def run_datastore_test_func(test_func, baseline_metrics, cluster, exes, dirname, profile_data) : + '''Executes the input test function + + Args: + run_datastore_test_func (function): test function + baseline_metrics: list of metrics against which the output of + the test function will be compared + profile_data: dictionary of key, value pairs for testing + entries in the output file: data_store_profile_train.txt + + Returns: + list containg test name, pass/fail, etc. + On error, this will have the form: + ['FAILED', , ] + on success: + ['passed', ] + ''' + datastore_test_output = test_func(cluster, exes, dirname) + + test_name = test_func.__name__ + r = ['passed', test_name] + datastore_metrics = [] + with open(datastore_test_output['stdout_log_file']) as f: + for line in f: + match = re.search('validation metric : ([0-9.]+)', line) + if match: + datastore_metrics.append(float(match.group(1))) + + # Check if metrics are same in baseline and data store experiments + # Note: "Print statistics" callback will print up to 6 digits + # of metric values. 
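+    # The checks below allow for float32 round-off (8*eps relative) and for
+    # the ~6 significant digits printed by the callback (1.5 units in the
+    # sixth significant digit of the baseline value).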
+ if len(baseline_metrics) != len(datastore_metrics) : + r[0] = 'FAILED' + r.append('baseline and data store experiments did not run for same number of epochs; num baseline: ' + str(len(baseline_metrics)) + '; num ds: ' + str(len(datastore_metrics))) + + for i in range(len(datastore_metrics)): + x = baseline_metrics[i] + xhat = datastore_metrics[i] + eps = np.finfo(np.float32).eps + ceillogx = int(math.ceil(math.log10(x))) + if abs(x-xhat) >= max(8*eps*x, 1.5*10**(ceillogx-6)) : + r[0] = 'FAILED' + r.append('found large discrepancy in metrics for baseline and data store experiments') + + # Check if entries profile_data exist and have correct values + d = None + for key in profile_data.keys() : + if test_name.find(key) != -1 : + d = profile_data[key] + break + assert d != None, 'failed to find key for profile_data' + + found_profile_data = {} + with open(datastore_test_output['work_dir'] + '/data_store_profile_train.txt') as f: + for line in f: + for key in d : + if key in line and key not in found_profile_data.keys() : + t = line.split() + found_profile_data[key] = t[-1] + + for key in d.keys() : + if key not in found_profile_data.keys() : + r[0] = 'FAILED' + r.append('missing key in profile_data: ' + key) + elif found_profile_data[key] != d[key] : + r[0] = 'FAILED' + r.append('bad value for "' + key + '; value is: ' + str(found_profile_data[key]) + '; should be: ' + str(d[key])) + return r + +def run_baseline_test_func(baseline_test_func, cluster, exes, dirname) : + '''Executes the input test function + + Args: + baseline_test_func (function): test function + + Returns: + list of metrics that are parsed from the function's + output log + ''' + baseline_test_output = baseline_test_func(cluster, exes, dirname) + baseline_metrics = [] + with open(baseline_test_output['stdout_log_file']) as f: + for line in f: + match = re.search('validation metric : ([0-9.]+)', line) + if match: + baseline_metrics.append(float(match.group(1))) + + assert len(baseline_metrics) > 0, 'failed to parse baseline_metrics; len: ' + str(len(baseline_metrics)) + return baseline_metrics + +def create_test_func(baseline_test_func, datastore_test_funcs, profile_data=None) : + """Augment test function to parse log files. + + `tools.create_tests` creates functions that run an LBANN + experiment. This function creates augmented functions that parse + the log files after LBANN finishes running, e.g. to check metrics + or runtimes. + + Note: The naive approach is to define the augmented test functions + in a loop. However, Python closures are late binding. In other + words, the function would be overwritten every time we define it. + We get around this overwriting problem by defining the augmented + function in the local scope of another function. + + Args: + test_func (function): Test function created by + `tools.create_tests`. + + Returns: + function: Test that can interact with PyTest. 
+ + """ + # Define test function + def func(cluster, exes, dirname, weekly): + # Run LBANN experiment without data store + baseline_metrics = run_baseline_test_func(baseline_test_func, cluster, exes, dirname) + + # Run LBANN experiments with data store + num_failed = 0 + results = [] + for i in range(len(datastore_test_funcs)) : + r = run_datastore_test_func(datastore_test_funcs[i], baseline_metrics, cluster, exes, dirname, profile_data) + results.append(r) + if len(r) > 2 : + num_failed += 1 + + work = [] + for x in results : + work.append(' :: '.join(x)) + result_string = '\n'.join(work) + assert num_failed == 0, '\n' + result_string + + print('\n===============================================') + print('data_store test synopsis:') + print(result_string) + print('===============================================\n') + + # Return test function from factory function + func.__name__ = baseline_test_func.__name__ + return func + +# Create test functions that can interact with PyTest +def make_test(name, test_by_platform_list=[], args=[]) : + test_list = tools.create_tests( + setup_experiment, + __file__, + nodes=num_nodes, + test_name_base=name, + lbann_args=args) + + if test_by_platform_list != [] : + for i in range(len(test_list)) : + test_by_platform_list[i].append(test_list[i]) + return test_list + +baseline_tests = make_test('nodatastore') + +datastore_tests = [[] for j in range(len(baseline_tests))] + +# Dictionary of dictionaries; this will contain data for testing +# the output file: data_store_profile_train.txt +profile_data = {} + +# handles for entries in the profile_data dictionaries +is_e = 'is_explicitly_loading' +is_l = 'is_local_cache' +is_f = 'is_fully_loaded' + +# test checkpoint, preload +test_name = 'data_store_checkpoint_preload' +make_test(test_name, datastore_tests, ['--preload_data_store', '--data_store_test_checkpoint=CHECKPOINT', '--data_store_profile']) +profile_data[test_name] = {is_e : '0', is_l : '0', is_f : '1'} + +# test checkpoint, explicit +test_name = 'data_store_checkpoint_explicit' +make_test(test_name, datastore_tests, ['--use_data_store', '--data_store_test_checkpoint=CHECKPOINT', '--data_store_profile']) +profile_data[test_name] = {is_e : '1', is_l : '0', is_f : '0'} + +# explicit loading +test_name = 'data_store_explicit' +make_test(test_name, datastore_tests, ['--use_data_store', '--data_store_profile']) +profile_data[test_name] = {is_e : '1', is_l : '0', is_f : '0'} + +# preloading +test_name = 'data_store_preload' +make_test(test_name, datastore_tests, ['--preload_data_store', '--data_store_profile']) +profile_data[test_name] = {is_e : '0', is_l : '0', is_f : '1'} + +#local cache with explicit loading (internally, this should run identically +#with the flag: --preload_data_store +test_name = 'data_store_cache_explicit' +make_test(test_name, datastore_tests, ['--data_store_cache', '--data_store_profile']) +profile_data[test_name] = {is_e : '1', is_l : '1', is_f : '0'} + +#local cache with preloading +test_name = 'data_store_cache_preloading' +make_test(test_name, datastore_tests, ['--data_store_cache', '--preload_data_store', '--data_store_profile']) +profile_data[test_name] = {is_e : '0', is_l : '1', is_f : '0'} + +#test local cache +test_name = 'data_store_test_cache' +make_test(test_name, datastore_tests, ['--data_store_cache', '--preload_data_store', '--data_store_test_cache', '--data_store_profile']) +profile_data[test_name] = {is_e : '0', is_l : '1', is_f : '0'} + +for i in range(len(datastore_tests)): + _test_func = 
create_test_func(baseline_tests[i], datastore_tests[i], profile_data) + globals()[_test_func.__name__] = _test_func diff --git a/bamboo/unit_tests/test_unit_layer_argmax.py b/bamboo/unit_tests/test_unit_layer_argmax.py new file mode 100644 index 00000000000..4228fd5bad4 --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_argmax.py @@ -0,0 +1,147 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(20190911) +_num_samples = 35 +_sample_dims = (11,) +_sample_size = functools.reduce(operator.mul, _sample_dims) +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) +_samples[1,:] = 0.5 +_samples[15,:] = -1.0 +_samples[15,3] = -0.5 +_samples[15,5] = -0.5 + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Convenience function to compute L2 norm squared with NumPy + def l2_norm2(x): + x = x.reshape(-1) + return np.inner(x, x) + + # LBANN implementation + x = lbann.Reshape(lbann.Input(), dims=tools.str_list(_sample_dims)) + y = lbann.Argmax(x, device='cpu') + z = lbann.L2Norm2(y) + + # Objects for LBANN model + obj = z + metric = lbann.Metric(z, name='obj') + layers = list(lbann.traverse_layer_graph(z)) + callbacks = [] + + # Get expected metric value from NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape(_sample_dims).astype(np.float64) + y = np.argmax(x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metric.name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # Construct model + num_epochs = 0 + return lbann.Model(num_epochs, + layers=layers, + objective_function=obj, + metrics=metric, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
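+    # Two otherwise-identical readers are registered: the 'test' reader drives
+    # the metric check (the model runs zero training epochs), and the 'train'
+    # reader is a placeholder needed until the issue above is resolved.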
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_argmin.py b/bamboo/unit_tests/test_unit_layer_argmin.py new file mode 100644 index 00000000000..b6830f0891a --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_argmin.py @@ -0,0 +1,147 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(201909112) +_num_samples = 37 +_sample_dims = (11,) +_sample_size = functools.reduce(operator.mul, _sample_dims) +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) +_samples[1,:] = 0.5 +_samples[15,:] = 1.0 +_samples[15,3] = 0.5 +_samples[15,5] = 0.5 + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. 
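+
+    The model applies Argmin to each sample, takes the squared L2 norm of
+    the resulting index, and checks the mean against a NumPy np.argmin
+    reference via CallbackCheckMetric.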
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # LBANN implementation + x = lbann.Reshape(lbann.Input(), dims=tools.str_list(_sample_dims)) + y = lbann.Argmin(x, device='cpu') + z = lbann.L2Norm2(y) + + # Objects for LBANN model + obj = z + metric = lbann.Metric(z, name='obj') + layers = list(lbann.traverse_layer_graph(z)) + callbacks = [] + + # Get expected metric value from NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape(_sample_dims).astype(np.float64) + y = np.argmin(x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metric.name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + num_epochs = 0 + return lbann.Model(num_epochs, + layers=layers, + objective_function=obj, + metrics=metric, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +# Note: Create test name by removing ".py" from file name +_test_name = os.path.splitext(os.path.basename(current_file))[0] +for test in tools.create_tests(setup_experiment, _test_name): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_channelwise_fully_connected.py b/bamboo/unit_tests/test_unit_layer_channelwise_fully_connected.py new file mode 100644 index 00000000000..bed7e415795 --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_channelwise_fully_connected.py @@ -0,0 +1,309 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. 
+ +# Data +np.random.seed(20200113) +_num_samples = 17 +_sample_dims = (5,7,3) +_sample_size = functools.reduce(operator.mul, _sample_dims) +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) +_scale = np.random.normal(loc=1, size=(_sample_dims[0],1,1)).astype(np.float32) +_bias = np.random.normal(loc=0, size=(_sample_dims[0],1,1)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x0 = lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_dims)) + x1 = lbann.Reshape(lbann.Input(), dims=tools.str_list(_sample_dims)) + x = lbann.Sum(x0, x1) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Compute expected metric values with NumPy + # ------------------------------------------ + + # Input and output dimensions + input_channel_dims = _sample_dims[1:] + output_channel_dims = (2,5) + input_channel_size = functools.reduce(operator.mul, input_channel_dims) + output_channel_size = functools.reduce(operator.mul, output_channel_dims) + + # Weight values + linearity = np.random.normal( + size=(output_channel_size,input_channel_size) + ).astype(np.float32) + bias = np.random.normal(size=(output_channel_size,1)).astype(np.float32) + + # With bias + x = (_samples + .reshape((-1,input_channel_size)) + .transpose() + .astype(np.float64)) + y = np.matmul(linearity.astype(np.float64), x) + bias.astype(np.float64) + z = tools.numpy_l2norm2(y) / _num_samples + val_with_bias = z + + # Without bias + x = (_samples + .reshape((-1,input_channel_size)) + .transpose() + .astype(np.float64)) + y = np.matmul(linearity.astype(np.float64), x) + z = tools.numpy_l2norm2(y) / _num_samples + val_without_bias = z + + # ------------------------------------------ + # Data-parallel layout, non-transpose, bias + # ------------------------------------------ + + # LBANN implementation + linearity_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer( + values=tools.str_list(np.nditer(linearity, order='F')) + ) + ) + bias_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer( + values=tools.str_list(np.nditer(bias)) + ) + ) + x = x_lbann + y = lbann.ChannelwiseFullyConnected( + x, + weights=(linearity_weights, bias_weights), + output_channel_dims=output_channel_dims, + ) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout, non-transpose, bias')) + + # NumPy implementation + tol = 8 * 
val_with_bias * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val_with_bias-tol, + upper_bound=val_with_bias+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Data-parallel layout, non-transpose, no bias + # ------------------------------------------ + + # LBANN implementation + linearity_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer( + values=tools.str_list(np.nditer(linearity, order='F')) + ) + ) + x = x_lbann + y = lbann.ChannelwiseFullyConnected( + x, + weights=(linearity_weights), + output_channel_dims=output_channel_dims, + bias=False, + ) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout, non-transpose, no bias')) + + # NumPy implementation + tol = 8 * val_without_bias * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val_without_bias-tol, + upper_bound=val_without_bias+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Data-parallel layout, transpose, bias + # ------------------------------------------ + + # LBANN implementation + linearity_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer( + values=tools.str_list(np.nditer(linearity, order='C')) + ) + ) + bias_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer( + values=tools.str_list(np.nditer(bias)) + ) + ) + x = x_lbann + y = lbann.ChannelwiseFullyConnected( + x, + weights=(linearity_weights, bias_weights), + output_channel_dims=output_channel_dims, + transpose=True, + ) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout, transpose, bias')) + + # NumPy implementation + tol = 8 * val_with_bias * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val_with_bias-tol, + upper_bound=val_with_bias+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Data-parallel layout, transpose, no bias + # ------------------------------------------ + + # LBANN implementation + linearity_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer( + values=tools.str_list(np.nditer(linearity, order='C')) + ) + ) + x = x_lbann + y = lbann.ChannelwiseFullyConnected( + x, + weights=(linearity_weights), + output_channel_dims=output_channel_dims, + bias=False, + transpose=True, + ) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout, transpose, no bias')) + + # NumPy implementation + tol = 8 * val_without_bias * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val_without_bias-tol, + upper_bound=val_without_bias+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def 
construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_channelwise_scale_bias.py b/bamboo/unit_tests/test_unit_layer_channelwise_scale_bias.py new file mode 100644 index 00000000000..1df73392538 --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_channelwise_scale_bias.py @@ -0,0 +1,161 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(20190719) +_num_samples = 23 +_sample_dims = (7,5,3) +_sample_size = functools.reduce(operator.mul, _sample_dims) +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) +_scale = np.random.normal(loc=1, size=(_sample_dims[0],1,1)).astype(np.float32) +_bias = np.random.normal(loc=0, size=(_sample_dims[0],1,1)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
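+    # A zero-initialized WeightsLayer is summed with the input so that the
+    # model has trainable weights upstream of the layer under test; the
+    # CallbackCheckGradients callback added below can then verify the error
+    # signals propagated back through it.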
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x0 = lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_dims)) + x1 = lbann.Reshape(lbann.Input(), dims=tools.str_list(_sample_dims)) + x = lbann.Sum(x0, x1) + + # Apply channel-wise scale/bias + scale_values = tools.str_list(np.nditer(_scale)) + bias_values = tools.str_list(np.nditer(_bias)) + scalebias_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer(values='{} {}'.format(scale_values, + bias_values)), + name='scalebias_weights' + ) + y = lbann.ChannelwiseScaleBias(x, weights=scalebias_weights) + z = lbann.L2Norm2(y) + + # Objects for LBANN model + obj = z + metric = lbann.Metric(z, name='obj') + layers = list(lbann.traverse_layer_graph(z)) + callbacks = [] + + # Get expected metric value from NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape(_sample_dims).astype(np.float64) + y = _scale.astype(np.float64) * x + _bias.astype(np.float64) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metric.name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # Gradient checking + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # Construct model + num_epochs = 0 + return lbann.Model(num_epochs, + layers=layers, + objective_function=obj, + metrics=metric, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_channelwise_softmax.py b/bamboo/unit_tests/test_unit_layer_channelwise_softmax.py new file mode 100644 index 00000000000..921ecee364b --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_channelwise_softmax.py @@ -0,0 +1,176 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. 
+ +# Data +np.random.seed(20200115) +_num_samples = 15 +_sample_dims = (5,2,7) +_sample_size = functools.reduce(operator.mul, _sample_dims) +_samples = np.random.normal(loc=0.5, size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# NumPy implementation +# ============================================== + +def numpy_channelwise_softmax(x): + if x.dtype is not np.float64: + x = x.astype(np.float64) + axis = tuple(range(1,x.ndim)) + shift = np.max(x, axis=axis, keepdims=True) + y = np.exp(x-shift) + return y / np.sum(y, axis=axis, keepdims=True) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_dims)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_dims))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.ChannelwiseSoftmax(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape(_sample_dims).astype(np.float64) + y = numpy_channelwise_softmax(x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_clamp.py b/bamboo/unit_tests/test_unit_layer_clamp.py index 8cd7d579374..6ddd53a6d12 100644 --- a/bamboo/unit_tests/test_unit_layer_clamp.py +++ b/bamboo/unit_tests/test_unit_layer_clamp.py @@ -1,49 +1,196 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +# Note: The clamp function is not differentiable at the interval +# boundaries, so we make sure values are well inside or well outside +# the interval. +np.random.seed(201910241) +_num_samples = 27 +_sample_size = 11 +_samples = np.random.choice([-193.0,-4.0,-1.0,1.0,3.0,5.0,2003.0], + size=(_num_samples,_sample_size)) +_samples += np.random.uniform(-0.5,0.5, size=_samples.shape) +_samples = _samples.astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Clamp(x, min=-2, max=2, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel output')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.clip(x, -2, 2) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Clamp(x, min=0, max=4, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel output')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.clip(x, 0, 4) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_clamp(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_clamp: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_clamp_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_clamp_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='clamp', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_clamp_clang4(cluster, exes, dirname): - skeleton_layer_clamp(cluster, exes, dirname, 'clang4') + # ------------------------------------------ + # Construct model + # ------------------------------------------ + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_clamp_gcc4_check(cluster, exes, dirname): - skeleton_layer_clamp(cluster, exes, dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. 
+ The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_clamp_gcc7(cluster, exes, dirname): - skeleton_layer_clamp(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_clamp_intel18(cluster, exes, dirname): - skeleton_layer_clamp(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_layer_clamp.py -k 'test_unit_layer_clamp_exe' --exe= -def test_unit_layer_clamp_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_clamp_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_clamp(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_concatenate.py b/bamboo/unit_tests/test_unit_layer_concatenate.py new file mode 100644 index 00000000000..946f8ec1fab --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_concatenate.py @@ -0,0 +1,278 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np +import pytest + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(20191204) +_num_samples = 17 +_sample_size = 60 +_samples = np.random.normal(size=(_num_samples,_sample_size), loc=1).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
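+ # In each check below, the sample is sliced into pieces, the pieces are
+ # re-concatenated in a permuted order, and the metric compares
+ #     L2Norm2(x * concat(permuted slices))
+ # against the equivalent NumPy value, e.g. for the axis-0 case:
+ #     y = np.concatenate((x[3:5], x[1:3], x[0:1]), axis=0)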
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Input(), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # -------------------------- + # Concatenate along axis 0 + # -------------------------- + + # LBANN implementation + x = x_lbann + x = lbann.Reshape(x, dims=tools.str_list([5,3,4])) + x_slice = lbann.Slice(x, axis=0, slice_points=tools.str_list([0,1,3,5])) + x1 = lbann.Identity(x_slice) + x2 = lbann.Identity(x_slice) + x3 = lbann.Identity(x_slice) + y = lbann.Concatenation(x3, x2, x1, axis=0) + z = lbann.L2Norm2(lbann.Multiply(x, y)) + obj.append(z) + metrics.append(lbann.Metric(z, name='axis0')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape([5,3,4]).astype(np.float64) + x1 = x[0:1,:,:] + x2 = x[1:3,:,:] + x3 = x[3:5,:,:] + y = np.concatenate((x3, x2, x1), axis=0) + z = tools.numpy_l2norm2(x*y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # -------------------------- + # Slice along axis 1 + # -------------------------- + + # LBANN implementation + x = x_lbann + x = lbann.Reshape(x, dims=tools.str_list([3,4,5])) + x_slice = lbann.Slice(x, axis=1, slice_points=tools.str_list([0,1,3,4])) + x1 = lbann.Identity(x_slice) + x2 = lbann.Identity(x_slice) + x3 = lbann.Identity(x_slice) + y = lbann.Concatenation(x2, x1, x3, axis=1) + z = lbann.L2Norm2(lbann.Multiply(x, y)) + obj.append(z) + metrics.append(lbann.Metric(z, name='axis1')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape([3,4,5]).astype(np.float64) + x1 = x[:,0:1,:] + x2 = x[:,1:3,:] + x3 = x[:,3:4,:] + y = np.concatenate((x2, x1, x3), axis=1) + z = tools.numpy_l2norm2(x*y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # -------------------------- + # Slice along axis 2 + # -------------------------- + + # LBANN implementation + x = x_lbann + x = lbann.Reshape(x, dims=tools.str_list([3,4,5])) + x_slice = lbann.Slice(x, axis=2, slice_points=tools.str_list([0,1,2,3,5])) + x1 = lbann.Identity(x_slice) + x2 = lbann.Identity(x_slice) + x3 = lbann.Identity(x_slice) + x4 = lbann.Identity(x_slice) + y = lbann.Concatenation(x2, x4, x1, x3, axis=2) + z = lbann.L2Norm2(lbann.Multiply(x, y)) + obj.append(z) + metrics.append(lbann.Metric(z, name='axis2')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape([3,4,5]).astype(np.float64) + x1 = x[:,:,0:1] + x2 = x[:,:,1:2] + x3 = x[:,:,2:3] + x4 = x[:,:,3:5] + y = np.concatenate((x2, x4, x1, x3), axis=2) + z = tools.numpy_l2norm2(x*y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # -------------------------- + # Model-parallel + # -------------------------- + + # LBANN implementation + x = x_lbann + x = 
lbann.Reshape(x, dims=tools.str_list([60])) + x_slice = lbann.Slice(x, slice_points=tools.str_list([0,22,23,60])) + x1 = lbann.Identity(x_slice) + x2 = lbann.Identity(x_slice) + x3 = lbann.Identity(x_slice) + y = lbann.Concatenation(x3, x1, x2, data_layout='model_parallel') + z = lbann.L2Norm2(lbann.Multiply(x, y)) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape([60]).astype(np.float64) + x1 = x[0:22] + x2 = x[22:23] + x3 = x[23:60] + y = np.concatenate((x3, x1, x2)) + z = tools.numpy_l2norm2(x*y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # -------------------------- + # Gradient checking + # -------------------------- + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # -------------------------- + # Construct model + # -------------------------- + + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_convolution.py b/bamboo/unit_tests/test_unit_layer_convolution.py new file mode 100644 index 00000000000..275b4d9d94d --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_convolution.py @@ -0,0 +1,327 @@ +import functools +import math +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +def make_random_array(shape, seed): + """Hacked function to generate a random array. + + NumPy's RNG produces different values with different NumPy + versions. This function is helpful when array values must be + identical across all runs, e.g. when checking against precomputed + metric values. + + Args: + shape (Iterable of int): Array dimensions + seed (int): Parameter for RNG. Must be non-zero. 
+ Returns: + numpy.ndarray: Array of `np.float32`. Values will be in + [-0.5,0.5). + + """ + size = functools.reduce(operator.mul, shape) + eps = np.finfo(np.float32).eps + x = (seed / np.linspace(math.sqrt(eps), 0.1, size)) % 1 - 0.5 + return x.reshape(shape).astype(np.float32) + +# Data +_num_samples = 23 +_sample_dims = [6,11,7] +_sample_size = functools.reduce(operator.mul, _sample_dims) +_samples = make_random_array([_num_samples] + _sample_dims, 7) + +# Sample access functions +def get_sample(index): + return _samples[index,:].reshape(-1) +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# PyTorch convolution +# ============================================== + +def pytorch_convolution(data, + kernel, + bias=None, + stride=1, + padding=0, + dilation=1, + groups=1): + """Wrapper around PyTorch convolution. + + Input and output data are NumPy arrays. + + """ + + # Convert input data to PyTorch tensors with 64-bit floats + import torch + import torch.nn.functional + if type(data) is np.ndarray: + data = torch.from_numpy(data) + if type(kernel) is np.ndarray: + kernel = torch.from_numpy(kernel) + if type(bias) is np.ndarray: + bias = torch.from_numpy(bias) + if data.dtype is not torch.float64: + data = data.astype(torch.float64) + if kernel.dtype is not torch.float64: + kernel = kernel.astype(torch.float64) + if bias.dtype is not torch.float64: + bias = bias.astype(torch.float64) + + # Perform convolution with PyTorch + output = None + if len(kernel.shape) == 3: + output = torch.nn.functional.conv1d( + data, kernel, bias, stride, padding, dilation, groups + ) + if len(kernel.shape) == 4: + output = torch.nn.functional.conv2d( + data, kernel, bias, stride, padding, dilation, groups + ) + if len(kernel.shape) == 5: + output = torch.nn.functional.conv3d( + data, kernel, bias, stride, padding, dilation, groups + ) + if output is None: + raise ValueError('PyTorch only supports 1D, 2D, and 3D convolution') + + # Return output as NumPy array + return output.numpy() + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
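+ # Each convolution below is checked against a PyTorch reference computed by
+ # pytorch_convolution(); if PyTorch is not importable (or the conversion
+ # fails), the surrounding try/except falls back to a hard-coded precomputed
+ # value for the metric bound.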
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_dims)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_dims))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Basic 3x3 convolution + # ------------------------------------------ + # 3x3 conv, stride=1, pad=1, dilation=1, bias + + # Convolution settings + kernel_dims = (5, _sample_dims[0], 3, 3) + strides = (1, 1) + pads = (1, 1) + dilations = (1, 1) + kernel = make_random_array(kernel_dims, 11) + bias = make_random_array([kernel_dims[0]], 123) + + # Apply convolution + kernel_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer(values=tools.str_list(np.nditer(kernel))), + name='kernel1' + ) + bias_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer(values=tools.str_list(np.nditer(bias))), + name='bias1' + ) + x = x_lbann + y = lbann.Convolution(x, + weights=(kernel_weights, bias_weights), + num_dims=3, + num_output_channels=kernel_dims[0], + has_vectors=True, + conv_dims=tools.str_list(kernel_dims[2:]), + conv_strides=tools.str_list(strides), + conv_pads=tools.str_list(pads), + conv_dilations=tools.str_list(dilations), + has_bias=True) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='basic 3x3 convolution')) + + # PyTorch implementation + try: + x = _samples + y = pytorch_convolution( + x, kernel, bias=bias, + stride=strides, padding=pads, dilation=dilations + ) + z = tools.numpy_l2norm2(y) / _num_samples + val = z + except: + # Precomputed value + val = 153.84937996554953 + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # 2x4 strided convolution + # ------------------------------------------ + + # Convolution settings + kernel_dims = (3, _sample_dims[0], 2, 4) + strides = (3, 1) + pads = (3, 0) + dilations = (1, 1) + num_groups = 1 + kernel = make_random_array(kernel_dims, 19) + + # Apply convolution + kernel_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer(values=tools.str_list(np.nditer(kernel))), + name='kernel2' + ) + x = x_lbann + y = lbann.Convolution(x, + weights=(kernel_weights), + num_dims=3, + num_output_channels=kernel_dims[0], + has_vectors=True, + conv_dims=tools.str_list(kernel_dims[2:]), + conv_strides=tools.str_list(strides), + conv_pads=tools.str_list(pads), + conv_dilations=tools.str_list(dilations), + num_groups=num_groups, + has_bias=False) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='2x4 convolution')) + + # PyTorch implementation + try: + x = _samples + y = pytorch_convolution( + x, kernel, bias=None, + stride=strides, padding=pads, + dilation=dilations, groups=num_groups + ) + z = tools.numpy_l2norm2(y) / _num_samples + val = z + except: + # Precomputed value + val = 19.24587403346207 + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # 
------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +# Note: Create test name by removing ".py" from file name +_test_name = os.path.splitext(os.path.basename(current_file))[0] +for test in tools.create_tests(setup_experiment, _test_name): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_convolution_distconv.py b/bamboo/unit_tests/test_unit_layer_convolution_distconv.py new file mode 100644 index 00000000000..74e665a09cc --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_convolution_distconv.py @@ -0,0 +1,332 @@ +import functools +import math +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +def make_random_array(shape, seed): + """Hacked function to generate a random array. + + NumPy's RNG produces different values with different NumPy + versions. This function is helpful when array values must be + identical across all runs, e.g. when checking against precomputed + metric values. + + Args: + shape (Iterable of int): Array dimensions + seed (int): Parameter for RNG. Must be non-zero. + Returns: + numpy.ndarray: Array of `np.float32`. Values will be in + [-0.5,0.5). 
+ + """ + size = functools.reduce(operator.mul, shape) + eps = np.finfo(np.float32).eps + x = (seed / np.linspace(math.sqrt(eps), 0.1, size)) % 1 - 0.5 + return x.reshape(shape).astype(np.float32) + +# Data +_num_samples = 23 +_sample_dims = [6,11,7] +_sample_size = functools.reduce(operator.mul, _sample_dims) +_samples = make_random_array([_num_samples] + _sample_dims, 7) + +# Sample access functions +def get_sample(index): + return _samples[index,:].reshape(-1) +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# PyTorch convolution +# ============================================== + +def pytorch_convolution(data, + kernel, + bias=None, + stride=1, + padding=0, + dilation=1, + groups=1): + """Wrapper around PyTorch convolution. + + Input and output data are NumPy arrays. + + """ + + # Convert input data to PyTorch tensors with 64-bit floats + import torch + import torch.nn.functional + if type(data) is np.ndarray: + data = torch.from_numpy(data) + if type(kernel) is np.ndarray: + kernel = torch.from_numpy(kernel) + if type(bias) is np.ndarray: + bias = torch.from_numpy(bias) + if data.dtype is not torch.float64: + data = data.astype(torch.float64) + if kernel.dtype is not torch.float64: + kernel = kernel.astype(torch.float64) + if bias.dtype is not torch.float64: + bias = bias.astype(torch.float64) + + # Perform convolution with PyTorch + output = None + if len(kernel.shape) == 3: + output = torch.nn.functional.conv1d( + data, kernel, bias, stride, padding, dilation, groups + ) + if len(kernel.shape) == 4: + output = torch.nn.functional.conv2d( + data, kernel, bias, stride, padding, dilation, groups + ) + if len(kernel.shape) == 5: + output = torch.nn.functional.conv3d( + data, kernel, bias, stride, padding, dilation, groups + ) + if output is None: + raise ValueError('PyTorch only supports 1D, 2D, and 3D convolution') + + # Return output as NumPy array + return output.numpy() + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size=mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def create_parallel_strategy(num_height_groups): + return {"height_groups": num_height_groups} + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
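+ # This test mirrors test_unit_layer_convolution.py, but attaches a Distconv
+ # parallel strategy to each convolution, i.e.
+ #     parallel_strategy=create_parallel_strategy(4)  # {'height_groups': 4}
+ # which presumably splits the tensor height across four processes; the
+ # PyTorch/precomputed reference values are unchanged.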
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_dims)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_dims))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Basic 3x3 convolution + # ------------------------------------------ + # 3x3 conv, stride=1, pad=1, dilation=1, bias + + # Convolution settings + kernel_dims = (5, _sample_dims[0], 3, 3) + strides = (1, 1) + pads = (1, 1) + dilations = (1, 1) + kernel = make_random_array(kernel_dims, 11) + bias = make_random_array([kernel_dims[0]], 123) + + # Apply convolution + kernel_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer(values=tools.str_list(np.nditer(kernel))), + name='kernel1' + ) + bias_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer(values=tools.str_list(np.nditer(bias))), + name='bias1' + ) + x = x_lbann + y = lbann.Convolution(x, + weights=(kernel_weights, bias_weights), + num_dims=3, + num_output_channels=kernel_dims[0], + has_vectors=True, + conv_dims=tools.str_list(kernel_dims[2:]), + conv_strides=tools.str_list(strides), + conv_pads=tools.str_list(pads), + conv_dilations=tools.str_list(dilations), + has_bias=True, + parallel_strategy=create_parallel_strategy(4)) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='basic 3x3 convolution')) + + # PyTorch implementation + try: + x = _samples + y = pytorch_convolution( + x, kernel, bias=bias, + stride=strides, padding=pads, dilation=dilations + ) + z = tools.numpy_l2norm2(y) / _num_samples + val = z + except: + # Precomputed value + val = 153.84937996554953 + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # 2x4 strided convolution + # ------------------------------------------ + + # Convolution settings + kernel_dims = (3, _sample_dims[0], 2, 4) + strides = (3, 1) + pads = (3, 0) + dilations = (1, 1) + num_groups = 1 + kernel = make_random_array(kernel_dims, 19) + + # Apply convolution + kernel_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer(values=tools.str_list(np.nditer(kernel))), + name='kernel2' + ) + x = x_lbann + y = lbann.Convolution(x, + weights=(kernel_weights), + num_dims=3, + num_output_channels=kernel_dims[0], + has_vectors=True, + conv_dims=tools.str_list(kernel_dims[2:]), + conv_strides=tools.str_list(strides), + conv_pads=tools.str_list(pads), + conv_dilations=tools.str_list(dilations), + num_groups=num_groups, + has_bias=False, + parallel_strategy=create_parallel_strategy(4)) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='2x4 convolution')) + + # PyTorch implementation + try: + x = _samples + y = pytorch_convolution( + x, kernel, bias=None, + stride=strides, padding=pads, + dilation=dilations, groups=num_groups + ) + z = tools.numpy_l2norm2(y) / _num_samples + val = z + except: + # Precomputed value + val = 19.24587403346207 + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + 
execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +# Note: Create test name by removing ".py" from file name +_test_name = os.path.splitext(os.path.basename(current_file))[0] +for test in tools.create_tests(setup_experiment, _test_name, procs_per_node=4): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_covariance.py b/bamboo/unit_tests/test_unit_layer_covariance.py index e72bca4fb51..a0137841b27 100644 --- a/bamboo/unit_tests/test_unit_layer_covariance.py +++ b/bamboo/unit_tests/test_unit_layer_covariance.py @@ -1,49 +1,261 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(201910242) +_samples = np.random.normal(size=(27,2,5)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index].reshape(-1) +def num_samples(): + return _samples.shape[0] +def sample_dims(): + return (2*_samples.shape[-1],) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with weights layers so that gradient checking will + # verify that error signals are correct. + slice_size = _samples.shape[-1] + x0_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input0_weights') + x1_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input1_weights') + x_slice = lbann.Slice(lbann.Input(), + slice_points=tools.str_list([0, slice_size, 2*slice_size])) + x0 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x0_weights, dims=str(slice_size))) + x1 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x1_weights, dims=str(slice_size))) + x0_lbann = x0 + x1_lbann = x1 + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout, unbiased + # ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.Covariance(x0, x1, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout, unbiased')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + x0 = x[:slice_size] + x1 = x[slice_size:] + y = np.cov(np.stack((x0,x1), axis=0), bias=False)[0,1] + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout, unbiased + # ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.Covariance(x0, x1, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout, unbiased')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + x0 = x[:slice_size] + x1 = x[slice_size:] + y = np.cov(np.stack((x0,x1), axis=0), bias=False)[0,1] + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Data-parallel layout, biased + # ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.Covariance(x0, x1, biased=True, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout, biased')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i) + x0 = x[:slice_size].astype(np.float64) + x1 = x[slice_size:].astype(np.float64) + y = np.cov(np.stack((x0,x1), axis=0), bias=True)[0,1] + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout, biased + 
# ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.Covariance(x0, x1, biased=True, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout, biased')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i) + x0 = x[:slice_size].astype(np.float64) + x1 = x[slice_size:].astype(np.float64) + y = np.cov(np.stack((x0,x1), axis=0), bias=True)[0,1] + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_covariance(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_covariance: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_covariance_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_covariance_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='covariance', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_covariance_clang4(cluster, exes, dirname): - skeleton_layer_covariance(cluster, exes, dirname, 'clang4') + # ------------------------------------------ + # Construct model + # ------------------------------------------ + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x0_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_covariance_gcc4_check(cluster, exes, dirname): - skeleton_layer_covariance(cluster, exes, dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_covariance_gcc7(cluster, exes, dirname): - skeleton_layer_covariance(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_covariance_intel18(cluster, exes, dirname): - skeleton_layer_covariance(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
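+ # For reference, the covariance checks in construct_model above take the
+ # off-diagonal entry of NumPy's covariance matrix,
+ #     np.cov(np.stack((x0, x1), axis=0), bias=...)[0, 1]
+ # with bias matching the layer's `biased` argument.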
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_covariance_exe' --exe= -def test_unit_layer_covariance_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_covariance_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_covariance(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_cross_entropy.py b/bamboo/unit_tests/test_unit_layer_cross_entropy.py new file mode 100644 index 00000000000..a417f61947f --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_cross_entropy.py @@ -0,0 +1,232 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +# Note: The error bounds for gradient checking assume that the fourth +# derivative of the objective function is ~1. However, given our loss +# function: +# L = ( -xhat * log(x) )^2 +# L'''' = O( xhat^2 * log(x) / x^4 ) +# We have x >= 0.25 to make sure the fourth derivative does not get +# too big and mess up the error bounds. +np.random.seed(201910143) +_samples = np.random.uniform(low=0.25, + high=1, + size=(23,2,7)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index].reshape(-1) +def num_samples(): + return _samples.shape[0] +def sample_dims(): + return (2*_samples.shape[-1],) + +# ============================================== +# NumPy cross entropy +# ============================================== + +def numpy_cross_entropy(x, xhat): + """Cross entropy between two distributions, computed with NumPy + + The computation is performed with 64-bit floats. + + Args: + x: Estimated distribution + xhat: True distribution + + """ + if x.dtype is not np.float64: + x = x.astype(np.float64) + if xhat.dtype is not np.float64: + xhat = xhat.astype(np.float64) + return -np.inner(xhat, np.log(x)) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with weights layers so that gradient checking will + # verify that error signals are correct. + slice_size = _samples.shape[-1] + x0_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input0_weights') + x1_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input1_weights') + x_slice = lbann.Slice(lbann.Input(), + slice_points=tools.str_list([0, slice_size, 2*slice_size])) + x0 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x0_weights, dims=str(slice_size))) + x1 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x1_weights, dims=str(slice_size))) + x0_lbann = x0 + x1_lbann = x1 + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.CrossEntropy(x0, x1, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + x0 = x[:slice_size] + x1 = x[slice_size:] + y = -np.inner(x1, np.log(x0)) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.CrossEntropy(x0, x1, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + x0 = x[:slice_size] + x1 = x[slice_size:] + y = -np.inner(x1, np.log(x0)) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x0_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
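+ # For reference, the metric tolerance used above,
+ #     tol = 8 * val * np.finfo(np.float32).eps
+ # allows about eight float32 machine epsilons of relative error around the
+ # expected value.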
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +# Note: Create test name by removing ".py" from file name +_test_name = os.path.splitext(os.path.basename(current_file))[0] +for test in tools.create_tests(setup_experiment, _test_name): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_dist_embedding.py b/bamboo/unit_tests/test_unit_layer_dist_embedding.py new file mode 100644 index 00000000000..bdec9c11d54 --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_dist_embedding.py @@ -0,0 +1,215 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +_seed = 20200117 +_num_samples = 41 +_num_embeddings = 11 +_sequence_length = 3 + +# Sample access functions +def get_sample(index): + np.random.seed(100*_seed+index) + return np.random.randint(_num_embeddings, size=_sequence_length) +def num_samples(): + return _num_samples +def sample_dims(): + return (_sequence_length,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + x = lbann.Identity(lbann.Input()) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # GPU + # ------------------------------------------ + + # Embeddings + np.random.seed(_seed) + embedding_dim = 7 + embeddings = np.random.normal(size=(_num_embeddings,embedding_dim)) + + # LBANN implementation + embedding_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer(values=tools.str_list(np.nditer(embeddings))) + ) + x = x_lbann + y = lbann.DistEmbedding(x, + weights=embedding_weights, + num_embeddings=_num_embeddings, + embedding_dim=embedding_dim, + barrier_in_forward_prop=True, + device='gpu') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='GPU')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i) + y = embeddings[x,:] + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # CPU + # ------------------------------------------ + + # Embeddings + np.random.seed(_seed) + embedding_dim = 5 + embeddings = np.random.normal(size=(_num_embeddings,embedding_dim)) + + # LBANN implementation + embedding_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer(values=tools.str_list(np.nditer(embeddings))) + ) + x = x_lbann + y = lbann.DistEmbedding(x, + weights=embedding_weights, + num_embeddings=_num_embeddings, + embedding_dim=embedding_dim, + barrier_in_forward_prop=True, + device='cpu') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='CPU')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i) + y = embeddings[x,:] + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + # Construct model + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
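+ # Note that get_sample() above re-seeds NumPy with 100*_seed + index, so
+ # every sample is reproducible on demand without materializing the whole
+ # dataset up front.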
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +# Note (tym 6/12/20): Tests are disabled for now since the default +# build doesn't include SHMEM or NVSHMEM. Restore these tests when +# proper support is added. +# for test in tools.create_tests(setup_experiment, __file__): +# globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_elu.py b/bamboo/unit_tests/test_unit_layer_elu.py index 66b10d1fc5b..e42882e264d 100644 --- a/bamboo/unit_tests/test_unit_layer_elu.py +++ b/bamboo/unit_tests/test_unit_layer_elu.py @@ -1,49 +1,194 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +# Note: ELU is not differentiable at 0, so we make sure values +# are away from 0. +np.random.seed(201910243) +_num_samples = 37 +_sample_size = 8 +_samples = np.random.choice([-1.0, 1.0], size=(_num_samples,_sample_size)) +_samples += np.random.uniform(-0.5,0.5, size=_samples.shape) +_samples = _samples.astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
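+ # The NumPy reference below uses the ELU definition
+ #     y = x                  if x >= 0
+ #     y = alpha*(exp(x)-1)   if x <  0
+ # i.e. np.where(x < 0, alpha * np.expm1(x), x), with alpha = 1 for the
+ # data-parallel check and alpha = 0.5 for the model-parallel check.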
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Elu(x, alpha=1, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.where(x < 0, np.expm1(x), x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Elu(x, alpha=0.5, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.where(x < 0, 0.5*np.expm1(x), x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_elu(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_elu: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_elu_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_elu_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='elu', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_elu_clang4(cluster, exes, dirname): - skeleton_layer_elu(cluster, exes, dirname, 'clang4') + # ------------------------------------------ + # Construct model + # ------------------------------------------ + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_elu_gcc4_check(cluster, exes, dirname): - skeleton_layer_elu(cluster, exes, dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. 
+ The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_elu_gcc7(cluster, exes, dirname): - skeleton_layer_elu(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_elu_intel18(cluster, exes, dirname): - skeleton_layer_elu(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_layer_elu.py -k 'test_unit_layer_elu_exe' --exe= -def test_unit_layer_elu_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_elu_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_elu(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_embedding.py b/bamboo/unit_tests/test_unit_layer_embedding.py new file mode 100644 index 00000000000..8f05792c6f6 --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_embedding.py @@ -0,0 +1,210 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +_num_samples = 41 +_num_embeddings = 11 +_sequence_length = 3 + +# Sample access functions +def get_sample(index): + np.random.seed(2019101500+index) + return np.random.randint(_num_embeddings, size=_sequence_length) +def num_samples(): + return _num_samples +def sample_dims(): + return (_sequence_length,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + x = lbann.Identity(lbann.Input()) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # No padding index + # ------------------------------------------ + + # Embeddings + np.random.seed(20191015) + embedding_dim = 5 + embeddings = np.random.normal(size=(_num_embeddings,embedding_dim)) + + # LBANN implementation + embedding_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer(values=tools.str_list(np.nditer(embeddings))) + ) + x = x_lbann + y = lbann.Embedding(x, + weights=embedding_weights, + num_embeddings=_num_embeddings, + embedding_dim=embedding_dim) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='no padding index')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i) + y = embeddings[x,:] + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Padding index 0 + # ------------------------------------------ + + # Embeddings + np.random.seed(201910152) + embedding_dim = 7 + padding_idx = 0 + embeddings = np.random.normal(size=(_num_embeddings,embedding_dim)) + + # LBANN implementation + # Note: Embedding layer gradients are not exact if a padding index + # is set. Avoid gradient checking by not using an optimizer. + embedding_weights = lbann.Weights( + optimizer=None, + initializer=lbann.ValueInitializer(values=tools.str_list(np.nditer(embeddings))) + ) + x = x_lbann + y = lbann.Embedding(x, + weights=embedding_weights, + num_embeddings=_num_embeddings, + embedding_dim=embedding_dim, + padding_idx=padding_idx) + z = lbann.L2Norm2(y) + metrics.append(lbann.Metric(z, name='padding index = 0')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i) + y = np.where((x==padding_idx).reshape((-1,1)), 0, embeddings[x,:]) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + # Construct model + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_entrywise_batch_normalization.py b/bamboo/unit_tests/test_unit_layer_entrywise_batch_normalization.py new file mode 100644 index 00000000000..ced689cdad4 --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_entrywise_batch_normalization.py @@ -0,0 +1,174 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(20190815) +_num_samples = 29 +_sample_dims = (7,5,3) +_sample_size = functools.reduce(operator.mul, _sample_dims) +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: We want to use gradient checking to verify that error + # signals are correct. To do this, we zero-initialize a weights + # object, construct a zero-valued tensor, and add it to the + # input. To make sure that batchnorm is non-trivial, we multiply + # the zero-valued tensor by the mini-batch index. 
+ x = lbann.Reshape(lbann.Input(), dims=tools.str_list(_sample_dims)) + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x0 = lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_dims)) + x1 = lbann.Divide(lbann.MiniBatchIndex(), lbann.MiniBatchSize()) + x1 = lbann.Tessellate(lbann.Reshape(x1, dims='1 1 1'), dims=tools.str_list(_sample_dims)) + x = lbann.Sum(x, lbann.Multiply(x0, x1)) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + decay = 0.9 + epsilon = 1e-5 + x = x_lbann + y = lbann.EntrywiseBatchNormalization(x, + decay=decay, + epsilon=epsilon, + data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + decay = 0.9 + epsilon = 1e-5 + x = x_lbann + y = lbann.EntrywiseBatchNormalization(x, + decay=decay, + epsilon=epsilon, + data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + num_epochs = 1 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_entrywise_scale_bias.py b/bamboo/unit_tests/test_unit_layer_entrywise_scale_bias.py new file mode 100644 index 00000000000..e6308a8ea4f --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_entrywise_scale_bias.py @@ -0,0 +1,209 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(20190723) +_num_samples = 29 +_sample_dims = (7,5,3) +_sample_size = functools.reduce(operator.mul, _sample_dims) +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) +_scale = np.random.normal(loc=1, size=_sample_dims).astype(np.float32) +_bias = np.random.normal(loc=0, size=_sample_dims).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + scale_values = tools.str_list(np.nditer(_scale)) + bias_values = tools.str_list(np.nditer(_bias)) + scalebias_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer(values='{} {}'.format(scale_values, + bias_values))) + x = x_lbann + y = lbann.EntrywiseScaleBias(x, + weights=scalebias_weights, + data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape(_sample_dims).astype(np.float64) + y = _scale.astype(np.float64) * x + _bias.astype(np.float64) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + scale_values = tools.str_list(np.nditer(_scale)) + bias_values = tools.str_list(np.nditer(_bias)) + scalebias_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer(values='{} {}'.format(scale_values, + bias_values))) + x = x_lbann + y = lbann.EntrywiseScaleBias(x, + weights=scalebias_weights, + data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape(_sample_dims).astype(np.float64) + y = _scale.astype(np.float64) * x + _bias.astype(np.float64) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_fully_connected.py b/bamboo/unit_tests/test_unit_layer_fully_connected.py new file mode 100644 index 00000000000..4ccee406c39 --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_fully_connected.py @@ -0,0 +1,299 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(20191011) +_num_samples = 31 +_input_size = 11 +_output_size = 3 +_samples = np.random.normal(size=(_num_samples,_input_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_input_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_input_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_input_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Compute expected metric values with NumPy + # ------------------------------------------ + + # Weight values + linearity = np.random.normal(size=(_output_size,_input_size)).astype(np.float32) + bias = np.random.normal(size=(_output_size,1)).astype(np.float32) + + # With bias + x = _samples.transpose().astype(np.float64) + y = np.matmul(linearity.astype(np.float64), x) + bias.astype(np.float64) + z = tools.numpy_l2norm2(y) / _num_samples + val_with_bias = z + + # Without bias + x = _samples.transpose().astype(np.float64) + y = np.matmul(linearity.astype(np.float64), x) + z = tools.numpy_l2norm2(y) / _num_samples + val_without_bias = z + + # ------------------------------------------ + # Data-parallel layout, non-transpose, bias + # ------------------------------------------ + + # LBANN implementation + linearity_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer( + values=tools.str_list(np.nditer(linearity, order='F')) + ) + ) + bias_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer( + values=tools.str_list(np.nditer(bias)) + ) + ) + x = x_lbann + y = lbann.FullyConnected(x, + weights=(linearity_weights, bias_weights), + data_layout='data_parallel', + num_neurons=_output_size, + has_bias=True, + transpose=False) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout, non-transpose, bias')) + + # NumPy implementation + val = val_with_bias + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout, non-transpose, bias + # ------------------------------------------ + + # LBANN implementation + linearity_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer( + values=tools.str_list(np.nditer(linearity, order='F')) + ) + ) + bias_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer( + values=tools.str_list(np.nditer(bias)) + ) + ) + x = x_lbann + y = lbann.FullyConnected(x, + weights=(linearity_weights, bias_weights), + data_layout='model_parallel', + num_neurons=_output_size, + has_bias=True, + transpose=False) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout, non-transpose, bias')) + + # NumPy implementation + val = val_with_bias + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Data-parallel layout, transpose, no bias + # ------------------------------------------ + + # LBANN implementation + linearity_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer( + values=tools.str_list(np.nditer(linearity, order='C')) + ) + ) + x = x_lbann + y = lbann.FullyConnected(x, + 
weights=linearity_weights,
+                             data_layout='data_parallel',
+                             num_neurons=_output_size,
+                             has_bias=False,
+                             transpose=True)
+    z = lbann.L2Norm2(y)
+    obj.append(z)
+    metrics.append(lbann.Metric(z, name='data-parallel layout, transpose, no bias'))
+
+    # NumPy implementation
+    val = val_without_bias
+    tol = 8 * val * np.finfo(np.float32).eps
+    callbacks.append(lbann.CallbackCheckMetric(
+        metric=metrics[-1].name,
+        lower_bound=val-tol,
+        upper_bound=val+tol,
+        error_on_failure=True,
+        execution_modes='test'))
+
+    # ------------------------------------------
+    # Model-parallel layout, transpose, no bias
+    # ------------------------------------------
+
+    # LBANN implementation
+    linearity_weights = lbann.Weights(
+        optimizer=lbann.SGD(),
+        initializer=lbann.ValueInitializer(
+            values=tools.str_list(np.nditer(linearity, order='C'))
+        )
+    )
+    x = x_lbann
+    y = lbann.FullyConnected(x,
+                             weights=linearity_weights,
+                             data_layout='model_parallel',
+                             num_neurons=_output_size,
+                             has_bias=False,
+                             transpose=True)
+    z = lbann.L2Norm2(y)
+    obj.append(z)
+    metrics.append(lbann.Metric(z, name='model-parallel layout, transpose, no bias'))
+
+    # NumPy implementation
+    val = val_without_bias
+    tol = 8 * val * np.finfo(np.float32).eps
+    callbacks.append(lbann.CallbackCheckMetric(
+        metric=metrics[-1].name,
+        lower_bound=val-tol,
+        upper_bound=val+tol,
+        error_on_failure=True,
+        execution_modes='test'))
+
+    # ------------------------------------------
+    # Gradient checking
+    # ------------------------------------------
+
+    callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True))
+
+    # ------------------------------------------
+    # Construct model
+    # ------------------------------------------
+
+    num_epochs = 0
+    return lbann.Model(num_epochs,
+                       layers=lbann.traverse_layer_graph(x_lbann),
+                       objective_function=obj,
+                       metrics=metrics,
+                       callbacks=callbacks)
+
+def construct_data_reader(lbann):
+    """Construct Protobuf message for Python data reader.
+
+    The Python data reader will import the current Python file to
+    access the sample access functions.
+
+    Args:
+        lbann (module): Module for LBANN Python frontend
+
+    """
+
+    # Note: The training data reader should be removed when
+    # https://github.com/LLNL/lbann/issues/1098 is resolved.
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +# Note: Create test name by removing ".py" from file name +_test_name = os.path.splitext(os.path.basename(current_file))[0] +for test in tools.create_tests(setup_experiment, _test_name): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_identity.py b/bamboo/unit_tests/test_unit_layer_identity.py index 86568e946d5..cb9a523d62f 100644 --- a/bamboo/unit_tests/test_unit_layer_identity.py +++ b/bamboo/unit_tests/test_unit_layer_identity.py @@ -1,49 +1,190 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(201910244) +_num_samples = 83 +_sample_size = 47 +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Identity(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = x + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Identity(x, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = x + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_identity(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_identity: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_identity_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_identity_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='identity', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_identity_clang4(cluster, exes, dirname): - skeleton_layer_identity(cluster, exes, dirname, 'clang4') + # ------------------------------------------ + # Construct model + # ------------------------------------------ + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_identity_gcc4_check(cluster, exes, dirname): - skeleton_layer_identity(cluster, exes, dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. 
+ The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_identity_gcc7(cluster, exes, dirname): - skeleton_layer_identity(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_identity_intel18(cluster, exes, dirname): - skeleton_layer_identity(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_layer_identity.py -k 'test_unit_layer_identity_exe' --exe= -def test_unit_layer_identity_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_identity_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_identity(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_identity_distconv.py b/bamboo/unit_tests/test_unit_layer_identity_distconv.py new file mode 100644 index 00000000000..7a991359bcc --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_identity_distconv.py @@ -0,0 +1,196 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(201910244) +_num_samples = 83 +_sample_size = 48 +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size=mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def create_parallel_strategy(num_height_groups): + return {"height_groups": num_height_groups} + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout with distconv + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + x = lbann.Reshape(x, dims="4 4 3") + y = lbann.Identity(x, data_layout='data_parallel', + parallel_strategy=create_parallel_strategy(4)) + x = lbann.Reshape(x, dims="48") + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = x + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Identity(x, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = x + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__, procs_per_node=4): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_instance_norm.py b/bamboo/unit_tests/test_unit_layer_instance_norm.py new file mode 100644 index 00000000000..bdc2c44a075 --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_instance_norm.py @@ -0,0 +1,176 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(20200107) +_num_samples = 15 +_sample_dims = (5,3,7) +_sample_size = functools.reduce(operator.mul, _sample_dims) +_samples = np.random.normal(loc=0.5, size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# NumPy implementation +# ============================================== + +def numpy_instance_norm(x, epsilon=1e-5): + if x.dtype is not np.float64: + x = x.astype(np.float64) + axes = tuple(range(1,x.ndim)) + mean = np.mean(x, axis=axes, keepdims=True) + var = np.var(x, ddof=1, axis=axes, keepdims=True) + return (x - mean) / np.sqrt(var + epsilon) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_dims)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_dims))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.InstanceNorm(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape(_sample_dims).astype(np.float64) + y = numpy_instance_norm(x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_l1_norm.py b/bamboo/unit_tests/test_unit_layer_l1_norm.py index 9abcc2652ce..c3054f862f0 100644 --- a/bamboo/unit_tests/test_unit_layer_l1_norm.py +++ b/bamboo/unit_tests/test_unit_layer_l1_norm.py @@ -1,49 +1,194 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. 
+ +# Data +# Note: The L1 norm is not differentiable at 0, so we make sure values +# are away from 0. +np.random.seed(201910245) +_num_samples = 23 +_sample_size = 11 +_samples = np.random.choice([-1.0, 1.0], size=(_num_samples,_sample_size)) +_samples += np.random.uniform(-0.5,0.5, size=_samples.shape) +_samples = _samples.astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.L1Norm(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.linalg.norm(x, 1) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.L1Norm(x, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.linalg.norm(x, 1) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_l1_norm(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_l1_norm: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_l1_norm_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_l1_norm_%s_error.txt' % (dir_name, 
compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='l1_norm', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_l1_norm_clang4(cluster, exes, dirname): - skeleton_layer_l1_norm(cluster, exes, dirname, 'clang4') + # ------------------------------------------ + # Construct model + # ------------------------------------------ + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_l1_norm_gcc4_check(cluster, exes, dirname): - skeleton_layer_l1_norm(cluster, exes, dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_l1_norm_gcc7(cluster, exes, dirname): - skeleton_layer_l1_norm(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_l1_norm_intel18(cluster, exes, dirname): - skeleton_layer_l1_norm(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_l1_norm_exe' --exe= -def test_unit_layer_l1_norm_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_l1_norm_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_l1_norm(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_l2_norm2.py b/bamboo/unit_tests/test_unit_layer_l2_norm2.py deleted file mode 100644 index cdbad231498..00000000000 --- a/bamboo/unit_tests/test_unit_layer_l2_norm2.py +++ /dev/null @@ -1,49 +0,0 @@ -import sys -sys.path.insert(0, '../common_python') -import tools -import pytest -import os - - -def skeleton_layer_l2_norm2(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_l2_norm2: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_l2_norm2_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_l2_norm2_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - 
cluster=cluster, executable=executables[compiler_name], num_nodes=1,
-        num_processes=2, dir_name=dir_name,
-        data_filedir_default='', data_reader_name='synthetic',
-        model_folder='tests/layer_tests', model_name='l2_norm2',
-        optimizer_name='sgd',
-        output_file_name=output_file_name, error_file_name=error_file_name)
-    return_code = os.system(command)
-    assert return_code == 0
-
-
-def test_unit_layer_l2_norm2_clang4(cluster, exes, dirname):
-    skeleton_layer_l2_norm2(cluster, exes, dirname, 'clang4')
-
-
-def test_unit_layer_l2_norm2_gcc4_check(cluster, exes, dirname):
-    skeleton_layer_l2_norm2(cluster, exes, dirname, 'gcc4')
-
-
-def test_unit_layer_l2_norm2_gcc7(cluster, exes, dirname):
-    skeleton_layer_l2_norm2(cluster, exes, dirname, 'gcc7')
-
-
-def test_unit_layer_l2_norm2_intel18(cluster, exes, dirname):
-    skeleton_layer_l2_norm2(cluster, exes, dirname, 'intel18')
-
-
-# Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_l2_norm2_exe' --exe=
-def test_unit_layer_l2_norm2_exe(cluster, dirname, exe):
-    if exe is None:
-        e = 'test_unit_layer_l2_norm2_exe: Non-local testing'
-        print('Skip - ' + e)
-        pytest.skip(e)
-    exes = {'exe': exe}
-    skeleton_layer_l2_norm2(cluster, exes, dirname, 'exe')
diff --git a/bamboo/unit_tests/test_unit_layer_layer_norm.py b/bamboo/unit_tests/test_unit_layer_layer_norm.py
new file mode 100644
index 00000000000..c3daef9b888
--- /dev/null
+++ b/bamboo/unit_tests/test_unit_layer_layer_norm.py
@@ -0,0 +1,202 @@
+import functools
+import operator
+import os
+import os.path
+import sys
+import numpy as np
+
+# Bamboo utilities
+current_file = os.path.realpath(__file__)
+current_dir = os.path.dirname(current_file)
+sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python'))
+import tools
+
+# ==============================================
+# Objects for Python data reader
+# ==============================================
+# Note: The Python data reader imports this file as a module and calls
+# the functions below to ingest data.
+
+# Data
+np.random.seed(20191114)
+_num_samples = 31
+_sample_size = 31
+_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32)
+
+# Sample access functions
+def get_sample(index):
+    return _samples[index,:]
+def num_samples():
+    return _num_samples
+def sample_dims():
+    return (_sample_size,)
+
+# ==============================================
+# NumPy layer norm
+# ==============================================
+
+def numpy_layer_norm(x, epsilon=1e-5):
+    if x.dtype is not np.float64:
+        x = x.astype(np.float64)
+    mean = np.mean(x)
+    var = np.var(x, ddof=1)
+    return (x - mean) / np.sqrt(var + epsilon)
+
+# ==============================================
+# Setup LBANN experiment
+# ==============================================
+
+def setup_experiment(lbann):
+    """Construct LBANN experiment.
+
+    Args:
+        lbann (module): Module for LBANN Python frontend
+
+    """
+    mini_batch_size = num_samples() // 2
+    trainer = lbann.Trainer(mini_batch_size)
+    model = construct_model(lbann)
+    data_reader = construct_data_reader(lbann)
+    optimizer = lbann.NoOptimizer()
+    return trainer, model, data_reader, optimizer
+
+def construct_model(lbann):
+    """Construct LBANN model.
+
+    Args:
+        lbann (module): Module for LBANN Python frontend
+
+    """
+
+    # Input data
+    # Note: Sum with a weights layer so that gradient checking will
+    # verify that error signals are correct.
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.LayerNorm(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = numpy_layer_norm(x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + epsilon = 0.0123 + x = x_lbann + y = lbann.LayerNorm(x, data_layout='model_parallel', epsilon=epsilon) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = numpy_layer_norm(x, epsilon) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_leaky_relu.py b/bamboo/unit_tests/test_unit_layer_leaky_relu.py index 6c90b34ce78..cce0e0802c2 100644 --- a/bamboo/unit_tests/test_unit_layer_leaky_relu.py +++ b/bamboo/unit_tests/test_unit_layer_leaky_relu.py @@ -1,49 +1,194 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +# Note: The leaky ReLU is not differentiable at 0, so we make sure values +# are away from 0. +np.random.seed(201910246) +_num_samples = 23 +_sample_size = 11 +_samples = np.random.choice([-1.0, 1.0], size=(_num_samples,_sample_size)) +_samples += np.random.uniform(-0.5,0.5, size=_samples.shape) +_samples = _samples.astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
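The leaky ReLU data keeps every value at least 0.5 away from the origin (a random sign of magnitude 1 plus uniform noise in [-0.5, 0.5]), so the gradient check never evaluates the activation at its non-differentiable point. A short sketch of that data trick together with the element-wise reference used for both layouts:

    import numpy as np

    np.random.seed(0)  # illustrative seed, not the one used by the test
    samples = np.random.choice([-1.0, 1.0], size=(23, 11))
    samples += np.random.uniform(-0.5, 0.5, size=samples.shape)
    assert np.abs(samples).min() >= 0.5          # no value sits on the kink at 0

    def leaky_relu(x, negative_slope):
        # Same reference as the test: identity for positive inputs,
        # scaled identity for negative inputs.
        return np.where(x > 0, x, negative_slope * x)

    y = leaky_relu(samples, 0.01)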
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.LeakyRelu(x, negative_slope=0.01, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.where(x > 0, x, 0.01*x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.LeakyRelu(x, negative_slope=2, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.where(x > 0, x, 2*x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_leaky_relu(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_leaky_relu: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_leaky_relu_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_leaky_relu_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='leaky_relu', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_leaky_relu_clang4(cluster, exes, dirname): - skeleton_layer_leaky_relu(cluster, exes, dirname, 'clang4') + # ------------------------------------------ + # Construct model + # ------------------------------------------ + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_leaky_relu_gcc4_check(cluster, exes, dirname): - skeleton_layer_leaky_relu(cluster, exes, dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf 
message for Python data reader. + The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_leaky_relu_gcc7(cluster, exes, dirname): - skeleton_layer_leaky_relu(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_leaky_relu_intel18(cluster, exes, dirname): - skeleton_layer_leaky_relu(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_leaky_relu_exe' --exe= -def test_unit_layer_leaky_relu_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_leaky_relu_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_leaky_relu(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_leaky_relu_distconv.py b/bamboo/unit_tests/test_unit_layer_leaky_relu_distconv.py new file mode 100644 index 00000000000..e3abe076bef --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_leaky_relu_distconv.py @@ -0,0 +1,205 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +# Note: The leaky ReLU is not differentiable at 0, so we make sure values +# are away from 0. +np.random.seed(201910246) +_num_samples = 23 +_sample_size = 48 +_samples = np.random.choice([-1.0, 1.0], size=(_num_samples,_sample_size)) +_samples += np.random.uniform(-0.5,0.5, size=_samples.shape) +_samples = _samples.astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size=mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def create_parallel_strategy(num_height_groups): + return {"height_groups": num_height_groups} + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + x = lbann.Reshape(x, dims="4 2 6") + y = lbann.LeakyRelu(x, negative_slope=0.01, + data_layout='data_parallel', + parallel_strategy=create_parallel_strategy(4)) + y = lbann.Reshape(y, dims=str(sample_dims())) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.where(x > 0, x, 0.01*x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + x = lbann.Reshape(x, dims="4 2 6") + y = lbann.LeakyRelu(x, negative_slope=2, + data_layout='model_parallel', + parallel_strategy=create_parallel_strategy(4)) + y = lbann.Reshape(y, dims=str(sample_dims())) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.where(x > 0, x, 2*x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. 
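In the distconv variant the flat 48-element sample is reshaped to a 4x2x6 tensor purely so that DiHydrogen can partition it; the element-wise math and the expected metric are unchanged. The parallel strategy is just a dictionary naming the number of height groups, as a small sketch shows:

    def create_parallel_strategy(num_height_groups):
        # Same helper as in the test: partition the height dimension
        # across the given number of groups.
        return {"height_groups": num_height_groups}

    sample_size = 48
    dims = (4, 2, 6)                                     # passed to lbann.Reshape as "4 2 6"
    assert dims[0] * dims[1] * dims[2] == sample_size    # reshape keeps the element count
    print(create_parallel_strategy(4))                   # {'height_groups': 4}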
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__, procs_per_node=4): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_log_sigmoid.py b/bamboo/unit_tests/test_unit_layer_log_sigmoid.py index 9a47d55754d..fa2ea6035ee 100644 --- a/bamboo/unit_tests/test_unit_layer_log_sigmoid.py +++ b/bamboo/unit_tests/test_unit_layer_log_sigmoid.py @@ -1,49 +1,192 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +# Note: The L1 norm is not differentiable at 0, so we make sure values +# are away from 0. +np.random.seed(201910247) +_num_samples = 23 +_sample_size = 7 +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
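The NumPy reference below evaluates log sigmoid as x - log(1 + e^x), which is exact but can overflow for large positive inputs; the test's samples are small enough that this is not a concern. Purely as an illustration, an algebraically equivalent overflow-safe form:

    import numpy as np

    def log_sigmoid_reference(x):
        # Form used by the test: log(sigmoid(x)) = x - log(1 + exp(x)).
        return x - np.log1p(np.exp(x))

    def log_sigmoid_stable(x):
        # Equivalent rewrite that never exponentiates a large positive number.
        return np.minimum(x, 0.0) - np.log1p(np.exp(-np.abs(x)))

    x = np.random.normal(size=7)
    assert np.allclose(log_sigmoid_reference(x), log_sigmoid_stable(x))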
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.LogSigmoid(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = x - np.log1p(np.exp(x)) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.LogSigmoid(x, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = x - np.log1p(np.exp(x)) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_log_sigmoid(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_log_sigmoid: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_log_sigmoid_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_log_sigmoid_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='log_sigmoid', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_log_sigmoid_clang4(cluster, exes, dirname): - skeleton_layer_log_sigmoid(cluster, exes, dirname, 'clang4') + # ------------------------------------------ + # Construct model + # ------------------------------------------ + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_log_sigmoid_gcc4_check(cluster, exes, dirname): - skeleton_layer_log_sigmoid(cluster, exes, dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. 
+ The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_log_sigmoid_gcc7(cluster, exes, dirname): - skeleton_layer_log_sigmoid(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_log_sigmoid_intel18(cluster, exes, dirname): - skeleton_layer_log_sigmoid(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_layer_log_sigmoid.py -k 'test_unit_layer_log_sigmoid_exe' --exe= -def test_unit_layer_log_sigmoid_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_log_sigmoid_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_log_sigmoid(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_log_softmax.py b/bamboo/unit_tests/test_unit_layer_log_softmax.py index 85a20790d31..d541a3c7531 100644 --- a/bamboo/unit_tests/test_unit_layer_log_softmax.py +++ b/bamboo/unit_tests/test_unit_layer_log_softmax.py @@ -1,49 +1,205 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(201910213) +_num_samples = 15 +_sample_size = 11 +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# NumPy softmax +# ============================================== + +def numpy_log_softmax(x): + """Log-softmax, computed with NumPy + + The computation is performed with 64-bit floats. + + """ + if x.dtype is not np.float64: + x = x.astype(np.float64) + x = x - np.max(x) + return x - np.log(np.sum(np.exp(x))) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.LogSoftmax(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = numpy_log_softmax(x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.LogSoftmax(x, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = numpy_log_softmax(x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_log_softmax(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_log_softmax: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_log_softmax_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_log_softmax_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], - num_nodes=1, num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='log_softmax', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_log_softmax_clang4(cluster, exes, dirname): - 
skeleton_layer_log_softmax(cluster, exes, dirname, 'clang4') + # ------------------------------------------ + # Construct model + # ------------------------------------------ + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_log_softmax_gcc4_check(cluster, exes, dirname): - skeleton_layer_log_softmax(cluster, exes, dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_log_softmax_gcc7(cluster, exes, dirname): - skeleton_layer_log_softmax(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_log_softmax_intel18(cluster, exes, dirname): - skeleton_layer_log_softmax(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_log_softmax_exe' --exe= -def test_unit_layer_log_softmax_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_log_softmax_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_log_softmax(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_matmul.py b/bamboo/unit_tests/test_unit_layer_matmul.py new file mode 100644 index 00000000000..53dd3a2557b --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_matmul.py @@ -0,0 +1,264 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(20191111) +_m = 11 +_n = 3 +_k = 5 +_samples = np.random.normal(size=(27,_m*_k+_k*_n)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index].reshape(-1) +def num_samples(): + return _samples.shape[0] +def sample_dims(): + return (_samples.shape[-1],) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with weights layers so that gradient checking will + # verify that error signals are correct. + x0_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input0_weights') + x1_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input1_weights') + x_slice = lbann.Slice(lbann.Input(), + slice_points=tools.str_list([0, _m*_k, _m*_k+_k*_n])) + x0 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x0_weights, dims=str(_m*_k))) + x1 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x1_weights, dims=str(_k*_n))) + x0_lbann = x0 + x1_lbann = x1 + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # NN GEMM + # ------------------------------------------ + + # LBANN implementation + x0 = lbann.Reshape(x0_lbann, dims=tools.str_list([_m, _k])) + x1 = lbann.Reshape(x1_lbann, dims=tools.str_list([_k, _n])) + y = lbann.MatMul(x0, x1, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='NN GEMM')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + x0 = x[:_m*_k].reshape([_m,_k]) + x1 = x[_m*_k:].reshape([_k,_n]) + y = np.matmul(x0, x1) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # TN GEMM + # ------------------------------------------ + + # LBANN implementation + x0 = lbann.Reshape(x0_lbann, dims=tools.str_list([_k, _m])) + x1 = lbann.Reshape(x1_lbann, dims=tools.str_list([_k, _n])) + y = lbann.MatMul(x0, x1, transpose_a=True, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='TN GEMM')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + x0 = x[:_m*_k].reshape([_k,_m]) + x1 = x[_m*_k:].reshape([_k,_n]) + y = np.matmul(x0.transpose(), x1) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # NT GEMM + # ------------------------------------------ + + # LBANN implementation + x0 = lbann.Reshape(x0_lbann, dims=tools.str_list([_m, _k])) + x1 = lbann.Reshape(x1_lbann, dims=tools.str_list([_n, _k])) + y = lbann.MatMul(x0, x1, transpose_b=True, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='NT GEMM')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + x0 = 
x[:_m*_k].reshape([_m,_k]) + x1 = x[_m*_k:].reshape([_n,_k]) + y = np.matmul(x0, x1.transpose()) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # TT GEMM + # ------------------------------------------ + + # LBANN implementation + x0 = lbann.Reshape(x0_lbann, dims=tools.str_list([_k, _m])) + x1 = lbann.Reshape(x1_lbann, dims=tools.str_list([_n, _k])) + y = lbann.MatMul(x0, x1, transpose_a=True, transpose_b=True, + data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='TT GEMM')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + x0 = x[:_m*_k].reshape([_k,_m]) + x1 = x[_m*_k:].reshape([_n,_k]) + y = np.matmul(x0.transpose(), x1.transpose()) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x0_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
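The four GEMM checks above slice one flat sample into the two operand matrices and differ only in which operand is transposed; all four products have shape (_m, _n). Restating the NumPy references compactly with the same _m, _n, _k as the test:

    import numpy as np

    _m, _n, _k = 11, 3, 5
    x = np.random.normal(size=_m * _k + _k * _n)
    a, b = x[:_m * _k], x[_m * _k:]

    y_nn = np.matmul(a.reshape(_m, _k), b.reshape(_k, _n))        # NN GEMM
    y_tn = np.matmul(a.reshape(_k, _m).T, b.reshape(_k, _n))      # TN GEMM
    y_nt = np.matmul(a.reshape(_m, _k), b.reshape(_n, _k).T)      # NT GEMM
    y_tt = np.matmul(a.reshape(_k, _m).T, b.reshape(_n, _k).T)    # TT GEMM
    assert y_nn.shape == y_tn.shape == y_nt.shape == y_tt.shape == (_m, _n)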
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py b/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py index c21544ed295..4c4c8eb7045 100644 --- a/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py +++ b/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py @@ -1,49 +1,204 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +# Note: MAE is not differentiable when the two inputs match, so we +# make sure inputs have separated values. +np.random.seed(201910248) +_samples = np.random.uniform(-0.25, 0.25, size=(27,2,7)).astype(np.float32) +_samples[:,1,:] += np.random.choice([-1.0,1.0], size=(27,7)) + +# Sample access functions +def get_sample(index): + return _samples[index].reshape(-1) +def num_samples(): + return _samples.shape[0] +def sample_dims(): + return (2*_samples.shape[-1],) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with weights layers so that gradient checking will + # verify that error signals are correct. 
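The mean absolute error reference below is written as the 1-norm of the difference divided by the slice size, which equals the familiar mean of absolute differences; the ±1 offset added to the second half of each sample keeps the two inputs separated, so the absolute value is always differentiable. A small illustration:

    import numpy as np

    slice_size = 7
    x0 = np.random.uniform(-0.25, 0.25, size=slice_size)
    x1 = (np.random.uniform(-0.25, 0.25, size=slice_size)
          + np.random.choice([-1.0, 1.0], size=slice_size))
    assert np.abs(x1 - x0).min() >= 0.5                  # inputs never coincide

    mae_norm = np.linalg.norm(x1 - x0, 1) / slice_size   # form used by the test
    mae_mean = np.mean(np.abs(x1 - x0))                  # equivalent form
    assert np.isclose(mae_norm, mae_mean)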
+ slice_size = _samples.shape[-1] + x0_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input0_weights') + x1_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input1_weights') + x_slice = lbann.Slice(lbann.Input(), + slice_points=tools.str_list([0, slice_size, 2*slice_size])) + x0 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x0_weights, dims=str(slice_size))) + x1 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x1_weights, dims=str(slice_size))) + x0_lbann = x0 + x1_lbann = x1 + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.MeanAbsoluteError(x0, x1, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + x0 = x[:slice_size] + x1 = x[slice_size:] + y = np.linalg.norm(x1-x0, 1) / slice_size + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.MeanAbsoluteError(x0, x1, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + x0 = x[:slice_size] + x1 = x[slice_size:] + y = np.linalg.norm(x1-x0, 1) / slice_size + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_mean_absolute_error(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_mean_absolute_error: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_mean_absolute_error_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_mean_absolute_error_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='mean_absolute_error', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_mean_absolute_error_clang4(cluster, exes, dirname): - 
skeleton_layer_mean_absolute_error(cluster, exes, dirname, 'clang4') + # ------------------------------------------ + # Construct model + # ------------------------------------------ + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x0_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_mean_absolute_error_gcc4_check(cluster, exes, dirname): - skeleton_layer_mean_absolute_error(cluster, exes, dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_mean_absolute_error_gcc7(cluster, exes, dirname): - skeleton_layer_mean_absolute_error(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_mean_absolute_error_intel18(cluster, exes, dirname): - skeleton_layer_mean_absolute_error(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_mean_absolute_error_exe' --exe= -def test_unit_layer_mean_absolute_error_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_mean_absolute_error_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_mean_absolute_error(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_mean_squared_error.py b/bamboo/unit_tests/test_unit_layer_mean_squared_error.py new file mode 100644 index 00000000000..2e6a1cef5f6 --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_mean_squared_error.py @@ -0,0 +1,201 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(201910249) +_samples = np.random.normal(size=(27,2,13)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index].reshape(-1) +def num_samples(): + return _samples.shape[0] +def sample_dims(): + return (2*_samples.shape[-1],) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with weights layers so that gradient checking will + # verify that error signals are correct. + slice_size = _samples.shape[-1] + x0_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input0_weights') + x1_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input1_weights') + x_slice = lbann.Slice(lbann.Input(), + slice_points=tools.str_list([0, slice_size, 2*slice_size])) + x0 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x0_weights, dims=str(slice_size))) + x1 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x1_weights, dims=str(slice_size))) + x0_lbann = x0 + x1_lbann = x1 + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.MeanSquaredError(x0, x1, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + x0 = x[:slice_size] + x1 = x[slice_size:] + y = tools.numpy_l2norm2(x1-x0) / slice_size + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.MeanSquaredError(x0, x1, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + x0 = x[:slice_size] + x1 = x[slice_size:] + y = tools.numpy_l2norm2(x1-x0) / slice_size + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x0_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. 
+ + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_one_hot.py b/bamboo/unit_tests/test_unit_layer_one_hot.py new file mode 100644 index 00000000000..6a3db01a79b --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_one_hot.py @@ -0,0 +1,139 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +one_hot_size = 7 +seed = 201909113 + +# Sample access functions +def get_sample(index): + np.random.seed(seed+index) + return [np.random.uniform(-1, one_hot_size+1)] +def num_samples(): + return 47 +def sample_dims(): + return (1,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Layer graph + x = lbann.Input() + y1 = lbann.OneHot(x, size=one_hot_size) + y2 = lbann.Concatenation([lbann.Constant(value=i+1, num_neurons='1') + for i in range(one_hot_size)]) + y = lbann.Multiply(y1, y2) + z = lbann.L2Norm2(y) + + # Objects for LBANN model + layers = list(lbann.traverse_layer_graph(x)) + metric = lbann.Metric(z, name='obj') + obj = lbann.ObjectiveFunction(z) + callbacks = [] + + # Compute expected metric value + vals = [] + for i in range(num_samples()): + x = get_sample(i)[0] + y = int(x) + 1 if (0 <= x and x < one_hot_size) else 0 + z = y * y + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metric.name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # Construct model + num_epochs = 0 + return lbann.Model(num_epochs, + layers=layers, + objective_function=obj, + metrics=[metric], + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_relu.py b/bamboo/unit_tests/test_unit_layer_relu.py index c904cce301f..0b5700aca99 100644 --- a/bamboo/unit_tests/test_unit_layer_relu.py +++ b/bamboo/unit_tests/test_unit_layer_relu.py @@ -1,49 +1,194 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +# Note: ReLU is not differentiable at 0, so we make sure values +# are away from 0. 
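For the one-hot test above, the expected metric has a closed form: the one-hot vector is multiplied element-wise by the constants (1, 2, ..., one_hot_size), so the squared L2 norm is (int(x) + 1)^2 when x falls in [0, one_hot_size) and 0 otherwise (the test assumes out-of-range inputs produce an all-zero one-hot vector). A pure-Python restatement:

    one_hot_size = 7

    def expected_one_hot_metric(x):
        # L2Norm2 of one_hot(x) * (1, 2, ..., one_hot_size): the single hot
        # entry at index int(x) picks out the constant int(x) + 1.
        y = int(x) + 1 if (0 <= x < one_hot_size) else 0
        return float(y * y)

    print(expected_one_hot_metric(3.2))   # 16.0  (index 3 -> constant 4)
    print(expected_one_hot_metric(-0.5))  # 0.0   (out of range)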
+np.random.seed(2019102410) +_num_samples = 23 +_sample_size = 41 +_samples = np.random.choice([-1.0, 1.0], size=(_num_samples,_sample_size)) +_samples += np.random.uniform(-0.5,0.5, size=_samples.shape) +_samples = _samples.astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Relu(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.maximum(x, 0.0) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Relu(x, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.maximum(x, 0.0) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_relu(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_relu: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_relu_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_relu_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], - num_nodes=1, 
num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='relu', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_relu_clang4(cluster, exes, dirname): - skeleton_layer_relu(cluster, exes, dirname, 'clang4') + # ------------------------------------------ + # Construct model + # ------------------------------------------ + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_relu_gcc4_check(cluster, exes, dirname): - skeleton_layer_relu(cluster, exes, dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_relu_gcc7(cluster, exes, dirname): - skeleton_layer_relu(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_relu_intel18(cluster, exes, dirname): - skeleton_layer_relu(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_layer_relu.py -k 'test_unit_layer_relu_exe' --exe= -def test_unit_layer_relu_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_relu_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_relu(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_relu_distconv.py b/bamboo/unit_tests/test_unit_layer_relu_distconv.py new file mode 100644 index 00000000000..eed3171b2b5 --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_relu_distconv.py @@ -0,0 +1,203 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +# Note: ReLU is not differentiable at 0, so we make sure values +# are away from 0. 
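+# (Same scheme as the plain ReLU test: values from {-1.0, +1.0} plus a
+# uniform jitter in [-0.5, 0.5), so no entry lands at 0. A sample size of
+# 48 also reshapes cleanly to the 4 x 2 x 6 tensor used in construct_model.)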
+np.random.seed(2019102410) +_num_samples = 23 +_sample_size = 48 +_samples = np.random.choice([-1.0, 1.0], size=(_num_samples,_sample_size)) +_samples += np.random.uniform(-0.5,0.5, size=_samples.shape) +_samples = _samples.astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size=mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def create_parallel_strategy(num_height_groups): + return {"height_groups": num_height_groups} + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + x = lbann.Reshape(x, dims="4 2 6") + y = lbann.Relu(x, data_layout='data_parallel', + parallel_strategy=create_parallel_strategy(4)) + y = lbann.Reshape(y, dims=str(sample_dims())) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.maximum(x, 0.0) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + x = lbann.Reshape(x, dims="4 2 6") + y = lbann.Relu(x, data_layout='model_parallel', + parallel_strategy=create_parallel_strategy(4)) + y = lbann.Reshape(y, dims=str(sample_dims())) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.maximum(x, 0.0) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + 
callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__, procs_per_node=4): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_selu.py b/bamboo/unit_tests/test_unit_layer_selu.py index b32f8c9eb71..c8a4c3dc197 100644 --- a/bamboo/unit_tests/test_unit_layer_selu.py +++ b/bamboo/unit_tests/test_unit_layer_selu.py @@ -1,49 +1,210 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +# Note: SELU is not differentiable at 0, so we make sure values +# are away from 0. +np.random.seed(2019102411) +_num_samples = 20 +_sample_size = 5 +_samples = np.random.choice([-1.0, 1.0], size=(_num_samples,_sample_size)) +_samples += np.random.uniform(-0.5,0.5, size=_samples.shape) +_samples = _samples.astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# NumPy SELU +# ============================================== + +def numpy_selu(x): + """NumPy implementation of SELU activation. + + The computation is performed with 64-bit floats. + + """ + if x.dtype is not np.float64: + x = x.astype(np.float64) + alpha = 1.6732632423543772848170429916717 + scale = 1.0507009873554804934193349852946 + return scale * np.where(x < 0, alpha * np.expm1(x), x) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Selu(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = numpy_selu(x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Selu(x, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = numpy_selu(x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_selu(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_selu: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_selu_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_selu_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], - num_nodes=1, num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='selu', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_selu_clang4(cluster, exes, dirname): - skeleton_layer_selu(cluster, exes, dirname, 'clang4') + # 
------------------------------------------ + # Construct model + # ------------------------------------------ + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_selu_gcc4_check(cluster, exes, dirname): - skeleton_layer_selu(cluster, exes, dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_selu_gcc7(cluster, exes, dirname): - skeleton_layer_selu(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_selu_intel18(cluster, exes, dirname): - skeleton_layer_selu(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_layer_selu.py -k 'test_unit_layer_selu_exe' --exe= -def test_unit_layer_selu_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_selu_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_selu(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_sigmoid.py b/bamboo/unit_tests/test_unit_layer_sigmoid.py index 268526b7644..590f0448ff5 100644 --- a/bamboo/unit_tests/test_unit_layer_sigmoid.py +++ b/bamboo/unit_tests/test_unit_layer_sigmoid.py @@ -1,49 +1,196 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +# Note: The L1 norm is not differentiable at 0, so we make sure values +# are away from 0. +np.random.seed(2019102412) +_num_samples = 23 +_sample_size = 17 +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Sigmoid(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.where(x >= 0, + 1 / (1 + np.exp(-x)), + np.exp(x) / (1 + np.exp(x))) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Sigmoid(x, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.where(x >= 0, + 1 / (1 + np.exp(-x)), + np.exp(x) / (1 + np.exp(x))) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_sigmoid(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_sigmoid: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_sigmoid_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_sigmoid_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], - num_nodes=1, num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='sigmoid', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + 
callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_sigmoid_clang4(cluster, exes, dirname): - skeleton_layer_sigmoid(cluster, exes, dirname, 'clang4') + # ------------------------------------------ + # Construct model + # ------------------------------------------ + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_sigmoid_gcc4_check(cluster, exes, dirname): - skeleton_layer_sigmoid(cluster, exes, dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_sigmoid_gcc7(cluster, exes, dirname): - skeleton_layer_sigmoid(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_sigmoid_intel18(cluster, exes, dirname): - skeleton_layer_sigmoid(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_layer_sigmoid.py -k 'test_unit_layer_sigmoid_exe' --exe= -def test_unit_layer_sigmoid_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_sigmoid_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_sigmoid(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_sigmoid_binary_cross_entropy.py b/bamboo/unit_tests/test_unit_layer_sigmoid_binary_cross_entropy.py new file mode 100644 index 00000000000..59f3c580457 --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_sigmoid_binary_cross_entropy.py @@ -0,0 +1,204 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +# Note: Sigmoid cross entropy is not differentiable w.r.t. ground +# truth at 0 and 1. 
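+# (The second channel of each sample is used as the ground truth and is
+# clipped to [0.1, 0.9] below, keeping it away from those endpoints.)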
+np.random.seed(20191218) +_samples = np.random.normal(size=(11,2,13)).astype(np.float32) +_samples[:,1,:] = np.clip(_samples[:,1,:], 0.1, 0.9) + +# Sample access functions +def get_sample(index): + return _samples[index].reshape(-1) +def num_samples(): + return _samples.shape[0] +def sample_dims(): + return (2*_samples.shape[-1],) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with weights layers so that gradient checking will + # verify that error signals are correct. + slice_size = _samples.shape[-1] + x0_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input0_weights') + x1_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input1_weights') + x_slice = lbann.Slice(lbann.Input(), + slice_points=tools.str_list([0, slice_size, 2*slice_size])) + x0 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x0_weights, dims=str(slice_size))) + x1 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x1_weights, dims=str(slice_size))) + x0_lbann = x0 + x1_lbann = x1 + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.SigmoidBinaryCrossEntropy(x0, x1, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + x0 = x[:slice_size] + x1 = x[slice_size:] + y = -x1 * np.log1p(np.exp(-x0)) - (1-x1) * np.log1p(np.exp(x0)) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.SigmoidBinaryCrossEntropy(x0, x1, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + x0 = x[:slice_size] + x1 = x[slice_size:] + y = -x1 * np.log1p(np.exp(-x0)) - (1-x1) * np.log1p(np.exp(x0)) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient 
checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x0_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_slice.py b/bamboo/unit_tests/test_unit_layer_slice.py new file mode 100644 index 00000000000..8d48b436b33 --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_slice.py @@ -0,0 +1,275 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np +import pytest + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(20190708) +_num_samples = 29 +_sample_dims = (7,5,3) +_sample_size = functools.reduce(operator.mul, _sample_dims) +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
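+    # (The weights tensor is zero-initialized, so it leaves the input data
+    # unchanged; it only gives the gradient checker a trainable input to
+    # propagate error signals through.)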
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_dims)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_dims))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # -------------------------- + # Slice along axis 0 + # -------------------------- + + # LBANN implementation + slice_points = (2, 3, 6, 7) + x = x_lbann + x_slice = lbann.Slice(x, axis=0, slice_points=tools.str_list(slice_points)) + y = [] + for _ in range(len(slice_points)-1): + y.append(lbann.L2Norm2(x_slice)) + z = lbann.Add(y[0], y[2]) + obj.append(z) + metrics.append(lbann.Metric(z, name='axis0')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape(_sample_dims).astype(np.float64) + y = [] + for j in range(len(slice_points)-1): + x_slice = x[slice_points[j]:slice_points[j+1],:,:] + y.append(tools.numpy_l2norm2(x_slice)) + z = y[0] + y[2] + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # -------------------------- + # Slice along axis 1 + # -------------------------- + + # LBANN implementation + slice_points = (0, 2, 3, 4) + x = x_lbann + x_slice = lbann.Slice(x, axis=1, slice_points=tools.str_list(slice_points)) + y = [] + for _ in range(len(slice_points)-1): + y.append(lbann.L2Norm2(x_slice)) + z = lbann.Add(y[0], y[2]) + obj.append(z) + metrics.append(lbann.Metric(z, name='axis1')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape(_sample_dims).astype(np.float64) + y = [] + for j in range(len(slice_points)-1): + x_slice = x[:,slice_points[j]:slice_points[j+1],:] + y.append(tools.numpy_l2norm2(x_slice)) + z = y[0] + y[2] + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # -------------------------- + # Slice along axis 2 + # -------------------------- + + # LBANN implementation + slice_points = (1, 3) + x = x_lbann + x_slice = lbann.Slice(x, axis=2, slice_points=tools.str_list(slice_points)) + y = [] + for _ in range(len(slice_points)-1): + y.append(lbann.L2Norm2(x_slice)) + z = y[0] + obj.append(z) + metrics.append(lbann.Metric(z, name='axis2')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape(_sample_dims).astype(np.float64) + y = [] + for j in range(len(slice_points)-1): + x_slice = x[:,:,slice_points[j]:slice_points[j+1]] + y.append(tools.numpy_l2norm2(x_slice)) + z = y[0] + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # -------------------------- + # Model-parallel + # -------------------------- + + # LBANN implementation + slice_points = (31, 54, 56, 57) + x = lbann.Reshape(x_lbann, dims=tools.str_list([105])) + x_slice = lbann.Slice(x, slice_points=tools.str_list(slice_points), + data_layout='model_parallel') + y = [] + for _ in 
range(len(slice_points)-1): + y.append(lbann.L2Norm2(x_slice)) + z = lbann.Add(y[0], y[2]) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape(-1).astype(np.float64) + y = [] + for j in range(len(slice_points)-1): + x_slice = x[slice_points[j]:slice_points[j+1]] + y.append(tools.numpy_l2norm2(x_slice)) + z = y[0] + y[2] + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # -------------------------- + # Gradient checking + # -------------------------- + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # -------------------------- + # Construct model + # -------------------------- + + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_softmax.py b/bamboo/unit_tests/test_unit_layer_softmax.py index dd4c3add193..7eaf4a9954b 100644 --- a/bamboo/unit_tests/test_unit_layer_softmax.py +++ b/bamboo/unit_tests/test_unit_layer_softmax.py @@ -1,49 +1,206 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(201910142) +_num_samples = 19 +_sample_size = 7 +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# NumPy softmax +# ============================================== + +def numpy_softmax(x): + """NumPy implementation of softmax. 
+ + The computation is performed with 64-bit floats. There is also an + implementation of softmax in SciPy 1.2.0 (scipy.special.softmax). + + """ + if x.dtype is not np.float64: + x = x.astype(np.float64) + y = np.exp(x - np.max(x)) + return y / np.sum(y) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Softmax(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = numpy_softmax(x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Softmax(x, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = numpy_softmax(x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_softmax(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_softmax: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_softmax_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_softmax_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], - num_nodes=1, num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', 
model_name='softmax', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_softmax_clang4(cluster, exes, dirname): - skeleton_layer_softmax(cluster, exes, dirname, 'clang4') + # ------------------------------------------ + # Construct model + # ------------------------------------------ + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_softmax_gcc4_check(cluster, exes, dirname): - skeleton_layer_softmax(cluster, exes, dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_softmax_gcc7(cluster, exes, dirname): - skeleton_layer_softmax(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_softmax_intel18(cluster, exes, dirname): - skeleton_layer_softmax(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_softmax_exe' --exe= -def test_unit_layer_softmax_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_softmax_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_softmax(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_softplus.py b/bamboo/unit_tests/test_unit_layer_softplus.py index 0c017c6f93e..3d2076a6e03 100644 --- a/bamboo/unit_tests/test_unit_layer_softplus.py +++ b/bamboo/unit_tests/test_unit_layer_softplus.py @@ -1,49 +1,192 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +# Note: The L1 norm is not differentiable at 0, so we make sure values +# are away from 0. 
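+# (The samples here are plain standard-normal draws; softplus is smooth
+# at 0, so no shifting away from 0 is applied.)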
+np.random.seed(2019102413) +_num_samples = 11 +_sample_size = 7 +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Softplus(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.log1p(np.exp(x)) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Softplus(x, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.log1p(np.exp(x)) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_softplus(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_softplus: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_softplus_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_softplus_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - num_processes=2, dir_name=dir_name, - data_filedir_default='', 
data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='softplus', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_softplus_clang4(cluster, exes, dirname): - skeleton_layer_softplus(cluster, exes, dirname, 'clang4') + # ------------------------------------------ + # Construct model + # ------------------------------------------ + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_softplus_gcc4_check(cluster, exes, dirname): - skeleton_layer_softplus(cluster, exes, dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_softplus_gcc7(cluster, exes, dirname): - skeleton_layer_softplus(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_softplus_intel18(cluster, exes, dirname): - skeleton_layer_softplus(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_layer_softplus.py -k 'test_unit_layer_softplus_exe' --exe= -def test_unit_layer_softplus_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_softplus_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_softplus(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_softsign.py b/bamboo/unit_tests/test_unit_layer_softsign.py index a7bed251425..46d983b8416 100644 --- a/bamboo/unit_tests/test_unit_layer_softsign.py +++ b/bamboo/unit_tests/test_unit_layer_softsign.py @@ -1,49 +1,190 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. 
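+# (get_sample, num_samples, and sample_dims are referenced by name when the
+# data reader imports this module, so they must remain at module scope.)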
+ +# Data +np.random.seed(2019102414) +_num_samples = 11 +_sample_size = 7 +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Softsign(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = x / (1 + np.abs(x)) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Softsign(x, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = x / (1 + np.abs(x)) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_softsign(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_softsign: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_softsign_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_softsign_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - num_processes=2, dir_name=dir_name, - data_filedir_default='', 
data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='softsign', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_softsign_clang4(cluster, exes, dirname): - skeleton_layer_softsign(cluster, exes, dirname, 'clang4') + # ------------------------------------------ + # Construct model + # ------------------------------------------ + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_softsign_gcc4_check(cluster, exes, dirname): - skeleton_layer_softsign(cluster, exes, dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_softsign_gcc7(cluster, exes, dirname): - skeleton_layer_softsign(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_softsign_intel18(cluster, exes, dirname): - skeleton_layer_softsign(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_layer_softsign.py -k 'test_unit_layer_softsign_exe' --exe= -def test_unit_layer_softsign_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_softsign_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_softsign(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_squared_difference.py b/bamboo/unit_tests/test_unit_layer_squared_difference.py index a05bbcc5082..1ea15aea9bc 100644 --- a/bamboo/unit_tests/test_unit_layer_squared_difference.py +++ b/bamboo/unit_tests/test_unit_layer_squared_difference.py @@ -1,49 +1,201 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. 
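+# (Each sample packs the two operands x0 and x1 into one flat vector of
+# length 2*7; construct_model splits them back apart with a Slice layer.)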
+ +# Data +np.random.seed(2019102415) +_samples = np.random.normal(size=(23,2,7)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index].reshape(-1) +def num_samples(): + return _samples.shape[0] +def sample_dims(): + return (2*_samples.shape[-1],) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with weights layers so that gradient checking will + # verify that error signals are correct. + slice_size = _samples.shape[-1] + x0_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input0_weights') + x1_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input1_weights') + x_slice = lbann.Slice(lbann.Input(), + slice_points=tools.str_list([0, slice_size, 2*slice_size])) + x0 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x0_weights, dims=str(slice_size))) + x1 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x1_weights, dims=str(slice_size))) + x0_lbann = x0 + x1_lbann = x1 + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.SquaredDifference(x0, x1, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + x0 = x[:slice_size] + x1 = x[slice_size:] + y = (x1-x0)**2 + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.SquaredDifference(x0, x1, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout, unbiased')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + x0 = x[:slice_size] + x1 = x[slice_size:] + y = (x1-x0)**2 + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_squared_difference(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_squared_difference: default_exes[%s] does not exist' % 
compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_squared_difference_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_squared_difference_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='squared_difference', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_squared_difference_clang4(cluster, exes, dirname): - skeleton_layer_squared_difference(cluster, exes, dirname, 'clang4') + # ------------------------------------------ + # Construct model + # ------------------------------------------ + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x0_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_squared_difference_gcc4_check(cluster, exes, dirname): - skeleton_layer_squared_difference(cluster, exes, dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_squared_difference_gcc7(cluster, exes, dirname): - skeleton_layer_squared_difference(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_squared_difference_intel18(cluster, exes, dirname): - skeleton_layer_squared_difference(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
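# Note (illustration only, not from this patch): the readers assembled below
# rely solely on the get_sample / num_samples / sample_dims functions defined
# at the top of this file. A minimal self-check of that contract, assuming the
# NumPy-backed samples above, might look like:
import numpy as np

def _check_sample_access_contract():
    # Every valid index must yield a flat array whose length matches the
    # advertised sample dimensions.
    for index in range(num_samples()):
        sample = np.asarray(get_sample(index), dtype=np.float32)
        assert sample.shape == tuple(sample_dims())
# Calling _check_sample_access_contract() before building the protobuf message
# is one way to catch shape mismatches early; LBANN's actual reader internals
# are not shown here.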
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_layer_squared_difference.py -k 'test_unit_layer_squared_difference_exe' --exe= -def test_unit_layer_squared_difference_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_squared_difference_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_squared_difference(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_tessellate.py b/bamboo/unit_tests/test_unit_layer_tessellate.py index 575bd894f89..862d97fc936 100644 --- a/bamboo/unit_tests/test_unit_layer_tessellate.py +++ b/bamboo/unit_tests/test_unit_layer_tessellate.py @@ -1,49 +1,198 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') -import tools +import numpy as np import pytest -import os +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(2019102416) +_num_samples = 29 +_sample_dims = (3,1,4) +_sample_size = functools.reduce(operator.mul, _sample_dims) +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_dims)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_dims))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + output_dims = (7,4,3) + x = x_lbann + y = lbann.Tessellate(x, + dims=tools.str_list(output_dims), + data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape(_sample_dims).astype(np.float64) + y = np.tile(x, (3,4,1))[:7,:4,:3] + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + output_dims = (2,1,9) + x = x_lbann + y = lbann.Tessellate(x, + dims=tools.str_list(output_dims), + data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape(_sample_dims).astype(np.float64) + y = np.tile(x, (1,1,3))[:2,:1,:9] + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_tessellate(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_tessellate: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_tessellate_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_tessellate_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='tessellate', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # -------------------------- + # Gradient checking + # -------------------------- + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_tessellate_clang4(cluster, exes, dirname): - skeleton_layer_tessellate(cluster, exes, dirname, 'clang4') + # -------------------------- + # Construct model + # -------------------------- + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_tessellate_gcc4_check(cluster, exes, dirname): - skeleton_layer_tessellate(cluster, exes, 
dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_tessellate_gcc7(cluster, exes, dirname): - skeleton_layer_tessellate(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_tessellate_intel18(cluster, exes, dirname): - skeleton_layer_tessellate(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_layer_tessellate.py -k 'test_unit_layer_tessellate_exe' --exe= -def test_unit_layer_tessellate_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_tessellate_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_tessellate(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_variance.py b/bamboo/unit_tests/test_unit_layer_variance.py index 0db001567d5..8113d45adbe 100644 --- a/bamboo/unit_tests/test_unit_layer_variance.py +++ b/bamboo/unit_tests/test_unit_layer_variance.py @@ -1,49 +1,244 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(2019102417) +_num_samples = 11 +_sample_size = 7 +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout, unbiased + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Variance(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout, unbiased')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.cov(x, bias=False) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout, unbiased + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Variance(x, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout, unbiased')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.cov(x, bias=False) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Data-parallel layout, biased + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Variance(x, biased=True, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout, biased')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.cov(x, bias=True) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout, biased + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Variance(x, biased=True, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout, biased')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.cov(x, bias=True) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + 
upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_variance(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_variance: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_variance_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_variance_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], - num_nodes=1, num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='variance', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_variance_clang4(cluster, exes, dirname): - skeleton_layer_variance(cluster, exes, dirname, 'clang4') + # ------------------------------------------ + # Construct model + # ------------------------------------------ + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_variance_gcc4_check(cluster, exes, dirname): - skeleton_layer_variance(cluster, exes, dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_variance_gcc7(cluster, exes, dirname): - skeleton_layer_variance(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_variance_intel18(cluster, exes, dirname): - skeleton_layer_variance(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
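# Note (illustrative sketch, not from this patch): the NumPy references in
# construct_model above use np.cov on a 1-D sample, which reduces to a scalar
# variance. The equivalent closed forms, for comparison, are:
import numpy as np

def _reference_variance(x, biased):
    # np.cov(x, bias=False) == sum((x - mean)^2) / (n - 1)  (unbiased)
    # np.cov(x, bias=True)  == sum((x - mean)^2) / n        (biased)
    x = np.asarray(x, dtype=np.float64)
    divisor = x.size if biased else x.size - 1
    return np.sum((x - x.mean()) ** 2) / divisor
# The check-metric tolerance used above (8 * val * np.finfo(np.float32).eps)
# allows a few float32 ULPs of drift around this expected value.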
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_variance_exe' --exe= -def test_unit_layer_variance_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_variance_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_variance(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_lbann2_reload.py b/bamboo/unit_tests/test_unit_lbann2_reload.py deleted file mode 100644 index 4b8491e248f..00000000000 --- a/bamboo/unit_tests/test_unit_lbann2_reload.py +++ /dev/null @@ -1,148 +0,0 @@ -import sys -sys.path.insert(0, '../common_python') -import tools -import pytest -import os, sys - - -def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_lbann2_reload: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - lbann2 = executables[compiler_name] + '2' - - # Delete directories / files if they happen to be around from the - # previous build. - os.system('rm -rf ckpt') - os.system('rm -rf lbann2_*') - - - # No checkpointing, printing weights to files. - model_path = '{../../model_zoo/models/lenet_mnist/model_lenet_mnist.prototext,../../model_zoo/tests/model_lenet_mnist_lbann2ckpt.prototext}' - output_file_name = '%s/bamboo/unit_tests/output/lbann2_no_checkpoint_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/lbann2_no_checkpoint_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=lbann2, num_nodes=1, num_processes=2, - data_reader_name='mnist', - data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - dir_name=dir_name, - model_path=model_path, - optimizer_name='sgd', - num_epochs=2, - output_file_name=output_file_name, - error_file_name=error_file_name) - - os.mkdir('lbann2_ckpt') - return_code = os.system(command) - if return_code != 0: - sys.stderr.write('LBANN2 LeNet execution failed, exiting with error') - sys.exit(1) - - os.system('mv lbann2_ckpt lbann2_nockpt') - - # Run to checkpoint, printing weights to files. 
- output_file_name = '%s/bamboo/unit_tests/output/lbann2_checkpoint_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/lbann2_checkpoint_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=lbann2, num_nodes=1, num_processes=2, - dir_name=dir_name, - data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', model_folder='tests', - model_name='lenet_mnist_ckpt', num_epochs=2, optimizer_name='sgd', - output_file_name=output_file_name, - error_file_name=error_file_name) - return_code_ckpt_1 = os.system(command) - if return_code_ckpt_1 != 0: - sys.stderr.write( - 'LeNet (checkpoint) execution failed, exiting with error') - sys.exit(1) - - # Pick up from checkpoint, printing weights to files. - output_file_name = '%s/bamboo/unit_tests/output/lbann2_restart_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/lbann2_restart_%s_error.txt' % (dir_name, compiler_name) - os.mkdir('lbann2_ckpt') - command = tools.get_command( - cluster=cluster, executable=lbann2, num_nodes=1, num_processes=2, - dir_name=dir_name, - data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', - model_path='../../model_zoo/tests/model_lenet_mnist_lbann2ckpt.prototext', - num_epochs=2, optimizer_name='sgd', ckpt_dir='ckpt/', - output_file_name=output_file_name, - error_file_name=error_file_name) - return_code_ckpt_2 = os.system(command) - if return_code_ckpt_2 != 0: - sys.stderr.write( - 'LBANN2 LeNet weight reload failed, exiting with error') - sys.exit(1) - os.system('rm lbann2_ckpt/model0-epoch*') - os.system('rm lbann2_nockpt/model0-epoch*') - - diff_result = os.system('diff -rq lbann2_ckpt/ lbann2_nockpt/') - allow_epsilon_diff = False - if allow_epsilon_diff and (diff_result != 0): - equal_within_epsilon = True - ckpt_files = os.listdir('lbann2_ckpt') - for file_name in ckpt_files: - ckpt_file = open('lbann2_ckpt/' + file_name, 'r') - no_ckpt_file = open('lbann2_nockpt/' + file_name, 'r') - for ckpt_line in ckpt_file: - no_ckpt_line = next(no_ckpt_file) - if ckpt_line != no_ckpt_line: - error_string = ('ckpt_line={ckpt_line},' - ' nockpt_line={no_ckpt_line}').format( - ckpt_line=ckpt_line, no_ckpt_line=no_ckpt_line) - try: - ckpt_values = list(map(float, ckpt_line.split())) - no_ckpt_values = list(map(float, no_ckpt_line.split())) - num = len(ckpt_values) - if len(no_ckpt_values) == num: - for i in range(num): - if abs(ckpt_values[i] - no_ckpt_values[i]) > 0.5: - # Not equal within epsilon. - equal_within_epsilon = False - print(error_string) - else: - # Length of lists don't match. - equal_within_epsilon = False - print(error_string) - except ValueError: - # Non-numerical diff. 
- equal_within_epsilon = False - print(error_string) - if equal_within_epsilon: - diff_result = 0 - os.system('rm -rf ckpt') - os.system('rm -rf lbann2_*') - assert diff_result == 0 - - -def test_unit_lbann2_reload_clang4(cluster, exes, dirname): - if cluster == 'catalyst': # STILL ERRORS - pytest.skip('FIXME') - skeleton_lbann2_reload(cluster, exes, dirname, 'clang4') - - -def test_unit_lbann2_reload_gcc4(cluster, exes, dirname): - skeleton_lbann2_reload(cluster, exes, dirname, 'gcc4') - - -def test_unit_lbann2_reload_gcc7(cluster, exes, dirname): - if cluster in ['catalyst', 'pascal']: # STILL ERRORS - pytest.skip('FIXME') - skeleton_lbann2_reload(cluster, exes, dirname, 'gcc7') - - -def test_unit_lbann2_reload_intel18(cluster, exes, dirname): - skeleton_lbann2_reload(cluster, exes, dirname, 'intel18') - - -# Run with python -m pytest -s test_unit_lbann2_reload.py -k 'test_unit_lbann2_reload_exe' --exe= -def test_unit_lbann2_reload_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_lbann2_reload_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_lbann2_reload(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_lbann_invocation.py b/bamboo/unit_tests/test_unit_lbann_invocation.py index a002db49be4..1ea9a6ae19e 100644 --- a/bamboo/unit_tests/test_unit_lbann_invocation.py +++ b/bamboo/unit_tests/test_unit_lbann_invocation.py @@ -1,93 +1,209 @@ import sys sys.path.insert(0, '../common_python') import tools -import os, sys +import os -def test_unit_no_params_bad(cluster, exes): - exe = exes['gcc4'] - sys.stderr.write('TESTING: run lbann with no params; lbann should throw exception\n') + +def get_default_parameters(dir_name, two_models=True): + data_reader_path = '{d}/model_zoo/data_readers/data_reader_mnist.prototext'.format( + d=dir_name) + model_path = '{d}/model_zoo/tests/model_lenet_mnist_ckpt.prototext'.format( + d=dir_name) + if two_models: + model_path = '{{{mp},{mp}}}'.format(mp=model_path) + optimizer_path = '{d}/model_zoo/optimizers/opt_sgd.prototext'.format( + d=dir_name) + return data_reader_path, model_path, optimizer_path + + +def get_file_names(dir_name, test_name): + output_file_name = '{d}/bamboo/unit_tests/output/lbann_invocation_{t}_output.txt'.format( + d=dir_name, t=test_name) + error_file_name = '{d}/bamboo/unit_tests/error/lbann_invocation_{t}_error.txt'.format( + d=dir_name, t=test_name) + return output_file_name, error_file_name + + +# Run with python3 -m pytest -s test_unit_lbann_invocation.py -k 'test_unit_no_params_bad' --exes= +def test_unit_no_params_bad(cluster, dirname, exes): + if isinstance(exes, dict): + exe = exes['gcc7'] + else: + exe = exes + print('TESTING: run lbann with no params; lbann should throw exception\n') + (output_file_name, error_file_name) = get_file_names(dirname, 'no_params_bad') command = tools.get_command( - cluster=cluster, executable=exe, exit_after_setup=True) + cluster=cluster, executable=exe, + exit_after_setup=True, + num_processes=1, + output_file_name=output_file_name, + error_file_name=error_file_name + ) return_code = os.system(command) - assert return_code != 0 + tools.assert_failure(return_code, + 'Failed to load any prototext files', + error_file_name) -def test_unit_one_model_bad(cluster, exes): - exe = exes['gcc4'] - sys.stderr.write('TESTING: run lbann with no optimizer or reader; lbann should throw exception\n') - model_path = 'prototext/model_mnist_simple_1.prototext' +# Run with python3 -m pytest -s test_unit_lbann_invocation.py -k 
'test_unit_one_model_bad' --exes= +def test_unit_one_model_bad(cluster, dirname, exes): + if isinstance(exes, dict): + exe = exes['gcc7'] + else: + exe = exes + print('TESTING: run lbann with no optimizer or reader; lbann should throw exception\n') + (_, model_path, _) = get_default_parameters(dirname, two_models=False) + (output_file_name, error_file_name) = get_file_names(dirname, 'one_model_bad') command = tools.get_command( - cluster=cluster, executable=exe, exit_after_setup=True, - model_path=model_path) + cluster=cluster, executable=exe, + exit_after_setup=True, + model_path=model_path, + num_processes=1, + output_file_name=output_file_name, + error_file_name=error_file_name + ) return_code = os.system(command) - assert return_code != 0 + tools.assert_failure(return_code, + 'you specified 1 model filenames, and 0 optimizer filenames; you must specify either one or 1 optimizer filenames', + error_file_name) -def test_unit_two_models_bad(cluster, exes): - exe = exes['gcc4'] - sys.stderr.write('TESTING: run lbann with two models but no optimizer or reader; lbann should throw exception\n') - model_path = '{prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}' +# Run with python3 -m pytest -s test_unit_lbann_invocation.py -k 'test_unit_two_models_bad' --exes= +def test_unit_two_models_bad(cluster, dirname, exes): + if isinstance(exes, dict): + exe = exes['gcc7'] + else: + exe = exes + print('TESTING: run lbann with two models but no optimizer or reader; lbann should throw exception\n') + (_, model_path, _) = get_default_parameters(dirname) + (output_file_name, error_file_name) = get_file_names(dirname, 'two_models_bad') command = tools.get_command( - cluster=cluster, executable=exe, exit_after_setup=True, - model_path=model_path) + cluster=cluster, executable=exe, + exit_after_setup=True, + model_path=model_path, + num_processes=1, + output_file_name=output_file_name, + error_file_name=error_file_name + ) return_code = os.system(command) - assert return_code != 0 + tools.assert_failure(return_code, + 'you specified 2 model filenames, and 0 optimizer filenames; you must specify either one or 2 optimizer filenames', + error_file_name) -def test_unit_two_models_bad2(cluster, exes): - exe = exes['gcc4'] - sys.stderr.write('TESTING: run lbann with two models with missing {; lbann should throw exception\n') - model_path='prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}' +# Run with python3 -m pytest -s test_unit_lbann_invocation.py -k 'test_unit_two_models_bad2' --exes= +def test_unit_two_models_bad2(cluster, dirname, exes): + if isinstance(exes, dict): + exe = exes['gcc7'] + else: + exe = exes + print('TESTING: run lbann with two models with missing {; lbann should throw exception\n') + (_, model_path, _) = get_default_parameters(dirname, two_models=False) + model_path = '{mp},{mp}}}'.format(mp=model_path) + (output_file_name, error_file_name) = get_file_names(dirname, 'two_models_bad2') command = tools.get_command( - cluster=cluster, executable=exe, exit_after_setup=True, - model_path=model_path) + cluster=cluster, executable=exe, + exit_after_setup=True, + model_path=model_path, + num_processes=1, + output_file_name=output_file_name, + error_file_name=error_file_name + ) return_code = os.system(command) - assert return_code != 0 + tools.assert_failure(return_code, + "possibly you left out '{' or '}' or both", + error_file_name) -def test_unit_missing_optimizer(cluster, exes): - exe = exes['gcc4'] - sys.stderr.write('TESTING: run 
lbann with two models, reader, but no optimizer; lbann should throw exception\n') - model_path='{prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}' - data_reader_path='prototext/data_reader_mnist.prototext' +# Run with python3 -m pytest -s test_unit_lbann_invocation.py -k 'test_unit_missing_optimizer' --exes= +def test_unit_missing_optimizer(cluster, dirname, exes): + if isinstance(exes, dict): + exe = exes['gcc7'] + else: + exe = exes + print('TESTING: run lbann with two models, reader, but no optimizer; lbann should throw exception\n') + (data_reader_path, model_path, _) = get_default_parameters(dirname) + (output_file_name, error_file_name) = get_file_names(dirname, 'missing_optimizer') command = tools.get_command( - cluster=cluster, executable=exe, data_reader_path=data_reader_path, + cluster=cluster, executable=exe, + data_reader_path=data_reader_path, data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - exit_after_setup=True, model_path=model_path) + exit_after_setup=True, model_path=model_path, + num_processes=1, + output_file_name=output_file_name, + error_file_name=error_file_name + ) return_code = os.system(command) - assert return_code != 0 + tools.assert_failure(return_code, + 'you specified 2 model filenames, and 0 optimizer filenames; you must specify either one or 2 optimizer filenames', + error_file_name) -def test_unit_missing_reader(cluster, exes): - exe = exes['gcc4'] - sys.stderr.write('TESTING: run lbann with two models, reader, but no reader; lbann should throw exception\n') - model_path = '{prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}' - optimizer_path = 'prototext/opt_sgd.prototext' +# Run with python3 -m pytest -s test_unit_lbann_invocation.py -k 'test_unit_missing_reader' --exes= +def test_unit_missing_reader(cluster, dirname, exes): + if isinstance(exes, dict): + exe = exes['gcc7'] + else: + exe = exes + print('TESTING: run lbann with two models, reader, but no reader; lbann should throw exception\n') + (_, model_path, optimizer_path) = get_default_parameters(dirname) + (output_file_name, error_file_name) = get_file_names(dirname, 'missing_reader') command = tools.get_command( - cluster=cluster, executable=exe, exit_after_setup=True, - model_path=model_path, optimizer_path=optimizer_path) + cluster=cluster, executable=exe, + exit_after_setup=True, + model_path=model_path, optimizer_path=optimizer_path, + num_processes=1, + output_file_name=output_file_name, + error_file_name=error_file_name + ) return_code = os.system(command) - assert return_code != 0 + tools.assert_failure(return_code, + 'you specified 2 model filenames, and 0 reader filenames; you must specify either one or 2 reader filenames', + error_file_name) -def test_unit_bad_params(cluster, exes): - exe = exes['gcc4'] - sys.stderr.write('TESTING: run lbann with ill-formed param (missing -) lbann should throw exception\n') - (command_allocate, command_run, _, _) = tools.get_command(cluster=cluster, executable=exe, return_tuple=True) - return_code = os.system('%s%s %s -exit_after_setup --reader=prototext/data_reader_mnist.prototext --model={prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext} --optimizer=prototext/opt_sgd.prototext' % (command_allocate, command_run, exe)) - assert return_code != 0 +# Run with python3 -m pytest -s test_unit_lbann_invocation.py -k 'test_unit_bad_params' --exes= +def test_unit_bad_params(cluster, dirname, exes): + if isinstance(exes, dict): + exe = exes['gcc7'] + 
else: + exe = exes + print('TESTING: run lbann with ill-formed param (exit_after_setup should have `--` not `-`) lbann should throw exception\n') + (data_reader_path, model_path, optimizer_path) = get_default_parameters( + dirname) + (command_allocate, command_run, _, _) = tools.get_command( + cluster=cluster, executable=exe, + num_processes=1, + return_tuple=True) + (output_file_name, error_file_name) = get_file_names(dirname, 'bad_params') + command_string = '{ca}{cr} {e} -exit_after_setup --reader={d} --model={m} --optimizer={o} > {ofn} 2> {efn}'.format( + ca=command_allocate, cr=command_run, e=exe, + d=data_reader_path, m=model_path, o=optimizer_path, + ofn=output_file_name, efn=error_file_name + ) + return_code = os.system(command_string) + tools.assert_failure(return_code, + "badly formed cmd line param; must begin with '--': -exit_after_setup", + error_file_name) -def test_unit_should_work(cluster, exes): - exe = exes['gcc4'] - sys.stderr.write('TESTING: run lbann with two models, reader, and optimizer; lbann should NOT throw exception\n') - model_path = '{prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}' - data_reader_path = 'prototext/data_reader_mnist.prototext' - optimizer_path = 'prototext/opt_sgd.prototext' +# Run with python3 -m pytest -s test_unit_lbann_invocation.py -k 'test_unit_should_work' --exes= +def test_unit_should_work(cluster, dirname, exes): + if isinstance(exes, dict): + exe = exes['gcc7'] + else: + exe = exes + print('TESTING: run lbann with two models, reader, and optimizer; lbann should NOT throw exception\n') + (data_reader_path, model_path, optimizer_path) = get_default_parameters( + dirname) + (output_file_name, error_file_name) = get_file_names(dirname, 'should_work') command = tools.get_command( cluster=cluster, executable=exe, data_reader_path=data_reader_path, data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', exit_after_setup=True, model_path=model_path, - optimizer_path=optimizer_path) + optimizer_path=optimizer_path, + num_processes=1, + output_file_name=output_file_name, + error_file_name=error_file_name) return_code = os.system(command) - assert return_code != 0 + tools.assert_success(return_code, error_file_name) diff --git a/bamboo/unit_tests/test_unit_load_weights_lenet.py b/bamboo/unit_tests/test_unit_load_weights_lenet.py new file mode 100644 index 00000000000..0db1f94bb04 --- /dev/null +++ b/bamboo/unit_tests/test_unit_load_weights_lenet.py @@ -0,0 +1,265 @@ +import os.path +import re +import sys +import math +import numpy as np +import google.protobuf.text_format +import pytest +import glob + +# Local files +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Options +# ============================================== + +# Training options +num_epochs = 4 +num_ckpt_epochs = int(float(num_epochs)/2) +num_restart_epochs = num_epochs - num_ckpt_epochs +mini_batch_size = 64 +num_nodes = 1 +lenet_fraction = 0.01 +random_seed = 20191206 + +test_name_base='test_unit_load_weights_lenet' +checkpoint_dir='ckpt' +save_model_dir='model_weights' + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer(mini_batch_size=mini_batch_size, + random_seed=random_seed) + + # Checkpoint after every epoch + trainer.callbacks = [ + lbann.CallbackCheckpoint( + checkpoint_dir=checkpoint_dir, + checkpoint_epochs=1, + checkpoint_steps=845 + ) + ] + + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.SGD(learn_rate=0.01, momentum=0.9) + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # TODO (tym): Figure out how to switch between LBANN builds. See + # GitHub Issue #1289. + import lbann.models + + # Manually override the global count so that each model is named the same + lbann.models.LeNet.global_count = 0 + # Layer graph + input_ = lbann.Input() + images = lbann.Identity(input_) + labels = lbann.Identity(input_) + x = lbann.models.LeNet(10)(images) + probs = lbann.Softmax(x) + loss = lbann.CrossEntropy(probs, labels) + acc = lbann.CategoricalAccuracy(probs, labels) + + # Make sure all layers are on CPU + for layer in lbann.traverse_layer_graph(input_): + layer.device = 'cpu' + + # Objects for LBANN model + callbacks = [lbann.CallbackPrint(), + lbann.CallbackTimer(), + lbann.CallbackSaveModel(dir=save_model_dir)] + metrics = [lbann.Metric(acc, name='accuracy', unit='%')] + + # Construct model + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(input_), + objective_function=loss, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # TODO (tym): Figure out how to switch between LBANN builds. See + # GitHub Issue #1289. + import lbann.contrib.lc.paths + + # Load data readers from prototext + dirname = os.path.dirname + lbann_dir = dirname(dirname(dirname(os.path.realpath(__file__)))) + pb_file = os.path.join(lbann_dir, + 'model_zoo', + 'data_readers', + 'data_reader_mnist.prototext') + message = lbann.lbann_pb2.LbannPB() + with open(pb_file, 'r') as f: + google.protobuf.text_format.Merge(f.read(), message) + message = message.data_reader + + # Set location of MNIST data + for reader in message.reader: + reader.data_filedir = lbann.contrib.lc.paths.mnist_dir() + reader.percent_of_data_to_use = lenet_fraction + + + # Validation set + message.reader[0].validation_percent = 0.1 + + return message + +# ============================================== +# Setup PyTest +# ============================================== + +def create_test_func(test_func): + """Augment test function to cascade multiple tests and parse results. + + `tools.create_tests` creates functions that run an LBANN + experiment. This function creates augmented functions that parse + the log files after LBANN finishes running, e.g. to check metrics + or runtimes. + + Note: The naive approach is to define the augmented test functions + in a loop. However, Python closures are late binding. In other + words, the function would be overwritten every time we define it. + We get around this overwriting problem by defining the augmented + function in the local scope of another function. + + Args: + test_func (function): Test function created by + `tools.create_tests`. 
+ + Returns: + function: Test that can interact with PyTest. + + """ + test_name = test_func.__name__ + + # Define test function + def func(cluster, exes, dirname, weekly): + + # Run LBANN experiment baseline + print('\n################################################################################') + print('Running model halfway ') + print('################################################################################\n') + baseline_test_output = test_func(cluster, exes, dirname) + baseline_training_metrics = tools.collect_metrics_from_log_func(baseline_test_output['stdout_log_file'], 'training epoch [0-9]+ objective function') + baseline_validation_metrics = tools.collect_metrics_from_log_func(baseline_test_output['stdout_log_file'], 'validation objective function') + baseline_test_metrics = tools.collect_metrics_from_log_func(baseline_test_output['stdout_log_file'], 'test objective function') + + # Run LBANN model to checkpoint + print('\n################################################################################') + print('Running model to checkpointed weights') + print('################################################################################\n') + test_func_checkpoint = tools.create_tests( + setup_experiment, + __file__, + test_name_base=test_name_base, + nodes=num_nodes, + work_subdir='reload_weights_from_checkpoint', + lbann_args=['--disable_cuda=True', + '--num_epochs='+str(num_restart_epochs), + '--load_model_weights_dir='+ os.path.join(baseline_test_output['work_dir'], checkpoint_dir, 'trainer0')], + ) + + checkpoint_test_output = test_func_checkpoint[0](cluster, exes, dirname) + checkpoint_training_metrics = tools.collect_metrics_from_log_func(checkpoint_test_output['stdout_log_file'], 'training epoch [0-9]+ objective function') + checkpoint_validation_metrics = tools.collect_metrics_from_log_func(checkpoint_test_output['stdout_log_file'], 'validation objective function') + checkpoint_test_metrics = tools.collect_metrics_from_log_func(checkpoint_test_output['stdout_log_file'], 'test objective function') + + print('\n################################################################################') + print('Running model from save_model weights') + print('################################################################################\n') + test_func_restart = tools.create_tests( + setup_experiment, + __file__, + test_name_base=test_name_base, + nodes=num_nodes, + work_subdir='reload_weights_from_save_model_cb', + lbann_args=['--disable_cuda=True', + '--num_epochs='+str(num_restart_epochs), + '--load_model_weights_dir='+ os.path.join(baseline_test_output['work_dir'], save_model_dir, 'trainer0', 'model0/'), + '--load_model_weights_dir_is_complete=True'], + ) + + # Restart LBANN model and run to completion + restart_test_output = test_func_restart[0](cluster, exes, dirname) + restart_training_metrics = tools.collect_metrics_from_log_func(restart_test_output['stdout_log_file'], 'training epoch [0-9]+ objective function') + restart_validation_metrics = tools.collect_metrics_from_log_func(restart_test_output['stdout_log_file'], 'validation objective function') + restart_test_metrics = tools.collect_metrics_from_log_func(restart_test_output['stdout_log_file'], 'test objective function') + + print('\n################################################################################') + print('Comparing results of models') + print('################################################################################\n') + + # Check if metrics are same in baseline and test 
experiments + # Note: "Print statistics" callback will print up to 6 digits + # of metric values. + + # Comparing training objective functions + tools.compare_metrics(checkpoint_training_metrics, restart_training_metrics) + # Comparing validation objective functions + tools.compare_metrics(checkpoint_validation_metrics, restart_validation_metrics) + # Comparing test objective functions + tools.compare_metrics(checkpoint_test_metrics, restart_test_metrics) + + baseline_ckpt=os.path.join(baseline_test_output['work_dir'], checkpoint_dir) + checkpoint_ckpt=os.path.join(checkpoint_test_output['work_dir'], checkpoint_dir) + restart_ckpt=os.path.join(restart_test_output['work_dir'], checkpoint_dir) + + err = 0 + err_dirs = '' + fileList = glob.glob('{base}/trainer0/*'.format(base=checkpoint_ckpt)) + fileList, tmp_err, tmp_err_str = tools.multidir_diff(checkpoint_ckpt, restart_ckpt, fileList) + err += tmp_err + err_dirs += tmp_err_str + + err_msg = "\nUnmatched checkpoints:\n" + for f in fileList: + err_msg += f + "\n" + assert len(fileList) == 0, \ + 'Extra checkpoint data in baseline directory: ' + err_msg + assert err == 0, err_dirs + + # Return test function from factory function + func.__name__ = test_name + return func + +# Create test functions that can interact with PyTest +for _test_func in tools.create_tests(setup_experiment, + __file__, + test_name_base=test_name_base, + nodes=num_nodes, + work_subdir='baseline', + lbann_args=['--disable_cuda=True', + ' --num_epochs='+str(num_ckpt_epochs)]): + globals()[_test_func.__name__] = create_test_func(_test_func) diff --git a/bamboo/unit_tests/test_unit_mnist_conv_graph.py b/bamboo/unit_tests/test_unit_mnist_conv_graph.py deleted file mode 100644 index 65a7bd54ad0..00000000000 --- a/bamboo/unit_tests/test_unit_mnist_conv_graph.py +++ /dev/null @@ -1,56 +0,0 @@ -import sys -sys.path.insert(0, '../common_python') -import tools -import pytest -import os - - -def skeleton_mnist_conv_graph(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_mnist_conv_graph: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/mnist_conv_graph_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/mnist_conv_graph_%s_error.txt' % (dir_name, compiler_name) - if compiler_name == 'gcc7': - tl = 240 - else: - tl = None - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], - num_nodes=1, time_limit=tl, num_processes=1, - dir_name=dir_name, - data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', model_folder='tests', - model_name='mnist_conv_graph', - optimizer_name='adam', - output_file_name=output_file_name, - error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 - - -def test_unit_mnist_conv_graph_clang4(cluster, exes, dirname): - skeleton_mnist_conv_graph(cluster, exes, dirname, 'clang4') - - -def test_unit_mnist_conv_graph_gcc4(cluster, exes, dirname): - skeleton_mnist_conv_graph(cluster, exes, dirname, 'gcc4') - - -def test_unit_mnist_conv_graph_gcc7(cluster, exes, dirname): - skeleton_mnist_conv_graph(cluster, exes, dirname, 'gcc7') - - -def test_unit_mnist_conv_graph_intel18(cluster, exes, dirname): - skeleton_mnist_conv_graph(cluster, exes, dirname, 'intel18') - - -# Run with python -m pytest -s test_unit_conv_graph.py -k 'test_unit_mnist_conv_graph_exe' --exe= -def 
test_unit_mnist_conv_graph_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_mnist_conv_graph_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_mnist_conv_graph(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_mnist_ridge_regression.py b/bamboo/unit_tests/test_unit_mnist_ridge_regression.py deleted file mode 100644 index 0d4d3994837..00000000000 --- a/bamboo/unit_tests/test_unit_mnist_ridge_regression.py +++ /dev/null @@ -1,50 +0,0 @@ -import sys -sys.path.insert(0, '../common_python') -import tools -import pytest -import os - - -def skeleton_mnist_ridge_regression(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_mnist_ridge_regression: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/mnist_ridge_regression_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/mnist_ridge_regression_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - num_processes=1, dir_name=dir_name, - data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', - model_folder='tests', model_name='mnist_ridge_regression', - optimizer_name='adam', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 - - -def test_unit_mnist_ridge_regression_clang4(cluster, exes, dirname): - skeleton_mnist_ridge_regression(cluster, exes, dirname, 'clang4') - - -def test_unit_mnist_ridge_regression_gcc4(cluster, exes, dirname): - skeleton_mnist_ridge_regression(cluster, exes, dirname, 'gcc4') - - -def test_unit_mnist_ridge_regression_gcc7(cluster, exes, dirname): - skeleton_mnist_ridge_regression(cluster, exes, dirname, 'gcc7') - - -def test_unit_mnist_ridge_regression_intel18(cluster, exes, dirname): - skeleton_mnist_ridge_regression(cluster, exes, dirname, 'intel18') - - -# Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_mnist_ridge_regression_exe' --exe= -def test_unit_mnist_ridge_regression_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_mnist_ridge_regression_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_mnist_ridge_regression(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py b/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py deleted file mode 100644 index 8718c0e5802..00000000000 --- a/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py +++ /dev/null @@ -1,50 +0,0 @@ -import sys -sys.path.insert(0, '../common_python') -import tools -import pytest -import os - - -def skeleton_mnist_softmax_classifier(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_mnist_softmax_classifier: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/mnist_softmax_classifier_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/mnist_softmax_classifier_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - num_processes=1, dir_name=dir_name, - 
data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', - model_folder='tests', model_name='mnist_softmax_classifier', - optimizer_name='adam', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 - - -def test_unit_mnist_softmax_classifier_clang4(cluster, exes, dirname): - skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'clang4') - - -def test_unit_mnist_softmax_classifier_gcc4(cluster, exes, dirname): - skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'gcc4') - - -def test_unit_mnist_softmax_classifier_gcc7(cluster, exes, dirname): - skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'gcc7') - - -def test_unit_mnist_softmax_classifier_intel18(cluster, exes, dirname): - skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'intel18') - - -# Run with python -m pytest -s test_unit_softmax_classifier.py -k 'test_unit_mnist_softmax_classifier_exe' --exe= -def test_unit_mnist_softmax_classifier_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_mnist_softmax_classifier_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_reconstruction_loss.py b/bamboo/unit_tests/test_unit_reconstruction_loss.py new file mode 100644 index 00000000000..fbdd0125aa6 --- /dev/null +++ b/bamboo/unit_tests/test_unit_reconstruction_loss.py @@ -0,0 +1,72 @@ +import sys +sys.path.insert(0, '../common_python') +import os +import pytest +import tools + + +def skeleton_jag_reconstruction_loss(cluster, executables, dir_name, compiler_name, + weekly, data_reader_percent): + if compiler_name not in executables: + e = 'skeleton_jag_reconstruction_loss: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) + if cluster == 'ray': + e = 'skeleton_jag_reconstruction_loss: dataset does not exist on %s' % cluster + print('Skip - ' + e) + pytest.skip(e) + #if cluster == 'lassen': + #e = 'skeleton_jag_reconstruction_loss: FIXME dataset consistency issues on Lassen' + #print('Skip - ' + e) + #pytest.skip(e) + output_file_name = '%s/bamboo/unit_tests/output/jag_reconstruction_loss_%s_output.txt' % (dir_name, compiler_name) + error_file_name = '%s/bamboo/unit_tests/error/jag_reconstruction_loss_%s_error.txt' % (dir_name, compiler_name) + command = tools.get_command( + cluster=cluster, + executable=executables[compiler_name], + num_nodes=2, + num_processes=32, + disable_cuda=1, + dir_name=dir_name, + data_filedir_train_default='/p/lscratchh/brainusr/datasets/10MJAG/1M_A/100K4trainers', + data_filedir_test_default='/p/lscratchh/brainusr/datasets/10MJAG/1M_A/100K16trainers', + data_reader_name='jag', + data_reader_percent='prototext', + metadata='applications/physics/data/jag_100M_metadata.prototext', + model_folder='tests', + model_name='jag_single_layer_ae', + optimizer_name='adam', + output_file_name=output_file_name, + error_file_name=error_file_name, weekly=weekly) + return_code = os.system(command) + tools.assert_success(return_code, error_file_name) + + +def test_unit_jag_reconstruction_loss_clang6(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_jag_reconstruction_loss(cluster, exes, dirname, 'clang6', + weekly, data_reader_percent) + + +def test_unit_jag_reconstruction_loss_gcc7(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_jag_reconstruction_loss(cluster, exes, 
dirname, 'gcc7', + weekly, data_reader_percent) + + +def test_unit_jag_reconstruction_loss_intel19(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_jag_reconstruction_loss(cluster, exes, dirname, 'intel19', + weekly, data_reader_percent) + + +# Run with python3 -m pytest -s test_unit_ridge_regression.py -k 'test_unit_jag_reconstruction_loss_exe' --exe= +def test_unit_jag_reconstruction_loss_exe(cluster, dirname, exe, + weekly, data_reader_percent): + if exe is None: + e = 'test_unit_jag_reconstruction_loss_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} + skeleton_jag_reconstruction_loss(cluster, exes, dirname, 'exe', + weekly, data_reader_percent) diff --git a/cmake/configure_files/LBANNConfig.cmake.in b/cmake/configure_files/LBANNConfig.cmake.in index 2ac6ed91a9f..6c03819c99b 100644 --- a/cmake/configure_files/LBANNConfig.cmake.in +++ b/cmake/configure_files/LBANNConfig.cmake.in @@ -10,9 +10,11 @@ list(APPEND CMAKE_MODULE_PATH "@EXTRA_CMAKE_MODULE_DIR@") set(LBANN_VERSION ${PACKAGE_VERSION}) +set(LBANN_BUILD_TYPE "@CMAKE_BUILD_TYPE@") + # Record compiler information set(LBANN_CXX_COMPILER "@CMAKE_CXX_COMPILER@") -set(LBANN_CUDA_COMPILER "$@CMAKE_CUDA_COMPILER@") +set(LBANN_CUDA_COMPILER "@CMAKE_CUDA_COMPILER@") set(LBANN_CXX_FLAGS "@CMAKE_CXX_FLAGS@") set(LBANN_CUDA_FLAGS "@CMAKE_CUDA_FLAGS@") @@ -28,6 +30,7 @@ if (CMAKE_CXX_STANDARD LESS LBANN_CXX_STANDARD) endif () set(CMAKE_CXX_STANDARD_REQUIRED TRUE) + # Record the various flags and switches accumlated in LBANN set(LBANN_ALUMINUM_MPI_PASSTHROUGH @LBANN_ALUMINUM_MPI_PASSTHROUGH@) set(LBANN_BUILT_WITH_SPECTRUM @LBANN_BUILT_WITH_SPECTRUM@) @@ -37,7 +40,7 @@ set(LBANN_GNU_LINUX @LBANN_GNU_LINUX@) set(LBANN_HAS_ALUMINUM @LBANN_HAS_ALUMINUM@) set(LBANN_HAS_CEREAL @LBANN_HAS_CEREAL@) set(LBANN_HAS_CNPY @LBANN_HAS_CNPY@) -set(LBANN_HAS_CONDUIT @LBANN_HAS_CONDUIT@) +set(LBANN_HAS_CONDUIT @LBANN_WITH_CONDUIT@) set(LBANN_HAS_CUDA @LBANN_HAS_CUDA@) set(LBANN_HAS_CUDNN @LBANN_HAS_CUDNN@) set(LBANN_HAS_DOXYGEN @LBANN_HAS_DOXYGEN@) @@ -46,28 +49,38 @@ set(LBANN_HAS_LBANN_PROTO @LBANN_HAS_LBANN_PROTO@) set(LBANN_HAS_OPENCV @LBANN_HAS_OPENCV@) set(LBANN_HAS_NCCL2 @LBANN_HAS_NCCL2@) set(LBANN_HAS_PROTOBUF @LBANN_HAS_PROTOBUF@) +set(LBANN_HAS_PYTHON @LBANN_HAS_PYTHON@) set(LBANN_HAS_TBINF @LBANN_HAS_TBINF@) set(LBANN_HAS_VTUNE @LBANN_HAS_VTUNE@) -set(LBANN_NO_OMP_FOR_DATA_READERS @LBANN_NO_OMP_FOR_DATA_READERS@) set(LBANN_NVPROF @LBANN_NVPROF@) -set(LBANN_SEQUENTIAL_INITIALIZATION @LBANN_SEQUENTIAL_INITIALIZAION@) set(LBANN_TOPO_AWARE @LBANN_TOPO_AWARE@) # Setup dependencies +find_package(Threads REQUIRED) -# First, CEREAL. if (LBANN_HAS_CEREAL) - find_package(CEREAL NO_MODULE + find_package(CEREAL NO_MODULE QUIET HINTS ${CEREAL_DIR} $ENV{CEREAL_DIR} PATH_SUFFIXES share/cmake/cereal NO_DEFAULT_PATH) if (NOT CEREAL_FOUND) - find_package(CEREAL NO_MODULE) + find_package(CEREAL NO_MODULE QUIET) endif () if (NOT CEREAL_FOUND AND NOT CEREAL_DIR) set(CEREAL_DIR "@CEREAL_DIR@") find_package(CEREAL NO_MODULE REQUIRED) endif () + if (NOT CEREAL_FOUND) + message(FATAL_ERROR "Required dependency CEREAL not found.") + endif () +endif () + +if (NOT HWLOC_DIR) + set(HWLOC_DIR "@HWLOC_DIR@") +endif () +if (LBANN_TOPO_AWARE) + find_package(HWLOC REQUIRED) + set(LBANN_TOPO_AWARE ${HWLOC_FOUND}) endif () # Next, Hydrogen. 
We can probably inherit Aluminum-ness from @@ -192,6 +205,102 @@ if (LBANN_HAS_CUDA) include(SetupCUDAToolkit) endif (LBANN_HAS_CUDA) +set(_LBANN_CONDUIT_DIR "@Conduit_DIR@") +set(_LBANN_HDF5_DIR "@HDF5_DIR@") +if (LBANN_HAS_CONDUIT) + # Apparently we have to find HDF5, too. + find_package(HDF5 CONFIG QUIET + HINTS ${HDF5_DIR} $ENV{HDF5_DIR} ${_LBANN_HDF5_DIR} + PATH_SUFFIXES share/cmake/hdf5 + NO_DEFAULT_PATH) + if (NOT HDF5_FOUND) + find_package(HDF5 CONFIG QUIET) + endif () + if (NOT HDF5_FOUND) + enable_language(C) # WHY?????????????? + find_package(HDF5 REQUIRED) + set(HDF5_FOUND_WITH_MODULE TRUE) + else () + message(STATUS "Found HDF5: ${HDF5_DIR}") + endif () + + find_package(Conduit CONFIG QUIET + HINTS ${Conduit_DIR} $ENV{Conduit_DIR} + ${CONDUIT_DIR} $ENV{CONDUIT_DIR} + ${_LBANN_CONDUIT_DIR} + PATH_SUFFIXES lib64/cmake lib/cmake + NO_DEFAULT_PATH) + if (NOT Conduit_FOUND) + find_package(Conduit CONFIG REQUIRED + PATH_SUFFIXES lib64/cmake lib/cmake) + endif () + message(STATUS "Found CONDUIT: ${Conduit_DIR}") + + # Ugh. I don't like that this requires intimate knowledge of + # specific targets that CONDUIT exports. It should support + # components. + if (NOT TARGET conduit_relay_mpi) + message(FATAL_ERROR "CONDUIT does not have proper MPI support.") + endif () + + if (NOT TARGET conduit OR NOT TARGET conduit_relay + OR NOT TARGET conduit_blueprint) + message(FATAL_ERROR "Missing some CONDUIT required library.") + endif () + + if (NOT TARGET conduit::conduit) + add_library(conduit::conduit INTERFACE IMPORTED) + endif () + + set(_conduit_interface_link_libs + "conduit;conduit_relay;conduit_relay_mpi;conduit_blueprint") + + # Remove -pthread from linkage, if found + foreach (_lib IN LISTS _conduit_interface_link_libs) + if (TARGET ${_lib}) + get_property(_tmp_interface_link_libs TARGET ${_lib} + PROPERTY INTERFACE_LINK_LIBRARIES) + + list(FIND _tmp_interface_link_libs "-pthread" _pthread_idx) + if (_pthread_idx GREATER_EQUAL 0) + list(REMOVE_AT _tmp_interface_link_libs ${_pthread_idx}) + + set_property(TARGET ${_lib} PROPERTY + INTERFACE_LINK_LIBRARIES ${_tmp_interface_link_libs}) + endif () + + get_property(_tmp_interface_compile_opts TARGET ${_lib} + PROPERTY INTERFACE_COMPILE_OPTIONS) + set_property(TARGET ${_lib} + PROPERTY INTERFACE_COMPILE_OPTIONS + $<$:${_tmp_interface_compile_opts}>) + endif () + endforeach () + + get_filename_component(_conduit_include_dirs + "${CONDUIT_INCLUDE_DIRS}" DIRECTORY) + + if (HDF5_FOUND_WITH_MODULE) + list(APPEND _conduit_interface_link_libs + ${HDF5_LIBRARIES}) + + list(APPEND _conduit_include_dirs + "${HDF5_INCLUDE_DIRS}") + endif () + + set_property(TARGET conduit::conduit + PROPERTY + INTERFACE_INCLUDE_DIRECTORIES + "${_conduit_include_dirs}") + + set_target_properties(conduit::conduit + PROPERTIES + INTERFACE_LINK_LIBRARIES + "${_conduit_interface_link_libs}") + + set(CONDUIT_LIBRARIES conduit::conduit) +endif (LBANN_HAS_CONDUIT) + @PACKAGE_INIT@ # Now actually import the LBANN target diff --git a/cmake/configure_files/lbann_config.hpp.in b/cmake/configure_files/lbann_config.hpp.in index 76b50bc920c..c011e492454 100644 --- a/cmake/configure_files/lbann_config.hpp.in +++ b/cmake/configure_files/lbann_config.hpp.in @@ -24,19 +24,27 @@ #cmakedefine LBANN_GNU_LINUX #cmakedefine LBANN_HAS_CEREAL +#cmakedefine LBANN_HAS_DIHYDROGEN #cmakedefine LBANN_HAS_OPENCV #cmakedefine LBANN_HAS_TBINF #cmakedefine LBANN_HAS_CNPY #cmakedefine LBANN_HAS_VTUNE #cmakedefine LBANN_HAS_ALUMINUM #cmakedefine LBANN_ALUMINUM_MPI_PASSTHROUGH -#cmakedefine 
LBANN_HAS_CONDUIT #cmakedefine LBANN_HAS_PYTHON +#cmakedefine LBANN_HAS_SHMEM +#cmakedefine LBANN_HAS_LARGESCALE_NODE2VEC #cmakedefine LBANN_DETERMINISTIC #cmakedefine LBANN_HAS_CUDA #cmakedefine LBANN_HAS_CUDNN +#ifdef LBANN_HAS_CUDA +#cmakedefine LBANN_HAS_NVSHMEM +#endif + +#cmakedefine LBANN_HAS_HALF +#cmakedefine LBANN_HAS_GPU_FP16 #cmakedefine LBANN_VTUNE #cmakedefine LBANN_NVPROF @@ -46,6 +54,11 @@ #cmakedefine LBANN_HAS_STD_ANY #cmakedefine LBANN_HAS_STD_MAKE_UNIQUE +// API support for non-portable pthread functionality. +#cmakedefine LBANN_HAS_PTHREAD_AFFINITY_SUPPORT + +#cmakedefine LBANN_HAS_DISTCONV + // Define the LBANN datatype namespace lbann { diff --git a/cmake/configure_files/lbann_module.lua.in b/cmake/configure_files/lbann_module.lua.in index 754d2c6106d..e6ea77453ae 100644 --- a/cmake/configure_files/lbann_module.lua.in +++ b/cmake/configure_files/lbann_module.lua.in @@ -22,7 +22,6 @@ -- LBANN_HAS_DOXYGEN: @LBANN_HAS_DOXYGEN@ -- LBANN_HAS_LBANN_PROTO: @LBANN_HAS_LBANN_PROTO@ -- LBANN_HAS_ALUMINUM: @LBANN_HAS_ALUMINUM@ --- LBANN_HAS_CONDUIT: @LBANN_HAS_CONDUIT@ -- LBANN_HAS_PYTHON: @LBANN_HAS_PYTHON@ help( @@ -58,7 +57,6 @@ whatis("LBANN_NVPROF: @LBANN_NVPROF@") whatis("LBANN_HAS_DOXYGEN: @LBANN_HAS_DOXYGEN@") whatis("LBANN_HAS_LBANN_PROTO: @LBANN_HAS_LBANN_PROTO@") whatis("LBANN_HAS_ALUMINUM: @LBANN_HAS_ALUMINUM@") -whatis("LBANN_HAS_CONDUIT: @LBANN_HAS_CONDUIT@") whatis("LBANN_HAS_PYTHON: @LBANN_HAS_PYTHON@") prepend_path("PATH","@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_BINDIR@") diff --git a/cmake/configure_files/lbann_module.tcl.in b/cmake/configure_files/lbann_module.tcl.in new file mode 100644 index 00000000000..3ec52a66ce6 --- /dev/null +++ b/cmake/configure_files/lbann_module.tcl.in @@ -0,0 +1,57 @@ +#%Module + +# Lua (and hence LMod) should be preferred, but this will +# satisfy... less modern system needs. + +set name lbann +set version @LBANN_VERSION@ +set root @CMAKE_INSTALL_PREFIX@ + +conflict $name + +set fullname LBANN +set url https://github.com/llnl/lbann +set docs https://lbann.readthedocs.io + +set description "LBANN: Livermore Big Artificial Neural Network Toolkit." + +proc ModulesHelp { } { + global description url docs + puts stderr "Description - $description" + puts stderr + puts stderr "Docs - $url" +} + +module-whatis "Package: LBANN +Version: @LBANN_VERSION@ +Description: Livermore Big Artificial Neural Network Toolkit. + A distributed memory, HPC-optimized, model and data parallel + training toolkit for deep neural networks. 
+URL: https://github.com/llnl/lbann +Configuration: + CMAKE_INSTALL_PREFIX: @CMAKE_INSTALL_PREFIX@ + CMAKE_BUILD_TYPE: @CMAKE_BUILD_TYPE@ + CXX Compiler: @CMAKE_CXX_COMPILER@ + CXX FLAGS: @CMAKE_CXX_FLAGS@ + CXX FLAGS_DEBUG: @CMAKE_CXX_FLAGS_DEBUG@ + CXX FLAGS_RELWITHDEBINFO: @CMAKE_CXX_FLAGS_RELWITHDEBINFO@ + CXX FLAGS_RELEASE: @CMAKE_CXX_FLAGS_RELEASE@ + LBANN_GNU_LINUX: @LBANN_GNU_LINUX@ + LBANN_HAS_HYDROGEN: @LBANN_HAS_HYDROGEN@ + LBANN_HAS_OPENCV: @LBANN_HAS_OPENCV@ + LBANN_HAS_CEREAL: @LBANN_HAS_CEREAL@ + LBANN_HAS_CUDA: @LBANN_HAS_CUDA@ + LBANN_HAS_CUDNN: @LBANN_HAS_CUDNN@ + LBANN_HAS_NCCL2: @LBANN_HAS_NCCL2@ + LBANN_HAS_PROTOBUF: @LBANN_HAS_PROTOBUF@ + LBANN_HAS_CNPY: @LBANN_HAS_CNPY@ + LBANN_HAS_TBINF: @LBANN_HAS_TBINF@ + LBANN_HAS_VTUNE: @LBANN_HAS_VTUNE@ + LBANN_NVPROF: @LBANN_NVPROF@ + LBANN_HAS_DOXYGEN: @LBANN_HAS_DOXYGEN@ + LBANN_HAS_LBANN_PROTO: @LBANN_HAS_LBANN_PROTO@ + LBANN_HAS_ALUMINUM: @LBANN_HAS_ALUMINUM@ + LBANN_HAS_PYTHON: @LBANN_HAS_PYTHON@" + +prepend-path PATH $root/@CMAKE_INSTALL_BINDIR@ +prepend-path PYTHONPATH @PYTHON_INSTALL_PREFIX@/@CMAKE_INSTALL_PYTHONDIR@ diff --git a/cmake/configure_files/setup.py.in b/cmake/configure_files/setup.py.in index bd6dae0516b..c56a7df3a4b 100644 --- a/cmake/configure_files/setup.py.in +++ b/cmake/configure_files/setup.py.in @@ -9,9 +9,9 @@ config_file = '@_PYTHON_CONFIG_INI@' # Get relative paths # Note: setuptools does not accept absolute paths -current_dir = os.path.dirname(os.path.abspath(__file__)) -src_dir = os.path.relpath(os.path.abspath(src_dir), current_dir) -config_file = os.path.relpath(os.path.abspath(config_file), current_dir) +current_dir = os.path.dirname(os.path.realpath(__file__)) +src_dir = os.path.relpath(os.path.realpath(src_dir), current_dir) +config_file = os.path.relpath(os.path.realpath(config_file), current_dir) # Setup package setuptools.setup( @@ -24,8 +24,7 @@ setuptools.setup( packages=setuptools.find_packages(src_dir), package_dir={'': src_dir}, data_files=[('lbann', [config_file])], - install_requires=['graphviz>=0.10.1', - 'matplotlib>=2.0.2', + install_requires=['matplotlib>=2.0.2', 'numpy>=1.16.0', 'onnx>=1.3.0', 'pandas>=0.24.1', diff --git a/cmake/modules/FindBreathe.cmake b/cmake/modules/FindBreathe.cmake index c1f2d2c5fa2..36f9499c1b8 100644 --- a/cmake/modules/FindBreathe.cmake +++ b/cmake/modules/FindBreathe.cmake @@ -10,7 +10,7 @@ find_program(BREATHE_EXECUTABLE breathe-apidoc PATH_SUFFIXES bin DOC "The breathe documentation tool." 
NO_DEFAULT_PATH) -find_program(BREATHE_EXECUTABLE breathe-build) +find_program(BREATHE_EXECUTABLE breathe-apidoc) # Standard handling of the package arguments include(FindPackageHandleStandardArgs) diff --git a/cmake/modules/FindClara.cmake b/cmake/modules/FindClara.cmake new file mode 100644 index 00000000000..ff2f02cafd3 --- /dev/null +++ b/cmake/modules/FindClara.cmake @@ -0,0 +1,34 @@ +# Output variables +# +# Clara_FOUND +# Clara_LIBRARIES +# Clara_INCLUDE_PATH +# +# Also creates an imported target clara::clara + +# Find the header +find_path(CLARA_INCLUDE_PATH clara.hpp + HINTS ${CLARA_DIR} $ENV{CLARA_DIR} ${Clara_DIR} $ENV{Clara_DIR} + PATH_SUFFIXES include + NO_DEFAULT_PATH) +find_path(CLARA_INCLUDE_PATH clara.hpp) + +# Handle the find_package arguments +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args( + Clara DEFAULT_MSG CLARA_INCLUDE_PATH) + +# Build the imported target +if (NOT TARGET clara::clara) + add_library(clara::clara INTERFACE IMPORTED) +endif() + +set_property(TARGET clara::clara + PROPERTY INTERFACE_INCLUDE_DIRECTORIES + ${CLARA_INCLUDE_PATH}) + +# Set the last of the output variables +set(CLARA_LIBRARIES clara::clara) + +# Cleanup +mark_as_advanced(FORCE CLARA_INCLUDE_PATH) diff --git a/cmake/modules/FindNVSHMEM.cmake b/cmake/modules/FindNVSHMEM.cmake new file mode 100644 index 00000000000..b711c48e55e --- /dev/null +++ b/cmake/modules/FindNVSHMEM.cmake @@ -0,0 +1,46 @@ +# Output variables +# +# NVSHMEM_FOUND +# NVSHMEM_LIBRARY +# NVSHMEM_INCLUDE_DIRS +# +# Also creates an imported target NVSHMEM::NVSHMEM + +# Find the library +find_library(NVSHMEM_LIBRARY nvshmem + HINTS ${NVSHMEM_DIR} $ENV{NVSHMEM_DIR} + PATH_SUFFIXES lib lib64 + NO_DEFAULT_PATH + DOC "The location of NVSHMEM library.") +find_library(NVSHMEM_LIBRARY nvshmem) + +# Find the header +find_path(NVSHMEM_INCLUDE_DIRS nvshmem.h + HINTS ${NVSHMEM_DIR} $ENV{NVSHMEM_DIR} + PATH_SUFFIXES include + NO_DEFAULT_PATH + DOC "The location of NVSHMEM headers.") +find_path(NVSHMEM_INCLUDE_DIRS nvshmemx.h) + +# Handle the find_package arguments +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args( + NVSHMEM DEFAULT_MSG NVSHMEM_LIBRARY NVSHMEM_INCLUDE_DIRS) + +# Build the imported target +if (NOT TARGET NVSHMEM::NVSHMEM) + add_library(NVSHMEM::NVSHMEM INTERFACE IMPORTED) + set_property(TARGET NVSHMEM::NVSHMEM PROPERTY + INTERFACE_LINK_LIBRARIES ${NVSHMEM_LIBRARY}) + set_property(TARGET NVSHMEM::NVSHMEM PROPERTY + INTERFACE_INCLUDE_DIRECTORIES ${NVSHMEM_INCLUDE_DIRS}) +endif () + +if (NVSHMEM_FOUND) + # Workaround for separable compilation with cooperative threading. see + # https://stackoverflow.com/questions/53492528/cooperative-groupsthis-grid-causes-any-cuda-api-call-to-return-unknown-erro. + # Adding this to INTERFACE_COMPILE_OPTIONS does not seem to solve the problem. + # It seems that CMake does not add necessary options for device linking when cuda_add_executable/library is NOT used. 
See also + # https://github.com/dealii/dealii/pull/5405 + string(APPEND CMAKE_CUDA_FLAGS " -gencode=arch=compute_70,code=compute_70") +endif () diff --git a/cmake/modules/FindPython.cmake b/cmake/modules/FindPython.cmake index 62c7945174f..39d5430461e 100644 --- a/cmake/modules/FindPython.cmake +++ b/cmake/modules/FindPython.cmake @@ -64,13 +64,19 @@ execute_process( COMMAND "${Python_EXECUTABLE}" "-c" "import sys; from distutils.sysconfig import get_config_var; sys.stdout.write(get_config_var('LIBDIR'))" OUTPUT_VARIABLE _LIB_DIR) -if (BUILD_SHARED_LIBS) - set(_GLOB_EXPR "${_LIB_DIR}/libpython*${CMAKE_SHARED_LIBRARY_SUFFIX}") -ELSE (BUILD_SHARED_LIBS) - set(_GLOB_EXPR "${_LIB_DIR}/libpython*${CMAKE_STATIC_LIBRARY_SUFFIX}") -endif (BUILD_SHARED_LIBS) -FILE(GLOB _GLOB_RESULT "${_GLOB_EXPR}") -get_filename_component(Python_LIBRARIES "${_GLOB_RESULT}" ABSOLUTE) + +set(_PY_MAJ_MIN_VERSION "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}") +find_library(Python_LIBRARY + NAMES python python${_PY_MAJ_MIN_VERSION}m python${_PY_MAJ_MIN_VERSION} + python${Python_VERSION_MAJOR}m python${Python_VERSION_MAJOR} + HINTS ${_LIB_DIR} + DOC "The python${Python_VERSION_MAJOR} library." + NO_DEFAULT_PATH) +if (NOT Python_LIBRARY) + message(FATAL_ERROR "Could not find Python library for version " + "${_PY_MAJ_MIN_VERSION} in directory: ${_LIB_DIR}") +endif () +set(Python_LIBRARIES "${Python_LIBRARY}") # Handle the find_package arguments include(FindPackageHandleStandardArgs) diff --git a/docs/RSTDocsFlavorText.py b/docs/RSTDocsFlavorText.py index 2f6b5969f08..bfdc1caf249 100644 --- a/docs/RSTDocsFlavorText.py +++ b/docs/RSTDocsFlavorText.py @@ -9,6 +9,7 @@ 'callbacks' : 'Callback Interface', 'data_readers' : 'Data Readers Interface', 'data_store' : 'Data Store Interface', + 'execution_contexts' : 'Execution Context Interface', 'layers' : 'Layer Interface', 'layers/activations' : 'Activation Layers', 'layers/image' : 'Image Layers', @@ -26,6 +27,9 @@ 'objective_functions/weight_regularization' : 'Objective Functions for Weight Regularization', 'optimizers' : 'Optimizer Interface', 'proto' : 'Protobuf and Front-End Utilities', + 'trainers' : 'Trainer Interface', + 'training_algorithms' : 'Training Algorithm Interface', + 'transforms' : 'Transform Interface', 'utils' : 'General Utilities', 'utils/threads' : 'Multithreading Utilities', 'weights' : 'Weights Interface' @@ -33,10 +37,14 @@ lbann_rst_flavor_text = { '.' : ''' -Welcome to the LBANN developers' documentation. The documentation is -laid out following a similar structure to the source code to aid in -navigation. - ''', +The LBANN API documentation is almost entirely generated by `Doxygen +`_. We encourage developers to view the +`Doxygen-generated documentation +<../_static/doxygen/html/index.html>`_. The API documentation is largely +reproduced here (using `Breathe +`_) for those who prefer the +Sphinx/RTD style. It is laid out following a similar structure to the +source code to aid in navigation.''', 'callbacks' : ''' Callbacks give users information about their model as it is trained. @@ -52,6 +60,21 @@ The data store provides in-memory caching of the data set and inter-epoch data shuffling.''', + 'execution_contexts' : ''' +When a model is attached to a trainer, the execution context of the +training algorithm is stored in an `execution_context` (or sub-class) +object per execution mode. Thus there is one execution context per +model and mode that contains all of the state with respect to the +training algorithm being applied to the model. 
+ +For example it tracks the current: + +* step +* execution mode +* epoch +* and a pointer back to the trainer. +''', + 'layers' : ''' LBANN models are defined in model prototext files. The bulk of these defintions will be the series of layers which make up the model @@ -103,6 +126,31 @@ python front end of LBANN will emit a network description in the protobuf format that is ingested at runtime.''', + 'trainers' : ''' +A trainer is a collection of compute resources and defines an explicit +communication domain. It manages the execution for both the training +and inference of a trained model. Once constructed, a trainer owns an +`lbann_comm` object that defines both intra- and inter-trainer +communication domains. Additionally, a trainer will contain an I/O +thread pool that is used to fetch and preprocess data that will be +provided to the trainer's models. + +A trainer owns: + +* `lbann_comm` object, +* I/O thread pool, +* One or more models, and +* Execution context for each model. + +In the future, it will also contain the data readers. +''', + + 'training_algorithms' : ''' +The training algorithm defines the optimization that is to be +applied to the model(s) being trained. Additionally, it can +specify how to evaluate the model. +''', + 'utils' : 'Utility classes and functions.', 'utils/threads' : 'TODO: Something about utils/threads', diff --git a/docs/SourceTreeDoxyfile b/docs/SourceTreeDoxyfile index eb38cd65aa7..8fcbae615bd 100644 --- a/docs/SourceTreeDoxyfile +++ b/docs/SourceTreeDoxyfile @@ -58,7 +58,7 @@ PROJECT_LOGO = # entered, it will be relative to the location where doxygen was started. If # left blank the current directory will be used. -OUTPUT_DIRECTORY = doxy_out +OUTPUT_DIRECTORY = _static/doxygen # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- # directories (in 2 levels) under the output directory of each output format and @@ -763,7 +763,8 @@ WARN_LOGFILE = INPUT = ../README.md \ ../docs \ ../src \ - ../include + ../include \ + ../unit_test # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses @@ -1359,7 +1360,7 @@ ECLIPSE_DOC_ID = org.doxygen.Project # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. -DISABLE_INDEX = YES +DISABLE_INDEX = NO # The GENERATE_TREEVIEW tag is used to specify whether a tree-like index # structure should be generated to display hierarchical information. If the tag @@ -1564,7 +1565,7 @@ EXTRA_SEARCH_MAPPINGS = # If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output. # The default value is: YES. -GENERATE_LATEX = YES +GENERATE_LATEX = NO # The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of @@ -1616,7 +1617,7 @@ PAPER_TYPE = a4wide # If left blank no extra packages will be included. # This tag requires that the tag GENERATE_LATEX is set to YES. -EXTRA_PACKAGES = amsmath +EXTRA_PACKAGES = amsmath, amssymb, amsfonts, latexsym # The LATEX_HEADER tag can be used to specify a personal LaTeX header for the # generated LaTeX document. The header should contain everything until the first @@ -1841,7 +1842,7 @@ GENERATE_XML = YES # The default directory is: xml. # This tag requires that the tag GENERATE_XML is set to YES. 
-XML_OUTPUT = xml +XML_OUTPUT = ../../doxy_out/xml # If the XML_PROGRAMLISTING tag is set to YES, doxygen will dump the program # listings (including syntax highlighting and cross-referencing information) to diff --git a/docs/build_osx.rst b/docs/build_osx.rst index 3e5ce179a09..753bfeef4b1 100644 --- a/docs/build_osx.rst +++ b/docs/build_osx.rst @@ -5,25 +5,16 @@ Building LBANN on OS X ========================= -.. warning:: This section is still under development and being - tested. It contains known issues. This warning will be - removed when it is believed to be generally usable. +.. warning:: If using OSX 10.14 or newer, be sure that + :bash:`/usr/include` has been restored. In version 10.14, + this may be accomplished by installing + :bash:`/Library/Developer/CommandLineTools/Packages/macOS_SDK_headers_for_macOS_10.14.pkg`. + If this package is not available, it's possible command + line tools have not been installed; do so by executing + :bash:`xcode-select --install`. --------------------- -Getting Started --------------------- - -.. _osx-setup-spack: - -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Setup Spack and local base tools -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -To get started follow the general directions on building LBANN to -`setup spack -`_. - +.. _osx-basic-setup: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Setup Homebrew @@ -31,104 +22,41 @@ Setup Homebrew .. note:: Setting up Homebrew only needs to be done once per system,. -1. Download and install `Homebrew `_. Setup base - development packages. Note that at the moment we use brew to - install llvm, open-mpi, scalapack, and cmake. - - .. code-block:: bash +Download and install `Homebrew `_. Setup base +development packages. Note that at the moment we use brew to install +LLVM, Open-MPI, ScaLAPACK, and CMake. - brew install llvm - brew install open-mpi - brew install scalapack - brew install cmake +.. code-block:: bash - Put the brew based clang in your path: + brew install llvm + brew install open-mpi + brew install cmake + brew install hwloc - .. code-block:: bash +Put the brew-based :code:`clang` in your path: - export PATH="/usr/local/opt/llvm/bin:$PATH"; +.. code-block:: bash - Install lmmod so that we can use modules to put spack built - packages into your path. + export PATH=/usr/local/opt/llvm/bin:$PATH; - .. code-block:: bash +Install :code:`lmod` so that we can use modules to put Spack-built +packages into your path: - brew install lmod - brew install luarocks +.. code-block:: bash - Update your .profile to enable use of modules via lmod + brew install lmod + brew install luarocks - .. code-block:: bash +Update your shell configuration files to enable use of modules via +:code:`lmod`: - source $(brew --prefix lmod)/init/$(basename $SHELL) +.. code-block:: bash -.. _osx-build-install-as-developer: + source $(brew --prefix lmod)/init/$(basename $SHELL) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Building & Installing LBANN as a developer +Building & Installing LBANN ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -1. Establish a Spack environment and install software dependencies. - - .. note:: This spack environment has to be setup once each time - you create a new build directory. - - .. code-block:: bash - - export LBANN_HOME=/path/to/lbann/git/repo - export LBANN_BUILD_DIR=/path/to/a/build/directory - export LBANN_INSTALL_DIR=/path/to/an/install/directory - cd ${LBANN_BUILD_DIR} - spack env create -d . 
${LBANN_HOME}/spack_environments/developer_release_osx_spack.yaml - spack install - spack env loads # Spack creates a file named loads that has all of the correct modules - source loads - unset LIBRARY_PATH - - -2. Build LBANN locally from source and build Hydrogen and Aluminum - using the superbuild. See :ref:`here ` - for a list and descriptions of all CMake flags known to LBANN's - "Superbuild" build system. A representative CMake command line - that expects :bash:`LBANN_HOME`, :bash:`LBANN_BUILD_DIR`, - :bash:`LBANN_INSTALL_DIR` environment variables might be: - - .. code-block:: console - - cd ${LBANN_BUILD_DIR} - cmake \ - -G Ninja \ - -D CMAKE_BUILD_TYPE:STRING=Release \ - -D CMAKE_INSTALL_PREFIX:PATH=${LBANN_INSTALL_DIR} \ - \ - -D LBANN_SB_BUILD_ALUMINUM=ON \ - -D ALUMINUM_ENABLE_MPI_CUDA=OFF \ - -D ALUMINUM_ENABLE_NCCL=OFF \ - \ - -D LBANN_SB_BUILD_HYDROGEN=ON \ - -D Hydrogen_ENABLE_ALUMINUM=ON \ - -D Hydrogen_ENABLE_CUB=OFF \ - -D Hydrogen_ENABLE_CUDA=OFF \ - \ - -D LBANN_SB_BUILD_LBANN=ON \ - -D LBANN_DATATYPE:STRING=float \ - -D LBANN_SEQUENTIAL_INITIALIZATION:BOOL=OFF \ - -D LBANN_WITH_ALUMINUM:BOOL=ON \ - -D LBANN_WITH_CONDUIT:BOOL=ON \ - -D LBANN_WITH_CUDA:BOOL=OFF \ - -D LBANN_WITH_CUDNN:BOOL=OFF \ - -D LBANN_WITH_NCCL:BOOL=OFF \ - -D LBANN_WITH_NVPROF:BOOL=OFF \ - -D LBANN_WITH_SOFTMAX_CUDA:BOOL=OFF \ - -D LBANN_WITH_TOPO_AWARE:BOOL=ON \ - -D LBANN_WITH_TBINF=OFF \ - -D LBANN_WITH_VTUNE:BOOL=OFF \ - \ - -D CMAKE_CXX_COMPILER=$(which clang) \ - -D CMAKE_C_COMPILER=$(which clang) \ - -D LBANN_SB_FWD_ALUMINUM_OpenMP_CXX_LIB_NAMES=omp \ - -D LBANN_SB_FWD_ALUMINUM_OpenMP_CXX_FLAGS=-fopenmp \ - -D LBANN_SB_FWD_ALUMINUM_OpenMP_omp_LIBRARY=/usr/local/opt/llvm/lib/libomp.dylib \ - ${LBANN_HOME}/superbuild - - ninja +From this point, follow the instructions for :ref:`building LBANN with +Spack `. diff --git a/docs/build_with_cmake.rst b/docs/build_with_cmake.rst index f6f49ebd305..d694e8dd11c 100644 --- a/docs/build_with_cmake.rst +++ b/docs/build_with_cmake.rst @@ -8,7 +8,7 @@ Building LBANN with `CMake `_ ================================================== LBANN uses `CMake `_ for its build system and a -version newer than or equal to 3.9.0 is required. LBANN development is +version newer than or equal to 3.12.0 is required. LBANN development is done primarily on UNIX-based platforms. As such, the build is tested regularly on Linux-based machines, occasionally on OSX, and never on Windows machines. @@ -22,6 +22,87 @@ is missing, please `open an issue `_. It is required that LBANN be built out-of-source. That is, CMake must not be invoked in a directory containing a CMakeLists. +-------------------- +Dependencies +-------------------- + +The following packages and tools are required to build LBANN. All +packages listed below may be installed using `Spack +`_. See :ref:`the Spack installation +instructions ` for more details on using Spack to +build a complete LBANN environment. + +The following basic tools are **required**. + ++ A C++11-compliant compiler. + ++ OpenMP, version 3.0 or newer. + ++ An MPI-3.0 implementation. + ++ `CEREAL `_ is used to handle + complex serialization tasks. + ++ `CMake `_, version 3.12 or newer. + +The following LLNL-maintained packages are **required**. + ++ `Hydrogen `_ is a fork of the + `Elemental `_ distributed + dense linear-algebra library and it may be installed via + `Spack `_ using the package name + "hydrogen". If CUDA support is enabled in Hydrogen, LBANN will + inherit this support. + +The following third-party packages are **required**. 
+ ++ `CNPY `_ is used to ingest data + in NumPy format. In principle this should be optional, but at time + of writing, LBANN will not build without it. + ++ `OpenCV `_ is used to preprocess + image data. For performance reasons, it is recommend to build OpenCV + with `JPEG-turbo `_ + for JPEG format support. + ++ `ProtoBuf `_ is used to + express models in a portable format. + +The following LLNL-maintained packages are **optional**. + ++ `Aluminum `_ is a + communication library optimized for machine learning and interaction + with GPUs. We cannot recommend its use strongly enough. It can be + built using `Spack `_. + ++ `CONDUIT `_ is used to ingest + structured data produced by scientific simulations. + ++ `DiHydrogen `_ is going to + become the linear algebra interface; currently, it can be used to + manage metaprogramming and some utilities. + +The following third-party packages are **optional**. + ++ `CUDA `_. The development + team currently uses CUDA version 9.2. Building with CUDA support + requires that Hydrogen has been built with CUDA support (see below). + ++ `cuDNN `_ is required if + building LBANN with CUDA support. It is freely available as a binary + distribution from NVIDIA. + ++ `HWLOC `_. HWLOC enables + LBANN to make certain optimizations based on the hardware + topology. Its use is strongly recommended. + ++ NVTX. LBANN supports some improved annotations for NVPROF using + NVTX. NVTX is provided as part of the CUDA toolkit. + ++ VTune. LBANN supports some improved annotations for VTune. + + + -------------------- LBANN CMake options -------------------- @@ -37,6 +118,10 @@ The following options are exposed in the CMake build system. + :code:`LBANN_WITH_CONDUIT` (Default: :code:`OFF`): Build with support for CONDUIT. ++ :code:`LBANN_WITH_DIHYDROGEN` (Default: :code:`OFF`): Build with + DiHydrogen support. This will replace temporary implementations in + LBANN with permanent implementations from DiHydrogen. + + :code:`LBANN_WITH_NVPROF` (Default: :code:`OFF`): Build with extra annotations for NVPROF. + :code:`LBANN_WITH_TOPO_AWARE` (Default: :code:`ON`): Use HWLOC for topology-aware choices. @@ -106,6 +191,12 @@ The latter option is recommended. file. Must set :code:`LBANN_WITH_CONDUIT=ON` to enable CONDUIT support. ++ :code:`DIHYDROGEN_DIR` or :code:`H2_DIR`: The + path to *either* the DiHydrogen installation prefix *or* the + :code:`DiHydrogenConfig.cmake` file. Alternatively, + :code:`DiHydrogen_DIR` can be set to the path of the + :code:`DiHydrogenConfig.cmake` file. + + :code:`HDF5_DIR`: The path to *either* the HDF5 installation prefix *or* the :code:`hdf5_config.cmake` file. There is a known issue with CONDUIT that it may link to HDF5 but not properly export that @@ -147,6 +238,28 @@ documentation of the packages that are causing the issues as they may require additional CMake/environment flags to be set before properly resolving. +------------------------------ +Building JAG utilities +------------------------------ +The JAG utility executables are not part of the `all` target. In order +to use or install them, they must be built using the `jag-utils` +target. In order to install them, this must be done before installing. + +.. code-block:: bash + + # Configure LBANN + cmake /path/to/lbann + + # Build main LBANN library and front-ends + cmake --build . + + # If JAG utilities are required, build them + cmake --build . --target jag-utils + + # Install all (built) targets + cmake --build . 
--target install + + ------------------------------ Example CMake invocation ------------------------------ diff --git a/docs/building_lbann.rst b/docs/building_lbann.rst index b374170bb07..c3053300fd4 100644 --- a/docs/building_lbann.rst +++ b/docs/building_lbann.rst @@ -10,83 +10,7 @@ Download -------------------- LBANN source code can be obtained from the `Github -repo `_. - --------------------- -Dependencies --------------------- - -The following packages and tools are required to build LBANN. All -packages listed below may be installed using `Spack -`_. See :ref:`below -` for more details on using Spack to build a -complete LBANN environment. - -The following basic tools are **required**. - -+ A C++11-compliant compiler. - -+ OpenMP, version 3.0 or newer. - -+ An MPI-3.0 implementation. - -+ `CEREAL `_ is used to handle - complex serialization tasks. - -+ `CMake `_, version 3.9 or newer. - -The following LLNL-maintained packages are **required**. - -+ `Hydrogen `_ is a fork of the - `Elemental `_ distributed - dense linear-algebra library and it may be installed via - `Spack `_ using the package name - "hydrogen". If CUDA support is enabled in Hydrogen, LBANN will - inherit this support. - -The following third-party packages are **required**. - -+ `CNPY `_ is used to ingest data - in NumPy format. In principle this should be optional, but at time - of writing, LBANN will not build without it. - -+ `OpenCV `_ is used to preprocess - image data. For performance reasons, it is recommend to build OpenCV - with `JPEG-turbo `_ - for JPEG format support. - -+ `ProtoBuf `_ is used to - express models in a portable format. - -The following LLNL-maintained packages are **optional**. - -+ `Aluminum `_ is a - communication library optimized for machine learning and interaction - with GPUs. We cannot recommend its use strongly enough. It can be - built using `Spack `_. - -+ `CONDUIT `_ is used to ingest - structured data produced by scientific simulations. - -The following third-party packages are **optional**. - -+ `CUDA `_. The development - team currently uses CUDA version 9.2. Building with CUDA support - requires that Hydrogen has been built with CUDA support (see below). - -+ `cuDNN `_ is required if - building LBANN with CUDA support. It is freely available as a binary - distribution from NVIDIA. - -+ `HWLOC `_. HWLOC enables - LBANN to make certain optimizations based on the hardware - topology. Its use is strongly recommended. - -+ NVTX. LBANN supports some improved annotations for NVPROF using - NVTX. NVTX is provided as part of the CUDA toolkit. - -+ VTune. LBANN supports some improved annotations for VTune. - +repository `_. .. _building-with-spack: @@ -94,8 +18,14 @@ The following third-party packages are **optional**. Building with `Spack `_ ------------------------------------------------------------ +.. note:: Users attempting to install LBANN on a Mac OSX machine may + need to do :ref:`additional setup ` before + continuing. In particular, installing LBANN requires a + different compiler than the default OSX command line tools + and an MPI library. + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Setup Spack and local base tools +Setup Spack ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1. Download and install `Spack `_. @@ -104,201 +34,162 @@ Setup Spack and local base tools .. code-block:: bash - . ${SPACK_ROOT}/share/spack/setup-env.sh - - -2. Setup your compiler and external software environment. For example, - on LLNL\'s LC machines, one might load the following modules: - - .. 
code-block:: bash - - ml gcc/7.3.0 mvapich2/2.3 cuda/10.0.130 # Pascal - - or + source ${SPACK_ROOT}/share/spack/setup-env.sh - .. code-block:: bash - - ml gcc/7.3.1 cuda/9.2.148 spectrum-mpi/rolling-release # Lassen / Sierra +2. LBANN will use `Spack environments + `_ to + specify and manage both compilers and versions of dependent + libraries. Go to the install instructions for :ref:`users + ` or :ref:`developers + `. - + Note to unload unwanted modules you can execute :bash:`ml` with - package names prepended with a dash, e.g.: :bash:`ml -intel`. To - unload all currently loaded modules, use :bash:`ml purge`. +.. note:: Optionally, setup your Spack environment to take advantage + of locally installed tools. Unless your Spack environment + is explicitly told about tools such as CMake, Python, MPI, + etc., it will install everything that LBANN and all of its + dependencies require. This can take quite a long time but + only has to be done once for a given spack repository. Once + all of the standard tools are installed, rebuilding LBANN + with Spack is quite fast. -3. Optionally, setup your spack environment to take advantages of - locally installed tools. Note that unless your spack environment - is explicitly told about tools such as cmake, python, mpi, etc. it - will install everything that LBANN and all of its dependencies - require. This can take quite a long time, but only has to be done - once for a given spack repository. Once all of the standard tools - are installed, rebuilding LBANN with spack is quite fast. + Advice on setting up paths to external installations is + beyond the scope of this document but is covered in the + `Spack Documentation + `_. - + Advice on setting up paths to external installations is beyond - the scope of this document, but is covered in the `Spack - Documentation `_. +.. _install_lbann_as_user: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Building & Installing LBANN as a user ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. warning:: This section is still under development and being - tested. It contains known issues. This warning will be - removed when it is believed to be generally usable. - With Spack setup and installed into your path, it can be used to install the LBANN executables. This approach is appropriate for users -that want to train new or existing models using the python front-end. +that want to train new or existing models using the Python front-end. .. note:: If your model requires custom layers or data readers, you may need to install LBANN as a developer, which would allow you to modify and recompile the source code. -Here are three easy ways to install LBANN: +Users comfortable with Spack and `its idioms for installing packages +`_ +or those who already have `customizations to their Spack ecosystem +`_ in place +may simply use -- Using the Spack environment method, (e.g., for an x86_64 LLNL LC - system with GPU support): +.. code-block:: bash - .. note:: This method provides a consistent set of dependencies during - installation. + spack install lbann - .. code-block:: bash +In this case, it is not even necessary to clone the LBANN repository +from Github; Spack will handle this in its installation. 
- cd /spack_environments/users/llnl_lc/_gpu/ # where = x86_64 | ppc64le - spack install - ml load lbann +For users that are new to spack, LBANN provides a script that will do +some basic configuration and then install LBANN using the Spack +environment method: -- Building with the latest released versions and GPU support (use the - user's defaults for specifying the compiler, MPI library, etc.): +.. code-block:: bash - .. code-block:: bash + /scripts/install_lbann.sh -e lbann + spack env activate -p lbann - spack install lbann +gpu +nccl - ml load lbann +Options exist in the script to disable the GPUs and change the +name of the Spack environment. These can be viewed by passing the +:code:`-h` option to the script. -- Building with the head of develop branch for lbann, hydrogen and - aluminum with GPU support (use the user's defaults for specifying - the compiler, MPI library, etc.): +.. note:: Currently this script will clone a second LBANN repository + that Spack will use to build the LBANN library and + executables. We are working on simplifying this further. - .. code-block:: bash - spack install lbann@develop +gpu +nccl ^hydrogen@develop ^aluminum@master - ml load lbann - -There are numerous options for all of these packages. These options -can be viewed via commands such as :bash:`spack info lbann`. To -specify the compiler, one can add options such as :code:`%gcc@7.3.0`. -For further information about specifying dependencies, such as the MPI -library, please consult `the Spack documentation -`_. +.. _build_lbann_from_source: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Building & Installing LBANN as a developer ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Developers of LBANN will often need to interact with the source code -and/or advanced configuration options for Aluminum, Hydrogen, and -LBANN while the other dependencies remain constant. The Spack -installation instructions below set up a Spack environment with the -remaining dependencies, requiring the developer to build Aluminum, -Hydrogen, and LBANN separately, by whatever means they choose. +and/or set advanced configuration options for Aluminum, Hydrogen, and +LBANN while the other dependencies remain constant. The installation +instructions below provide a script that will setup a Spack +environment with the remaining dependencies, and then invoke the LBANN +CMake infrastructure to build LBANN from the local source. The +provided script will build with a standard compiler for a given +platform and the nominal options in the CMake build environment. +Expert developers should refer to :ref:`the "Superbuild" documentation +` for a list and descriptions of all +CMake flags known to LBANN's "Superbuild" build system. + +1. Install all of the external packages via Spack (Aluminum, + Hydrogen, etc). + + Install packages into a Spack environment. This is only done when + initially installing or upgrading the dependencies. LBANN provides + a script to install the basic dependencies in their default + configurations and it can be found at: -1. Establish a Spack environment and install software dependencies. - Note that there are four environments to pick from along two axes: + .. code-block:: bash - .. note:: This spack environment has to be setup once each time - you create a new build directory. + /scripts/install_lbann.sh -d - 1. developers or users - 2. x86_64 and ppc64le + Note that the named environment can be controlled via the + :code:`-e` flag. A full list of options can be viewed with the + :code:`-h` flag. 
- For example if you are a developer and want to build the inside of - the git repo use the following instructions: +2. Setup the LBANN CMake environment using the Spack environment for + the dependencies. .. code-block:: bash - export LBANN_HOME=/path/to/lbann/git/repo - export LBANN_BUILD_DIR=/path/to/a/build/directory - export LBANN_INSTALL_DIR=/path/to/an/install/directory - cd ${LBANN_BUILD_DIR} - spack env create -d . ${LBANN_HOME}/spack_environments/developer_release__cuda_spack.yaml # where = x86_64 | ppc64le - cp ${LBANN_HOME}/spack_environments/std_versions_and_variants_llnl_lc_cz.yaml . - cp ${LBANN_HOME}/spack_environments/externals__llnl_lc_cz.yaml . # where = x86_64 | ppc64le - spack install - spack env loads # Spack creates a file named loads that has all of the correct modules - source loads - unset LIBRARY_PATH - - - + Note that the environments provided here have a set of external - packages and compilers that are installed on an LLNL LC CZ - system. Please update these for your system environment. - Alternatively, you can create baseline versions of the - user-level Spack configuration files and remove the externals - and compilers from the :code:`spack.yaml` file. More details are - provided :ref:`here `. - - + Note that the initial build of all of the standard packages in Spack - will take a while. - - + Note that the Spack module files set the :bash:`LIBRARY_PATH` environment - variable. This behavior allows autotools-based builds to pickup the - correct libraries but interferes with the way that CMake sets up - RPATHs. To correctly establish the RPATH, please unset the variable - as noted above, or you can explicitly pass the RPATH fields to CMake - using a command such as: - - .. code-block:: bash - - cmake -DCMAKE_INSTALL_RPATH=$(sed 's/:/;/g' <<< "${LIBRARY_PATH}") \ - -DCMAKE_BUILD_RPATH=$(sed 's/:/;/g' <<< "${LIBRARY_PATH}") \ - ... - -2. Build LBANN locally from source and build Hydrogen and Aluminum - using the superbuild. See :ref:`here ` - for a list and descriptions of all CMake flags known to LBANN's - "Superbuild" build system. A representative CMake command line - that expects :bash:`LBANN_HOME`, :bash:`LBANN_BUILD_DIR`, - :bash:`LBANN_INSTALL_DIR` environment variables might be: + /scripts/build_lbann_from_source.sh + + + Options exist in the script to disable the GPUs, set a build and + install prefix, separately set the build and install + directories, or use a different spack environment. These options + can be viewed using the :code:`-h` flag. + + The environments provided by this script have a set of external + packages and compilers that are installed on an LLNL LC CZ, NERSC, + or LLNL-configured OS X system. If you are not on one of these + systems, please update the externals and compilers for your system + environment. Alternatively, you can create baseline versions of + the user-level Spack configuration files and remove the externals + and compilers from the :code:`spack.yaml` file. More details are + provided :ref:`here `. + + .. warning:: Depending on the completeness of the externals + specification, the initial build of all of the + standard packages in Spack can take a long time. + +3. Once the installation has completed, you can load the module file + for LBANN with the following command .. 
code-block:: console - cd ${LBANN_BUILD_DIR} - cmake \ - -G Ninja \ - -D CMAKE_BUILD_TYPE:STRING=Release \ - -D CMAKE_INSTALL_PREFIX:PATH=${LBANN_INSTALL_DIR} \ - \ - -D LBANN_SB_BUILD_ALUMINUM=ON \ - -D ALUMINUM_ENABLE_MPI_CUDA=OFF \ - -D ALUMINUM_ENABLE_NCCL=ON \ - \ - -D LBANN_SB_BUILD_HYDROGEN=ON \ - -D Hydrogen_ENABLE_ALUMINUM=ON \ - -D Hydrogen_ENABLE_CUB=ON \ - -D Hydrogen_ENABLE_CUDA=ON \ - \ - -D LBANN_SB_BUILD_LBANN=ON \ - -D LBANN_DATATYPE:STRING=float \ - -D LBANN_SEQUENTIAL_INITIALIZATION:BOOL=OFF \ - -D LBANN_WITH_ALUMINUM:BOOL=ON \ - -D LBANN_WITH_CONDUIT:BOOL=ON \ - -D LBANN_WITH_CUDA:BOOL=ON \ - -D LBANN_WITH_CUDNN:BOOL=ON \ - -D LBANN_WITH_NCCL:BOOL=ON \ - -D LBANN_WITH_NVPROF:BOOL=ON \ - -D LBANN_WITH_SOFTMAX_CUDA:BOOL=ON \ - -D LBANN_WITH_TOPO_AWARE:BOOL=ON \ - -D LBANN_WITH_TBINF=OFF \ - -D LBANN_WITH_VTUNE:BOOL=OFF \ - ${LBANN_HOME}/superbuild - - ninja - ml use ${LBANN_INSTALL_DIR}/etc/modulefiles/ + ml use /etc/modulefiles ml load lbann-0.99.0 -The complete documentation for building LBANN directly with CMake can -be found :ref:`here `. + For advanced users, :ref:`the LBANN superbuild system + ` provides additional control over + the dependencies, especially Aluminum and Hydrogen. + +4. After the initial setup of the LBANN CMake environment, you can + rebuild by activating the Spack environment and then re-running + ninja. + + .. code-block:: console + + spack env activate -p + cd /lbann/build + unset CPATH # Can cause bad include resolution + ninja + +For more control over the LBANN build, please see :ref:`the complete +documentation for building LBANN directly with CMake +`. ------------------------------ Advanced build methods diff --git a/docs/conf.py b/docs/conf.py index d1763486df5..a921a29a037 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -18,9 +18,13 @@ import subprocess, os, runpy -rebuild_doxygen = not os.path.isdir("doxy_out/xml") +rebuild_doxygen = not os.path.isdir("doxy_out/xml") or not os.path.isdir("_static/doxygen/html") +if not os.path.isdir("_static"): + os.makedirs("_static") + if rebuild_doxygen: + os.makedirs("doxy_out/xml") subprocess.call('doxygen SourceTreeDoxyfile', shell=True) #exec(open("./BuildRSTDocs.py").read()) @@ -75,6 +79,7 @@ # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' +html_static_path = ['_static'] # -- Options for HTML output ------------------------------------------------- diff --git a/docs/continuous_integration.rst b/docs/continuous_integration.rst new file mode 100644 index 00000000000..ddced95ebff --- /dev/null +++ b/docs/continuous_integration.rst @@ -0,0 +1,221 @@ +.. role:: bash(code) + :language: bash + +.. role:: python(code) + :language: python + +LBANN CI +==================== + +Bamboo is the continuous integration (CI) framework we use. +A Bamboo plan consists of stages (which run sequentially), +which consist of jobs (which run in parallel), +which consist of tasks (which run sequentially). + +The LBANN build project has many plans. +Two plans run off of `LLNL/lbann/develop `_ +- Nightly Develop and Weekly Develop. +Nightly Develop runs every night (except Saturday) at midnight. +Weekly Develop runs every Saturday at midnight. +The other plans in the build project are for each individual LBANN developer's +fork of LBANN. + +All plans run off the latest *pushed* commits to the repository. +That means if you have local commits that you have not pushed to your fork, +these commits will *not* be tested by Bamboo. 
+If you have pushed commits to your fork but have not merged your branch into +the main repository's "develop", +your commits will be tested on your individual plan, +but not on Nightly Develop or Weekly Develop. + +Plan Configuration +---------------------------------------- +Each plan is identical (except Weekly Develop, which will be explained below). +The plans consist of a single stage "Tests". +The stage consists of two jobs - "x86_cpu" (Catalyst), and "x86_gpu" (Pascal). +Each of these jobs can run in parallel. +They consist of an identical list of tasks: + +1. Checkout Default Repository (checkout the repository) + +2. Run :bash:`./allocate_and_run.sh`; + Weekly Develop adds the :bash:`--weekly` option. + This script allocates nodes and then runs "run.sh" which does the following: + + a. Remove Generated Files (each build creates a large number of files. + We may look at these files between builds, + so we cannot delete them at the end of a build. + So, instead we delete them before doing any real work in the next build. + This also ensures the generated files came from the latest build and not + a previous build). + + b. Compiler Tests (run tests in "bamboo/compiler_tests") + + c. Integration Tests (run tests in "bamboo/integration_tests") + + d. Unit Tests (run tests in "bamboo/unit_tests") + +3. JUnit Parser (this allows Bamboo to render test results in a nice UI) + + +The tests in Task 2 run +:bash:`$PYTHON -m pytest -s -vv --durations=0 [--weekly] --junitxml=results.xml`, +which will run all the pytests in the job's associated directory. +Note that :bash:`$PYTHON` refers to the Python build to use. +Also note that only Weekly Develop adds the :bash:`--weekly` option. +Many (mostly longer-running) tests are set to not run unless this option is on. +Weekly Develop runs a superset of the tests that Nightly Develop runs. + +Directory Structure +---------------------------------------- + +"bamboo/compiler_tests", "bamboo/integration_tests", "bamboo/unit_tests" each +have a "conftest.py" that pytest requires. +They also contain one or more python files. +Each of these files have a number of tests to run. + +Writing Your Own Tests +---------------------------------------- + +A side effect of our Bamboo setup is that tests must be written using pytest. +Test files must begin with :bash:`test_` to be recognized by pytest. +Individual test methods must also begin with :python:`test_`. +Test methods should use the :python:`assert` keyword or raise an +:python:`AssertionError`. +A test will only fail if the assertion turns out to be false. +Not putting an assertion will automatically cause the test to pass. + +How then to test non-Python code? +You can just wrap your test with Python. +A test can be as simple as asserting the output code of a shell command is 0. +The output code of a command can be found using Python's :python:`os.system()`. + +Running Tests On Your Individual Plan +---------------------------------------- + +Unlike Nightly Develop, the individual plans are triggered to run by polling +your fork for commits. +They do not run nightly. +If you push new commits to your fork, a new build should start automatically. +You can also manually start a build by navigating to your individual plan and +clicking Run > Run plan +(this will say "Run branch" if you have plan branches set up). +Once again, keep in mind that the tests will run off what has been pushed to +your GitHub fork of LBANN and not your local copy of the LBANN repository. 
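As a concrete sketch of the conventions described under "Writing Your Own Tests" above: the file name, test names, and echoed command below are hypothetical and are not part of the LBANN test suite; they only illustrate the :bash:`test_` naming rule, the use of :python:`assert`, and wrapping a shell command with :python:`os.system()`.

.. code-block:: python

   # test_example.py -- minimal, hypothetical pytest file.
   import os


   def test_trivial_assertion():
       # A test passes unless an assertion fails (or an exception is raised).
       assert 1 + 1 == 2


   def test_wrapped_shell_command():
       # Non-Python code can be tested by wrapping it in Python and
       # asserting that the shell command's exit status is 0.
       return_code = os.system('echo "stand-in for invoking an LBANN executable"')
       assert return_code == 0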
+ +Plan branches allow you to test multiple branches simultaneously instead +of simply testing "/develop". +You can create plan branches by navigating to your individual plan, +clicking Actions > Configure plan > Branches > Create plan branch. + +Navigating Bamboo +---------------------------------------- + +From the `LBANN Project Summary `_, +click on a plan. +From there, click on a build (builds are listed under "Recent History" and can +also be accessed from the pass/fail marks in the top right, +to the left of the "Run" button). +This will bring you to a certain build's page. +The most relevant tabs are "Tests" and "Logs". +It is recommended to look at failures first in the "Tests" tab, +as the build logs can be difficult to parse through. +The build's "Tests" tab shows "New test failures", "Existing test failures", +"Fixed tests", and "Skipped Tests". + +From the build's page, you can also click on individual jobs, +which have the same tabs. +The "Tests" tabs of the individual jobs have two sub-tabs, +"Failed tests" and "Successful tests". +They do not display skipped tests. +The Bamboo agent that ran the job can be found by looking at the "Agent" field +under the "Job Summary" tab. +Alternatively, you can determine the agent from one of the first lines in the +build logs: +"Build working directory is /usr/workspace/wsb/lbannusr/bamboo//xml-data/build-dir/". + + +Bamboo Agent Properties +---------------------------------------- + +Bamboo agent properties are used to specify requirements for each job. + ++--------------------------------+-------------+--------------+----------+------------------+------------------------+ +| Agents (jobs) | agent_owner | architecture | cluster | gpu_architecture | sys_type | ++================================+=============+==============+==========+==================+========================+ +| Catalyst Agents (x86_cpu) | lbannusr | x86_64 | catalyst | none | toss_3_x86_64_ib | ++--------------------------------+-------------+--------------+----------+------------------+------------------------+ +| Corona Agents (x86_cpu_corona) | lbannusr | x86_64 | corona | none | toss_3_x86_64_ib | ++--------------------------------+-------------+--------------+----------+------------------+------------------------+ +| Lassen Agents (ppc64le_gpu) | lbannusr | ppc64le | lassen | volta | blueos_3_ppc64le_ib_p9 | ++--------------------------------+-------------+--------------+----------+------------------+------------------------+ +| Pascal Agents (x86_gpu_pascal) | lbannusr | x86_64 | pascal | pascal | chaos_6_x86_64_ib | ++--------------------------------+-------------+--------------+----------+------------------+------------------------+ +| Ray Agents (ppc64le_gpu) | lbannusr | ppc64le | ray | pascal | blueos_3_ppc64le_ib | ++--------------------------------+-------------+--------------+----------+------------------+------------------------+ + +Currently, "agent_owner", "architecture", and "gpu_architecture" are used to +determine agents to run a job. + +Running Tests From The Command Line +---------------------------------------- + +Navigate to "bamboo/compiler_tests", "bamboo/integration_tests", +or "bamboo/unit_tests". + +To run all the tests in a subdirectory: :bash:`python -m pytest -s --weekly`. +Note that running all tests can take a substantial amount of time. + +To run the tests that Nightly Develop or the individual plans run in a +subdirectory: :bash:`python -m pytest -s`. + +To run a specific test file: :bash:`python -m pytest -s .py`. 
+ +To run a specific test: +:bash:`python -m pytest -s .py -k ''`. + +Most integration and unit tests allow for running a test with a different +executable. +The convention is to have a similarly structured test replacing +:python:`_` with :python:`_exe`. +These tests are set to be skipped in Bamboo, but can be run locally. +There should be a line above the test that gives the command to run the test +locally, likely in the following form: +:bash:`python -m pytest -s .py -k '' --exe=`. + +If you have an executable, you can run the :python:`_exe` tests with +:bash:`local_test.sh`. Use :bash:`local_test.cmd` as a template for writing +a batch script. You can run only integration tests, only unit tests, or both. + +Helpful Files +---------------------------------------- + +First, run :bash:`sudo lbannusr`. + +To look at output and error from previous builds: +:bash:`cd /usr/workspace/wsb/lbannusr/bamboo//xml-data/build-dir//bamboo//`. +If the test uses the Python Front-End, use: +:bash:`cd /usr/workspace/wsb/lbannusr/bamboo//xml-data/build-dir//bamboo//experiments/`. +(Note that these files can also be read by clicking on the "Artifacts" tab on +the Bamboo build). + +To look at archived results from previous builds: +:bash:`cd /usr/workspace/wsb/lbannusr/archives/` + +To look at Bamboo agent properties: +:bash:`cat /usr/global/tools/bamboo/agents/lbannusr//bin/bamboo-capabilities.properties` + +You can copy these files over to your own machine as follows: + +- :bash:`sudo lbannusr` + +- :bash:`give ` + +- :bash:`exit` - to go back to your own LC account, not lbannusr's. + +- :bash:`take lbannusr` - now the file exists on your LC account, + but not yet on your own machine. + +From your own machine, not a ssh terminal: + +- :bash:`scp @.llnl.gov: .` diff --git a/docs/documentation_building.rst b/docs/documentation_building.rst new file mode 100644 index 00000000000..7800f699d44 --- /dev/null +++ b/docs/documentation_building.rst @@ -0,0 +1,56 @@ +.. role:: bash(code) + :language: bash + +LBANN Documentation Building +============================ + +.. warning:: Some of the directions in this section are Mac-specific. + +Adding Documentation Outside Code +---------------------------------- + +1. Create a file such as "new_docs.rst" in "lbann/docs". + +2. Add "new_docs" (no ".rst") to the appropriate documentation block in + "lbann/docs/index.rst". + +3. Look at the other ".rst" files in "lbann/docs" to see how to get + certain formatting. + +4. When you want to see how your code looks, you have a couple options: + + a. Push your docs to your fork/branch on GitHub and look at how + the text renders. This is a very simplified look compared to + Read-the-Docs. + + b. From "lbann/docs" run :bash:`make html` and then + :bash:`open -a _build/html/index.html`. + This is exactly how the docs will look. + +5. Merge your code into "lbann/develop" and then have someone with + correct permissions on Read-the-Docs update the + `official docs `_. + +Making The Build Work +---------------------------------- + +In order to make :bash:`make html` work, you may need to do a few steps: + +1. Run :bash:`pip3 install sphinx breathe sphinx-rtd-theme`. + +2. Download Doxygen by going to the + `Doxygen downloads page `_, + downloading "Doxygen-1.8.15.dmg", and + dragging the app to the "Applications" folder. + +3. Determine the directory Doxygen is in by running `which Doxygen`. 
+ If nothing is returned, see if `doxygen` is in + "/Applications/Doxygen.app/Contents/Resources" or + "/Applications/Doxygen.app/Contents/MacOS". + +4. Add Doxygen to your path with + :bash:`PATH=":${PATH}"`. + You may want to add this to your "~/.bash_profile" so your :bash:`PATH` is + always correct. Run :bash:`source ~/.bash_profile` to apply the change. + +5. Try running :bash:`make html` again. diff --git a/docs/index.rst b/docs/index.rst index b07bfae9bb9..e4603712d64 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -11,7 +11,7 @@ LBANN provides model-parallel acceleration through domain decomposition to optimize for strong scaling of network training. It also allows for composition of model-parallelism with both data parallelism and ensemble training methods for training large neural -networks with massive amounts of data. LBANN is able to advantage of +networks with massive amounts of data. LBANN is able to take advantage of tightly-coupled accelerators, low-latency high-bandwidth networking, and high-bandwidth parallel file systems. @@ -22,6 +22,8 @@ recurrent neural networks via back propagation through time (BPTT) training, transfer learning, and multi-model and ensemble training methods. +Users are advised to view `the Doxygen API Documentation +<_static/doxygen/html/index.html>`_ for API information. .. toctree:: :maxdepth: 2 @@ -40,8 +42,11 @@ methods. :maxdepth: 2 :caption: Developer Documentation + lbann lbann/lbann style_guide + continuous_integration + documentation_building ================== diff --git a/docs/lbann.rst b/docs/lbann.rst new file mode 100644 index 00000000000..30438e05af2 --- /dev/null +++ b/docs/lbann.rst @@ -0,0 +1,85 @@ +************************************************** +LBANN Software Architecture and Class Overview +************************************************** + +Trainers (i.e. execution environment) +****************************************** + +A trainer is a collection of compute resources and defines an explicit +communication domain. It provides the execution environment for both the training +and inference of a model. Once constructed, a trainer owns an +LBANN comm object that defines both intra- and inter-trainer +communication domains. Additionally, a trainer will contain an I/O +thread pool that is used to fetch and pre-process data that will be +provided to the trainer's models. + +A trainer owns: + +* comm object +* I/O thread pool +* One or more models +* Execution context for each model +* In the future, it will also contain the data readers. + +Execution Context +****************************************** + +When a model is attached to a trainer, the execution context of the +training algorithm is stored in an execution_context class (or +sub-class) per execution mode. Thus, there is one execution context +per model and mode that contains all of the state with respect to the +training algorithm being applied to the model. + +For example, it tracks the current: + +* step +* execution mode +* epoch +* and a pointer back to the trainer + +Termination Criteria (Pending) +****************************************** + +(Pending feature) When a model is going to be trained or evaluated, +the termination criteria are specified in an object that is passed into +the training algorithm. (Note that this feature is under development; +currently the termination criteria are dictated by when the training +algorithm executes a fixed number of epochs.)
+ +Training Algorithms +****************************************** + +The training algorithm defines the optimization that is to be +applied to the model(s) being trained. Additionally, it can +specify how to evaluate the model. + +Model +****************************************** + +A model is a collection of operations with dependencies encoded as a +directed acyclic graph (DAG). In a typical formulation, these +operations form a neural network that will be either trained or used +for inference. Each operation in the model is an instance of the +layer class. The model is then a collection of layers that perform +transformations and mathematical operations on data that is passed +between layers. The model's DAG is executed in topological order. +Inside of some layer types are weight matrices that define a trained +model. (Note that LBANN should be able to support non-DNN models, but +this is a subject for future work.) + +Each layer in the graph contains a set of tensors that holds the +inputs, computed outputs, gradients with respect to the outputs, and +gradients with respect to the inputs. Furthermore, for each layer in +the graph with learnable parameters, there is an associated weight +tensor that forms the learned weights of the model. The model also +owns the objective function, since that is integrally tied into the +model's computational graph. Additionally, the model owns the +default optimizer that is used to provide a standard optimizer for the +model's weight tensors. Once each weight tensor is instantiated, it +will own an instance of an optimizer. + +The model also owns the max_mini_batch_size that is supported by the +model. This is because it changes the size and shape of +input, output, and gradient tensors. Additionally, the model owns a +field that controls whether background I/O is allowed for this model and +its associated data reader. diff --git a/docs/publications.rst b/docs/publications.rst index c2bb25449cd..aa22a58b0e6 100644 --- a/docs/publications.rst +++ b/docs/publications.rst @@ -3,10 +3,31 @@ Papers, Presentations, and Posters Publications about or related to using LBANN: ++ Nikoli Dryden, Naoya Maruyama, Tom Benson, Tim Moon, Marc Snir, + Brian Van Essen. "Channel and Filter Parallelism for Large-Scale + CNN Training", to appear in *International Conference for High + Performance Computing, Networking, Storage and Analysis (SC'19)*, 2019. + ++ Sam Ade Jacobs, Brian Van Essen, Tim Moon, Jae Seung Yeom, David + Hysom, Brian Spears, Rushil Anirudh, Jayaraman Thiagaranjan, Shusen + Liu, Jim Gaffney, Peer-Timo Bremer, Tom Benson, Peter Robinson, and + Luc Peterson, "Parallelizing Training of Deep Generative Models on + Massive Scientific Datasets", to appear in *Proceedings of Cluster + Computing*, 2019 + ++ Shusen Liu, Di Wang, Dan Maljovec, Rushil Anirudh, + Jayaraman J. Thiagarajan, Sam Ade Jacobs, Brian C. Van Essen, David + Hysom, Jae-Seung Yeom, Jim Gaffney, Luc Peterson, Peter B. Robinson, + Harsh Bhatia, Valerio Pascucci, Brian K. Spears, Peer-Timo Bremer. + `"Scalable Topological Data Analysis and Visualization for + Evaluating Data-Driven Models in Scientific Applications" + `_, to appear in *IEEE Transactions + on Visualization and Computer Graphics*, 2019 + + Nikoli Dryden, Naoya Maruyama, Tom Benson, Tim Moon, Marc Snir, Brian Van Essen. `"Improving Strong-Scaling of CNN Training by Exploiting Finer-Grained Parallelism" - `_, to appear in *IEEE + `_, in *Proceedings of IEEE International Parallel & Distributed Processing Symposium*, 2019.
+ `IPDPS'19 `_ diff --git a/docs/running_lbann.rst b/docs/running_lbann.rst index d98e5fa62fe..124d9d28cbf 100644 --- a/docs/running_lbann.rst +++ b/docs/running_lbann.rst @@ -1,96 +1,454 @@ .. role:: bash(code) :language: bash +.. role:: python(code) + :language: python -==================== +============================================================ Running LBANN -==================== +============================================================ -The basic template for running LBANN is +------------------------------------------------ +Anatomy of an LBANN experiment +------------------------------------------------ + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Parallelism +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +LBANN is run under `MPI +`_, i.e. with +multiple processes that communicate with message passing. This set of +processes is subdivided into one or more "trainers." Conceptually, a +trainer owns parallel objects, like models and data readers, and +generally operates independently of other trainers. + +Comments: + ++ LBANN targets HPC systems with homogeneous compute nodes and GPU + accelerators, which motivates some simplifying assumptions: + + - All trainers have the same number of processes. + + - If GPU acceleration is enabled, each MPI process corresponds to + one GPU. + ++ Processors are block assigned to trainers based on MPI rank. + + - In order to minimize the cost of intra-trainer communication, make + sure to map processes to the hardware and network + topologies. Typically, this just means choosing a sensible number + of processes per trainer, e.g. a multiple of the number of GPUs + per compute node. + ++ Generally, increasing the number of processes per trainer will + accelerate computation but require more intra-trainer + communication. There is typically a sweet spot where run time is + minimized, but it is complicated and sensitive to the nature of the + computation, the mini-batch size, the data partitioning scheme, + hardware and network properties, the communication algorithms, and + myriad other factors. + + - Rule-of-thumb: Configure experiments so that the bulk of run time + is taken by compute-bound operations (e.g. convolution or matrix + multiplication) and so that each process has enough work to + achieve a large fraction of peak performance (e.g. by making the + mini-batch size sufficiently large). + ++ Most HPC systems are managed with job schedulers like `Slurm + `_. Typically, users can + not immediately access compute nodes but must request them from + login nodes. The login nodes can be accessed directly (e.g. via + :bash:`ssh`), but users are discouraged from doing heavy computation + on them. + + - For debugging and quick testing, it's convenient to request an + interactive session (:bash:`salloc` or :bash:`sxterm` with Slurm). + + - If you need to run multiple experiments or if experiments are not + time-sensitive, it's best to submit a batch job (:bash:`sbatch` + with Slurm). + + - When running an experiment, make sure you know what scheduler + account to charge (used by the scheduler for billing and + determining priority) and what scheduler partition to run on + (compute nodes on a system are typically subdivided into multiple + groups, e.g. for batch jobs and for debugging). + + + With :bash:`salloc`, specify the partition using the + :bash:`--partition` command-line argument and specify the + account using :bash:`--account`. + + - Familiarize yourself with the rules for the systems you use + (e.g. 
the expected work for each partition, time limits, job + submission limits) and be a good neighbor. + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Model components +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ++ Layer: A tensor operation, arranged within a directed acyclic graph. + + - During evaluation ("forward prop"), a layer receives input tensors + from its parents and sends an output tensor to each child. + + - During automatic differentiation ("backprop"), a layer receives + "input error signals" (objective function gradients w.r.t. output + tensors) from its children and sends "output error signals" + (objective function gradients w.r.t. input tensors) to its + parents. If the layer has any associated weight tensors, it will + also compute objective function gradients w.r.t. the weight + tensors. + + - Most layers require a specific number of parents and children, but + LBANN will insert layers into the graph if there is a mismatch and + the intention is obvious. For example, if a layer expects one + child but has multiple, then a split layer (with multiple output + tensors all identical to the input tensor) is inserted. Similarly, + if a layer has fewer children than expected, dummy layers will be + inserted. However, this does not work if there is any + ambiguity. In such cases (common with input and slice layers), it + is recommended to manually insert identity layers so that the + parent/child relationships are absolutely unambiguous. + + - See `lbann/src/proto/layers.proto + `_ + for a full list of supported layers. + ++ Weights: A tensor consisting of trainable parameters, typically + associated with one or more layers. A weight tensor owns an + initializer to initially populate its values and an optimizer to + find values that minimize the objective function. + + - A weight tensor without a specified initializer will use a zero + initializer. + + - A weight tensor without a specified optimizer will use the model's + default optimizer. + + - If a layer requires weight tensors and none are specified, it will + create the needed weight tensors. The layer will pick sensible + initializers and optimizers for the weight tensors. For example, a + convolution layer will initialize its kernel tensor with He normal + initialization and with the model's default optimizer. + + - The dimensions of a weight tensor is determined by their + associated layers. The user can not set it directly. + ++ Objective function: Mathematical expression that the optimizers will + attempt to minimize. It is made up of multiple terms that are added + together (possibly with scaling factors). + + - An objective function term can get its value from a scalar-valued + layer, i.e. a layer with an output tensor with one entry. + ++ Metric: Mathematical expression that will be reported to the + user. This typically does not affect training, but is helpful for + evaluating the progress of training. A canonical example for + classification problems is classification accuracy. + ++ Callback: Function that is performed at various points during an + experiment. Callbacks are helpful for reporting, debugging, and + performing advanced training techniques. + + - This is the natural home for experimental training + techniques. + + - A common use-case is to export values with the "dump outputs" + callback so that the user can perform data post-processing or + visualization. + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Data readers +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
warning:: The core infrastructure for data readers is slated for + significant refactoring, so expect major changes in the + future. + +Data readers are responsible for managing a data set and providing +data samples to models. A data set is comprised of independent data +samples, each of which is made up of multiple tensors. For example, a +data sample for a labeled image classification problem consists of an +image tensor and a one-hot label vector. + +.. note:: The data readers are currently hard-coded to assume this + simple classification paradigm. Hacks are needed if your + data does not match it exactly, e.g. if a data sample is + comprised of more than two tensors. The most basic approach + is to flatten all tensors and concatenate them into one + large vector. The model is then responsible for slicing this + vector into the appropriate chunks and resizing the chunks + into the appropriate dimensions. Done correctly, this should + not impose any additional overhead. + +Specifically, data readers and models interact via input layers. Each +model must have exactly one input layer and its output tensors are +populated by a data reader every mini-batch step. This is typically +performed by a background thread pool, so data ingestion will +efficiently overlap with other computation, especially if the data +reader's work is IO-bound or if the computation is largely on GPUs. + +.. note:: An input layer has an output tensor for each data sample + tensor. Since each data sample has two tensors (one for the + data and one for the label), it follows that every input + layer should have two child layers. To make parent/child + relationships unambiguous, we recommend manually creating + identity layers as children of the input layer. + +Note that layers within a model treat the data for a mini-batch as a +single tensor where the leading dimension is the mini-batch size. +Thus, corresponding tensors in all data samples must have the same +dimensions. The data dimensions must be known from the beginning of +the experiment and can not change. However, real data is rarely so +consistent and some preprocessing is typically required. See +`lbann/src/proto/transforms.proto +`_ +for a list of available preprocessing transforms. + +.. warning:: The Python data reader will trigger some process forking + that doesn't interact with InfiniBand all that well by + default. Users may encounter hangs on clusters that use + InfiniBand. To avoid this, ensure that + :bash:`IBV_FORK_SAFE=1` is exported into the environment + when running LBANN. + +------------------------------------------------ +Python frontend +------------------------------------------------ + +LBANN provides a Python frontend with syntax reminiscent of `PyTorch +`_. See `a simple implementation of LeNet +`_. + +Comments: + ++ Under-the-hood, the Python frontend is actually a convenience + wrapper around the Protobuf frontend. The core infrastructure allows + users to configure an experiment and "compiles" it to a Prototext + text file. + ++ The Python interface can only configure and launch experiments. It + is not active during an experiment and it does not allow for any + dynamic control flow. + ++ Only Python 3 is supported. + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Setup +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The :python:`lbann` Python package is installed as part of the LBANN +build process. However, it is necessary to update the +:bash:`PYTHONPATH` environment variable to make sure Python detect +it. 
There are several ways to do this: + ++ If LBANN has been built with the Spack user build process, loading + LBANN will automatically update :bash:`PYTHONPATH`: .. code-block:: bash - \ - lbann \ - --model=model.prototext \ - --optimizer=opt.prototext \ - --reader=data_reader.prototext - -When using GPGPU accelerators, users should be aware that LBANN is -optimized for the case in which one assigns one GPU per MPI -*rank*. This should be borne in mind when choosing the parameters for -the MPI launcher. - -A list of options for LBANN may be found by running :bash:`lbann ---help`. - -.. note:: At time of writing, it is known that some of these are - out-of-date. An - `issue `_ has been - opened to track this. - -.. _using-the-model-zoo: - --------------------- -Using the model zoo --------------------- - -LBANN ships with prototext descriptions of a variety of models, -optimizers and data readers. These may be found in the :code:`model_zoo/` -directory of the source repository or the :code:`share/model_zoo/` directory -of the install directory. - -.. warning:: Some of these prototexts point to specific data locations - on LLNL LC clusters. Users may have to modify such paths - to point to locations on their own systems. This can be - done by modifying the prototext directly or overriding - the options on the command line with, e.g., the - :code:`--data_filedir_train` and - :code:`--data_filedir_test` options. - -The following is an example invocation of LBANN on a machine using -Slurm's :bash:`srun` as an MPI launcher. In the example command, -a machine with 2 GPGPUs per node are available, 4 nodes will be used, -:bash:`${LBANN_EXE}` is the path to the :code:`lbann` executable, and -:bash:`${LBANN_MODEL_ZOO_DIR}` is the path to the :code:`model_zoo/` directory in -either the source tree or the install tree. Note that the options -passed to :bash:`srun` are not likely to be portable to other MPI -launchers. The example will train Alexnet with SGD optimization on the -Imagenet dataset for 5 epochs. + module load lbann + +.. warning:: The above will *not* work if LBANN has been built with + :bash:`scripts/build_lbann_lc.sh` or with the Spack + developer build process. + ++ LBANN includes a modulefile that updates :bash:`PYTHONPATH`: .. code-block:: bash - srun -N4 --ntasks-per-node=2 \ - ${LBANN_EXE} \ - --model=${LBANN_MODEL_ZOO_DIR}/models/alexnet/alexnet.prototext \ - --optimizer=${LBANN_MODEL_ZOO_DIR}/optimizers/opt_sgd.prototext \ - --reader=${LBANN_MODEL_ZOO_DIR}/data_readers/data_reader_imagenet.prototext \ - --num_epochs=5 - ---------------------------------------------- -Using the Python interface for prototext ---------------------------------------------- - -There is a python interface for generating model prototext -files. Example Python scripts may be found in the -:code:`scripts/proto/lbann/models` directory of the source -repository. Running the Python script will generate a prototext that -can be passed to the :code:`--model` option for LBANN. + module use /etc/modulefiles + module load lbann- + ++ Directly manipulate :bash:`PYTHONPATH`: .. code-block:: bash - - python3 alexnet.py alexnet.prototext - \ - lbann --model=alexnet.prototext -where :code:`` are as documented -:ref:`above `, with optimizer and data reader -prototexts coming from the appropriate :code:`model_zoo/` directories. + export PYTHONPATH=/lib/python/site-packages:${PYTHONPATH} + +Note that LBANN depends on the Protobuf Python package, which can be +installed with: + +.. 
code-block:: bash + + pip install protobuf + +If the user does not own the site-packages directory, then it may be +necessary to pass the :bash:`--user` flag to pip. + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Basic usage +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +A typical workflow involves the following steps: + +1. Configuring a :python:`Trainer`. + +2. Configuring LBANN model components (like the graph of + :python:`Layer` s) and creating a :python:`Model`. + + + Classes for model components are automatically generated from the + LBANN Protobuf specifications in `lbann/src/proto + `_. These + files are currently the best source of documentation. Message + fields in the Protobuf specification are optional keyword + arguments for the corresponding Python class constructor. If a + keyword argument is not provided, it is logically zero (e.g. false + for Boolean fields and empty for string fields) + +3. Configuring the default :python:`Optimizer` to be used by the + :python:`Weights` objects. + +4. Loading in a Protobuf text file describing the data reader. + + + The Python frontend currently does not have good support for + specifying data readers. If any data reader properties need to be + set programmatically, the user must do it directly via the + Protobuf Python API. + +5. Launching LBANN by calling :python:`run`. + + + :python:`lbann.run` should be run from a compute node. If a node + allocation is not available, the :python:`batch_job` option can + be set to submit a batch job to the scheduler. + + + A timestamped work directory will be created each time LBANN is + run. The default location of these work directories can be set + with the environment variable :bash:`LBANN_EXPERIMENT_DIR`. + + + Supported job managers are Slurm and LSF. + + + LLNL users and collaborators may prefer to use + :python:`lbann.contrib.launcher.run`. This is similar to + :python:`lbann.run`, with defaults and optimizations for certain + systems. + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +A simple example +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + import lbann + + # ---------------------------------- + # Construct layer graph + # ---------------------------------- ------------------------------- -Running the inference engine ------------------------------- + # Input data + input = lbann.Input() + image = lbann.Identity(input) + label = lbann.Identity(input) -This section is under construction, requiring input from other team -members. Until it is complete, please ask questions on the -`issue tracker `_. 
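+   # (The input layer exposes one output tensor per data sample tensor,
+   # here the image and the label; the identity layers above make the
+   # parent/child relationships unambiguous, as recommended in the data
+   # reader notes.)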
+ # Softmax classifier + y = lbann.FullyConnected(image, num_neurons = 10, has_bias = True) + pred = lbann.Softmax(y) + + # Loss function and accuracy + loss = lbann.CrossEntropy([pred, label]) + acc = lbann.CategoricalAccuracy([pred, label]) + + # ---------------------------------- + # Setup experiment + # ---------------------------------- + + # Setup trainer + trainer = lbann.Trainer() + + # Setup model + mini_batch_size = 64 + num_epochs = 5 + model = lbann.Model(mini_batch_size, + num_epochs, + layers=lbann.traverse_layer_graph(input), + objective_function=loss, + metrics=[lbann.Metric(acc, name='accuracy', unit='%')], + callbacks=[lbann.CallbackPrint(), lbann.CallbackTimer()]) + + # Setup optimizer + opt = lbann.SGD(learn_rate=0.01, momentum=0.9) + + # Load data reader from prototext + import google.protobuf.text_format + data_reader_proto = lbann.lbann_pb2.LbannPB() + with open('path/to/lbann/model_zoo/data_readers/data_reader_mnist.prototext', 'r') as f: + google.protobuf.text_format.Merge(f.read(), data_reader_proto) + data_reader_proto = data_reader_proto.data_reader + + # ---------------------------------- + # Run experiment + # ---------------------------------- + + lbann.run(trainer, model, data_reader_proto, opt) + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Useful submodules +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +^^^^^^^^^^^^^^^^^^^^^^^^ +:python:`lbann.modules` +^^^^^^^^^^^^^^^^^^^^^^^^ + +A :python:`Module` is a pattern of layers that can be applied multiple +times in a neural network. Once created, a :python:`Module` is +*callable*, taking a layer as input and returning a layer as +output. They will create and manage :python:`Weights` objects internally, +so they are convenient for weight sharing between different +layers. They are also useful for complicated patterns like RNN cells. + +*A possible note of confusion*: "Modules" in LBANN are similar to +"layers" in PyTorch, TensorFlow, and Keras. LBANN uses "layer" to +refer to tensor operations, in a similar manner to Caffe. + +^^^^^^^^^^^^^^^^^^^^^^^^ +:python:`lbann.models` +^^^^^^^^^^^^^^^^^^^^^^^^ + +Several common and influential neural network models are implemented +as :python:`Module` s. They can be used as building blocks within more +complicated models. + +^^^^^^^^^^^^^^^^^^^^^^^^ +:python:`lbann.proto` +^^^^^^^^^^^^^^^^^^^^^^^^ + +The :python:`save_prototext` function will export a Protobuf text +file, which can be fed into the Protobuf frontend. + +^^^^^^^^^^^^^^^^^^^^^^^^ +:python:`lbann.onnx` +^^^^^^^^^^^^^^^^^^^^^^^^ + +This contains functionality to convert between LBANN and ONNX +models. See `python/docs/onnx/README.md +`_ +for full documentation. + +------------------------------------------------ +Protobuf frontend (advanced) +------------------------------------------------ + +The main LBANN driver uses Protobuf text files (sometimes called +prototext files) to specify experiments. The Python frontend operates +by "compiling" an experiment configuration into a Protobuf text file +and passing it into the LBANN driver. Aside from quick debugging, +there are very few situations where directly manipulating Protobuf +text files is superior to using the Python frontend. In fact, it is +possible to use Protobuf's Python API to programmatically manipulate +Protobuf messages, if such fine control is necessary. + +In order to fully specify an experiment, the user must provide +Protobuf text files for the model, default optimizer, and data +reader. These can be provided as three separate files or one unified +file.
The basic template for running LBANN is + +.. code-block:: bash + + \ + lbann --prototext=experiment.prototext +The LBANN Protobuf format is defined in `src/proto/lbann.proto +`_. It +is important to remember that the default value of a Protobuf field is +logically zero (e.g. false for Boolean fields and empty for string +fields). diff --git a/external/TBinf/TBinf.cpp b/external/TBinf/TBinf.cpp index b92141f9bc5..90e9dabdd8e 100644 --- a/external/TBinf/TBinf.cpp +++ b/external/TBinf/TBinf.cpp @@ -68,6 +68,26 @@ void SummaryWriter::add_scalar(const std::string tag, float value, write_summary_event(s, step); } +void SummaryWriter::add_image(const std::string& tag, + std::string encoded_img, + const std::vector& dims, + int64_t step){ + + auto s = std::unique_ptr(new tensorflow::Summary()); + tensorflow::Summary::Value *v = s->add_value(); + v->set_tag(tag); + tensorflow::Summary_Image *img = v->mutable_image(); + img->Clear(); + img->set_colorspace(dims[0]); + img->set_height(dims[1]); + img->set_width(dims[2]); + + img->set_encoded_image_string(std::move(encoded_img)); + + write_summary_event(s.release(), step); +} + + void SummaryWriter::add_histogram(const std::string tag, std::vector::const_iterator first, std::vector::const_iterator last, diff --git a/external/TBinf/TBinf.hpp b/external/TBinf/TBinf.hpp index 0a11937da71..0edcad8aa54 100644 --- a/external/TBinf/TBinf.hpp +++ b/external/TBinf/TBinf.hpp @@ -39,27 +39,40 @@ namespace TBinf { /** - * Write data to a Tensorboard logging directory. - * This writes data in the same format as Tensorflow does. + * @brief Write data to Tensorboard logging directory in Tensorflow format. */ class SummaryWriter { public: /** - * Create a new event file in logdir to write to. + * @brief Create a new event file in logdir to write to. * @param logdir The directory where the event file will be written. */ SummaryWriter(const std::string logdir); ~SummaryWriter(); /** - * Add a scalar value to the event file. + * @brief Add a scalar value to the event file. * @param tag The tag for this summary. * @param value The scalar value. * @param step Optional global step. */ void add_scalar(const std::string tag, float value, int64_t step = -1); + + /** + * @brief Add an image to the event file. + * @param tag The tag for this summary. + * @param encoded_img The image to be written. + * @param dims The dimensions of the image. + * @param step Optional global step. + */ + + void add_image(std::string const& tag, + std::string encoded_img, + const std::vector& dims, + int64_t step = -1); + /** - * Add a histogram of values to the event file. + * @brief Add a histogram of values to the event file. * @param tag The tag for this summary. * @param first Iterator to the first value to add. * @param last Iterator past the last value to add. @@ -70,7 +83,7 @@ class SummaryWriter { std::vector::const_iterator last, int64_t step = -1); /** - * Add a histogram based upon buckets to the event file. + * @brief Add a histogram based upon buckets to the event file. * @param tag The tag for this summary. * @param buckets The histogram buckets. * @param min The minimum value in the dataset. @@ -85,44 +98,44 @@ class SummaryWriter { double min, double max, double num, double sum, double sqsum, int64_t step = -1); - /** Return the current histogram buckets. */ + /** @brief Return the current histogram buckets. */ const std::vector& get_histogram_buckets() const; - /** Return the default histogram buckets. */ + /** @brief Return the default histogram buckets. 
*/ static std::vector get_default_histogram_buckets(); - /** Ensure all events are written out. */ + /** @brief Ensure all events are written out. */ void flush(); private: /** - * Write a summary to the event file. + * @brief Write a summary to the event file. * @param s The summary to write. * @param step Optional global step for the event. */ void write_summary_event(tensorflow::Summary *s, int64_t step = -1); /** - * Write an event to the event file. + * @brief Write an event to the event file. * @param e The event to write. */ void write_event(tensorflow::Event& e); - /** Get current wall time in fractional seconds. */ + /** @brief Get current wall time in fractional seconds. */ double get_time_in_seconds(); - /** Initialize histogram buckets. */ + /** @brief Initialize histogram buckets. */ void init_histogram_buckets(); - /** Current event version. */ + /** @brief Current event version. */ static constexpr const char *EVENT_VERSION = "brain.Event:2"; - /** Filename to write to. */ + /** @brief Filename to write to. */ std::string filename; - /** File stream for writing. */ + /** @brief File stream for writing. */ std::fstream file; - /** Current histogram buckets. */ + /** @brief Current histogram buckets. */ std::vector histogram_buckets; }; diff --git a/include/lbann/CMakeLists.txt b/include/lbann/CMakeLists.txt index 28123a8350b..bfd9b756b61 100644 --- a/include/lbann/CMakeLists.txt +++ b/include/lbann/CMakeLists.txt @@ -8,15 +8,21 @@ set_full_path(THIS_DIR_HEADERS # Add the subdirectories add_subdirectory(callbacks) +add_subdirectory(data_coordinator) add_subdirectory(data_readers) add_subdirectory(data_store) +add_subdirectory(execution_contexts) add_subdirectory(io) add_subdirectory(layers) +add_subdirectory(macros) add_subdirectory(metrics) add_subdirectory(models) add_subdirectory(objective_functions) add_subdirectory(optimizers) add_subdirectory(proto) +add_subdirectory(trainers) +add_subdirectory(training_algorithms) +add_subdirectory(transforms) add_subdirectory(utils) add_subdirectory(weights) diff --git a/include/lbann/base.hpp b/include/lbann/base.hpp index a4baa63c443..996bb655524 100644 --- a/include/lbann/base.hpp +++ b/include/lbann/base.hpp @@ -27,24 +27,35 @@ #ifndef LBANN_BASE_HPP_INCLUDED #define LBANN_BASE_HPP_INCLUDED -#include "El.hpp" -#include "lbann/Elemental_extensions.hpp" -#include "lbann/utils/cyg_profile.hpp" -#include "lbann/utils/file_utils.hpp" +#include // Defines, among other things, DataType. #include "lbann_config.hpp" +#include "lbann/Elemental_extensions.hpp" +#include "lbann/utils/cyg_profile.hpp" +#include "lbann/utils/file_utils.hpp" +#include "lbann/utils/enum_iterator.hpp" +#ifdef LBANN_HAS_HALF +#include "lbann/utils/serialization.hpp" +#endif // LBANN_HAS_HALF + // Support for OpenMP macros #include "lbann/utils/omp_pragma.hpp" #include +#include +#include +#include namespace lbann { // Forward-declaration. class lbann_comm; +/// Creating an observer_ptr to complement the unique_ptr and shared_ptr +template using observer_ptr = typename std::add_pointer::type; + // Note that this should only be used to wrap the thing coming out of // initialize()! This will be removed when we have proper RAII around // these things. @@ -63,7 +74,7 @@ using world_comm_ptr = * @param seed RNG seed. * @return LBANN communicator corresponding to MPI_COMM_WORLD. */ -world_comm_ptr initialize(int& argc, char**& argv, int seed = -1); +world_comm_ptr initialize(int& argc, char**& argv); /** Destroy LBANN communicator. 
* @@ -72,6 +83,14 @@ world_comm_ptr initialize(int& argc, char**& argv, int seed = -1); */ void finalize(lbann_comm* comm = nullptr); +#ifdef LBANN_HAS_HALF +using cpu_fp16 = El::cpu_half_type; +#endif + +#ifdef LBANN_HAS_GPU_FP16 +using fp16 = El::gpu_half_type; +#endif + // Typedefs for Elemental matrices using AbsMat = El::AbstractMatrix; using CPUMat = El::Matrix; @@ -79,6 +98,7 @@ using CPUMat = El::Matrix; using GPUMat = El::Matrix; #endif // LBANN_HAS_GPU using AbsDistMat = El::AbstractDistMatrix; +using BaseDistMat = El::BaseDistMatrix; // Deprecated typedefs /// @todo Remove @@ -90,22 +110,45 @@ template using AbsDistMatReadProxy = El::AbstractDistMatrixReadDeviceProxy; using ElMat = El::ElementalMatrix; using BlockMat = El::BlockMatrix; + +template +using CPUMatDT = El::Matrix; + +template +using MCMRMatDT = El::DistMatrix; +template +using CircMatDT = El::DistMatrix; +template +using StarMatDT = El::DistMatrix; +template +using StarVCMatDT = El::DistMatrix; +template +using VCStarMatDT = El::DistMatrix; /// ColSumStarVCMat +template +using MCStarMatDT = El::DistMatrix; /// RowSumMat +template +using MRStarMatDT = El::DistMatrix; /// ColSumMat +template +using StarMRMatDT = El::DistMatrix; +template +using DistMatDT = MCMRMatDT; + template -using MCMRMat = El::DistMatrix; +using MCMRMat = MCMRMatDT; template -using CircMat = El::DistMatrix; +using CircMat = CircMatDT; template -using StarMat = El::DistMatrix; +using StarMat = StarMatDT; template -using StarVCMat = El::DistMatrix; +using StarVCMat = StarVCMatDT; template -using VCStarMat = El::DistMatrix; /// ColSumStarVCMat +using VCStarMat = VCStarMatDT; /// ColSumStarVCMat template -using MCStarMat = El::DistMatrix; /// RowSumMat +using MCStarMat = MCStarMatDT; /// RowSumMat template -using MRStarMat = El::DistMatrix; /// ColSumMat +using MRStarMat = MRStarMatDT; /// ColSumMat template -using StarMRMat = El::DistMatrix; +using StarMRMat = StarMRMatDT; using DistMat = MCMRMat; using Mat = El::Matrix; // Temporarily define as CPUMat @@ -116,42 +159,25 @@ using EvalType = double; /// Distributed matrix format enum class matrix_format {MC_MR, CIRC_CIRC, STAR_STAR, STAR_VC, MC_STAR, invalid}; +/// @todo This should move to hydrogen +std::string to_string(El::Device const& d); +El::Device device_from_string(std::string const& str); + /// Data layout that is optimized for different modes of parallelism enum class data_layout {MODEL_PARALLEL, DATA_PARALLEL, invalid}; -static matrix_format __attribute__((used)) data_layout_to_matrix_format(data_layout layout) { - matrix_format format; - switch(layout) { - case data_layout::MODEL_PARALLEL: - format = matrix_format::MC_MR; - break; - case data_layout::DATA_PARALLEL: - /// Weights are stored in STAR_STAR and data in STAR_VC - format = matrix_format::STAR_STAR; - break; - default: - throw(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " Invalid data layout selected"); - } - return format; -} +matrix_format data_layout_to_matrix_format(data_layout layout); +std::string to_string(data_layout const& dl); +data_layout data_layout_from_string(std::string const& str); /// Neural network execution mode enum class execution_mode {training, validation, testing, prediction, invalid}; -static const char *__attribute__((used)) _to_string(execution_mode m) { - switch(m) { - case execution_mode::training: - return "training"; - case execution_mode::validation: - return "validation"; - case execution_mode::testing: - return "testing"; - case execution_mode::prediction: - return "prediction"; 
- case execution_mode::invalid: - return "invalid"; - default: - throw("Invalid execution mode specified"); /// @todo this should be an lbann_exception but then the class has to move to resolve dependencies - } -} +std::string to_string(execution_mode m); +using execution_mode_iterator = enum_iterator; + +/** @brief Convert a string to an execution_mode. */ +execution_mode exec_mode_from_string(std::string const& str); +/** @brief Extract an execution_mode from a stream. */ +std::istream& operator>>(std::istream& os, execution_mode& e); /** Pooling layer mode */ enum class pool_mode {invalid, max, average, average_no_pad}; @@ -159,56 +185,26 @@ enum class pool_mode {invalid, max, average, average_no_pad}; /** returns a string representation of the pool_mode */ std::string get_pool_mode_name(pool_mode m); -// NA - Not applicable, used for input layers that don't produce a second output -enum class data_reader_target_mode {CLASSIFICATION, REGRESSION, RECONSTRUCTION, NA}; - /* * endsWith: http://thispointer.com/c-how-to-check-if-a-string-ends-with-an-another-given-string/ * Case Sensitive Implementation of endsWith() * It checks if the string 'mainStr' ends with given string * 'toMatch' */ -static bool __attribute__((used)) endsWith(const std::string mainStr, const std::string &toMatch) -{ - if(mainStr.size() >= toMatch.size() && - mainStr.compare(mainStr.size() - toMatch.size(), toMatch.size(), toMatch) == 0) - return true; - else - return false; -} +bool endsWith(const std::string mainStr, const std::string &toMatch); /// Print the dimensions and name of a Elemental matrix -static void __attribute__((used)) _print_matrix_dims(AbsDistMat *m, const char *name) { - std::cout << "DISPLAY MATRIX: " << name << " = " << m->Height() << " x " << m->Width() << std::endl; -} -#define PRINT_MATRIX_DIMS(x) _print_matrix_dims(x, #x); +void print_matrix_dims(AbsDistMat *m, const char *name); +#define LBANN_PRINT_MATRIX_DIMS(x) print_matrix_dims(x, #x); /// Print the dimensions and name of a Elemental matrix -static void __attribute__((used)) _print_local_matrix_dims(AbsMat *m, const char *name) { - std::cout << "DISPLAY MATRIX: " << name << " = " << m->Height() << " x " << m->Width() << std::endl; -} -#define PRINT_LOCAL_MATRIX_DIMS(x) _print_local_matrix_dims(x, #x); - -// FIXME -#if 1 -// __FILE__ -#define log_msg(...) {\ - char str[256];\ - sprintf(str, __VA_ARGS__);\ - std::cout << "[" << m_comm->get_trainer_rank() << "." << m_comm->get_rank_in_trainer() << "][" << __FUNCTION__ << "][Line " << __LINE__ << "]" << str << std::endl; \ - } -#define log_simple_msg(...) {\ - char str[256];\ - sprintf(str, __VA_ARGS__);\ - std::cout << "[" << __FUNCTION__ << "][Line " << __LINE__ << "]" << str << std::endl; \ - } -#else -#define log_msg(...) -#define log_simple_msg(...) -#endif +void print_local_matrix_dims(AbsMat *m, const char *name); +#define LBANN_PRINT_LOCAL_MATRIX_DIMS(x) print_local_matrix_dims(x, #x); + +#define LBANN_MAKE_STR_(x) #x +#define LBANN_MAKE_STR(x) LBANN_MAKE_STR_(x) -#define LBANN_MAKE_STR(x) _LBANN_MAKE_STR(x) -#define _LBANN_MAKE_STR(x) #x +void lbann_mpi_err_handler(MPI_Comm *comm, int *err_code, ... 
); } // namespace lbann diff --git a/include/lbann/callbacks/CMakeLists.txt b/include/lbann/callbacks/CMakeLists.txt index 8466fdf53ef..db67fe83570 100644 --- a/include/lbann/callbacks/CMakeLists.txt +++ b/include/lbann/callbacks/CMakeLists.txt @@ -1,35 +1,44 @@ # Add the headers for this directory set_full_path(THIS_DIR_HEADERS callback.hpp - callback_check_dataset.hpp - callback_check_gradients.hpp - callback_check_init.hpp - callback_check_metric.hpp - callback_checknan.hpp - callback_checksmall.hpp - callback_confusion_matrix.hpp - callback_debug.hpp - callback_debug_io.hpp - callback_dump_outputs.hpp - callback_dump_error_signals.hpp - callback_dump_gradients.hpp - callback_dump_minibatch_sample_indices.hpp - callback_dump_weights.hpp - callback_early_stopping.hpp - callback_hang.hpp - callback_imcomm.hpp - callback_io.hpp - callback_learning_rate.hpp - callback_ltfb.hpp - callback_perturb_adam.hpp - callback_print.hpp - callback_save_images.hpp - callback_save_model.hpp - callback_summary.hpp - callback_timer.hpp - callback_variable_minibatch.hpp + check_dataset.hpp + check_gradients.hpp + check_init.hpp + check_metric.hpp + check_nan.hpp + check_small.hpp + checkpoint.hpp + confusion_matrix.hpp + debug.hpp + debug_io.hpp + dump_error_signals.hpp + dump_gradients.hpp + dump_minibatch_sample_indices.hpp + dump_outputs.hpp + dump_weights.hpp + early_stopping.hpp + gpu_memory_usage.hpp + hang.hpp + imcomm.hpp + learning_rate.hpp + ltfb.hpp + mixup.hpp + monitor_io.hpp + perturb_adam.hpp + perturb_dropout.hpp + print_model_description.hpp + print_statistics.hpp profiler.hpp - callback_gpu_memory_usage.hpp + replace_weights.hpp + save_images.hpp + save_model.hpp + save_topk_models.hpp + set_weights_value.hpp + summary.hpp + sync_layers.hpp + timeline.hpp + timer.hpp + variable_minibatch.hpp ) # Propagate the files up the tree diff --git a/include/lbann/callbacks/callback.hpp b/include/lbann/callbacks/callback.hpp index fae45448bb8..36b0cd8cb67 100644 --- a/include/lbann/callbacks/callback.hpp +++ b/include/lbann/callbacks/callback.hpp @@ -23,20 +23,36 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. // -// lbann_callback .hpp - Base class for LBANN callbacks +// callback .hpp - Base class for LBANN callbacks //////////////////////////////////////////////////////////////////////////////// -#ifndef __LBANN_CALLBACKS_CALLBACK_HPP_INCLUDED -#define __LBANN_CALLBACKS_CALLBACK_HPP_INCLUDED +#ifndef LBANN_CALLBACKS_CALLBACK_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_HPP_INCLUDED -#include "lbann/base.hpp" -#include "lbann/utils/summary.hpp" -#include "lbann/models/model.hpp" +#include "lbann/trainers/trainer.hpp" #include "lbann/layers/layer.hpp" +#include "lbann/models/model.hpp" +#include "lbann/utils/description.hpp" +#include "lbann/utils/memory.hpp" +#include "lbann/utils/summary.hpp" +#include "lbann/execution_contexts/sgd_execution_context.hpp" + +#include + +#include +#include + +/** @brief A utility macro for easily adding default-constructed sub-class + * builders.*/ +#define LBANN_ADD_DEFAULT_CALLBACK_BUILDER(Class, FunctionName) \ + inline std::unique_ptr FunctionName( \ + const google::protobuf::Message&, std::shared_ptr const&) { \ + return lbann::make_unique(); \ + } namespace lbann { -/** @class lbann_callback +/** @class callback_base * @brief Base class for callbacks during training/testing. * * The method of each callback is called at a given point during @@ -44,37 +60,35 @@ namespace lbann { * care about. 
Callbacks may be passed a lbann_summary instance, * which they can use to log any relevant information. */ -class lbann_callback { +class callback_base { public: /** @name Constructors and destructor */ ///@{ - /** @brief Initialize a callback with an optional batch interval and - * summarizer. + /** @brief Initialize a callback with an optional batch interval */ - lbann_callback(int batch_interval = 1, - lbann_summary *summarizer = nullptr) : - m_batch_interval(std::max(batch_interval, 1)), m_summarizer(summarizer) {} - lbann_callback(const lbann_callback&) = default; - virtual ~lbann_callback() {} + callback_base(int batch_interval = 1) : + m_batch_interval(std::max(batch_interval, 1)) {} + callback_base(const callback_base&) = default; + virtual ~callback_base() = default; ///@} /** @name Polymorphic copy */ ///@{ - virtual lbann_callback* copy() const = 0; + virtual callback_base* copy() const = 0; ///@} /** @name Modifiers */ ///@{ - void set_summarizer(lbann_summary *summarizer) { - m_summarizer = summarizer; - } + /** @brief Called once to set up the callback on the trainer + */ + virtual void setup(trainer *t) {}; - /** @brief Called once to set up the callback (after all layers are - * set up). + /** @brief Called once to set up the callback on the model + * (after all layers are set up). */ virtual void setup(model *m) {}; @@ -82,6 +96,8 @@ class lbann_callback { /** @name Callback hooks */ ///@{ + /** @brief Called at the end of setup. */ + virtual void on_setup_end(model *m) {} /** @brief Called at the beginning of training. */ virtual void on_train_begin(model *m) {} /** @brief Called at the end of training. */ @@ -166,25 +182,58 @@ class lbann_callback { /** @brief Return this callback's name. */ virtual std::string name() const = 0; + /** @brief Human-readable description. */ + virtual description get_description() const; + ///@} + /** @brief Build a standard directory hierachy including trainer, + * execution context, and model information (in that order). + */ + inline std::string get_multi_trainer_ec_model_path(const model& m, + const std::string& root_dir) { + std::string dir = root_dir; + if (dir.empty()) { dir = "./"; } + if (dir.back() != '/') { dir += "/"; } + + const auto& c = static_cast(m.get_execution_context()); + return build_string(dir, + c.get_trainer().get_name(), '/', + c.get_state_string(), '/', + m.get_name(), '/'); + } + + /** @brief Build a standard directory hierachy including trainer, + * model information in that order. + */ + inline std::string get_multi_trainer_model_path(const model& m, + const std::string& root_dir) { + std::string dir = root_dir; + if (dir.empty()) { dir = "./"; } + if (dir.back() != '/') { dir += "/"; } + + const auto& c = static_cast(m.get_execution_context()); + return build_string(dir, + c.get_trainer().get_name(), '/', + m.get_name(), '/'); + } + + protected: /** @brief Copy-assignment operator. * * Performs a shallow (pointer) copy of the summarizer. */ - lbann_callback& operator=(const lbann_callback&) = default; + callback_base& operator=(const callback_base&) = default; protected: - /** @todo Make lbann_callback data private */ + /** @todo Make callback data private */ /** @brief Batch methods should once every this many steps. */ int m_batch_interval; - /** @brief Optional summarizer for the callbacks to use. 
*/ - lbann_summary *m_summarizer; }; } // namespace lbann -#endif // __LBANN_CALLBACKS_CALLBACK_HPP_INCLUDED +#endif // LBANN_CALLBACKS_CALLBACK_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_check_dataset.hpp b/include/lbann/callbacks/callback_check_dataset.hpp deleted file mode 100644 index 09ce25d723f..00000000000 --- a/include/lbann/callbacks/callback_check_dataset.hpp +++ /dev/null @@ -1,73 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_CHECK_DATASET_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_CHECK_DATASET_HPP_INCLUDED - -#include -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** - * Save the sample indices for each mini-batch to ordered set. - * Check to make sure that all samples were properly processed. - */ -class lbann_callback_check_dataset : public lbann_callback { - public: - using lbann_callback::on_forward_prop_end; - using lbann_callback::on_evaluate_forward_prop_end; - - lbann_callback_check_dataset() : - lbann_callback() {} - lbann_callback_check_dataset( - const lbann_callback_check_dataset&) = default; - lbann_callback_check_dataset& operator=( - const lbann_callback_check_dataset&) = default; - lbann_callback_check_dataset* copy() const override { - return new lbann_callback_check_dataset(*this); - } - void on_forward_prop_end(model *m, Layer *l) override; - void on_evaluate_forward_prop_end(model *m, Layer *l) override; - void on_epoch_end(model *m) override; - void on_validation_end(model *m) override; - void on_test_end(model *m) override; - - void add_to_set(model *m, Layer *l, int64_t step, std::set &set); - - std::string name() const override { return "check data set indices"; } - private: - /** @brief Basename for writing files. */ - std::string m_basename; - - std::set training_set; - std::set validation_set; - std::set testing_set; -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_CHECK_DATASET_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_check_gradients.hpp b/include/lbann/callbacks/callback_check_gradients.hpp deleted file mode 100644 index 8433a00d5f1..00000000000 --- a/include/lbann/callbacks/callback_check_gradients.hpp +++ /dev/null @@ -1,83 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. 
-// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_CHECK_GRADIENTS_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_CHECK_GRADIENTS_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** Gradient checking callback. - * Gradient checking is performed at the beginning of the test - * phase. Using a fourth-order finite difference scheme, a numerical - * partial derivative is computed for every weight parameter. If the - * numerical derivative differs signifcantly from the analytical - * derivative computed during backprop, the gradient check has - * failed. - */ -class lbann_callback_check_gradients : public lbann_callback { -public: - - /** Constructor. - * @param step_size Step size for numerical - * differentiation (with a step size of - * zero, the step size is chosen to - * minimize the numerical error). - * @param verbose Whether to print results for each - * parameter. - * @param error_on_failure Whether to throw an exception for - * large gradient errors. - */ - lbann_callback_check_gradients(DataType step_size = DataType(0), - bool verbose = false, - bool error_on_failure = false); - lbann_callback_check_gradients* copy() const override { - return new lbann_callback_check_gradients(*this); - } - void on_test_begin(model *m) override; - std::string name() const override { return "check gradients"; } - - /** Compute objective function value. - * It is assumed that input data has already been loaded into the - * activations of the first layer. - */ - DataType compute_objective_function(model *m); - -private: - - /** Step size for numerical differentiation. */ - DataType m_step_size; - /** Whether to print results for each parameter. */ - bool m_verbose; - /** Whether to throw an exception for large gradient errors. */ - bool m_error_on_failure; - -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_CHECK_GRADIENTS_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_check_init.hpp b/include/lbann/callbacks/callback_check_init.hpp deleted file mode 100644 index 6d5572379fb..00000000000 --- a/include/lbann/callbacks/callback_check_init.hpp +++ /dev/null @@ -1,58 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. 
-// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// lbann_callback_check_init .hpp .cpp - Check multi-model init -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_CHECK_INIT_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_CHECK_INIT_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** - * Verify that every model uses the same initialization. - */ -class lbann_callback_check_init : public lbann_callback { - public: - lbann_callback_check_init() : lbann_callback() {} - lbann_callback_check_init(const lbann_callback_check_init&) = default; - lbann_callback_check_init& operator=( - const lbann_callback_check_init&) = default; - lbann_callback_check_init* copy() const override { - return new lbann_callback_check_init(*this); - } - /** Check initializations. */ - void on_train_begin(model *m) override; - std::string name() const override { return "check init"; } - private: - /** Return true if x == y. */ - bool check_equal(const AbsMat& x, const AbsMat& y) const; -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_CHECK_INIT_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_check_metric.hpp b/include/lbann/callbacks/callback_check_metric.hpp deleted file mode 100644 index 8b094c8c395..00000000000 --- a/include/lbann/callbacks/callback_check_metric.hpp +++ /dev/null @@ -1,78 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_CHECK_METRIC_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_CHECK_METRIC_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" -#include - -namespace lbann { - -/** Metric checking callback. 
- * Checks if a metric value falls within an expected range. - */ -class lbann_callback_check_metric : public lbann_callback { -public: - - lbann_callback_check_metric(std::string metric_name, - std::set modes, - EvalType lower_bound, - EvalType upper_bound, - bool error_on_failure); - lbann_callback_check_metric* copy() const override { return new lbann_callback_check_metric(*this); } - std::string name() const override { return "check metric"; } - - void on_epoch_end(model* m) override { check_metric(*m); } - void on_validation_end(model* m) override { check_metric(*m); } - void on_test_end(model* m) override { check_metric(*m); } - -private: - - /** Metric name. */ - std::string m_metric_name; - - /** Execution modes with metric checks. */ - std::set m_modes; - - /** Lower bound for metric value. */ - EvalType m_lower_bound; - /** Upper bound for metric value. */ - EvalType m_upper_bound; - - /** Whether to throw an exception if metric check fails. */ - bool m_error_on_failure; - - /** Perform metric check. - * Does nothing if current execution mode is not in m_modes; - */ - void check_metric(const model& m) const; - -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_CHECK_METRIC_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_checknan.hpp b/include/lbann/callbacks/callback_checknan.hpp deleted file mode 100644 index c45a7eee95c..00000000000 --- a/include/lbann/callbacks/callback_checknan.hpp +++ /dev/null @@ -1,66 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// lbann_callback_checknan .hpp .cpp - Check matrices for invalid numbers -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_CHECKNAN_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_CHECKNAN_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** - * Check matrices for whether they include any NaNs or infs to help debugging. - * This will kill the rank if such values are discovered. 
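// A rough sketch of the range test the check_metric callback performs at epoch,
// validation, and test end; the function name and error text are illustrative.
#include <stdexcept>
#include <string>

// Returns true when the value lies in [lower, upper]; optionally throws on failure.
bool check_metric_value(const std::string& metric_name, double value,
                        double lower, double upper, bool error_on_failure) {
  const bool ok = (value >= lower) && (value <= upper);
  if (!ok && error_on_failure) {
    throw std::runtime_error("metric \"" + metric_name + "\" = " + std::to_string(value)
                             + " is outside [" + std::to_string(lower) + ", "
                             + std::to_string(upper) + "]");
  }
  return ok;
}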
- */ -class lbann_callback_checknan : public lbann_callback { - public: - using lbann_callback::on_forward_prop_end; - using lbann_callback::on_backward_prop_end; - - lbann_callback_checknan() : lbann_callback() {} - lbann_callback_checknan(const lbann_callback_checknan&) = default; - lbann_callback_checknan& operator=( - const lbann_callback_checknan&) = default; - lbann_callback_checknan* copy() const override { - return new lbann_callback_checknan(*this); - } - /** Check that activations are good. */ - void on_forward_prop_end(model *m, Layer *l) override; - /** Check that error signals are good. */ - void on_backward_prop_end(model *m, Layer *l) override; - /** Check that gradients are good. */ - void on_backward_prop_end(model *m) override; - /** Check that weights are good. */ - void on_batch_end(model *m) override; - std::string name() const override { return "checknan"; } - -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_CHECKNAN_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_checkpoint.hpp b/include/lbann/callbacks/callback_checkpoint.hpp deleted file mode 100644 index ebeacdeaa7e..00000000000 --- a/include/lbann/callbacks/callback_checkpoint.hpp +++ /dev/null @@ -1,207 +0,0 @@ -////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// lbann_callback_checkpoint .hpp .cpp - Callback hooks to checkpoint model -//////////////////////////////////////////////////////////////////////////////// -#ifndef LBANN_CALLBACKS_CALLBACK_CHECKPOINT_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_CHECKPOINT_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" -#include "lbann/io/persist.hpp" - -namespace lbann { - -/** @brief Checkpoint at given interval in given directory */ -class lbann_callback_checkpoint : public lbann_callback { - public: - - /** @brief Construct the checkpoint callback - * - * It may be beneficial to the distributed checkpoints at a higher - * tempo than the shared checkpoints because they are less - * expensive. 
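// The scan the checknan callback relies on reduces to a finiteness test over every
// matrix entry; a self-contained sketch, independent of LBANN's matrix types:
#include <cmath>
#include <vector>

// Returns true when every entry is finite, i.e. neither NaN nor +/-inf.
bool all_finite(const std::vector<float>& values) {
  for (const float v : values) {
    if (!std::isfinite(v)) { return false; }
  }
  return true;
}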
- * - * @param checkpoint_dir directory to save checkpoint files - * @param checkpoint_epochs interval to checkpoint - * @param checkpoint_steps interval to checkpoint - * @param checkpoint_secs interval to checkpoint - * @param per_rank_dir The directory into which to dump distributed checkpoints - * @param ckpt_dist_epochs The frequency of distributed checkpoints in epochs - * @param ckpt_dist_steps The frequence of distributed checkpoints in steps - */ - lbann_callback_checkpoint(std::string checkpoint_dir, - int checkpoint_epochs, - int checkpoint_steps, - int checkpoint_secs, - std::string per_rank_dir, - int ckpt_dist_epochs, - int ckpt_dist_steps) : - lbann_callback(), - m_checkpoint_dir(checkpoint_dir), - m_checkpoint_epochs(checkpoint_epochs), - m_checkpoint_steps(checkpoint_steps), - m_checkpoint_secs(checkpoint_secs), - m_per_rank_dir(per_rank_dir), - m_ckpt_dist_epochs(ckpt_dist_epochs), - m_ckpt_dist_steps(ckpt_dist_steps) {} - lbann_callback_checkpoint(const lbann_callback_checkpoint&) = default; - lbann_callback_checkpoint& operator=(const lbann_callback_checkpoint&) = default; - lbann_callback_checkpoint* copy() const override { return new lbann_callback_checkpoint(*this); } - void setup(model *m) override; - void on_epoch_end(model *m) override; - void on_batch_end(model *m) override; - void on_validation_end(model *m) override; - - inline void set_checkpoint_dir(std::string dir){ - m_checkpoint_dir= dir; - } - - inline void set_checkpoint_epochs(int epochs){ - m_checkpoint_epochs= epochs; - } - - inline void set_checkpoint_steps(int steps){ - m_checkpoint_steps= steps; - } - - inline void set_checkpoint_secs(EvalType secs){ - m_checkpoint_secs= secs; - } - - inline void set_per_rank_dir(std::string dir){ - m_per_rank_dir = dir; - } - - inline void set_ckpt_dist_epochs(int ckpt_dist_epochs){ - m_ckpt_dist_epochs = ckpt_dist_epochs; - } - - inline void set_ckpt_dist_steps(int ckpt_dist_steps){ - m_ckpt_dist_steps = ckpt_dist_steps; - } - - bool need_checkpoint(model *m); - bool checkpoint(model *m); - bool restart(model *m); - std::string name() const override { return "checkpoint"; } - protected: - std::string m_checkpoint_dir; - int m_checkpoint_epochs; - int m_checkpoint_steps; - EvalType m_checkpoint_secs; - std::string m_per_rank_dir; - int m_ckpt_dist_epochs; - int m_ckpt_dist_steps; - EvalType m_checkpoint_last; - persist p; - bool m_checkpoint_dist; - bool m_checkpoint_shared; - - template - struct header_t { - int epoch; - int step; - int shared; - char dirname[_max_dir_len]; - }; -}; - -static inline std::string get_last_shared_checkpoint_filename(model *m, std::string dir) { - lbann_comm *comm = m->get_comm(); - std::stringstream ss; - ss << dir << "/"; - ss << m->get_name().c_str() << "."; - ss << comm->get_trainer_rank() << ".last.shared.checkpoint"; - return ss.str(); -} - -static inline std::string get_shared_checkpoint_dirname(model *m, std::string dir, int epoch, int step) { - lbann_comm *comm = m->get_comm(); - std::stringstream ss; - ss << dir << "/" << m->get_name().c_str(); - ss << "." << comm->get_trainer_rank(); - ss << ".shared.epoch." 
<< epoch; - ss << ".step."<< step << "/"; - return ss.str(); -} - -static inline std::string get_last_distributed_checkpoint_filename(model *m, std::string dir) { - lbann_comm *comm = m->get_comm(); - std::stringstream ss; - ss << dir << "/"; - ss << m->get_name().c_str() << "."; - ss << comm->get_trainer_rank() << ".last.distributed.checkpoint"; - return ss.str(); -} - -static inline std::string get_distributed_checkpoint_dirname(model *m, std::string dir, int epoch, int step) { - lbann_comm *comm = m->get_comm(); - std::stringstream ss; - ss << dir << "/" << m->get_name().c_str(); - ss << "." << comm->get_trainer_rank(); - ss << ".rank." << comm->get_rank_in_trainer(); - ss << ".epoch." << epoch; - ss << ".step."<< step << "/"; - return ss.str(); -} - -// Print last checkpoint to file, used to determine which checkpoint to load from. -static inline bool write_latest(std::string filename, int epoch, int train) { - // open the file for writing - int fd = openwrite(filename.c_str()); - if (fd != -1) { - char field[256]; - sprintf(field, "epoch=%d step=%d\n", epoch, train); - write_string(fd, filename.c_str(), field, strlen(field)); - // close our file - closewrite(fd, filename.c_str()); - } - return true; -} - -/** \brief Reads the "latest" file and returns the epoch number and - * sample offset for most recent checkpoint - */ -static inline bool read_latest(std::string filename, int *epochLast, int *trainLast) { - // assume we don't have a file, we'll return -1 in that case - *epochLast = -1; - *trainLast = -1; - // open the file for reading - int fd = openread(filename.c_str()); - if (fd != -1) { - // read epoch from file - char field[256]; - read_string(fd, filename.c_str(), field, sizeof(field)); - int ret = sscanf(field, "epoch=%d step=%d\n", epochLast, trainLast); - // close our file - closeread(fd, filename.c_str()); - if(ret != 2) { return false; } - } - return true; -} - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_CHECKPOINT_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_checksmall.hpp b/include/lbann/callbacks/callback_checksmall.hpp deleted file mode 100644 index 2f66a04d2d9..00000000000 --- a/include/lbann/callbacks/callback_checksmall.hpp +++ /dev/null @@ -1,72 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
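// write_latest/read_latest above round-trip a one-line "epoch=%d step=%d" record;
// the same format can be exercised with portable stdio, standing in for LBANN's
// openwrite/openread file-descriptor helpers purely for illustration.
#include <cstdio>

bool write_latest_stdio(const char* filename, int epoch, int step) {
  std::FILE* f = std::fopen(filename, "w");
  if (f == nullptr) { return false; }
  std::fprintf(f, "epoch=%d step=%d\n", epoch, step);
  return std::fclose(f) == 0;
}

// Mirrors the callback's behaviour: a missing file is not an error and yields -1/-1.
bool read_latest_stdio(const char* filename, int* epoch, int* step) {
  *epoch = -1;
  *step = -1;
  std::FILE* f = std::fopen(filename, "r");
  if (f == nullptr) { return true; }
  const int matched = std::fscanf(f, "epoch=%d step=%d", epoch, step);
  std::fclose(f);
  return matched == 2;
}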
-// -// lbann_callback_checksmall .hpp .cpp - Check matrices for small values -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_CHECKSMALL_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_CHECKSMALL_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** - * Check matrices for whether they include any very small values to avoid - * getting denormalized values. Denormalized values can significantly slow - * floating point computations. - * Since we often square values, the check is based on the square root of the - * smallest floating point value. - * This will kill the rank if such values are discovered. - */ -class lbann_callback_checksmall : public lbann_callback { - public: - using lbann_callback::on_forward_prop_end; - using lbann_callback::on_backward_prop_end; - - lbann_callback_checksmall() : lbann_callback() {} - lbann_callback_checksmall(const lbann_callback_checksmall&) = default; - lbann_callback_checksmall& operator=( - const lbann_callback_checksmall&) = default; - lbann_callback_checksmall* copy() const override { - return new lbann_callback_checksmall(*this); - } - /** Check that activations are good. */ - void on_forward_prop_end(model *m, Layer *l) override; - /** Check that gradients are good. */ - void on_backward_prop_end(model *m) override; - /** Check that weights are good. */ - void on_batch_end(model *m) override; - std::string name() const override { return "checksmall"; } - private: - /** Smallest allowable value. */ - const DataType m_threshold = std::sqrt(std::numeric_limits::min()); - /** Return true if there are no problems with m. */ - bool is_good(const AbsDistMat& m); -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_CHECKSMALL_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_confusion_matrix.hpp b/include/lbann/callbacks/callback_confusion_matrix.hpp deleted file mode 100644 index b87dc8b24a0..00000000000 --- a/include/lbann/callbacks/callback_confusion_matrix.hpp +++ /dev/null @@ -1,115 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_CONFUSION_MATRIX_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_CONFUSION_MATRIX_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** Compute confusion matrix. 
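// The checksmall threshold is the square root of the smallest normalized value, so
// that squaring a checked entry cannot underflow into denormals; a standalone
// version of that test:
#include <cmath>
#include <limits>
#include <vector>

// Returns true when no nonzero entry is small enough for its square to denormalize.
bool no_tiny_values(const std::vector<float>& values) {
  const float threshold = std::sqrt(std::numeric_limits<float>::min());
  for (const float v : values) {
    if (v != 0.0f && std::fabs(v) < threshold) { return false; }
  }
  return true;
}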
- * Confusion matrices are saved in CSV files of the form - * ".csv". The (i,j)-entry is the proportion of samples - * with prediction i and label j. The prediction and label layers are - * assumed to output one-hot vectors for each mini-batch sample. - */ -class lbann_callback_confusion_matrix : public lbann_callback { -public: - - lbann_callback_confusion_matrix(std::string prediction_layer, - std::string label_layer, - std::string prefix); - lbann_callback_confusion_matrix(const lbann_callback_confusion_matrix&); - lbann_callback_confusion_matrix& operator=(const lbann_callback_confusion_matrix&); - lbann_callback_confusion_matrix* copy() const override { - return new lbann_callback_confusion_matrix(*this); - } - std::string name() const override { return "confusion matrix"; } - - void setup(model *m) override; - - void on_epoch_begin(model *m) override { reset_counts(*m); } - void on_epoch_end(model *m) override { save_confusion_matrix(*m); } - void on_validation_begin(model *m) override { reset_counts(*m); } - void on_validation_end(model *m) override { save_confusion_matrix(*m); } - void on_test_begin(model *m) override { reset_counts(*m); } - void on_test_end(model *m) override { save_confusion_matrix(*m); } - void on_batch_end(model *m) override { update_counts(*m); } - void on_batch_evaluate_end(model *m) override { update_counts(*m); } - -private: - - /** Name of prediction layer. - * This layer is assumed to output one-hot vectors. - */ - std::string m_prediction_layer; - /** Name of label layer. - * This layer is assumed to output one-hot vectors. - */ - std::string m_label_layer; - /** Prefix for output files. */ - std::string m_prefix; - - /** Confusion matrix counts. - * Each vector should be interpreted as a num_classes x num_classes - * matrix in row-major order. The (i,j)-entry is the number of - * samples with prediction i and label j. - */ - std::map> m_counts; - - /** "View" into prediction matrix. - * This is a CPU matrix. If the prediction layer keeps data on GPU, - * then this will be a matrix copy rather than a matrix view. - */ - std::unique_ptr m_predictions_v; - /** "View" into label matrix. - * This is a CPU matrix. If the label layer keeps data on GPU or in - * a different distribution than the prediction layer, then this - * will be a matrix copy rather than a matrix view. - */ - std::unique_ptr m_labels_v; - - /** Get prediction matrix. */ - const AbsDistMat& get_predictions(const model& m) const; - /** Get label matrix. */ - const AbsDistMat& get_labels(const model& m) const; - - /** Reset confusion matrix counts. */ - void reset_counts(const model& m); - /** Update confusion matrix counts. - * Counts are updated with current mini-batch predictions and - * labels. - */ - void update_counts(const model& m); - /** Output confusion matrix to file. */ - void save_confusion_matrix(const model& m); - -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_CONFUSION_MATRIX_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_debug.hpp b/include/lbann/callbacks/callback_debug.hpp deleted file mode 100644 index c342c7ad778..00000000000 --- a/include/lbann/callbacks/callback_debug.hpp +++ /dev/null @@ -1,108 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. 
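// A hedged sketch of the row-major counts layout used by the confusion-matrix
// callback: counts[i * num_classes + j] is the number of samples with prediction i
// and label j, and the CSV stores each count as a proportion of the total.
#include <cstddef>
#include <cstdio>
#include <vector>

void update_counts(std::vector<long>& counts, int num_classes,
                   const std::vector<int>& predictions, const std::vector<int>& labels) {
  for (std::size_t s = 0; s < predictions.size(); ++s) {
    counts[predictions[s] * num_classes + labels[s]] += 1;
  }
}

void save_confusion_csv(const char* path, const std::vector<long>& counts, int num_classes) {
  long total = 0;
  for (const long c : counts) { total += c; }
  std::FILE* f = std::fopen(path, "w");
  if (f == nullptr) { return; }
  for (int i = 0; i < num_classes; ++i) {
    for (int j = 0; j < num_classes; ++j) {
      const double p = total > 0
                         ? static_cast<double>(counts[i * num_classes + j]) / total
                         : 0.0;
      std::fprintf(f, "%s%g", j == 0 ? "" : ",", p);
    }
    std::fprintf(f, "\n");
  }
  std::fclose(f);
}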
-// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_DEBUG_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_DEBUG_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** - * @brief Phase specific "printf debugging" - * - * Print verbose status updates to standard error stream. - * This callback is useful for "printf debugging." - * - * Takes a prototext parameter @c phase: train | validate | test | \ - * if \ will print messages for all phases - * - */ -class lbann_callback_debug : public lbann_callback { - public: - - /** @brief Constructor. - * - * If modes is empty, status updates will be printed for all - * execution modes. - */ - lbann_callback_debug(std::set modes, - lbann_summary *summarizer = nullptr) : - lbann_callback(1, summarizer), m_modes(std::move(modes)) {} - lbann_callback_debug(const lbann_callback_debug&) = default; - lbann_callback_debug& operator=(const lbann_callback_debug&) = default; - lbann_callback_debug* copy() const override { return new lbann_callback_debug(*this); } - std::string name() const override { return "debug"; } - - /** @brief Print that a batch is beginning. */ - void on_batch_begin(model *m) override; - /** @brief Print that a batch is ending. */ - void on_batch_end(model *m) override; - /** @brief Print that a layer's forward prop is beginning. */ - void on_batch_evaluate_begin(model *m) override; - /** @brief Print that a layer's forward prop is ending. */ - void on_batch_evaluate_end(model *m) override; - - using lbann_callback::on_forward_prop_begin; - using lbann_callback::on_forward_prop_end; - using lbann_callback::on_backward_prop_begin; - using lbann_callback::on_backward_prop_end; - using lbann_callback::on_evaluate_forward_prop_begin; - using lbann_callback::on_evaluate_forward_prop_end; - using lbann_callback::on_optimize_begin; - using lbann_callback::on_optimize_end; - - /** @brief Print that a layer's forward prop is beginning. */ - void on_forward_prop_begin(model *m, Layer *l) override; - /** @brief Print that a layer's forward prop is ending. */ - void on_forward_prop_end(model *m, Layer *l) override; - /** @brief Print that a layer's backward prop is beginning. */ - void on_backward_prop_begin(model *m, Layer *l) override; - /** @brief Print that a layer's backward prop is ending. */ - void on_backward_prop_end(model *m, Layer *l) override; - /** @brief Print that a layer's backward prop is beginning. */ - void on_evaluate_forward_prop_begin(model *m, Layer *l) override; - /** @brief Print that a layer's backward prop is ending. */ - void on_evaluate_forward_prop_end(model *m, Layer *l) override; - - /** @brief Print that a weights' optimization step is beginning. 
*/ - void on_optimize_begin(model *m, weights *w) override; - /** @brief Print that a weights' optimization step is ending. */ - void on_optimize_end(model *m, weights *w) override; - - private: - - /** @brief Execution modes for which status updates will be printed. - * - * If empty, status updates are printed for all execution modes. - */ - std::set m_modes; - -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_DEBUG_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_debug_io.hpp b/include/lbann/callbacks/callback_debug_io.hpp deleted file mode 100644 index ffaff0af567..00000000000 --- a/include/lbann/callbacks/callback_debug_io.hpp +++ /dev/null @@ -1,89 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// lbann_callback_debug .hpp .cpp - Callback hooks to debug LBANN -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_DEBUG_IO_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_DEBUG_IO_HPP_INCLUDED - -#include -#include -#include "lbann/callbacks/callback.hpp" -#include "lbann/layers/io/input/input_layer.hpp" - -namespace lbann { - -/** - * Print status updates on where training is. - */ -class lbann_callback_debug_io : public lbann_callback { - public: - using lbann_callback::on_forward_prop_begin; - using lbann_callback::on_forward_prop_end; - using lbann_callback::on_backward_prop_begin; - using lbann_callback::on_backward_prop_end; - using lbann_callback::on_evaluate_forward_prop_begin; - using lbann_callback::on_evaluate_forward_prop_end; - - /** - * Debug a particular phase; use invalid to debug every phase. - */ - lbann_callback_debug_io(execution_mode phase = execution_mode::invalid, - int debug_lvl = 0, - lbann_summary *summarizer = nullptr) : - lbann_callback(1, summarizer), m_debug_phase(phase), m_debug_lvl(debug_lvl) {} - lbann_callback_debug_io(const lbann_callback_debug_io&) = default; - lbann_callback_debug_io& operator=( - const lbann_callback_debug_io&) = default; - lbann_callback_debug_io* copy() const override { return new lbann_callback_debug_io(*this); } - /** Print that a training epoch is being started. */ - void on_epoch_begin(model *m) override; - /** Print that forward prop for a layer is beginning. */ - void on_forward_prop_begin(model *m, Layer *l) override; - - /** Print I/O details at the beginning of validation. */ - void on_validation_begin(model *m) override; - /** Print that an evaluation forward prop is beginning. 
*/ - void on_evaluate_forward_prop_begin(model *m, Layer *l) override; - - /** Print I/O details at the beginning of testing. */ - void on_test_begin(model *m) override; - - /** Common format for printing I/O stats at the start of a mini-batch */ - void print_fp_start(model *m, generic_input_layer *input); - /** Common format for printing I/O stats at the start of a phase */ - void print_phase_start(model *m, execution_mode mode); - - std::string name() const override { return "debug_io"; } - private: - /** The phase to debug. */ - execution_mode m_debug_phase; - int m_debug_lvl; /** Debugging level: 0 - epoch begin, 1 - fwd prop */ -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_DEBUG_IO_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_dump_error_signals.hpp b/include/lbann/callbacks/callback_dump_error_signals.hpp deleted file mode 100644 index 0c5571d9597..00000000000 --- a/include/lbann/callbacks/callback_dump_error_signals.hpp +++ /dev/null @@ -1,63 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_DUMP_ERROR_SIGNALS_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_DUMP_ERROR_SIGNALS_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** Dump gradients w.r.t. inputs to file. - * After each layer performs a backward prop step, this callback will - * dump the gradients w.r.t. inputs (the "error signals") to a - * human-readable ASCII file. This is slow and produces a lot of output. - */ -class lbann_callback_dump_error_signals : public lbann_callback { - public: - - /** Constructor. - * @param basename The basename for output files. - */ - lbann_callback_dump_error_signals(std::string basename = "") - : lbann_callback(), m_basename(basename) {} - lbann_callback_dump_error_signals* copy() const override { - return new lbann_callback_dump_error_signals(*this); - } - std::string name() const override { return "dump error signals"; } - - /** Write error signals to file after each backward prop step. */ - void on_backward_prop_end(model *m, Layer *l) override; - - private: - /** Basename for output files. 
*/ - std::string m_basename; - -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_DUMP_ERROR_SIGNALS_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_dump_gradients.hpp b/include/lbann/callbacks/callback_dump_gradients.hpp deleted file mode 100644 index b0a6d587446..00000000000 --- a/include/lbann/callbacks/callback_dump_gradients.hpp +++ /dev/null @@ -1,73 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// lbann_callback_dump_gradients .hpp .cpp - Callbacks to dump gradients -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_DUMP_GRADIENTS_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_DUMP_GRADIENTS_HPP_INCLUDED - -#include - -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** - * @brief Dump gradient matrices to files. - * @details This will dump each hidden layer's gradient matrix after - * each minibatch. The matrices are written to files using - * Elemental's simple ASCII format. This is not meant for - * checkpointing, but for exporting gradient matrices for analysis - * that isn't easily done in LBANN. Note this dumps matrices during - * each mini-batch. This will be slow and produce a lot of output. - */ -class lbann_callback_dump_gradients : public lbann_callback { - public: - using lbann_callback::on_backward_prop_end; - - /** - * @param basename The basename for writing files. - * @param batch_interval The frequency at which to dump the gradients - */ - lbann_callback_dump_gradients(std::string basename, int batch_interval = 1) : - lbann_callback(batch_interval), m_basename(std::move(basename)) {} - lbann_callback_dump_gradients( - const lbann_callback_dump_gradients&) = default; - lbann_callback_dump_gradients& operator=( - const lbann_callback_dump_gradients&) = default; - lbann_callback_dump_gradients* copy() const override { - return new lbann_callback_dump_gradients(*this); - } - void on_backward_prop_end(model *m) override; - std::string name() const override { return "dump gradients"; } - private: - /** @brief Basename for writing files. 
*/ - std::string m_basename; -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_DUMP_GRADIENTS_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_dump_minibatch_sample_indices.hpp b/include/lbann/callbacks/callback_dump_minibatch_sample_indices.hpp deleted file mode 100644 index 8840b1a83c5..00000000000 --- a/include/lbann/callbacks/callback_dump_minibatch_sample_indices.hpp +++ /dev/null @@ -1,78 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// lbann_callback_dump_minibatch_sample_indices .hpp .cpp - Callbacks -// to dump the list of indices per minibatch -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_DUMP_MINIBATCH_SAMPLE_INDICES_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_DUMP_MINIBATCH_SAMPLE_INDICES_HPP_INCLUDED - -#include - -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** - * @brief Dump sample indices for each minibatch to files. - * @details This will dump the list of indices from the training / - * validation / testing data that was processed Note this dumps - * vectors during each mini-batch. This will be slow and produce a lot - * of output. - */ -class lbann_callback_dump_minibatch_sample_indices : public lbann_callback { - public: - using lbann_callback::on_forward_prop_end; - using lbann_callback::on_evaluate_forward_prop_end; - - /** - * @param basename The basename for writing files. - * @param batch_interval The frequency at which to dump sample indices - */ - lbann_callback_dump_minibatch_sample_indices(std::string basename, - int batch_interval = 1) : - lbann_callback(batch_interval), m_basename(std::move(basename)) {} - lbann_callback_dump_minibatch_sample_indices( - const lbann_callback_dump_minibatch_sample_indices&) = default; - lbann_callback_dump_minibatch_sample_indices& operator=( - const lbann_callback_dump_minibatch_sample_indices&) = default; - lbann_callback_dump_minibatch_sample_indices* copy() const override { - return new lbann_callback_dump_minibatch_sample_indices(*this); - } - void on_forward_prop_end(model *m, Layer *l) override; - void on_evaluate_forward_prop_end(model *m, Layer *l) override; - - void dump_to_file(model *m, Layer *l, int64_t step); - - std::string name() const override { return "dump minibatch sample indices"; } - private: - /** Basename for writing files. 
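// The dump_error_signals and dump_gradients callbacks above export matrices as plain
// ASCII text; the sketch below approximates that layout (one row per line, entries
// whitespace-separated) without depending on Elemental or LBANN.
#include <cstdio>
#include <string>
#include <vector>

bool dump_matrix_ascii(const std::string& basename, const std::vector<double>& mat,
                       int height, int width) {
  std::FILE* f = std::fopen((basename + ".txt").c_str(), "w");
  if (f == nullptr) { return false; }
  for (int i = 0; i < height; ++i) {
    for (int j = 0; j < width; ++j) {
      std::fprintf(f, "%s%.17g", j == 0 ? "" : " ", mat[i * width + j]);
    }
    std::fprintf(f, "\n");
  }
  return std::fclose(f) == 0;
}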
*/ - std::string m_basename; -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_DUMP_MINIBATCH_SAMPLE_INDICES_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_dump_outputs.hpp b/include/lbann/callbacks/callback_dump_outputs.hpp deleted file mode 100644 index 0ad260be495..00000000000 --- a/include/lbann/callbacks/callback_dump_outputs.hpp +++ /dev/null @@ -1,113 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_DUMP_OUTPUTS_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_DUMP_OUTPUTS_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" - -#include -#include - -namespace lbann { - -/** @brief Dump layer output tensors to files. - * - * Saves a file for each output tensor of each selected layer, - * computed at each mini-batch step. Output files have the form - * "--epoch<#>-step<#>--output<#>.". - * This is primarily intended as a debugging tool, although it can be - * used for inference when performance is not critical. - * - * For NumPy file formats (npy and npz), tensor dimensions are - * recorded. For text file formats (CSV and TSV), each line contains - * flattened tensor data corresponding to one mini-batch sample - * (which is the transpose of the column-major matrix representation - * we use internally). - * - * CNPY is required to export to NumPy file formats (npy and npz). - */ -class lbann_callback_dump_outputs : public lbann_callback { -public: - - /** @brief Construct a callback to dump outputs. - * - * @param layer_names Names of layers with output dumps - * (default: dump outputs for all layers). - * @param modes Execution modes with output dumps - * (default: dump outputs for all modes). - * @param batch_interval Frequency of output dumps (default: dump - * outputs at each mini-batch step). - * @param directory Directory for output files (default: current - * working directory). - * @param file_format Output file format. Options are csv, tsv, - * npy, npz (default: csv). 
- */ - lbann_callback_dump_outputs( - std::set layer_names,// = std::set(), - std::set modes, // = std::set(), - El::Int batch_interval = 0, - std::string directory = "", - std::string file_format = ""); - - lbann_callback_dump_outputs* copy() const override { - return new lbann_callback_dump_outputs(*this); - } - std::string name() const override { return "dump outputs"; } - - void on_forward_prop_end(model* m, Layer* l) override { dump_outputs(*m, *l); } - void on_evaluate_forward_prop_end(model* m, Layer* l) override { dump_outputs(*m, *l); } - -private: - - /** @brief Names of layers with output dumps. - * @details If empty, outputs will be dumped for all layers. - */ - std::set m_layer_names; - - /** @brief Execution modes with output dumps. - * @details If empty, outputs will be dumped for all execution modes. - */ - std::set m_modes; - - /** @brief Directory for output files. - * @details Pathname has trailing '/'. - */ - std::string m_directory; - - /** @brief Output file format. */ - std::string m_file_format; - - /** @brief Dump outputs to file. - * @details Returns immediately if an output dump is not needed. - */ - void dump_outputs(const model& m, const Layer& l); - -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_DUMP_OUTPUTS_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_dump_weights.hpp b/include/lbann/callbacks/callback_dump_weights.hpp deleted file mode 100644 index 7edb2aacc20..00000000000 --- a/include/lbann/callbacks/callback_dump_weights.hpp +++ /dev/null @@ -1,70 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// lbann_callback_dump_weights .hpp .cpp - Callbacks to dump weight matrices -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_DUMP_WEIGHTS_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_DUMP_WEIGHTS_HPP_INCLUDED - -#include - -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** - * Dump weight matrices to files. - * This will dump each hidden layer's weight/bias matrix after each epoch. - * The matrices are written to files using Elemental's simple ASCII format. This - * is not meant for checkpointing, but for exporting weight matrices for - * analysis that isn't easily done in LBANN. - */ -class lbann_callback_dump_weights : public lbann_callback { - public: - /** - * @param basename The basename for writing files. 
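// The dump_outputs callback writes one mini-batch sample per line for text formats,
// i.e. the transpose of the column-major activations matrix; a standalone sketch of
// that flattening, with names chosen only for illustration:
#include <cstdio>
#include <vector>

// colmajor holds one column of length `height` per sample.
void write_outputs_csv(std::FILE* f, const std::vector<float>& colmajor,
                       int height, int num_samples) {
  for (int s = 0; s < num_samples; ++s) {
    for (int i = 0; i < height; ++i) {
      std::fprintf(f, "%s%g", i == 0 ? "" : ",", colmajor[s * height + i]);
    }
    std::fprintf(f, "\n");
  }
}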
- */ - lbann_callback_dump_weights(std::string basename) : - lbann_callback(), m_basename(std::move(basename)) {} - lbann_callback_dump_weights(const lbann_callback_dump_weights&) = default; - lbann_callback_dump_weights& operator=( - const lbann_callback_dump_weights&) = default; - lbann_callback_dump_weights* copy() const override { - return new lbann_callback_dump_weights(*this); - } - void on_train_begin(model *m) override; - void on_epoch_end(model *m) override; - std::string name() const override { return "dump weights"; } - private: - /** Basename for writing files. */ - std::string m_basename; - /// Dump weights from learning layers. - void dump_weights(model *m, std::string s = ""); -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_DUMP_WEIGHTS_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_early_stopping.hpp b/include/lbann/callbacks/callback_early_stopping.hpp deleted file mode 100644 index e02fe4d3601..00000000000 --- a/include/lbann/callbacks/callback_early_stopping.hpp +++ /dev/null @@ -1,67 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// lbann_early_stopping .hpp .cpp - Callback hooks for early stopping -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_EARLY_STOPPING_HPP_INCLUDED -#define LBANN_CALLBACKS_EARLY_STOPPING_HPP_INCLUDED - -#include -#include -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** - * Stop training after validation error stops improving. - */ -class lbann_callback_early_stopping : public lbann_callback { - public: - /** - * Continue training until score has not improved for patience epochs. - */ - lbann_callback_early_stopping(int64_t patience); - lbann_callback_early_stopping(const lbann_callback_early_stopping&) = default; - lbann_callback_early_stopping& operator=( - const lbann_callback_early_stopping&) = default; - lbann_callback_early_stopping* copy() const override { - return new lbann_callback_early_stopping(*this); - } - /** Update validation score and check for early stopping. */ - void on_validation_end(model *m) override; - std::string name() const override { return "early stopping"; } - private: - /** Number of epochs to wait for improvements. */ - int64_t m_patience; - /** Last recorded score. */ - EvalType m_last_score = std::numeric_limits::max(); - /** Current number of epochs without improvement. 
*/ - int64_t m_wait = 0; -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_EARLY_STOPPING_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_gpu_memory_usage.hpp b/include/lbann/callbacks/callback_gpu_memory_usage.hpp deleted file mode 100644 index aa890efcc87..00000000000 --- a/include/lbann/callbacks/callback_gpu_memory_usage.hpp +++ /dev/null @@ -1,51 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// callback_gpu_memory_usage .hpp .cpp - Callbacks for printing GPU memory usage -//////////////////////////////////////////////////////////////////////////////// - -#ifndef __LBANN_CALLBACKS_CALLBACK_GPU_MEMORY_USAGE_HPP_INCLUDED -#define __LBANN_CALLBACKS_CALLBACK_GPU_MEMORY_USAGE_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" - -namespace lbann { -/** Callback hooks for printing GPU memory usage. */ -class lbann_callback_gpu_memory_usage : public lbann_callback { - public: - - /** Constructor. - */ - lbann_callback_gpu_memory_usage() = default; - lbann_callback_gpu_memory_usage(const lbann_callback_gpu_memory_usage&) = default; - lbann_callback_gpu_memory_usage& operator=(const lbann_callback_gpu_memory_usage&) = default; - lbann_callback_gpu_memory_usage* copy() const override { return new lbann_callback_gpu_memory_usage(*this); } - void on_epoch_begin(model *m) override; - std::string name() const override { return "GPU memory usage"; } -}; - -} // namespace lbann - -#endif // __LBANN_CALLBACKS_CALLBACK_GPU_MEMORY_USAGE_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_hang.hpp b/include/lbann/callbacks/callback_hang.hpp deleted file mode 100644 index 2ec4c68b835..00000000000 --- a/include/lbann/callbacks/callback_hang.hpp +++ /dev/null @@ -1,69 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. 
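// The patience bookkeeping used by the early-stopping callback above (and by the
// adaptive learning-rate schedule later in this diff), in a standalone form that
// assumes lower scores are better:
#include <limits>

class EarlyStopping {
public:
  explicit EarlyStopping(long patience) : m_patience(patience) {}
  // Returns true once the score has not improved for `patience` evaluations.
  bool should_stop(double score) {
    if (score < m_best) {   // improvement: record it and reset the wait counter
      m_best = score;
      m_wait = 0;
      return false;
    }
    return ++m_wait >= m_patience;
  }
private:
  long m_patience;
  long m_wait = 0;
  double m_best = std::numeric_limits<double>::max();
};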
You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// lbann_callback_hang .hpp .cpp - Callback to hang LBANN for debuggers -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_HANG_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_HANG_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** - * Hang LBANN as training starts so debuggers can attach. - * This will cause either a specific rank (in COMM_WORLD) or every rank to hang. - * Attach to the hung ranks and set the hang flag to false with a debugger to - * proceed. - */ -class lbann_callback_hang : public lbann_callback { - public: - /** - * @param rank_to_hang The rank to hang; -1 for every rank (default). - */ - lbann_callback_hang(int rank_to_hang = -1) : - m_rank_to_hang(rank_to_hang) {} - lbann_callback_hang(const lbann_callback_hang&) = default; - lbann_callback_hang& operator=(const lbann_callback_hang&) = default; - lbann_callback_hang* copy() const override { return new lbann_callback_hang(*this); } - /// Hang on train begin. - void on_train_begin(model* m) override { - if (m_rank_to_hang == -1 || - m_rank_to_hang == m->get_comm()->get_rank_in_world()) { - // Set this flag to false with your debugger to resume execution. - volatile bool lbann_hang = true; - while (lbann_hang) {} - } - } - std::string name() const override { return "hang"; } - protected: - /// The rank that will hang; -1 for every rank. - int m_rank_to_hang; -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_HANG_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_imcomm.hpp b/include/lbann/callbacks/callback_imcomm.hpp deleted file mode 100644 index fb52daa2bee..00000000000 --- a/include/lbann/callbacks/callback_imcomm.hpp +++ /dev/null @@ -1,102 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
-// -// lbann_callback_imcomm .hpp .cpp - Send gradient updates between models -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_IMCOMM_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_IMCOMM_HPP_INCLUDED - -#include -#include -#include -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** - * Support inter-model communication after each mini-batch to synchronize - * gradient updates. - */ -class lbann_callback_imcomm : public lbann_callback { - public: - using lbann_callback::on_backward_prop_end; - - enum comm_type { - NONE, /** Do no gradient updates. */ - NORMAL, /** Simply sum gradient updates. */ - }; - - /** - * Initialize with ct being used for all weights. - */ - lbann_callback_imcomm(comm_type ct = NORMAL, - lbann_summary *summarizer = nullptr); - lbann_callback_imcomm(const lbann_callback_imcomm&) = default; - lbann_callback_imcomm& operator=(const lbann_callback_imcomm&) = default; - lbann_callback_imcomm* copy() const override { - return new lbann_callback_imcomm(*this); - } - /** - * Convenience initialization to do one update type for specific weights. - * Implies no inter-model updates for other weights. - */ - lbann_callback_imcomm(comm_type ct, std::unordered_set weights_list, - lbann_summary *summarizer = nullptr); - - /** Choose comm type ct for weights. */ - void set_weights_comm(weights *w, comm_type ct); - - /** Do initialization for this model. */ - void setup(model *m) override; - /** Make sure all models have the same weights. */ - void on_train_begin(model *m) override; - /** Do inter-model gradient updates. */ - void on_backward_prop_end(model *m) override; - - std::string name() const override { return "imcomm"; } - - private: - /** Parameters for a given set of weights. */ - struct imcomm_params { - /** Type of communication done. */ - comm_type ct = NONE; - }; - /** Default communication type. */ - comm_type m_default_ct; - /** Per-weights parameters. */ - std::unordered_map m_weights_params; - - /** Summarize relevant statistics. */ - void do_summary(model *m, weights *w, EvalType im_time); -}; - - -/** returns a string representation of the weight_initialization */ -std::string get_comm_type_name(lbann_callback_imcomm::comm_type m); - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_IMCOMM_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_io.hpp b/include/lbann/callbacks/callback_io.hpp deleted file mode 100644 index 2ed29430a05..00000000000 --- a/include/lbann/callbacks/callback_io.hpp +++ /dev/null @@ -1,60 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. 
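// The NORMAL comm_type above simply sums gradient updates across models after each
// mini-batch; raw MPI is used below purely for illustration, standing in for LBANN's
// lbann_comm wrapper.
#include <mpi.h>
#include <vector>

void inter_model_gradient_sum(std::vector<float>& gradient, MPI_Comm comm) {
  MPI_Allreduce(MPI_IN_PLACE, gradient.data(), static_cast<int>(gradient.size()),
                MPI_FLOAT, MPI_SUM, comm);
}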
You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// lbann_callback_io .hpp .cpp - Callback hooks for I/O monitoring -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_IO_HPP_INCLUDED -#define LBANN_CALLBACKS_IO_HPP_INCLUDED - -#include -#include -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** - * Print information on the amount of IO that layers do. - */ -class lbann_callback_io : public lbann_callback { - public: - lbann_callback_io(); - lbann_callback_io(const lbann_callback_io&) = default; - lbann_callback_io& operator=(const lbann_callback_io&) = default; - lbann_callback_io* copy() const override { return new lbann_callback_io(*this); } - /** Only apply to specific layers. */ - lbann_callback_io(std::unordered_set layers); - /** Report how much I/O has occured per data reader */ - void on_epoch_end(model *m) override; - void on_test_end(model *m) override; - std::string name() const override { return "io"; } - private: - /** Indicies of layers to monitor. */ - std::unordered_set m_layer_indices; -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_IO_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_learning_rate.hpp b/include/lbann/callbacks/callback_learning_rate.hpp deleted file mode 100644 index 55dd090a7ea..00000000000 --- a/include/lbann/callbacks/callback_learning_rate.hpp +++ /dev/null @@ -1,297 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// lbann_learning_rate .hpp .cpp - Callback hooks for learning rate schedules -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_LEARNING_RATE_HPP_INCLUDED -#define LBANN_CALLBACKS_LEARNING_RATE_HPP_INCLUDED - -#include -#include -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -// Different schedules should inherit from lbann_callback_learning_rate. - -/** - * Base class for learning rate schedules. - * Child classes should implement the schedule method to make changes. 
- */ -class lbann_callback_learning_rate : public lbann_callback { - public: - lbann_callback_learning_rate(); - lbann_callback_learning_rate(const lbann_callback_learning_rate&) = default; - lbann_callback_learning_rate& operator=( - const lbann_callback_learning_rate&) = default; - /** Only apply to specific weights. */ - lbann_callback_learning_rate(std::unordered_set weights_list); - /** Do some initialization. */ - void setup(model *m) override; - /** Apply global learning rate schedules. */ - void on_epoch_end(model *m) override; - - using lbann_callback::on_backward_prop_end; - /** Apply local/per-optimizer learning rate schedules. */ - void on_backward_prop_end(model *m) override; - protected: - /** - * This is called at the end of every epoch to update the learning - * rate for every optimizer. Adjustments should be made based on the - * current global learning rate. - * The returned learning rate will be used to automatically update - * the current global learning rate. - */ - virtual float global_schedule(model *m) { return m_cur_global_lr; } - /** - * This is called at the end of every training mini-batch to update the - * learning rate for optimizer opt. The current global learning rate is *not* - * updated automatically based on this method. - */ - virtual float optimizer_schedule(model *m, optimizer &opt) { - return opt.get_learning_rate(); - } - - /** Weights to update. */ - std::unordered_set m_weights; - - /** - * This should be maintained by all learning rate schedule - * implementations as the current global learning rate. This enables - * coordination among different schedules, particularly ones that - * work on a per-optimizer basis. - */ - static float m_cur_global_lr; -}; - -/** - * Decrease the learning rate by a fixed proportion every X epochs. - */ -class lbann_callback_step_learning_rate : public lbann_callback_learning_rate { - public: - /** Decrease the learning rate by amt every step epochs. */ - lbann_callback_step_learning_rate(int step, float amt); - lbann_callback_step_learning_rate(int step, float amt, - std::unordered_set weights_list); - lbann_callback_step_learning_rate( - const lbann_callback_step_learning_rate&) = default; - lbann_callback_step_learning_rate& operator=( - const lbann_callback_step_learning_rate&) = default; - lbann_callback_step_learning_rate* copy() const override { - return new lbann_callback_step_learning_rate(*this); - } - std::string name() const override { return "step learning rate"; } - protected: - float global_schedule(model *m) override; - private: - /** Number of epochs between each learning rate decrease. */ - int m_step; - /** Amount to decrease the learning rate by. */ - float m_amt; -}; - -/** - * Decrease the learning rate by a fixed proportion when validation error stops - * improving. - */ -class lbann_callback_adaptive_learning_rate : public lbann_callback_learning_rate { - public: - /** - * Decrease the learning rate by amt if accuracy does not improve for patience - * epochs. 
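 *
 * For example (values shown for illustration only):
 * @code
 * lbann_callback_adaptive_learning_rate cb(3, 0.1f);
 * @endcode
 * would scale the current global learning rate by 0.1 once three epochs
 * pass without an improvement in the monitored score.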
- */ - lbann_callback_adaptive_learning_rate(int64_t patience, float amt); - lbann_callback_adaptive_learning_rate(int64_t patience, float amt, - std::unordered_set weights_list); - lbann_callback_adaptive_learning_rate( - const lbann_callback_adaptive_learning_rate&) = default; - lbann_callback_adaptive_learning_rate& operator=( - const lbann_callback_adaptive_learning_rate&) = default; - lbann_callback_adaptive_learning_rate* copy() const override { - return new lbann_callback_adaptive_learning_rate(*this); - } - std::string name() const override { return "adaptive learning rate"; } - protected: - float global_schedule(model *m) override; - private: - /** Number of epochs to wait for improvements. */ - int64_t m_patience; - /** Amount to decrease the learning rate by. */ - float m_amt; - /** Current epoch. */ - int m_cur_epoch = -1; - /** Last recorded score. */ - EvalType m_last_score = std::numeric_limits::max(); - /** Current number of epochs without improvement. */ - int64_t m_wait = 0; - /** Whether to adjust learning rate for current epoch. */ - bool m_adjust_learning_rate = false; -}; - -/** - * Decrease learning rate by a fixed amount at fixed times. - */ -class lbann_callback_drop_fixed_learning_rate : - public lbann_callback_learning_rate { - public: - /** - * Decrease the learning rate by amt when each epoch in drop_epochs is - * reached. - */ - lbann_callback_drop_fixed_learning_rate( - std::vector drop_epochs, float amt); - lbann_callback_drop_fixed_learning_rate( - std::vector drop_epochs, float amt, - std::unordered_set weights_list); - lbann_callback_drop_fixed_learning_rate( - const lbann_callback_drop_fixed_learning_rate&) = default; - lbann_callback_drop_fixed_learning_rate& operator=( - const lbann_callback_drop_fixed_learning_rate&) = default; - lbann_callback_drop_fixed_learning_rate* copy() const override { - return new lbann_callback_drop_fixed_learning_rate(*this); - } - std::string name() const override { return "drop fixed learning rate"; } - protected: - float global_schedule(model *m) override; - private: - /// Amount to decrease the learning rate by. - float m_amt; - /** - * Epochs to drop learning rate at. This is stored in reverse sorted order, - * so that the end can be examined and then popped in constant time. - */ - std::vector m_drop_epochs; -}; - -/** - * Linearly increase the learning rate to reach a target value over a - * fixed number of epochs. - * @note This currently assumes every optimizer begins with the same - * learning rate. This also *forces* its schedule and will stomp over - * other changes. - */ -class lbann_callback_linear_growth_learning_rate : - public lbann_callback_learning_rate { - public: - /** - * Linearly increase the learning rate to reach target after num_epochs. 
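 *
 * A minimal sketch of the intended ramp (the helper below is illustrative
 * and not part of this header; the callback itself works through
 * global_schedule()):
 * @code
 * float linear_growth(float base, float target,
 *                     int64_t num_epochs, int64_t delay, int64_t epoch) {
 *   if (epoch < delay) { return base; }                  // growth delayed
 *   if (epoch >= delay + num_epochs) { return target; }  // ramp finished
 *   const float inc = (target - base) / num_epochs;      // per-epoch step
 *   return base + inc * static_cast<float>(epoch - delay);
 * }
 * @endcode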
- */ - lbann_callback_linear_growth_learning_rate( - float target, int64_t num_epochs); - lbann_callback_linear_growth_learning_rate( - float target, int64_t num_epochs, int64_t delay); - lbann_callback_linear_growth_learning_rate( - float target, int64_t num_epochs, int64_t delay, - std::unordered_set weights_list); - lbann_callback_linear_growth_learning_rate( - const lbann_callback_linear_growth_learning_rate&) = default; - lbann_callback_linear_growth_learning_rate& operator=( - const lbann_callback_linear_growth_learning_rate&) = default; - lbann_callback_linear_growth_learning_rate* copy() const override { - return new lbann_callback_linear_growth_learning_rate(*this); } - void setup(model *m) override; - std::string name() const override { return "linear growth learning rate"; } - protected: - float global_schedule(model *m) override; - private: - /// Initial learning rate. - float m_base_lr; - /// Target learning rate to reach. - float m_target; - /// Amount to increase each epoch. - float m_inc; - /// Number of epochs over which to scale the learning rate. - int64_t m_num_epochs; - /// Number of epochs to delay before starting growth. - int64_t m_delay; -}; - -/** - * Decrease the learning rate by polynomial policy - * base_lr*(1 - i_cur/i_max)^p, where - * base_lr is the initial learning rate, i_cur is the current iteration, - * i_max is the maximum iteration, and p is a parameter. - */ -class lbann_callback_poly_learning_rate : public lbann_callback_learning_rate { - public: - lbann_callback_poly_learning_rate(double p, uint64_t n_epochs, uint64_t max_iter); - lbann_callback_poly_learning_rate(double p, uint64_t n_epochs, uint64_t max_iter, double endl_r, - std::unordered_set weights_list); - lbann_callback_poly_learning_rate( - const lbann_callback_poly_learning_rate&) = default; - lbann_callback_poly_learning_rate& operator=( - const lbann_callback_poly_learning_rate&) = default; - lbann_callback_poly_learning_rate* copy() const override { - return new lbann_callback_poly_learning_rate(*this); - } - void setup(model *m) override; - std::string name() const override { return "poly learning rate"; } - protected: - float global_schedule(model *m) override; - float optimizer_schedule(model *m, optimizer &opt) override; - private: - /// The exponent to compute new learning rate in poly policy - double m_p; - /// The number of epochs for training - uint64_t m_num_epochs; - /// The maximum number of iterations until which the learning rate changes - uint64_t m_max_iter; - /// The minimum learning rate - float m_end_lr; - /// The current rate to scale the base learning rate - float m_lr; - /// The learning rate scale used at the end of the last epoch - float m_last_epoch_lr; -}; - -/** - * This implements an adaptive scheme for adjust each optimizer's - * learning rate based on the ratio of the norms of its weights and - * its gradients. - * See: You et al. "Scaling SGD Batch Size to 32K for ImageNet - * Training", 2017. 
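 *
 * Roughly speaking (this is a paraphrase of the cited scheme, not a
 * statement of the exact implementation), each optimizer's rate is
 * scaled as
 * @f[
 *   \eta_{\text{local}} \approx \eta \cdot \text{scale} \cdot
 *     \frac{\lVert w \rVert}{\lVert \nabla w \rVert},
 * @f]
 * where @f$\eta@f$ is the optimizer's current learning rate, @f$w@f$ its
 * weights, and @f$\text{scale}@f$ the constructor argument.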
- */ -class lbann_callback_optimizerwise_adaptive_learning_rate : public lbann_callback_learning_rate { - public: - lbann_callback_optimizerwise_adaptive_learning_rate(float scale); - lbann_callback_optimizerwise_adaptive_learning_rate( - float scale, std::unordered_set weights_list); - lbann_callback_optimizerwise_adaptive_learning_rate( - const lbann_callback_optimizerwise_adaptive_learning_rate&) = default; - lbann_callback_optimizerwise_adaptive_learning_rate& operator=( - const lbann_callback_optimizerwise_adaptive_learning_rate&) = default; - lbann_callback_optimizerwise_adaptive_learning_rate* copy() const override { - return new lbann_callback_optimizerwise_adaptive_learning_rate(*this); } - std::string name() const override { return "optimizerwise adaptive learning rate"; } - protected: - float optimizer_schedule(model *m, optimizer &opt) override; - private: - float m_scale; -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_LEARNING_RATE_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_ltfb.hpp b/include/lbann/callbacks/callback_ltfb.hpp deleted file mode 100644 index e28a717da9c..00000000000 --- a/include/lbann/callbacks/callback_ltfb.hpp +++ /dev/null @@ -1,168 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_LTFB_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_LTFB_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" -#include -#include -#include - -namespace lbann { - -/** @brief Tournament training. - * - * This is intended to support research into the LTFB algorithm. An - * outline: - * - Divide the computational resources into multiple "trainers" - * that can operate in parallel. - * - Setup a model on each trainer and begin training independently. - * - Periodically launch tournaments to select "good" models. More - * specifically, trainers partner up and exchange their models. - * Each trainer evaluates a metric for its local and partner - * models, using its validation data set. The model with the better - * score is retained and the other one is discarded. - * - * There are many algorithmic variations to be explored: - * - How is data is divvied up amongst the trainers. Is it strictly - * partitioned, partially shared, or completely replicated? - * - What model components are exchanged? Just the trainable weights, - * or a subset of the weights? Hyperparameters? 
- * - Can this be used to explore model architectures? - * - * @todo Exchange optimizer state. - * @todo Support heterogeneous models. - */ -class lbann_callback_ltfb : public lbann_callback { -public: - - /** Inter-trainer communication scheme for LTFB. - * - * The specifics of these algorithms are experimental and will be - * in flux. - */ - enum class communication_algorithm { - /** Directly exchange weights values with sendrecv. - * - * Corresponding ranks in partner trainers will iterate through - * their weights and exchange values with sendrecvs. - * - * Notes: - * - Requires all models to be identical aside from their - * weights values, so this is not suitable for hyperparameter - * or model architecture exploration. - * - Optimizer state is not exchanged, so there may be wonky - * learning behavior immediately after a tournament. - * - Optimal if communication performance between ranks is - * uniform and independent. If intra-trainer communication is - * fast or if communication performance is sensitive to - * network traffic, it may be advantageous to gather model - * data on the trainer master ranks and only perform - * inter-trainer communication between them. - */ - sendrecv_weights, - - /** Save and load model data with checkpoint files. - * - * @todo Implement. - * - * Notes: - * - Supports hyperparameter exploration. - * - Checkpoint files currently do not store model architecture - * information, so this is not suitable for model - * architecture exploraiton. - * - This approach is temporary and experimental, since going - * through the file system is very suboptimal. When a wire - * format for model checkpoints is developed, it should be - * used instead. - */ - checkpoint_file - }; - - /** @brief Construct the LTFB callback - * @param batch_interval Number of training mini-batch steps between - * tournaments. - * @param metric_name Metric for tournament evaluation. - * @param weights_names List of weights to exchange with partner. - * If empty, then all weights are exchanged. - * @param low_score_wins Whether low-scoring or high-scoring models - * survive a tournament. - * @param comm_algo Inter-trainer communication scheme. - * @param summarizer The summarizer to use for this callback - */ - lbann_callback_ltfb( - El::Int batch_interval, - std::string metric_name, - std::set weights_names = std::set(), - bool low_score_wins = false, - communication_algorithm comm_algo = communication_algorithm::sendrecv_weights, - lbann_summary *summarizer = nullptr); - lbann_callback_ltfb(const lbann_callback_ltfb& other); - lbann_callback_ltfb& operator=(const lbann_callback_ltfb& other); - lbann_callback_ltfb* copy() const override { return new lbann_callback_ltfb(*this); } - std::string name() const override { return "LTFB"; } - - void setup(model *m) override; - void on_train_begin(model *m) override; - void on_batch_begin(model *m) override; - - /** Convert string to LTFB communication algorithm. - * - * If an empty string is provided, returns @c - * communication_algorithm::sendrecv_weights. - */ - static communication_algorithm string_to_comm_algo(const std::string& str); - -private: - - /** Metric for tournament evaluation. */ - std::string m_metric_name; - - /** List of weights to exchange with partner. - * - * If empty, then all weights are exchanged. - */ - std::set m_weights_names; - - /** Whether low-scoring or high-scoring models survive a - * tournament. */ - bool m_low_score_wins; - - /** Inter-trainer communication scheme. 
*/ - communication_algorithm m_comm_algo; - - /** Workspace weights. - * - * Used to temporarily store local weights during a tournament. - */ - std::vector> m_workspace_weights; - -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_LTFB_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_perturb_adam.hpp b/include/lbann/callbacks/callback_perturb_adam.hpp deleted file mode 100644 index 6adf47dd83a..00000000000 --- a/include/lbann/callbacks/callback_perturb_adam.hpp +++ /dev/null @@ -1,127 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_PERTURB_ADAM_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_PERTURB_ADAM_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" -#include "lbann/optimizers/adam.hpp" -#include - -namespace lbann { - -/** @brief Hyperparameter exploration with Adam optimizers. - * - * Goes through the Adam optimizers in a model and perturbs four - * hyperparameters: the learning rate, @f$\beta_1@f$, @f$\beta_2@f$, - * and @f$\epsilon@f$. Since these hyperparameters can range over - * orders of magnitude, the perturbations are performed in log space. - * More precisely, random values are drawn from normal distributions - * (with user-provided standard deviations) and added to - * @f$\log(\text{learning rate})@f$, @f$\log(1-\beta_1)@f$, - * @f$\log(1-\beta_2)@f$, and @f$\log\epsilon@f$. - */ -class lbann_callback_perturb_adam : public lbann_callback { -public: - - /** @param learning_rate_factor Standard deviation of learning rate - * perturbation (in log space). - * @param beta1_factor Standard deviation of @f$\beta_1@f$ - * perturbation (in log space). - * @param beta2_factor Standard deviation of @f$\beta_2@f$ - * perturbation (in log space). - * @param eps_factor Standard deviation of @f$\epsilon@f$ - * perturbation (in log space). - * @param perturb_during_training Whether to periodically perturb - * hyperparameters during training - * or to only perturb once during - * setup. - * @param batch_interval Number of training mini-batch steps between - * perturbations. Only used if - * @c perturb_during_training is @c true. - * @param weights_names Names of weights with Adam optimizers. If - * empty, all Adam optimizers in the model are - * perturbed. 
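 *
 * A minimal sketch of the log-space perturbation described above (the
 * helper below is illustrative and not part of this header):
 * @code
 * #include <cmath>
 * #include <random>
 *
 * // Perturb a learning rate in log space with standard deviation `factor`.
 * double perturb_lr(double lr, double factor, std::mt19937& gen) {
 *   std::normal_distribution<double> noise(0.0, factor);
 *   return std::exp(std::log(lr) + noise(gen));
 * }
 * @endcode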
- */ - lbann_callback_perturb_adam(DataType learning_rate_factor, - DataType beta1_factor, - DataType beta2_factor, - DataType eps_factor = 0, - bool perturb_during_training = false, - El::Int batch_interval = 1, - std::set weights_names - = std::set()); - lbann_callback_perturb_adam* copy() const override { return new lbann_callback_perturb_adam(*this); } - std::string name() const override { return "perturb Adam"; } - - void setup(model* m); - void on_batch_begin(model* m); - -private: - - /** Standard deviation of learning rate perturbation. - * - * In log space. - */ - DataType m_learning_rate_factor; - /** Standard deviation of @f$\beta_1@f$ perturbation. - * - * In log space. - */ - DataType m_beta1_factor; - /** Standard deviation of @f$\beta_2@f$ perturbation. - * - * In log space. - */ - DataType m_beta2_factor; - /** Standard deviation of @f$\epsilon@f$ perturbation. - * - * In log space. - */ - DataType m_eps_factor; - - /** Whether to periodically perturb during training. - * - * If false, only perturb once during setup. - */ - bool m_perturb_during_training; - - /** Optimizers for these weights will be perturbed. - * - * If empty, all Adam optimizers in the model will be perturbed. - */ - std::set m_weights_names; - - /** Perturb Adam optimizers in model. */ - void perturb(model& m) const; - /** Perturb Adam optimizer hyperparameters. */ - void perturb(lbann_comm& comm, adam& m) const; - -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_PERTURB_ADAM_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_print.hpp b/include/lbann/callbacks/callback_print.hpp deleted file mode 100644 index 53c77d2a7a1..00000000000 --- a/include/lbann/callbacks/callback_print.hpp +++ /dev/null @@ -1,63 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// lbann_callback_print .hpp .cpp - Callback hooks to print information -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_PRINT_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_PRINT_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** Periodically print computational results. - * Prints average objective function value and metric scores after - * each training epoch and evaluation. 
- */ -class lbann_callback_print : public lbann_callback { - public: - lbann_callback_print(int batch_interval = 1, bool print_global_stat_only=false) : - lbann_callback(batch_interval), m_print_global_stat_only(print_global_stat_only) {} - lbann_callback_print(const lbann_callback_print&) = default; - lbann_callback_print& operator=(const lbann_callback_print&) = default; - lbann_callback_print* copy() const override { return new lbann_callback_print(*this); } - void setup(model *m) override; - void on_epoch_begin(model *m) override; - void on_epoch_end(model *m) override; - void on_validation_end(model *m) override; - void on_test_end(model *m) override; - std::string name() const override { return "print"; } - - private: - /** Print objective function and metrics to standard output. */ - void report_results(model *m); - bool m_print_global_stat_only; - -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_PRINT_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_replace_weights.hpp b/include/lbann/callbacks/callback_replace_weights.hpp deleted file mode 100644 index 62bf033792c..00000000000 --- a/include/lbann/callbacks/callback_replace_weights.hpp +++ /dev/null @@ -1,71 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_REPLACE_WEIGHTS_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_REPLACE_WEIGHTS_HPP_INCLUDED - -#include - -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** - * Weights/parameters replacement on k-batch end - * Currently support replacing weights/parameters using layer names - * Can easily be extended to support replacement by weights name - * Given two layers specified in prototext, weights are copied from source layer to destination layer. 
- */ -class lbann_callback_replace_weights : public lbann_callback { - public: - lbann_callback_replace_weights(std::vector src, - std::vector dst, int batch_interval=1) : - lbann_callback(batch_interval), - m_src_layers(std::move(src)), - m_dst_layers(std::move(dst)){ - if(m_src_layers.size() != m_dst_layers.size()) - throw lbann_exception("In replace weights callback: number of src and dest layers does not match."); - } - - lbann_callback_replace_weights( - const lbann_callback_replace_weights&) = default; - lbann_callback_replace_weights& operator=( - const lbann_callback_replace_weights&) = default; - lbann_callback_replace_weights* copy() const override { - return new lbann_callback_replace_weights(*this); - } - void on_batch_end(model *m) override; - - std::string name() const override { return "replace weights"; } - private: - std::vector m_src_layers, m_dst_layers; - -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_REPLACE_WEIGHTS_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_save_images.hpp b/include/lbann/callbacks/callback_save_images.hpp deleted file mode 100644 index 72d870f3fc1..00000000000 --- a/include/lbann/callbacks/callback_save_images.hpp +++ /dev/null @@ -1,76 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_SAVE_IMAGES_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_SAVE_IMAGES_HPP_INCLUDED - -#include -#include -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** Save layer outputs as image files. - * Image files are in the form - * "-.". - */ -class lbann_callback_save_images : public lbann_callback { -public: - - /** Constructor. - * @param layer_names List of layer names to save as images. - * @param image_format Image file format (e.g. jpg, png, pgm). - * @param image_prefix Prefix for image file names. 
- */ - lbann_callback_save_images(std::vector layer_names, - std::string image_format = "jpg", - std::string image_prefix = ""); - lbann_callback_save_images(const lbann_callback_save_images&) = default; - lbann_callback_save_images& operator=( - const lbann_callback_save_images&) = default; - lbann_callback_save_images* copy() const override { - return new lbann_callback_save_images(*this); - } - void on_epoch_end(model *m) override; - void on_test_end(model *m) override; - std::string name() const override { return "save images"; } - -private: - - /** List of layer names to save as images. */ - std::vector m_layer_names; - /** Image file format. - * Valid options: jpg, png, pgm. - */ - std::string m_image_format; - /** Prefix for saved image files. */ - std::string m_image_prefix; - -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_SAVE_IMAGES_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_save_model.hpp b/include/lbann/callbacks/callback_save_model.hpp deleted file mode 100644 index aeeae47415a..00000000000 --- a/include/lbann/callbacks/callback_save_model.hpp +++ /dev/null @@ -1,80 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// lbann_callback_save_model .hpp .cpp - Callbacks to save model, currently as protobuf -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_SAVE_MODEL_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_SAVE_MODEL_HPP_INCLUDED - -#include - -#include "lbann/callbacks/callback.hpp" -#include -#include - -namespace lbann { - -/** - * Save model to as protobuf file and set of weights - */ -class lbann_callback_save_model : public lbann_callback { - public: - /** - * @param dir directory to save model - * @param disable_save_after_training Don't save after training - * @param extension file extension e.g., model, state ...... 
- */ - lbann_callback_save_model(std::string dir, - bool disable_save_after_training, - std::string extension="prototext") : - lbann_callback(), m_dir(std::move(dir)), - m_disable_save_after_training(disable_save_after_training), - m_extension(std::move(extension)) - {} - lbann_callback_save_model(const lbann_callback_save_model&) = default; - lbann_callback_save_model& operator=( - const lbann_callback_save_model&) = default; - lbann_callback_save_model* copy() const override { - return new lbann_callback_save_model(*this); - } - void on_train_end(model *m) override; - bool save_model(model *m); - bool save_model_weights(model *m); - static bool load_model_weights(std::string ckpt_dir, model *m); - - std::string name() const override { return "save model"; } - private: - std::string m_dir; //directory to save file - bool m_disable_save_after_training; /// Disables the normal behavior of saving when training is complete - std::string m_extension; //file extension - persist p; - void write_proto_binary(const lbann_data::Model& proto, const std::string filename); - void write_proto_text(const lbann_data::Model& proto, const std::string filename); -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_SAVE_MODEL_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_summary.hpp b/include/lbann/callbacks/callback_summary.hpp deleted file mode 100644 index 15294ac240d..00000000000 --- a/include/lbann/callbacks/callback_summary.hpp +++ /dev/null @@ -1,71 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// lbann_callback_summary .hpp .cpp - Callback hooks to summarize to Tensorboard -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_SUMMARY_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_SUMMARY_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" -#include "lbann/utils/summary.hpp" - -namespace lbann { - -/** - * Summarize information to Tensorboard using LBANN's summary interface. - */ -class lbann_callback_summary : public lbann_callback { - public: - /** - * @param summarizer The summary object to write to; this callback takes - * ownership of it. - * @param batch_interval The frequency with which to summarize - * @param mat_interval FIXME - * @todo Document mat_interval parameter. 
- */ - lbann_callback_summary(lbann_summary *summarizer, int batch_interval = 1, - int mat_interval = 25); - ~lbann_callback_summary() override; - lbann_callback_summary(const lbann_callback_summary&) = default; - lbann_callback_summary& operator=(const lbann_callback_summary&) = default; - lbann_callback_summary* copy() const override { - return new lbann_callback_summary(*this); - } - void on_train_begin(model *m) override; - void on_batch_end(model *m) override; - void on_epoch_end(model *m) override; - void on_test_end(model *m) override; - std::string name() const override { return "summary"; } - protected: - /** Write out histograms from the model's layers. */ - void save_histograms(model *m); - /** Interval for doing matrix summarization. */ - int m_mat_interval; -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_SUMMARY_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_sync_layers.hpp b/include/lbann/callbacks/callback_sync_layers.hpp deleted file mode 100644 index 2c9d4984fa8..00000000000 --- a/include/lbann/callbacks/callback_sync_layers.hpp +++ /dev/null @@ -1,80 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// callback_sync_layers.hpp - Callback to synchronize layers -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_SYNC_LAYERS_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_SYNC_LAYERS_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** Synchronize layers after forward and backward prop. - * Additionally updates layer timing information to account for this. - * Note that this callback should come before the summarizer callback to report - * time correctly (otherwise it will be shifted by one mini-batch). - */ -class lbann_callback_sync_layers : public lbann_callback { - public: - /** - * @param sync_gpus The GPU stream will be synchronized. - * @param sync_mpi A global barrier will synchronize processes. - * @param only_input The only synchronization will be after the input layer in - * forward prop. 
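 *
 * For example (argument values shown for illustration only):
 * @code
 * // Synchronize only the GPU stream, and only after the input layer.
 * lbann_callback_sync_layers cb(true,   // sync_gpus
 *                               false,  // sync_mpi
 *                               true);  // only_input
 * @endcode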
- */ - lbann_callback_sync_layers(bool sync_gpus = true, bool sync_mpi = true, - bool only_input = false) : - lbann_callback(1), m_sync_gpus(sync_gpus), m_sync_mpi(sync_mpi), - m_only_input(only_input) {} - lbann_callback_sync_layers(const lbann_callback_sync_layers&) = default; - lbann_callback_sync_layers& operator=( - const lbann_callback_sync_layers&) = default; - lbann_callback_sync_layers* copy() const override { - return new lbann_callback_sync_layers(*this); - } - std::string name() const override { return "sync_layers"; } - - using lbann_callback::on_forward_prop_end; - using lbann_callback::on_backward_prop_end; - - void on_forward_prop_end(model *m, Layer *l) override; - void on_backward_prop_end(model *m, Layer *l) override; - - protected: - /** Whether to synchronize GPUs. */ - bool m_sync_gpus; - /** Whether to do a global synchronization. */ - bool m_sync_mpi; - /** Whether to only synchronize after the input layer. */ - bool m_only_input; - - virtual void do_sync(Layer *l); -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_SYNC_LAYERS_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_sync_selected.hpp b/include/lbann/callbacks/callback_sync_selected.hpp deleted file mode 100644 index 53cda7e8b3f..00000000000 --- a/include/lbann/callbacks/callback_sync_selected.hpp +++ /dev/null @@ -1,138 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// callback_sync_selected.hpp - Callback to synchronize selected layers -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_SYNC_SELECTED_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_SYNC_SELECTED_HPP_INCLUDED - -#include "lbann/callbacks/callback_sync_layers.hpp" -#include -#include - -namespace lbann { - -/** - * Synchronize at the beginning and the end of the propagation operation(s) of - * a selected layer, which can be both/either of the forward prop and/or the - * backward prop of the layer. Additionally updates layer timing information to - * account for the synchronization at the end of propagation(s). - * When nvprof is enabled, cudaProfilerStart() follows the synchronization - * inserted at the beginning of the selected prop step(s), and cudaProfilerEnd() - * comes after the local GPU sychronization and before the global MPI barrier - * inserted at the end of the selected prop step(s). 
- * Note that this callback should come before the summarizer callback - * as the base callback lbann_callback_sync_layers requires. - */ -class lbann_callback_sync_selected : public lbann_callback_sync_layers { - public: - ///type of propagation toch synchronize - enum prop_t {Both = 0, Forward = 1, Backward = 2}; - static const std::map m_prop_str; - - using layers_t = std::unordered_map; - using layer_ptrs_t = std::unordered_set; - - /** - * @param layers specifies the layers to synchronize - * @param async_gpus sets not to synchronize gpus. The default is false. - * @param async_mpi sets not to synchronize mpi. The default is false. - */ - lbann_callback_sync_selected(const layers_t& layers, - bool async_gpus = false, bool async_mpi = false); - - lbann_callback_sync_selected(const lbann_callback_sync_selected&) = default; - - lbann_callback_sync_selected& operator=( - const lbann_callback_sync_selected&) = default; - - lbann_callback_sync_selected* copy() const override { - return new lbann_callback_sync_selected(*this); - } - - ~lbann_callback_sync_selected() override; - - std::string name() const override { return "sync_selected"; } - std::string get_description() const; - - /// To protect in case that cudaProfilerInitialized() has already been called - static void turn_off_init_cuda_profiler(); - - /// Tells if cuda_profiler has been initialized - static bool check_if_cuda_profiler_initialized(); - - void init_cuda_profiler(const std::string cfg_file, const std::string out_dir, - int out_mode, lbann_comm* comm) const; - - /** Called once to set up the callback (after all layers are set up). - * Then, populate the layer pointers */ - void setup(model *m) override; - - using lbann_callback::on_forward_prop_begin; - using lbann_callback::on_backward_prop_begin; - using lbann_callback_sync_layers::on_forward_prop_end; - using lbann_callback_sync_layers::on_backward_prop_end; - - /// Synchronize at the beginning of the forward prop of layer l - void on_forward_prop_begin(model* m, Layer* l) override; - /// Synchronize at the end of the forward prop of layer l - void on_forward_prop_end(model* m, Layer* l) override; - /// Synchronize at the beginning of the backward prop of layer l - void on_backward_prop_begin(model* m, Layer* l) override; - /// Synchronize at the end of the backward prop of layer l - void on_backward_prop_end(model* m, Layer* l) override; - - protected: - bool check_if_all_accounted_for() const; - - layer_ptrs_t::iterator populate_layer_ptrs(Layer* l, const prop_t current_prop); - - /// Synchronize and enable cuda profiler - void do_pre_sync(Layer* l); - /// Synchronize and disble cuda profiler - void do_sync(Layer* l) override; - - /// The layers to synchronize. - layers_t m_layers; - - /** The pointers of layers to synchronize for forward prop. - * This set includes those of layers to synchronize for both props. */ - layer_ptrs_t m_fwd_ptrs; - /** The pointers of layers to synchronize for backward prop. - * This set includes those of layers to synchronize for both props. */ - layer_ptrs_t m_bwd_ptrs; - /// The pointers of layers to synchronize for both props. - layer_ptrs_t m_both_ptrs; - - bool m_all_set; ///< whether all the layer pointers are collected - - /// Tells if cudaProfilerInitialized() has already been called. 
- static bool m_cuda_profiler_initialized; -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_SYNC_SELECTED_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_timeline.hpp b/include/lbann/callbacks/callback_timeline.hpp deleted file mode 100644 index 8bf84dd787d..00000000000 --- a/include/lbann/callbacks/callback_timeline.hpp +++ /dev/null @@ -1,92 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// callback_timeline .hpp .cpp - Callback hooks to record a timeline of runtime -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_TIMELINE_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_TIMELINE_HPP_INCLUDED - -#include -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** - * Record a timeline of training runtime on each rank and output it to a - * logfile for external processing. - * The logfile is named timeline.m\.\.txt. - * Each line is a separate event, written as name:start-time:end-time. - * Times are relative to the beginning of training. - */ -class lbann_callback_timeline : public lbann_callback { - public: - lbann_callback_timeline(std::string outdir) : lbann_callback(1), - m_outdir(outdir) {} - lbann_callback_timeline(const lbann_callback_timeline&) = default; - lbann_callback_timeline& operator=(const lbann_callback_timeline&) = default; - lbann_callback_timeline* copy() const override { - return new lbann_callback_timeline(*this); - } - std::string name() const override { return "timeline"; } - void on_train_begin(model *m) override; - void on_train_end(model *m) override; - - using lbann_callback::on_forward_prop_begin; - using lbann_callback::on_forward_prop_end; - using lbann_callback::on_backward_prop_begin; - using lbann_callback::on_backward_prop_end; - using lbann_callback::on_optimize_begin; - using lbann_callback::on_optimize_end; - - void on_forward_prop_begin(model *m, Layer *l) override; - void on_forward_prop_end(model *m, Layer *l) override; - void on_backward_prop_begin(model *m, Layer *l) override; - void on_backward_prop_end(model *m, Layer *l) override; - void on_optimize_begin(model *m, weights *w) override; - void on_optimize_end(model *m, weights *w) override; - private: - /// Get time relative to the start time. - EvalType get_rel_time() const { return get_time() - m_start_time; } - - /// Directory to write output to. - std::string m_outdir; - /// Time training started; all times are relative to this. 
- EvalType m_start_time = EvalType(0); - /// Time the current layer's forward pass started. - EvalType m_fp_start_time = EvalType(0); - /// Time the current layer's backward pass started. - EvalType m_bp_start_time = EvalType(0); - /// Time the current weights' optimization pass started. - EvalType m_opt_start_time = EvalType(0); - /// Store (relative) timing information. - std::unordered_map>> m_fp_times; - std::unordered_map>> m_bp_times; - std::unordered_map>> m_opt_times; -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_TIMELINE_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_timer.hpp b/include/lbann/callbacks/callback_timer.hpp deleted file mode 100644 index a53243e7a3f..00000000000 --- a/include/lbann/callbacks/callback_timer.hpp +++ /dev/null @@ -1,103 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_TIMER_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_TIMER_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" -#include -#include -#include - -namespace lbann { - -/** Record and report model timing results. - * Reports the total time and mini-batch time statistics for training - * epochs and for model evaluations. This reports times for the - * master process in each model. - */ -class lbann_callback_timer : public lbann_callback { -public: - - lbann_callback_timer(lbann_summary *summarizer = nullptr) - : lbann_callback(1, summarizer) {} - lbann_callback_timer(const lbann_callback_timer&) = default; - lbann_callback_timer& operator=(const lbann_callback_timer&) = default; - lbann_callback_timer* copy() const override { - return new lbann_callback_timer(*this); - } - - /** Start timing for a training epoch. */ - void on_epoch_begin(model *m) override { timing_begin(*m); } - /** Report timing for a training epoch. */ - void on_epoch_end(model *m) override { timing_end(*m); } - /** Start timing for validation. */ - void on_validation_begin(model *m) override { timing_begin(*m); } - /** Report timing for validation. */ - void on_validation_end(model *m) override { timing_end(*m); } - /** Start timing for testing. */ - void on_test_begin(model *m) override { timing_begin(*m); } - /** Report timing for testing. */ - void on_test_end(model *m) override { timing_end(*m); } - /** Record training mini-batch start time. 
*/ - void on_batch_begin(model *m) override { batch_timing_begin(*m); } - /** Record training mini-batch run time. */ - void on_batch_end(model *m) override { batch_timing_end(*m); } - /** Record evaluation mini-batch start time. */ - void on_batch_evaluate_begin(model *m) override { batch_timing_begin(*m); } - /** Record evaluation mini-batch run time. */ - void on_batch_evaluate_end(model *m) override { batch_timing_end(*m); } - - /** Callback name. */ - std::string name() const override { return "timer"; } - -private: - - /** Timing session start times. */ - std::map m_start_times; - /** Mini-batch timing session start times. */ - std::map m_batch_start_times; - /** Mini-batch times. */ - std::map> m_batch_times; - - /** Start timing session. */ - void timing_begin(const model& m); - /** End timing session. - * Prints results to standard output. - */ - void timing_end(model& m); - /** Start mini-batch timing session. */ - void batch_timing_begin(const model& m); - /** End mini-batch timing session. - * Prints results to standard output. - */ - void batch_timing_end(const model& m); - -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_TIMER_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_variable_minibatch.hpp b/include/lbann/callbacks/callback_variable_minibatch.hpp deleted file mode 100644 index 44d8c62f766..00000000000 --- a/include/lbann/callbacks/callback_variable_minibatch.hpp +++ /dev/null @@ -1,145 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// lbann_variable_minibatch .hpp .cpp - Callback for variable-size mini-batches -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_VARIABLE_MINIBATCH_HPP_INCLUDED -#define LBANN_CALLBACKS_VARIABLE_MINIBATCH_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** - * Support changing the mini-batch size on different schedules. - * Implementations should override implement the abstract methods to define - * concrete schedules. - */ -class lbann_callback_variable_minibatch : public lbann_callback { - public: - lbann_callback_variable_minibatch(int starting_mbsize); - lbann_callback_variable_minibatch( - const lbann_callback_variable_minibatch&) = default; - lbann_callback_variable_minibatch& operator=( - const lbann_callback_variable_minibatch&) = default; - /// Set the initial mini-batch size. - void on_train_begin(model *m) override; - /// Potentially change the mini-batch size. 
- void on_epoch_end(model *m) override; - protected: - /** - * Implemented by child classes to provide the mini-batch/learning schedule. - * This is called at the end of every training epoch. If it returns false, - * no changes are made from the currently established schedule. - * If this returns true, the mini-batch size will be changed accordingly. - * If the mini-batch size is larger than the model's maximum mini-batch size, - * a warning is printed and the maximum mini-batch size is used. - * If new_lr also non-zero, the learning rate will be changed to new_lr, - * with a linear ramp time. (If ramp_time is 0, it is changed immediately.) - * Note changing the learning rate while in a ramp may lead to unexpected - * behavior; also be aware of interactions with other learning rate - * schedules. - */ - virtual bool schedule(model *m, int& new_mbsize, float& new_lr, - int& ramp_time) = 0; - /// Change the learning rate of every layer in m to new_lr. - void change_learning_rate(model *m, float new_lr) const; - /// Get the current learning rate (assumes every layer has the same one). - float get_current_learning_rate(model *m) const; - /// Initial mini-batch size. - const int m_starting_mbsize; - /** - * The current mini-batch size for this epoch. - * This is kept separately from the model's get_current_mini_batch_size() - * method, as calling that in on_epoch_end returns the size of the last mini- - * batch, not the "base" mini-batch. - */ - int m_current_mini_batch_size; - /// Current number of epochs left to ramp the learning rate. - int m_ramp_count = 0; - /// Amount to increment the learning rate by when ramping. - float m_lr_incr = 0.0f; -}; - -/** - * Double the mini-batch size every set number of epochs. - * Also doubles the learning rate. - */ -class lbann_callback_step_minibatch : public lbann_callback_variable_minibatch { - public: - lbann_callback_step_minibatch(int starting_mbsize, int step, - int ramp_time = 0); - lbann_callback_step_minibatch(const lbann_callback_step_minibatch&) = default; - lbann_callback_step_minibatch& operator=( - const lbann_callback_step_minibatch&) = default; - lbann_callback_step_minibatch* copy() const override { - return new lbann_callback_step_minibatch(*this); - } - std::string name() const override { return "step minibatch"; } - protected: - bool schedule(model *m, int& new_mbsize, float& new_lr, int& ramp_time) override; - /// Number of epochs between mini-batch size increases. - int m_step; - /// Number of steps to ramp the learning rate over. - int m_ramp_time; -}; - -class lbann_callback_minibatch_schedule : public lbann_callback_variable_minibatch { - public: - /// Represents a step in a schedule of mini-batch sizes. - struct minibatch_step { - /// Epoch for this schedule to start. - int epoch; - /// Mini-batch size to use. - int mbsize; - /// Learning rate to use. - float lr; - /// Number of epochs to ramp the learning rate over. 
- int ramp_time; - minibatch_step(int _epoch, int _mbsize, float _lr, int _ramp_time) : - epoch(_epoch), mbsize(_mbsize), lr(_lr), ramp_time(_ramp_time) {} - }; - - lbann_callback_minibatch_schedule( - int starting_mbsize, std::vector steps); - lbann_callback_minibatch_schedule( - const lbann_callback_minibatch_schedule&) = default; - lbann_callback_minibatch_schedule& operator=( - const lbann_callback_minibatch_schedule&) = default; - lbann_callback_minibatch_schedule* copy() const override { - return new lbann_callback_minibatch_schedule(*this); - } - std::string name() const override { return "minibatch schedule"; } - protected: - bool schedule(model *m, int& new_mbsize, float& new_lr, int& ramp_time) override; - - /// Steps in the mini-batch schedule, stored in reverse sorted order. - std::vector m_steps; -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_VARIABLE_MINIBATCH_HPP_INCLUDED diff --git a/include/lbann/callbacks/check_dataset.hpp b/include/lbann/callbacks/check_dataset.hpp new file mode 100644 index 00000000000..74030a6fce9 --- /dev/null +++ b/include/lbann/callbacks/check_dataset.hpp @@ -0,0 +1,79 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_CHECK_DATASET_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_CHECK_DATASET_HPP_INCLUDED + +#include +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** + * Save the sample indices for each mini-batch to ordered set. + * Check to make sure that all samples were properly processed. + */ +class check_dataset : public callback_base { + public: + using callback_base::on_forward_prop_end; + using callback_base::on_evaluate_forward_prop_end; + + check_dataset() : + callback_base() {} + check_dataset( + const check_dataset&) = default; + check_dataset& operator=( + const check_dataset&) = default; + check_dataset* copy() const override { + return new check_dataset(*this); + } + void on_forward_prop_end(model *m, Layer *l) override; + void on_evaluate_forward_prop_end(model *m, Layer *l) override; + void on_epoch_end(model *m) override; + void on_validation_end(model *m) override; + void on_test_end(model *m) override; + + void add_to_set(model *m, Layer *l, int64_t step, std::set &set); + + std::string name() const override { return "check data set indices"; } + private: + /** @brief Basename for writing files. 
*/ + std::string m_basename; + + std::set training_set; + std::set validation_set; + std::set testing_set; +}; + +// Builder function +LBANN_ADD_DEFAULT_CALLBACK_BUILDER( + check_dataset, build_check_dataset_callback_from_pbuf); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_CHECK_DATASET_HPP_INCLUDED diff --git a/include/lbann/callbacks/check_gradients.hpp b/include/lbann/callbacks/check_gradients.hpp new file mode 100644 index 00000000000..39ca536c084 --- /dev/null +++ b/include/lbann/callbacks/check_gradients.hpp @@ -0,0 +1,98 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_CHECK_GRADIENTS_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_CHECK_GRADIENTS_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" + +#include + +namespace lbann { +namespace callback { + +/** @brief Gradient checking callback. + * + * Gradient checking is performed at the end of each execution mode + * phase. Using a fourth-order finite difference scheme, a numerical + * partial derivative is computed for every weight parameter. If the + * numerical derivative differs signifcantly from the analytical + * derivative computed during backprop, the gradient check has + * failed. + */ +class check_gradients : public callback_base { +public: + + /** + * @param modes Execution modes with gradient checks. If + * none are provided, gradient checking is + * performed for every execution mode. + * @param step_size Step size for numerical + * differentiation (with a step size of + * zero, the step size is estimated to + * minimize the numerical error). + * @param verbose Whether to print results for each + * parameter. + * @param error_on_failure Whether to throw an exception for + * large gradient errors. + */ + check_gradients(std::set modes = {}, + DataType step_size = DataType(0), + bool verbose = false, + bool error_on_failure = false); + check_gradients* copy() const override { + return new check_gradients(*this); + } + std::string name() const override { return "check gradients"; } + void on_train_end(model *m) override { do_check_gradients(*m); } + void on_validation_end(model *m) override { do_check_gradients(*m); } + void on_test_end(model *m) override { do_check_gradients(*m); } + +private: + + /** Execution modes with gradient checks. */ + std::set m_modes; + /** Step size for numerical differentiation. 
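The check_dataset callback introduced above accumulates the sample indices seen in each mini-batch into per-mode ordered sets and then verifies that nothing was dropped or duplicated. A minimal sketch of that bookkeeping, with hypothetical names and a plain std::set standing in for the callback's members:

    #include <cstdint>
    #include <set>
    #include <stdexcept>

    // Illustrative only -- not LBANN code. Collects the indices processed in
    // each mini-batch and checks full, duplicate-free coverage at epoch end.
    struct dataset_coverage {
      std::set<int64_t> seen;

      void record_minibatch(const int64_t* indices, int64_t count) {
        for (int64_t i = 0; i < count; ++i) {
          if (!seen.insert(indices[i]).second) {
            throw std::runtime_error("sample index seen twice in one epoch");
          }
        }
      }

      void check_epoch(int64_t num_samples) {
        if (static_cast<int64_t>(seen.size()) != num_samples) {
          throw std::runtime_error("not every sample was processed this epoch");
        }
        seen.clear();  // start the next epoch fresh
      }
    };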
*/ + EvalType m_step_size; + /** Whether to print results for each parameter. */ + bool m_verbose; + /** Whether to throw an exception for large gradient errors. */ + bool m_error_on_failure; + + /** Does nothing if current execution mode is not in m_modes. */ + void do_check_gradients(model& m) const; + +}; + +// Builder function +std::unique_ptr +build_check_gradients_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_CHECK_GRADIENTS_HPP_INCLUDED diff --git a/include/lbann/callbacks/check_init.hpp b/include/lbann/callbacks/check_init.hpp new file mode 100644 index 00000000000..0f6ffa5c7a5 --- /dev/null +++ b/include/lbann/callbacks/check_init.hpp @@ -0,0 +1,60 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// check_init .hpp .cpp - Check multi-model init +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_CHECK_INIT_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_CHECK_INIT_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** + * Verify that every model uses the same initialization. + */ +class check_init : public callback_base { + public: + check_init() = default; + check_init(const check_init&) = default; + check_init& operator=(const check_init&) = default; + check_init* copy() const override { + return new check_init(*this); + } + /** Check initializations. */ + void on_train_begin(model *m) override; + std::string name() const override { return "check init"; } +}; + +// Builder function +LBANN_ADD_DEFAULT_CALLBACK_BUILDER( + check_init, build_check_init_callback_from_pbuf); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_CHECK_INIT_HPP_INCLUDED diff --git a/include/lbann/callbacks/check_metric.hpp b/include/lbann/callbacks/check_metric.hpp new file mode 100644 index 00000000000..d965f6d6ad5 --- /dev/null +++ b/include/lbann/callbacks/check_metric.hpp @@ -0,0 +1,87 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. 
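The gradient check described above compares backprop's analytical gradients against numerical derivatives from a fourth-order finite difference. A sketch of that scheme for a single scalar parameter; the function and the step-size heuristic are illustrative, not LBANN's implementation:

    #include <algorithm>
    #include <cmath>
    #include <functional>
    #include <limits>

    // Illustrative only -- not LBANN code. Fourth-order central difference
    // approximation of d(objective)/dw for one scalar weight w.
    double numerical_gradient(const std::function<double(double)>& objective,
                              double w, double step_size = 0.0) {
      // With a step size of zero, pick one that roughly balances truncation
      // error (O(h^4)) against round-off error (O(eps/h)).
      const double eps = std::numeric_limits<double>::epsilon();
      const double h = step_size > 0.0
        ? step_size
        : std::pow(eps, 0.2) * std::max(1.0, std::fabs(w));
      return (-objective(w + 2*h) + 8*objective(w + h)
              - 8*objective(w - h) + objective(w - 2*h)) / (12*h);
    }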
+// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_CHECK_METRIC_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_CHECK_METRIC_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" +#include + +namespace lbann { +namespace callback { + +/** Metric checking callback. + * Checks if a metric value falls within an expected range. + */ +class check_metric : public callback_base { +public: + + check_metric(std::string metric_name, + std::set modes, + EvalType lower_bound, + EvalType upper_bound, + bool error_on_failure); + check_metric* copy() const override { + return new check_metric(*this); + } + std::string name() const override { return "check metric"; } + + void on_epoch_end(model* m) override { do_check_metric(*m); } + void on_validation_end(model* m) override { do_check_metric(*m); } + void on_test_end(model* m) override { do_check_metric(*m); } + +private: + + /** Metric name. */ + std::string m_metric_name; + + /** Execution modes with metric checks. */ + std::set m_modes; + + /** Lower bound for metric value. */ + EvalType m_lower_bound; + /** Upper bound for metric value. */ + EvalType m_upper_bound; + + /** Whether to throw an exception if metric check fails. */ + bool m_error_on_failure; + + /** Perform metric check. + * Does nothing if current execution mode is not in m_modes; + */ + void do_check_metric(const model& m) const; + +}; + +// Builder function +std::unique_ptr +build_check_metric_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_CHECK_METRIC_HPP_INCLUDED diff --git a/include/lbann/callbacks/check_nan.hpp b/include/lbann/callbacks/check_nan.hpp new file mode 100644 index 00000000000..0894b25a12e --- /dev/null +++ b/include/lbann/callbacks/check_nan.hpp @@ -0,0 +1,72 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// check_nan .hpp .cpp - Check matrices for invalid numbers +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_CHECK_NAN_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_CHECK_NAN_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** + * Check matrices for whether they include any NaNs or infs to help debugging. + * This will kill the rank if such values are discovered. + */ +class check_nan : public callback_base { + public: + using callback_base::on_forward_prop_end; + using callback_base::on_backward_prop_end; + + check_nan() = default; + check_nan(const check_nan&) = default; + check_nan& operator=( + const check_nan&) = default; + check_nan* copy() const override { + return new check_nan(*this); + } + /** Check that activations are good. */ + void on_forward_prop_end(model *m, Layer *l) override; + /** Check that error signals are good. */ + void on_backward_prop_end(model *m, Layer *l) override; + /** Check that gradients are good. */ + void on_backward_prop_end(model *m) override; + /** Check that weights are good. */ + void on_batch_end(model *m) override; + std::string name() const override { return "check_nan"; } + +}; + +// Builder function +LBANN_ADD_DEFAULT_CALLBACK_BUILDER( + check_nan, build_check_nan_callback_from_pbuf) + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_CHECK_NAN_HPP_INCLUDED diff --git a/include/lbann/callbacks/check_small.hpp b/include/lbann/callbacks/check_small.hpp new file mode 100644 index 00000000000..c5419f58571 --- /dev/null +++ b/include/lbann/callbacks/check_small.hpp @@ -0,0 +1,72 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
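check_nan walks activation, error-signal, gradient, and weight matrices and kills the rank as soon as a NaN or inf shows up. The core test is just a finiteness scan over a buffer; a minimal sketch (the function name and the exception standing in for LBANN's abort are illustrative):

    #include <cmath>
    #include <cstddef>
    #include <stdexcept>
    #include <string>

    // Illustrative only -- not LBANN code. Throwing here stands in for the
    // callback's behavior of killing the rank when a bad value is found.
    void assert_all_finite(const float* buf, std::size_t n, const std::string& what) {
      for (std::size_t i = 0; i < n; ++i) {
        if (!std::isfinite(buf[i])) {
          throw std::runtime_error("non-finite value in " + what
                                   + " at index " + std::to_string(i));
        }
      }
    }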
+// +// check_small .hpp .cpp - Check matrices for small values +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_CHECK_SMALL_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_CHECK_SMALL_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** + * Check matrices for whether they include any very small values to avoid + * getting denormalized values. Denormalized values can significantly slow + * floating point computations. + * Since we often square values, the check is based on the square root of the + * smallest floating point value. + * This will kill the rank if such values are discovered. + */ +class check_small : public callback_base { + public: + using callback_base::on_forward_prop_end; + using callback_base::on_backward_prop_end; + + check_small() = default; + check_small(const check_small&) = default; + check_small& operator=(const check_small&) = default; + check_small* copy() const override { + return new check_small(*this); + } + /** Check that activations are good. */ + void on_forward_prop_end(model *m, Layer *l) override; + /** Check that gradients are good. */ + void on_backward_prop_end(model *m) override; + /** Check that weights are good. */ + void on_batch_end(model *m) override; + std::string name() const override { return "check_small"; } +}; + +// Builder function +LBANN_ADD_DEFAULT_CALLBACK_BUILDER( + check_small, build_check_small_callback_from_pbuf) + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_CHECK_SMALL_HPP_INCLUDED diff --git a/include/lbann/callbacks/checkpoint.hpp b/include/lbann/callbacks/checkpoint.hpp new file mode 100644 index 00000000000..8a5aaac8003 --- /dev/null +++ b/include/lbann/callbacks/checkpoint.hpp @@ -0,0 +1,314 @@ +////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
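check_small uses the square root of the smallest normalized floating-point value as its threshold, since squaring anything below that underflows into the denormal range. A one-function sketch of that test (illustrative, not the callback's code):

    #include <cmath>
    #include <limits>

    // Illustrative only -- not LBANN code. Values below sqrt(FLT_MIN) risk
    // producing denormals when squared, which is what the callback guards against.
    bool is_dangerously_small(float x) {
      static const float threshold = std::sqrt(std::numeric_limits<float>::min());
      return x != 0.0f && std::fabs(x) < threshold;
    }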
+// +// checkpoint .hpp .cpp - Callback hooks to checkpoint model +//////////////////////////////////////////////////////////////////////////////// +#ifndef LBANN_CALLBACKS_CALLBACK_CHECKPOINT_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_CHECKPOINT_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" +#include "lbann/io/persist.hpp" +#include "lbann/training_algorithms/training_algorithm.hpp" + +namespace lbann { +namespace callback { + +enum class callback_phase { + batch, + epoch, + validation, + inference, + invalid +}; + +/** @brief Checkpoint at given interval in given directory */ +class checkpoint : public callback_base { + public: + + /** @brief Construct the checkpoint callback + * + * It may be beneficial to the distributed checkpoints at a higher + * tempo than the shared checkpoints because they are less + * expensive. + * + * @param checkpoint_dir directory to save checkpoint files + * @param restart_dir directory to find checkpoint files + * @param checkpoint_epochs interval to checkpoint + * @param checkpoint_steps interval to checkpoint + * @param checkpoint_secs interval to checkpoint + * @param per_rank_dir The directory into which to dump distributed checkpoints + * @param ckpt_dist_epochs The frequency of distributed checkpoints in epochs + * @param ckpt_dist_steps The frequence of distributed checkpoints in steps + */ + checkpoint(std::string checkpoint_dir, + std::string restart_dir, + int checkpoint_epochs, + int checkpoint_steps, + int checkpoint_secs, + std::string per_rank_dir, + int ckpt_dist_epochs, + int ckpt_dist_steps) : + callback_base(), + m_active_trainer(nullptr), + m_active_training_algorithm(nullptr), + m_checkpoint_dir(std::move(checkpoint_dir)), + m_restart_dir(std::move(restart_dir)), + m_checkpoint_epochs(checkpoint_epochs), + m_checkpoint_steps(checkpoint_steps), + m_checkpoint_secs(checkpoint_secs), + m_per_rank_dir(per_rank_dir), + m_ckpt_dist_epochs(ckpt_dist_epochs), + m_ckpt_dist_steps(ckpt_dist_steps) {} + checkpoint(const checkpoint&) = default; + checkpoint& operator=(const checkpoint&) = default; + checkpoint* copy() const override { return new checkpoint(*this); } + void setup(model *m) override; + void setup(trainer *t) override; + void on_train_begin(model *m) override; + void on_epoch_end(model *m) override; + void on_batch_end(model *m) override; + void on_validation_end(model *m) override; + + inline void set_checkpoint_dir(const std::string& dir){ + m_checkpoint_dir = dir; + } + + inline const std::string& get_checkpoint_dir(){ + return m_checkpoint_dir; + } + + inline void set_restart_dir(const std::string& dir){ + m_restart_dir = dir; + } + + inline const std::string& get_restart_dir(){ + // If the restart directory has been explicitly defined use that + if(m_restart_dir.length() != 0) { + return m_restart_dir; + }else { + return m_checkpoint_dir; + } + } + + inline void set_active_trainer(trainer* t){ + m_active_trainer = t; + } + + inline trainer& get_active_trainer(){ + if(m_active_trainer == nullptr) { + LBANN_ERROR("No active trainer for the checkpoint callback"); + } + return *m_active_trainer; + } + + inline void set_active_training_algorithm(training_algorithm* t){ + m_active_training_algorithm = t; + } + + inline training_algorithm& get_active_training_algorithm(){ + if(m_active_training_algorithm == nullptr) { + LBANN_ERROR("No active training algorithm for the checkpoint callback"); + } + return *m_active_training_algorithm; + } + + inline void set_checkpoint_epochs(int epochs){ + m_checkpoint_epochs= epochs; + } + 
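The checkpoint constructor takes separate schedules for shared and distributed (per-rank) checkpoints, and get_restart_dir falls back to the checkpoint directory when no restart directory is given. A hedged construction example; the directory names are placeholders, and attaching the callback to a trainer or model is outside this header:

    #include "lbann/callbacks/checkpoint.hpp"

    // Illustrative only. Shared checkpoints every 10 epochs, cheaper
    // distributed checkpoints every 2 epochs, no step- or time-based triggers.
    lbann::callback::checkpoint* make_checkpoint_callback() {
      return new lbann::callback::checkpoint(
          /*checkpoint_dir=*/"ckpt",
          /*restart_dir=*/"",        // empty: restart from checkpoint_dir
          /*checkpoint_epochs=*/10,
          /*checkpoint_steps=*/0,
          /*checkpoint_secs=*/0,
          /*per_rank_dir=*/"/l/ssd", // placeholder node-local path
          /*ckpt_dist_epochs=*/2,
          /*ckpt_dist_steps=*/0);
    }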
+ inline void set_checkpoint_steps(int steps){ + m_checkpoint_steps= steps; + } + + inline void set_checkpoint_secs(EvalType secs){ + m_checkpoint_secs= secs; + } + + inline void set_per_rank_dir(std::string dir){ + m_per_rank_dir = dir; + } + + inline const std::string& get_per_rank_dir(){ + return m_per_rank_dir; + } + + inline void set_ckpt_dist_epochs(int ckpt_dist_epochs){ + m_ckpt_dist_epochs = ckpt_dist_epochs; + } + + inline void set_ckpt_dist_steps(int ckpt_dist_steps){ + m_ckpt_dist_steps = ckpt_dist_steps; + } + + inline std::string get_shared_checkpoint_rootdir() { + return get_restart_dir(); + } + + /// @todo BVE FIMME this looks wrong I think that the order + /// should be reversed + inline std::string get_distributed_checkpoint_rootdir() { + if(m_per_rank_dir.length()) { + return get_per_rank_dir() + "/" + get_restart_dir(); + }else { + return get_restart_dir(); + } + } + + bool need_checkpoint(model *m, callback_phase phase); + std::string find_latest_checkpoint(lbann_comm& comm, + const std::string& trainer_name, + const std::string& alg_name, + execution_mode& mode, + size_t &epoch, + size_t& step, + bool& shared); + bool open_latest_checkpoint(lbann_comm& comm, + const std::string& task_label, + const std::string& trainer_name, + const std::string& alg_name, + std::function reload_shared_ckpt, + std::function reload_distributed_ckpt); + bool reload_model(model *m); + bool reload_trainer(trainer *t); + bool restart(model *m); + std::string name() const override { return "checkpoint"; } + protected: + bool do_checkpoint(model *m); + private: + trainer* m_active_trainer; + training_algorithm* m_active_training_algorithm; + std::string m_checkpoint_dir; + // If the restart directory is not explicity set, default to the + // checkpoint directory + std::string m_restart_dir; + int m_checkpoint_epochs; + int m_checkpoint_steps; + EvalType m_checkpoint_secs; + std::string m_per_rank_dir; + int m_ckpt_dist_epochs; + int m_ckpt_dist_steps; + EvalType m_checkpoint_last; + bool m_checkpoint_dist; + bool m_checkpoint_shared; + + template + struct header_t { + execution_mode mode; + int epoch; + int step; + int shared; + char dirname[_max_dir_len]; + }; +}; + +inline std::string get_trainer_checkpoint_dirname(const std::string& trainer_name, const std::string& dir) { + return build_string(dir, '/', trainer_name, '/'); +} + +inline std::string get_last_shared_checkpoint_filename(const std::string& alg_name, const std::string& dir) { + return build_string(dir, '/', alg_name, ".last.shared.checkpoint"); +} + +inline std::string get_last_shared_checkpoint_filename(const std::string& trainer_name, const std::string& alg_name, const std::string& dir) { + return get_last_shared_checkpoint_filename(alg_name, get_trainer_checkpoint_dirname(trainer_name, dir)); +} + +inline std::string get_shared_checkpoint_dirname(const std::string& alg_name, const std::string& dir, execution_mode mode, size_t epoch, size_t step) { + return build_string(dir, '/', alg_name, ".shared.", to_string(mode), ".epoch.", epoch, ".step.", step, '/'); +} + +inline std::string get_shared_checkpoint_dirname(const std::string& trainer_name, const std::string& alg_name, const std::string& dir, execution_mode mode, size_t epoch, size_t step) { + return get_shared_checkpoint_dirname(alg_name, get_trainer_checkpoint_dirname(trainer_name, dir), mode, epoch, step); +} + +inline std::string get_last_distributed_checkpoint_filename(const std::string& alg_name, const std::string& dir) { + return build_string(dir, '/', alg_name, 
".last.distributed.checkpoint"); +} + +inline std::string get_last_distributed_checkpoint_filename(const std::string& trainer_name, const std::string& alg_name, const std::string& dir) { + return get_last_distributed_checkpoint_filename(alg_name, get_trainer_checkpoint_dirname(trainer_name, dir)); +} + +inline std::string get_distributed_checkpoint_dirname(const std::string& alg_name, const int rank_in_trainer, const std::string& dir, execution_mode mode, size_t epoch, size_t step) { + return build_string(dir, '/', + alg_name, + ".rank.", rank_in_trainer, + ".distributed.", to_string(mode), + ".epoch.", epoch, + ".step.", step, '/'); +} + +inline std::string get_distributed_checkpoint_dirname(const std::string& trainer_name, const std::string& alg_name, const int rank_in_trainer, const std::string& dir, execution_mode mode, size_t epoch, size_t step) { + return get_distributed_checkpoint_dirname(alg_name, rank_in_trainer, get_trainer_checkpoint_dirname(trainer_name, dir), mode, epoch, step); +} + +// Print last checkpoint to file, used to determine which checkpoint to load from. +inline bool write_latest(std::string filename, execution_mode mode, size_t epoch, size_t train) { + // open the file for writing + int fd = openwrite(filename.c_str()); + if (fd != -1) { + char field[256]; + sprintf(field, "mode=%s epoch=%ld step=%ld\n", to_string(mode).c_str(), epoch, train); + write_string(fd, filename.c_str(), field, strlen(field)); + // close our file + closewrite(fd, filename.c_str()); + } + return true; +} + +/** \brief Reads the "latest" file and returns the epoch number and + * sample offset for most recent checkpoint + */ +inline bool read_latest(std::string filename, execution_mode *mode, size_t *epochLast, size_t *trainLast) { + // assume we don't have a file, we'll return -1 in that case + *epochLast = -1; + *trainLast = -1; + *mode = execution_mode::invalid; + // open the file for reading + int fd = openread(filename.c_str()); + if (fd != -1) { + // read epoch from file + char field[256]; + read_string(fd, filename.c_str(), field, sizeof(field)); + char modeStr[64]; + int ret = sscanf(field, "mode=%s epoch=%ld step=%ld\n", modeStr, epochLast, trainLast); + *mode = exec_mode_from_string(modeStr); + // close our file + closeread(fd, filename.c_str()); + if(ret != 3) { return false; } + return true; + } + return false; +} + +// Builder function +std::unique_ptr +build_checkpoint_callback_from_pbuf( + const google::protobuf::Message&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_CHECKPOINT_HPP_INCLUDED diff --git a/include/lbann/callbacks/confusion_matrix.hpp b/include/lbann/callbacks/confusion_matrix.hpp new file mode 100644 index 00000000000..187c9088487 --- /dev/null +++ b/include/lbann/callbacks/confusion_matrix.hpp @@ -0,0 +1,127 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_CONFUSION_MATRIX_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_CONFUSION_MATRIX_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** Compute confusion matrix. + * Confusion matrices are saved in CSV files of the form + * ".csv". The (i,j)-entry is the proportion of samples + * with prediction i and label j. The prediction and label layers are + * assumed to output one-hot vectors for each mini-batch sample. + */ +class confusion_matrix : public callback_base { +public: + using AbsDistMatType = El::AbstractDistMatrix; +public: + + confusion_matrix(std::string&& prediction_layer, + std::string&& label_layer, + std::string&& prefix); + confusion_matrix(std::string const& prediction_layer, + std::string const& label_layer, + std::string const& prefix); + confusion_matrix(const confusion_matrix&); + confusion_matrix& operator=(const confusion_matrix&); + confusion_matrix* copy() const override { + return new confusion_matrix(*this); + } + std::string name() const override { return "confusion matrix"; } + + void setup(model *m) override; + + void on_epoch_begin(model *m) override { reset_counts(*m); } + void on_epoch_end(model *m) override { save_confusion_matrix(*m); } + void on_validation_begin(model *m) override { reset_counts(*m); } + void on_validation_end(model *m) override { save_confusion_matrix(*m); } + void on_test_begin(model *m) override { reset_counts(*m); } + void on_test_end(model *m) override { save_confusion_matrix(*m); } + void on_batch_end(model *m) override { update_counts(*m); } + void on_batch_evaluate_end(model *m) override { update_counts(*m); } + +private: + + /** Name of prediction layer. + * This layer is assumed to output one-hot vectors. + */ + std::string m_prediction_layer; + /** Name of label layer. + * This layer is assumed to output one-hot vectors. + */ + std::string m_label_layer; + /** Prefix for output files. */ + std::string m_prefix; + + /** Confusion matrix counts. + * Each vector should be interpreted as a num_classes x num_classes + * matrix in row-major order. The (i,j)-entry is the number of + * samples with prediction i and label j. + */ + std::map> m_counts; + + /** "View" into prediction matrix. + * This is a CPU matrix. If the prediction layer keeps data on GPU, + * then this will be a matrix copy rather than a matrix view. + */ + std::unique_ptr m_predictions_v; + /** "View" into label matrix. + * This is a CPU matrix. If the label layer keeps data on GPU or in + * a different distribution than the prediction layer, then this + * will be a matrix copy rather than a matrix view. + */ + std::unique_ptr m_labels_v; + + /** Get prediction matrix. */ + const AbsDistMatType& get_predictions(const model& m) const; + /** Get label matrix. */ + const AbsDistMatType& get_labels(const model& m) const; + + /** Reset confusion matrix counts. */ + void reset_counts(const model& m); + /** Update confusion matrix counts. 
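As described above, the counts are kept as a flat num_classes x num_classes vector in row-major order, where entry (i, j) counts samples predicted as class i with true label j, and the CSV output later normalizes these counts to proportions. A sketch of the per-mini-batch update under the one-hot assumption (names and container types are illustrative):

    #include <cstddef>
    #include <vector>

    // Illustrative only -- not LBANN code. Each sample contributes one count
    // at row = predicted class, column = true label (row-major storage).
    void update_counts(std::vector<std::size_t>& counts,
                       const std::vector<std::vector<float>>& predictions,
                       const std::vector<std::vector<float>>& labels,
                       std::size_t num_classes) {
      auto argmax = [](const std::vector<float>& v) {
        std::size_t best = 0;
        for (std::size_t k = 1; k < v.size(); ++k) {
          if (v[k] > v[best]) { best = k; }
        }
        return best;
      };
      for (std::size_t s = 0; s < predictions.size(); ++s) {
        const std::size_t i = argmax(predictions[s]);  // predicted class
        const std::size_t j = argmax(labels[s]);       // true label
        counts[i * num_classes + j] += 1;
      }
    }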
+ * Counts are updated with current mini-batch predictions and + * labels. + */ + void update_counts(const model& m); + /** Output confusion matrix to file. */ + void save_confusion_matrix(const model& m); + +}; + +// Builder function +std::unique_ptr +build_confusion_matrix_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_CONFUSION_MATRIX_HPP_INCLUDED diff --git a/include/lbann/callbacks/debug.hpp b/include/lbann/callbacks/debug.hpp new file mode 100644 index 00000000000..354696e7dd2 --- /dev/null +++ b/include/lbann/callbacks/debug.hpp @@ -0,0 +1,112 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_DEBUG_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_DEBUG_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** + * @brief Phase specific "printf debugging" + * + * Print verbose status updates to standard error stream. + * This callback is useful for "printf debugging." + * + * Takes a prototext parameter @c phase: train | validate | test | \ + * if \ will print messages for all phases + * + */ +class debug : public callback_base { + public: + + /** @brief Constructor. + * + * If modes is empty, status updates will be printed for all + * execution modes. + */ + debug(std::set modes) : + m_modes(std::move(modes)) {} + debug(const debug&) = default; + debug& operator=(const debug&) = default; + debug* copy() const override { return new debug(*this); } + std::string name() const override { return "debug"; } + + /** @brief Print that a batch is beginning. */ + void on_batch_begin(model *m) override; + /** @brief Print that a batch is ending. */ + void on_batch_end(model *m) override; + /** @brief Print that a layer's forward prop is beginning. */ + void on_batch_evaluate_begin(model *m) override; + /** @brief Print that a layer's forward prop is ending. */ + void on_batch_evaluate_end(model *m) override; + + using callback_base::on_forward_prop_begin; + using callback_base::on_forward_prop_end; + using callback_base::on_backward_prop_begin; + using callback_base::on_backward_prop_end; + using callback_base::on_evaluate_forward_prop_begin; + using callback_base::on_evaluate_forward_prop_end; + + /** @brief Print that a layer's forward prop is beginning. 
*/ + void on_forward_prop_begin(model *m, Layer *l) override; + /** @brief Print that a layer's forward prop is ending. */ + void on_forward_prop_end(model *m, Layer *l) override; + /** @brief Print that a layer's backward prop is beginning. */ + void on_backward_prop_begin(model *m, Layer *l) override; + /** @brief Print that a layer's backward prop is ending. */ + void on_backward_prop_end(model *m, Layer *l) override; + /** @brief Print that a layer's backward prop is beginning. */ + void on_evaluate_forward_prop_begin(model *m, Layer *l) override; + /** @brief Print that a layer's backward prop is ending. */ + void on_evaluate_forward_prop_end(model *m, Layer *l) override; + + /** @brief Print that a weights' optimization step is beginning. */ + void on_optimize_begin(model *m, weights *w) override; + /** @brief Print that a weights' optimization step is ending. */ + void on_optimize_end(model *m, weights *w) override; + + private: + + /** @brief Execution modes for which status updates will be printed. + * + * If empty, status updates are printed for all execution modes. + */ + std::set m_modes; + +}; + +// Builder function +std::unique_ptr +build_debug_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_DEBUG_HPP_INCLUDED diff --git a/include/lbann/callbacks/debug_io.hpp b/include/lbann/callbacks/debug_io.hpp new file mode 100644 index 00000000000..834f91e40bb --- /dev/null +++ b/include/lbann/callbacks/debug_io.hpp @@ -0,0 +1,97 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// debug .hpp .cpp - Callback hooks to debug LBANN +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_DEBUG_IO_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_DEBUG_IO_HPP_INCLUDED + +#include +#include +#include "lbann/callbacks/callback.hpp" +#include "lbann/layers/io/input/input_layer.hpp" + +namespace lbann { +namespace callback { + +/** + * Print status updates on where training is. 
+ */ +class debug_io : public callback_base { + public: + using callback_base::on_forward_prop_begin; + using callback_base::on_forward_prop_end; + using callback_base::on_backward_prop_begin; + using callback_base::on_backward_prop_end; + using callback_base::on_evaluate_forward_prop_begin; + using callback_base::on_evaluate_forward_prop_end; + + /** + * Debug a particular phase; use invalid to debug every phase. + */ + debug_io(execution_mode phase = execution_mode::invalid, + int debug_lvl = 0) : + callback_base(1), + m_debug_phase(phase), + m_debug_lvl(debug_lvl) {} + debug_io(const debug_io&) = default; + debug_io& operator=( + const debug_io&) = default; + debug_io* copy() const override { return new debug_io(*this); } + /** Print that a training epoch is being started. */ + void on_epoch_begin(model *m) override; + /** Print that forward prop for a layer is beginning. */ + void on_forward_prop_begin(model *m, Layer *l) override; + + /** Print I/O details at the beginning of validation. */ + void on_validation_begin(model *m) override; + /** Print that an evaluation forward prop is beginning. */ + void on_evaluate_forward_prop_begin(model *m, Layer *l) override; + + /** Print I/O details at the beginning of testing. */ + void on_test_begin(model *m) override; + + /** Common format for printing I/O stats at the start of a mini-batch */ + void print_fp_start(model *m, generic_input_layer *input); + /** Common format for printing I/O stats at the start of a phase */ + void print_phase_start(model *m, execution_mode mode); + + std::string name() const override { return "debug_io"; } + private: + /** The phase to debug. */ + execution_mode m_debug_phase; + int m_debug_lvl; /** Debugging level: 0 - epoch begin, 1 - fwd prop */ +}; + +// Builder function +std::unique_ptr +build_debug_io_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_DEBUG_IO_HPP_INCLUDED diff --git a/include/lbann/callbacks/dump_error_signals.hpp b/include/lbann/callbacks/dump_error_signals.hpp new file mode 100644 index 00000000000..9d704d9560a --- /dev/null +++ b/include/lbann/callbacks/dump_error_signals.hpp @@ -0,0 +1,70 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
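The debug_io callback above takes the execution mode to trace (execution_mode::invalid meaning every phase) and a verbosity level, where 0 reports epoch boundaries and 1 adds per-forward-prop output. A small construction example, assuming the execution_mode enumerators used elsewhere in these headers:

    #include "lbann/callbacks/debug_io.hpp"

    // Illustrative only: trace I/O for the training phase at the most
    // verbose level described in the header.
    lbann::callback::debug_io make_io_tracer() {
      return lbann::callback::debug_io(lbann::execution_mode::training,
                                       /*debug_lvl=*/1);
    }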
+//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_DUMP_ERROR_SIGNALS_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_DUMP_ERROR_SIGNALS_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** Dump gradients w.r.t. inputs to file. + * After each layer performs a backward prop step, this callback will + * dump the gradients w.r.t. inputs (the "error signals") to a + * human-readable ASCII file. This is slow and produces a lot of output. + */ +class dump_error_signals : public callback_base { + public: + + /** Constructor. + * @param basename The basename for output files. + */ + dump_error_signals(std::string basename = "") + : callback_base(), m_basename(basename) {} + dump_error_signals* copy() const override { + return new dump_error_signals(*this); + } + std::string name() const override { return "dump error signals"; } + + /** Write error signals to file after each backward prop step. */ + void on_backward_prop_end(model *m, Layer *l) override; + + private: + /** Basename for output files. */ + std::string m_basename; + +}; + +// Builder function +std::unique_ptr +build_dump_error_signals_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_DUMP_ERROR_SIGNALS_HPP_INCLUDED diff --git a/include/lbann/callbacks/dump_gradients.hpp b/include/lbann/callbacks/dump_gradients.hpp new file mode 100644 index 00000000000..005a0195955 --- /dev/null +++ b/include/lbann/callbacks/dump_gradients.hpp @@ -0,0 +1,80 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// dump_gradients .hpp .cpp - Callbacks to dump gradients +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_DUMP_GRADIENTS_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_DUMP_GRADIENTS_HPP_INCLUDED + +#include + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** + * @brief Dump gradient matrices to files. + * @details This will dump each hidden layer's gradient matrix after + * each minibatch. The matrices are written to files using + * Elemental's simple ASCII format. This is not meant for + * checkpointing, but for exporting gradient matrices for analysis + * that isn't easily done in LBANN. Note this dumps matrices during + * each mini-batch. 
This will be slow and produce a lot of output. + */ +class dump_gradients : public callback_base { + public: + using callback_base::on_backward_prop_end; + + /** + * @param basename The basename for writing files. + * @param batch_interval The frequency at which to dump the gradients + */ + dump_gradients(std::string basename, int batch_interval = 1) : + callback_base(batch_interval), m_basename(std::move(basename)) {} + dump_gradients( + const dump_gradients&) = default; + dump_gradients& operator=( + const dump_gradients&) = default; + dump_gradients* copy() const override { + return new dump_gradients(*this); + } + void on_backward_prop_end(model *m) override; + std::string name() const override { return "dump gradients"; } + private: + /** @brief Basename for writing files. */ + std::string m_basename; +}; + +// Builder function +std::unique_ptr +build_dump_gradients_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_DUMP_GRADIENTS_HPP_INCLUDED diff --git a/include/lbann/callbacks/dump_minibatch_sample_indices.hpp b/include/lbann/callbacks/dump_minibatch_sample_indices.hpp new file mode 100644 index 00000000000..1aca8c40a0e --- /dev/null +++ b/include/lbann/callbacks/dump_minibatch_sample_indices.hpp @@ -0,0 +1,85 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// dump_minibatch_sample_indices .hpp .cpp - Callbacks +// to dump the list of indices per minibatch +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_DUMP_MINIBATCH_SAMPLE_INDICES_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_DUMP_MINIBATCH_SAMPLE_INDICES_HPP_INCLUDED + +#include + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** + * @brief Dump sample indices for each minibatch to files. + * @details This will dump the list of indices from the training / + * validation / testing data that was processed Note this dumps + * vectors during each mini-batch. This will be slow and produce a lot + * of output. + */ +class dump_minibatch_sample_indices : public callback_base { + public: + using callback_base::on_forward_prop_end; + using callback_base::on_evaluate_forward_prop_end; + + /** + * @param basename The basename for writing files. 
+ * @param batch_interval The frequency at which to dump sample indices + */ + dump_minibatch_sample_indices(std::string basename, + int batch_interval = 1) : + callback_base(batch_interval), m_basename(std::move(basename)) {} + dump_minibatch_sample_indices( + const dump_minibatch_sample_indices&) = default; + dump_minibatch_sample_indices& operator=( + const dump_minibatch_sample_indices&) = default; + dump_minibatch_sample_indices* copy() const override { + return new dump_minibatch_sample_indices(*this); + } + void on_forward_prop_end(model *m, Layer *l) override; + void on_evaluate_forward_prop_end(model *m, Layer *l) override; + + void dump_to_file(model *m, Layer *l, int64_t step); + + std::string name() const override { return "dump minibatch sample indices"; } + private: + /** Basename for writing files. */ + std::string m_basename; +}; + +// Builder function +std::unique_ptr +build_dump_mb_indices_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_DUMP_MINIBATCH_SAMPLE_INDICES_HPP_INCLUDED diff --git a/include/lbann/callbacks/dump_outputs.hpp b/include/lbann/callbacks/dump_outputs.hpp new file mode 100644 index 00000000000..34610896f3b --- /dev/null +++ b/include/lbann/callbacks/dump_outputs.hpp @@ -0,0 +1,127 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_DUMP_OUTPUTS_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_DUMP_OUTPUTS_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" + +#include +#include + +namespace lbann { +namespace callback { + +/** @brief Dump layer output tensors to files. + * + * Saves a file for each output tensor of each selected layer, + * computed at each mini-batch step. Output files have the form + * "--epoch<#>-step<#>--output<#>.". + * This is primarily intended as a debugging tool, although it can be + * used for inference when performance is not critical. + * + * For NumPy file formats (npy and npz), tensor dimensions are + * recorded. For text file formats (CSV and TSV), each line contains + * flattened tensor data corresponding to one mini-batch sample + * (which is the transpose of the column-major matrix representation + * we use internally). + * + * CNPY is required to export to NumPy file formats (npy and npz). 
+ */ +class dump_outputs : public callback_base { +public: + + /** @brief Construct a callback to dump outputs. + * + * @param layer_names Names of layers with output dumps + * (default: dump outputs for all layers). + * @param modes Execution modes with output dumps + * (default: dump outputs for all modes). + * @param batch_interval Frequency of output dumps (default: dump + * outputs at each mini-batch step). + * @param directory Directory for output files (default: current + * working directory). + * @param file_format Output file format. Options are csv, tsv, + * npy, npz (default: csv). + */ + dump_outputs( + std::set layer_names,// = std::set(), + std::set modes, // = std::set(), + El::Int batch_interval = 0, + std::string directory = "", + std::string file_format = ""); + + dump_outputs* copy() const override { + return new dump_outputs(*this); + } + std::string name() const override { return "dump outputs"; } + + void on_forward_prop_end(model* m, Layer* l) override { + do_dump_outputs(*m, *l); + } + void on_evaluate_forward_prop_end(model* m, Layer* l) override { + const auto& c = static_cast(m->get_execution_context()); + if(c.get_step() % m_batch_interval == 0) { + do_dump_outputs(*m, *l); + } + } + +private: + + /** @brief Names of layers with output dumps. + * @details If empty, outputs will be dumped for all layers. + */ + std::set m_layer_names; + + /** @brief Execution modes with output dumps. + * @details If empty, outputs will be dumped for all execution modes. + */ + std::set m_modes; + + /** @brief Directory for output files. + * @details Pathname has trailing '/'. + */ + std::string m_directory; + + /** @brief Output file format. */ + std::string m_file_format; + + /** @brief Dump outputs to file. + * @details Returns immediately if an output dump is not needed. + */ + void do_dump_outputs(const model& m, const Layer& l); + +}; + +// Builder function +std::unique_ptr +build_dump_outputs_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_DUMP_OUTPUTS_HPP_INCLUDED diff --git a/include/lbann/callbacks/dump_weights.hpp b/include/lbann/callbacks/dump_weights.hpp new file mode 100644 index 00000000000..ecfaa58d9d1 --- /dev/null +++ b/include/lbann/callbacks/dump_weights.hpp @@ -0,0 +1,82 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
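Putting the dump_outputs parameters together: a sketch that limits dumping to one layer during validation, every 10 steps, in CSV form. The layer and directory names are placeholders; the constructor is the one declared above, where the two set arguments carry no defaults and must both be passed:

    #include "lbann/callbacks/dump_outputs.hpp"

    // Illustrative only: dump the "softmax" layer's outputs as CSV during
    // validation, every 10 mini-batch steps, under "outputs/".
    lbann::callback::dump_outputs* make_output_dumper() {
      return new lbann::callback::dump_outputs(
          /*layer_names=*/{"softmax"},
          /*modes=*/{lbann::execution_mode::validation},
          /*batch_interval=*/10,
          /*directory=*/"outputs",
          /*file_format=*/"csv");
    }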
+// +// dump_weights .hpp .cpp - Callbacks to dump weight matrices +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_DUMP_WEIGHTS_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_DUMP_WEIGHTS_HPP_INCLUDED + +#include + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** + * Dump weight matrices to files. + * This will dump each hidden layer's weight/bias matrix after specified epoch interval. + * The matrices are written to files using Elemental's simple ASCII format. This + * is not meant for checkpointing, but for exporting weight matrices for + * analysis that isn't easily done in LBANN. + */ +class dump_weights : public callback_base { + public: + /** + * @param basename The basename for writing files. + */ + dump_weights(std::string dir, El::Int epoch_interval=1) : + callback_base(), m_directory(std::move(dir)), + m_epoch_interval(std::max(El::Int(1),epoch_interval)) {} + dump_weights(const dump_weights&) = default; + dump_weights& operator=( + const dump_weights&) = default; + dump_weights* copy() const override { + return new dump_weights(*this); + } + void on_train_begin(model *m) override; + void on_epoch_end(model *m) override; + std::string name() const override { return "dump weights"; } + void set_target_dir(const std::string& dir) { m_directory = dir; } + const std::string& get_target_dir() { return m_directory; } + private: + /** Basename for writing files. */ + std::string m_directory; + /** Interval at which to dump weights */ + El::Int m_epoch_interval; + /// Dump weights from learning layers. + void do_dump_weights(const model& m, std::string s = ""); +}; + +// Builder function +std::unique_ptr +build_dump_weights_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_DUMP_WEIGHTS_HPP_INCLUDED diff --git a/include/lbann/callbacks/early_stopping.hpp b/include/lbann/callbacks/early_stopping.hpp new file mode 100644 index 00000000000..f74611900f5 --- /dev/null +++ b/include/lbann/callbacks/early_stopping.hpp @@ -0,0 +1,74 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
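A short usage sketch for the dump_weights callback above (not part of the patch); the directory name is a placeholder. Note that the constructor clamps epoch intervals below 1 up to 1.

#include "lbann/callbacks/dump_weights.hpp"

#include <memory>

// Write the weight matrices to "weights_out" after every fifth epoch.
auto dump_weights_cb =
  std::make_unique<lbann::callback::dump_weights>("weights_out", /*epoch_interval=*/5);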
+// +// lbann_early_stopping .hpp .cpp - Callback hooks for early stopping +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_EARLY_STOPPING_HPP_INCLUDED +#define LBANN_CALLBACKS_EARLY_STOPPING_HPP_INCLUDED + +#include +#include +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** + * Stop training after validation error stops improving. + */ +class early_stopping : public callback_base { + public: + /** + * Continue training until score has not improved for patience epochs. + */ + early_stopping(int64_t patience); + early_stopping(const early_stopping&) = default; + early_stopping& operator=( + const early_stopping&) = default; + early_stopping* copy() const override { + return new early_stopping(*this); + } + /** Update validation score and check for early stopping. */ + void on_validation_end(model *m) override; + std::string name() const override { return "early stopping"; } + private: + /** Number of epochs to wait for improvements. */ + int64_t m_patience; + /** Last recorded score. */ + EvalType m_last_score = std::numeric_limits::max(); + /** Current number of epochs without improvement. */ + int64_t m_wait = 0; +}; + +// Builder function +std::unique_ptr +build_early_stopping_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_EARLY_STOPPING_HPP_INCLUDED diff --git a/include/lbann/callbacks/gpu_memory_usage.hpp b/include/lbann/callbacks/gpu_memory_usage.hpp new file mode 100644 index 00000000000..1d18019776e --- /dev/null +++ b/include/lbann/callbacks/gpu_memory_usage.hpp @@ -0,0 +1,58 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// callback_gpu_memory_usage .hpp .cpp - Callbacks for printing GPU memory usage +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_GPU_MEMORY_USAGE_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_GPU_MEMORY_USAGE_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** Callback hooks for printing GPU memory usage. */ +class gpu_memory_usage : public callback_base { + public: + + /** Constructor. 
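A standalone sketch of the patience rule implied by the early_stopping members above, assuming lower scores are better (as the initialization of m_last_score to the maximum value suggests); the authoritative logic lives in the callback's .cpp file.

#include <cstdint>

// Returns true when the score has failed to improve for `patience` consecutive checks.
bool should_stop(double score, double& last_score, int64_t& wait, int64_t patience) {
  if (score < last_score) { // improvement: remember it and reset the counter
    last_score = score;
    wait = 0;
    return false;
  }
  return ++wait >= patience; // no improvement for `patience` epochs in a row
}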
+ */ + gpu_memory_usage() = default; + gpu_memory_usage(const gpu_memory_usage&) = default; + gpu_memory_usage& operator=(const gpu_memory_usage&) = default; + gpu_memory_usage* copy() const override { return new gpu_memory_usage(*this); } + void on_epoch_begin(model *m) override; + std::string name() const override { return "GPU memory usage"; } +}; + +// Builder function +LBANN_ADD_DEFAULT_CALLBACK_BUILDER( + gpu_memory_usage, build_gpu_memory_usage_callback_from_pbuf); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_GPU_MEMORY_USAGE_HPP_INCLUDED diff --git a/include/lbann/callbacks/hang.hpp b/include/lbann/callbacks/hang.hpp new file mode 100644 index 00000000000..246d72ca51b --- /dev/null +++ b/include/lbann/callbacks/hang.hpp @@ -0,0 +1,79 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// hang .hpp .cpp - Callback to hang LBANN for debuggers +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_HANG_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_HANG_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** + * Hang LBANN as training starts so debuggers can attach. + * This will cause either a specific rank (in COMM_WORLD) or every rank to hang. + * Attach to the hung ranks and set the hang flag to false with a debugger to + * proceed. + */ +class hang : public callback_base { + public: + /** + * @param rank_to_hang The rank to hang; -1 for every rank (default). + */ + hang(int rank_to_hang = -1) : + m_rank_to_hang(rank_to_hang) {} + hang(const hang&) = default; + hang& operator=(const hang&) = default; + hang* copy() const override { return new hang(*this); } + + void setup(model* m) override; + + /// Hang on train begin. + void on_train_begin(model* m) override { + if (m_rank_to_hang == -1 || + m_rank_to_hang == m->get_comm()->get_rank_in_world()) { + // Set this flag to false with your debugger to resume execution. + volatile bool lbann_hang = true; + while (lbann_hang) {} + } + } + std::string name() const override { return "hang"; } + private: + /// The rank that will hang; -1 for every rank. 
+ int m_rank_to_hang; +}; + +// Builder function +std::unique_ptr +build_hang_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_HANG_HPP_INCLUDED diff --git a/include/lbann/callbacks/imcomm.hpp b/include/lbann/callbacks/imcomm.hpp new file mode 100644 index 00000000000..f7703ade0d9 --- /dev/null +++ b/include/lbann/callbacks/imcomm.hpp @@ -0,0 +1,122 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// imcomm .hpp .cpp - Send gradient updates between models +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_IMCOMM_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_IMCOMM_HPP_INCLUDED + +#include +#include +#include +#include "lbann/callbacks/callback.hpp" + +namespace lbann { + +template +class data_type_weights; + +namespace callback { + +/** + * @brief Support inter-model communication after each mini-batch to + * synchronize gradient updates. + */ +class imcomm : public callback_base { + public: + using callback_base::on_backward_prop_end; + + enum comm_type { + NONE=0, /** Do no gradient updates. */ + NORMAL, /** Simply sum gradient updates. */ + }; + + /** + * @brief Initialize with ct being used for all weights. + */ + imcomm(comm_type ct = NORMAL, + const std::shared_ptr& summarizer = nullptr); + imcomm(const imcomm&) = default; + imcomm& operator=(const imcomm&) = default; + imcomm* copy() const override { + return new imcomm(*this); + } + /** + * @brief Convenience initialization to do one update type for specific weights. + * + * @details Implies no inter-model updates for other weights. + */ + imcomm(comm_type ct, std::unordered_set weights_list, + const std::shared_ptr& summarizer = nullptr); + + /** @brief Choose comm type ct for weights. */ + void set_weights_comm(weights *w, comm_type ct); + + /** @brief Do initialization for this model. */ + void setup(model *m) override; + + /** @brief Make sure all models have the same weights. */ + void on_train_begin(model *m) override; + + /** @brief Do inter-model gradient updates. */ + void on_backward_prop_end(model *m) override; + + std::string name() const override { return "imcomm"; } + + private: + /** @brief Summarize relevant statistics. */ + template + void do_summary(model const& m, data_type_weights& w, EvalType im_time); + + private: + /** @brief Parameters for a given set of weights. 
*/ + struct imcomm_params { + /** @brief Type of communication done. */ + comm_type ct = NONE; + }; + + /** @brief Default communication type. */ + comm_type m_default_ct; + + /** @brief Per-weights parameters. */ + std::unordered_map m_weights_params; + + /** @brief @brief lbann_summary */ + std::shared_ptr m_summarizer = nullptr; +}; + +/** @brief returns a string representation of the weight_initialization */ +std::string get_comm_type_name(typename imcomm::comm_type m); + +// Builder function +std::unique_ptr +build_imcomm_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_IMCOMM_HPP_INCLUDED diff --git a/include/lbann/callbacks/learning_rate.hpp b/include/lbann/callbacks/learning_rate.hpp new file mode 100644 index 00000000000..8973fe34b4e --- /dev/null +++ b/include/lbann/callbacks/learning_rate.hpp @@ -0,0 +1,344 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// lbann_learning_rate .hpp .cpp - Callback hooks for learning rate schedules +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_LEARNING_RATE_HPP_INCLUDED +#define LBANN_CALLBACKS_LEARNING_RATE_HPP_INCLUDED + +#include +#include +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +// Different schedules should inherit from learning_rate. + +/** + * Base class for learning rate schedules. + * Child classes should implement the schedule method to make changes. + */ +class learning_rate : public callback_base { + public: + learning_rate(); + learning_rate(const learning_rate&) = default; + learning_rate& operator=( + const learning_rate&) = default; + /** Only apply to specific weights. */ + learning_rate(std::vector weights_names); + /** Do some initialization. */ + void setup(model *m) override; + /** Apply global learning rate schedules. */ + void on_epoch_end(model *m) override; + + using callback_base::on_backward_prop_end; + /** Apply local/per-optimizer learning rate schedules. */ + void on_backward_prop_end(model *m) override; + protected: + /** + * This is called at the end of every epoch to update the learning + * rate for every optimizer. Adjustments should be made based on the + * current global learning rate. + * The returned learning rate will be used to automatically update + * the current global learning rate. 
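A minimal construction sketch for the imcomm callback above (not part of the patch); the optional summarizer argument is left at its default.

#include "lbann/callbacks/imcomm.hpp"

#include <memory>

// Sum gradient updates across all models after each mini-batch.
auto imcomm_cb =
  std::make_unique<lbann::callback::imcomm>(lbann::callback::imcomm::NORMAL);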
+ */ + virtual float global_schedule(model *m) { + return get_current_global_learning_rate(); + } + + /** + * This is called at the end of every training mini-batch to update the + * learning rate for optimizer opt. The current global learning rate is *not* + * updated automatically based on this method. + */ + virtual float optimizer_schedule(model *m, optimizer &opt); + + const std::unordered_set& get_weights() const noexcept { + return m_weights; + } + + static float get_current_global_learning_rate() noexcept { + return m_cur_global_lr; + } + + static void update_global_learning_rate(float rate) noexcept { + m_cur_global_lr = rate; + } + + private: + /** + * This should be maintained by all learning rate schedule + * implementations as the current global learning rate. This enables + * coordination among different schedules, particularly ones that + * work on a per-optimizer basis. + */ + static float m_cur_global_lr; + + /** Names of the weights being updated. */ + std::vector m_weights_names; + + /** Weights to update. */ + std::unordered_set m_weights; +}; + +/** + * Decrease the learning rate by a fixed proportion every X epochs. + */ +class step_learning_rate : public learning_rate { + public: + /** Decrease the learning rate by amt every step epochs. */ + step_learning_rate(size_t step, float amt); + step_learning_rate(size_t step, float amt, + std::vector weights_names); + step_learning_rate( + const step_learning_rate&) = default; + step_learning_rate& operator=( + const step_learning_rate&) = default; + step_learning_rate* copy() const override { + return new step_learning_rate(*this); + } + std::string name() const override { return "step learning rate"; } + protected: + float global_schedule(model *m) override; + private: + /** Number of epochs between each learning rate decrease. */ + size_t m_step; + /** Amount to decrease the learning rate by. */ + float m_amt; +}; + +// Builder function +std::unique_ptr +build_step_learning_rate_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +/** + * Decrease the learning rate by a fixed proportion when validation error stops + * improving. + */ +class adaptive_learning_rate : public learning_rate { + public: + /** + * Decrease the learning rate by amt if accuracy does not improve for patience + * epochs. + */ + adaptive_learning_rate(size_t patience, float amt); + adaptive_learning_rate(size_t patience, float amt, + std::vector weights_names); + adaptive_learning_rate( + const adaptive_learning_rate&) = default; + adaptive_learning_rate& operator=( + const adaptive_learning_rate&) = default; + adaptive_learning_rate* copy() const override { + return new adaptive_learning_rate(*this); + } + std::string name() const override { return "adaptive learning rate"; } + protected: + float global_schedule(model *m) override; + private: + /** Number of epochs to wait for improvements. */ + size_t m_patience; + /** Amount to decrease the learning rate by. */ + float m_amt; + /** Current epoch. */ + size_t m_cur_epoch = std::numeric_limits::max(); + /** Last recorded score. */ + EvalType m_last_score = std::numeric_limits::max(); + /** Current number of epochs without improvement. */ + size_t m_wait = 0; + /** Whether to adjust learning rate for current epoch. 
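One plausible reading of the step schedule above ("decrease the learning rate by a fixed proportion every step epochs"), written as a standalone helper; the authoritative schedule is implemented in the corresponding .cpp file.

#include <cstddef>

// Every `step` epochs, scale the current rate by the fixed proportion `amt`.
float step_schedule(std::size_t epoch, std::size_t step, float amt, float lr) {
  return (epoch != 0 && epoch % step == 0) ? lr * amt : lr;
}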
*/ + bool m_adjust_learning_rate = false; +}; + +// Builder function +std::unique_ptr +build_adaptive_learning_rate_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +/** + * Decrease learning rate by a fixed amount at fixed times. + */ +class drop_fixed_learning_rate : public learning_rate { + public: + /** + * Decrease the learning rate by amt when each epoch in drop_epochs is + * reached. + */ + drop_fixed_learning_rate( + std::vector drop_epochs, float amt); + drop_fixed_learning_rate( + std::vector drop_epochs, float amt, + std::vector weights_names); + drop_fixed_learning_rate( + const drop_fixed_learning_rate&) = default; + drop_fixed_learning_rate& operator=( + const drop_fixed_learning_rate&) = default; + drop_fixed_learning_rate* copy() const override { + return new drop_fixed_learning_rate(*this); + } + std::string name() const override { return "drop fixed learning rate"; } + protected: + float global_schedule(model *m) override; + private: + /// Amount to decrease the learning rate by. + float m_amt; + /** + * Epochs to drop learning rate at. This is stored in reverse sorted order, + * so that the end can be examined and then popped in constant time. + */ + std::vector m_drop_epochs; +}; + +// Builder function +std::unique_ptr +build_drop_fixed_learning_rate_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +/** + * Linearly increase the learning rate to reach a target value over a + * fixed number of epochs. + * @note This currently assumes every optimizer begins with the same + * learning rate. This also *forces* its schedule and will stomp over + * other changes. + */ +class linear_growth_learning_rate : public learning_rate { + public: + /** + * Linearly increase the learning rate to reach target after num_epochs. + */ + linear_growth_learning_rate( + float target, size_t num_epochs); + linear_growth_learning_rate( + float target, size_t num_epochs, size_t delay); + linear_growth_learning_rate( + float target, size_t num_epochs, size_t delay, + std::vector weights_names); + linear_growth_learning_rate( + const linear_growth_learning_rate&) = default; + linear_growth_learning_rate& operator=( + const linear_growth_learning_rate&) = default; + linear_growth_learning_rate* copy() const override { + return new linear_growth_learning_rate(*this); } + void setup(model *m) override; + std::string name() const override { return "linear growth learning rate"; } + protected: + float global_schedule(model *m) override; + private: + /// Initial learning rate. + float m_base_lr; + /// Target learning rate to reach. + float m_target; + /// Amount to increase each epoch. + float m_inc; + /// Number of epochs over which to scale the learning rate. + size_t m_num_epochs; + /// Number of epochs to delay before starting growth. + size_t m_delay; +}; + +// Builder function +std::unique_ptr +build_linear_growth_learning_rate_callback_from_pbuf( + const google::protobuf::Message&,std::shared_ptr const&); + +/** + * Decrease the learning rate by polynomial policy + * base_lr*(1 - i_cur/i_max)^p, where + * base_lr is the initial learning rate, i_cur is the current iteration, + * i_max is the maximum iteration, and p is a parameter. 
+ */ +class poly_learning_rate : public learning_rate { + public: + poly_learning_rate(double p, size_t n_epochs, size_t max_iter); + poly_learning_rate(double p, size_t n_epochs, size_t max_iter, double endl_r, + std::vector weights_names); + poly_learning_rate( + const poly_learning_rate&) = default; + poly_learning_rate& operator=( + const poly_learning_rate&) = default; + poly_learning_rate* copy() const override { + return new poly_learning_rate(*this); + } + void setup(model *m) override; + std::string name() const override { return "poly learning rate"; } + protected: + float global_schedule(model *m) override; + float optimizer_schedule(model *m, optimizer &opt) override; + private: + /// The exponent to compute new learning rate in poly policy + double m_p; + /// The number of epochs for training + size_t m_num_epochs; + /// The maximum number of iterations until which the learning rate changes + size_t m_max_iter; + /// The minimum learning rate + float m_end_lr; + /// The current rate to scale the base learning rate + float m_lr; + /// The learning rate scale used at the end of the last epoch + float m_last_epoch_lr; +}; + +// Builder function +std::unique_ptr +build_poly_learning_rate_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +/** + * This implements an adaptive scheme for adjust each optimizer's + * learning rate based on the ratio of the norms of its weights and + * its gradients. + * See: You et al. "Scaling SGD Batch Size to 32K for ImageNet + * Training", 2017. + */ +class optimizerwise_adaptive_learning_rate : public learning_rate { + public: + optimizerwise_adaptive_learning_rate(float scale); + optimizerwise_adaptive_learning_rate( + float scale, std::vector weights_names); + optimizerwise_adaptive_learning_rate( + const optimizerwise_adaptive_learning_rate&) = default; + optimizerwise_adaptive_learning_rate& operator=( + const optimizerwise_adaptive_learning_rate&) = default; + optimizerwise_adaptive_learning_rate* copy() const override { + return new optimizerwise_adaptive_learning_rate(*this); } + std::string name() const override { return "optimizerwise adaptive learning rate"; } + protected: + float optimizer_schedule(model *m, optimizer &opt) override; + private: + float m_scale; +}; + +// Builder function +std::unique_ptr +build_optimizerwise_adaptive_learning_rate_callback_from_pbuf( + const google::protobuf::Message&,std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_LEARNING_RATE_HPP_INCLUDED diff --git a/include/lbann/callbacks/load_model.hpp b/include/lbann/callbacks/load_model.hpp new file mode 100644 index 00000000000..670b775bdbb --- /dev/null +++ b/include/lbann/callbacks/load_model.hpp @@ -0,0 +1,108 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
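The polynomial policy documented above, written out as a standalone helper; the end_lr floor is an assumption based on the m_end_lr member.

#include <algorithm>
#include <cmath>
#include <cstddef>

// base_lr * (1 - i_cur / i_max)^p, never dropping below end_lr.
double poly_schedule(double base_lr, double p,
                     std::size_t i_cur, std::size_t i_max,
                     double end_lr = 0.0) {
  const double frac = 1.0 - static_cast<double>(i_cur) / static_cast<double>(i_max);
  return std::max(base_lr * std::pow(frac, p), end_lr);
}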
You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// load_model .hpp .cpp - Callbacks to load pretrained model(s) +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_LOAD_MODEL_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_LOAD_MODEL_HPP_INCLUDED + +#include + +#include "lbann/callbacks/callback.hpp" + +#include + +// Forward-declare protobuf classes +namespace lbann_data { +class Model; +} + +namespace lbann { +namespace callback { + +/** + * Load pretrained model from file + */ +class load_model : public callback_base { + public: + /** + * @param dir directory to load model + * @param extension file extension e.g., model, state ...... + */ + load_model(std::vector dirs, + std::string extension="prototext") : + callback_base(), m_dirs(std::move(dirs)), + m_extension(std::move(extension)), + m_loaded(false) + {} + load_model(const load_model&) = default; + load_model& operator=( + const load_model&) = default; + load_model* copy() const override { + return new load_model(*this); + } + + inline void add_dir(const std::string& dir){ + m_dirs.emplace_back(dir); + } + + void on_train_begin(model *m) override; + + void on_test_begin(model *m) override; + + /* ckptdir_is_fullpath flag if true + * allow user to specify full path to model weights to load + * and allow system to ignore appending trainer id, num of epochs/steps + * to default ckpt_dir*/ + static bool load_model_weights(const std::string& ckpt_dir, + const std::string& alg_name, + model *m, + bool ckptdir_is_fullpath=false); + + std::string name() const override { return "load model"; } + + protected: + friend class lbann::model; + + + private: + std::vector m_dirs; //director(ies) to load pretrained model(s) + /// Disables the normal behavior of saving when training is complete + std::string m_extension; //file extension + + /// Flag to indicate if the model has already been loaded + bool m_loaded; +}; + +// Builder function +std::unique_ptr +build_load_model_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_LOAD_MODEL_HPP_INCLUDED diff --git a/include/lbann/callbacks/ltfb.hpp b/include/lbann/callbacks/ltfb.hpp new file mode 100644 index 00000000000..36ca778cdfe --- /dev/null +++ b/include/lbann/callbacks/ltfb.hpp @@ -0,0 +1,185 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
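An illustrative construction sketch for the load_model callback above (not part of the patch); the directory names are placeholders and the element type of the stripped std::vector parameter is assumed to be std::string.

#include "lbann/callbacks/load_model.hpp"

#include <memory>
#include <string>
#include <vector>

// Load pretrained weights from a (placeholder) directory of prototext files.
auto load_model_cb = std::make_unique<lbann::callback::load_model>(
  std::vector<std::string>{"trained_models/autoencoder"}, "prototext");
// Additional directories can be appended before training begins, e.g.:
// load_model_cb->add_dir("trained_models/classifier");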
You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_LTFB_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_LTFB_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" +#include +#include +#include + +namespace lbann { +namespace callback { + +/** @brief Tournament training. + * + * This is intended to support research into the LTFB algorithm. An + * outline: + * - Divide the computational resources into multiple "trainers" + * that can operate in parallel. + * - Setup a model on each trainer and begin training independently. + * - Periodically launch tournaments to select "good" models. More + * specifically, trainers partner up and exchange their models. + * Each trainer evaluates a metric for its local and partner + * models, using its validation data set. The model with the better + * score is retained and the other one is discarded. + * + * There are many algorithmic variations to be explored: + * - How is data is divvied up amongst the trainers. Is it strictly + * partitioned, partially shared, or completely replicated? + * - What model components are exchanged? Just the trainable weights, + * or a subset of the weights? Hyperparameters? + * - Can this be used to explore model architectures? + * + * @todo Exchange optimizer state. + * @todo Support heterogeneous models. + */ +class ltfb : public callback_base { +public: + + /** Inter-trainer communication scheme for LTFB. + * + * The specifics of these algorithms are experimental and will be + * in flux. + */ + enum class communication_algorithm { + /** Directly exchange weights values with sendrecv. + * + * Corresponding ranks in partner trainers will iterate through + * their weights and exchange values with sendrecvs. + * + * Notes: + * - Requires all models to be identical aside from their + * weights values, so this is not suitable for hyperparameter + * or model architecture exploration. + * - Optimizer state is not exchanged, so there may be wonky + * learning behavior immediately after a tournament. + * - Optimal if communication performance between ranks is + * uniform and independent. If intra-trainer communication is + * fast or if communication performance is sensitive to + * network traffic, it may be advantageous to gather model + * data on the trainer master ranks and only perform + * inter-trainer communication between them. + */ + sendrecv_weights, + + /** Save and load model data with checkpoint files. + * + * @todo Implement. + * + * Notes: + * - Supports hyperparameter exploration. + * - Checkpoint files currently do not store model architecture + * information, so this is not suitable for model + * architecture exploraiton. + * - This approach is temporary and experimental, since going + * through the file system is very suboptimal. When a wire + * format for model checkpoints is developed, it should be + * used instead. + */ + checkpoint_file + }; + + /** @brief Construct the LTFB callback + * @param batch_interval Number of training mini-batch steps between + * tournaments. + * @param metric_name Metric for tournament evaluation. 
+ * @param weights_names List of weights to exchange with partner. + * If empty, then all weights are exchanged. + * @param low_score_wins Whether low-scoring or high-scoring models + * survive a tournament. + * @param comm_algo Inter-trainer communication scheme. + * @param summarizer The summarizer to use for this callback + */ + ltfb( + El::Int batch_interval, + std::string metric_name, + std::set weights_names = std::set(), + bool low_score_wins = false, + communication_algorithm comm_algo = communication_algorithm::sendrecv_weights, + const std::string& ckptdir = "", + bool exchange_hyperparameters = false); + ltfb(const ltfb& other); + ltfb& operator=(const ltfb& other); + ltfb* copy() const override { return new ltfb(*this); } + std::string name() const override { return "LTFB"; } + + void setup(model *m) override; + void on_train_begin(model *m) override; + void on_batch_begin(model *m) override; + + /** Convert string to LTFB communication algorithm. + * + * If an empty string is provided, returns @c + * communication_algorithm::sendrecv_weights. + */ + static communication_algorithm string_to_comm_algo(const std::string& str); + + void set_ckpt_basedir(const std::string& dir); + std::string get_ckpt_basedir() const; + +private: + + /** Metric for tournament evaluation. */ + std::string m_metric_name; + + /** List of weights to exchange with partner. + * + * If empty, then all weights are exchanged. + */ + std::set m_weights_names; + + /** Whether low-scoring or high-scoring models survive a + * tournament. */ + bool m_low_score_wins; + + /** Inter-trainer communication scheme. */ + communication_algorithm m_comm_algo; + + /** Base directory of the checkpoint state */ + std::string m_ckpt_basedir; + + /** Whether to exchange training hyperparameters between trainers + */ + bool m_exchange_hyperparameters; + + /** Workspace weights. + * + * Used to temporarily store local weights during a tournament. + */ + std::vector> m_workspace_weights; +}; + +// Builder function +std::unique_ptr +build_ltfb_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_LTFB_HPP_INCLUDED diff --git a/include/lbann/callbacks/mixup.hpp b/include/lbann/callbacks/mixup.hpp new file mode 100644 index 00000000000..b4b5873f3a6 --- /dev/null +++ b/include/lbann/callbacks/mixup.hpp @@ -0,0 +1,88 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
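A construction sketch for the LTFB callback above (not part of the patch); the metric name is a placeholder and the element type of the stripped std::set parameter is assumed to be std::string.

#include "lbann/callbacks/ltfb.hpp"

#include <memory>
#include <set>
#include <string>

// Hold a tournament every 100 training steps, judged on a (placeholder) metric,
// exchanging all weights directly with sendrecv.
auto ltfb_cb = std::make_unique<lbann::callback::ltfb>(
  /*batch_interval=*/100,
  /*metric_name=*/"categorical accuracy",
  /*weights_names=*/std::set<std::string>{},  // empty: exchange all weights
  /*low_score_wins=*/false,
  lbann::callback::ltfb::string_to_comm_algo("sendrecv_weights"));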
+//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_MIXUP_HPP +#define LBANN_CALLBACKS_MIXUP_HPP + +#include +#include + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** + * Apply mixup to named input layers. + * + * See: + * + * Zhang, H. et al. "mixup: Beyond Empirical Risk Minimization." ICLR, 2018. + * + * This implementation does mixup within a single batch, per the recommendation + * within the paper. + * + * This approach may create duplicate images, and so uses + * + * lambda = max(lambda, 1 - lambda) + * + * for the mixing value. + * + * This recommendation comes from https://docs.fast.ai/callbacks.mixup.html + * + * The recommended default alpha (from the paper) is 0.4. + */ +class mixup : public callback_base { +public: + /** Apply mixup to layers named in layers with mixup parameter alpha. */ + mixup(std::unordered_set layers, float alpha) : + callback_base(), m_layers(layers), m_alpha(alpha) { + if (alpha < 0.0f) { + LBANN_ERROR("Mixup alpha must be non-negative."); + } + } + + mixup* copy() const override { return new mixup(*this); } + std::string name() const override { return "mixup"; } + + void on_forward_prop_end(model *m, Layer *l) override; + +private: + /** Names of input layers to apply mixup to. */ + std::unordered_set m_layers; + /** mixup parameter. */ + float m_alpha; +}; + +// Builder function +std::unique_ptr +build_mixup_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_MIXUP_HPP diff --git a/include/lbann/callbacks/monitor_io.hpp b/include/lbann/callbacks/monitor_io.hpp new file mode 100644 index 00000000000..8f665c928d4 --- /dev/null +++ b/include/lbann/callbacks/monitor_io.hpp @@ -0,0 +1,74 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// monitor_io .hpp .cpp - Callback hooks for I/O monitoring +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_IO_HPP_INCLUDED +#define LBANN_CALLBACKS_IO_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" + +#include + +#include +#include + +namespace lbann { +namespace callback { + +/** + * Print information on the amount of IO that layers do. + */ +class monitor_io : public callback_base { + public: + monitor_io() = default; + /** Only apply to specific layers. 
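A standalone sketch of the mixing rule the mixup documentation describes: draw lambda from Beta(alpha, alpha), clamp it with max(lambda, 1 - lambda), then blend a pair of samples. Sampling Beta via two gamma draws is just one standard way to do it; the callback itself operates in place on the input layer's mini-batch and mixes labels with the same lambda.

#include <algorithm>
#include <cstddef>
#include <random>
#include <vector>

std::vector<float> mixup_pair(const std::vector<float>& x_i,
                              const std::vector<float>& x_j,
                              float alpha, std::mt19937& gen) {
  std::gamma_distribution<float> g(alpha, 1.0f); // Beta(alpha, alpha) via two gamma draws
  const float a = g(gen), b = g(gen);
  float lambda = a / (a + b);
  lambda = std::max(lambda, 1.0f - lambda);      // avoid creating near-duplicate samples
  std::vector<float> mixed(x_i.size());
  for (std::size_t k = 0; k < mixed.size(); ++k) {
    mixed[k] = lambda * x_i[k] + (1.0f - lambda) * x_j[k];
  }
  return mixed;
}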
*/ + monitor_io(std::vector const& layers) + : m_layers(layers.begin(), layers.end()) {} + + monitor_io(const monitor_io&) = default; + monitor_io& operator=(const monitor_io&) = default; + monitor_io* copy() const override { + return new monitor_io(*this); + } + /** Report how much I/O has occured per data reader */ + void on_epoch_end(model *m) override; + void on_test_end(model *m) override; + std::string name() const override { return "monitor_io"; } + private: + /** Indicies of layers to monitor. */ + std::unordered_set m_layers; +}; + +// Builder function +std::unique_ptr +build_monitor_io_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_IO_HPP_INCLUDED diff --git a/include/lbann/callbacks/perturb_adam.hpp b/include/lbann/callbacks/perturb_adam.hpp new file mode 100644 index 00000000000..3101018c6a7 --- /dev/null +++ b/include/lbann/callbacks/perturb_adam.hpp @@ -0,0 +1,135 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_PERTURB_ADAM_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_PERTURB_ADAM_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" +#include "lbann/optimizers/adam.hpp" + +#include + +namespace lbann { +namespace callback { + +/** @brief Hyperparameter exploration with Adam optimizers. + * + * Goes through the Adam optimizers in a model and perturbs four + * hyperparameters: the learning rate, @f$\beta_1@f$, @f$\beta_2@f$, + * and @f$\epsilon@f$. Since these hyperparameters can range over + * orders of magnitude, the perturbations are performed in log space. + * More precisely, random values are drawn from normal distributions + * (with user-provided standard deviations) and added to + * @f$\log(\text{learning rate})@f$, @f$\log(1-\beta_1)@f$, + * @f$\log(1-\beta_2)@f$, and @f$\log\epsilon@f$. + */ +class perturb_adam : public callback_base { +public: + + /** @param learning_rate_factor Standard deviation of learning rate + * perturbation (in log space). + * @param beta1_factor Standard deviation of @f$\beta_1@f$ + * perturbation (in log space). + * @param beta2_factor Standard deviation of @f$\beta_2@f$ + * perturbation (in log space). + * @param eps_factor Standard deviation of @f$\epsilon@f$ + * perturbation (in log space). 
+ * @param perturb_during_training Whether to periodically perturb + * hyperparameters during training + * or to only perturb once during + * setup. + * @param batch_interval Number of training mini-batch steps between + * perturbations. Only used if + * @c perturb_during_training is @c true. + * @param weights_names Names of weights with Adam optimizers. If + * empty, all Adam optimizers in the model are + * perturbed. + */ + perturb_adam(DataType learning_rate_factor, + DataType beta1_factor, + DataType beta2_factor, + DataType eps_factor = 0, + bool perturb_during_training = false, + El::Int batch_interval = 1, + std::set weights_names + = std::set()); + perturb_adam* copy() const override { return new perturb_adam(*this); } + std::string name() const override { return "perturb Adam"; } + + void setup(model* m) override; + void on_batch_begin(model* m) override; + +private: + + /** Standard deviation of learning rate perturbation. + * + * In log space. + */ + DataType m_learning_rate_factor; + /** Standard deviation of @f$\beta_1@f$ perturbation. + * + * In log space. + */ + DataType m_beta1_factor; + /** Standard deviation of @f$\beta_2@f$ perturbation. + * + * In log space. + */ + DataType m_beta2_factor; + /** Standard deviation of @f$\epsilon@f$ perturbation. + * + * In log space. + */ + DataType m_eps_factor; + + /** Whether to periodically perturb during training. + * + * If false, only perturb once during setup. + */ + bool m_perturb_during_training; + + /** Optimizers for these weights will be perturbed. + * + * If empty, all Adam optimizers in the model will be perturbed. + */ + std::set m_weights_names; + + /** Perturb Adam optimizers in model. */ + void perturb(model& m) const; + /** Perturb Adam optimizer hyperparameters. */ + void perturb(lbann_comm& comm, adam& m) const; + +}; + +// Builder function +std::unique_ptr +build_perturb_adam_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_PERTURB_ADAM_HPP_INCLUDED diff --git a/include/lbann/callbacks/perturb_dropout.hpp b/include/lbann/callbacks/perturb_dropout.hpp new file mode 100644 index 00000000000..c55722ef618 --- /dev/null +++ b/include/lbann/callbacks/perturb_dropout.hpp @@ -0,0 +1,88 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
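A standalone sketch of the log-space perturbation described in the perturb_adam documentation: add Gaussian noise to the log of a hyperparameter and map back. For beta1 and beta2 the noise is applied to log(1 - beta), per the class comment.

#include <cmath>
#include <random>

// Perturb a positive hyperparameter in log space; sigma == 0 leaves it unchanged.
double perturb_log_space(double value, double sigma, std::mt19937& gen) {
  std::normal_distribution<double> noise(0.0, sigma);
  return std::exp(std::log(value) + noise(gen));
}
// e.g. double new_beta1 = 1.0 - perturb_log_space(1.0 - beta1, beta1_factor, gen);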
+//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_PERTURB_DROPOUT_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_PERTURB_DROPOUT_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" +#include "lbann/layers/regularizers/dropout.hpp" +#include + +namespace lbann { +namespace callback { + +/** @brief Hyperparameter exploration with dropouts. + * + * Goes through the dropout layers in a model and perturbs keep probability + */ +class perturb_dropout : public callback_base { +public: + + /** @param keep_prob_factor Standard deviation of learning rate + * perturbation (in log space). + * @param layer_names Names of layers with dropout keep prob to perturb. If + * empty, all dropout layers in the model are + * perturbed. + */ + perturb_dropout(EvalType keep_prob_factor, + std::set layer_names + = std::set()); + perturb_dropout* copy() const override { return new perturb_dropout(*this); } + std::string name() const override { return "perturb dropout"; } + + void setup(model* m) override; + +private: + + /** Standard deviation of keep probability perturbation. + * + * In log space. + */ + EvalType m_keep_prob_factor; + + /** Keep prob for these layers will be perturbed. + * + * If empty, all dropout layers in the model will be perturbed. + */ + std::set m_layer_names; + + template + dropout* get_dropout_layer(Layer* l); + + /** Perturb dropout keep prob in model. */ + void perturb(model& m); + +}; + +// Builder function +std::unique_ptr +build_perturb_dropout_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_PERTURB_DROPOUT_HPP_INCLUDED diff --git a/include/lbann/callbacks/print_model_description.hpp b/include/lbann/callbacks/print_model_description.hpp new file mode 100644 index 00000000000..9f68cc39322 --- /dev/null +++ b/include/lbann/callbacks/print_model_description.hpp @@ -0,0 +1,60 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_PRINT_MODEL_DESCRIPTION_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_PRINT_MODEL_DESCRIPTION_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** @brief Print human-readable description of model to standard input. 
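A construction sketch for the perturb_dropout callback above (not part of the patch); the element type of the stripped std::set parameter is assumed to be std::string.

#include "lbann/callbacks/perturb_dropout.hpp"

#include <memory>
#include <set>
#include <string>

// Perturb the keep probability of every dropout layer in the model.
auto perturb_dropout_cb = std::make_unique<lbann::callback::perturb_dropout>(
  /*keep_prob_factor=*/0.1, std::set<std::string>{});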
+ * + * Message is printed when the model has finished setup. The + * description includes information on the model's layers, weights, + * and callbacks. + */ +class print_model_description : public callback_base { +public: + print_model_description() : callback_base() {} + print_model_description(const print_model_description&) = default; + print_model_description& operator=(const print_model_description&) = default; + print_model_description* copy() const override { return new print_model_description(*this); } + void on_setup_end(model *m) override; + std::string name() const override { return "print_model_description"; } + +}; + +// Builder function +std::unique_ptr +build_print_model_description_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_PRINT_MODEL_DESCRIPTION_HPP_INCLUDED diff --git a/include/lbann/callbacks/print_statistics.hpp b/include/lbann/callbacks/print_statistics.hpp new file mode 100644 index 00000000000..70fbc42c2ea --- /dev/null +++ b/include/lbann/callbacks/print_statistics.hpp @@ -0,0 +1,71 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// print_statistics .hpp .cpp - Callback hooks to print information +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_PRINT_STATISTICS_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_PRINT_STATISTICS_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** Periodically print computational results. + * Prints average objective function value and metric scores after + * each training epoch and evaluation. 
+ */ +class print_statistics : public callback_base { + public: + print_statistics(int batch_interval = 1, bool print_global_stat_only=false) : + callback_base(batch_interval), + m_print_global_stat_only(print_global_stat_only) {} + print_statistics(const print_statistics&) = default; + print_statistics& operator=(const print_statistics&) = default; + print_statistics* copy() const override { return new print_statistics(*this); } + void setup(model *m) override; + void on_epoch_begin(model *m) override; + void on_epoch_end(model *m) override; + void on_validation_end(model *m) override; + void on_test_end(model *m) override; + std::string name() const override { return "print_statistics"; } + + private: + /** Print objective function and metrics to standard output. */ + void report_results(model *m); + bool m_print_global_stat_only; + +}; + +// Builder function +std::unique_ptr +build_print_statistics_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_PRINT_STATISTICS_HPP_INCLUDED diff --git a/include/lbann/callbacks/profiler.hpp b/include/lbann/callbacks/profiler.hpp index abedbaaa428..2a1c77a21dd 100644 --- a/include/lbann/callbacks/profiler.hpp +++ b/include/lbann/callbacks/profiler.hpp @@ -23,7 +23,7 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. // -// lbann_callback_timer .hpp .cpp - Callback hooks to time training +// timer .hpp .cpp - Callback hooks to time training //////////////////////////////////////////////////////////////////////////////// #ifndef LBANN_CALLBACKS_PROFILER_HPP_INCLUDED @@ -32,16 +32,17 @@ #include "lbann/callbacks/callback.hpp" namespace lbann { +namespace callback { /** */ -class lbann_callback_profiler : public lbann_callback { +class profiler : public callback_base { public: - lbann_callback_profiler(bool sync = false, bool skip_init = false); - lbann_callback_profiler(const lbann_callback_profiler&) = default; - lbann_callback_profiler& operator=(const lbann_callback_profiler&) = default; - lbann_callback_profiler* copy() const override { - return new lbann_callback_profiler(*this); + profiler(bool sync = false, bool skip_init = false); + profiler(const profiler&) = default; + profiler& operator=(const profiler&) = default; + profiler* copy() const override { + return new profiler(*this); } void on_epoch_begin(model *m) override; void on_epoch_end(model *m) override; @@ -79,6 +80,12 @@ class lbann_callback_profiler : public lbann_callback { bool m_skip_init; }; -} // namespace lbann +// Builder function +std::unique_ptr +build_profiler_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann #endif // LBANN_CALLBACKS_PROFILER_HPP_INCLUDED diff --git a/include/lbann/callbacks/replace_weights.hpp b/include/lbann/callbacks/replace_weights.hpp new file mode 100644 index 00000000000..d42ed2573be --- /dev/null +++ b/include/lbann/callbacks/replace_weights.hpp @@ -0,0 +1,81 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. 
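A construction sketch for the print_statistics callback above (not part of the patch), asking for aggregated statistics only.

#include "lbann/callbacks/print_statistics.hpp"

#include <memory>

// Report objective function values and metrics, aggregated across the trainer.
auto print_stats_cb = std::make_unique<lbann::callback::print_statistics>(
  /*batch_interval=*/1, /*print_global_stat_only=*/true);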
+// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_REPLACE_WEIGHTS_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_REPLACE_WEIGHTS_HPP_INCLUDED + +#include + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** + * Weights/parameters replacement on k-batch end + * Currently support replacing weights/parameters using layer names + * Can easily be extended to support replacement by weights name + * Given two layers specified in prototext, weights are copied from source layer to destination layer. + */ +class replace_weights : public callback_base { + public: + replace_weights( + std::vector src, + std::vector dst, + int batch_interval=1) + : callback_base(batch_interval), + m_src_layer_names(std::move(src)), + m_dst_layer_names(std::move(dst)) { + if(m_src_layer_names.size() != m_dst_layer_names.size()) + LBANN_ERROR("In replace weights callback: number of src and dest layers does not match."); + } + + replace_weights( + const replace_weights&) = default; + replace_weights& operator=( + const replace_weights&) = default; + replace_weights* copy() const override { + return new replace_weights(*this); + } + void setup(model *m) override; + void on_batch_end(model *m) override; + + std::string name() const override { return "replace weights"; } + private: + std::vector m_src_layer_names, m_dst_layer_names; + std::vector m_src_layers, m_dst_layers; +}; + +// Builder function +std::unique_ptr +build_replace_weights_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_REPLACE_WEIGHTS_HPP_INCLUDED diff --git a/include/lbann/callbacks/save_images.hpp b/include/lbann/callbacks/save_images.hpp new file mode 100644 index 00000000000..cf37f33e33d --- /dev/null +++ b/include/lbann/callbacks/save_images.hpp @@ -0,0 +1,83 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
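// --- Illustrative sketch (not part of the diff): the replace_weights callback above
// copies weights from source layers to destination layers every batch_interval steps.
// The element type of the std::vector parameters is assumed to be std::string (template
// arguments were stripped in this diff view); the layer names are hypothetical.
#include "lbann/callbacks/replace_weights.hpp"
#include <memory>
#include <string>
#include <vector>

std::unique_ptr<lbann::callback::replace_weights> make_replace_weights_example() {
  std::vector<std::string> src = {"encoder_fc1"};  // hypothetical source layer
  std::vector<std::string> dst = {"decoder_fc1"};  // hypothetical destination layer
  // src and dst must have the same length; otherwise the constructor calls LBANN_ERROR.
  return std::make_unique<lbann::callback::replace_weights>(src, dst,
                                                            /*batch_interval=*/10);
}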
You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_SAVE_IMAGES_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_SAVE_IMAGES_HPP_INCLUDED + +#include +#include +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** Save layer outputs as image files. + * Image files are in the form + * "-.". + */ +class save_images : public callback_base { +public: + + /** Constructor. + * @param layer_names List of layer names to save as images. + * @param image_format Image file format (e.g. jpg, png, pgm). + * @param image_prefix Prefix for image file names. + */ + save_images(std::vector layer_names, + std::string image_format = "jpg", + std::string image_prefix = ""); + save_images(const save_images&) = default; + save_images& operator=( + const save_images&) = default; + save_images* copy() const override { + return new save_images(*this); + } + void on_epoch_end(model *m) override; + void on_test_end(model *m) override; + std::string name() const override { return "save images"; } + +private: + + /** List of layer names to save as images. */ + std::vector m_layer_names; + /** Image file format. + * Valid options: jpg, png, pgm. + */ + std::string m_image_format; + /** Prefix for saved image files. */ + std::string m_image_prefix; + +}; + +// Builder function +std::unique_ptr +build_save_images_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_SAVE_IMAGES_HPP_INCLUDED diff --git a/include/lbann/callbacks/save_model.hpp b/include/lbann/callbacks/save_model.hpp new file mode 100644 index 00000000000..b5cf2029182 --- /dev/null +++ b/include/lbann/callbacks/save_model.hpp @@ -0,0 +1,103 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
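// --- Illustrative sketch (not part of the diff): constructing the save_images callback
// declared above. The std::vector element type is assumed to be std::string (template
// arguments were stripped in this diff view); the layer name and prefix are hypothetical.
#include "lbann/callbacks/save_images.hpp"
#include <string>
#include <vector>

lbann::callback::save_images make_save_images_example() {
  std::vector<std::string> layers = {"reconstruction"};  // hypothetical layer name
  // One image file is written per listed layer, using the given prefix and format.
  return lbann::callback::save_images(layers,
                                      /*image_format=*/"png",
                                      /*image_prefix=*/"recon-");
}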
+// +// save_model .hpp .cpp - Callbacks to save model, currently as protobuf +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_SAVE_MODEL_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_SAVE_MODEL_HPP_INCLUDED + +#include + +#include "lbann/callbacks/callback.hpp" + +#include + +// Forward-declare protobuf classes +namespace lbann_data { +class Model; +} + +namespace lbann { +namespace callback { + +/** + * Save model to as protobuf file and set of weights + */ +class save_model : public callback_base { + public: + /** + * @param dir directory to save model + * @param disable_save_after_training Don't save after training + * @param extension file extension e.g., model, state ...... + */ + save_model(std::string dir, + bool disable_save_after_training, + std::string extension="prototext") : + callback_base(), m_dir(std::move(dir)), + m_disable_save_after_training(disable_save_after_training), + m_extension(std::move(extension)) + {} + save_model(const save_model&) = default; + save_model& operator=( + const save_model&) = default; + save_model* copy() const override { + return new save_model(*this); + } + void on_train_end(model *m) override; + std::string name() const override { return "save model"; } + void set_target_dir(const std::string& dir) { m_dir = dir; } + const std::string& get_target_dir() { return m_dir; } + + protected: + friend class lbann::model; + + bool do_save_model(model *m); + bool do_save_model_weights(model *m); + + private: + std::string m_dir; //directory to save file + /// Disables the normal behavior of saving when training is complete + bool m_disable_save_after_training; + std::string m_extension; //file extension + persist p; + + void write_proto_binary(const lbann_data::Model& proto, const std::string filename); + void write_proto_text(const lbann_data::Model& proto, const std::string filename); +}; + +inline std::string get_save_model_dirname(const std::string& trainer_name, const std::string& model_name, const std::string& dir) { + return build_string(dir, '/', trainer_name, '/', model_name, '/'); +} + +// Builder function +std::unique_ptr +build_save_model_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_SAVE_MODEL_HPP_INCLUDED diff --git a/include/lbann/callbacks/save_topk_models.hpp b/include/lbann/callbacks/save_topk_models.hpp new file mode 100644 index 00000000000..4a5c3800602 --- /dev/null +++ b/include/lbann/callbacks/save_topk_models.hpp @@ -0,0 +1,71 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
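// --- Illustrative sketch (not part of the diff): the save_model callback above writes
// the model as a protobuf file plus weights at the end of training, and the inline
// get_save_model_dirname() helper composes the target directory. Names are hypothetical.
#include "lbann/callbacks/save_model.hpp"
#include <string>

void save_model_example() {
  lbann::callback::save_model cb(/*dir=*/"/p/checkpoints",
                                 /*disable_save_after_training=*/false,
                                 /*extension=*/"prototext");
  // Per the helper's build_string call, this evaluates to "/p/checkpoints/trainer0/lenet/".
  std::string dir =
      lbann::callback::get_save_model_dirname("trainer0", "lenet", "/p/checkpoints");
  cb.set_target_dir(dir);
}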
You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// save_topk_models .hpp .cpp - Callback to save top k models +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_SAVE_TOPK_MODELS_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_SAVE_TOPK_MODELS_HPP_INCLUDED + +#include "lbann/callbacks/save_model.hpp" + +namespace lbann { +namespace callback { + +/** Save_topk_models for (e.g., inference and other analysis). + * @param dir directory to save model + * @param k number of models to save, should be less than number of trainers + * @param metric_name, evaluation metric + * @ordering for the topk, descending order is default + * Note: may end up saving more than k models if multiple models (trainers) have the same metric score + */ +class save_topk_models : public save_model { + public: + save_topk_models(std::string dir, int k, std::string metric_name, bool ascending_ordering=false) : + save_model(dir,true), m_k(k),m_metric_name(metric_name),m_ascending_ordering(ascending_ordering) {} + save_topk_models(const save_topk_models&) = default; + save_topk_models& operator=(const save_topk_models&) = default; + save_topk_models* copy() const override { return new save_topk_models(*this); } + void on_test_end(model *m) override; + std::string name() const override { return "save_topk_models"; } + + private: + /*determine if a trainer's model is in top k, computation done by trainer master processes*/ + bool am_in_topk(model *m); + int m_k ; + std::string m_metric_name; + bool m_ascending_ordering; + +}; + +// Builder function +std::unique_ptr +build_save_topk_models_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_SAVE_TOPK_MODELS_HPP_INCLUDED diff --git a/include/lbann/callbacks/set_weights_value.hpp b/include/lbann/callbacks/set_weights_value.hpp new file mode 100644 index 00000000000..37b8996b301 --- /dev/null +++ b/include/lbann/callbacks/set_weights_value.hpp @@ -0,0 +1,77 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. 
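// --- Illustrative sketch (not part of the diff): save_topk_models above extends
// save_model and decides in on_test_end() whether this trainer's model ranks among the
// top k for the named metric. The directory and metric name below are hypothetical.
#include "lbann/callbacks/save_topk_models.hpp"

lbann::callback::save_topk_models make_topk_example() {
  // Keep the 3 best models by test accuracy; higher is better, so keep the default
  // descending ordering (ascending_ordering == false).
  return lbann::callback::save_topk_models("/p/topk_models", /*k=*/3,
                                           /*metric_name=*/"categorical accuracy",
                                           /*ascending_ordering=*/false);
}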
See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_SET_WEIGHTS_VALUE_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_SET_WEIGHTS_VALUE_HPP_INCLUDED + +#include + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** @brief Set values in a weights object at a given training step + * + * @todo Support weights with arbitrary data types. Currently only + * floats are supported. + */ +class set_weights_value : public callback_base { + public: + /** + * @param weights_name Name of weights object + * @param value Value to set weights + * @param step Mini-batch step at which to set weights value + */ + set_weights_value(std::string weights_name, double value, size_t step); + set_weights_value(const set_weights_value&) = default; + set_weights_value& operator=(const set_weights_value&) = default; + + set_weights_value* copy() const override; + std::string name() const override; + + void on_batch_begin(model *m) override; + + private: + + /** @brief Name of weights object. */ + std::string m_weights_name; + /** @brief Value to set weights. */ + double m_value; + /** @brief Mini-batch step at which to set weights value. */ + size_t m_step; + +}; + +// Builder function +std::unique_ptr +build_set_weights_value_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_SET_WEIGHTS_VALUE_HPP_INCLUDED diff --git a/include/lbann/callbacks/summarize_images.hpp b/include/lbann/callbacks/summarize_images.hpp new file mode 100644 index 00000000000..7d396fc45a4 --- /dev/null +++ b/include/lbann/callbacks/summarize_images.hpp @@ -0,0 +1,228 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// summarize_images .hpp .cpp - Callback hooks to dump +// results of image testing to event files +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_SUMMARIZE_IMAGES_HPP_INCLUDED +#define LBANN_CALLBACKS_SUMMARIZE_IMAGES_HPP_INCLUDED + + +#include "lbann/callbacks/callback.hpp" + +#include +#include +#include +#include +namespace lbann { +namespace callback { + +/** @class image_output_strategy + * @brief Interface for strategies for determining which images + * to output to the summarizer. 
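// --- Illustrative sketch (not part of the diff): set_weights_value above overwrites a
// named weights object with a constant at one specific mini-batch step (checked in
// on_batch_begin). The weights name and step number below are hypothetical.
#include "lbann/callbacks/set_weights_value.hpp"

lbann::callback::set_weights_value make_set_weights_example() {
  // At training step 1000, set every entry of "disc_fc1_weights" to 0.0.
  return lbann::callback::set_weights_value("disc_fc1_weights",
                                            /*value=*/0.0,
                                            /*step=*/1000);
}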
+ */ +class image_output_strategy { + +public: + virtual std::vector> + get_image_indices(model const&) const = 0; + virtual std::string get_tag(std::string const& layer_name, + El::Int index, El::Int epoch) const = 0; + virtual ~image_output_strategy() = default; + +}; //class image_output_strategy + + +/** @class CategoricalAccuracy + * @brief Subclass of image_output_strategy to dump categorized + * images to event files based on categorization criteria + */ +class categorical_accuracy_strategy : public image_output_strategy { +public: + + enum class MatchType { + NOMATCH=0, + MATCH=1, + ALL=2 + };// enum class MatchType + + /** @brief summarize_images Constructor. + * @param cat_accuracy_layer_name Name of categorical accuracy layer + * @param match_type Criteria for dumping images (MATCH, NOMATCH, or ALL) + * @param num_images Number of images to summarize per epoch + */ + categorical_accuracy_strategy(std::string const& cat_accuracy_layer_name, + MatchType match_type=MatchType::NOMATCH, + size_t num_images=10) + : m_cat_accuracy_layer_name(cat_accuracy_layer_name), + m_match_type(match_type), + m_num_images(num_images) {} + + /** @brief Get vector containing indices of images to be dumped. + * @returns std::vector Vector with indices of images to dump. + */ + std::vector> + get_image_indices(model const& m) const final; + + /** @brief Construct tag for image */ + std::string get_tag(std::string const& layer_name, + El::Int index, El::Int epoch) const final; + +private: + /** @brief Tests whether image should be dumped based on criteria + * @returns bool Value is true if matches criteria and false otherwise + */ + bool meets_criteria(const DataType& match) const noexcept; + + /** @brief Name of categorical accuracy layer*/ + std::string const m_cat_accuracy_layer_name; + + /** @brief Criterion to dump images */ + MatchType m_match_type; + + /** @brief Number of images to be dumped per epoch */ + size_t m_num_images; + +}; // class categorical_accuracy_strategy : image_output_strategy + +std::unique_ptr +build_categorical_accuracy_strategy_from_pbuf(google::protobuf::Message const&); + +/** @class Autoencoder Subclass of image_output_strategy to dump autoencoder images + * @brief Dump images to event files based on strategy + */ +class autoencoder_strategy : public image_output_strategy { + +public: + + /** @brief autoencoder_strategy : image_output_strategy Constructor. + * @param sample_indices Vector of sample indices for images + */ + autoencoder_strategy(std::string const& input_layer_name, + size_t num_images = 10) + : m_input_layer_name{input_layer_name}, + m_num_images{num_images} {} + + /** @brief Get vector containing indices of images to be dumped. + * @returns std::vector Vector with indices of images to dump. 
+ */ + std::vector> + get_image_indices(model const& m) const final; + + /** @brief Construct tag for image */ + std::string get_tag(std::string const& layer_name, + El::Int index, El::Int epoch) const final; + +private: + + /** @brief Name of input layer */ + std::string m_input_layer_name; + + /** @brief Number of images to be tracked */ + size_t m_num_images; + + /** @brief Sample indices of images to track */ + mutable std::unordered_set m_tracked_images; + + /** @brief A map from models to shuffled indices */ + mutable std::unordered_map> m_shuffled_indices; + +}; // class Autoencoder : image_output_strategy + +std::unique_ptr +build_track_sample_ids_strategy_from_pbuf(google::protobuf::Message const&); + +/** @class summarize_images + * @brief Callback to dump images to event files based on strategy + */ +class summarize_images : public callback_base { + +public: + /** @brief summarize_images Constructor. + * @param summarizer Pointer to lbann_summary object + * @param strategy Pointer to image image_output_strategy + * @param img_source_layer_name Name of image layer + * @param interval Interval of epochs to dump images + * @param img_format Image file format (e.g. .jpg, .png, .pgm) + */ + summarize_images(std::shared_ptr const& summarizer, + std::unique_ptr strategy, + std::string const& img_source_layer_name, + uint64_t interval = 1, + std::string const& img_format = ".jpg"); + + /** @brief Copy constructor */ + callback_base* copy() const override { + LBANN_ERROR( "This callback is not copyable."); + return nullptr; + } + + /** @brief Return name of callback */ + std::string name() const override { return "summarize_images"; } + + /** @brief Hook to pull data from lbann run */ + void on_batch_evaluate_end(model* m) override; + +private: + + /** @brief Add image to event file */ + void dump_images_to_summary(model const& m) const; + + +private: + + /* @brief lbann_summary object */ + std::shared_ptr m_summarizer; + + /* @brief image_output_strategy object */ + std::unique_ptr m_strategy; + + /* @brief Names of layers */ + std::string m_img_source_layer_name; + + /* @brief Interval for dumping images */ + uint64_t m_epoch_interval; + + /** @brief Image file format. Valid options: .jpg, .png, .pgm. */ + std::string m_img_format; + +}; // class summarize_images + +/** @brief Get a layer from model based on name + * @param m The model + * @param layer_name Name of layer + */ +Layer const& get_layer_by_name(model const& m, std::string const& layer_name); + +std::unique_ptr +build_summarize_images_callback_from_pbuf( + const google::protobuf::Message&, + const std::shared_ptr& summarizer); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_SUMMARIZE_IMAGES_HPP_INCLUDED diff --git a/include/lbann/callbacks/summary.hpp b/include/lbann/callbacks/summary.hpp new file mode 100644 index 00000000000..eb199110378 --- /dev/null +++ b/include/lbann/callbacks/summary.hpp @@ -0,0 +1,83 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. 
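// --- Illustrative sketch (not part of the diff): wiring a categorical_accuracy_strategy
// into the summarize_images callback declared above. The summarizer's pointee type is
// assumed to be lbann_summary (template arguments were stripped in this diff view);
// the layer names are hypothetical.
#include "lbann/callbacks/summarize_images.hpp"
#include <memory>

std::unique_ptr<lbann::callback::summarize_images>
make_summarize_images_example(std::shared_ptr<lbann::lbann_summary> summarizer) {
  using lbann::callback::categorical_accuracy_strategy;
  auto strategy = std::make_unique<categorical_accuracy_strategy>(
      "top1_accuracy",                                    // hypothetical accuracy layer
      categorical_accuracy_strategy::MatchType::NOMATCH,  // dump misclassified images
      /*num_images=*/10);
  return std::make_unique<lbann::callback::summarize_images>(
      summarizer, std::move(strategy),
      /*img_source_layer_name=*/"images",
      /*interval=*/1, /*img_format=*/".jpg");
}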
+// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// summary .hpp .cpp - Callback hooks to summarize to Tensorboard +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_SUMMARY_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_SUMMARY_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" +#include "lbann/utils/summary.hpp" + +namespace lbann { +namespace callback { + +/** + * Summarize information to Tensorboard using LBANN's summary interface. + */ +class summary : public callback_base { + public: + /** + * @param summarizer The summary object to write to; this callback takes + * ownership of it. + * @param batch_interval The frequency with which to summarize + * @param mat_interval FIXME + * @todo Document mat_interval parameter. + */ + summary(const std::shared_ptr& summarizer, int batch_interval = 1, + int mat_interval = 25); + summary(const summary&) = default; + summary& operator=(const summary&) = default; + summary* copy() const override { + return new summary(*this); + } + void on_train_begin(model *m) override; + void on_batch_end(model *m) override; + void on_epoch_end(model *m) override; + void on_test_end(model *m) override; + std::string name() const override { return "summary"; } + +protected: + /** Write out histograms from the model's layers. */ + void save_histograms(model *m); + +private: + /** @brief lbann_summary */ + std::shared_ptr m_summarizer = nullptr; + + /** Interval for doing matrix summarization. */ + int m_mat_interval; +}; + +// Builder function +std::unique_ptr +build_summary_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_SUMMARY_HPP_INCLUDED diff --git a/include/lbann/callbacks/sync_layers.hpp b/include/lbann/callbacks/sync_layers.hpp new file mode 100644 index 00000000000..6fa78b5ebb9 --- /dev/null +++ b/include/lbann/callbacks/sync_layers.hpp @@ -0,0 +1,87 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. 
See the License for the specific language governing +// permissions and limitations under the license. +// +// callback_sync_layers.hpp - Callback to synchronize layers +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_SYNC_LAYERS_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_SYNC_LAYERS_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** Synchronize layers after forward and backward prop. + * Additionally updates layer timing information to account for this. + * Note that this callback should come before the summarizer callback to report + * time correctly (otherwise it will be shifted by one mini-batch). + */ +class sync_layers : public callback_base { + public: + /** + * @param sync_gpus The GPU stream will be synchronized. + * @param sync_mpi A global barrier will synchronize processes. + * @param only_input The only synchronization will be after the input layer in + * forward prop. + */ + sync_layers(bool sync_gpus = true, bool sync_mpi = true, + bool only_input = false) : + callback_base(1), m_sync_gpus(sync_gpus), m_sync_mpi(sync_mpi), + m_only_input(only_input) {} + sync_layers(const sync_layers&) = default; + sync_layers& operator=( + const sync_layers&) = default; + sync_layers* copy() const override { + return new sync_layers(*this); + } + std::string name() const override { return "sync_layers"; } + + using callback_base::on_forward_prop_end; + using callback_base::on_backward_prop_end; + + void on_forward_prop_end(model *m, Layer *l) override; + void on_backward_prop_end(model *m, Layer *l) override; + + protected: + /** Whether to synchronize GPUs. */ + bool m_sync_gpus; + /** Whether to do a global synchronization. */ + bool m_sync_mpi; + /** Whether to only synchronize after the input layer. */ + bool m_only_input; + + virtual void do_sync(Layer *l); +}; + +// Builder function +std::unique_ptr +build_sync_layers_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_SYNC_LAYERS_HPP_INCLUDED diff --git a/include/lbann/callbacks/timeline.hpp b/include/lbann/callbacks/timeline.hpp new file mode 100644 index 00000000000..5b247070fab --- /dev/null +++ b/include/lbann/callbacks/timeline.hpp @@ -0,0 +1,99 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
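// --- Illustrative sketch (not part of the diff): the sync_layers callback above inserts
// a GPU-stream and/or MPI synchronization after each layer's forward and backward pass so
// that per-layer timings are attributed correctly. Only the constructor shown is used.
#include "lbann/callbacks/sync_layers.hpp"

lbann::callback::sync_layers make_sync_layers_example() {
  // Synchronize the GPU stream but skip the global MPI barrier, and do so after every
  // layer rather than only the input layer. Register this ahead of any summarizer
  // callback, as the class comment above recommends.
  return lbann::callback::sync_layers(/*sync_gpus=*/true,
                                      /*sync_mpi=*/false,
                                      /*only_input=*/false);
}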
+// +// callback_timeline .hpp .cpp - Callback hooks to record a timeline of runtime +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_TIMELINE_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_TIMELINE_HPP_INCLUDED + +#include +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** + * Record a timeline of training runtime on each rank and output it to a + * logfile for external processing. + * The logfile is named timeline.m\.\.txt. + * Each line is a separate event, written as name:start-time:end-time. + * Times are relative to the beginning of training. + */ +class timeline : public callback_base { + public: + timeline(std::string outdir) : callback_base(1), + m_outdir(outdir) {} + timeline(const timeline&) = default; + timeline& operator=(const timeline&) = default; + timeline* copy() const override { + return new timeline(*this); + } + std::string name() const override { return "timeline"; } + void on_train_begin(model *m) override; + void on_train_end(model *m) override; + + using callback_base::on_forward_prop_begin; + using callback_base::on_forward_prop_end; + using callback_base::on_backward_prop_begin; + using callback_base::on_backward_prop_end; + using callback_base::on_optimize_begin; + using callback_base::on_optimize_end; + + void on_forward_prop_begin(model *m, Layer *l) override; + void on_forward_prop_end(model *m, Layer *l) override; + void on_backward_prop_begin(model *m, Layer *l) override; + void on_backward_prop_end(model *m, Layer *l) override; + void on_optimize_begin(model *m, weights *w) override; + void on_optimize_end(model *m, weights *w) override; + private: + /// Get time relative to the start time. + EvalType get_rel_time() const { return get_time() - m_start_time; } + + /// Directory to write output to. + std::string m_outdir; + /// Time training started; all times are relative to this. + EvalType m_start_time = EvalType(0); + /// Time the current layer's forward pass started. + EvalType m_fp_start_time = EvalType(0); + /// Time the current layer's backward pass started. + EvalType m_bp_start_time = EvalType(0); + /// Time the current weights' optimization pass started. + EvalType m_opt_start_time = EvalType(0); + /// Store (relative) timing information. + std::unordered_map>> m_fp_times; + std::unordered_map>> m_bp_times; + std::unordered_map>> m_opt_times; +}; + +// Builder function +std::unique_ptr +build_timeline_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_TIMELINE_HPP_INCLUDED diff --git a/include/lbann/callbacks/timer.hpp b/include/lbann/callbacks/timer.hpp new file mode 100644 index 00000000000..2afcf03d23f --- /dev/null +++ b/include/lbann/callbacks/timer.hpp @@ -0,0 +1,113 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. 
+// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_TIMER_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_TIMER_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" +#include +#include +#include + +namespace lbann { +namespace callback { + +/** Record and report model timing results. + * Reports the total time and mini-batch time statistics for training + * epochs and for model evaluations. This reports times for the + * master process in each model. + */ +class timer : public callback_base { +public: + + timer(const std::shared_ptr& summarizer = nullptr) + : callback_base(1) {} + timer(const timer&) = default; + timer& operator=(const timer&) = default; + timer* copy() const override { + return new timer(*this); + } + + /** Start timing for a training epoch. */ + void on_epoch_begin(model *m) override { timing_begin(*m); } + /** Report timing for a training epoch. */ + void on_epoch_end(model *m) override { timing_end(*m); } + /** Start timing for validation. */ + void on_validation_begin(model *m) override { timing_begin(*m); } + /** Report timing for validation. */ + void on_validation_end(model *m) override { timing_end(*m); } + /** Start timing for testing. */ + void on_test_begin(model *m) override { timing_begin(*m); } + /** Report timing for testing. */ + void on_test_end(model *m) override { timing_end(*m); } + /** Record training mini-batch start time. */ + void on_batch_begin(model *m) override { batch_timing_begin(*m); } + /** Record training mini-batch run time. */ + void on_batch_end(model *m) override { batch_timing_end(*m); } + /** Record evaluation mini-batch start time. */ + void on_batch_evaluate_begin(model *m) override { batch_timing_begin(*m); } + /** Record evaluation mini-batch run time. */ + void on_batch_evaluate_end(model *m) override { batch_timing_end(*m); } + + /** Callback name. */ + std::string name() const override { return "timer"; } + +private: + + /** Timing session start times. */ + std::map m_start_times; + /** Mini-batch timing session start times. */ + std::map m_batch_start_times; + /** Mini-batch times. */ + std::map> m_batch_times; + + /** Start timing session. */ + void timing_begin(const model& m); + /** End timing session. + * Prints results to standard output. + */ + void timing_end(model& m); + /** Start mini-batch timing session. */ + void batch_timing_begin(const model& m); + /** End mini-batch timing session. + * Prints results to standard output. 
+ */ + void batch_timing_end(const model& m); + + /** @brief lbann_summary */ + std::shared_ptr m_summarizer = nullptr; + +}; + +// Builder function +std::unique_ptr +build_timer_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_TIMER_HPP_INCLUDED diff --git a/include/lbann/callbacks/variable_minibatch.hpp b/include/lbann/callbacks/variable_minibatch.hpp new file mode 100644 index 00000000000..5bc5c37318b --- /dev/null +++ b/include/lbann/callbacks/variable_minibatch.hpp @@ -0,0 +1,160 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// lbann_variable_minibatch .hpp .cpp - Callback for variable-size mini-batches +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_VARIABLE_MINIBATCH_HPP_INCLUDED +#define LBANN_CALLBACKS_VARIABLE_MINIBATCH_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** + * Support changing the mini-batch size on different schedules. + * Implementations should override implement the abstract methods to define + * concrete schedules. + */ +class variable_minibatch : public callback_base { + public: + variable_minibatch(size_t starting_mbsize); + variable_minibatch( + const variable_minibatch&) = default; + variable_minibatch& operator=( + const variable_minibatch&) = default; + /// Set the initial mini-batch size. + void on_train_begin(model *m) override; + /// Potentially change the mini-batch size. + void on_epoch_end(model *m) override; + protected: + /** + * Implemented by child classes to provide the mini-batch/learning schedule. + * This is called at the end of every training epoch. If it returns false, + * no changes are made from the currently established schedule. + * If this returns true, the mini-batch size will be changed accordingly. + * If the mini-batch size is larger than the model's maximum mini-batch size, + * a warning is printed and the maximum mini-batch size is used. + * If new_lr also non-zero, the learning rate will be changed to new_lr, + * with a linear ramp time. (If ramp_time is 0, it is changed immediately.) + * Note changing the learning rate while in a ramp may lead to unexpected + * behavior; also be aware of interactions with other learning rate + * schedules. 
+ */ + virtual bool schedule(model *m, size_t& new_mbsize, float& new_lr, + size_t& ramp_time) = 0; + /// Change the learning rate of every layer in m to new_lr. + void change_learning_rate(model *m, float new_lr) const; + /// Get the current learning rate (assumes every layer has the same one). + float get_current_learning_rate(model *m) const; + + /// Initial mini-batch size. + size_t m_starting_mbsize; + /** + * The current mini-batch size for this epoch. + * This is kept separately from the model's get_current_mini_batch_size() + * method, as calling that in on_epoch_end returns the size of the last mini- + * batch, not the "base" mini-batch. + */ + size_t m_current_mini_batch_size; + /// Current number of epochs left to ramp the learning rate. + size_t m_ramp_count = 0; + /// Amount to increment the learning rate by when ramping. + float m_lr_incr = 0.0f; +}; + +/** + * Double the mini-batch size every set number of epochs. + * Also doubles the learning rate. + */ +class step_minibatch : public variable_minibatch { + public: + step_minibatch(size_t starting_mbsize, size_t step, + size_t ramp_time = 0); + step_minibatch(const step_minibatch&) = default; + step_minibatch& operator=( + const step_minibatch&) = delete; + step_minibatch* copy() const override { + return new step_minibatch(*this); + } + std::string name() const override { return "step minibatch"; } + protected: + bool schedule(model *m, size_t& new_mbsize, float& new_lr, size_t& ramp_time) override; + + private: + /// Number of epochs between mini-batch size increases. + size_t m_step; + /// Number of steps to ramp the learning rate over. + size_t m_ramp_time; +}; + +// Builder function +std::unique_ptr +build_step_minibatch_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +class minibatch_schedule : public variable_minibatch { + public: + /// Represents a step in a schedule of mini-batch sizes. + struct minibatch_step { + /// Epoch for this schedule to start. + size_t epoch; + /// Mini-batch size to use. + size_t mbsize; + /// Learning rate to use. + float lr; + /// Number of epochs to ramp the learning rate over. + size_t ramp_time; + minibatch_step(size_t _epoch, size_t _mbsize, float _lr, size_t _ramp_time) : + epoch(_epoch), mbsize(_mbsize), lr(_lr), ramp_time(_ramp_time) {} + }; + + minibatch_schedule( + size_t starting_mbsize, std::vector steps); + minibatch_schedule( + const minibatch_schedule&) = default; + minibatch_schedule& operator=( + const minibatch_schedule&) = delete; + minibatch_schedule* copy() const override { + return new minibatch_schedule(*this); + } + std::string name() const override { return "minibatch schedule"; } + protected: + bool schedule(model *m, size_t& new_mbsize, float& new_lr, size_t& ramp_time) override; + private: + /// Steps in the mini-batch schedule, stored in reverse sorted order. + std::vector m_steps; +}; + +// Builder function +std::unique_ptr +build_minibatch_schedule_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_VARIABLE_MINIBATCH_HPP_INCLUDED diff --git a/include/lbann/comm.hpp b/include/lbann/comm.hpp index 2ab72fe1273..1af406feea9 100644 --- a/include/lbann/comm.hpp +++ b/include/lbann/comm.hpp @@ -41,6 +41,11 @@ namespace lbann { +#ifdef LBANN_HAS_ALUMINUM +/** Convert an MPI_Op to an Aluminum reduction operator. */ +::Al::ReductionOperator mpi_op_to_al_op(El::mpi::Op op); +#endif + namespace Al { /** Dummy Aluminum backend. 
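// --- Illustrative sketch (not part of the diff): building the variable mini-batch
// callbacks declared above. The element type of the schedule vector is assumed to be
// minibatch_schedule::minibatch_step (template arguments were stripped in this diff
// view); the epochs, sizes, and learning rates below are hypothetical.
#include "lbann/callbacks/variable_minibatch.hpp"
#include <vector>

void variable_minibatch_example() {
  using lbann::callback::minibatch_schedule;
  using lbann::callback::step_minibatch;

  // Double the mini-batch size every 10 epochs, ramping the learning rate over 2 epochs.
  step_minibatch step_cb(/*starting_mbsize=*/64, /*step=*/10, /*ramp_time=*/2);

  // Or supply an explicit schedule: each entry is (epoch, mbsize, lr, ramp_time).
  std::vector<minibatch_schedule::minibatch_step> steps = {
    {10, 128, 0.02f, 1},
    {20, 256, 0.04f, 1},
  };
  minibatch_schedule sched_cb(/*starting_mbsize=*/64, steps);
  (void) step_cb; (void) sched_cb;
}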
*/ @@ -165,6 +170,14 @@ class lbann_comm { inline int get_world_rank(int trainer, int rank) const { return procs_per_trainer * trainer + rank; } + /** Return the "rank" of the trainer that this rank is in */ + inline int map_world_rank_to_trainer_rank(int world_rank) const { + return (world_rank / procs_per_trainer); + } + /** Return the "rank" within the trainer that this rank is in */ + inline int map_world_rank_to_rank_in_trainer(int world_rank) const { + return (world_rank % procs_per_trainer); + } /** Return the rank of the master process in this trainer. */ inline int get_trainer_master() const { return 0; @@ -412,6 +425,14 @@ class lbann_comm { El::mpi::AllGather(&src, 1, data.data(), 1, c, El::SyncInfo{}); } + /** + * Allgather for a single element over the world communicator; + * std::vector &data must be correctly sized prior to entry. + */ + template + void world_all_gather(T &src, std::vector &data) { + all_gather(src, data, get_world_comm()); + } /** * Allgather for a single element over the trainer communicator; * std::vector &data must be correctly sized prior to entry. @@ -702,18 +723,21 @@ class lbann_comm { bytes_received += count * sizeof(T) * (size_c - 1); } /** Matrix allreduce. */ - void allreduce(AbsMat& m, + template + void allreduce(El::AbstractMatrix& m, const El::mpi::Comm& c, El::mpi::Op op = El::mpi::SUM); /** Matrix allreduce. */ - void allreduce(AbsDistMat& m, + template + void allreduce(El::AbstractDistMatrix& m, const El::mpi::Comm& c, El::mpi::Op op = El::mpi::SUM); /** Non-blocking matrix allreduce. * If LBANN has not been built with Aluminum, then this calls a * blocking matrix allreduce. */ - void nb_allreduce(AbsMat& m, + template + void nb_allreduce(El::AbstractMatrix& m, const El::mpi::Comm& c, Al::request& req, El::mpi::Op op = El::mpi::SUM); @@ -721,7 +745,8 @@ class lbann_comm { * If LBANN has not been built with Aluminum, then this calls a * blocking matrix allreduce. */ - void nb_allreduce(AbsDistMat& m, + template + void nb_allreduce(El::AbstractDistMatrix& m, const El::mpi::Comm& c, Al::request& req, El::mpi::Op op = El::mpi::SUM); @@ -998,6 +1023,16 @@ class lbann_comm { return node_comm; } + /** + * Return a communicator containing num_per_group processors. + * + * This will attempt to pack processes so that the processes in each group + * are physically close together on the system. + * + * num_per_group must evenly divide the number of processors in the world. + */ + const El::mpi::Comm& get_packed_group_comm(int num_per_group) const; + /** Return true if rank (in comm) is on the local node. */ bool is_rank_node_local(int rank, const El::mpi::Comm& comm) const { // Translating to COMM_WORLD is typically constant time. @@ -1017,6 +1052,8 @@ class lbann_comm { El::mpi::Comm intertrainer_comm; /** Communicator for every process in the same compute node. */ El::mpi::Comm node_comm; + /** Packed group communicators. */ + mutable std::unordered_map group_communicators; /** Grid for this trainer. */ Grid *grid; /** Number of trainers. */ @@ -1040,11 +1077,6 @@ class lbann_comm { */ int threads_per_proc; -#ifdef LBANN_HAS_ALUMINUM - /** Convert an MPI_Op to an Aluminum reduction operator. */ - ::Al::ReductionOperator mpi_op_to_al_op(El::mpi::Op op); -#endif - // Various statistics counters. 
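// --- Illustrative sketch (not part of the diff): the world-rank mapping helpers added to
// lbann_comm above are plain integer arithmetic. A standalone mirror of the same
// formulas, assuming 4 processes per trainer:
#include <cassert>

int map_world_rank_to_trainer_rank(int world_rank, int procs_per_trainer) {
  return world_rank / procs_per_trainer;   // which trainer owns this rank
}
int map_world_rank_to_rank_in_trainer(int world_rank, int procs_per_trainer) {
  return world_rank % procs_per_trainer;   // position inside that trainer
}

void rank_mapping_example() {
  // World rank 10 with 4 processes per trainer lives in trainer 2, local rank 2.
  assert(map_world_rank_to_trainer_rank(10, 4) == 2);
  assert(map_world_rank_to_rank_in_trainer(10, 4) == 2);
}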
size_t num_trainer_barriers; size_t num_intertrainer_barriers; @@ -1112,6 +1144,25 @@ void lbann_comm::broadcast(const int root, std::string& str, const */ int get_rank_in_world(); +#ifndef LBANN_COMM_INSTANTIATE +#define PROTO(T) \ + extern template void lbann_comm::allreduce( \ + El::AbstractMatrix& m, const El::mpi::Comm& c, El::mpi::Op op); \ + extern template void lbann_comm::allreduce( \ + El::AbstractDistMatrix& m, const El::mpi::Comm& c, El::mpi::Op op); \ + extern template void lbann_comm::nb_allreduce( \ + El::AbstractMatrix& m, const El::mpi::Comm& c, Al::request& req, El::mpi::Op op); \ + extern template void lbann_comm::nb_allreduce( \ + El::AbstractDistMatrix& m, const El::mpi::Comm& c, Al::request& req, El::mpi::Op op) + +#define LBANN_INSTANTIATE_CPU_HALF +#define LBANN_INSTANTIATE_GPU_HALF +#include "lbann/macros/instantiate.hpp" +#undef PROTO +#undef LBANN_INSTANTIATE_CPU_HALF +#undef LBANN_INSTANTIATE_GPU_HALF +#endif // LBANN_COMM_INSTANTIATE + } // namespace lbann #endif // LBANN_COMM_HPP_INCLUDED diff --git a/include/lbann/data_coordinator/CMakeLists.txt b/include/lbann/data_coordinator/CMakeLists.txt new file mode 100644 index 00000000000..d41974adb29 --- /dev/null +++ b/include/lbann/data_coordinator/CMakeLists.txt @@ -0,0 +1,8 @@ +# Add the headers for this directory +set_full_path(THIS_DIR_HEADERS + data_coordinator.hpp + data_coordinator_metadata.hpp + ) + +# Propagate the files up the tree +set(HEADERS "${HEADERS}" "${THIS_DIR_HEADERS}" PARENT_SCOPE) diff --git a/include/lbann/data_coordinator/data_coordinator.hpp b/include/lbann/data_coordinator/data_coordinator.hpp new file mode 100644 index 00000000000..e4a9ae06c01 --- /dev/null +++ b/include/lbann/data_coordinator/data_coordinator.hpp @@ -0,0 +1,403 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_DATA_COORDINATOR_HPP +#define LBANN_DATA_COORDINATOR_HPP + +#include "lbann/data_coordinator/data_coordinator_metadata.hpp" +#include "lbann/utils/dataset.hpp" +#include "lbann/execution_contexts/execution_context.hpp" +#include +#include +#include +#include +#include + + +namespace lbann { + +// Forward-declare trainer +class trainer; + +class data_coordinator { + public: + using data_reader_map_t = std::map; + using io_buffer_map_t = std::map>; + + public: + data_coordinator(trainer& trainer, lbann_comm *comm) : + m_trainer(&trainer), + m_comm(comm), + m_data_set_processed(false), + m_execution_context(nullptr) {} + + ~data_coordinator() { + // Data coordinator always frees data readers. + for (auto& dr : m_data_readers) { + delete dr.second; + } + } + + // Data Coordinators copy their data readers. + data_coordinator(const data_coordinator& other) + : m_comm(other.m_comm), + m_training_dataset(other.m_training_dataset), + m_testing_dataset(other.m_testing_dataset), + m_validation_dataset(other.m_validation_dataset), + m_data_readers(other.m_data_readers), + m_execution_context(other.m_execution_context) { + for (auto& dr : m_data_readers) { + dr.second = dr.second ? dr.second->copy() : nullptr; + } + } + + data_coordinator& operator=(const data_coordinator& other) { + for (auto& dr : m_data_readers) { + dr.second = dr.second ? dr.second->copy() : nullptr; + } + return *this; + } + + /** Archive for checkpoint and restart */ + template void serialize( Archive & ar ) { + ar(/*CEREAL_NVP(m_io_buffer),*/ + CEREAL_NVP(m_training_dataset), + CEREAL_NVP(m_testing_dataset), + CEREAL_NVP(m_validation_dataset)/*, + CEREAL_NVP(m_data_readers), + CEREAL_NVP(m_data_set_processed)*/); + } + + void setup(int max_mini_batch_size, std::map data_readers); + + /** Check to see if there is a valid training context for the data coordinator */ + bool has_valid_execution_context() const { + return (m_execution_context != nullptr); + } + + /** Grab the training context of the data coordinator */ + const execution_context& get_execution_context() const { + if(m_execution_context == nullptr) { + LBANN_ERROR("execution context is not set"); + } + return *m_execution_context; + } + + /** Grab the training context of the data coordinator */ + execution_context& get_execution_context() { + return const_cast(static_cast(*this).get_execution_context()); + } + + //************************************************************************ + // Helper functions to access the data readers + //************************************************************************ + + generic_data_reader *get_data_reader(const execution_mode mode) const { + generic_data_reader *data_reader = nullptr; + + auto it = m_data_readers.find(mode); + if (it != m_data_readers.end()) data_reader = it->second; + + switch(mode) { + case execution_mode::training: + break; + case execution_mode::validation: + break; + case execution_mode::testing: + break; + default: + LBANN_ERROR("generic data distribution: invalid execution phase"); + } + return data_reader; + } + + /** + * Get the dimensions of the underlying data. 
+ */ + TargetModeDimMap get_data_dims() { + TargetModeDimMap map; + generic_data_reader *dr; + for(execution_mode mode : execution_mode_iterator()) { + dr = get_data_reader(mode); + if (dr != nullptr) { + map[data_reader_target_mode::INPUT] = dr->get_data_dims(); + map[data_reader_target_mode::CLASSIFICATION] = std::vector(1, dr->get_num_labels()); + map[data_reader_target_mode::REGRESSION] = std::vector(1, dr->get_num_responses()); + map[data_reader_target_mode::RECONSTRUCTION] = dr->get_data_dims(); + map[data_reader_target_mode::NA] = std::vector(1, 0); + return map; + } + } + LBANN_ERROR("get_data_dims: no available data readers"); + return {}; + } + + /** + * Get the dimensions of the underlying data. + */ + SPModeSlicePoints get_slice_points() { + SPModeSlicePoints map; + generic_data_reader *dr; + for(execution_mode mode : execution_mode_iterator()) { + dr = get_data_reader(mode); + if (dr != nullptr) { + for(slice_points_mode sp_mode : slice_points_mode_iterator()) { + bool is_supported; + std::vector tmp = dr->get_slice_points(sp_mode, is_supported); + if(is_supported) { + map[sp_mode] = tmp; + } + } + return map; + } + } + LBANN_ERROR("get_data_dims: no available data readers"); + return {}; + } + + DataReaderMetaData get_dr_metadata() { + DataReaderMetaData drm; + drm.data_dims = get_data_dims(); + drm.slice_points = get_slice_points(); + return drm; + } + + // At the start of the epoch, set the execution mode and make sure + // that each layer points to this model + void reset_mode(execution_context& context) { + m_execution_context = static_cast>(&context); + } + + //************************************************************************ + // Helper functions to access the dataset statistics + //************************************************************************ + dataset& get_dataset(execution_mode m) { + switch(m) { + case execution_mode::training: + return m_training_dataset; + break; + case execution_mode::validation: + return m_validation_dataset; + break; + case execution_mode::testing: + return m_testing_dataset; + break; + default: + LBANN_ERROR("get_dataset: invalid execution mode"); + } + } + + const dataset& get_dataset(execution_mode m) const { + switch(m) { + case execution_mode::training: + return m_training_dataset; + break; + case execution_mode::validation: + return m_validation_dataset; + break; + case execution_mode::testing: + return m_testing_dataset; + break; + default: + LBANN_ERROR("get_dataset: invalid execution mode"); + } + } + + /** + * Return the first dataset with a valid (non-null) datareader. + * Returns null if none are valid. 
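// --- Illustrative sketch (not part of the diff): consuming the metadata assembled by
// get_dr_metadata() above. The mapped type of TargetModeDimMap is assumed to be a
// std::vector of dimensions (template arguments were stripped in this diff view).
#include "lbann/data_coordinator/data_coordinator_metadata.hpp"
#include <iostream>

void print_classification_dims(lbann::DataReaderMetaData const& drm) {
  // For CLASSIFICATION the data coordinator stores a single dimension: the label count.
  auto const& dims = drm.data_dims.at(lbann::data_reader_target_mode::CLASSIFICATION);
  std::cout << "number of labels: " << dims.front() << std::endl;
}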
+ */ + dataset* select_first_valid_dataset() { + if (m_data_readers[execution_mode::training]) { + return &m_training_dataset; + } else if (m_data_readers[execution_mode::validation]) { + return &m_validation_dataset; + } else if (m_data_readers[execution_mode::testing]) { + return &m_testing_dataset; + } else { + return nullptr; + } + } + + long get_num_samples_trained() const { + return m_training_dataset.get_num_samples_processed(); + } + long get_num_samples_tested() const { + return m_testing_dataset.get_num_samples_processed(); + } + long get_total_num_training_samples() const { + return m_training_dataset.get_total_samples(); + } + long get_total_num_testing_samples() const { + return m_testing_dataset.get_total_samples(); + } + + //************************************************************************ + // + //************************************************************************ + + void calculate_num_iterations_per_epoch(int max_mini_batch_size, generic_data_reader *data_reader); + void calculate_num_iterations_per_epoch(int mini_batch_size); + + int compute_max_num_parallel_readers(long data_set_size, int mini_batch_size, int requested_num_parallel_readers) const; + static int compute_max_num_parallel_readers(long data_set_size, int mini_batch_size, int requested_num_parallel_readers, const lbann_comm* comm); + + //************************************************************************ + // + //************************************************************************ + + // save state of IO to a checkpoint + bool save_to_checkpoint_shared(persist& p) const { + // save state of data readers from input layer + data_reader_map_t::const_iterator it; + if(p.get_cb_type() == callback_type::execution_context_only + || p.get_cb_type() == callback_type::full_checkpoint){ + + it = this->m_data_readers.find(execution_mode::training); + if ((it != this->m_data_readers.end()) && it->second) { + (it->second)->save_to_checkpoint_shared(p, execution_mode::training); + } + it = this->m_data_readers.find(execution_mode::testing); + if ((it != this->m_data_readers.end()) && it->second) { + (it->second)->save_to_checkpoint_shared(p, execution_mode::testing); + } + it = this->m_data_readers.find(execution_mode::validation); + if ((it != this->m_data_readers.end()) && it->second) { + (it->second)->save_to_checkpoint_shared(p, execution_mode::validation); + } + + if (this->m_comm->am_trainer_master()) { + write_cereal_archive(*this, p, execution_mode::training, "_dc.xml"); + } + } + return true; + } + + // reload state of IO from a checkpoint + bool load_from_checkpoint_shared(persist& p) { + // save state of data readers from input layer + data_reader_map_t::const_iterator it; + if(p.get_cb_type() == callback_type::execution_context_only + || p.get_cb_type() == callback_type::full_checkpoint){ + + it = this->m_data_readers.find(execution_mode::training); + if ((it != this->m_data_readers.end()) && it->second) { + (it->second)->load_from_checkpoint_shared(p, execution_mode::training); + } + it = this->m_data_readers.find(execution_mode::testing); + if ((it != this->m_data_readers.end()) && it->second) { + (it->second)->load_from_checkpoint_shared(p, execution_mode::testing); + } + it = this->m_data_readers.find(execution_mode::validation); + if ((it != this->m_data_readers.end()) && it->second) { + (it->second)->load_from_checkpoint_shared(p, execution_mode::validation); + } + + std::string buf; + if (this->m_comm->am_trainer_master()) { + read_cereal_archive(*this, p, execution_mode::training, 
"_dc.xml"); + buf = create_cereal_archive_binary_string(*this); + } + + // TODO: this assumes homogeneous processors + // broadcast state from rank 0 + this->m_comm->trainer_broadcast(0, buf); + + if (!this->m_comm->am_trainer_master()) { + unpack_cereal_archive_binary_string(*this, buf); + } + } + + return true; + } + + bool save_to_checkpoint_distributed(persist& p) const { + // save state of data readers from input layer + data_reader_map_t::const_iterator it; + if(p.get_cb_type() == callback_type::execution_context_only + || p.get_cb_type() == callback_type::full_checkpoint) { + + it = this->m_data_readers.find(execution_mode::training); + if ((it != this->m_data_readers.end()) && it->second) { + (it->second)->save_to_checkpoint_distributed(p, execution_mode::training); + } + it = this->m_data_readers.find(execution_mode::testing); + if ((it != this->m_data_readers.end()) && it->second) { + (it->second)->save_to_checkpoint_distributed(p, execution_mode::testing); + } + it = this->m_data_readers.find(execution_mode::validation); + if ((it != this->m_data_readers.end()) && it->second) { + (it->second)->save_to_checkpoint_distributed(p, execution_mode::validation); + } + + write_cereal_archive(*this, p, execution_mode::training, "_dc.xml"); + } + return true; + } + + bool load_from_checkpoint_distributed(persist& p) { + // save state of data readers from input layer + data_reader_map_t::const_iterator it; + it = this->m_data_readers.find(execution_mode::training); + if ((it != this->m_data_readers.end()) && it->second) { + (it->second)->load_from_checkpoint_distributed(p, execution_mode::training); + } + it = this->m_data_readers.find(execution_mode::testing); + if ((it != this->m_data_readers.end()) && it->second) { + (it->second)->load_from_checkpoint_distributed(p, execution_mode::testing); + } + it = this->m_data_readers.find(execution_mode::validation); + if ((it != this->m_data_readers.end()) && it->second) { + (it->second)->load_from_checkpoint_distributed(p, execution_mode::validation); + } + + read_cereal_archive(*this, p, execution_mode::training, "_dc.xml"); + return true; + } + + protected: + /** Pointer to hosting trainer */ + trainer *m_trainer; + /** Pointer to LBANN communicator. */ + lbann_comm *m_comm; + + dataset m_training_dataset; + dataset m_testing_dataset; + dataset m_validation_dataset; + + data_reader_map_t m_data_readers; + // std::map m_dataset_stats; +public: // @todo BVE FIXME + bool m_data_set_processed; + std::mutex dr_mutex; + + /** Pointer to the execution context object used for training or evaluating this model */ + observer_ptr m_execution_context; +}; + +} // namespace lbann + +#endif // LBANN_DATA_COORDINATOR_HPP diff --git a/include/lbann/data_coordinator/data_coordinator_metadata.hpp b/include/lbann/data_coordinator/data_coordinator_metadata.hpp new file mode 100644 index 00000000000..d9c37f23527 --- /dev/null +++ b/include/lbann/data_coordinator/data_coordinator_metadata.hpp @@ -0,0 +1,64 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. 
+// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_DATA_COORDINATOR_METADATA_HPP +#define LBANN_DATA_COORDINATOR_METADATA_HPP + +#include + +#include "lbann/utils/enum_iterator.hpp" + +#include +#include +#include + +namespace lbann { + +// NA - Not applicable, used for input layers that don't produce a second output +enum class data_reader_target_mode {CLASSIFICATION, REGRESSION, RECONSTRUCTION, INPUT, NA}; +std::string to_string(data_reader_target_mode m); +/// Map from target modes to dimension maps +using TargetModeDimMap = std::unordered_map>; +using data_reader_target_mode_iterator = enum_iterator; + +enum class slice_points_mode {INDEPENDENT, DEPENDENT, NA}; +std::string to_string(const slice_points_mode m); +slice_points_mode slice_points_mode_from_string(const std::string& m); +/// Map from slice points modes to slice points +using SPModeSlicePoints = std::unordered_map>; +using slice_points_mode_iterator = enum_iterator; + +/// Data structure containing metadata from the data readers +//using DataReaderMetaData = std::pair; + +struct DataReaderMetaData { + TargetModeDimMap data_dims; + SPModeSlicePoints slice_points; +}; + +} // namespace lbann + +#endif // LBANN_DATA_COORDINATOR_METADATA_HPP diff --git a/include/lbann/data_readers/CMakeLists.txt b/include/lbann/data_readers/CMakeLists.txt index f6d513de63a..7d56b6ebf46 100644 --- a/include/lbann/data_readers/CMakeLists.txt +++ b/include/lbann/data_readers/CMakeLists.txt @@ -1,42 +1,23 @@ # Add the headers for this directory set_full_path(THIS_DIR_HEADERS compound_data_reader.hpp - cv_augmenter.hpp - cv_colorizer.hpp - cv_decolorizer.hpp - cv_cropper.hpp - cv_mean_extractor.hpp - cv_normalizer.hpp - cv_process.hpp - cv_process_patches.hpp - cv_transform.hpp - cv_utils.hpp data_reader.hpp - data_reader_ascii.hpp data_reader_cifar10.hpp data_reader_csv.hpp data_reader_image.hpp data_reader_imagenet.hpp - data_reader_imagenet_patches.hpp data_reader_merge_features.hpp data_reader_merge_samples.hpp data_reader_mnist.hpp - data_reader_moving_mnist.hpp data_reader_nci.hpp data_reader_numpy.hpp data_reader_numpy_npz.hpp + data_reader_numpy_npz_conduit.hpp data_reader_pilot2_molecular.hpp data_reader_python.hpp data_reader_synthetic.hpp - image_preprocessor.hpp - image_utils.hpp - opencv.hpp - opencv_extensions.hpp - data_reader_multihead_siamese.hpp + data_reader_smiles.hpp ) -# Add the subdirectories -add_subdirectory(patchworks) - # Propagate the files up the tree set(HEADERS "${HEADERS}" "${THIS_DIR_HEADERS}" PARENT_SCOPE) diff --git a/include/lbann/data_readers/cv_augmenter.hpp b/include/lbann/data_readers/cv_augmenter.hpp deleted file mode 100644 index ba584ab18fe..00000000000 --- a/include/lbann/data_readers/cv_augmenter.hpp +++ /dev/null @@ -1,114 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. 
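The new data_coordinator_metadata.hpp header above centralizes the target-mode and slice-point bookkeeping that get_dr_metadata() packages up. A short, hypothetical consumer is sketched below; it assumes data_reader_target_mode_iterator can be default-constructed and ranged over, mirroring how get_slice_points() uses slice_points_mode_iterator(), and again assumes std::vector<int> as the dimension type.

// Sketch only: iterate every target mode recorded in a DataReaderMetaData.
// Assumes the enum_iterator alias supports range-for, as its use in
// get_slice_points() suggests, and that dimensions are std::vector<int>.
#include <iostream>

void dump_metadata(const lbann::DataReaderMetaData& drm) {
  for (lbann::data_reader_target_mode mode :
       lbann::data_reader_target_mode_iterator()) {
    auto it = drm.data_dims.find(mode);
    if (it == drm.data_dims.end()) { continue; }
    std::cout << lbann::to_string(mode) << ":";
    for (int d : it->second) { std::cout << ' ' << d; }
    std::cout << '\n';
  }
}

The slice_points map can be walked the same way with slice_points_mode_iterator().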
-// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// cv_augmenter .cpp .hpp - Augmenting functions for images in opencv format -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CV_AUGMENTER_HPP -#define LBANN_CV_AUGMENTER_HPP - -#include "cv_transform.hpp" -#include -#include -#include -#include - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -/** - * Supports the following transforms: - * - Random horizontal and vertical flips - * - Random rotations - * - Random horizontal and vertical shifts - * - Random shearing - */ -class cv_augmenter : public cv_transform { - protected: - // --- configuration variables --- - /** Whether to do horizontal flips. */ - bool m_do_horizontal_flip; - /** Whether to do vertical flips. */ - bool m_do_vertical_flip; - - /** Range in degrees for rotations (0-180). */ - float m_rotation_range; - /** Range (fraction of total width) for horizontal shifts. */ - float m_horizontal_shift_range; - /** Range (fraction of total height) for vertical shifts. */ - float m_vertical_shift_range; - /** Shear angle (radians). */ - float m_shear_range; - - // --- state variables --- - /// Flip decision made - cv_flipping m_flip; // currently more of a configuration variable but can easily become a state variable - /// The rest of the affine tranformations determined - cv::Mat_ m_trans; - - /// Check if there is a reason to enable. (i.e., any option set) - bool check_to_enable() const override; - - public: - cv_augmenter(); - cv_augmenter(const cv_augmenter& rhs); - cv_augmenter& operator=(const cv_augmenter& rhs); - cv_augmenter* clone() const override; - - ~cv_augmenter() override {} - - /// Set the parameters all at once - void set(const bool hflip, const bool vflip, const float rot, - const float hshift, const float vshift, const float shear); - - /// Clear the states of the previous transform applied - void reset() override; - - /** - * Construct an affine transformation matrix based on the options and random - * numbers. If successful, the tranform is enabled. If not, it is disabled. - * @return false if not enabled or unsuccessful. - */ - bool determine_transform(const cv::Mat& image) override; - - /// Augmentation is irreversible. Thus, this has no effect. - bool determine_inverse_transform() override { return false; } - - /** - * Apply the transformation determined. - * As this method is executed, the transform becomes deactivated. - * @return false if not successful. 
- */ - bool apply(cv::Mat& image) override; - - std::string get_type() const override { return "augmenter"; } - std::string get_description() const override; - std::ostream& print(std::ostream& os) const override; -}; - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV - -#endif // LBANN_CV_AUGMENTER_HPP diff --git a/include/lbann/data_readers/cv_colorizer.hpp b/include/lbann/data_readers/cv_colorizer.hpp deleted file mode 100644 index 7d667f9cca5..00000000000 --- a/include/lbann/data_readers/cv_colorizer.hpp +++ /dev/null @@ -1,81 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// cv_colorizer .cpp .hpp - transform a non-color (grayscale) image into a -// 3-channel color image -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CV_COLORIZER_HPP -#define LBANN_CV_COLORIZER_HPP - -#include "cv_transform.hpp" - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -class cv_colorizer : public cv_transform { - protected: - // --- state variables --- - bool m_gray; ///< whether an image is monochrome or not - - public: - cv_colorizer() : cv_transform(), m_gray(false) {} - cv_colorizer(const cv_colorizer& rhs); - cv_colorizer& operator=(const cv_colorizer& rhs); - cv_colorizer *clone() const override; - - ~cv_colorizer() override {} - - void set() { reset(); } - void reset() override { - m_enabled = false; - m_gray = false; - } - - /** - * If a given image is in grayscale, the tranform is enabled, and not otherwise. - * @return false if not enabled or unsuccessful. - */ - bool determine_transform(const cv::Mat& image) override; - - /// convert back to color image if it used to be a grayscale image - bool determine_inverse_transform() override; - - /** - * Apply color conversion if enabled. - * As it is applied, the transform becomes deactivated. - * @return false if not successful. 
- */ - bool apply(cv::Mat& image) override; - - std::string get_type() const override { return "colorizer"; } - std::string get_description() const override; - std::ostream& print(std::ostream& os) const override; -}; - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV - -#endif // LBANN_CV_COLORIZER_HPP diff --git a/include/lbann/data_readers/cv_cropper.hpp b/include/lbann/data_readers/cv_cropper.hpp deleted file mode 100644 index 651e7945d5b..00000000000 --- a/include/lbann/data_readers/cv_cropper.hpp +++ /dev/null @@ -1,121 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// cv_cropper .cpp .hpp - Functions to crop images -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CV_CROPPER_HPP -#define LBANN_CV_CROPPER_HPP - -#include "lbann/data_readers/cv_transform.hpp" -#include -#include - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -/** - * If the size of a region of interest (ROI) is defined, use the area at the - * center of a given image. Otherwise, use the entire image. - * Zoom in/out the image if necessary to cover the ROI. Then, crop out an area - * of the desired size from the region either randomly within the ROI or at the - * center depending on the given specification. - */ -class cv_cropper : public cv_transform { - protected: - // --- configuration variables --- - unsigned int m_width; ///< desired width of an image - unsigned int m_height; ///< desired height of an image - /// randomize the center position of the area of interest - bool m_rand_crop; - /// indicate if a specific ROI is set or supposed to use whole image - bool m_is_roi_set; - /// The size of the initial region of interest to crop from - std::pair m_roi_size; - - // --- state variables --- - double m_zoom; ///< zoom factor to prepare the initial region for a given image - /** Three modes of pixel interpolation: INTER_LINEAR, INTER_AREA, and INTER_LINEAR - * The first choice is the default when not adaptive. The other two are used when - * interpolatng adaptively. 
The second is when shrinking, and the third is when enlarging - */ - static const int m_interpolation_choices[3]; - int m_interpolation; ///< id of the channel value interpolation method used - bool m_adaptive_interpolation; ///< whether to use adaptive interpolation - - void unset_roi(); - - public: - cv_cropper(); - cv_cropper(const cv_cropper& rhs) = default; - cv_cropper& operator=(const cv_cropper& rhs) = default; - cv_cropper *clone() const override; - ~cv_cropper() override {} - - /** - * Set the parameters all at once - * @param width desired width of the crop - * @param height desired height of the crop - * @param random_crop whether to crop randomly from the initial region of interest or at the center - * @param roi the size of the initial region of interest to crop from. Set (0,0) to use the full image. - * @param adaptive_interpolation whether to apply a different interpolation method depending on how an image is resized - */ - void set(const unsigned int width, const unsigned int height, - const bool random_crop = false, - const std::pair& roi = std::make_pair(0,0), - const bool adaptive_interpolation = false); - - unsigned int get_crop_width() const { return m_width; } - unsigned int get_crop_height() const { return m_height; } - - /// Clear the states of the previous transform applied - void reset() override; - - /** - * Construct transformation parameters based on the options and random - * numbers. If successful, the tranform is enabled.If not, it is disabled. - * @return false if not enabled or unsuccessful. - */ - bool determine_transform(const cv::Mat& image) override; - - /// Cropping is irreversible. Thus, this has no effect. - bool determine_inverse_transform() override { return false; } - - /** - * Apply the transformation determined. - * As this method is executed, the transform becomes deactivated. - * @return false if not successful. - */ - bool apply(cv::Mat& image) override; - - std::string get_type() const override { return "cropper"; } - std::string get_description() const override; - std::ostream& print(std::ostream& os) const override; -}; - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV - -#endif // LBANN_CV_CROPPER_HPP diff --git a/include/lbann/data_readers/cv_decolorizer.hpp b/include/lbann/data_readers/cv_decolorizer.hpp deleted file mode 100644 index 18e09aea0cf..00000000000 --- a/include/lbann/data_readers/cv_decolorizer.hpp +++ /dev/null @@ -1,84 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
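Both the cv_cropper removed here and the cv_resizer removed further below document the same adaptive-interpolation rule: a default mode when adaptive interpolation is off, an area-based filter when shrinking, and a different filter when enlarging. The sketch below only illustrates that rule; the actual constants the removed code stored in m_interpolation_choices are not shown in this patch, so the enlarging case is an assumption.

// Illustration of the documented rule, not the removed implementation.
// `scale` is the ratio of output size to input size.
#include <opencv2/imgproc.hpp>

int pick_interpolation(double scale, bool adaptive) {
  if (!adaptive) { return cv::INTER_LINEAR; }   // default mode
  return (scale < 1.0) ? cv::INTER_AREA         // shrinking
                       : cv::INTER_CUBIC;       // enlarging (assumed)
}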
-// -// cv_decolorizer .cpp .hpp - transform a color image into a single-channel -// monochrome image -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CV_DECOLORIZER_HPP -#define LBANN_CV_DECOLORIZER_HPP - -#include "lbann_config.hpp" -#include "cv_transform.hpp" - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -class cv_decolorizer : public cv_transform { - protected: - // --- state variables --- - bool m_color; ///< whether an image is color or not - /// Method to used: either pick one channel, or mix BGR channels (default) - bool m_pick_1ch; - - public: - cv_decolorizer() : cv_transform(), m_color(false), m_pick_1ch(false) {} - cv_decolorizer(const cv_decolorizer& rhs); - cv_decolorizer& operator=(const cv_decolorizer& rhs); - cv_decolorizer *clone() const override; - - ~cv_decolorizer() override {} - - void set(const bool pick_1ch); - void reset() override { - m_enabled = false; - m_color = false; - } - - /** - * If a given image is in color, the tranform is enabled, and not otherwise. - * @return false if not enabled or unsuccessful. - */ - bool determine_transform(const cv::Mat& image) override; - - /// The decolorizing transform is irreversible. Thus, this has no effect. - bool determine_inverse_transform() override { return false; } - - /** - * Convert a color image to a monochrome image if enabled. - * As it is applied, the transform becomes deactivated. - * @return false if not successful. - */ - bool apply(cv::Mat& image) override; - - std::string get_type() const override { return "decolorizer"; } - std::string get_description() const override; - std::ostream& print(std::ostream& os) const override; -}; - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV - -#endif // LBANN_CV_DECOLORIZER_HPP diff --git a/include/lbann/data_readers/cv_mean_extractor.hpp b/include/lbann/data_readers/cv_mean_extractor.hpp deleted file mode 100644 index eef53a0afa5..00000000000 --- a/include/lbann/data_readers/cv_mean_extractor.hpp +++ /dev/null @@ -1,157 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// cv_mean_extractor .cpp .hpp - accumulate mean over the image set -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CV_MEAN_EXTRACTOR_HPP -#define LBANN_CV_MEAN_EXTRACTOR_HPP - -#include "cv_transform.hpp" -#include - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -/** - * Computes a cumulative pixel-wise average of a stream of images. 
- * It is assumed that the images have the same size and the same number of - * channels. However, they are not required to have the same channel depth. - * If a channel value is an integral type, it is normalized to a floating - * point number of type Float_T between 0 and 1 (inclusive at both ends). - * If a channel value is already in a floating point type, the value is used - * without normalization. - * Images accumulate per pixel and a mean image is obtained by dividing each - * pixel accumulation by the total number of images (if m_batch_size is larger - * than the number of all the images observed). The current mean of images can - * be obtained at any point during the operation by the member function - * extract(). This returns the image normalized to the range of - * channel type, Channel_T. For example, if Channel_T is uint8_t, the range of - * mean values from 0.0 to 1.0 maps to the range from 0 to 256. - * To cope with a large number of images, one might rely on semi-moving average - * method. Up to m_batch_size number of images accumulate aa a batch while the - * moving average of batches is computed upon request by calling extract(). - * This is particularly useful when Float_T is single precision with a limited - * number of bits to represent a wide range of numbers and the images have a - * large bit depth. - */ -class cv_mean_extractor : public cv_transform { - public: - /// type of image statistics value accumulated - using Float_T = double; - static const unsigned int m_default_batch_size = 65536u; - - protected: - // --- configuration variables --- - unsigned int m_batch_size; ///< number of samples per batch - - // --- state variables --- - unsigned int m_batch_cnt; ///< number of complete batches - unsigned int m_partial_cnt; ///< number of samples currently contributing towards a batch - /// OpenCv type code used to create m_sum and m_avg based on Float_T and the number of channels - int m_type_code; - cv::Mat m_sum; ///< partial batch accumulated so far - cv::Mat m_avg; ///< cumulative moving average - - /// create the matrices for accumulating image statistics - void create_matrices(const unsigned int width, const unsigned int height, const unsigned int n_ch); - - public: - cv_mean_extractor(); - cv_mean_extractor(const cv_mean_extractor& rhs); - cv_mean_extractor& operator=(const cv_mean_extractor& rhs); - cv_mean_extractor *clone() const override; - - ~cv_mean_extractor() override {} - - void set(const unsigned int width, const unsigned int height, const unsigned int n_ch, - const unsigned int batch_sz = cv_mean_extractor::m_default_batch_size); - void set(const unsigned int batch_sz); - void reset() override; - - bool determine_transform(const cv::Mat& image) override; - /// The transform does not modify the image. Thus, this has no effect. - bool determine_inverse_transform() override; - bool apply(cv::Mat& image) override; - - template - cv::Mat extract() const; - - std::string get_type() const override { return "mean extractor"; } - std::string get_description() const override; - std::ostream& print(std::ostream& os) const override; -}; - -/** - * Convert the maxtrix representing the cumulative moving average of images - * observed so far into an image with the channel type 'Channel_T'. The default - * is uint8_t. If it is given as void, the matrix is returned as is. 
- */ -template -inline cv::Mat cv_mean_extractor::extract() const { - cv::Mat avg_so_far; - if (m_partial_cnt == 0u) { - avg_so_far = m_avg; - } else { - cv::addWeighted(m_avg, m_batch_cnt/static_cast(m_batch_cnt+1), - m_sum, 1/static_cast((m_batch_cnt + 1) * m_partial_cnt), - 0.0, avg_so_far, m_type_code); - } - - if (avg_so_far.empty()) return cv::Mat(); - - if (std::is_void::value) return avg_so_far; - - double minVal = 0.0; - double maxVal = 0.0; - cv::minMaxLoc(avg_so_far, &minVal, &maxVal, nullptr, nullptr); - //const double max_channel_type = std::numeric_limits::max(); - const double max_channel_type = depth_normalization::inverse_factor(); - - cv::Mat recovered; - if ((minVal < 0.0) || (maxVal > 1.0)) { - // This condition may rise either because of unnormalized images with raw - // floating point values or because of precision error. In these cases, - // the minimum value maps to 0 and the maximum value maps to the greatest - // value of Channel_T - const double range = maxVal-minVal; - if (range == 0.0) return cv::Mat(); - const double alpha = max_channel_type/range; - const double beta = - alpha*minVal; - avg_so_far.convertTo(recovered, cv_image_type::T(), - alpha, beta); - } else { - // In this case, 0 maps to 0, and 1 maps to the greatest value of Channel_T - avg_so_far.convertTo(recovered, cv_image_type::T(), - max_channel_type, 0.0); - } - - return recovered; -} - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV - -#endif // LBANN_CV_MEAN_EXTRACTOR_HPP diff --git a/include/lbann/data_readers/cv_normalizer.hpp b/include/lbann/data_readers/cv_normalizer.hpp deleted file mode 100644 index dfaf2954f89..00000000000 --- a/include/lbann/data_readers/cv_normalizer.hpp +++ /dev/null @@ -1,399 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
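cv_mean_extractor::extract(), shown just above, folds the partially filled batch into the running batch average with one weighted sum. A scalar version of that arithmetic is sketched here, treating all accumulators as double (the declared Float_T).

// Scalar sketch of the weighting in cv_mean_extractor::extract():
// `avg` is the mean over `batch_cnt` complete batches and `sum` is the raw
// accumulation of `partial_cnt` samples that have not yet closed a batch.
double combine_running_mean(double avg, double sum,
                            unsigned batch_cnt, unsigned partial_cnt) {
  if (partial_cnt == 0u) { return avg; }  // nothing pending
  return avg * (batch_cnt / static_cast<double>(batch_cnt + 1)) +
         sum * (1.0 / static_cast<double>((batch_cnt + 1) * partial_cnt));
}

With batch_cnt == 0 the first term drops out and the result is simply the mean of the pending samples, which is why a mean image can be requested at any point during accumulation.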
-// -// lbann_cv_normalizer .cpp .hpp - Normalizing functions for images -// in opencv format -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CV_NORMALIZER_HPP -#define LBANN_CV_NORMALIZER_HPP - -#include // typeid -#include "cv_transform.hpp" -#include "lbann/base.hpp" // DataType -#include "lbann/utils/mild_exception.hpp" - -#ifdef LBANN_HAS_OPENCV -namespace lbann { -/** - * Modifies the channel values of each pixel according to the chosen normalization - * strategies: - * - Standardize to 0 mean - * - Standardize to unit variance - * - Scale to the range [0, 1] - * - Normalize via z-score - * - * Combine these strategies into a single per-pixel linear transform, and - * process them all at once. - * It tries to replace the values in place if possible, rather - * than creating a new copy of data, especially, if the channel data type of - * source image is the same as that of the resultant image. - */ -class cv_normalizer : public cv_transform { - public: - /** This is the interim type of input values computed from image data - * It does not have to be the same as the type of the values stored, i.e., DataType. - */ - using ComputeType = DataType; - //using ComputeType = double; - /** - * Define the type of normalization methods available. - * z-score method is essentially the combination of mean subtraction and unit variance - */ - enum normalization_type {_none=0, _u_scale=1, _mean_sub=2, _unit_var=4, _z_score=6}; - using channel_trans_t = std::pair; - - protected: - // --- configuration variables --- - /// Whether to normalize to 0 mean. - bool m_mean_subtraction; - /// Whether to normalize to unit variance. - bool m_unit_variance; - /// Whether to scale to [0, 1]. - bool m_unit_scale; - /// Whether to normalize via z-score. - bool m_z_score; - - - // --- state variables --- - /** - * The parameter to use for linearly transforming channel values of each pixel as: - * new_value[ch] = cv::saturate_cast(m_trans[ch].first*value[ch] + m_trans[ch].second) - */ - std::vector m_trans; - - - /// Set a normalization bit flag - normalization_type set_normalization_bits(const normalization_type ntype, const normalization_type flag) const { - return static_cast(static_cast(ntype) | static_cast(flag)); - } - - /// Mask normalization bits - normalization_type mask_normalization_bits(const normalization_type ntype, const normalization_type flag) const { - return static_cast(static_cast(ntype) & static_cast(flag)); - } - - /// Enable a particular normalization method - normalization_type& set_normalization_type(normalization_type& ntype, const normalization_type flag) const; - - /// Check if there is a reason to enable. (i.e., any option set) - bool check_to_enable() const override; - - public: - - cv_normalizer(); - cv_normalizer(const cv_normalizer& rhs); - cv_normalizer& operator=(const cv_normalizer& rhs); - cv_normalizer *clone() const override; - - ~cv_normalizer() override {} - - /// Set the parameters all at once - void set(const bool meansub, const bool unitvar, const bool unitscale, const bool zscore); - - /// Whether to subtract the per-channel and per-sample mean. - void subtract_mean(bool b) { - m_mean_subtraction = b; - } - /// Whether to normalize to unit variance, per-channel and per-sample. - void unit_variance(bool b) { - m_unit_variance = b; - } - /// Whether to scale to [0, 1] - void unit_scale(bool b) { - m_unit_scale = b; - } - /// Whether to normalize by z-scores, per-channel and per-sample. 
- void z_score(bool b) { - m_z_score = b; - } - - /// Set a pre-determined normalization transform. - void set_transform(const std::vector& t); - - /// Clear the states of the previous transform applied - void reset() override; - - /// Returns the channel-wise scaling parameter for normalization transform - std::vector transform() const { - return (m_enabled? m_trans : std::vector()); - } - - /** - * Combine the normalizations enabled and define a linear transform - * per pixel to address them all. If successful, the tranform is enabled. - * If not, it is disabled. - * @return false if not enabled or unsuccessful. - */ - bool determine_transform(const cv::Mat& image) override; - - /** - * Reverse the normalization done as x' = alpha*x + beta by - * x = (x'- beta)/alpha - * If successful, the tranform is enabled. If not, it is disabled. - * @return false if not enabled or unsuccessful. - */ - bool determine_inverse_transform() override; - - /** - * Apply the normalization defined as a linear tranform per pixel. - * As this method is executed, the transform becomes deactivated. - * @return false if not successful. - */ - bool apply(cv::Mat& image) override; - - // utilities - template - static OutputIterator scale(InputIterator first, InputIterator last, OutputIterator result, - const std::vector trans); - - template - static bool scale_with_known_type(cv::Mat& image, const std::vector& trans); - - /** - * Scale an image using a set of parameters for linearly transforming channel - * values per pixel. - * The resultant image will contain channel values of LBANN's DataType. - */ - static bool scale(cv::Mat& image, const std::vector& trans); - - - template - static bool compute_mean_stddev_with_known_type(const cv::Mat& image, - std::vector& mean, std::vector& stddev, cv::InputArray mask); - - /// Compute the per-channel and per-sample mean and standard deviation - static bool compute_mean_stddev(const cv::Mat& image, - std::vector& mean, std::vector& stddev, - cv::InputArray mask=cv::noArray()); - - std::string get_type() const override { return "normalizer"; } - std::string get_description() const override; - std::ostream& print(std::ostream& os) const override; -}; - - -/** - * Linearly transform each value while copying it from one sequential container - * to another, which may be the same container if the type of the initial value - * and that of the result are the same. 
- * The transformation is alpha[ch]*input[ch] + beta[ch] -> output[ch] - * @param first The beginning of the input interator - * @param last The last of the input iterator - * @param result The beginning of the output iterator - * @param trans Parameters for linearly transforming channel values per pixel - * @return the last of output iterator - */ -template -inline OutputIterator cv_normalizer::scale( - InputIterator first, InputIterator last, OutputIterator result, - const std::vector trans) { - const size_t NCh = trans.size(); - bool trivial_alpha = true; - bool trivial_beta = true; - - for (size_t ch=0u; ch < NCh; ++ch) { - trivial_alpha = trivial_alpha && (trans[ch].first == 1.0); - trivial_beta = trivial_beta && (trans[ch].second == 0.0); - } - - if (trivial_alpha && trivial_beta) { - if ((typeid(*first) == typeid(*result)) && - (reinterpret_cast(&(*first)) == - reinterpret_cast(&(*result)))) - // This way, it works both for iterator and for pointer - { - std::advance(result, std::distance(first,last)); - return result; - } else { - return std::copy(first, last, result); - } - } - - using T = typename std::iterator_traits::value_type; - - // At this point NCh should not be zero because both alpha and beta are not trivial. - if (NCh == 1) { - const ComputeType a = trans[0].first; - const ComputeType b = trans[0].second; - - while (first != last) { - *result = cv::saturate_cast(a * (*first) + b); - ++result; - ++first; - } - } else { - size_t ch = 0u; - - while (first != last) { - *result = cv::saturate_cast(trans[ch].first * (*first) + trans[ch].second); - ++result; - ++first; - ++ch; - ch = (ch % NCh); - } - } - return result; -} - - -/** - * Linear transform image pixels by scaling parameters given for each channel - * The transformation is trans[ch].first*input[ch] + trans[ch].second -> output[ch]. - * The first template parameter is the channel value type of the input image. - * The second one is the channel value type desired for the output image. - * - * @param image The image to be modified, which is the input and also the ouput. - * @param trans Parameters for linearly transforming channel values per pixel - * @return true if successful. The input image will be modified to a new one. - */ -template -inline bool cv_normalizer::scale_with_known_type(cv::Mat& image, - const std::vector& trans) { - const auto Width = static_cast(image.cols); - const auto Height = static_cast(image.rows); - const auto NCh = static_cast(image.channels()); - if ((trans.size() > 0u) && (trans.size() != NCh)) { - return false; - } - - - // overwrite the storage of the source image if the source and the result have - // the same data type. Otherwise, create a new image for the result. The result - // will replace the image referenced by the input. - if (std::is_same::value) { - if (image.isContinuous()) { - scale(reinterpret_cast(image.datastart), - reinterpret_cast(image.dataend), - reinterpret_cast(image.data), trans); - } else { - // TODO: Should we make this to copy to a new continuous block instead of - // updating the values in-place? 
- const unsigned int stride = Width*NCh; - for (unsigned int i = 0u; i < Height; ++i) { - auto *optr = reinterpret_cast(image.ptr(i)); - const Tsrc *iptr = optr; - scale(iptr, iptr+stride, optr, trans); - } - } - } else { - cv::Mat image_out = cv::Mat(Height, Width, CV_MAKETYPE(cv::DataType::depth, NCh)); - - if (image.isContinuous()) { - scale(reinterpret_cast(image.datastart), - reinterpret_cast(image.dataend), - reinterpret_cast(image_out.data), trans); - } else { - const unsigned int stride = Width*NCh; - auto *ptr_out = reinterpret_cast(image_out.data); - for (unsigned int i = 0u; i < Height; ++i, ptr_out += stride) { - const Tsrc *ptr = reinterpret_cast(image.ptr(i)); - scale(ptr, ptr+stride, ptr_out, trans); - } - } - image = image_out; - } - return true; -} - - -/** - * Compute the per-channel and per-sample mean and standard deviation - * for a sample image of channel value type T - */ -template -inline bool cv_normalizer::compute_mean_stddev_with_known_type(const cv::Mat& image, - std::vector& mean, std::vector& stddev, cv::InputArray mask) { - mean.clear(); - stddev.clear(); - if (image.empty()) { - return false; - } - - const int NCh = image.channels(); - const int num_pixels = image.rows * image.cols; - ComputeType sum[NCh]; - ComputeType sqsum[NCh]; - ComputeType shift[NCh]; - - for (int ch = 0; ch < NCh; ++ch) { - sum[ch] = 0.0; - sqsum[ch] = 0.0; - const auto *ptr = reinterpret_cast(image.datastart); - shift[ch] = static_cast(*(ptr+ch)); - } - - mean.resize(NCh); - stddev.resize(NCh); - - if (image.isContinuous()) { - const auto *ptr = reinterpret_cast(image.datastart); - const auto *const ptrend = reinterpret_cast(image.dataend); - - int ch = 0; - do { - const ComputeType diff = (*ptr - shift[ch]); - sum[ch] += diff; - sqsum[ch] += diff*diff; - ++ch; - ch = ch % NCh; - } while ((++ptr) != ptrend); - - for (int c = 0; c < NCh; ++c) { - const ComputeType shifted_mean = sum[c] / num_pixels; - mean[c] = shifted_mean + shift[c]; - stddev[c] = sqrt(std::max(sqsum[c]/num_pixels - shifted_mean * shifted_mean, ComputeType(0))); - } - } else { - const int stride = image.cols*NCh; - const int Height = image.rows; - - for (int i = 0; i < Height; ++i) { - const auto *ptr = reinterpret_cast(image.ptr(i)); - const T *const ptrend = ptr + stride; - - int ch = 0; - do { - const ComputeType diff = (*ptr - shift[ch]); - sum[ch] += diff; - sqsum[ch] += diff*diff; - ++ch; - ch = ch % NCh; - } while ((++ptr) != ptrend); - } - - for (int ch = 0; ch < NCh; ++ch) { - const ComputeType shifted_mean = sum[ch] / num_pixels; - mean[ch] = shifted_mean + shift[ch]; - stddev[ch] = sqrt(std::max(sqsum[ch]/num_pixels - shifted_mean*shifted_mean, ComputeType(0))); - } - } - return true; -} - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV - -#endif // LBANN_CV_NORMALIZER_HPP diff --git a/include/lbann/data_readers/cv_process.hpp b/include/lbann/data_readers/cv_process.hpp deleted file mode 100644 index ffc315016a4..00000000000 --- a/include/lbann/data_readers/cv_process.hpp +++ /dev/null @@ -1,166 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. 
For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// cv_process .cpp .hpp - structure that defines the operations -// on image data in opencv format -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CV_PROCESS_HPP -#define LBANN_CV_PROCESS_HPP - -#include "cv_transform.hpp" -#include "cv_normalizer.hpp" -#include "cv_subtractor.hpp" -#include "cv_augmenter.hpp" -#include "cv_colorizer.hpp" -#include "cv_decolorizer.hpp" -#include "cv_cropper.hpp" -#include "cv_resizer.hpp" -#include "cv_mean_extractor.hpp" -#include -#include // std::numeric_limits - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -/** A structure packs the parameters for image pre-/post-processing that takes - * advantage of the OpenCV framework. - */ -class cv_process { - /// OpenCV flip codes: c<0 for top_left <-> bottom_right, c=0 for top<->down, and c>0 for left<->right - - protected: - /// unique name for the processor - std::string m_name; - /// Whether to flip an image - cv_transform::cv_flipping m_flip; - /// Whether to split channels - bool m_split; - /// whether a normalizing transform is set or not - bool m_is_normalizer_set; - /// The index of the normalizing transform in the array of transforms - unsigned int m_normalizer_idx; - - /// Array of transforms - std::vector > m_transforms; - - /// Check if the last transform registered in the list is a normalizer and not a subtractor - bool to_fuse_normalizer_with_copy() const; - - void set_normalizer_info(); - - public: - cv_process() - : m_flip(cv_transform::_no_flip_), m_split(true), m_is_normalizer_set(false), m_normalizer_idx(0u) {} - - cv_process(const cv_process& rhs); - cv_process& operator=(const cv_process& rhs); - - cv_process(const cv_transform::cv_flipping flip_code, const bool tosplit) - : m_flip(flip_code), m_split(tosplit), m_is_normalizer_set(false), m_normalizer_idx(0u) {} - - virtual ~cv_process() {} - - std::string get_name() const { return m_name; } - void set_name(const std::string& name) { m_name = name; } - - /// Reset all the transforms - void reset(); - - /// Check whether to flip - bool to_flip() const { - return (m_flip != cv_transform::_no_flip_); - } - /// Tell how to flip - int how_to_flip() const { - return static_cast(m_flip); - } - /** - * Set the flipping behavior. This is to deal with custom image format, which - * is not supported by OpenCV's builtin decoders and may impose different pixel - * coordinate system in its custom decoder. - * It is not to substitute for random flipping in augmentation. 
- */ - void set_to_flip(const cv_transform::cv_flipping f) { - m_flip = f; - } - /// Set to split channels - bool to_split() const { - return m_split; - } - - /// Export transform operator of normalizer to allow lazy application - std::vector get_transform_normalize() const; - /// Export transform operator of normalizer for a specific channel - std::vector get_transform_normalize(const unsigned int ch) const; - - /// Turn off normalizer. This is useful to make sure it off after potential lazy application - void disable_lazy_normalizer(); - - /// Turn off all transforms - void disable_transforms(); - - /// Add a tranform - bool add_transform(std::unique_ptr tr); - - /// Add a normalizing tranform - bool add_normalizer(std::unique_ptr tr); - bool add_normalizer(std::unique_ptr tr); - - /// Allow access to the list of transforms registered - const std::vector >& get_transforms() const { - return m_transforms; - } - - /// Allow read-only access to a particular transform indexed by idx - const cv_transform* get_transform(const unsigned int idx) const; - - /// Allow read-write access to a particular transform indexed by idx - cv_transform* get_transform(const unsigned int idx); - - /// Retrun the number of transforms registered - unsigned int get_num_transforms() const { return m_transforms.size(); } - - /** Return final image dimension {width, height} after all the transforms - * If a cropper is set, returns {crop_width, crop_height}. Otherwise, {0,0}. - */ - std::vector get_data_dims() const; - - void determine_inverse_lazy_normalization(); - - /// Execute a range of transforms [tr_strart, tr_end) on the given image in order - bool preprocess(cv::Mat& image, unsigned int tr_start = 0u, - unsigned int tr_end = std::numeric_limits::max()); - /// Execute all the inverse transforms on the given image in the reverse order - bool postprocess(cv::Mat& image); - - virtual std::string get_type() const { return "cv_process"; } - virtual std::string get_description() const; -}; - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV - -#endif // LBANN_CV_PROCESS_HPP diff --git a/include/lbann/data_readers/cv_process_patches.hpp b/include/lbann/data_readers/cv_process_patches.hpp deleted file mode 100644 index b9c52ff955a..00000000000 --- a/include/lbann/data_readers/cv_process_patches.hpp +++ /dev/null @@ -1,83 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
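For context on what these deletions retire: cv_process was the container that chained the per-image transforms above and applied them in registration order. A hedged sketch of that legacy usage follows; the element types of the unique_ptr arguments are assumed to be the transform classes being removed, and the crop size is an arbitrary example value.

// Sketch of the legacy OpenCV preprocessing path removed by this patch.
#include <memory>
#include <opencv2/core.hpp>

bool legacy_preprocess(cv::Mat& image) {
  lbann::cv_process proc;

  auto cropper = std::make_unique<lbann::cv_cropper>();
  cropper->set(224, 224, /*random_crop=*/true);   // example crop size
  proc.add_transform(std::move(cropper));

  auto normalizer = std::make_unique<lbann::cv_normalizer>();
  normalizer->set(/*meansub=*/true, /*unitvar=*/true,
                  /*unitscale=*/false, /*zscore=*/false);
  proc.add_normalizer(std::move(normalizer));

  return proc.preprocess(image);  // run every registered transform in order
}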
-// -// cv_process_patches .cpp .hpp - structure that defines the operations -// on patches extracted from an image in the opencv format -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CV_PROCESS_PATCHES_HPP -#define LBANN_CV_PROCESS_PATCHES_HPP - -#include "cv_process.hpp" -#include "patchworks/patchworks_patch_descriptor.hpp" -#include // std::numeric_limits - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -/// Similar to cv_process but works on patches that are extracted from an image -class cv_process_patches : public cv_process { - protected: - patchworks::patch_descriptor m_pd; - bool m_self_label; - unsigned int m_when_to_extract; - - public: - cv_process_patches(); - cv_process_patches(const bool self_label); - cv_process_patches(const cv_process_patches& rhs); - cv_process_patches(const cv_transform::cv_flipping flip_code, const bool tosplit); - cv_process_patches& operator=(const cv_process_patches& rhs); - - ~cv_process_patches() override {} - - void set_patch_descriptor(const patchworks::patch_descriptor& pd, - const unsigned int when_to_extract = - std::numeric_limits::max()); - patchworks::patch_descriptor& patch_descriptor() { - return m_pd; - } - const patchworks::patch_descriptor& patch_descriptor() const { - return m_pd; - } - unsigned int get_when_to_extract() const { return m_when_to_extract; } - bool is_self_labeling() const { return m_self_label; } - unsigned int get_num_labels() const { return m_pd.get_num_labels(); } - virtual unsigned int get_patch_label() const { return m_pd.get_last_label(); } - unsigned int get_num_patches() const { return m_pd.get_num_patches(); } - std::vector get_data_dims() const { - return {m_pd.get_num_patches(), m_pd.get_patch_width(), m_pd.get_patch_height()}; - } - - bool preprocess(cv::Mat& image, std::vector& patches); - - std::string get_type() const override { return "cv_process_patches"; } - std::string get_description() const override; -}; - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV - -#endif // LBANN_CV_PROCESS_PATCHES_HPP diff --git a/include/lbann/data_readers/cv_resizer.hpp b/include/lbann/data_readers/cv_resizer.hpp deleted file mode 100644 index 69555897d2c..00000000000 --- a/include/lbann/data_readers/cv_resizer.hpp +++ /dev/null @@ -1,103 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
-// -// cv_resizer .cpp .hpp - Functions to resize images -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CV_RESIZER_HPP -#define LBANN_CV_RESIZER_HPP - -#include "lbann/data_readers/cv_transform.hpp" -#include -#include - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -/** - * Simple image resizing without maintaining the aspect ratio. - */ -class cv_resizer : public cv_transform { - protected: - // --- configuration variables --- - unsigned int m_width; ///< desired width of an image - unsigned int m_height; ///< desired height of an image - - // --- state variables --- - /** Three modes of pixel interpolation: INTER_LINEAR, INTER_AREA, and INTER_LINEAR - * The first choice is the default when not adaptive. The other two are used when - * interpolatng adaptively. The second is when shrinking, and the third is when enlarging - */ - static const int m_interpolation_choices[3]; - int m_interpolation; ///< id of the channel value interpolation method used - bool m_adaptive_interpolation; ///< whether to use adaptive interpolation - - public: - cv_resizer(); - cv_resizer(const cv_resizer& rhs) = default; - cv_resizer& operator=(const cv_resizer& rhs) = default; - cv_resizer *clone() const override; - ~cv_resizer() override {} - - /** - * Set the parameters all at once - * @param width desired width - * @param height desired height - * @param adaptive_interpolation whether to apply a different interpolation method depending on how an image is resized - */ - void set(const unsigned int width, const unsigned int height, - const bool adaptive_interpolation = false); - - unsigned int get_width() const { return m_width; } - unsigned int get_height() const { return m_height; } - - /// Clear the states of the previous transform applied - void reset() override; - - /** - * Determine whether to enable transformation. - * @return false if not enabled. - */ - bool determine_transform(const cv::Mat& image) override; - - /// Determine whether to enable inverse transformation. - bool determine_inverse_transform() override { return false; } - - /** - * Apply the transformation. - * As this method is executed, the transform becomes deactivated. - * @return false if not successful. - */ - bool apply(cv::Mat& image) override; - - std::string get_type() const override { return "resizer"; } - std::string get_description() const override; - std::ostream& print(std::ostream& os) const override; -}; - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV - -#endif // LBANN_CV_RESIZER_HPP diff --git a/include/lbann/data_readers/cv_subtractor.hpp b/include/lbann/data_readers/cv_subtractor.hpp deleted file mode 100644 index 169181c4576..00000000000 --- a/include/lbann/data_readers/cv_subtractor.hpp +++ /dev/null @@ -1,171 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. 
You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// cv_subtractor .cpp .hpp - subtract channel values of an image (possibly the -// pixel-wise mean of dataset) from the corresponding values of another (input) -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CV_SUBTRACTOR_HPP -#define LBANN_CV_SUBTRACTOR_HPP - -#include "cv_transform.hpp" -#include "lbann/base.hpp" - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -/** - * Subtract channel values of an image from the corresponding values of another. - * The former is likely to carry pre-computed mean data per pixel and per channel. - * The latter is an input image. Both image needs to have the same size and the - * same number of channels. The subtracted result is represented in the scale - * between 0 and 1 (both inclusive). - * In the common current use case, a colorizer comes before a subtractor which is - * followed by a random cropper. In this scenario, the input images must be resized - * in advance to match the size of the mean image. - * In another scenario, where the random cropping is not used but resizing is done - * on-line, the subtractor can come after cropper without requiring the input images - * to be resized in advance. - * Alternatively, even a simpler approach is to use a mean image with uniform pixels. - * In this way, it does not need to know the size of input images, and is not impacted - * by random cropping or flipping augmentation. - */ -class cv_subtractor : public cv_transform { - protected: - // --- configuration variables --- - /** - * The image to subtract from an input image in the pixel-wise fashion. - * It has channel values of a floating point type, in the scale from 0 to 1. - * An input image will be mapped into the scale before subtraction by linearly - * mapping the smallest representative value to 0 and the largest representative - * value to 1. - */ - cv::Mat m_img_to_sub; - - /** - * The image to divide an input image in the pixel-wise fashion. - * It has channel values of a floating point type, in the scale from 0 to 1. - * An input image will be mapped into the scale before division. - */ - cv::Mat m_img_to_div; - - /** uniform mean per channel used for channel-wise mean-subtraction. - * This is used to construct the m_img_to_sub when the size of the image is known. - */ - std::vector m_channel_mean; - - /** uniform standard deviation per channel used for channel-wise z-score (division). - * This is used to construct the m_img_to_div when the size of the image is known. - */ - std::vector m_channel_stddev; - - // --- state variables --- - bool m_applied; ///< has been subtracted - - public: - cv_subtractor() : cv_transform(), m_applied(false) {} - cv_subtractor(const cv_subtractor& rhs); - cv_subtractor& operator=(const cv_subtractor& rhs); - cv_subtractor *clone() const override; - - ~cv_subtractor() override {} - - static cv::Mat read_binary_image_file(const std::string filename); - - /// Load and set the image to subtract from every input image. 
- void set_mean(const std::string name_of_img, const int depth_code = cv_image_type::T()); - - /** - * Set the mean fixed per channel for mean-subtracting each input image. - * This supports an alternative method for mean subtraction given that the - * mean per channel is uniform. - */ - void set_mean(const std::vector channel_mean); - - /** - * Set the dataset-wise mean image to subtract from each input image. - * The image represents the pre-computed pixel-wise mean of the dataset. - * In case that this image is not in a floating point type, it is converted to - * one with the depth specified by depth_code. - */ - void set_mean(const cv::Mat& img, const int depth_code = cv_image_type::T()); - - /// Load and set the image to normalize the pixels of every input image. - void set_stddev(const std::string name_of_img, const int depth_code = cv_image_type::T()); - - /** - * Set the dataset-wise standard deviation fixed per channel for normalizing - * each input image. - * This supports an alternative method for normalizing with stddev given that - * it is uniform per channel. - */ - void set_stddev(const std::vector channel_stddev); - - /** - * Set the dataset-wise standard deviation to normalize each input image. - * In case that this image is not in a floating point type, it is converted to - * one with the depth specified by depth_code. - */ - void set_stddev(const cv::Mat& img, const int depth_code = cv_image_type::T()); - - void reset() override { - m_enabled = false; - m_applied = false; - } - - /** - * If a given image is in grayscale, the tranform is enabled, and not otherwise. - * @return false if not enabled or unsuccessful. - */ - bool determine_transform(const cv::Mat& image) override; - - /// convert back to color image if it used to be a grayscale image - bool determine_inverse_transform() override; - - /** - * Apply color conversion if enabled. - * As it is applied, the transform becomes deactivated. - * @return false if not successful. - */ - bool apply(cv::Mat& image) override; - - /// true if both sub and div are channel-wise - bool check_if_channel_wise() const; - - std::string get_type() const override { return "subtractor"; } - std::string get_description() const override; - std::ostream& print(std::ostream& os) const override; - - protected: - /// Construct an image of the unform channel values using the channel-wise mean. - bool create_img_to_sub(int width, int height, int n_channels); - /// Construct an image of the unform channel values using the channel-wise stddev. - bool create_img_to_div(int width, int height, int n_channels); -}; - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV - -#endif // LBANN_CV_SUBTRACTOR_HPP diff --git a/include/lbann/data_readers/cv_transform.hpp b/include/lbann/data_readers/cv_transform.hpp deleted file mode 100644 index 72455fc8907..00000000000 --- a/include/lbann/data_readers/cv_transform.hpp +++ /dev/null @@ -1,221 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. 
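For reference, a minimal OpenCV sketch (not the removed LBANN implementation) of the normalization the deleted cv_subtractor above describes: map an 8-bit image into [0,1], subtract a per-channel mean, then divide by a per-channel standard deviation, expressed as one linear map per channel. The helper name, the CV_8UC3 input assumption, and the use of plain std::vector parameters are illustrative.

#include <opencv2/core.hpp>
#include <vector>

// Illustrative helper; assumes a 3-channel 8-bit input image (CV_8UC3).
inline cv::Mat subtract_and_scale(const cv::Mat& input,
                                  const std::vector<double>& channel_mean,
                                  const std::vector<double>& channel_stddev) {
  std::vector<cv::Mat> channels;
  cv::split(input, channels);                // one 8-bit plane per channel
  for (size_t c = 0; c < channels.size(); ++c) {
    // x -> (x/255 - mean[c]) / stddev[c], folded into a single a*x + b map
    const double a = 1.0 / (255.0 * channel_stddev[c]);
    const double b = -channel_mean[c] / channel_stddev[c];
    channels[c].convertTo(channels[c], CV_32F, a, b);
  }
  cv::Mat out;
  cv::merge(channels, out);                  // back to an interleaved image
  return out;
}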
-// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// cv_transform .cpp .hpp - base class for the transformation -// on image data in opencv format -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CV_TRANSFORM_HPP -#define LBANN_CV_TRANSFORM_HPP - -#include "opencv.hpp" -#include "opencv_extensions.hpp" - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -class cv_transform { - protected: - // --- configuration variables --- - // place for the variables to keep the configuration set during initialization - - std::string m_name; - - // --- state variables --- - /// per-image indicator of whether to apply transform or not - bool m_enabled; - - // transform prepared given the configuration (and the image) - // m_trans; - - // Allow to manually shut transform off without destroying it - //bool m_manual_switch; - - /** Check if transform is configured to apply. - * (e.g., if any of the augmentaion methods is enabled) - */ - virtual bool check_to_enable() const { - return true; - } - - public: - enum cv_flipping {_both_axes_=-1, _vertical_=0, _horizontal_=1, _no_flip_=2}; - static const constexpr char* const cv_flip_desc[] = {"both_axes", "vertical", "horizontal", "none"}; - static std::string flip_desc(const cv_flipping flip_code) { return std::string(cv_flip_desc[static_cast(flip_code)+1]); } - - static const float pi; - - - cv_transform(); - cv_transform(const cv_transform& rhs); - cv_transform& operator=(const cv_transform& rhs); - virtual cv_transform *clone() const; - - virtual ~cv_transform() {} - - // define a method to configure the transform - // void set(args) { reset(); ... } - /// Reset the transform state but do not alter the configuration variables - virtual void reset() { - m_enabled = false; - // e.g., m_trans.clear(); - } - - virtual bool determine_transform(const cv::Mat& image); - virtual bool determine_inverse_transform(); - virtual bool apply(cv::Mat& image) = 0; - - /// Turn transform on - void enable() { - m_enabled = true; - } - /// Turn transform off - void disable() { - m_enabled = false; - } - /// Check if transform is on - bool is_enabled() const { - return m_enabled; - } - - //bool toggle_manual_switch() { return (m_manual_switch = !m_manual_switch); } - - // administrative methods - /** Return this transform's type, e.g: "augmenter," "normalizer," etc. */ - virtual std::string get_type() const = 0; - - /// Returns this transform's name - std::string get_name() const { return m_name; } - - /** Sets this transform's name; this is an arbitrary string, e.g, assigned in a prototext file. 
*/ - void set_name(const std::string& name) { m_name = name; } - - /** Returns a description of the parameters passed to the ctor */ - virtual std::string get_description() const; - - virtual std::ostream& print(std::ostream& os) const; -}; - -/// Default constructor -inline cv_transform::cv_transform() - : m_name(""), m_enabled(false)//, m_manual_switch(false) -{} - -/// Deep-copying constructor -inline cv_transform::cv_transform(const cv_transform& rhs) - : m_name(rhs.m_name), m_enabled(rhs.m_enabled) {} - -/// Assignement operator. deep-copy everything -inline cv_transform& cv_transform::operator=(const cv_transform& rhs) { - m_enabled = rhs.m_enabled; - m_name = rhs.m_name; - return *this; -} - -/** Prepare transform for the given image as configured. - * Then, check if they are valid, and turn the transform on if so. - * The preparation includes as much precomputation as possible. For example, - * if the transformation consists of constructing four affine transform matrices - * and applying them to the given image in sequence, the transform matrices - * will be reduced to one. Then, the following function apply(image) will - * finally apply it to the image. - */ -inline bool cv_transform::determine_transform(const cv::Mat& image) { - // clear any transform state computed for previous image - // reset() - m_enabled = check_to_enable(); - // if (!m_enabled) return false; - // compute m_trans for the image and the configuration of the transform - // Here, some transform may not applicable to the given image. - // In that case, set m_enabled = false (or fruther throw an exception). - return m_enabled; -} - -/** Prepare the inverse transform to undo preprocessing transforms if needed - * for postprocessing. Not all transforms can be or need to be inversed. - * Then, check if they are valid, and turn the transform on if so. - * By default, turn this off as we do not need to undo in most of the cases. - * In need of manual overriding to enable/disable inverse transform, implement - * such a logic in this fuction and interfaces to enable/disable. - */ -inline bool cv_transform::determine_inverse_transform() { - // In case of manual overriding, if (!m_manual_switch) return false; - // If this transform, by design, can not be or does not need to be inversed, - // return (m_enabled = false); - // - // If the transform has not been applied (e.g., m_trans has not been set), - // return (m_enabled = false); - // Note that this cannot be determined by m_enabled as the transform is turned - // off once applied. - // - // Compute the inverse of m_trans and overwrite m_trans; - // set m_enabled to true; - // return true; - return false; -} - -/** Apply transform once and turn it off - * To conditionally apply the transform given an image, - * determine_transform(image) or determine_inverse_transform() must be called - * in advance. These will do as much precomputation as possible. For example, - * if the transformation consists of constructing four affine transform matrices - * and multiplying them to the given image in sequence, the transform matrices - * will be reduced to one. Then, this function will finally apply it to the image. - * There are three possible ways to implement condition checking as shown below, - * but here the third option is preferred for minimizing the number of calls - * 1. checking m_enabled internally - * 2. externally call is_enabled() - * 3. 
rely on the return value of determine_transform()/determine_inverse_transform() - */ -inline bool cv_transform::apply(cv::Mat& image) { - // As the transform is applied once, turn this off - m_enabled = false; - // Return the success of transform - return true; -} - -/// Return the pointer of a newly copy-constructed object -inline cv_transform *cv_transform::clone() const { - return static_cast(nullptr); -} - -//inline std::string cv_transform::get_type() { return "image transform"; } - -inline std::string cv_transform::get_description() const { - return std::string {} + get_type(); -} - -inline std::ostream& cv_transform::print(std::ostream& os) const { - os << get_description(); // Print out configuration variables - // Additionally, print out state variables as well - return os; -} - -std::ostream& operator<<(std::ostream& os, const cv_transform& tr); - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV - -#endif // LBANN_CV_TRANSFORM_HPP diff --git a/include/lbann/data_readers/cv_utils.hpp b/include/lbann/data_readers/cv_utils.hpp deleted file mode 100644 index fdac1bc77e3..00000000000 --- a/include/lbann/data_readers/cv_utils.hpp +++ /dev/null @@ -1,498 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// cv_utils .cpp .hpp - operations related to opencv images -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CV_UTILS_HPP -#define LBANN_CV_UTILS_HPP - -#include -#include // operator typeid -#include "opencv_extensions.hpp" -#include "cv_process.hpp" -#include "lbann/utils/mild_exception.hpp" - - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -class cv_utils { - public: - - // copy_cvMat_to_buf (with a tempoary buffer) - template - static bool copy_cvMat_to_buf_with_full_info(const cv::Mat& image, std::vector& buf, const cv_process& pp); - - template - static bool copy_cvMat_to_buf_with_known_type(const cv::Mat& image, std::vector& buf, const cv_process& pp); - - /** Copy a cv::Mat image into a serialized buffer. - * The argument pp specifies the parameters for image preprocessing that - * takes advantage of the OpenCV framework. Returns true if successful. 
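The comments above describe the intended call pattern for the removed cv_transform interface: precompute as much as possible in determine_transform(), then apply once, relying on the return value (option 3). A minimal usage sketch using the cv_resizer declared earlier; the dimensions are illustrative.

// Resize a single image with the removed transform API (sketch only,
// against the former lbann/data_readers/cv_resizer.hpp interface).
void resize_one(cv::Mat& img) {
  lbann::cv_resizer resizer;
  resizer.set(256, 256);                  // desired width and height
  if (resizer.determine_transform(img)) { // precompute; returns whether enabled
    resizer.apply(img);                   // applies once, then deactivates itself
  }
}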
- */ - static bool copy_cvMat_to_buf(const cv::Mat& image, std::vector& buf, const cv_process& pp); - - - // copy_buf_to_cvMat (with a tempoary buffer) - template - static cv::Mat copy_buf_to_cvMat_with_full_info(const std::vector& buf, const int Width, const int Height, const cv_process& pp); - - template - static cv::Mat copy_buf_to_cvMat_with_known_type(const std::vector& buf, const int Width, const int Height, const cv_process& pp); - - /** Reconstruct a cv::Mat image from a serialized buffer. - * The image size is specified by Width and Height. Type indetifies the - * OpenCV image type. The last argument pp specifies the parameters for - * image postprocessing that takes advantage of the OpenCV framework. - * Returns a reconstructed cv::Mat image if successful and an empty one - * otherwise. - */ - static cv::Mat copy_buf_to_cvMat(const std::vector& buf, const int Width, const int Height, const int Type, const cv_process& pp); - - - // copy_buf_to_cvMat (with an El::Matrix block) - template - static bool copy_cvMat_to_buf_with_full_info(const cv::Mat& image, CPUMat& buf, const cv_process& pp); - - template - static bool copy_cvMat_to_buf_with_known_type(const cv::Mat& image, CPUMat& buf, const cv_process& pp); - - /** Copy a cv::Mat image into an El::Matrix block. - * The argument pp specifies the parameters for image preprocessing that - * takes advantage of the OpenCV framework. Returns true if successful. - */ - static bool copy_cvMat_to_buf(const cv::Mat& image, CPUMat& buf, const cv_process& pp); - - - // copy_buf_to_cvMat (with an El::Matrix block) - template - static cv::Mat copy_buf_to_cvMat_with_full_info(const CPUMat& buf, const int Width, const int Height, const cv_process& pp); - - template - static cv::Mat copy_buf_to_cvMat_with_known_type(const CPUMat& buf, const int Width, const int Height, const cv_process& pp); - - /** Reconstruct a cv::Mat image from an El::Matrix block. - * The image size is specified by Width and Height. Type indetifies the - * OpenCV image type. The last argument pp specifies the parameters for - * image postprocessing that takes advantage of the OpenCV framework. - * Returns a reconstructed cv::Mat image if successful and an empty one - * otherwise. - */ - static cv::Mat copy_buf_to_cvMat(const CPUMat& buf, const int Width, const int Height, const int Type, const cv_process& pp); - - /** - * Use cv::imdecode() to load an image data instead of relying on cv::imread(). - * This avoids reading the image header to determine the decoder directly from - * the file but allow doing so from the memory. - * The arguments are the same as the ones with cv::imread() as well as the - * return type. Avoiding the extra access to the underlying filesystem may - * result in a better performance. - */ - static cv::Mat lbann_imread(const std::string& img_file_path, int flags, std::vector& buf, cv::Mat* image = nullptr); -}; - - -//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -// copy_cvMat_to_buf (vector) -/** - * Copy a cv::Mat image into a serialized buffer. This requires the type of - * channel values and the number of channels in the image to be known at - * compile time. The default for these are the type uint8_t and 3 channels. - * The argument pp specifies the parameters for image preprocessing that - * takes advantage of the OpenCV framework. Returns true if successful. 
- */ -template -inline bool cv_utils::copy_cvMat_to_buf_with_full_info( - const cv::Mat& image, std::vector& buf, const cv_process& pp) { - _LBANN_SILENT_EXCEPTION(image.empty(), "", false) - - const int Width = image.cols; - const int Height = image.rows; - const int sz = Height*Width; - - buf.resize(sz*NCh*sizeof(T)); - auto *Pixels = reinterpret_cast(&(buf[0])); - - if (pp.to_split()) { - // TODO: like the case with the output in El::Matrixi type, branch on whether the - // input channel type T is same as that of the output (especially ::DataType) - std::vector trans = pp.get_transform_normalize(); - if (trans.size() == 0u) { - trans.assign(NCh, cv_normalizer::channel_trans_t(1.0, 0.0)); - } - _LBANN_MILD_EXCEPTION((trans.size() != NCh), - "Incorrect number of channels in transform", false); - std::vector channels(NCh); - - for(size_t ch=0; ch < NCh; ++ch, Pixels += sz) { - channels[ch] = cv::Mat(Height, Width, CV_MAKETYPE(image.depth(),1), Pixels); - } - cv::split(image, channels); - - Pixels = reinterpret_cast(&(buf[0])); - - for(size_t ch=0; ch < NCh; ++ch, Pixels += sz) { - cv_normalizer:: - scale(Pixels, Pixels + sz, Pixels, {trans[ch]}); - } - } else { - if (image.isContinuous()) { - cv_normalizer:: - scale(reinterpret_cast(image.datastart), - reinterpret_cast(image.dataend), - Pixels, pp.get_transform_normalize()); - } else { - const int stride = Width*NCh; - for (int i = 0; i < Height; ++i, Pixels += stride) { - const auto *ptr = reinterpret_cast(image.ptr(i)); - cv_normalizer:: - scale(ptr, ptr+stride, Pixels, pp.get_transform_normalize()); - } - } - } - - return true; -} - -/** - * Copy a cv::Mat image into a serialized buffer. This requires the type of - * channel values to be known at compile time. The default type is uint8_t. - * The argument pp specifies the parameters for image preprocessing that - * takes advantage of the OpenCV framework. Returns true if successful. - */ -template -inline bool cv_utils::copy_cvMat_to_buf_with_known_type( - const cv::Mat& image, std::vector& buf, const cv_process& pp) { - _SWITCH_CV_FUNC_KNOWN_TYPE_3PARAMS(image.channels(), T, \ - copy_cvMat_to_buf_with_full_info, \ - image, buf, pp) - return false; -} -//vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv - - -//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -// copy_buf_to_cvMat (vector) -/** - * Reconstruct a cv::Mat image from a serialized buffer. This requires the type - * of channel values and the number of channels in the image to be known at - * compile time. The default for these are the type uint8_t and 3 channels. - * The image size is specified by Width and Height. The argument pp specifies - * the parameters for image postprocessing that takes advantage of the OpenCV - * framework. Returns an empty image if unsuccessful. - */ -template -inline cv::Mat cv_utils::copy_buf_to_cvMat_with_full_info( - const std::vector& buf, const int Width, const int Height, const cv_process& pp) { - - const int sz = Height*Width; - - _LBANN_MILD_EXCEPTION(sz*NCh*sizeof(T) != buf.size(), \ - "Size mismatch. 
Buffer has " << buf.size() << " items when " \ - << sz*NCh*sizeof(T) << " are expected.", \ - cv::Mat()) - - const auto *Pixels = reinterpret_cast(&(buf[0])); - - cv::Mat image = cv::Mat(Height, Width, CV_MAKETYPE(cv::DataType::depth, NCh)); - - if (pp.to_split()) { - // TODO: like the case with the output of El::Matrix type, branch on whether the - // input channel type T is same as that of the output (especially ::DataType) - std::vector trans = pp.get_transform_normalize(); - if (trans.size() == 0u) { - trans.assign(NCh, cv_normalizer::channel_trans_t(1.0, 0.0)); - } - _LBANN_MILD_EXCEPTION((trans.size() != NCh), - "Incorrect number of channels in transform", cv::Mat()); - std::vector channels(NCh); - - for(size_t ch=0; ch < NCh; ++ch, Pixels += sz) { - channels[ch] = cv::Mat(Height, Width, CV_MAKETYPE(image.depth(),1), const_cast(Pixels)); - } - - cv::merge(channels, image); - auto *optr = reinterpret_cast(image.data); - for(size_t ch=0; ch < NCh; ++ch, optr += sz) { - cv_normalizer:: - scale(reinterpret_cast(image.datastart), - reinterpret_cast(image.dataend), - optr, {trans[ch]}); - } - } else { - cv_normalizer:: - scale(Pixels, Pixels + sz*NCh, reinterpret_cast(image.data), pp.get_transform_normalize()); - } - - return image; -} - -/** - * Reconstruct a cv::Mat image from a serialized buffer. This requires the type - * of channel values to be known at compile time. The default type is uint8_t. - * The image size is specified by Width and Height. The last argument pp - * specifies the parameters for image postprocessing that takes advantage of the - * OpenCV framework. Returns a reconstructed cv::Mat image if successful and an - * empty one otherwise. - */ -template -inline cv::Mat cv_utils::copy_buf_to_cvMat_with_known_type( - const std::vector& buf, const int Width, const int Height, const cv_process& pp) { - _LBANN_MILD_EXCEPTION(buf.size() == 0u || Width == 0 || Height == 0, \ - "An empty image (" << Height << " x " << Width << ") or a buffer (" << buf.size() << ")", \ - cv::Mat()) - - const auto sz = static_cast(Width*Height*sizeof(T)); - const size_t NCh = buf.size()/sz; - - _LBANN_MILD_EXCEPTION(sz*NCh != buf.size(), \ - "Size mismatch. Buffer has " << buf.size() << " items when " << sz*NCh << " are expected.", \ - cv::Mat()) - - _SWITCH_CV_FUNC_KNOWN_TYPE_4PARAMS(NCh, T, \ - copy_buf_to_cvMat_with_full_info, \ - buf, Width, Height, pp); - - _LBANN_DEBUG_MSG(NCh << "-channel image is not supported."); - return cv::Mat(); -} -//vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv - - - -//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -// copy_cvMat_to_buf (Elemental) -/** - * Copy a cv::Mat image into a data block of El::Matrix type. This - * requires the type of channel values and the number of channels in the image - * to be known at compile time. The default for these are the DataType of LBANN - * and 3 channels. In case of copying a single image into a collection of - * images as an existing El::Matrix matrix, a sub-matrix View can be passed. - * The argument pp specifies the parameters for image preprocessing that - * takes advantage of the OpenCV framework. Returns true if successful. - */ -template -inline bool cv_utils::copy_cvMat_to_buf_with_full_info( - const cv::Mat& image, CPUMat& buf, const cv_process& pp) { - // NCh need not be a template parameter here. It can be a function argument. 
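The key trick in the copy routines above is to build per-channel cv::Mat headers that alias a contiguous destination buffer, so that cv::split writes the interleaved (HWC) image directly into planar, channel-major (CHW) order. A stripped-down sketch of that technique, assuming a CV_32FC3 input; the helper name is illustrative.

#include <opencv2/core.hpp>
#include <vector>

// Copy an interleaved CV_32FC3 image into a planar (CHW) float buffer.
inline void to_planar(const cv::Mat& img, std::vector<float>& buf) {
  const int sz = img.rows * img.cols;
  buf.resize(3 * sz);
  std::vector<cv::Mat> planes;
  for (int ch = 0; ch < 3; ++ch) {
    // Each plane aliases one channel-sized slice of buf; no extra copy is made.
    planes.emplace_back(img.rows, img.cols, CV_32FC1, buf.data() + ch * sz);
  }
  cv::split(img, planes); // writes each channel directly into its slice of buf
}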
- // However, keeping it as a static parameter enables custom accesses on pixels - // For example, - // using Vec_T = cv::Vec; - // image.at(y, x) = newPixel; - _LBANN_SILENT_EXCEPTION(image.empty(), "", false) - - const int Width = image.cols; - const int Height = image.rows; - const int sz = Height*Width; - - if (buf.Height() != sz*NCh) { -#if 0 - return false; -#else - //_LBANN_DEBUG_MSG("Resizing buffer height to " << sz*NCh); - buf.Resize(sz*NCh, ((buf.Width()<1)? 1 : buf.Width())); -#endif - } - - DataType *Pixels = buf.Buffer(); - - if (pp.to_split()) { - std::vector trans = pp.get_transform_normalize(); - if (trans.size() == 0u) { - trans.assign(NCh, cv_normalizer::channel_trans_t(1.0, 0.0)); - } - _LBANN_MILD_EXCEPTION((trans.size() != NCh), - "Incorrect number of channels in transform", false); - std::vector channels(NCh); - - if (std::is_same::value) { - for(size_t ch=0; ch < NCh; ++ch, Pixels += sz) { - // create a separate image per channel aliasing the memory of buf - channels[ch] = cv::Mat(Height, Width, CV_MAKETYPE(image.depth(),1), Pixels); - } - Pixels = buf.Buffer(); - - cv::split(image, channels); - - for(size_t ch=0; ch < NCh; ++ch, Pixels += sz) { - cv_normalizer:: - scale(Pixels, Pixels + sz, Pixels, {trans[ch]}); - } - } else { - cv::split(image, channels); - - for(size_t ch=0; ch < NCh; ++ch, Pixels += sz) { - cv_normalizer:: - scale(reinterpret_cast(channels[ch].datastart), - reinterpret_cast(channels[ch].dataend), - Pixels, {trans[ch]}); - } - } - } else { - if (image.isContinuous()) { - cv_normalizer:: - scale(reinterpret_cast(image.datastart), - reinterpret_cast(image.dataend), - Pixels, pp.get_transform_normalize()); - } else { - const int stride = Width*NCh; - for (int i = 0; i < Height; ++i, Pixels += stride) { - const auto *ptr = reinterpret_cast(image.ptr(i)); - cv_normalizer:: - scale(ptr, ptr+stride, Pixels, pp.get_transform_normalize()); - } - } - } - - return true; -} - -/** - * Copy a cv::Mat image into a data block of El::Matrix type. This - * requires the type of channel values in the image to be known at compile time. - * The default for these are the DataType of LBANN. - * The argument pp specifies the parameters for image preprocessing that - * takes advantage of the OpenCV framework. Returns true if successful. - */ -template -inline bool cv_utils::copy_cvMat_to_buf_with_known_type( - const cv::Mat& image, CPUMat& buf, const cv_process& pp) { - _SWITCH_CV_FUNC_KNOWN_TYPE_3PARAMS(image.channels(), T, \ - copy_cvMat_to_buf_with_full_info, \ - image, buf, pp) - return false; -} -//vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv - - -//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -// copy_buf_to_cvMat (Elemental) -/** - * Reconstruct a cv::Mat image from a data block of El::Matrix type. - * This requires the type of channel values and the number of channels in the - * image to be known at compile time. The default for these are DataType of - * LBANN and 3 channels. In case of copying a single image data in a matrix - * of multiple images, a sub-matrix View can be passed. - * The image size is specified by Width and Height. The argument pp specifies - * the parameters for image postprocessing that takes advantage of the OpenCV - * framework. Returns an empty image if unsuccessful. 
- */ -template -inline cv::Mat cv_utils::copy_buf_to_cvMat_with_full_info( - const CPUMat& buf, const int Width, const int Height, const cv_process& pp) { - - const int sz = Height*Width; - _LBANN_MILD_EXCEPTION(sz*NCh != buf.Height(), \ - "Size mismatch. Buffer has " << buf.Height() << " items in a column when " \ - << sz*NCh << " are expected.", \ - cv::Mat()) - - const DataType *Pixels = buf.LockedBuffer(); - - cv::Mat image = cv::Mat(Height, Width, CV_MAKETYPE(cv::DataType::depth, NCh)); - - if (pp.to_split()) { - std::vector trans = pp.get_transform_normalize(); - if (trans.size() == 0u) { - trans.assign(NCh, cv_normalizer::channel_trans_t(1.0, 0.0)); - } - _LBANN_MILD_EXCEPTION((trans.size() != NCh), - "Incorrect number of channels in transform", cv::Mat()); - std::vector channels(NCh); - - if (std::is_same::value) { - for(size_t ch=0; ch < NCh; ++ch, Pixels += sz) { - channels[ch] = cv::Mat(Height, Width, CV_MAKETYPE(image.depth(),1), - const_cast(Pixels)); - } - - cv::merge(channels, image); - const auto *iptr = reinterpret_cast(image.data); - auto *optr = reinterpret_cast(image.data); - - cv_normalizer:: - scale(iptr, iptr+sz*NCh, optr, trans); - } else { - for(size_t ch=0; ch < NCh; ++ch, Pixels += sz) { - channels[ch] = cv::Mat(Height, Width, CV_MAKETYPE(image.depth(),1)); - cv_normalizer:: - scale(Pixels, Pixels+sz, - reinterpret_cast(channels[ch].data), {trans[ch]}); - } - cv::merge(channels, image); - } - } else { - cv_normalizer:: - scale(Pixels, Pixels + sz*NCh, - reinterpret_cast(image.data), - pp.get_transform_normalize()); - } - - return image; -} - -/** - * Reconstruct a cv::Mat image from a data block of El::Matrix type. - * This requires the type of channel values to be known at compile time. The - * default type is DataType. In this case, the new image may require conversion - * to an integer type during postprocessing such that it can be stored in an - * typical image file format. An image can sometimes be constructed even when - * T is different from DataType if the type casting of a DataType value into T - * is valid. - * The image size is specified by Width and Height. The last argument pp - * specifies the parameters for image postprocessing that takes advantage of the - * OpenCV framework. This returns a reconstructed cv::Mat image if successful - * and an empty one otherwise. - */ -template -inline cv::Mat cv_utils::copy_buf_to_cvMat_with_known_type( - const CPUMat& buf, const int Width, const int Height, const cv_process& pp) { - _LBANN_MILD_EXCEPTION(buf.Height() == 0u || buf.Width() == 0u || Width == 0 || Height == 0, \ - "An empty image (" << Height << " x " << Width << ") or a buffer (" \ - << buf.Height() << " x " << buf.Width() << ").", \ - cv::Mat()) - - const int sz = Height*Width; - const int NCh = buf.Height()/sz; - - _LBANN_MILD_EXCEPTION(sz*NCh != buf.Height(), \ - "Size mismatch. 
Buffer has " << buf.Height() << " items in a column when " \ - << sz*NCh << " are expected.", \ - cv::Mat()) - - _SWITCH_CV_FUNC_KNOWN_TYPE_4PARAMS(NCh, T, \ - copy_buf_to_cvMat_with_full_info, \ - buf, Width, Height, pp) - - _LBANN_DEBUG_MSG(NCh << "-channel image is not supported."); - return cv::Mat(); -} -//vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV - -#endif // LBANN_CV_UTILS_HPP diff --git a/include/lbann/data_readers/data_reader.hpp b/include/lbann/data_readers/data_reader.hpp index b70c576c376..c2a4c8db152 100644 --- a/include/lbann/data_readers/data_reader.hpp +++ b/include/lbann/data_readers/data_reader.hpp @@ -30,21 +30,23 @@ #define LBANN_DATA_READER_HPP #include "lbann/base.hpp" -#include "lbann/utils/random.hpp" +#include "lbann/data_coordinator/data_coordinator_metadata.hpp" +#include "lbann/utils/random_number_generators.hpp" #include "lbann/utils/exception.hpp" #include "lbann/comm.hpp" #include "lbann/io/file_io.hpp" #include "lbann/io/persist.hpp" -#include "lbann/data_readers/image_preprocessor.hpp" #include "lbann/utils/options.hpp" -#include "lbann/utils/threads/thread_pool.hpp" +#include "lbann/transforms/transform_pipeline.hpp" #include #include #include #include #include #include - +#include +#include +#include #define NOT_IMPLEMENTED(n) { \ std::stringstream s; \ @@ -54,7 +56,8 @@ namespace lbann { class data_store_conduit; -class model; +class thread_pool; +class trainer; /** * A data reader manages reading in data in a particular format. @@ -62,7 +65,7 @@ class model; * classes should implement load and the appropriate subset of fetch_datum, * fetch_label, and fetch_response. */ -class generic_data_reader : public lbann_image_preprocessor { +class generic_data_reader { public: #define JAG_NOOP_VOID if (m_jag_partitioned) { return; } @@ -72,6 +75,7 @@ class generic_data_reader : public lbann_image_preprocessor { * ctor */ generic_data_reader(bool shuffle = true) : + m_verbose(options::get()->get_bool("verbose")), m_data_store(nullptr), m_comm(nullptr), m_mini_batch_size(0), m_current_pos(0), @@ -99,14 +103,23 @@ class generic_data_reader : public lbann_image_preprocessor { m_procs_per_partition(1), m_io_thread_pool(nullptr), m_jag_partitioned(false), - m_model(nullptr) - {} + m_trainer(nullptr), + m_issue_warning(true) + { + } generic_data_reader(const generic_data_reader&) = default; generic_data_reader& operator=(const generic_data_reader&) = default; - ~generic_data_reader() override {} + virtual ~generic_data_reader() {} virtual generic_data_reader* copy() const = 0; + /** Archive for checkpoint and restart */ + template void serialize( Archive & ar ) { + ar(CEREAL_NVP(m_current_mini_batch_idx), + CEREAL_NVP(m_current_pos), + CEREAL_NVP(m_shuffled_indices)); + } + /// set the comm object void set_comm(lbann_comm *comm) { m_comm = comm; @@ -249,16 +262,7 @@ class generic_data_reader : public lbann_image_preprocessor { * Set an idenifier for the dataset. * The role should be one of "train", "test", or "validate". */ - virtual void set_role(std::string role) { - m_role = role; - if (options::get()->has_string("jag_partitioned") - && get_role() == "train") { - m_jag_partitioned = true; - if (is_master()) { - std::cerr << "USING JAG DATA PARTITIONING\n"; - } - } - } + virtual void set_role(std::string role); /** * Get the role for this dataset. 
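The serialize() member added to generic_data_reader above follows the standard cereal pattern. A hedged sketch of how a checkpoint might drive it; the archive type, stream handling, and function name are illustrative, not LBANN's actual checkpoint path.

#include <fstream>
#include <string>
#include <cereal/archives/binary.hpp>
#include <cereal/types/vector.hpp> // for the std::vector member being archived
#include "lbann/data_readers/data_reader.hpp"

// Write the reader's minimal restart state (current position, minibatch
// index, shuffled indices) to a binary archive.
void checkpoint_reader(lbann::generic_data_reader& reader,
                       const std::string& path) {
  std::ofstream os(path, std::ios::binary);
  cereal::BinaryOutputArchive ar(os);
  ar(reader); // invokes reader.serialize(ar) on the CEREAL_NVP'd members
}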
@@ -281,7 +285,7 @@ class generic_data_reader : public lbann_image_preprocessor { * If the base offset is not specified set it to 0 * If the stride is not specified set it to batch size */ - virtual void setup(int num_io_threads, std::shared_ptr io_thread_pool); + virtual void setup(int num_io_threads, observer_ptr io_thread_pool); /** Return this data_reader's type */ virtual std::string get_type() const = 0; @@ -293,15 +297,6 @@ class generic_data_reader : public lbann_image_preprocessor { /// Fetch this mini-batch's responses into Y. virtual int fetch_responses(CPUMat& Y); - /** - * Save pixels to an image. The implementing data reader is responsible for - * handling format detection, conversion, etc. - */ - // TODO: This function needs to go away from here - void save_image(Mat& pixels, const std::string filename, - bool do_scale = true) override { - NOT_IMPLEMENTED("save_image"); - } /** * During the network's update phase, the data reader will * advanced the current position pointer. If the pointer wraps @@ -353,6 +348,13 @@ class generic_data_reader : public lbann_image_preprocessor { virtual const std::vector get_data_dims() const { return std::vector(0); } + + virtual std::vector get_slice_points(const slice_points_mode var_category, + bool& is_supported) { + is_supported = false; + return {}; + } + /// True if the data reader's current position is valid. virtual bool position_valid() const { return (m_current_pos < get_num_data()); @@ -567,9 +569,17 @@ class generic_data_reader : public lbann_image_preprocessor { } /** - * Select the appropriate subset of data based on settings. + * Optionally resizes the shuffled indices based on the data reader + * prototext settings: absolute_sample_count, percent_of_data_to_use. + * (dah - this was formerly part of select_subset_of_data) */ - virtual void select_subset_of_data(); + void resize_shuffled_indices(); + + /** + * Select the appropriate subset of data for the validation set based on + * the data reader prototext setting: validation_percent + */ + void select_subset_of_data(); /// called by select_subset_of_data() if data set is partitioned void select_subset_of_data_partitioned(); @@ -593,96 +603,15 @@ class generic_data_reader : public lbann_image_preprocessor { /** \brief Given directory to store checkpoint files, write state to file and add to number of bytes written */ - bool save_to_checkpoint_shared(persist& p, const char *name); + bool save_to_checkpoint_shared(persist& p, execution_mode mode); /** \brief Given directory to store checkpoint files, read state from file and add to number of bytes read */ - bool load_from_checkpoint_shared(persist& p, const char *name); + bool load_from_checkpoint_shared(persist& p, execution_mode mode); - bool save_to_checkpoint_distributed(persist& p, const char *name); + bool save_to_checkpoint_distributed(persist& p, execution_mode mode); /** \brief Given directory to store checkpoint files, read state from file and add to number of bytes read */ - bool load_from_checkpoint_distributed(persist& p, const char *name); - - struct packing_header { - uint64_t current_pos; - uint64_t current_mini_batch_idx; - uint64_t data_size; - }; - bool pack_scalars(persist& p, const char *name) { - char fieldname[1024]; - lbann::persist_type persist_value; - std::string s_name(name); - if(s_name.compare("data_reader_validation") == 0){ - persist_value = persist_type::validate; - } else { - persist_value= persist_type::train; - } - - - snprintf(fieldname, sizeof(fieldname), "%s_current_mini_batch_idx", name); 
- p.write_uint64(persist_value, fieldname, (uint64_t) m_current_mini_batch_idx); - - int size = m_shuffled_indices.size(); - snprintf(fieldname, sizeof(fieldname), "%s_data_size", name); - p.write_uint64(persist_value, fieldname, (uint64_t) size); - - snprintf(fieldname, sizeof(fieldname), "%s_data_position", name); - p.write_uint64(persist_value, fieldname, (uint64_t) m_current_pos); - - snprintf(fieldname, sizeof(fieldname), "%s_data_indices", name); - p.write_int32_contig(persist_value, fieldname, &m_shuffled_indices[0], (uint64_t) size); - - return true; - } - - bool unpack_scalars(persist& p, struct packing_header *header, const char *name){ - char fieldname[1024]; - lbann::persist_type persist_value; - std::string s_name(name); - if(s_name.compare("data_reader_validation") == 0){ - persist_value = persist_type::validate; - } else { - persist_value= persist_type::train; - } - // Closest to non checkpoint run only loads m_current_pos - - // record minibatch index - uint64_t val; - - snprintf(fieldname, sizeof(fieldname), "%s_current_mini_batch_idx", name); - p.read_uint64(persist_value, fieldname, &val); - m_current_mini_batch_idx = (int) val; - - snprintf(fieldname, sizeof(fieldname), "%s_data_size", name); - p.read_uint64(persist_value, fieldname, &val); - auto size = (int) val; - - // get current position within data - snprintf(fieldname, sizeof(fieldname), "%s_data_position", name); - p.read_uint64(persist_value, fieldname, &val); - m_current_pos = (int) val; - //resize shuffled index array to hold values - m_shuffled_indices.resize(size); - - //read list of indices - snprintf(fieldname, sizeof(fieldname), "%s_data_indices", name); - p.read_int32_contig(persist_value, fieldname, &m_shuffled_indices[0], (uint64_t) size); - - if(header != nullptr){ - //shuffled data indices array size, used for resize after broadcast. Not unpacked. - header->data_size = size; - // all else, unpacked and set in unpack header. - header->current_pos = m_current_pos; - header->current_mini_batch_idx = m_current_mini_batch_idx; - } - - return true; - } - - void unpack_header(struct packing_header& header){ - m_current_pos = (int) header.current_pos; - m_current_mini_batch_idx = (int) header.current_mini_batch_idx; - } + bool load_from_checkpoint_distributed(persist& p, execution_mode mode); /// returns a const ref to the data store virtual const data_store_conduit& get_data_store() const { @@ -702,17 +631,9 @@ class generic_data_reader : public lbann_image_preprocessor { /// until later. void setup_data_store(int mini_batch_size); - void instantiate_data_store(const std::vector& local_list_sizes = std::vector()); + void instantiate_data_store(); - // note: don't want to make this virtual, since then all derived classes - // would have to override. But, this should only be called from within - // derived classes where it makes sense to do so. - // Once the sample_list class and file formats are generalized and - // finalized, it should (may?) be possible to code a single - // preload_data_store method. 
- virtual void preload_data_store() { - LBANN_ERROR("you should not be here"); - } + virtual void preload_data_store(); void set_gan_labelling(bool has_gan_labelling) { m_gan_labelling = has_gan_labelling; @@ -726,14 +647,26 @@ class generic_data_reader : public lbann_image_preprocessor { virtual bool priming_data_store() const; - void set_model(model *m) { m_model = m; } + void set_trainer(trainer *t) { m_trainer = t; } + + trainer& get_trainer() const { + if(m_trainer == nullptr) { LBANN_ERROR("get_trainer called with nullptr"); } + return *m_trainer; + } /// experimental; used to ensure all readers for jag_conduit_hdf5 /// have identical shuffled indices virtual void post_update() {} + /** Set the transform pipeline this data reader will use. */ + void set_transform_pipeline(transform::transform_pipeline&& tp) { + m_transform_pipeline = std::move(tp); + } + protected: + bool m_verbose = false; + // For use with conduit when samples are corrupt. mutable std::unordered_set m_using_random_node; @@ -759,7 +692,7 @@ class generic_data_reader : public lbann_image_preprocessor { lbann_comm *m_comm; - virtual bool fetch_data_block(CPUMat& X, El::Int thread_index, El::Int mb_size, El::Matrix& indices_fetched); + virtual bool fetch_data_block(CPUMat& X, El::Int block_offset, El::Int block_stride, El::Int mb_size, El::Matrix& indices_fetched); /** * Fetch a single sample into a matrix. @@ -861,9 +794,29 @@ class generic_data_reader : public lbann_image_preprocessor { bool m_master; + /** @brief Print the return values from various get_X methods to file + * + * For use in unit testing. Only the master prints. + * Currently only prints values from get_X methods that only depend + * on the data_reader (i.e, not on the trainer, model, etc) + */ + void print_get_methods(const std::string filename); + + /** + * Returns the number of the shuffled indices that are to be + * used. Code in this method was formerly in select_subset_of_data() + */ + size_t get_num_indices_to_use() const; + friend class data_reader_merge_features; friend class data_reader_merge_samples; +private: + + virtual void do_preload_data_store() { + LBANN_ERROR("Not implemented."); + } + protected : //var to support GAN bool m_gan_labelling; //boolean flag of whether its GAN binary label, default is false @@ -896,7 +849,7 @@ class generic_data_reader : public lbann_image_preprocessor { std::vector> m_thread_buffer; - std::shared_ptr m_io_thread_pool; + observer_ptr m_io_thread_pool; /// special handling for 1B jag; each reader /// owns a unique subset of the data @@ -906,7 +859,19 @@ class generic_data_reader : public lbann_image_preprocessor { /// this sets various member variables (num_iterations, m_reset_mini_batch_index, /// etc. void set_jag_variables(int mb_size); - model *m_model; + trainer *m_trainer; + + /** Transform pipeline for preprocessing data. */ + transform::transform_pipeline m_transform_pipeline; + + /// for use with data_store: issue a warning a single time if m_data_store != nullptr, + /// but we're not retrieving a conduit::Node from the store. 
This typically occurs + /// during the test phase + bool m_issue_warning; + + /// throws exception if get_absolute_sample_count() and + /// get_use_percent() are incorrect + void error_check_counts() const; }; template diff --git a/include/lbann/data_readers/data_reader_ascii.hpp b/include/lbann/data_readers/data_reader_ascii.hpp deleted file mode 100644 index 09504b49397..00000000000 --- a/include/lbann/data_readers/data_reader_ascii.hpp +++ /dev/null @@ -1,73 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// data_reader_ascii .hpp .cpp - generic_data_reader class for ASCII text files -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_DATA_READER_ASCII_HPP -#define LBANN_DATA_READER_ASCII_HPP - -#include "data_reader.hpp" - -namespace lbann { - -class ascii_reader : public generic_data_reader { - public: - ascii_reader(int sequence_length = 1, bool shuffle = true); - ascii_reader(const ascii_reader&) = default; - ascii_reader& operator=(const ascii_reader&) = default; - ~ascii_reader() override = default; - ascii_reader* copy() const override { return new ascii_reader(*this); } - - std::string get_type() const override { - return "ascii_reader"; - } - - void load() override; - - int get_linearized_data_size() const override { - return 128 * m_sequence_length; - } - int get_linearized_label_size() const override { - return 128 * m_sequence_length; - } - const std::vector get_data_dims() const override { - return {128 * m_sequence_length}; - } - - protected: - bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; - bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; - - /** Length of text sequence. */ - int m_sequence_length; - /** Size of data file in bytes. */ - int m_file_size; - -}; - -} // namespace lbann - -#endif // LBANN_DATA_READER_ASCII_HPP diff --git a/include/lbann/data_readers/data_reader_cifar10.hpp b/include/lbann/data_readers/data_reader_cifar10.hpp index 7c72975bf98..a0c7ae61257 100644 --- a/include/lbann/data_readers/data_reader_cifar10.hpp +++ b/include/lbann/data_readers/data_reader_cifar10.hpp @@ -23,7 +23,7 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. 
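The preload_data_store() refactor in data_reader.hpp above moves the reader-specific work into a private do_preload_data_store() hook while the base class keeps the shared entry point. A schematic sketch of that split, with hypothetical class names and a guessed base-class body (the real body lives in the .cpp and is not shown in this diff).

// Schematic only: non-virtual-interface style split used by the refactor.
class reader_base {
public:
  virtual void preload_data_store() { // shared bookkeeping would happen here
    do_preload_data_store();          // then defer to the reader-specific hook
  }
private:
  virtual void do_preload_data_store() = 0;
};

class image_reader_example : public reader_base {
private:
  void do_preload_data_store() override {
    // reader-specific preloading, e.g. filling one conduit node per sample
  }
};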
// -// lbann_data_reader_cifar10 .hpp .cpp - generic_data_reader class for CIFAR10 dataset +// data_reader_cifar10 .hpp .cpp - Data reader for CIFAR-10/100 //////////////////////////////////////////////////////////////////////////////// #ifndef LBANN_DATA_READER_CIFAR10_HPP @@ -33,14 +33,23 @@ namespace lbann { +/** + * A data reader for the CIFAR-10/100 datasets. + * + * This requires the binary distributions of the datasets, which + * must retain their original filenames. + * CIFAR-10 vs -100 is inferred by the number of labels set. + * @note This does not store the coarse labels from CIFAR-100. + * + * See: + * https://www.cs.toronto.edu/~kriz/cifar.html + */ class cifar10_reader : public image_data_reader { public: - /// constructor cifar10_reader(bool shuffle = true); cifar10_reader(const cifar10_reader&) = default; cifar10_reader& operator=(const cifar10_reader&) = default; - /// destructor ~cifar10_reader() override; cifar10_reader* copy() const override { return new cifar10_reader(*this); } @@ -58,7 +67,13 @@ class cifar10_reader : public image_data_reader { bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; private: - std::vector > m_data; + /** + * Loaded image data. + * This will be stored in "OpenCV" format for ease of preprocessing. + */ + std::vector> m_images; + /** Loaded label information. */ + std::vector m_labels; }; } // namespace lbann diff --git a/include/lbann/data_readers/data_reader_csv.hpp b/include/lbann/data_readers/data_reader_csv.hpp index 58c55885c68..ae0ead7811f 100644 --- a/include/lbann/data_readers/data_reader_csv.hpp +++ b/include/lbann/data_readers/data_reader_csv.hpp @@ -30,7 +30,6 @@ #define LBANN_DATA_READER_CSV_HPP #include "data_reader.hpp" -#include "image_preprocessor.hpp" #include namespace lbann { diff --git a/include/lbann/data_readers/data_reader_image.hpp b/include/lbann/data_readers/data_reader_image.hpp index ba809f6547c..cde595e781e 100644 --- a/include/lbann/data_readers/data_reader_image.hpp +++ b/include/lbann/data_readers/data_reader_image.hpp @@ -30,8 +30,7 @@ #define IMAGE_DATA_READER_HPP #include "data_reader.hpp" -#include "image_preprocessor.hpp" -#include "cv_process.hpp" +#include "lbann/data_store/data_store_conduit.hpp" namespace lbann { class image_data_reader : public generic_data_reader { @@ -54,7 +53,7 @@ class image_data_reader : public generic_data_reader { // dataset specific functions void load() override; - void setup(int num_io_threads, std::shared_ptr io_thread_pool) override; + void setup(int num_io_threads, observer_ptr io_thread_pool) override; int get_num_labels() const override { return m_num_labels; @@ -79,11 +78,6 @@ class image_data_reader : public generic_data_reader { return {m_image_num_channels, m_image_height, m_image_width}; } - void save_image(Mat& pixels, const std::string filename, bool do_scale = true) override { - internal_save_image(pixels, filename, m_image_height, m_image_width, - m_image_num_channels, do_scale); - } - /// Return the sample list of current minibatch std::vector get_image_list_of_current_mb() const; @@ -100,13 +94,18 @@ class image_data_reader : public generic_data_reader { return m_image_list.at(idx); } + void do_preload_data_store() override; + + void load_conduit_node_from_file(int data_id, conduit::Node &node); + protected: + void copy_members(const image_data_reader &rhs); + /// Set the default values for the width, the height, the number of channels, and the number of labels of an image virtual void set_defaults(); bool fetch_label(Mat& Y, int data_id, int 
mb_idx) override; void set_linearized_image_size(); - protected: std::string m_image_dir; ///< where images are stored std::vector m_image_list; ///< list of image files and labels int m_image_width; ///< image width @@ -114,7 +113,9 @@ class image_data_reader : public generic_data_reader { int m_image_num_channels; ///< number of image channels int m_image_linearized_size; ///< linearized image size int m_num_labels; ///< number of labels - std::vector m_thread_cv_buffer; + + bool load_conduit_nodes_from_file(const std::unordered_set &data_ids); + }; } // namespace lbann diff --git a/include/lbann/data_readers/data_reader_imagenet.hpp b/include/lbann/data_readers/data_reader_imagenet.hpp index 4d6484e24c4..7f226f965de 100644 --- a/include/lbann/data_readers/data_reader_imagenet.hpp +++ b/include/lbann/data_readers/data_reader_imagenet.hpp @@ -30,35 +30,25 @@ #define LBANN_DATA_READER_IMAGENET_HPP #include "data_reader_image.hpp" -#include "cv_process.hpp" namespace lbann { class imagenet_reader : public image_data_reader { public: - imagenet_reader(bool shuffle) = delete; - imagenet_reader(const std::shared_ptr& pp, bool shuffle = true); - imagenet_reader(const imagenet_reader&); - imagenet_reader& operator=(const imagenet_reader&); + imagenet_reader(bool shuffle = true); + imagenet_reader(const imagenet_reader&) = default; + imagenet_reader& operator=(const imagenet_reader&) = default; ~imagenet_reader() override; imagenet_reader* copy() const override { return new imagenet_reader(*this); } - void setup(int num_io_threads, std::shared_ptr io_thread_pool) override; - std::string get_type() const override { return "imagenet_reader"; } protected: void set_defaults() override; - virtual bool replicate_processor(const cv_process& pp, const int nthreads); virtual CPUMat create_datum_view(CPUMat& X, const int mb_idx) const; bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; - - protected: - /// preprocessor duplicated for each omp thread - std::vector > m_pps; - std::unique_ptr m_master_pps; }; } // namespace lbann diff --git a/include/lbann/data_readers/data_reader_imagenet_patches.hpp b/include/lbann/data_readers/data_reader_imagenet_patches.hpp deleted file mode 100644 index 49539429fab..00000000000 --- a/include/lbann/data_readers/data_reader_imagenet_patches.hpp +++ /dev/null @@ -1,74 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
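For reference, the binary CIFAR distributions the CIFAR-10/100 reader above expects use fixed-size records: one label byte (CIFAR-10) or two label bytes, coarse then fine (CIFAR-100), followed by 3072 channel-major pixel bytes (1024 red, 1024 green, 1024 blue for a 32x32 image). A minimal sketch of reading one record; this is illustrative, not the LBANN implementation.

#include <cstdint>
#include <fstream>
#include <vector>

// Read one CIFAR record; label_bytes is 1 for CIFAR-10, 2 for CIFAR-100.
bool read_cifar_record(std::ifstream& in, int label_bytes,
                       std::uint8_t& fine_label,
                       std::vector<std::uint8_t>& pixels) {
  std::vector<std::uint8_t> labels(label_bytes);
  if (!in.read(reinterpret_cast<char*>(labels.data()), label_bytes)) {
    return false; // end of file or short read
  }
  fine_label = labels.back(); // CIFAR-100 stores the coarse label first
  pixels.resize(3 * 32 * 32);
  return static_cast<bool>(
      in.read(reinterpret_cast<char*>(pixels.data()), pixels.size()));
}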
-// -// lbann_data_reader_imagenet_patches .hpp .cpp - extract patches from ImageNet dataset -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_DATA_READER_IMAGENET_PATCHES_HPP -#define LBANN_DATA_READER_IMAGENET_PATCHES_HPP - -#include "data_reader_image.hpp" -#include "cv_process_patches.hpp" - -namespace lbann { -class imagenet_reader_patches : public image_data_reader { - public: - imagenet_reader_patches(bool shuffle) = delete; - imagenet_reader_patches(const std::shared_ptr& pp, bool shuffle = true); - imagenet_reader_patches(const imagenet_reader_patches&); - imagenet_reader_patches& operator=(const imagenet_reader_patches&); - ~imagenet_reader_patches() override; - - imagenet_reader_patches* copy() const override { return new imagenet_reader_patches(*this); } - - void setup(int num_io_threads, std::shared_ptr io_thread_pool) override; - - std::string get_type() const override { - return "imagenet_reader_patches"; - } - - int get_linearized_data_size() const override { - return m_image_linearized_size * m_num_patches; - } - const std::vector get_data_dims() const override { - return {m_num_patches*m_image_num_channels, m_image_height, m_image_width}; - } - - protected: - void set_defaults() override; - virtual bool replicate_processor(const cv_process_patches& pp, const int nthreads); - virtual std::vector create_datum_views(CPUMat& X, const int mb_idx) const; - bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; - - protected: - int m_num_patches; ///< number of patches extracted - /// preprocessor for patches duplicated for each omp thread - std::vector > m_pps; - std::unique_ptr m_master_pps; -}; - -} // namespace lbann - -#endif // LBANN_DATA_READER_IMAGENET_PATCHES_HPP diff --git a/include/lbann/data_readers/data_reader_jag.hpp b/include/lbann/data_readers/data_reader_jag.hpp deleted file mode 100644 index c10daf0c9de..00000000000 --- a/include/lbann/data_readers/data_reader_jag.hpp +++ /dev/null @@ -1,231 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
-//////////////////////////////////////////////////////////////////////////////// - -#ifndef _DATA_READER_JAG_HPP_ -#define _DATA_READER_JAG_HPP_ - -#include "cnpy.h" -#include -#include -#include "lbann/base.hpp" -#include "lbann/data_readers/opencv.hpp" -#include "data_reader.hpp" - -namespace lbann { - -/** - * Loads the pairs of JAG simulation inputs and results - */ -class data_reader_jag : public generic_data_reader { - public: - using data_t = double; - using scalar_t = double; - using input_t = double; - - /** - * Dependent/indepdendent variable types - * - JAG_Image: simulation output images - * - JAG_Scalar: simulation output scalars - * - JAG_Input: simulation input parameters - * - Undefined: the default - */ - enum variable_t {Undefined = 0, JAG_Image, JAG_Scalar, JAG_Input}; - - data_reader_jag(bool shuffle = true); - // TODO: copy constructor and assignment operator for deep-copying if needed - // The cnpy structure relies on shared_ptr - data_reader_jag(const data_reader_jag&) = default; - data_reader_jag& operator=(const data_reader_jag&) = default; - ~data_reader_jag() override; - data_reader_jag* copy() const override { return new data_reader_jag(*this); } - - std::string get_type() const override { - return "data_reader_jag"; - } - - /// Choose which data to use for independent variable - void set_independent_variable_type(const std::vector< std::vector >& independent); - /// Choose which data to use for dependent variable - void set_dependent_variable_type(const std::vector< std::vector >& dependent); - - /// Tell which data to use for independent variable - std::vector get_independent_variable_type() const; - /// Tell which data to use for dependent variable - std::vector get_dependent_variable_type() const; - - /// Set normalization mode: 0 = none, 1 = dataset-wise, 2 = image-wise - void set_normalization_mode(int mode); - - /// Set the image dimension - void set_image_dims(const int width, const int height); - - /// Load data and do data reader's chores. 
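A small usage sketch of the removed data_reader_jag configuration API declared above. The particular grouping of images and scalars as independent variables and simulation inputs as dependent variables is illustrative only, as are the dimensions.

// Sketch against the former lbann/data_readers/data_reader_jag.hpp interface.
void configure_jag_reader_example() {
  lbann::data_reader_jag reader(/*shuffle=*/true);
  using vt = lbann::data_reader_jag::variable_t;
  reader.set_independent_variable_type({{vt::JAG_Image}, {vt::JAG_Scalar}});
  reader.set_dependent_variable_type({{vt::JAG_Input}});
  reader.set_normalization_mode(2); // 0 = none, 1 = dataset-wise, 2 = image-wise
  reader.set_image_dims(64, 64);    // illustrative image width and height
  reader.load();
}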
- void load() override; - - /// Show the description - std::string get_description() const; - - /// Return the number of samples - size_t get_num_samples() const; - - /// Return the linearized size of an image - size_t get_linearized_image_size() const; - /// Return the linearized size of scalar outputs - size_t get_linearized_scalar_size() const; - /// Return the linearized size of inputs - size_t get_linearized_input_size() const; - - int get_linearized_data_size() const override; - int get_linearized_response_size() const override; - std::vector get_linearized_data_sizes() const; - std::vector get_linearized_response_sizes() const; - const std::vector get_data_dims() const override; - - /// Return the pointer to the raw image data - data_t* get_image_ptr(const size_t i) const; - /// Return the image data as a 1-D vector of lbann::DataType - cv::Mat get_image(const size_t i) const; - - /// Return the pointer to the raw scalar data - scalar_t* get_scalar_ptr(const size_t i) const; - /// Return the scalar values of the i-th sample - std::vector get_scalar(const size_t i) const; - - /// Return the pointer to the raw input data - input_t* get_input_ptr(const size_t i) const; - /// Return the input values of the simulation correspoding to the i-th sample - std::vector get_input(const size_t i) const; - - void save_image(Mat& pixels, const std::string filename, bool do_scale = true) override; - - protected: - /// add data type for independent variable - void add_independent_variable_type(const variable_t independent); - /// add data type for dependent variable - void add_dependent_variable_type(const variable_t dependent); - - /// check if type t is used in the independent variable - bool is_independent(const variable_t t) const; - /// check if type t is used in the dependent variable - bool is_dependent(const variable_t t) const; - /// check if type t is used in either the indepedent or the dependent variable - bool is_used(const variable_t t) const; - - using generic_data_reader::get_linearized_size; - /// Return the linearized size of a particular JAG variable type - size_t get_linearized_size(const variable_t t) const; - /// Return the dimension of a particular JAG variable type - const std::vector get_dims(const variable_t t) const; - - virtual std::vector - create_datum_views(CPUMat& X, const std::vector& sizes, const int mb_idx) const; - - bool fetch(CPUMat& X, int data_id, int mb_idx, - const data_reader_jag::variable_t vt, const std::string tag); - bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; - bool fetch_response(CPUMat& Y, int data_id, int mb_idx) override; - bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; - - /** - * Load the data in the numpy format file. - * Use only first_n available samples if specified. 
- */ - void load(const std::string image_file, const std::string scalar_file, - const std::string input_file, const size_t first_n = 0u); - - /// Check the dimensions of loaded data - bool check_data(size_t& num_samples) const; - - /** - * Normalize image data to [0 1] scale once after loading based on the mode - * 0 (none): no normalization - * 1 (dataset-wise): map the min/max of all the pixels in image data to 0/1 - * 2 (image-wise): map the min/max of all the pixels in current image to 0/1 - */ - void normalize_image(); - - /// Set the linearized size of an image - void set_linearized_image_size(); - /// Set the linearized size of scalar outputs - void set_linearized_scalar_size(); - /// Return the linearized size of inputs - void set_linearized_input_size(); - - int get_num_labels() const override { - return m_num_labels; - } - - int get_linearized_label_size() const override { - return m_num_labels; - } - /// Return the maximum element of all the images - data_t get_image_max() const; - /// Return the minimum element of all the images - data_t get_image_min() const; - - protected: - /// independent variable type - std::vector m_independent; - /// dependent variable type - std::vector m_dependent; - - /// Whether image output data have been loaded - bool m_image_loaded; - /// Whether scalar output data have been loaded - bool m_scalar_loaded; - /// Whether simulation input data have been loaded - bool m_input_loaded; - - /// The number of samples - size_t m_num_samples; - /// The linearized size of an image - size_t m_linearized_image_size; - /// The linearized size of scalar outputs - size_t m_linearized_scalar_size; - /// The linearized size of inputs - size_t m_linearized_input_size; - - /// image normalization mode - int m_image_normalization; - int m_image_width; ///< image width - int m_image_height; ///< image height - - /// List of jag output images - cnpy::NpyArray m_images; - /// List of jag scalar outputs - cnpy::NpyArray m_scalars; - /// List of jag input - cnpy::NpyArray m_inputs; - - /// The smallest pixel value in image data (useful for normalization or visualization) - data_t m_img_min; - /// The largest pixel value in image data (useful for normalization or visualization) - data_t m_img_max; - int m_num_labels; ///< number of labels -}; - -} // end of namespace lbann -#endif // _DATA_READER_JAG_HPP_ diff --git a/include/lbann/data_readers/data_reader_jag_conduit.hpp b/include/lbann/data_readers/data_reader_jag_conduit.hpp index 0938fa79438..be53df9aced 100644 --- a/include/lbann/data_readers/data_reader_jag_conduit.hpp +++ b/include/lbann/data_readers/data_reader_jag_conduit.hpp @@ -27,20 +27,23 @@ #ifndef _DATA_READER_JAG_CONDUIT_HPP_ #define _DATA_READER_JAG_CONDUIT_HPP_ -#include "lbann_config.hpp" // may define LBANN_HAS_CONDUIT +#include "lbann_config.hpp" -#ifdef LBANN_HAS_CONDUIT -#include "lbann/data_readers/opencv.hpp" #include "data_reader.hpp" #include "conduit/conduit.hpp" #include "hdf5.h" -#include "lbann/data_readers/cv_process.hpp" #include #include #include #include #include -#include "lbann/data_readers/sample_list_jag.hpp" + +//#define _USE_IO_HANDLE_ +#ifdef _USE_IO_HANDLE_ +#include "lbann/data_readers/sample_list_conduit_io_handle.hpp" +#else +#include "lbann/data_readers/sample_list_hdf5.hpp" +#endif namespace lbann { @@ -58,8 +61,16 @@ class data_reader_jag_conduit : public generic_data_reader { /// Type for the pair of the key string of a sample and the handle of the file that contains it using sample_locator_t = std::pair; using sample_map_t 
= std::vector; ///< valid sample map type - using sample_t = sample_list_jag::sample_t; - using sample_file_id_t = sample_list_jag::sample_file_id_t; + using sample_name_t = std::string; +#ifdef _USE_IO_HANDLE_ + using sample_list_t = sample_list_conduit_io_handle; +#else + using sample_list_t = sample_list_hdf5; +#endif + using file_handle_t = sample_list_t::file_handle_t; + using sample_file_id_t = sample_list_t::sample_file_id_t; + using sample_t = std::pair; + //using sample_t = sample_list_t::sample_t; /// linear transform on X defined as: first * X + second => X' using linear_transform_t = std::pair; @@ -76,15 +87,13 @@ class data_reader_jag_conduit : public generic_data_reader { /// Type to define a prefix string and the minimum length requirement to filter out a key using prefix_t = std::pair; - data_reader_jag_conduit(bool shuffle = true) = delete; - data_reader_jag_conduit(const std::shared_ptr& pp, bool shuffle = true); + data_reader_jag_conduit(bool shuffle = true); data_reader_jag_conduit(const data_reader_jag_conduit&); - data_reader_jag_conduit(const data_reader_jag_conduit&, const std::vector& ds_sample_move_list); data_reader_jag_conduit& operator=(const data_reader_jag_conduit&); ~data_reader_jag_conduit() override; data_reader_jag_conduit* copy() const override { return new data_reader_jag_conduit(*this); } - void setup(int num_io_threads, std::shared_ptr io_thread_pool) override; + void setup(int num_io_threads, observer_ptr io_thread_pool) override; std::string get_type() const override { return "data_reader_jag_conduit"; @@ -165,8 +174,8 @@ class data_reader_jag_conduit : public generic_data_reader { /// Set every reader instances in a model to have an independent index list void set_list_per_model(bool flag) { m_list_per_model = flag; }; - bool has_list_per_model() const { return m_list_per_model; } - bool has_list_per_trainer() const { return m_list_per_trainer; } + bool has_list_per_model() const override { return m_list_per_model; } + bool has_list_per_trainer() const override { return m_list_per_trainer; } /// Fetch data of a mini-batch or reuse it from the cache of the leading reader @@ -199,16 +208,14 @@ class data_reader_jag_conduit : public generic_data_reader { /// Return the dimension of data const std::vector get_data_dims() const override; - /// Return the slice points for linearized independent variables - std::vector get_slice_points_independent() const; - /// Return the slice points for linearized dependent variables - std::vector get_slice_points_dependent() const; - int get_num_data() const override; int get_num_labels() const override; int get_linearized_label_size() const override; int get_linearized_size(const std::string& desc) const override; + std::vector get_slice_points(const slice_points_mode var_category, + bool& is_supported) override; + void set_split_image_channels(); void unset_split_image_channels(); bool check_split_image_channels() const; @@ -216,15 +223,6 @@ class data_reader_jag_conduit : public generic_data_reader { /// Show the description std::string get_description() const; - /// Return the image simulation output of the i-th sample - std::vector get_cv_images(const size_t i, conduit::Node& sample) const; - - /** - * Return the images of the i-th sample as an 1-D vector of lbann::DataType - * There is one image per view, each of which is taken at closest to the bang time. 
- */ - std::vector get_images(const size_t i, conduit::Node& sample) const; - /// Return the scalar simulation output data of the i-th sample std::vector get_scalars(const size_t i, conduit::Node& sample) const; @@ -234,13 +232,8 @@ class data_reader_jag_conduit : public generic_data_reader { template static size_t add_val(const std::string key, const conduit::Node& n, std::vector& vals); - void save_image(Mat& pixels, const std::string filename, bool do_scale = true) override; - void setup_data_store(int mini_batch_size); - /// A untiliy function to convert the pointer to image data into an opencv image - static cv::Mat cast_to_cvMat(const std::pair img, - const int height, const int num_ch=1); /// A utility function to convert a JAG variable type to name string static std::string to_string(const variable_t t); @@ -259,11 +252,10 @@ class data_reader_jag_conduit : public generic_data_reader { /// once the sample_list class and file formats are generalized and /// finalized, it should (may?) be possible to code a single /// preload_data_store method. - void preload_data_store() override; + void do_preload_data_store() override; virtual void set_defaults(); - virtual bool replicate_processor(const cv_process& pp, const int nthreads); - virtual void copy_members(const data_reader_jag_conduit& rhs, const std::vector& ds_sample_move_list = std::vector()); + virtual void copy_members(const data_reader_jag_conduit& rhs); /// add data type for independent variable void add_independent_variable_type(const variable_t independent); @@ -280,7 +272,12 @@ class data_reader_jag_conduit : public generic_data_reader { /// Return the dimension of a particular JAG variable type const std::vector get_dims(const variable_t t) const; /// Return the slice points for linearized data or responses - std::vector get_slice_points(const std::vector< std::vector >& var) const; + std::vector get_slice_points_impl(const std::vector< std::vector >& var) const; + /// Return the slice points for linearized independent variables + std::vector get_slice_points_independent() const; + /// Return the slice points for linearized dependent variables + std::vector get_slice_points_dependent() const; + /// A utility function to make a string to show all the variable types static std::string to_string(const std::vector& vec); /// A utility function to make a string to show all the groups of variable types @@ -349,6 +346,9 @@ class data_reader_jag_conduit : public generic_data_reader { */ static bool check_non_numeric(const std::string key); + bool has_path(const file_handle_t& h, const std::string& path) const; + void read_node(const file_handle_t& h, const std::string& path, conduit::Node& n) const; + /// Allow const access to the conduit data structure static const conduit::Node& get_conduit_node(const conduit::Node& n_base, const std::string key); /** Load the conduit node with the data of the sample i identified by key @@ -361,14 +361,14 @@ class data_reader_jag_conduit : public generic_data_reader { bool has_conduit_path(const size_t i, const std::string& key) const; /// Obtain image data - std::vector< std::vector > get_image_data(const size_t i, conduit::Node& sample) const; + std::vector< std::vector > get_image_data(const size_t i, conduit::Node& sample) const; - bool data_store_active() const { + bool data_store_active() const override { bool flag = generic_data_reader::data_store_active(); return (m_data_store != nullptr && flag); } - bool priming_data_store() const { + bool priming_data_store() const override { bool flag 
= generic_data_reader::priming_data_store(); return (m_data_store != nullptr && flag); } @@ -410,10 +410,6 @@ class data_reader_jag_conduit : public generic_data_reader { /// Keys to select a set of simulation input parameters to use. By default, use all. std::vector m_input_keys; - /// preprocessor duplicated for each omp thread - std::vector > m_pps; - std::unique_ptr m_master_pps; - /** * Set of keys that are associated with non_numerical values. * Such a variable requires a specific method for mapping to a numeric value. @@ -467,14 +463,11 @@ class data_reader_jag_conduit : public generic_data_reader { std::vector m_input_normalization_params; typedef std::pair conduit_sample; - sample_list_jag m_sample_list; + sample_list_t m_sample_list; bool m_list_per_trainer; bool m_list_per_model; - /** temporary image normalization - * The inputs are the image to normalize, the image source id and the channel id. - */ - void image_normalization(cv::Mat& img, size_t i, size_t ch) const; + void preload_helper(const hid_t& h, const std::string &sample_name, const std::string &field_name, int data_id, conduit::Node &node); }; /** @@ -602,5 +595,4 @@ inline size_t data_reader_jag_conduit::add_val(const std::string key, const cond } } // end of namespace lbann -#endif // LBANN_HAS_CONDUIT #endif // _DATA_READER_JAG_CONDUIT_HPP_ diff --git a/include/lbann/data_readers/data_reader_mnist.hpp b/include/lbann/data_readers/data_reader_mnist.hpp index 2d3b30e0ed6..ebd8df8ec27 100644 --- a/include/lbann/data_readers/data_reader_mnist.hpp +++ b/include/lbann/data_readers/data_reader_mnist.hpp @@ -30,7 +30,6 @@ #define LBANN_DATA_READER_MNIST_HPP #include "data_reader_image.hpp" -#include "image_preprocessor.hpp" namespace lbann { diff --git a/include/lbann/data_readers/data_reader_mnist_siamese.hpp b/include/lbann/data_readers/data_reader_mnist_siamese.hpp deleted file mode 100644 index 4536e3cebad..00000000000 --- a/include/lbann/data_readers/data_reader_mnist_siamese.hpp +++ /dev/null @@ -1,126 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
-// -// data_reader_mnist_siamese .hpp .cpp - data reader class for mnist dataset -// employing two images per sample to feed siamese model -//////////////////////////////////////////////////////////////////////////////// - -#ifndef DATA_READER_MNIST_SIAMESE_HPP -#define DATA_READER_MNIST_SIAMESE_HPP - -#include "data_reader_multi_images.hpp" -#include "cv_process.hpp" -#include -#include -#include -#include - -namespace lbann { - -/** - * With MNIST dataset, there is no individual image file. All the images or - * labels are packed into a single binary file respectively. This reader - * pre-loads all the data into memory as minist_reader does. - * However, to feed a siamese model, this reader randomly chooses the paired - * input on-line. It maintains another data index list, 'm_shuffled_indices2'. - * It first copies the primary list maintined by the base class to the secondary - * list, and shuffles the secondary whenever the primary gets shuffled via the - * overridden shuffle_indices() method. - */ -class data_reader_mnist_siamese : public data_reader_multi_images { - public: - using label_t = unsigned char; - using sample_t = std::pair; - - data_reader_mnist_siamese(const std::shared_ptr& pp, bool shuffle = true); - data_reader_mnist_siamese(const data_reader_mnist_siamese&); - data_reader_mnist_siamese& operator=(const data_reader_mnist_siamese&); - ~data_reader_mnist_siamese() override; - - data_reader_mnist_siamese* copy() const override { - return new data_reader_mnist_siamese(*this); - } - - std::string get_type() const override { - return "data_reader_mnist_siamese"; - } - - /** Set up MNIST dataset-specific input parameters, which are pre-defined - * and also set as the default. This does not change the setup, but only - * preserves the default. - */ - void set_input_params(const int, const int, const int, const int) override; - - // dataset specific functions - void load() override; - - /// Fetch this mini-batch's samples into X by calling the new overloaded fetch_datum() - int fetch_data(CPUMat& X, El::Matrix& indices_fetched) override; - /// Fetch this mini-batch's labels into Y by calling the new overloaded fetch_label() - int fetch_labels(CPUMat& Y) override; - - protected: - /** - * Set the default configuration such as the width, height, and number of - * channels of the image sample. - */ - void set_defaults() override; - - // unused virtual interfaces replaced by the new interfaces that taks a pair - // of indices to sample list. - using data_reader_multi_images::fetch_datum; - using data_reader_multi_images::fetch_label; - bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; - bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; - - /** - * Fetch two data items identified by the pair of indices to the pre-loaded data list, - * and put them into the column mb_idx of matrix x. - */ - virtual bool fetch_datum(CPUMat& X, std::pair data_id, int mb_idx); - /** - * Take a pair of indices to the preloaded sample list, and compare the labels - * of the corresponding samples. Store 1 if equal or 0 at the column mb_idx of - * the given matrix Y. - */ - virtual bool fetch_label(CPUMat& Y, std::pair data_id, int mb_idx); - - /** - * Shuffle the second index list added in this class as well as the one in the - * base class whenever the latter gets shuffled. 
- */ - void shuffle_indices() override; - - protected: - using generic_data_reader::m_shuffled_indices; - /// To randomly choose the siamese pair input online - std::vector m_shuffled_indices2; - /// Store the preloaded data - std::vector> m_image_data; -}; - -} // namespace lbann - -#endif // DATA_READER_MNIST_SIAMESE_HPP diff --git a/include/lbann/data_readers/data_reader_moving_mnist.hpp b/include/lbann/data_readers/data_reader_moving_mnist.hpp deleted file mode 100644 index 034bca57880..00000000000 --- a/include/lbann/data_readers/data_reader_moving_mnist.hpp +++ /dev/null @@ -1,86 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_DATA_READER_MOVING_MNIST_HPP -#define LBANN_DATA_READER_MOVING_MNIST_HPP - -#include "data_reader.hpp" - -namespace lbann { - -class moving_mnist_reader : public generic_data_reader { -public: - moving_mnist_reader(El::Int num_frames, - El::Int image_height, - El::Int image_width, - El::Int num_objects); - moving_mnist_reader(const moving_mnist_reader&) = default; - moving_mnist_reader& operator=(const moving_mnist_reader&) = default; - ~moving_mnist_reader() override = default; - moving_mnist_reader* copy() const override { return new moving_mnist_reader(*this); } - - std::string get_type() const override { - return "moving_mnist_reader"; - } - - void load() override; - - const std::vector get_data_dims() const override; - int get_num_labels() const override; - int get_linearized_data_size() const override; - int get_linearized_label_size() const override; - -protected: - bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; - bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; - -private: - - /** Number of frames. */ - El::Int m_num_frames; - /** Frame height. */ - El::Int m_image_height; - /** Frame width. */ - El::Int m_image_width; - /** Number of MNIST digits in each frame. */ - El::Int m_num_objects; - - /** Number of MNIST samples. */ - El::Int m_num_raw_images = 0; - /** MNIST image height. */ - El::Int m_raw_image_height = 0; - /** MNIST image width. */ - El::Int m_raw_image_width = 0; - /** Raw MNIST image data. */ - std::vector m_raw_image_data; - /** Raw MNIST label data. 
*/ - std::vector m_raw_label_data; - -}; - -} // namespace lbann - -#endif // LBANN_DATA_READER_MOVING_MNIST_HPP diff --git a/include/lbann/data_readers/data_reader_multi_images.hpp b/include/lbann/data_readers/data_reader_multi_images.hpp deleted file mode 100644 index 93a2959bd7d..00000000000 --- a/include/lbann/data_readers/data_reader_multi_images.hpp +++ /dev/null @@ -1,117 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// data_reader_multi_images .hpp .cpp - generic data reader class for datasets -// employing multiple images per sample -//////////////////////////////////////////////////////////////////////////////// - -#ifndef DATA_READER_MULTI_IMAGES_HPP -#define DATA_READER_MULTI_IMAGES_HPP - -#include "data_reader_imagenet.hpp" -#include "cv_process.hpp" -#include -#include -#include -#include - -namespace lbann { -class data_reader_multi_images : public imagenet_reader { - public: - using img_src_t = std::vector; - using sample_t = std::pair; - - data_reader_multi_images(bool shuffle) = delete; - data_reader_multi_images(const std::shared_ptr& pp, bool shuffle = true); - data_reader_multi_images(const data_reader_multi_images&); - data_reader_multi_images& operator=(const data_reader_multi_images&); - ~data_reader_multi_images() override; - - data_reader_multi_images* copy() const override { - return new data_reader_multi_images(*this); - } - - std::string get_type() const override { - return "data_reader_multi_images"; - } - - /** Set up imagenet specific input parameters - * If argument is set to 0, then this method does not change the value of - * the corresponding parameter. However, width and height can only be both - * zero or both non-zero. 
- */ - void set_input_params(const int width, const int height, const int num_ch, - const int num_labels, const int num_img_srcs); - - void set_input_params(const int width, const int height, const int num_ch, - const int num_labels) override; - - // dataset specific functions - void load() override; - - int get_linearized_data_size() const override { - return m_image_linearized_size * m_num_img_srcs; - } - const std::vector get_data_dims() const override { - return {static_cast(m_num_img_srcs)*m_image_num_channels, m_image_height, m_image_width}; - } - - /// Return the sample list of current minibatch - std::vector get_image_list_of_current_mb() const; - - /// Allow read-only access to the entire sample list - const std::vector& get_image_list() const { - return m_image_list; - } - - sample_t get_sample(size_t idx) const { - return m_image_list.at(idx); - } - - /// The number of image sources or the number of siamese heads. e.g., 2; - /// this method is added to support data_store functionality - unsigned int get_num_img_srcs() const { - return m_num_img_srcs; - } - - protected: - void set_defaults() override; - virtual std::vector create_datum_views(CPUMat& X, const int mb_idx) const; - bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; - bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; - - bool read_text_stream(std::istream& text_stream, std::vector& list); - bool load_list(const std::string file_name, std::vector& list, - const bool fetch_list_at_once = false); - - protected: - std::vector m_image_list; ///< list of image files and labels - /// The number of image sources or the number of siamese heads. e.g., 2 - unsigned int m_num_img_srcs; -}; - -} // namespace lbann - -#endif // DATA_READER_MULTI_IMAGES_HPP diff --git a/include/lbann/data_readers/data_reader_multihead_siamese.hpp b/include/lbann/data_readers/data_reader_multihead_siamese.hpp deleted file mode 100644 index dc95f3cb7e8..00000000000 --- a/include/lbann/data_readers/data_reader_multihead_siamese.hpp +++ /dev/null @@ -1,94 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// data_reader_multihead_siamese .hpp .cpp - data reader to use m patches -// generated offline. 
-//////////////////////////////////////////////////////////////////////////////// - -#ifndef DATA_READER_MULTIHEAD_SIAMESE_HPP -#define DATA_READER_MULTIHEAD_SIAMESE_HPP - -#include "data_reader_multi_images.hpp" -#include "cv_process.hpp" -#include "offline_patches_npz.hpp" -#include -#include -#include -#include - -namespace lbann { -class data_reader_multihead_siamese : public data_reader_multi_images { - public: - using label_t = offline_patches_npz::label_t; - using sample_t = offline_patches_npz::sample_t; - - data_reader_multihead_siamese(const std::shared_ptr& pp, unsigned int nimages, bool shuffle = true); - data_reader_multihead_siamese(const std::shared_ptr& pp, bool shuffle = true); - - data_reader_multihead_siamese(const data_reader_multihead_siamese&); - data_reader_multihead_siamese& operator=(const data_reader_multihead_siamese&); - ~data_reader_multihead_siamese() override; - - data_reader_multihead_siamese* copy() const override { - return new data_reader_multihead_siamese(*this); - } - - std::string get_type() const override { - return "data_reader_multihead_siamese"; - } - - /** Set up imagenet specific input parameters - * If argument is set to 0, then this method does not change the value of - * the corresponding parameter. However, width and height can only be both - * zero or both non-zero. - */ - void set_input_params(const int width, const int height, const int num_ch, - const int num_labels) override; - - // dataset specific functions - void load() override; - - /// Return the sample list of current minibatch - std::vector get_image_list_of_current_mb() const; - - /// Allow read-only access to the entire sample list - std::vector get_image_list() const; - - sample_t get_sample(size_t idx) const { - return m_samples.get_sample(idx); - } - - protected: - void set_defaults() override; - bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; - bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; - - protected: - offline_patches_npz m_samples; -}; - -} // namespace lbann - -#endif // DATA_READER_MULTIHEAD_SIAMESE_HPP diff --git a/include/lbann/data_readers/data_reader_npz_ras_lipid.hpp b/include/lbann/data_readers/data_reader_npz_ras_lipid.hpp new file mode 100644 index 00000000000..1e691fbd5d8 --- /dev/null +++ b/include/lbann/data_readers/data_reader_npz_ras_lipid.hpp @@ -0,0 +1,180 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+// +//////////////////////////////////////////////////////////////////////////////// + + +#ifndef LBANN_DATA_READER_NPZ_RAS_LIPID_HPP +#define LBANN_DATA_READER_NPZ_RAS_LIPID_HPP + +#include "conduit/conduit.hpp" +#include "lbann/utils/options.hpp" +#include "lbann/data_readers/data_reader.hpp" +#include "conduit/conduit.hpp" +#include +#include + +namespace lbann { + /** + * Data reader for data stored in numpy (.npz) files that are encapsulated + * in conduit::Nodes + */ +class ras_lipid_conduit_data_reader : public generic_data_reader { + +public: + + ras_lipid_conduit_data_reader(const bool shuffle); + ras_lipid_conduit_data_reader(const ras_lipid_conduit_data_reader&); + ras_lipid_conduit_data_reader& operator=(const ras_lipid_conduit_data_reader&); + ~ras_lipid_conduit_data_reader() override {} + + ras_lipid_conduit_data_reader* copy() const override { return new ras_lipid_conduit_data_reader(*this); } + + std::string get_type() const override { + return "ras_lipid_conduit_data_reader"; + } + + void load() override; + + void set_num_labels(int n) { m_num_labels = n; } + + int get_linearized_data_size() const override { return m_seq_len*m_num_features; } + int get_linearized_label_size() const override { return m_seq_len*m_num_labels; } + int get_linearized_response_size() const override { return m_num_response_features; } + //const std::vector get_data_dims() const override { return m_data_dims; } + const std::vector get_data_dims() const override { return {get_linearized_data_size()}; } + int get_num_labels() const override { return m_seq_len*m_num_labels; } + +private: + + int m_num_features = 0; + int m_num_labels = 3; + int m_num_response_features = 0; + std::vector m_data_dims; + + /** @brief Total of train + validate samples */ + size_t m_num_global_samples; + size_t m_num_train_samples; + size_t m_num_validate_samples; + + /** the number of sequential samples that are combined into a multi-sample */ + int m_seq_len = 1; + + // owner map for multi-samples + std::unordered_map m_multi_sample_to_owner; + + std::unordered_map> m_filename_to_multi_sample; + //std::unordered_map> m_filename_to_multi_sample; + + std::unordered_map m_multi_sample_id_to_first_sample; + +// sample_list_t m_sample_list; + + /** @brief List of input npz filenames */ + std::vector m_filenames; + + /** @brief m_samples_per_file[j] contains the number of samples in the j-th file */ + std::vector m_samples_per_file; + + /** @brief Maps a data_id to the file index (in m_filenames) that + * contains the sample, and the offset in that file's npy array */ + std::unordered_map> m_data_id_map; + + /** @brief Maps a field name to the data's shape + * + * Example: "bbs" -> {184, 3} + */ + std::unordered_map> m_datum_shapes; + + /** @brief Maps a field name to the word size */ + std::unordered_map m_datum_word_sizes; + + /** @brief Maps a field name to the number of bytes in the datum + * + * Example: "bbs" -> 184*3*word_size + */ + std::unordered_map m_datum_num_bytes; + + /** @brief Maps a field name to the number of words in the datum */ + std::unordered_map m_datum_num_words; + + std::vector m_min; + std::vector m_max_min; + std::vector m_mean; + std::vector m_std_dev; + bool m_use_min_max; + bool m_use_z_score; + + //===================================================================== + // private methods follow + //===================================================================== + + /** @brief Contains common code for operator= and copy ctor */ + void copy_members(const ras_lipid_conduit_data_reader& 
rhs); + + void do_preload_data_store() override; + + bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; + bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; + bool fetch_response(CPUMat& Y, int data_id, int mb_idx) override; + + /** @brief Populates in m_datum_shapes, m_datum_num_bytes, m_datum_word_sizes */ + void fill_in_metadata(); + + /** @brief Re-build the data store's owner map + * + * This one-off, wouldn't need to do this if we were using sample lists. + */ + void rebuild_data_store_owner_map(); + + /** @brief Fills in m_samples_per_file */ + void get_samples_per_file(); + + /** @brief Write file sizes to disk + * + * Each line of the output file contains: filename num_samples + */ + void write_file_sizes(); + + /** @brief Read file that contains: filename num_samples + * + * see: write_file_sizes() + */ + void read_file_sizes(); + + void read_normalization_data(); + + /** Print some statistics to cout */ + void print_shapes_etc(); + + void load_the_next_sample(conduit::Node &node, int sample_index, std::map &data); + + void construct_multi_sample(std::vector &work, int data_id, conduit::Node &node); + +}; + +} // namespace lbann + +#endif //LBANN_DATA_READER_NPZ_RAS_LIPID_HPP diff --git a/include/lbann/data_readers/data_reader_numpy_npz_conduit.hpp b/include/lbann/data_readers/data_reader_numpy_npz_conduit.hpp index 7d7cd00bf93..57473224f9f 100644 --- a/include/lbann/data_readers/data_reader_numpy_npz_conduit.hpp +++ b/include/lbann/data_readers/data_reader_numpy_npz_conduit.hpp @@ -30,6 +30,7 @@ #define LBANN_DATA_READER_NUMPY_NPZ_CONDUIT_HPP #include "lbann/data_readers/data_reader.hpp" +#include "conduit/conduit.hpp" #include namespace lbann { @@ -37,7 +38,8 @@ namespace lbann { * Data reader for data stored in numpy (.npz) files that are encapsulated . * in conduit::Nodes */ - class numpy_npz_conduit_reader : public generic_data_reader { +class numpy_npz_conduit_reader : public generic_data_reader { + public: numpy_npz_conduit_reader(const bool shuffle); // These need to be explicit because of some issue with the cnpy copy @@ -73,7 +75,7 @@ namespace lbann { const std::vector get_data_dims() const override { return m_data_dims; } protected: - void preload_data_store(); + void do_preload_data_store() override; bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; @@ -105,7 +107,16 @@ namespace lbann { void fill_in_metadata(); std::vector m_filenames; - }; + + bool load_numpy_npz_from_file(const std::unordered_set &data_ids, std::unordered_set& label_classes); + + void load_conduit_node(const std::string filename, int data_id, conduit::Node &output, bool reset = true); + + std::unordered_map> m_npz_cache; + + void load_npz(const std::string filename, int data_id, conduit::Node &node); + +}; } // namespace lbann diff --git a/include/lbann/data_readers/data_reader_python.hpp b/include/lbann/data_readers/data_reader_python.hpp index 35264a18e7b..372e449d3b6 100644 --- a/include/lbann/data_readers/data_reader_python.hpp +++ b/include/lbann/data_readers/data_reader_python.hpp @@ -29,113 +29,18 @@ #include "data_reader.hpp" #ifdef LBANN_HAS_PYTHON -#include +#include "lbann/utils/python.hpp" namespace lbann { -namespace python { - -/** @brief Singleton class to manage embedded Python session. - * - * This is very experimental. Be warned. - */ -class manager { -public: - - /** @brief Get singleton instance. */ - static manager& get_instance(); - /** @brief Construct singleton instance. 
- * @details If there is already an instance, it is destroyed. - */ - static void create(); - /** Destroy singleton instance. */ - static void destroy(); - - /** @brief Check if a Python error has occurred. - * - * Throw an exception if an error is detected. - * - * @param force_error Whether to force an exception to be thrown. - */ - void check_error(bool force_error = false) const; - - ~manager(); - -private: - - /** @brief Singleton instance. */ - static std::unique_ptr m_instance; - - /** @brief State on main Python thread. */ - PyThreadState* m_thread_state = nullptr; - - // Lifetime functions - manager(); - manager(const manager&) = delete; - manager& operator=(const manager&) = delete; - -}; - -/** @brief RAII wrapper for Python GIL. - * - * The Python interpreter is not thread-safe, so it uses the "global - * interpreter lock" to ensure only one thread is executing at a - * time. Multithreading is achieved by periodically transferring - * control of the GIL between threads. This makes it hard to get - * meaningful speedups from simple multithreading. Certain - * operations, e.g. I/O and numerical kernels in NumPy, can be - * efficiently parallelized because they yield control of the GIL - * while working. - * - * This is very experimental. Be warned. - */ -class global_interpreter_lock { -public: - - global_interpreter_lock(const manager&); - ~global_interpreter_lock(); - -private: - - global_interpreter_lock(const global_interpreter_lock&) = delete; - global_interpreter_lock& operator=(const global_interpreter_lock&) = delete; - - PyGILState_STATE m_gil_state; - -}; - -/** @brief Convenience wrapper around @c PyObject pointer. - * - * This is very experimental. Be warned. - */ -class object { -public: - object(PyObject* obj = nullptr); - object(std::string val); - object(El::Int val); - object(DataType val); - object(const object& other); - object& operator=(const object& other); - object(object&& other); - object& operator=(object&& other); - ~object(); - inline PyObject* get() { return m_ptr; } - inline const PyObject* get() const { return m_ptr; } - inline operator PyObject*() { return get(); } - inline operator const PyObject*() const { return get(); } -private: - PyObject* m_ptr; -}; - -} // namespace python - class python_reader : public generic_data_reader { public: python_reader(std::string module, std::string module_dir, std::string sample_function, std::string num_samples_function, - std::string sample_dims_function); + std::string sample_dims_function, + bool shuffle); python_reader(const python_reader&) = default; python_reader& operator=(const python_reader&) = default; ~python_reader() override; @@ -150,22 +55,58 @@ class python_reader : public generic_data_reader { int get_linearized_data_size() const override; int get_linearized_label_size() const override; - void setup(int num_io_threads, std::shared_ptr io_thread_pool) override; + void setup(int num_io_threads, observer_ptr io_thread_pool) override; void load() override; protected: bool fetch_data_block(CPUMat& X, - El::Int thread_id, + El::Int block_offset, + El::Int block_stride, El::Int mb_size, El::Matrix& indices_fetched) override; bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; private: + + /** @brief Dimensions of data sample tensor. */ std::vector m_sample_dims; + /** @brief Number of data samples in data set. */ El::Int m_num_samples; + + /** @brief User-provided Python function to access data samples. + * + * The function is expected to take one integer argument for the + * sample index. 
It must return an iterator that defines the + * entries in a data sample. + */ python::object m_sample_function; + + /** @brief Wrapper function around sample access function. + * + * This function will be executed on worker processes (see @c + * m_process_pool). It will obtain a data sample from @c + * m_sample_function and copy it into a @c m_shared_memory_array. + */ + python::object m_sample_function_wrapper; + + /** @brief Pool of worker processes. + * + * From the Python @c multiprocessing module. + */ python::object m_process_pool; + /** @brief Shared memory array. + * + * @c RawArray from the Python @c multiprocessing module. + */ + python::object m_shared_memory_array; + + /** @brief Pointer into shared memory array. + * + * Points to buffer for @c m_shared_memory_array. + */ + DataType* m_shared_memory_array_ptr = nullptr; + }; } // namespace lbann diff --git a/include/lbann/data_readers/data_reader_smiles.hpp b/include/lbann/data_readers/data_reader_smiles.hpp new file mode 100644 index 00000000000..b820f6d9d37 --- /dev/null +++ b/include/lbann/data_readers/data_reader_smiles.hpp @@ -0,0 +1,151 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_DATA_READER_SMILES_HPP +#define LBANN_DATA_READER_SMILES_HPP + +#include "conduit/conduit.hpp" +#include "lbann/utils/options.hpp" +#include "lbann/data_readers/data_reader.hpp" + +namespace lbann { + /** + * Data reader for SMILES data that has been converted to an array + * of short ints and stored in binary format. + * Binary format is: (n_int, int (repeating n_int times) ) repeating + * last entry in the file is the only entry stored as an integer; it + * contains the number of samples. 
Second to last entry is the maximum + * number of ints in any sample; this is stored as a short int + */ +class smiles_data_reader : public generic_data_reader { + +public: + + smiles_data_reader(const bool shuffle); + smiles_data_reader(const smiles_data_reader&); + smiles_data_reader& operator=(const smiles_data_reader&); + ~smiles_data_reader() override; + + smiles_data_reader* copy() const override { return new smiles_data_reader(*this); } + + std::string get_type() const override { + return "smiles_data_reader"; + } + + void load() override; + + int get_linearized_data_size() const override { return m_linearized_data_size; } + int get_linearized_label_size() const override { return m_linearized_label_size; } + int get_linearized_response_size() const override { return m_linearized_response_size; } + const std::vector get_data_dims() const override { return {get_linearized_data_size()}; } + int get_num_labels() const override { return m_num_labels; } + + void set_sequence_length(int n) { m_linearized_data_size = n; } + int get_sequence_length() { return get_linearized_data_size(); } + +private: + + /// used for sanity checking in load() and do_preload(); + /// may eventually go away + int m_min_index = INT_MAX; + int m_max_index = 0; + + //==== start hack to make it work fast ==== + + // maps: sample_id to + std::unordered_map> m_sample_lookup; + + std::vector m_data; + + void get_sample(int sample_id, std::vector &sample_out); + + void setup_local_cache(); + + // to enable this feature, add '#define DEBUG_F' to data_reader_smiles.cpp; + // this is ONLY for testing/development; if enabled, each rank will encode + // all samples after loading, and prior to the first epoch + void test_encode(); + + char m_delimiter = '\0'; + + // CAUTION: line_number is same as sample_id, i.e, assumes a single + // data input file + int get_smiles_string_length(const std::string &line, int line_number); + + //==== end hack to make it work fast ==== + + int m_linearized_data_size = 0; + int m_linearized_label_size = 0; + int m_linearized_response_size = 0; + int m_num_labels = 0; + + // these may be changed when the vocab file is read + short m_pad = 420; + short m_unk = 421; + short m_bos = 422; + short m_eos = 423; + + bool m_has_header = true; + + std::unordered_map m_vocab; + std::unordered_map m_vocab_inv; + + std::mutex m_mutex; + + size_t m_missing_char_in_vocab_count = 0; + std::unordered_set m_missing_chars; + + //===================================================================== + // private methods follow + //===================================================================== + + void get_delimiter(); + + /// returns a lower bound on memory usage for dataset + size_t get_mem_usage() const; + + /** @brief Contains common code for operator= and copy ctor */ + void copy_members(const smiles_data_reader& rhs); + + void do_preload_data_store() override; + + bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; + bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; + bool fetch_response(CPUMat& Y, int data_id, int mb_idx) override; + + void print_statistics() const; + void load_vocab(); + int get_num_lines(std::string fn); + void construct_conduit_node(int data_id, const std::string &line, conduit::Node &node); + void encode_smiles(const char *smiles, short size, std::vector &data, int data_id); + void encode_smiles(const std::string &smiles, std::vector &data, int data_id); + void decode_smiles(const std::vector &data, std::string &out); +}; + +} // namespace lbann + +#endif 
//LBANN_DATA_READER_SMILES_HPP diff --git a/include/lbann/data_readers/data_reader_triplet.hpp b/include/lbann/data_readers/data_reader_triplet.hpp deleted file mode 100644 index a1ee9e07871..00000000000 --- a/include/lbann/data_readers/data_reader_triplet.hpp +++ /dev/null @@ -1,95 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// data_reader_triplet .hpp .cpp - data reader to use triplet patches -// generated offline. -// -// Depreciated and replaced by data_reader_multihead_siamese .hpp .cpp. -// Kept here just for reference. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef DATA_READER_TRIPLET_HPP -#define DATA_READER_TRIPLET_HPP - -#include "data_reader_multi_images.hpp" -#include "cv_process.hpp" -#include "offline_patches_npz.hpp" -#include -#include -#include -#include - -namespace lbann { -class data_reader_triplet : public data_reader_multi_images { - public: - using label_t = offline_patches_npz::label_t; - using sample_t = offline_patches_npz::sample_t; - - data_reader_triplet(const std::shared_ptr& pp, bool shuffle = true); - data_reader_triplet(const data_reader_triplet&); - data_reader_triplet& operator=(const data_reader_triplet&); - ~data_reader_triplet() override; - - data_reader_triplet* copy() const override { - return new data_reader_triplet(*this); - } - - std::string get_type() const override { - return "data_reader_triplet"; - } - - /** Set up imagenet specific input parameters - * If argument is set to 0, then this method does not change the value of - * the corresponding parameter. However, width and height can only be both - * zero or both non-zero. 
- */ - void set_input_params(const int width, const int height, const int num_ch, - const int num_labels) override; - - // dataset specific functions - void load() override; - - /// Return the sample list of current minibatch - std::vector get_image_list_of_current_mb() const; - - /// Allow read-only access to the entire sample list - std::vector get_image_list() const; - - sample_t get_sample(size_t idx) const { - return m_samples.get_sample(idx); - } - - protected: - void set_defaults() override; - bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; - bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; - - protected: - offline_patches_npz m_samples; -}; - -} // namespace lbann - -#endif // DATA_READER_TRIPLET_HPP diff --git a/include/lbann/data_readers/image_preprocessor.hpp b/include/lbann/data_readers/image_preprocessor.hpp deleted file mode 100644 index fb730e23bf1..00000000000 --- a/include/lbann/data_readers/image_preprocessor.hpp +++ /dev/null @@ -1,209 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// image_preprocessor.hpp - Preprocessing utilities for image inputs -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_IMAGE_PREPROCESSOR -#define LBANN_IMAGE_PREPROCESSOR - -#include "lbann_config.hpp" - -#ifdef LBANN_HAS_OPENCV -#include "lbann/data_readers/opencv.hpp" -#else -#error OpenCV required -#endif -#include "lbann/base.hpp" - -namespace lbann { - -/** - * Support class for preprocessing image inputs. - * Supports the following transforms: - * - Random horizontal and vertical flips - * - Random rotations - * - Random horizontal and vertical shifts - * - Random shearing - * - Standardize to 0 mean - * - Standardize to unit variance - * - Scale to the range [0, 1] - * - Normalize via z-score - */ -class lbann_image_preprocessor { - public: - lbann_image_preprocessor(); - lbann_image_preprocessor(const lbann_image_preprocessor&) = default; - lbann_image_preprocessor& operator=( - const lbann_image_preprocessor&) = default; - virtual ~lbann_image_preprocessor() {} - - /** Whether to do random horizontal flips. */ - void horizontal_flip(bool b) { - m_horizontal_flip = b; - } - /** Whether to do random vertical flips. */ - void vertical_flip(bool b) { - m_vertical_flip = b; - } - /** Do random rotations up to range degrees (0-180). 
*/ - void rotation(float range) { - m_rotation_range = range; - } - /** Do random horizontal shifts up to range (fraction of image width). */ - void horizontal_shift(float range) { - m_horizontal_shift = range; - } - /** Do random vertical shifts up to range (fraction of image height). */ - void vertical_shift(float range) { - m_vertical_shift = range; - } - /** Do random shears up to range (radians). */ - void shear_range(float range) { - m_shear_range = range; - } - /** Whether to subtract the sample-wise mean. */ - void subtract_mean(bool b) { - m_mean_subtraction = b; - } - /** Whether to normalize to unit variance, sample-wise. */ - void unit_variance(bool b) { - m_unit_variance = b; - } - /** Whether to scale to [0, 1] (assumes max value is 255). */ - void scale(bool b) { - m_scale = b; - } - /** - * Whether to normalize by z-scores, sample-wise. - * This and mean subtraction/unit variance are mutually exclusive. - */ - void z_score(bool b) { - m_z_score = b; - } - /** Disable all data augmentation. */ - void disable_augmentation() { - horizontal_flip(false); - vertical_flip(false); - rotation(0.0f); - horizontal_shift(0.0f); - vertical_shift(0.0f); - shear_range(0.0f); - } - - /** - * Add noise to data (disable by default) - * noise_factor control the ammount of noise - * to be set to a value above zero but less than 1 (say 0.5) - * */ - void add_noise(float noise_factor=0.0f) { - m_noise_factor = noise_factor; - } - - /** - * Preprocess pixels according to the currently-set augmentation transforms. - * @param pixels The pixels to process as a column vector (num x 1 mat). - * @param imheight Height of the image in pixels. - * @param imwidth Width of the image in pixels. - * @param num_channels The number of channels pixels has. - */ - void augment(Mat& pixels, unsigned imheight, unsigned imwidth, - unsigned num_channels); - /** - * Normalize poxels according to the currently-set transforms. - * @param pixels The pixels to process as a column vector. - * @param num_channels The number of channels pixels has. - */ - void normalize(Mat& pixels, unsigned num_channels); - - /** - * External interface to saving an image. - * Classes that want to support this should use it to interface with - * internal_save_image. - * @param pixels The image to save (as a column vector). - * @param filename The image filename (type inferred from extension). - * @param do_scale Whether pixels has been scaled (default true). - */ - virtual void save_image(Mat& pixels, const std::string filename, - bool do_scale = true) {} - - protected: - /** Whether to do horizontal flips. */ - bool m_horizontal_flip; - /** Whether to do vertical flips. */ - bool m_vertical_flip; - /** Range in degrees for rotations (0-180). */ - float m_rotation_range; - /** Range (fraction of total width) for horizontal shifts. */ - float m_horizontal_shift; - /** Range (fraction of total height) for vertical shifts. */ - float m_vertical_shift; - /** Shear angle (radians). */ - float m_shear_range; - /** Whether to normalize to 0 mean. */ - bool m_mean_subtraction; - /** Whether to normalize to unit variance. */ - bool m_unit_variance; - /** Whether to scale to [0, 1]. */ - bool m_scale; - /** Whether to normalize via z-score. 
*/ - bool m_z_score; - - float m_noise_factor; - - void mean_subtraction(Mat& pixels, unsigned num_channels); - void unit_variance(Mat& pixels, unsigned num_channels); - void unit_scale(Mat& pixels, unsigned num_channels); - void z_score(Mat& pixels, unsigned num_channels); - - void pixel_noise(Mat& pixels); - - /** - * Convert a column vector of pixels to an OpenCV matrix. - */ - cv::Mat cv_pixels(const Mat& pixels, unsigned imheight, unsigned imwidth, - unsigned num_channels); - /** Undo cv_pixels. */ - void col_pixels(const cv::Mat& sqpixels, Mat& pixels, unsigned num_channels); - - /** @brief Flip sqpixels. - * @param sqpixels The image to flip - * @param flip_flag OpenCV flip flag: 0=vertical, 1=horizontal, -1=both. - */ - void flip(cv::Mat& sqpixels, int flip_flag); - /** Apply the affine transformation in 3x3 matrix trans. */ - void affine_trans(cv::Mat& sqpixels, const Mat& trans); - - /** - * Save pixels to filename. - */ - void internal_save_image(Mat& pixels, const std::string filename, - unsigned imheight, unsigned imwidth, - unsigned num_channels, bool do_scale); -}; - -} // namespace lbann - -#endif // LBANN_IMAGE_PREPROCESSOR diff --git a/include/lbann/data_readers/image_utils.hpp b/include/lbann/data_readers/image_utils.hpp deleted file mode 100644 index b52a7f4cb78..00000000000 --- a/include/lbann/data_readers/image_utils.hpp +++ /dev/null @@ -1,86 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
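The removed image_preprocessor selected among mutually exclusive sample-wise normalizations (mean subtraction, unit variance, [0, 1] scaling, z-score). Below is a minimal sketch of the z-score transform that the z_score(true) option named, written as a free function on a flattened pixel vector; it is illustrative only and not the deleted LBANN implementation, and the epsilon guard is an assumption.

    // Sketch: sample-wise z-score normalization of a flattened pixel vector,
    // i.e. the transform the removed z_score(true) option selected.
    // Not the original LBANN code; the epsilon guard is an assumption.
    #include <cmath>
    #include <numeric>
    #include <vector>

    void z_score_normalize(std::vector<float>& pixels) {
      if (pixels.empty()) { return; }
      const float n = static_cast<float>(pixels.size());
      const float mean = std::accumulate(pixels.begin(), pixels.end(), 0.0f) / n;
      float var = 0.0f;
      for (float p : pixels) { var += (p - mean) * (p - mean); }
      const float stdev = std::sqrt(var / n) + 1e-7f;  // small guard against divide-by-zero
      for (float& p : pixels) { p = (p - mean) / stdev; }
    }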
-// -// image_utils .cpp .hpp - Image I/O utility functions -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_IMAGE_UTILS_HPP -#define LBANN_IMAGE_UTILS_HPP - -#include "lbann/base.hpp" -#include -#include // operator typeid - -#ifdef LBANN_HAS_OPENCV -#include "lbann/data_readers/cv_utils.hpp" -#include "lbann/data_readers/cv_process_patches.hpp" -#endif - - -namespace lbann { -class image_utils { - public: - static bool loadIMG(std::vector& image_buf, int& Width, int& Height, bool Flip, unsigned char *&Pixels); - static bool loadIMG(const std::string& Imagefile, int& Width, int& Height, bool Flip, unsigned char *&Pixels, std::vector& buf); - static bool saveIMG(const std::string& Imagefile, int Width, int Height, bool Flip, unsigned char *Pixels); - -#ifdef LBANN_HAS_OPENCV - // The other load/import methods rely on these core methods - /// process an image and put it into an LBANN Mat data block - static bool process_image(cv::Mat& image, int& Width, int& Height, int& Type, cv_process& pp, CPUMat& out); - /// process an image and put it into a serialized buffer - static bool process_image(cv::Mat& image, int& Width, int& Height, int& Type, cv_process& pp, std::vector& out); - /// process an image and put it into an LBANN Mat data blocks - static bool process_image(cv::Mat& image, int& Width, int& Height, int& Type, cv_process_patches& pp, std::vector& out); -#endif // LBANN_HAS_OPENCV - - // new function, to support sharded data reader and data store functionality - static bool load_image(std::vector& image_buf, int& Width, int& Height, int& Type, cv_process& pp, CPUMat& data, cv::Mat* cv_buf = nullptr); - - // new function, to support sharded data reader and data store functionality - static bool load_image(std::vector& image_buf, - int& Width, int& Height, int& Type, cv_process_patches& pp, std::vector& data, cv::Mat* cv_buf = nullptr); - - // load/save an image into/from an LBANN data block of El::Matrix type - // Use a thread save temporary buffer for decoding the image - /// Load an image from a file and put it into an LBANN Mat data block - static bool load_image(const std::string& filename, int& Width, int& Height, int& Type, cv_process& pp, CPUMat& data, std::vector& buf, cv::Mat* cv_buf = nullptr); - /// Load an image from a file, extract patches from it and put them into LBANN Mat data blocks - static bool load_image(const std::string& filename, int& Width, int& Height, int& Type, cv_process_patches& pp, std::vector& data, std::vector& buf, cv::Mat* cv_buf = nullptr); - /// Save an image using data from an LBANN Mat data block - static bool save_image(const std::string& filename, const int Width, const int Height, const int Type, cv_process& pp, const CPUMat& data); - - // import/export via a buffer of std::vector containg the raw bytes of an image file - /// Import an image from a file buffer (inbuf) and put it into an LBANN Mat data block - static bool import_image(cv::InputArray inbuf, int& Width, int& Height, int& Type, cv_process& pp, CPUMat& data, cv::Mat* cv_buf = nullptr); - /// Import an image from a file buffer (inbuf), extract patches from it and put them into LBANN Mat data blocks - static bool import_image(cv::InputArray inbuf, int& Width, int& Height, int& Type, cv_process_patches& pp, std::vector& data, cv::Mat* cv_buf = nullptr); - /// Export an image using data from an LBANN Mat block into a file buffer (outbuf) - static bool export_image(const std::string& fileExt, std::vector& outbuf, const int Width, const 
int Height, const int Type, cv_process& pp, const CPUMat& data); -}; - -} // end of namespace lbann - -#endif // LBANN_IMAGE_UTILS_HPP diff --git a/include/lbann/data_readers/numpy_conduit_converter.hpp b/include/lbann/data_readers/numpy_conduit_converter.hpp deleted file mode 100644 index 32317487043..00000000000 --- a/include/lbann/data_readers/numpy_conduit_converter.hpp +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -//////////////////////////////////////////////////////////////////////////////// - -#ifndef NUMPY_CONDUIT_CONVERTER_HPP -#define NUMPY_CONDUIT_CONVERTER_HPP - -#include "lbann_config.hpp" -#include "conduit/conduit.hpp" - -namespace lbann { - -/** - * The numpy_conduit_converter class contains static method(s) for - * reading numpy files and copying the contents to a conduit file. - * - * In general the schema for npz files, after conversion to conduit, is: - * - * { - * data_id (int) : - * // one or more of the following sections - * { - * section_name : - * { - * "word_size": , - * "fortran_order: <0|1>, - * "num_vals": , - * "shape": <[ vector ]>, - * "data": - * } - * } - * } - * - * cosmoflow has the following sections: - * "data": - * "frm": - * "responses": - */ - -class numpy_conduit_converter { - public: - - static void load_conduit_node(const std::string filename, int data_id, conduit::Node &output, bool reset_conduit_node = true); - -}; - -} // namespace lbann - -#endif // NUMPY_CONDUIT_CONVERTER_HPP diff --git a/include/lbann/data_readers/offline_patches_npz.hpp b/include/lbann/data_readers/offline_patches_npz.hpp deleted file mode 100644 index c433d232ced..00000000000 --- a/include/lbann/data_readers/offline_patches_npz.hpp +++ /dev/null @@ -1,159 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. 
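The deleted numpy_conduit_converter documented the Conduit layout it produced for each npz section (word_size, fortran_order, num_vals, shape, data) under a data_id. The sketch below hand-builds a node with that layout, assuming a data_id of 0 and a single "data" section; all values are placeholders, not converter output.

    // Sketch: hand-building a conduit::Node with the schema documented above
    // (one "data" section under data_id 0). Values are placeholders only.
    #include "conduit/conduit.hpp"
    #include <vector>

    conduit::Node make_example_node() {
      conduit::Node n;
      n["0/data/word_size"] = 4;                                  // e.g. 4-byte floats
      n["0/data/fortran_order"] = 0;
      n["0/data/num_vals"] = 6;
      n["0/data/shape"] = std::vector<conduit::int64>{2, 3};
      n["0/data/data"] = std::vector<float>{0, 1, 2, 3, 4, 5};
      return n;
    }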
You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef _OFFLINE_PATCHES_NPZ_HPP_ -#define _OFFLINE_PATCHES_NPZ_HPP_ - -#include "cnpy.h" -#include -#include - -namespace lbann { - -/** - * Loads the list of patche files, generated off-line, and the label per sample. - * As the list is quite large itself in the ASCII text format, it is packed and - * loaded as a compressed NumPy file (*.npz). - * Each image file name is compressed further by representing it as a sequence of - * indices to common substring dictionaries. There are two types of substring - * dictionaries, root and variant. There is an array of index sequences and an - * array of dictionary substrings per type, and a label array. - * For example, a file path train/n000111/abc.tag1.tag2.jpg would be represented - * as 'r[i][j][k]', 'v[i][j][x]', 'v[i][j][y]', 'v[i][j][z]' for the j-th patch - * of the i-th sample where 'r[i][j][k]' is "train/n000111", and 'v[i][j][x]', - * 'v[i][j][y]' and 'v[i][j][z]' is "abc", "tag1", and "tag2" respectively. - * 'r' is the root dictionary and 'v' is the variant dictionary. - * The list is kept in a compressed form, and uncompressed on-demand during execution. - * Each index sequence array is kept as a CNPY data structure, and each dictionary - * array is loaded into a vector of strings. The label array is loaded into a - * vector of uint8_t. - */ -class offline_patches_npz { - public: - using label_t = uint8_t; - using sample_t = std::pair, label_t>; - - offline_patches_npz(); - offline_patches_npz(size_t npatches); - offline_patches_npz(std::string divider); - offline_patches_npz(size_t npatches, std::string divider); - // TODO: copy constructor and assignment operator for deep-copying if needed - // The cnpy structure relies on shared_ptr - - /** - * Load the data in the compressed numpy format file. - * Use only first_n available samples if specified. - * keep_file_lists indicates whether to remove file lists loaded - * once converting them to vector of strings. - * Need to keep it for selecting a range of samples afterwards. 
- */ - bool load(const std::string filename, size_t first_n = 0u, - bool keep_file_lists = false); - /// Show the description - std::string get_description() const; - - /// Return the number of samples - size_t get_num_samples() const { - return m_item_class_list.size(); - } - /// Return the number of patches per sample (the number of image data sources) - size_t get_num_patches() const { - return m_num_patches; - } - /// Set the number of patches per sample (the number of image data sources) - void set_num_patches(size_t npatches) { - m_num_patches = npatches; - } - /// Reconsturct and return the meta-data (patch file names and the label) of idx-th sample - sample_t get_sample(const size_t idx) const; - /// Return the label of idx-th sample - label_t get_label(const size_t idx) const; - -#ifdef _OFFLINE_PATCHES_NPZ_OFFLINE_TOOL_MODE_ - std::vector get_file_roots() const; - size_t count_samples(const size_t num_roots) const; - bool select(const std::string out_file, const size_t sample_start, size_t& sample_end); -#endif // _OFFLINE_PATCHES_NPZ_OFFLINE_TOOL_MODE_ - - protected: - /// Check the dimensions of loaded data - bool check_data() const; - - protected: - /// Whether loaded data have passed the format check - bool m_checked_ok; - /// The number of image patches per sample (i.e. the num of patch files to read) - size_t m_num_patches; - /** - * List of index sequences to the dictionary of common file path substrings (m_file_root_list) - * per patch file (dimension: num_samples * num_patches) - */ - cnpy::NpyArray m_item_root_list; - /** - * List of index sequences to the dictionary of common file path substrings (m_file_variant_list) - * per patch file (dimension: num_samples * num_patches) - */ - cnpy::NpyArray m_item_variant_list; - /// list of labels (dimension: num_samples) - std::vector m_item_class_list; - /// The list of common substrings that a file path starts with (dimension is, for example 1000 in case of imagenet data) - std::vector m_file_root_list; - /// The list of common substrings for file path variants - std::vector m_file_variant_list; - /// The text file name of file_root_list - std::string m_file_root_list_name; - /// The text file name of file_variant_list - std::string m_file_variant_list_name; - /// A substring after which the file name of variants begins to differ (e.g., ".JPEG.") - std::string m_variant_divider; - /// control how the text dictionary files are loaded: whether to load all at once and parse or to stream in - bool m_fetch_text_dict_at_once; - /** - * indicate if the numpy file is reformatted to - * - treat an array of character strings as a 2-D character array, of which - * the second dimension is the length of the largest string. This is - * relevant to file_{root,variant}_list. - * - convert item_class_list to a list of label_t(uint8_t) instead of a - * list of a charater sequence (two digits). - * The reformatting is get around the inability of cnpy library for writing - * an array of character strings. - */ - bool m_lbann_format; - - /** - * The original data structure for m_file_root_list. It is used by select() - * if keep_file_lists was on when loading. - */ - cnpy::NpyArray m_file_root_list_org; - /** - * The original data structure for m_file_variant_list. It is used by select() - * if keep_file_lists was on when loading. 
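The class comment above encodes each patch path as indices into a root-substring dictionary and a variant-substring dictionary, joined by a divider. The sketch below reassembles the example path from that comment ("train/n000111/abc.tag1.tag2.jpg"); the dictionary entries and extension are taken from that example, and this is not the removed get_sample() logic.

    // Sketch: reassembling the example path from the class comment above out of
    // a root entry, variant entries, and an extension. Illustrative only; not the
    // removed offline_patches_npz::get_sample() implementation.
    #include <cstddef>
    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
      const std::string root = "train/n000111";                            // root-dictionary entry
      const std::vector<std::string> variants = {"abc", "tag1", "tag2"};   // variant-dictionary entries
      const std::string ext = ".jpg";

      std::string path = root + "/";
      for (std::size_t i = 0; i < variants.size(); ++i) {
        path += variants[i];
        if (i + 1 < variants.size()) { path += "."; }
      }
      path += ext;
      std::cout << path << std::endl;   // prints train/n000111/abc.tag1.tag2.jpg
      return 0;
    }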
- */ - cnpy::NpyArray m_file_variant_list_org; -}; - -} // end of namespace lbann -#endif // _OFFLINE_PATCHES_NPZ_HPP_ diff --git a/include/lbann/data_readers/opencv.hpp b/include/lbann/data_readers/opencv.hpp deleted file mode 100644 index 9adc7efa0d7..00000000000 --- a/include/lbann/data_readers/opencv.hpp +++ /dev/null @@ -1,68 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// opencv.hpp - LBANN header for opencv -//////////////////////////////////////////////////////////////////////////////// - -/** - * LBANN header for opencv - * - includes OpenCV headers according to the version - * - use newer built-in variables in place of the deprecated ones for newer OpenCV - */ - -#include "lbann_config.hpp" - -#ifdef LBANN_HAS_OPENCV -#ifndef _LBANN_OPENCV_H_INCLUDED_ -#define _LBANN_OPENCV_H_INCLUDED_ - -#include -#if (!defined(CV_VERSION_EPOCH) && (CV_VERSION_MAJOR >= 3)) -#include -#include -#include -#define _LBANN_CV_UNCHANGED_ cv::IMREAD_UNCHANGED -#define _LBANN_CV_GRAYSCALE_ cv::IMREAD_GRAYSCALE -#define _LBANN_CV_COLOR_ cv::IMREAD_COLOR -#define _LBANN_CV_ANYDEPTH_ cv::IMREAD_ANYDEPTH -#define _LBANN_CV_ANYCOLOR_ cv::IMREAD_ANYCOLOR -#else -#include -#include -#include -#include -#define _LBANN_CV_UNCHANGED_ CV_LOAD_IMAGE_UNCHANGED -#define _LBANN_CV_GRAYSCALE_ CV_LOAD_IMAGE_GRAYSCALE -#define _LBANN_CV_COLOR_ CV_LOAD_IMAGE_COLOR -#define _LBANN_CV_ANYDEPTH_ CV_LOAD_IMAGE_ANYDEPTH -#define _LBANN_CV_ANYCOLOR_ CV_LOAD_IMAGE_ANYCOLOR -#endif - -#define _LBANN_CV_BLUE_ 0 -#define _LBANN_CV_GREEN_ 1 -#define _LBANN_CV_RED_ 2 - -#endif // _LBANN_OPENCV_H_INCLUDED_ -#endif // LBANN_HAS_OPENCV diff --git a/include/lbann/data_readers/opencv_extensions.hpp b/include/lbann/data_readers/opencv_extensions.hpp deleted file mode 100644 index b24ed360d4d..00000000000 --- a/include/lbann/data_readers/opencv_extensions.hpp +++ /dev/null @@ -1,233 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. 
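The removed opencv.hpp mapped LBANN's _LBANN_CV_* flags to cv::IMREAD_* on OpenCV 3+ and to the legacy CV_LOAD_IMAGE_* constants otherwise. Below is a minimal sketch of how such a flag is consumed, written directly against the OpenCV 3+ spellings; the path handling is hypothetical.

    // Sketch: consuming a version-portable imread flag such as the removed
    // _LBANN_CV_COLOR_. Written for OpenCV 3+ (cv::IMREAD_COLOR); on pre-3.x
    // OpenCV the macro resolved to CV_LOAD_IMAGE_COLOR instead.
    #include <opencv2/imgcodecs.hpp>
    #include <string>

    cv::Mat load_color(const std::string& path) {
      cv::Mat img = cv::imread(path, cv::IMREAD_COLOR);   // 8-bit, 3-channel BGR
      return img;                                         // empty Mat if the read failed
    }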
-// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// opencv_extensions.hpp - LBANN's cv::Mat pixel type handling mechanisms -//////////////////////////////////////////////////////////////////////////////// - -#ifdef LBANN_HAS_OPENCV -#ifndef _LBANN_OPENCV_EXTENSIONS_H_INCLUDED_ -#define _LBANN_OPENCV_EXTENSIONS_H_INCLUDED_ -#include "lbann/data_readers/opencv.hpp" - -namespace lbann { - -/// A template structure to convert an OpenCV identifier of channel depth to a standard C++ type -template class cv_depth_type {}; - -/// define a specialized mapper from a CV channel type to its c++ native type -#define _def_cv_depth_translation(_CV_TYPE_, _NATIVE_TYPE_) \ -template<> struct cv_depth_type<_CV_TYPE_> { \ - public: \ - using standard_type = _NATIVE_TYPE_; \ -} - -/// cv_depth_type maps to uint8_t -_def_cv_depth_translation(CV_8U, uint8_t); -/// cv_depth_type maps to int8_t -_def_cv_depth_translation(CV_8S, int8_t); -/// cv_depth_type maps to uint16_t -_def_cv_depth_translation(CV_16U, uint16_t); -/// cv_depth_type maps to int16_t -_def_cv_depth_translation(CV_16S, int16_t); -/// cv_depth_type maps to int32_t -_def_cv_depth_translation(CV_32S, int32_t); -/// cv_depth_type maps to float -_def_cv_depth_translation(CV_32F, float); -/// cv_depth_type maps to double -_def_cv_depth_translation(CV_64F, double); - - -/// Convert an OpenCV identifier of image depth to a standard C++ type -#define _depth_type(_cv_depth_) lbann::cv_depth_type<_cv_depth_>::standard_type - - -/** A template structure to map the type of channel into the - * corresponding OpenCV type identifier of image. - * - _T_: The channel value type as a native C++ type - */ -template -struct cv_image_type { - /** A static member function which returns the OpenCV image type based on - * the channel type and number of channels: - * - _C_: The number of channels It ranges from 1 to CV_CN_MAX which is 512 - */ - static int T(const int _C_) { - return CV_MAKETYPE(cv::DataType<_T_>::depth, _C_); - } - /** A static member function which maps a native c++ type to the corresponding - * OpenCV channel type. 
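cv_depth_type and cv_image_type above wrap the standard OpenCV machinery that combines a per-channel depth code with a channel count. The checks below state that mapping with plain OpenCV; this is stock OpenCV behavior, not code from the removed header.

    // Sketch: the mapping the removed cv_depth_type/cv_image_type templates wrap.
    // cv::DataType gives the depth code for a C++ type and CV_MAKETYPE combines
    // it with a channel count. Plain OpenCV; not LBANN code.
    #include <opencv2/core.hpp>
    #include <cassert>

    void check_type_mapping() {
      static_assert(cv::DataType<float>::depth == CV_32F, "float maps to CV_32F");
      assert(CV_MAKETYPE(cv::DataType<float>::depth, 3) == CV_32FC3);          // 3-channel float
      assert(CV_MAKETYPE(cv::DataType<unsigned char>::depth, 1) == CV_8UC1);   // 1-channel uchar
    }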
- * The depth value returned ranges from 0 to (CV_DEPTH_MAX-1) which is 7 - */ - static int T() { - return cv::DataType<_T_>::depth; - } -}; - - -template -struct depth_normalization { - static double factor() { - if (!std::is_integral::value) { - return 1.0; - } else { - return 1.0/std::numeric_limits::max(); - } - } - static double inverse_factor() { - if (!std::is_integral::value) { - return 1.0; - } else { - return std::numeric_limits::max(); - } - } -}; - -template<> -struct depth_normalization { - static double factor() { - return 1.0; - } - static double inverse_factor() { - return 1.0; - } -}; - -/// Checks if an OpenCV depth code corresponds to an integral type -inline bool is_float(const int cv_depth) { - return ((cv_depth == CV_64F) || (cv_depth == CV_32F)); -} - -inline bool check_if_cv_Mat_is_float_type(const cv::Mat& image) { - return is_float(image.depth()); -} - -inline bool check_if_cv_Mat_has_same_shape(const cv::Mat& image1, const cv::Mat& image2) { - return ((image1.cols == image2.cols) && - (image1.rows == image2.rows) && - (image1.channels() == image2.channels())); -} - -template -static double depth_norm_factor() { - return depth_normalization::factor(); -} - -template -static double depth_norm_inverse_factor() { - return depth_normalization::inverse_factor(); -} - -/// Return the factor for unit scaling with the type indicated by the OpenCV depth -double get_depth_normalizing_factor(const int cv_depth); -/// Return the factor to inverse the unit scaling -double get_depth_denormalizing_factor(const int cv_depth); - -/// returns the number of bytes that would be used for the image without compresstion and any header -inline size_t image_data_amount(const cv::Mat& img) { - return static_cast(CV_ELEM_SIZE(img.depth())* - CV_MAT_CN(img.type())* - img.cols*img.rows); -} - -} // end of namespace lbann - -#define _SWITCH_CV_FUNC_KNOWN_TYPE_1PARAM(_SW_CH_,_T_,_FUNC_,_P1_) \ - switch (_SW_CH_) { \ - case 1: return _FUNC_<_T_,1>(_P1_); \ - case 2: return _FUNC_<_T_,2>(_P1_); \ - case 3: return _FUNC_<_T_,3>(_P1_); \ - case 4: return _FUNC_<_T_,4>(_P1_); \ - } - -#define _SWITCH_CV_FUNC_KNOWN_TYPE_2PARAMS(_SW_CH_,_T_,_FUNC_,_P1_,_P2_) \ - switch (_SW_CH_) { \ - case 1: return _FUNC_<_T_,1>(_P1_,_P2_); \ - case 2: return _FUNC_<_T_,2>(_P1_,_P2_); \ - case 3: return _FUNC_<_T_,3>(_P1_,_P2_); \ - case 4: return _FUNC_<_T_,4>(_P1_,_P2_); \ - } - -#define _SWITCH_CV_FUNC_KNOWN_TYPE_3PARAMS(_SW_CH_,_T_,_FUNC_,_P1_,_P2_,_P3_) \ - switch (_SW_CH_) { \ - case 1: return _FUNC_<_T_,1>(_P1_,_P2_,_P3_); \ - case 2: return _FUNC_<_T_,2>(_P1_,_P2_,_P3_); \ - case 3: return _FUNC_<_T_,3>(_P1_,_P2_,_P3_); \ - case 4: return _FUNC_<_T_,4>(_P1_,_P2_,_P3_); \ - } - -#define _SWITCH_CV_FUNC_KNOWN_TYPE_4PARAMS(_SW_CH_,_T_,_FUNC_,_P1_,_P2_,_P3_,_P4_) \ - switch (_SW_CH_) { \ - case 1: return _FUNC_<_T_,1>(_P1_,_P2_,_P3_,_P4_); \ - case 2: return _FUNC_<_T_,2>(_P1_,_P2_,_P3_,_P4_); \ - case 3: return _FUNC_<_T_,3>(_P1_,_P2_,_P3_,_P4_); \ - case 4: return _FUNC_<_T_,4>(_P1_,_P2_,_P3_,_P4_); \ - } - -#define _SWITCH_CV_FUNC_1PARAM(_SW_D_,_FUNC_,_P1_) \ - switch (_SW_D_) { \ - case CV_8U : return _FUNC_<_depth_type(CV_8U) >(_P1_); \ - case CV_8S : return _FUNC_<_depth_type(CV_8S) >(_P1_); \ - case CV_16U: return _FUNC_<_depth_type(CV_16U)>(_P1_); \ - case CV_16S: return _FUNC_<_depth_type(CV_16S)>(_P1_); \ - case CV_32S: return _FUNC_<_depth_type(CV_32S)>(_P1_); \ - case CV_32F: return _FUNC_<_depth_type(CV_32F)>(_P1_); \ - case CV_64F: return _FUNC_<_depth_type(CV_64F)>(_P1_); \ - } - -#define 
_SWITCH_CV_FUNC_2PARAMS(_SW_D_,_FUNC_,_P1_,_P2_) \ - switch (_SW_D_) { \ - case CV_8U : return _FUNC_<_depth_type(CV_8U) >(_P1_,_P2_); \ - case CV_8S : return _FUNC_<_depth_type(CV_8S) >(_P1_,_P2_); \ - case CV_16U: return _FUNC_<_depth_type(CV_16U)>(_P1_,_P2_); \ - case CV_16S: return _FUNC_<_depth_type(CV_16S)>(_P1_,_P2_); \ - case CV_32S: return _FUNC_<_depth_type(CV_32S)>(_P1_,_P2_); \ - case CV_32F: return _FUNC_<_depth_type(CV_32F)>(_P1_,_P2_); \ - case CV_64F: return _FUNC_<_depth_type(CV_64F)>(_P1_,_P2_); \ - } - -#define _SWITCH_CV_FUNC_3PARAMS(_SW_D_,_FUNC_,_P1_,_P2_,_P3_) \ - switch (_SW_D_) { \ - case CV_8U : return _FUNC_<_depth_type(CV_8U) >(_P1_,_P2_,_P3_); \ - case CV_8S : return _FUNC_<_depth_type(CV_8S) >(_P1_,_P2_,_P3_); \ - case CV_16U: return _FUNC_<_depth_type(CV_16U)>(_P1_,_P2_,_P3_); \ - case CV_16S: return _FUNC_<_depth_type(CV_16S)>(_P1_,_P2_,_P3_); \ - case CV_32S: return _FUNC_<_depth_type(CV_32S)>(_P1_,_P2_,_P3_); \ - case CV_32F: return _FUNC_<_depth_type(CV_32F)>(_P1_,_P2_,_P3_); \ - case CV_64F: return _FUNC_<_depth_type(CV_64F)>(_P1_,_P2_,_P3_); \ - } - -#define _SWITCH_CV_FUNC_4PARAMS(_SW_D_,_FUNC_,_P1_,_P2_,_P3_,_P4_) \ - switch (_SW_D_) { \ - case CV_8U : return _FUNC_<_depth_type(CV_8U) >(_P1_,_P2_,_P3_,_P4_); \ - case CV_8S : return _FUNC_<_depth_type(CV_8S) >(_P1_,_P2_,_P3_,_P4_); \ - case CV_16U: return _FUNC_<_depth_type(CV_16U)>(_P1_,_P2_,_P3_,_P4_); \ - case CV_16S: return _FUNC_<_depth_type(CV_16S)>(_P1_,_P2_,_P3_,_P4_); \ - case CV_32S: return _FUNC_<_depth_type(CV_32S)>(_P1_,_P2_,_P3_,_P4_); \ - case CV_32F: return _FUNC_<_depth_type(CV_32F)>(_P1_,_P2_,_P3_,_P4_); \ - case CV_64F: return _FUNC_<_depth_type(CV_64F)>(_P1_,_P2_,_P3_,_P4_); \ - } - -#endif // _LBANN_OPENCV_EXTENSIONS_H_INCLUDED_ -#endif // LBANN_HAS_OPENCV diff --git a/include/lbann/data_readers/patchworks/CMakeLists.txt b/include/lbann/data_readers/patchworks/CMakeLists.txt deleted file mode 100644 index d45491f93cd..00000000000 --- a/include/lbann/data_readers/patchworks/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -# Add the headers for this directory -set_full_path(THIS_DIR_HEADERS - patchworks.hpp - patchworks_ROI.hpp - patchworks_common.hpp - patchworks_patch_descriptor.hpp - patchworks_stats.hpp - ) - -# Propagate the files up the tree -set(HEADERS "${HEADERS}" "${THIS_DIR_HEADERS}" PARENT_SCOPE) diff --git a/include/lbann/data_readers/patchworks/patchworks.hpp b/include/lbann/data_readers/patchworks/patchworks.hpp deleted file mode 100644 index d445bb2d343..00000000000 --- a/include/lbann/data_readers/patchworks/patchworks.hpp +++ /dev/null @@ -1,59 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. 
You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// patchworks.hpp - LBANN PATCHWORKS main interface header -//////////////////////////////////////////////////////////////////////////////// - -/** - * LBANN PATCHWORKS main interface header - * - includes the main interface function declarations - */ - -#include "lbann_config.hpp" - -#ifdef LBANN_HAS_OPENCV -#ifndef _PATCHWORKS_H_INCLUDED_ -#define _PATCHWORKS_H_INCLUDED_ -#include -#include "patchworks_common.hpp" -#include "patchworks_patch_descriptor.hpp" - -namespace lbann { -namespace patchworks { - -/// Compute the min and max value of pixels -std::pair check_min_max(const cv::Mat& _img); - -/// Adjust for reducing chromatic aberration -cv::Mat correct_chromatic_aberration(const cv::Mat& _img); - -/// Drop 2 channels randomly -cv::Mat drop_2channels(const cv::Mat& _img); - -} // end of namespace patchworks -} // end of namespace lbann - -#endif //_PATCHWORKS_H_INCLUDED_ -#endif // LBANN_HAS_OPENCV diff --git a/include/lbann/data_readers/patchworks/patchworks_ROI.hpp b/include/lbann/data_readers/patchworks/patchworks_ROI.hpp deleted file mode 100644 index 3abdfed5da6..00000000000 --- a/include/lbann/data_readers/patchworks/patchworks_ROI.hpp +++ /dev/null @@ -1,153 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// patchworks_ROI.hpp - LBANN PATCHWORKS ROI (region-of-interest) header -//////////////////////////////////////////////////////////////////////////////// - -/** - * LBANN PATCHWORKS ROI header - * - Region of interest descriptor - */ - -#include "lbann_config.hpp" - -#ifdef LBANN_HAS_OPENCV -#ifndef _PATCHWORKS_ROI_H_INCLUDED_ -#define _PATCHWORKS_ROI_H_INCLUDED_ - -#include -#include -#include "patchworks_common.hpp" - -namespace lbann { -namespace patchworks { - -/** - * Regions of interest descriptor. 
- * Contains a pair of coordinates that defines a rectangular region of interest - */ -class ROI { - public: - /// An internal value to represent an uninitialized coordinate value - static const int undefined_coordinate; - - int m_left; ///< The left-most pixel position of the region - int m_top; ///< The top-most pixel position of the region - int m_right; ///< The right-most pixel position of the region - int m_bottom; ///< The bottom-most pixel position of the region - - ROI() ///< The default constructor - : m_left(undefined_coordinate), m_top(undefined_coordinate), - m_right(undefined_coordinate), m_bottom(undefined_coordinate) {} - - void init(); ///< Reset the structure with undefined coordinate values - bool is_undefined() const; ///< Tell if the structure has not been initialized - bool is_valid() const; ///< Check if the region is valid - bool set_overlapping_region(const cv::Mat& img); - /// Check if the region of interest covers the whole image - bool is_whole_image(const cv::Mat& img); - - /// Set a region by the coordinates - bool set_by_corners(const int p0_x, const int p0_y, - const int p1_x, const int p1_y); - /// Set a region by the center and its size - bool set_by_center(const int px, const int py, - const unsigned int _width, const unsigned int _height); - - /// move the region horizontally by dx and vertically by dy - void move(const std::pair displacement); - - /// Returns the left position of the region - int left() const { - return m_left; - } - /// Returns the top poisition of the region - int top() const { - return m_top; - } - /// Returns the right position of the region - int right() const { - return m_right; - } - /// Returns the bottom position of the region - int bottom() const { - return m_bottom; - } - - /// Returns a cv::Rect equivalent - cv::Rect rect() const { - return cv::Rect(m_left, m_top, m_right-m_left, m_bottom-m_top); - } - /// Returns the width of the rectangular region - int width() const { - return (m_right - m_left); - } - /// Returns the height of the rectangular region - int height() const { - return (m_bottom - m_top); - } - /// Returns the area of the rectangular region - int area() const { - return width()*height(); - } - /// Returns the size of the area (width, hegiht) - - std::ostream& Print(std::ostream& os) const { ///< Print out the content - return os << '(' << m_left << ", " << m_top << ") (" - << m_right << ", " << m_bottom << ')'; - } - - /// Check if this ROI is exactly the same as the given rectangular area - bool operator==(const ROI& rarea) const; - /// Check if this ROI is not exactly the same as the given rectangular area - bool operator!=(const ROI& rarea) const; - /// Check if the given rectangular region contains this ROI but is not the same - bool operator<(const ROI& rarea) const; - /// Check if the given rectangular region contains this ROI - bool operator<=(const ROI& rarea) const; - /// Check if this ROI contains the given rectangular region but is not the same - bool operator>(const ROI& rarea) const; - /// Check if this ROI contains the given rectangular region - bool operator>=(const ROI& rarea) const; -}; - -inline bool ROI::operator<=(const ROI& rarea) const { - return (((rarea.m_left <= m_left) && (rarea.m_top <= m_top)) && - ((m_right <= rarea.m_right) && (m_bottom <= rarea.m_bottom)) && - is_valid()); -} - -inline bool ROI::operator>=(const ROI& rarea) const { - return (((m_left <= rarea.m_left) && (m_top <= rarea.m_top)) && - ((rarea.m_right <= m_right) && (rarea.m_bottom <= m_bottom)) && - 
rarea.is_valid()); -} - -std::ostream& operator<<(std::ostream& os, const ROI& roi); - -} // end of namespace patchworks -} // end of namespace lbann -#endif // _PATCHWORKS_ROI_H_INCLUDED_ -#endif // LBANN_HAS_OPENCV diff --git a/include/lbann/data_readers/patchworks/patchworks_common.hpp b/include/lbann/data_readers/patchworks/patchworks_common.hpp deleted file mode 100644 index 5c3b9ceb7d1..00000000000 --- a/include/lbann/data_readers/patchworks/patchworks_common.hpp +++ /dev/null @@ -1,70 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// patchworks_common.hpp - LBANN PATCHWORKS header for common definitions -//////////////////////////////////////////////////////////////////////////////// - -/** - * LBANN PATCHWORKS common header - * - includes commonly used macros, definitions and declarations - */ - -#include "lbann_config.hpp" - -#ifdef LBANN_HAS_OPENCV -#ifndef _PATCHWORKS_COMMON_H_ -#define _PATCHWORKS_COMMON_H_ - -#include // std::pair -#include -#include -#include -#include "lbann/data_readers/opencv_extensions.hpp" - -namespace lbann { -namespace patchworks { - -/// Patch displacement type -using displacement_type = std::pair; - -#if 0 -// using 32-bit floating point for intermediate image data processing -using pw_fp_t = float; -using pw_cv_vec3 = cv::Vec3f; -#define _PATCHWORKS_STAT_FLOAT_ 32 -#define _PW_CV_FP_ CV_32FC3 -#else -// using 64-bit floating point for intermediate image data processing -using pw_fp_t = double; -using pw_cv_vec3 = cv::Vec3d; -#define _PATCHWORKS_STAT_FLOAT_ 64 -#define _PW_CV_FP_ CV_64FC3 -#endif - -} // end of namespace patchworks -} // end of namespace lbann - -#endif // _PATCHWORKS_COMMON_H_ -#endif // LBANN_HAS_OPENCV diff --git a/include/lbann/data_readers/patchworks/patchworks_patch_descriptor.hpp b/include/lbann/data_readers/patchworks/patchworks_patch_descriptor.hpp deleted file mode 100644 index 2891055593c..00000000000 --- a/include/lbann/data_readers/patchworks/patchworks_patch_descriptor.hpp +++ /dev/null @@ -1,186 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. 
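ROI::operator<= and operator>= above define containment over the corner coordinates. The same relation can be restated with cv::Rect, whose & operator returns the intersection: a rectangle is contained in another exactly when intersecting leaves it unchanged. A stand-alone check, not part of the removed header.

    // Sketch: the containment relation ROI::operator<= encodes, restated with
    // cv::Rect. Intersection (operator&) returns the inner rectangle unchanged
    // exactly when it is contained in the outer one.
    #include <opencv2/core.hpp>
    #include <cassert>

    void check_containment() {
      const cv::Rect outer(0, 0, 100, 100);   // x, y, width, height
      const cv::Rect inner(10, 10, 20, 20);
      assert((outer & inner) == inner);       // inner is contained in outer
      assert((outer & inner) != outer);       // outer is not contained in inner
    }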
For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// patchworks_patch_descriptor.hpp - LBANN PATCHWORKS header for patch descriptor -//////////////////////////////////////////////////////////////////////////////// - -/** - * LBANN PATCHWORKS header for patch descriptor - */ - -#include "lbann_config.hpp" - -#ifdef LBANN_HAS_OPENCV -#ifndef _PATCHWORKS_PATCH_DESCRIPTOR_H_INCLUDED_ -#define _PATCHWORKS_PATCH_DESCRIPTOR_H_INCLUDED_ - -#include -#include -#include "patchworks_common.hpp" -#include "patchworks_ROI.hpp" - -namespace lbann { -namespace patchworks { - -class patch_descriptor { - public: - // --- configuration variables --- - unsigned int m_width; ///< patch width - unsigned int m_height; ///< patch height - unsigned int m_gap; ///< gap between patches - unsigned int m_jitter; ///< for patch position randomization - - /** patch centering mode - * 0: place the center patch anywhere within the image - * 1: place the center patch anywhere as long as it allows the space for all 8 neighboring patches - * other: place the center patch at the center of the image - */ - unsigned int m_mode_center; - - /** chromatic aberration correction mode - * 0: nothing - * 1: pixel transform px*B where a=[-1 2 -1] and B=I-a'a/(aa') - * 2: randomly replace two channels with white noise - */ - unsigned int m_mode_chrom; - - /// Whether patches are self-labeled - bool m_self_label; - - /// The file extension name (i.e., image type) - std::string m_ext; - - // --- post-configuration variables --- - ROI m_sample_area; ///< The area to sample patches from - /// The list of displacements used to generate consecutive patches - std::vector m_displacements; - - // --- state variables --- - ROI m_patch_center; ///< The center patch region - /// The actual patch positions - std::vector m_positions; - /// The index of displacement used to generate the current patch - unsigned int m_cur_patch_idx; - - public: - patch_descriptor() { - init(); ///< Default constructor - } - virtual ~patch_descriptor() {} - void init(); ///< Initializer - void reset(); ///< Clear state variables other than configuration variables - - /// Get patch size - unsigned int get_patch_width() const { return m_width; } - unsigned int get_patch_height() const { return m_height; } - - /// Set patch size - void set_size(const int w, const int h); - /// Set the gap between neighboring patches - void set_gap(const unsigned int g) { - m_gap = g; - } - /// Set poisiton radomization parameter, the maximum jitter - void set_jitter(const unsigned int j) { - m_jitter = j; - } - /// Set mode to place center patch - void set_mode_centering(const unsigned int m) { - m_mode_center = m; - } - /// Set correction mode for chromatic aberration - void set_mode_chromatic_aberration(const unsigned int m) { - m_mode_chrom = m; - } - - /// Declare the size of the image to take patches from, and implicitly set the area to sample as the entire image - bool 
set_sample_image(const unsigned int w, const unsigned int h); - /// Explicitly set the area to sample patches - bool set_sample_area(const ROI& area); - - /// Set the file extention of patch files - void set_file_ext(const std::string e) { - m_ext = e; - } - - /// Mark self labeling for patches - void set_self_label() { m_self_label = true; } - - /// Unmark self labeling - void unset_self_label() { m_self_label = false; } - - bool is_self_labeling() const { return m_self_label; } - - unsigned int get_num_labels() const { return 8u; } - - /// A function that populates the list of displacements from the base patch to the next one - virtual void define_patch_set(); - - /// transform each pixel by B = I - a'*a/(a*a') where a=[-1 2 -1] to mitigate chromatic aberration - bool is_to_correct_chromatic_aberration_at_pixel() const { - return (m_mode_chrom == 1); - } - - /// randomly drop two channels to avoid chromatic aberration impact - bool is_to_drop_2channels() const { - return (m_mode_chrom == 2); - } - - /// Allow read-only access to the patch displacements - const std::vector& get_displacements() const { - return m_displacements; - } - - virtual unsigned int get_num_patches() const { return 2u; } - - /// Compute the position of the first patch - virtual bool get_first_patch(ROI& patch); - /// Compute the position of a subsequent patch - virtual bool get_next_patch(ROI& patch); - /// extract all the patches defined - virtual bool extract_patches(const cv::Mat& img, std::vector& patches); - /** - * Return the label of the last patch generated. - * For dual patch scenarios, it is one less the id of the non-center patch position. - */ - virtual unsigned int get_last_label() const { return m_cur_patch_idx - 1; } - - /// Allow read-only access to the positions of the patches generated - const std::vector& access_positions() const { - return m_positions; - } - virtual std::string get_type() const { return "patch_descriptor"; } - virtual std::string get_description() const; - /// Print out the content of patch descriptor - virtual std::ostream& print(std::ostream& os) const; -}; - -/// stream out the patch descriptor content -std::ostream& operator<<(std::ostream& os, const patch_descriptor& pd); - -} // end of namespace patchworks -} // end of namespace lbann -#endif // _PATCHWORKS_PATCH_DESCRIPTOR_H_INCLUDED_ -#endif // LBANN_HAS_OPENCV diff --git a/include/lbann/data_readers/patchworks/patchworks_stats.hpp b/include/lbann/data_readers/patchworks/patchworks_stats.hpp deleted file mode 100644 index 12141012eef..00000000000 --- a/include/lbann/data_readers/patchworks/patchworks_stats.hpp +++ /dev/null @@ -1,93 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. 
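Chromatic-aberration mode 1 above applies the per-pixel transform px*B with a = [-1 2 -1] and B = I - a'a/(aa'). Since aa' = 6, B works out to I - (1/6)[[1,-2,1],[-2,4,-2],[1,-2,1]]. The sketch below builds B numerically with cv::Matx; it illustrates the formula and is not the removed patchworks implementation.

    // Sketch: building B = I - a'a/(aa') for a = [-1 2 -1], the projection used
    // by chromatic-aberration mode 1 described above. aa' = 6, so
    // B = I - (1/6) * [[1,-2,1],[-2,4,-2],[1,-2,1]].
    #include <opencv2/core.hpp>

    cv::Matx33d make_aberration_projection() {
      const cv::Matx13d a(-1.0, 2.0, -1.0);
      const double aaT = (a * a.t())(0, 0);            // scalar a*a' = 6
      const cv::Matx33d aTa = a.t() * a;               // 3x3 outer product a'*a
      return cv::Matx33d::eye() - aTa * (1.0 / aaT);   // B = I - a'a/(aa')
    }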
You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// patchworks_stats.hpp - LBANN PATCHWORKS header for pixel statistics -//////////////////////////////////////////////////////////////////////////////// - -/** - * LBANN PATCHWORKS header for pixel statistics - */ - -#include "lbann_config.hpp" - -#ifdef LBANN_HAS_OPENCV -#ifndef _PATCHWORKS_STATS_INCLUDED_ -#define _PATCHWORKS_STATS_INCLUDED_ - -#include -#include -#include "patchworks_common.hpp" - -namespace lbann { -namespace patchworks { - -/// Pixel statistics of an image -struct image_stats { - size_t cnt; ///< number of values (pixels) - size_t cntZeros; ///< number of zero values - pw_fp_t min; ///< minimum intensity of a pixel - pw_fp_t max; ///< maximum intensity of a pixel - pw_fp_t median; ///< median intensity of a pixel - pw_fp_t minNZ; ///< number of non-zero pixels - pw_fp_t medianNZ; ///< median among non-zero values - double avg; ///< average intensity - double avgNZ; ///< average intensity among non-zeros - double stdev; ///< standard deviation of intensity - double stdevNZ; ///< standard deviation among non-zero values - - /// Print out statistics - std::ostream& Print(std::ostream& os) const { - os << " stats:" << std::endl - << " - cnt : " << cnt << std::endl - << " - cnt0 : " << cntZeros << std::endl - << " - min : " << min << std::endl - << " - max : " << max << std::endl - << " - med : " << median << std::endl - << " - minNZ : " << minNZ << std::endl - << " - medNZ : " << medianNZ << std::endl - << " - avg : " << avg << std::endl - << " - avgNZ : " << avgNZ << std::endl - << " - std : " << stdev << std::endl - << " - stdNZ : " << stdevNZ << std::endl; - return os; - } -}; - -/// Stream out the image statistics -inline std::ostream& operator<<(std::ostream& os, const image_stats& stats) { - return stats.Print(os); -} - -/// Compute the pixel statistics for a mono channel image -bool get_single_channel_stats(const cv::Mat& img, image_stats& stats); - -/// Compute the pixel statistics of an image per channel -bool get_channel_stats(const cv::Mat& img, std::vector& stats); - - -} // end of namespace patchworks -} // end of namespace lbann -#endif // _PATCHWORKS_STATS_INCLUDED_ -#endif // LBANN_HAS_OPENCV diff --git a/include/lbann/data_readers/sample_list.hpp b/include/lbann/data_readers/sample_list.hpp new file mode 100644 index 00000000000..6d4aa5e051f --- /dev/null +++ b/include/lbann/data_readers/sample_list.hpp @@ -0,0 +1,160 @@ +#ifndef __SAMPLE_LIST_HPP__ +#define __SAMPLE_LIST_HPP__ + +#include +#include +#include +#include + +#include "lbann/comm.hpp" + +#include "lbann/utils/file_utils.hpp" +#include +#include +#include +#include +#include +#include + +namespace lbann { + +static const std::string sample_exclusion_list = "CONDUIT_HDF5_EXCLUSION"; +static const std::string sample_inclusion_list = "CONDUIT_HDF5_INCLUSION"; + +struct sample_list_header { + bool m_is_exclusive; + /// Number of included samples + size_t m_included_sample_count; + /// Number of excluded samples + size_t m_excluded_sample_count; + size_t m_num_files; + std::string m_file_dir; + std::string m_sample_list_filename; + + sample_list_header(); + + bool 
is_exclusive() const; + size_t get_sample_count() const; + size_t get_num_files() const; + const std::string& get_sample_list_filename() const; + const std::string& get_file_dir() const; + template void serialize( Archive & ar ) { + ar(m_is_exclusive, m_included_sample_count, m_excluded_sample_count, m_num_files, m_file_dir, m_sample_list_filename); + } +}; + +template +class sample_list { + public: + /// The type for the index assigned to each sample file + using sample_file_id_t = std::size_t; + /** To describe a sample as the id of the file to which it belongs. + * Each file contains only one sample. */ + using sample_t = std::template pair; + /// Type for the list of samples + using samples_t = std::template vector< sample_t >; + /// Mapping of the file index to the filename + using file_id_stats_v_t = std::vector< std::string >; + + sample_list(); + virtual ~sample_list(); + sample_list(const sample_list& rhs); + sample_list& operator=(const sample_list& rhs); + sample_list& copy(const sample_list& rhs); + + void copy_members(const sample_list& rhs); + + /// Load a sample list file + void load(const std::string& samplelist_file, size_t stride=1, size_t offset=0); + + /// Load the header of a sample list file + sample_list_header load_header(const std::string& samplelist_file) const; + + /// Restore a sample list from a serialized string + void load_from_string(const std::string& samplelist); + + /// Tells how many samples in the list + virtual size_t size() const; + + /// Tells how many sample files are there + virtual size_t get_num_files() const; + + /// Tells if the internal list is empty + bool empty() const; + + /// Serialize to and from an archive using the cereal library + template void serialize( Archive & ar ); + + /// Serialize sample list + virtual bool to_string(std::string& sstr) const; + + /// Write the sample list + void write(const std::string filename) const; + + /// Allow read-only access to the internal list data + const samples_t& get_list() const; + + /// Allow the read-only access to the list header + const sample_list_header& get_header() const; + + /// Allow read-only access to the metadata of the idx-th sample in the list + const sample_t& operator[](size_t idx) const; + + virtual const std::string& get_samples_filename(sample_file_id_t id) const; + + const std::string& get_samples_dirname() const; + + void all_gather_archive(const std::string &archive, std::vector& gathered_archive, lbann_comm& comm); + void all_gather_archive_new(const std::string &archive, std::vector& gathered_archive, lbann_comm& comm); + + template size_t all_gather_field(T data, std::vector& gathered_data, lbann_comm& comm); + virtual void all_gather_packed_lists(lbann_comm& comm); + + protected: + + /// Reads a header line from the sample list given as a stream, and use the info string for error message + std::string read_header_line(std::istream& ifs, const std::string& filename, const std::string& info) const; + + /// Reads the header of a sample list + sample_list_header read_header(std::istream& istrm, const std::string& filename) const; + + /// read the body of a sample list, which is the list of sample files, where each file contains a single sample. + virtual void read_sample_list(std::istream& istrm, size_t stride=1, size_t offset=0); + + /// Assign names to samples when there is only one sample per file without a name. 
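The sample_list<sample_name_t> template above exposes load(), size(), get_header(), get_list(), and get_samples_filename(). Below is a minimal single-rank usage sketch with string-valued sample names; the list file name is hypothetical, and the on-disk format is whatever read_header()/read_sample_list() parse, which is not reproduced here.

    // Sketch: driving the new sample_list API with string-valued sample names.
    // The file name is a placeholder; the list format is parsed by
    // read_header()/read_sample_list() and is not shown here.
    #include "lbann/data_readers/sample_list.hpp"
    #include <iostream>
    #include <string>

    void dump_sample_list(const std::string& list_file) {
      lbann::sample_list<std::string> sl;
      sl.load(list_file);                          // defaults: stride = 1, offset = 0
      std::cout << sl.get_header().get_file_dir() << " holds "
                << sl.size() << " samples\n";
      for (const auto& s : sl.get_list()) {        // s is pair<file id, sample name>
        std::cout << sl.get_samples_filename(s.first) << " : " << s.second << "\n";
      }
    }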
+ virtual void assign_samples_name(); + + /// Reads a sample list and populates the internal list + size_t get_samples_per_file(std::istream& istrm, const std::string& filename, size_t stride=1, size_t offset=0); + + /// Add the header info to the given string + void write_header(std::string& sstr, size_t num_files) const; + + /// Get the number of total/included/excluded samples + virtual void get_num_samples(size_t& total, size_t& included, size_t& excluded) const; + + virtual void set_samples_filename(sample_file_id_t id, const std::string& filename); + + protected: + /// header info of sample list + sample_list_header m_header; + + private: + /// List of all samples with a file identifier and sample name for each sample + samples_t m_sample_list; + + /// Maps sample's file id to file names, file descriptors, and use counts + file_id_stats_v_t m_file_id_stats_map; + +}; + +void handle_mpi_error(int ierr); + +template +inline T uninitialized_sample_name(); + +} // end of namespace + +#include "sample_list_impl.hpp" + +#endif // __SAMPLE_LIST_HPP__ diff --git a/include/lbann/data_readers/sample_list_conduit_io_handle.hpp b/include/lbann/data_readers/sample_list_conduit_io_handle.hpp new file mode 100644 index 00000000000..ff9b59ed7f5 --- /dev/null +++ b/include/lbann/data_readers/sample_list_conduit_io_handle.hpp @@ -0,0 +1,95 @@ +#ifndef __SAMPLE_LIST_CONDUIT_IO_HANDLE_HPP__ +#define __SAMPLE_LIST_CONDUIT_IO_HANDLE_HPP__ + +#include "sample_list_open_files.hpp" +#include "conduit/conduit.hpp" +#include "conduit/conduit_relay.hpp" +#include "conduit/conduit_relay_io_handle.hpp" + +namespace lbann { + +template +class sample_list_conduit_io_handle : public sample_list_open_files { + public: + using file_handle_t = conduit::relay::io::IOHandle*; + using typename sample_list_open_files::sample_file_id_t; + using typename sample_list_open_files::sample_t; + using typename sample_list_open_files::samples_t; + using typename sample_list_open_files::file_id_stats_t; + using typename sample_list_open_files::file_id_stats_v_t; + using typename sample_list_open_files::fd_use_map_t; + + sample_list_conduit_io_handle(); + ~sample_list_conduit_io_handle() override; + + bool is_file_handle_valid(const file_handle_t& h) const override; + + protected: + void obtain_sample_names(file_handle_t& h, std::vector& sample_names) const override; + file_handle_t open_file_handle_for_read(const std::string& path) override; + void close_file_handle(file_handle_t& h) override; + void clear_file_handle(file_handle_t& h) override; +}; + + +template +inline sample_list_conduit_io_handle::sample_list_conduit_io_handle() +: sample_list_open_files() {} + +template +inline sample_list_conduit_io_handle::~sample_list_conduit_io_handle() { + // Close the existing open files + for(auto& f : this->m_file_id_stats_map) { + file_handle_t& h = std::get<1>(f); + close_file_handle(h); + clear_file_handle(h); + std::get<2>(f).clear(); + } + this->m_file_id_stats_map.clear(); +} + +template +inline void sample_list_conduit_io_handle +::obtain_sample_names(sample_list_conduit_io_handle::file_handle_t& h, std::vector& sample_names) const { + sample_names.clear(); + if (h != nullptr) { + h->list_child_names("/", sample_names); + } +} + +template +inline bool sample_list_conduit_io_handle +::is_file_handle_valid(const sample_list_conduit_io_handle::file_handle_t& h) const { + return ((h != nullptr) && (h->is_open())); +} + +template +inline typename sample_list_conduit_io_handle::file_handle_t sample_list_conduit_io_handle 
+::open_file_handle_for_read(const std::string& file_path) { + file_handle_t h = new conduit::relay::io::IOHandle; + h->open(file_path, "hdf5"); + return h; +} + +template +inline void sample_list_conduit_io_handle +::close_file_handle(file_handle_t& h) { + if(is_file_handle_valid(h)) { + h->close(); + } +} + +template <> +inline conduit::relay::io::IOHandle* uninitialized_file_handle() { + return nullptr; +} + +template +inline void sample_list_conduit_io_handle +::clear_file_handle(sample_list_conduit_io_handle::file_handle_t& h) { + h = uninitialized_file_handle(); +} + +} // end of namespace lbann + +#endif // __SAMPLE_LIST_CONDUIT_IO_HANDLE_HPP__ diff --git a/include/lbann/data_readers/sample_list_hdf5.hpp b/include/lbann/data_readers/sample_list_hdf5.hpp new file mode 100644 index 00000000000..f9181594076 --- /dev/null +++ b/include/lbann/data_readers/sample_list_hdf5.hpp @@ -0,0 +1,91 @@ +#ifndef __SAMPLE_LIST_HDF5_HPP__ +#define __SAMPLE_LIST_HDF5_HPP__ + +#include "sample_list_open_files.hpp" +#include "hdf5.h" +#include "conduit/conduit.hpp" +#include "conduit/conduit_relay.hpp" +#include "conduit/conduit_relay_io_hdf5.hpp" + +namespace lbann { + +template +class sample_list_hdf5 : public sample_list_open_files { + public: + using file_handle_t = hid_t; + using typename sample_list_open_files::sample_file_id_t; + using typename sample_list_open_files::sample_t; + using typename sample_list_open_files::samples_t; + using typename sample_list_open_files::file_id_stats_t; + using typename sample_list_open_files::file_id_stats_v_t; + using typename sample_list_open_files::fd_use_map_t; + + sample_list_hdf5(); + ~sample_list_hdf5() override; + + bool is_file_handle_valid(const hid_t& h) const override; + + protected: + void obtain_sample_names(hid_t& h, std::vector& sample_names) const override; + hid_t open_file_handle_for_read(const std::string& path) override; + void close_file_handle(hid_t& h) override; + void clear_file_handle(hid_t& h) override; +}; + + +template +inline sample_list_hdf5::sample_list_hdf5() +: sample_list_open_files() {} + +template +inline sample_list_hdf5::~sample_list_hdf5() { + // Close the existing open files + for(auto& f : this->m_file_id_stats_map) { + file_handle_t& h = std::get<1>(f); + close_file_handle(h); + clear_file_handle(h); + std::get<2>(f).clear(); + } + this->m_file_id_stats_map.clear(); +} + +template +inline void sample_list_hdf5 +::obtain_sample_names(hid_t& h, std::vector& sample_names) const { + conduit::relay::io::hdf5_group_list_child_names(h, "/", sample_names); +} + +template +inline bool sample_list_hdf5 +::is_file_handle_valid(const hid_t& h) const { + return (h > static_cast(0)); +} + +template +inline hid_t sample_list_hdf5< sample_name_t> +::open_file_handle_for_read(const std::string& file_path) { + return conduit::relay::io::hdf5_open_file_for_read(file_path); +} + +template +inline void sample_list_hdf5 +::close_file_handle(hid_t& h) { + if(is_file_handle_valid(h)) { + conduit::relay::io::hdf5_close_file(h); + } +} + +template <> +inline hid_t uninitialized_file_handle() { + return static_cast(0); +} + +template +inline void sample_list_hdf5 +::clear_file_handle(hid_t& h) { + h = uninitialized_file_handle(); +} + +} // end of namespace lbann + +#endif // __SAMPLE_LIST_HDF5_HPP__ diff --git a/include/lbann/data_readers/sample_list_impl.hpp b/include/lbann/data_readers/sample_list_impl.hpp new file mode 100644 index 00000000000..0f161bed61f --- /dev/null +++ b/include/lbann/data_readers/sample_list_impl.hpp @@ -0,0 +1,747 @@ 
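sample_list_conduit_io_handle above manages one conduit::relay::io::IOHandle per sample file: open with the "hdf5" protocol, list the top-level child names to enumerate samples, and close when done. The stand-alone sketch below walks that same lifecycle with the same Conduit calls; the file path is a placeholder.

    // Sketch: the handle lifecycle sample_list_conduit_io_handle wraps: open an
    // HDF5 file through conduit::relay::io::IOHandle, list the top-level sample
    // names, close. The path argument is a placeholder.
    #include "conduit/conduit_relay_io_handle.hpp"
    #include <iostream>
    #include <string>
    #include <vector>

    void list_samples(const std::string& path) {
      conduit::relay::io::IOHandle handle;
      handle.open(path, "hdf5");                   // protocol string as used above
      std::vector<std::string> names;
      handle.list_child_names("/", names);         // one entry per top-level sample
      for (const auto& n : names) { std::cout << n << "\n"; }
      handle.close();
    }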
+#include +#include +#include +#include +#include +#include +#include +#include +#include "lbann/utils/exception.hpp" +#include "lbann/utils/file_utils.hpp" +#include +#include +#include +#include +#include + +#include +#include + +namespace lbann { + +template +inline std::string to_string(const T val) { + return std::to_string(val); +} + +template<> +inline std::string to_string(const std::string val) { + return val; +} + +template +inline auto to_sample_name_t(const std::string& sn_str) -> decltype (sample_name_t()){ + LBANN_ERROR(std::string{} + " :: string conversion is not implement for the sample_name_t"); + return sample_name_t(); +} + +template<> inline int to_sample_name_t(const std::string& sn_str) { + return std::stoi(sn_str); +} + +template<> inline long to_sample_name_t(const std::string& sn_str) { + return std::stol(sn_str); +} + +template<> inline unsigned long to_sample_name_t(const std::string& sn_str) { + return std::stoul(sn_str); +} + +template<> inline long long to_sample_name_t(const std::string& sn_str) { + return std::stoll(sn_str); +} + +template<> inline unsigned long long to_sample_name_t(const std::string& sn_str) { + return std::stoull(sn_str); +} + +template<> inline float to_sample_name_t(const std::string& sn_str) { + return std::stof(sn_str); +} + +template<> inline double to_sample_name_t(const std::string& sn_str) { + return std::stod(sn_str); +} + +template<> inline long double to_sample_name_t(const std::string& sn_str) { + return std::stold(sn_str); +} + +template<> inline std::string to_sample_name_t(const std::string& sn_str) { + return sn_str; +} + +//------------------------ +// sample_list_header +//------------------------ + +inline sample_list_header::sample_list_header() + : m_is_exclusive(false), m_included_sample_count(0u), + m_excluded_sample_count(0u), m_num_files(0u), + m_file_dir("") { +} + +inline bool sample_list_header::is_exclusive() const { + return m_is_exclusive; +} + +inline size_t sample_list_header::get_sample_count() const { + return m_included_sample_count; +} + +inline size_t sample_list_header::get_num_files() const { + return m_num_files; +} + +inline const std::string& sample_list_header::get_sample_list_filename() const { + return m_sample_list_filename; +} + +inline const std::string& sample_list_header::get_file_dir() const { + return m_file_dir; +} + +//------------------ +// sample_list +//------------------ + +template +inline sample_list::sample_list() { +} + +template +inline sample_list::~sample_list() { +} + +template +inline sample_list +::sample_list(const sample_list& rhs) { + copy_members(rhs); +} + +template +inline sample_list& sample_list +::operator=(const sample_list& rhs) { + // check for self-assignment + if (this == &rhs) { + return (*this); + } + + copy_members(rhs); + + return (*this); +} + +template +inline sample_list& sample_list +::copy(const sample_list& rhs) { + // check for self-assignment + if (this == &rhs) { + return (*this); + } + + copy_members(rhs); + + return (*this); +} + +template +inline void sample_list +::copy_members(const sample_list& rhs) { + m_header = rhs.m_header; + m_sample_list = rhs.m_sample_list; + + /// Keep track of existing filenames + m_file_id_stats_map = rhs.m_file_id_stats_map; +} + +template +inline void sample_list +::load(const std::string& samplelist_file, + size_t stride, size_t offset) { + std::ifstream istr(samplelist_file); + get_samples_per_file(istr, samplelist_file, stride, offset); + istr.close(); +} + +template +inline sample_list_header sample_list 
+::load_header(const std::string& samplelist_file) const { + std::ifstream istr(samplelist_file); + return read_header(istr, samplelist_file); +} + +template +inline void sample_list +::load_from_string(const std::string& samplelist) { + std::istringstream istr(samplelist); + get_samples_per_file(istr, "", 1, 0); +} + +template +inline size_t sample_list +::size() const { + return m_sample_list.size(); +} + +template +inline size_t sample_list +::get_num_files() const { + return m_file_id_stats_map.size(); +} + +template +inline bool sample_list +::empty() const { + return (size() == 0ul); +} + +template +inline std::string sample_list +::read_header_line(std::istream& istrm, + const std::string& filename, + const std::string& info) const { + if (!istrm.good()) { + throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + + " :: unable to read the header line of sample list " + filename + " for " + info); + } + + std::string line; + std::getline(istrm, line); + + if (line.empty()) { + throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + + " :: unable to read the header line of sample list " + filename + " for " + info + + " -- the line was empty"); + } + return line; +} + + +template +inline sample_list_header sample_list +::read_header(std::istream& istrm, + const std::string& filename) const { + sample_list_header hdr; + + hdr.m_sample_list_filename = filename; + + std::string line1 = read_header_line(istrm, filename, "the exclusiveness"); + std::stringstream header1(line1); + + std::string line2 = read_header_line(istrm, filename, "the number of samples and the number of files"); + std::stringstream header2(line2); + + std::string line3 = read_header_line(istrm, filename, "the data file directory"); + std::stringstream header3(line3); + + std::string sample_list_type; + header1 >> sample_list_type; + std::for_each(sample_list_type.begin(), sample_list_type.end(), [](char& c){ c = std::toupper(c); }); + + const std::string type_exclusive = sample_exclusion_list; + size_t found = sample_list_type.find(type_exclusive); + + if (found != std::string::npos) { + hdr.m_is_exclusive = true; + } else { + hdr.m_is_exclusive = false; + } + + header2 >> hdr.m_included_sample_count; + header2 >> hdr.m_excluded_sample_count; + header2 >> hdr.m_num_files; + + header3 >> hdr.m_file_dir; + + if (hdr.get_file_dir().empty() || !check_if_dir_exists(hdr.get_file_dir())) { + LBANN_ERROR(std::string{} + "file " + filename + + " :: data root directory '" + hdr.get_file_dir() + "' does not exist."); + } + + return hdr; +} + + +template +inline void sample_list +::read_sample_list(std::istream& istrm, + size_t stride, size_t offset) { + m_sample_list.reserve(m_header.get_sample_count()); + + const std::string whitespaces(" \t\f\v\n\r"); + size_t cnt_files = 0u; + std::string line; + + while (std::getline(istrm, line)) { + const size_t end_of_str = line.find_last_not_of(whitespaces); + if (end_of_str == std::string::npos) { // empty line + continue; + } + if (cnt_files++ >= m_header.get_num_files()) { + break; + } + // Check to see if there is a strided load and skip the lines that are not for this rank + if ((cnt_files-1)%stride != offset) { + continue; + } + + std::stringstream sstr(line.substr(0, end_of_str + 1)); // clear trailing spaces for accurate parsing + std::string filename; + + sstr >> filename; + + const std::string file_path = add_delimiter(m_header.get_file_dir()) + filename; + + if (filename.empty() || !check_if_file_exists(file_path)) { + throw 
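read_header above expects a fixed three-line preamble: the inclusion/exclusion keyword, the included and excluded sample counts plus the number of files, and the data root directory. The base-class body that follows then lists one data file per line, each contributing a single sample. A small hypothetical list in that layout (the legacy keyword is shown; the new reader matches whatever string the sample_inclusion_list constant holds):

CONDUIT_HDF5_INCLUSION
2 0 2
/p/some/dataset/dir
bundle_000.h5
bundle_001.h5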
lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + + " :: data file '" + filename + "' does not exist."); + } + + const sample_file_id_t index = m_file_id_stats_map.size(); + static const auto sn0 = uninitialized_sample_name(); + m_sample_list.emplace_back(std::make_pair(index, sn0)); + m_file_id_stats_map.emplace_back(filename); + } + + if (m_header.get_num_files() != cnt_files) { + LBANN_ERROR(std::string("Sample list number of files requested ") + + std::to_string(m_header.get_num_files()) + + std::string(" does not equal number of files loaded ") + + std::to_string(cnt_files)); + } + + if(stride == 1 && m_header.get_sample_count() != m_sample_list.size()) { + LBANN_ERROR(std::string("Sample list count ") + + std::to_string(m_header.get_sample_count()) + + std::string(" does not equal sample list size ") + + std::to_string(m_sample_list.size())); + } +} + + +template +inline size_t sample_list +::get_samples_per_file(std::istream& istrm, + const std::string& filename, + size_t stride, size_t offset) { + m_header = read_header(istrm, filename); + + read_sample_list(istrm, stride, offset); + + return size(); +} + + +template +inline void sample_list +::all_gather_archive(const std::string &archive, + std::vector& gathered_archive, + lbann_comm& comm) { + if (!options::get()->get_bool("all_gather_old")) { + all_gather_archive_new(archive, gathered_archive, comm); + return; + } + + int size_of_list_archive = archive.size(); + std::vector packed_sizes(comm.get_procs_per_trainer()); + + comm.trainer_all_gather(size_of_list_archive, packed_sizes); + + int total_packed_size = 0; + std::vector displ; + displ.assign(comm.get_procs_per_trainer()+1, 0); + + for (size_t i = 0u; i < packed_sizes.size(); ++i) { + const auto sz = packed_sizes[i]; + displ[i+1] = displ[i] + sz; + } + total_packed_size = displ.back(); + + if (total_packed_size <= 0) { + return; + } + + std::string all_samples; + all_samples.resize(static_cast(total_packed_size)); + + std::vector local_data(archive.begin(), archive.end()); + std::vector packed_data(all_samples.size() * sizeof(decltype(all_samples)::value_type)); + comm.trainer_all_gather(local_data, + packed_data, + packed_sizes, + displ); + + for (size_t i = 0u; i < packed_sizes.size(); ++i) { + std::string& buf = gathered_archive[i]; + const auto sz = packed_sizes[i]; + displ[i+1] = displ[i] + sz; + std::vector::const_iterator first = packed_data.begin() + displ[i]; + std::vector::const_iterator last = packed_data.begin() + displ[i] + sz; + buf.resize(sz); + buf.assign(first, last); + } + return; +} + +template +inline void sample_list +::all_gather_archive_new(const std::string &archive, + std::vector& gathered_archive, + lbann_comm& comm) { + + // there's commented out code below to deal with the case where + // archive.size() > INT_MAX; but for now let's assume we won't + // encounter that (which is true for the 100M JAG set) + int constexpr max_int = std::numeric_limits::max(); + size_t n = archive.size(); + if (n > max_int) { + LBANN_ERROR("(n > max_int"); + } + + // change int to size_t for case where n > max_int (see commented out + // code block below) + int size_of_my_archive= archive.size(); + std::vector packed_sizes(comm.get_procs_per_trainer()); + comm.trainer_all_gather(size_of_my_archive, packed_sizes); + + int me = comm.get_rank_in_trainer(); + int np = comm.get_procs_per_trainer(); + + size_t g = 0; + for (auto t : packed_sizes) { + g += t; + } + if (!me) { + std::cout << "global archive size: " << g << std::endl; + } + + for 
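all_gather_archive above exchanges per-rank archive sizes first and then uses a displacement table to carve the flat gathered buffer back into one string per rank. A self-contained sketch of that bookkeeping, with no MPI and hypothetical helper names:

#include <cassert>
#include <string>
#include <vector>

// Given the bytes each rank contributed and the flat gathered buffer,
// rebuild one string per rank using prefix-sum displacements.
std::vector<std::string> split_gathered(const std::vector<char>& packed_data,
                                        const std::vector<int>& packed_sizes) {
  std::vector<int> displ(packed_sizes.size() + 1, 0);
  for (size_t i = 0; i < packed_sizes.size(); ++i) {
    displ[i + 1] = displ[i] + packed_sizes[i];  // prefix sums of the sizes
  }
  assert(static_cast<size_t>(displ.back()) == packed_data.size());

  std::vector<std::string> per_rank(packed_sizes.size());
  for (size_t i = 0; i < packed_sizes.size(); ++i) {
    per_rank[i].assign(packed_data.begin() + displ[i],
                       packed_data.begin() + displ[i + 1]);
  }
  return per_rank;
}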
(int p=0; p(gathered_archive[p].data()); + comm.trainer_broadcast(p, data, sz); + } + +#if 0 + std::vector rounds; + for (int p=0; p(archive.data() + offset); + comm.trainer_broadcast(p, data, rounds[k]); + } else { + char *data = const_cast(gathered_archive[p].data() + offset); + comm.trainer_broadcast(p, data, rounds[k]); + } + offset += rounds[k]; +if (me == p) { +std::cout << "XX finished round" << std::endl; +} + } + } +#endif + + return; +} + +template +template +inline size_t sample_list +::all_gather_field(T data, + std::vector& gathered_data, + lbann_comm& comm) { + std::string archive; + std::ostringstream ss; + { + cereal::BinaryOutputArchive oarchive(ss); + oarchive(data); + } // archive goes out of scope, ensuring all contents are flushed + archive = ss.str(); + + std::vector gathered_archive(comm.get_procs_per_trainer()); + + all_gather_archive(archive, gathered_archive, comm); + + std::vector per_rank_data(comm.get_procs_per_trainer()); + + size_t gathered_field_size = 0; + for (size_t i = 0u; i < gathered_archive.size(); ++i) { + std::string& buf = gathered_archive[i]; + T& tmp = gathered_data[i]; + + std::stringstream in_ss(buf); + cereal::BinaryInputArchive iarchive(in_ss); + iarchive(tmp); + gathered_field_size += tmp.size(); + } + return gathered_field_size; +} + +template +template +void sample_list +::serialize( Archive & ar ) { + ar(m_header, m_sample_list, m_file_id_stats_map); +} + +template +inline void sample_list +::write_header(std::string& sstr, size_t num_files) const { + // The first line indicate if the list is exclusive or inclusive + // The next line contains the number of samples (included and excluded), + // as well as the number of files, which are the same in this caes + // The next line contains the root data file directory + + sstr += (m_header.is_exclusive()? sample_exclusion_list + "\n" : sample_inclusion_list + "\n"); + size_t total, included, excluded; + get_num_samples(total, included, excluded); + /// TODO: clarify the comment below + /// Include the number of invalid samples, which for an inclusive index list is always 0 + sstr += std::to_string(included) + ' ' + std::to_string(excluded) + ' ' + std::to_string(num_files) + '\n'; + sstr += m_header.get_file_dir() + '\n'; +} + +template +inline void sample_list +::get_num_samples(size_t& total, size_t& included, size_t& excluded) const { + total = size(); + included = size(); + excluded = 0ul; +} + +template +inline bool sample_list +::to_string(std::string& sstr) const { + size_t total_len = 0ul; + for (const auto& s : m_sample_list) { + const std::string& filename = m_file_id_stats_map[s.first]; + total_len += filename.size() + 1u; + } + + sstr.clear(); + + // reserve the string to hold the entire sample lit + size_t estimated_len = 30 + 42 + m_header.get_file_dir().size() + 1 + total_len + 1000; + sstr.reserve(estimated_len); + + // write the list header + write_header(sstr, get_num_files()); + + // write the list body + for (const auto& s : m_sample_list) { + // File name + const std::string& filename = m_file_id_stats_map[s.first]; + sstr += filename + '\n'; + } + + return true; +} + +template +inline void sample_list +::write(const std::string filename) const { + std::string dir, basename; + parse_path(filename, dir, basename); + if (!dir.empty() && !check_if_dir_exists(dir)) { + // The creation of a shared directory must be done once in a coordinated fashion + // among the entities that have access to it. 
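all_gather_field above packs each rank's container with cereal before the exchange and unpacks the gathered buffers afterwards. A minimal round-trip sketch of that pattern, assuming cereal is on the include path (the pack/unpack helper names are mine):

#include <cereal/archives/binary.hpp>
#include <cereal/types/string.hpp>
#include <cereal/types/vector.hpp>
#include <sstream>
#include <string>
#include <vector>

// Serialize a container into a binary string.
std::string pack(const std::vector<std::string>& v) {
  std::ostringstream ss;
  {
    cereal::BinaryOutputArchive oar(ss);
    oar(v);
  }  // the archive must go out of scope before the buffer is complete
  return ss.str();
}

// Restore the container from the binary string.
std::vector<std::string> unpack(const std::string& buf) {
  std::istringstream ss(buf);
  cereal::BinaryInputArchive iar(ss);
  std::vector<std::string> v;
  iar(v);
  return v;
}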
Thus, it must be done in advance + std::cerr << "The sample list output directory (" + dir + ") does not exist" << std::endl; + return; + } + + std::fstream ofs(filename, std::fstream::out | std::fstream::binary); + + if (!ofs.good()) { + return; + } + + std::string buf; + to_string(buf); + + ofs.write(buf.data(), buf.size()*sizeof(std::string::value_type)); + ofs.close(); +} + +template +inline const typename sample_list::samples_t& +sample_list::get_list() const { + return m_sample_list; +} + +template +inline const sample_list_header& +sample_list::get_header() const { + return m_header; +} + +template +inline const typename sample_list::sample_t& +sample_list::operator[](size_t idx) const { + return m_sample_list[idx]; +} + +template +inline const std::string& sample_list +::get_samples_filename(sample_file_id_t id) const { + return m_file_id_stats_map[id]; +} + +template +inline const std::string& sample_list +::get_samples_dirname() const { + return m_header.get_file_dir(); +} + +template +inline void sample_list +::set_samples_filename(sample_file_id_t id, const std::string& filename) { + m_file_id_stats_map[id] = filename; +} + +#if defined(__cpp_if_constexpr) // c++17 +template +inline void sample_list +::assign_samples_name() { + if constexpr (std::is_integral::value + && !std::is_same::value) { + sample_name_t i = 0; + for (auto& s: m_sample_list) { + s.second = i++; + } + } else if constexpr (std::is_same::value) { + for (auto& s: m_sample_list) { + s.second = s.first; + } + } else { + LBANN_ERROR(std::string{} + " :: base class does not implement this method" + + " for the current sample name type"); + } +} + +template +inline sample_name_t uninitialized_sample_name() { + if constexpr (std::is_integral::value) { + return static_cast(0); + } else if constexpr (std::is_same::value) { + return ""; + } else if constexpr (std::is_floating_point::value) { + return 0.0; + } else if constexpr (std::is_default_constructible::value + && std::is_copy_constructible::value) { + sample_name_t ret{}; + return ret; + } else { + LBANN_ERROR(std::string{} + " :: base class does not implement this method" + + " for the current sample name type"); + } +} +#else +template<> inline void sample_list +::assign_samples_name() { + size_t i = 0ul; + for (auto& s: m_sample_list) { + s.second = i++; + } +} + +template<> inline void sample_list +::assign_samples_name() { + for (auto& s: m_sample_list) { + s.second = s.first; + } +} + +template +inline void sample_list +::assign_samples_name() { + LBANN_ERROR(std::string{} + " :: base class does not implement this method" + + " for the current sample name type"); +} + +template<> inline size_t uninitialized_sample_name() { + return 0ul; +} + +template<> inline std::string uninitialized_sample_name() { + return ""; +} + +template +inline sample_name_t uninitialized_sample_name() { + sample_name_t ret{}; + return ret; +} +#endif // defined(__cpp_if_constexpr) + +template +inline void sample_list +::all_gather_packed_lists(lbann_comm& comm) { + int num_ranks = comm.get_procs_per_trainer(); + typename std::vector per_rank_samples(num_ranks); + typename std::vector> per_rank_files(num_ranks); + + size_t num_samples = all_gather_field(m_sample_list, per_rank_samples, comm); + size_t num_ids = all_gather_field(m_file_id_stats_map, per_rank_files, comm); + + m_sample_list.clear(); + m_file_id_stats_map.clear(); + + m_sample_list.reserve(num_samples); + m_file_id_stats_map.reserve(num_ids); + + for(int r = 0; r < num_ranks; r++) { + const samples_t& s_list = 
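assign_samples_name and uninitialized_sample_name above rely on if constexpr so that only the branch matching the sample-name type is instantiated, with explicit specializations as the pre-C++17 fallback. A tiny standalone analogue of that dispatch (default_name is a hypothetical stand-in, not part of the patch):

#include <iostream>
#include <string>
#include <type_traits>

// Pick a starting value per name type at compile time; unused branches
// are never instantiated under C++17.
template <typename name_t>
name_t default_name() {
  if constexpr (std::is_integral<name_t>::value) {
    return static_cast<name_t>(0);   // numeric names start at zero
  } else if constexpr (std::is_same<name_t, std::string>::value) {
    return "";                       // string names start empty
  } else {
    return name_t{};                 // anything default-constructible
  }
}

int main() {
  std::cout << default_name<size_t>() << " '"
            << default_name<std::string>() << "'\n";
  return 0;
}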
per_rank_samples[r]; + const auto& files = per_rank_files[r]; + for (const auto& s : s_list) { + sample_file_id_t index = s.first; + const std::string& filename = files[index]; + if(index >= m_file_id_stats_map.size() + || (m_file_id_stats_map.back() != filename)) { + index = m_file_id_stats_map.size(); + m_file_id_stats_map.emplace_back(filename); + }else { + for(size_t i = 0; i < m_file_id_stats_map.size(); i++) { + if(filename == m_file_id_stats_map[i]) { + index = i; + break; + } + } + } + static const auto sn0 = uninitialized_sample_name(); + m_sample_list.emplace_back(std::make_pair(index, sn0)); + } + } + + assign_samples_name(); + + return; +} + +} // end of namespace lbann diff --git a/include/lbann/data_readers/sample_list_jag.hpp b/include/lbann/data_readers/sample_list_jag.hpp deleted file mode 100644 index 07040a80d48..00000000000 --- a/include/lbann/data_readers/sample_list_jag.hpp +++ /dev/null @@ -1,321 +0,0 @@ -#ifndef __SAMPLE_LIST_JAG_HPP__ -#define __SAMPLE_LIST_JAG_HPP__ - -#include -#include -#include -#include - -#ifndef _JAG_OFFLINE_TOOL_MODE_ -#include "lbann/comm.hpp" -#else -#include -#endif - -#include "lbann/utils/file_utils.hpp" -#include -#include -#include -#include -#include -#include -#include "conduit/conduit_relay_io_hdf5.hpp" - -/// Number of system and other files that may be open during execution -#define LBANN_MAX_OPEN_FILE_MARGIN 128 -#define LBANN_MAX_OPEN_FILE_RETRY 3 - -namespace lbann { - -struct sample_list_header { - bool m_is_exclusive; - /// Number of included samples - size_t m_included_sample_count; - /// Number of excluded samples - size_t m_excluded_sample_count; - size_t m_num_files; - std::string m_file_dir; - std::string m_sample_list_filename; - - sample_list_header(); - - bool is_exclusive() const; - size_t get_sample_count() const; - size_t get_num_files() const; - const std::string& get_sample_list_filename() const; - const std::string& get_file_dir() const; - template void serialize( Archive & ar ) { - ar(m_is_exclusive, m_included_sample_count, m_excluded_sample_count, m_num_files, m_file_dir, m_sample_list_filename); - } -}; - -static const std::string conduit_hdf5_exclusion_list = "CONDUIT_HDF5_EXCLUSION"; -static const std::string conduit_hdf5_inclusion_list = "CONDUIT_HDF5_INCLUSION"; - -class sample_list_jag { - public: - /// The type of the native identifier of a sample rather than an arbitrarily assigned index - using sample_name_t = std::string; - /// The type for arbitrarily assigned index - using sample_file_id_t = std::size_t; - /// To describe a sample as a pair of the file to which it belongs and its name - // using sample_t = std::pair; - using sample_t = std::pair; - /// Statistics for each file used by the sample list: includes the file name, file descriptor, and - /// and a queue of each step and substep when data will be loaded from the file - using file_id_stats_t = std::tuple>>; - - /// Type for the list of samples - using samples_t = std::vector< sample_t >; - /// Mapping of the file index to the statistics for each file - using file_id_stats_v_t = std::vector< file_id_stats_t >; // rename to sample_to_file_v or something - /// Type for the map of file descriptors to usage step and substep - using fd_use_map_t = std::pair>; - - sample_list_jag(); - ~sample_list_jag(); - sample_list_jag(const sample_list_jag& rhs); - sample_list_jag& operator=(const sample_list_jag& rhs); - sample_list_jag& copy(const sample_list_jag& rhs); - - void copy_members(const sample_list_jag& rhs); - - /// Load a sample list file - 
void load(const std::string& samplelist_file, size_t stride=1, size_t offset=0); - - /// Load the header of a sample list file - sample_list_header load_header(const std::string& samplelist_file) const; - - /// Extract a sample list from a serialized sample list in a string - void load_from_string(const std::string& samplelist); - - /// Tells how many samples in the list - size_t size() const; - - /// Tells if the internal list is empty - bool empty() const; - - /// Clear internal states - void clear(); - - template void serialize( Archive & ar ); - - /// Check if a sample index is in the valid range - bool check_index(size_t idx) const; - - /// Serialize sample list - bool to_string(std::string& sstr) const; - - /// Write the sample list - void write(const std::string filename) const; - - /// Allow read-only access to the internal list data - const samples_t& get_list() const; - - /// Allow the read-only access to the list header - const sample_list_header& get_header() const; - - /// Allow read-only access to the metadata of the idx-th sample in the list - const sample_t& operator[](size_t idx) const; - - const std::string& get_samples_filename(sample_file_id_t id) const { - return std::get<0>(m_file_id_stats_map[id]); - } - - const std::string& get_samples_dirname() const { - return m_header.get_file_dir(); - } - - hid_t get_samples_hdf5_handle(sample_file_id_t id) const { - hid_t h = std::get<1>(m_file_id_stats_map[id]); - return h; - } - - void set_samples_filename(sample_file_id_t id, const std::string& filename) { - std::get<0>(m_file_id_stats_map[id]) = filename; - } - - void set_files_hdf5_handle(const std::string& filename, hid_t h) { - sample_file_id_t id = 0; - for (auto&& e : m_file_id_stats_map) { - if(std::get<0>(e) == filename) { - std::get<1>(e) = h; - break; - } - id++; - } - manage_open_hdf5_handles(id, true); - } - - void delete_hdf5_handle_pq_entry(sample_file_id_t id) { - for (std::deque::iterator it = m_open_fd_pq.begin(); it!=m_open_fd_pq.end(); ++it) { - if(it->first == id) { - it = m_open_fd_pq.erase(it); - break; - } - } - return; - } - - void manage_open_hdf5_handles(sample_file_id_t id, bool pre_open_fd = false) { - /// When we enter this function the priority queue is either empty or a heap - if(!m_open_fd_pq.empty()) { - if(m_open_fd_pq.size() > m_max_open_files) { - auto& f = m_open_fd_pq.front(); - auto& victim = m_file_id_stats_map[f.first]; - hid_t victim_fd = std::get<1>(victim); - std::pop_heap(m_open_fd_pq.begin(), m_open_fd_pq.end(), pq_cmp); - m_open_fd_pq.pop_back(); - if(victim_fd > 0) { - conduit::relay::io::hdf5_close_file(victim_fd); - std::get<1>(victim) = 0; - } - } - } - - /// Before we can enqueue the any new access times for this descriptor, remove any - /// earlier descriptor - std::sort_heap(m_open_fd_pq.begin(), m_open_fd_pq.end(), pq_cmp); - if(m_open_fd_pq.front().first == id) { - m_open_fd_pq.pop_front(); - } - std::make_heap(m_open_fd_pq.begin(), m_open_fd_pq.end(), pq_cmp); - - auto& e = m_file_id_stats_map[id]; - auto& file_access_queue = std::get<2>(e); - if(!file_access_queue.empty()) { - if(!pre_open_fd) { - file_access_queue.pop_front(); - } - } - if(!file_access_queue.empty()) { - m_open_fd_pq.emplace_back(std::make_pair(id,file_access_queue.front())); - }else { - /// If there are no future access of the file place a terminator entry to track - /// the open file, but is always sorted to the top of the heap - m_open_fd_pq.emplace_back(std::make_pair(id,std::make_pair(INT_MAX,id))); - } - std::push_heap(m_open_fd_pq.begin(), 
m_open_fd_pq.end(), pq_cmp); - return; - } - - hid_t open_samples_hdf5_handle(const size_t i, bool pre_open_fd = false) { - const sample_t& s = m_sample_list[i]; - sample_file_id_t id = s.first; - hid_t h = get_samples_hdf5_handle(id); - if (h <= static_cast(0)) { - const std::string& file_name = get_samples_filename(id); - const std::string conduit_file_path = add_delimiter(get_samples_dirname()) + file_name; - if (file_name.empty() || !check_if_file_exists(conduit_file_path)) { - LBANN_ERROR(std::string{} + " :: data file '" + conduit_file_path + "' does not exist."); - } - bool retry = false; - int retry_cnt = 0; - do { - try { - h = conduit::relay::io::hdf5_open_file_for_read( conduit_file_path ); - }catch (conduit::Error const& e) { - LBANN_WARNING(" :: trying to open the file " + conduit_file_path + " and got " + e.what()); - retry = true; - retry_cnt++; - }catch (...) { - LBANN_ERROR("trying to open the file " + conduit_file_path + " and got an unknown exception"); - } - }while(retry && retry_cnt < 3); - - if (h <= static_cast(0)) { - LBANN_ERROR(std::string{} + " :: data file '" + conduit_file_path + "' could not be opened."); - } - auto& e = m_file_id_stats_map[id]; - std::get<1>(e) = h; - /// If a new file is opened, place it in the priority queue - manage_open_hdf5_handles(id, pre_open_fd); - } - return h; - } - - void close_if_done_samples_hdf5_handle(const size_t i) { - const sample_t& s = m_sample_list[i]; - sample_file_id_t id = s.first; - hid_t h = get_samples_hdf5_handle(id); - if (h > static_cast(0)) { - auto& e = m_file_id_stats_map[id]; - auto& file_access_queue = std::get<2>(e); - if(file_access_queue.empty()) { - conduit::relay::io::hdf5_close_file(std::get<1>(e)); - std::get<1>(e) = 0; - delete_hdf5_handle_pq_entry(id); - } - } - } - - void all_gather_archive(const std::string &archive, std::vector& gathered_archive, lbann_comm& comm); - template size_t all_gather_field(T data, std::vector& gathered_data, lbann_comm& comm); - void all_gather_packed_lists(lbann_comm& comm); - - void compute_epochs_file_usage(const std::vector& shufled_indices, int mini_batch_size, const lbann_comm& comm); - - protected: - - /// Reads a header line from the sample list given as a stream, and use the info string for error message - std::string read_header_line(std::istream& ifs, const std::string& filename, const std::string& info) const; - - /// Reads the header of a sample list - sample_list_header read_header(std::istream& istrm, const std::string& filename) const; - - /// Get the list of samples that exist in a conduit bundle - hid_t get_conduit_bundle_samples(std::string conduit_file_path, std::vector& sample_names, size_t included_samples, size_t excluded_samples); - - /// read the body of exclusive sample list - void read_exclusive_list(std::istream& istrm, size_t stride=1, size_t offset=0); - - /// read the body of inclusive sample list - void read_inclusive_list(std::istream& istrm, size_t stride=1, size_t offset=0); - - /// Reads a sample list and populates the internal list - size_t get_samples_per_file(std::istream& istrm, const std::string& filename, size_t stride=1, size_t offset=0); - - /// Add the header info to the given string - void write_header(std::string& sstr, size_t num_files) const; - - static bool pq_cmp(fd_use_map_t left, fd_use_map_t right) { - return ((left.second).first < (right.second).first) || - (((left.second).first == (right.second).first) && - ((left.second).second < (right.second).second)); } - - private: - /// header info of sample list - 
sample_list_header m_header; - - /// List of all samples with a file identifier and sample name for each sample - samples_t m_sample_list; - - /// Maps sample's file id to file names, file descriptors, and use counts - file_id_stats_v_t m_file_id_stats_map; - - /// Track the number of samples per file - std::unordered_map m_file_map; - - /// Track the number of open file descriptors and when they will be used next - std::deque m_open_fd_pq; - - size_t m_max_open_files; -}; - -void handle_mpi_error(int ierr); - -#ifndef _JAG_OFFLINE_TOOL_MODE_ -void distribute_sample_list(const sample_list_jag& sn, - std::string& my_samples, - lbann_comm& comm); -#else -void distribute_sample_list(const sample_list_jag& sn, - std::string& my_samples, - MPI_Comm& comm); -#endif - -} // end of namespace - -#include "sample_list_jag_impl.hpp" - -#endif // __SAMPLE_LIST_JAG_HPP__ diff --git a/include/lbann/data_readers/sample_list_jag_impl.hpp b/include/lbann/data_readers/sample_list_jag_impl.hpp deleted file mode 100644 index 6b7ea1eeaa8..00000000000 --- a/include/lbann/data_readers/sample_list_jag_impl.hpp +++ /dev/null @@ -1,683 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include "sample_list_jag.hpp" -#include "lbann/utils/exception.hpp" -#include "lbann/utils/file_utils.hpp" -#include -#include "hdf5.h" -#include "conduit/conduit.hpp" -#include "conduit/conduit_relay.hpp" -#include "conduit/conduit_relay_io_hdf5.hpp" -#include -#include - -#include -#include -#include - -namespace lbann { - -inline sample_list_header::sample_list_header() - : m_is_exclusive(false), m_included_sample_count(0u), m_excluded_sample_count(0u), m_num_files(0u), m_file_dir("") { -} - -inline bool sample_list_header::is_exclusive() const { - return m_is_exclusive; -} - -inline size_t sample_list_header::get_sample_count() const { - return m_included_sample_count; -} - -inline size_t sample_list_header::get_num_files() const { - return m_num_files; -} - -inline const std::string& sample_list_header::get_sample_list_filename() const { - return m_sample_list_filename; -} - -inline const std::string& sample_list_header::get_file_dir() const { - return m_file_dir; -} - -inline sample_list_jag::sample_list_jag() { - m_max_open_files = getdtablesize() - LBANN_MAX_OPEN_FILE_MARGIN; -} - -inline sample_list_jag::~sample_list_jag() { - // Close the existing open files - for(auto f : m_file_id_stats_map) { - if(std::get<1>(f) > 0) { - conduit::relay::io::hdf5_close_file(std::get<1>(f)); - } - std::get<1>(f) = 0; - std::get<2>(f).clear(); - } - m_file_id_stats_map.clear(); - m_open_fd_pq.clear(); -} - -inline sample_list_jag::sample_list_jag(const sample_list_jag& rhs) { - copy_members(rhs); -} - -inline sample_list_jag& sample_list_jag::operator=(const sample_list_jag& rhs) { - // check for self-assignment - if (this == &rhs) { - return (*this); - } - - copy_members(rhs); - - return (*this); -} - -inline sample_list_jag& sample_list_jag::copy(const sample_list_jag& rhs) { - // check for self-assignment - if (this == &rhs) { - return (*this); - } - - copy_members(rhs); - - return (*this); -} - -inline void sample_list_jag::copy_members(const sample_list_jag& rhs) { - m_header = rhs.m_header; - m_sample_list = rhs.m_sample_list; - m_file_id_stats_map = rhs.m_file_id_stats_map; - m_file_map = rhs.m_file_map; - m_max_open_files = rhs.m_max_open_files; - - /// Keep track of existing filenames but do not copy any file - /// descriptor information - for(auto&& e : m_file_id_stats_map) { - 
if(std::get<1>(e) > 0) { - std::get<1>(e) = 0; - } - std::get<2>(e).clear(); - } - - /// Do not copy the open file descriptor priority queue - /// File handle ownership is not transfered in the copy - m_open_fd_pq.clear(); -} - -inline void sample_list_jag::load(const std::string& samplelist_file, size_t stride, size_t offset) { - std::ifstream istr(samplelist_file); - get_samples_per_file(istr, samplelist_file, stride, offset); - istr.close(); -} - -inline sample_list_header sample_list_jag::load_header(const std::string& samplelist_file) const { - std::ifstream istr(samplelist_file); - return read_header(istr, samplelist_file); -} - -inline void sample_list_jag::load_from_string(const std::string& samplelist) { - std::istringstream istr(samplelist); - get_samples_per_file(istr, "", 1, 0); -} - -inline size_t sample_list_jag::size() const { - return m_sample_list.size(); -} - -inline bool sample_list_jag::empty() const { - return m_sample_list.empty(); -} - -inline std::string sample_list_jag::read_header_line(std::istream& istrm, const std::string& filename, const std::string& info) const { - if (!istrm.good()) { - throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) - + " :: unable to read the header line of sample list " + filename + " for " + info); - } - - std::string line; - std::getline(istrm, line); - - if (line.empty()) { - throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) - + " :: unable to read the header line of sample list " + filename + " for " + info - + " -- the line was empty"); - } - return line; -} - - -inline sample_list_header sample_list_jag::read_header(std::istream& istrm, const std::string& filename) const { - sample_list_header hdr; - - hdr.m_sample_list_filename = filename; - - std::string line1 = read_header_line(istrm, filename, "the exclusiveness"); - std::stringstream header1(line1); - - std::string line2 = read_header_line(istrm, filename, "the number of samples and the number of files"); - std::stringstream header2(line2); - - std::string line3 = read_header_line(istrm, filename, "the data file directory"); - std::stringstream header3(line3); - - std::string sample_list_type; - header1 >> sample_list_type; - std::for_each(sample_list_type.begin(), sample_list_type.end(), [](char& c){ c = std::toupper(c); }); - - const std::string type_exclusive = conduit_hdf5_exclusion_list; - size_t found = sample_list_type.find(type_exclusive); - - if (found != std::string::npos) { - hdr.m_is_exclusive = true; - } else { - hdr.m_is_exclusive = false; - } - - header2 >> hdr.m_included_sample_count; - header2 >> hdr.m_excluded_sample_count; - header2 >> hdr.m_num_files; - - header3 >> hdr.m_file_dir; - - if (hdr.get_file_dir().empty() || !check_if_dir_exists(hdr.get_file_dir())) { - LBANN_ERROR(std::string{} + "file " + filename - + " :: data root directory '" + hdr.get_file_dir() + "' does not exist."); - } - - return hdr; -} - -inline hid_t sample_list_jag::get_conduit_bundle_samples(std::string conduit_file_path, std::vector& sample_names, size_t included_samples, size_t excluded_samples) { - hid_t hdf5_file_hnd = 0; - bool retry = false; - int retry_cnt = 0; - do { - try { - hdf5_file_hnd = conduit::relay::io::hdf5_open_file_for_read( conduit_file_path ); - }catch (conduit::Error const& e) { - LBANN_WARNING(" :: trying to open the file " + conduit_file_path + " and got " + e.what()); - retry = true; - retry_cnt++; - } - }while(retry && retry_cnt < LBANN_MAX_OPEN_FILE_RETRY); - - if (hdf5_file_hnd <= 
static_cast(0)) { - std::cout << "Opening the file didn't work" << std::endl; - return hdf5_file_hnd; - } - - conduit::relay::io::hdf5_group_list_child_names(hdf5_file_hnd, "/", sample_names); - - if(sample_names.size() != (included_samples + excluded_samples)) { - LBANN_ERROR(std::string("File does not contain the correct number of samples: found ") - + std::to_string(sample_names.size()) - + std::string(" -- this does not equal the expected number of samples that are marked for inclusion: ") - + std::to_string(included_samples) - + std::string(" and exclusion: ") - + std::to_string(excluded_samples)); - } - - return hdf5_file_hnd; -} - -inline void sample_list_jag::read_exclusive_list(std::istream& istrm, size_t stride, size_t offset) { - const std::string whitespaces(" \t\f\v\n\r"); - size_t cnt_files = 0u; - std::string line; - - while (std::getline(istrm, line)) { - const size_t end_of_str = line.find_last_not_of(whitespaces); - if (end_of_str == std::string::npos) { // empty line - continue; - } - if (cnt_files++ >= m_header.get_num_files()) { - break; - } - // Check to see if there is a strided load and skip the lines that are not for this rank - if ((cnt_files-1)%stride != offset) { - continue; - } - - std::stringstream sstr(line.substr(0, end_of_str + 1)); // clear trailing spaces for accurate parsing - std::string filename; - size_t included_samples; - size_t excluded_samples; - std::unordered_set excluded_sample_indices; - - sstr >> filename >> included_samples >> excluded_samples; - - const std::string conduit_file_path = add_delimiter(m_header.get_file_dir()) + filename; - - if (filename.empty() || !check_if_file_exists(conduit_file_path)) { - LBANN_ERROR(std::string{} + " :: data file '" + conduit_file_path + "' does not exist."); - } - - excluded_sample_indices.reserve(excluded_samples); - - while(!sstr.eof()) { - std::string index; - sstr >> index; - excluded_sample_indices.insert(index); - } - - if(excluded_sample_indices.size() != excluded_samples) { - LBANN_ERROR(std::string("Index file does not contain the correct number of excluded samples: expected ") - + std::to_string(excluded_samples) - + std::string(" exclusions but found ") - + std::to_string(excluded_sample_indices.size())); - } - - std::vector sample_names; - hid_t hdf5_file_hnd = get_conduit_bundle_samples(conduit_file_path, sample_names, included_samples, excluded_samples); - if(hdf5_file_hnd <= static_cast(0)) { - continue; // skipping the file - } - - if(m_file_map.count(filename) > 0) { - if(sample_names.size() != m_file_map[filename]) { - LBANN_ERROR(std::string("The same file ") - + filename - + " was opened multiple times and reported different sizes: " - + std::to_string(sample_names.size()) - + " and " - + std::to_string(m_file_map[filename])); - } - }else { - m_file_map[filename] = sample_names.size(); - } - - sample_file_id_t index = m_file_id_stats_map.size(); - m_file_id_stats_map.emplace_back(std::make_tuple(filename, 0, std::deque>{})); - set_files_hdf5_handle(filename, hdf5_file_hnd); - - size_t valid_sample_count = 0u; - for(auto s : sample_names) { - std::unordered_set::const_iterator found = excluded_sample_indices.find(s); - if (found != excluded_sample_indices.cend()) { - continue; - } - m_sample_list.emplace_back(index, s); - valid_sample_count++; - } - - if(valid_sample_count != included_samples) { - LBANN_ERROR(std::string("Bundle file does not contain the correct number of included samples: expected ") - + std::to_string(included_samples) - + std::string(" samples, but found ") - + 
std::to_string(valid_sample_count)); - } - } - - if (m_header.get_num_files() != cnt_files) { - LBANN_ERROR(std::string("Sample list ") - + m_header.get_sample_list_filename() - + std::string(": number of files requested ") - + std::to_string(m_header.get_num_files()) - + std::string(" does not equal number of files loaded ") - + std::to_string(cnt_files)); - } - - m_header.m_is_exclusive = false; -} - - -inline void sample_list_jag::read_inclusive_list(std::istream& istrm, size_t stride, size_t offset) { - const std::string whitespaces(" \t\f\v\n\r"); - size_t cnt_files = 0u; - std::string line; - - while (std::getline(istrm, line)) { - const size_t end_of_str = line.find_last_not_of(whitespaces); - if (end_of_str == std::string::npos) { // empty line - continue; - } - if (cnt_files++ >= m_header.get_num_files()) { - break; - } - // Check to see if there is a strided load and skip the lines that are not for this rank - if ((cnt_files-1)%stride != offset) { - continue; - } - - std::stringstream sstr(line.substr(0, end_of_str + 1)); // clear trailing spaces for accurate parsing - std::string filename; - size_t included_samples; - size_t excluded_samples; - - sstr >> filename >> included_samples >> excluded_samples; - - const std::string conduit_file_path = add_delimiter(m_header.get_file_dir()) + filename; - - if (filename.empty() || !check_if_file_exists(conduit_file_path)) { - throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) - + " :: data file '" + filename + "' does not exist."); - } - - std::vector sample_names; - hid_t hdf5_file_hnd = get_conduit_bundle_samples(conduit_file_path, sample_names, included_samples, excluded_samples); - if(hdf5_file_hnd <= static_cast(0)) { - continue; // skipping the file - } - - if(m_file_map.count(filename) > 0) { - if(sample_names.size() != m_file_map[filename]) { - LBANN_ERROR(std::string("The same file ") - + filename - + " was opened multiple times and reported different sizes: " - + std::to_string(sample_names.size()) - + " and " - + std::to_string(m_file_map[filename])); - } - }else { - m_file_map[filename] = sample_names.size(); - } - - std::unordered_set set_of_samples(sample_names.begin(), sample_names.end()); - - sample_file_id_t index = m_file_id_stats_map.size(); - m_file_id_stats_map.emplace_back(std::make_tuple(filename, 0, std::deque>{})); - set_files_hdf5_handle(filename, hdf5_file_hnd); - - size_t valid_sample_count = 0u; - while(!sstr.eof()) { - std::string sample_name;; - sstr >> sample_name; - std::unordered_set::const_iterator found = set_of_samples.find(sample_name); - if (found == set_of_samples.cend()) { - LBANN_ERROR(std::string("Illegal request for a data ID that does not exist: ") + sample_name); - } - m_sample_list.emplace_back(index, sample_name); - valid_sample_count++; - } - if(valid_sample_count != included_samples) { - LBANN_ERROR(std::string("Bundle file does not contain the correct number of included samples: expected ") - + std::to_string(included_samples) - + std::string(" samples, but found ") - + std::to_string(valid_sample_count)); - } - } - - if (m_header.get_num_files() != cnt_files) { - LBANN_ERROR(std::string("Sample list number of files requested ") - + std::to_string(m_header.get_num_files()) - + std::string(" does not equal number of files loaded ") - + std::to_string(cnt_files)); - } -} - - -inline size_t sample_list_jag::get_samples_per_file(std::istream& istrm, const std::string& filename, size_t stride, size_t offset) { - m_header = read_header(istrm, filename); - 
m_sample_list.reserve(m_header.get_sample_count()); - - if (m_header.is_exclusive()) { - read_exclusive_list(istrm, stride, offset); - } else { - read_inclusive_list(istrm, stride, offset); - } - - if(stride == 1 && m_header.get_sample_count() != m_sample_list.size()) { - LBANN_ERROR(std::string("Sample list count ") - + std::to_string(m_header.get_sample_count()) - + std::string(" does not equal sample list size ") - + std::to_string(m_sample_list.size())); - } - - return m_sample_list.size(); -} - - -inline void sample_list_jag::all_gather_archive(const std::string &archive, std::vector& gathered_archive, lbann_comm& comm) { - int size_of_list_archive = archive.size(); - std::vector packed_sizes(comm.get_procs_per_trainer()); - - comm.trainer_all_gather(size_of_list_archive, packed_sizes); - - int total_packed_size = 0; - std::vector displ; - displ.assign(comm.get_procs_per_trainer()+1, 0); - - for (size_t i = 0u; i < packed_sizes.size(); ++i) { - const auto sz = packed_sizes[i]; - displ[i+1] = displ[i] + sz; - } - total_packed_size = displ.back(); - - if (total_packed_size <= 0) { - return; - } - - std::string all_samples; - all_samples.resize(static_cast(total_packed_size)); - - std::vector local_data(archive.begin(), archive.end()); - std::vector packed_data(all_samples.begin(), all_samples.end()); - comm.trainer_all_gather(local_data, - packed_data, - packed_sizes, - displ); - - for (size_t i = 0u; i < packed_sizes.size(); ++i) { - std::string& buf = gathered_archive[i]; - const auto sz = packed_sizes[i]; - displ[i+1] = displ[i] + sz; - std::vector::const_iterator first = packed_data.begin() + displ[i]; - std::vector::const_iterator last = packed_data.begin() + displ[i] + sz; - buf.resize(sz); - buf.assign(first, last); - } - return; -} - -template -inline size_t sample_list_jag::all_gather_field(T data, std::vector& gathered_data, lbann_comm& comm) { - std::string archive; - std::stringstream ss; - cereal::BinaryOutputArchive oarchive(ss); - oarchive(data); - archive = ss.str(); - - std::vector gathered_archive(comm.get_procs_per_trainer()); - - all_gather_archive(archive, gathered_archive, comm); - - std::vector per_rank_data(comm.get_procs_per_trainer()); - - size_t gathered_field_size = 0; - for (size_t i = 0u; i < gathered_archive.size(); ++i) { - std::string& buf = gathered_archive[i]; - T& tmp = gathered_data[i]; - - std::stringstream in_ss(buf); - cereal::BinaryInputArchive iarchive(in_ss); - iarchive(tmp); - gathered_field_size += tmp.size(); - } - return gathered_field_size; -} - -inline void sample_list_jag::all_gather_packed_lists(lbann_comm& comm) { - int num_ranks = comm.get_procs_per_trainer(); - std::vector per_rank_samples(num_ranks); - std::vector per_rank_file_id_stats_map(num_ranks); - std::vector> per_rank_file_map(num_ranks); - - // Close the existing open files - for(auto&& e : m_file_id_stats_map) { - if(std::get<1>(e) > 0) { - conduit::relay::io::hdf5_close_file(std::get<1>(e)); - std::get<1>(e) = 0; - } - std::get<2>(e).clear(); - } - m_open_fd_pq.clear(); - - size_t num_samples = all_gather_field(m_sample_list, per_rank_samples, comm); - size_t num_ids = all_gather_field(m_file_id_stats_map, per_rank_file_id_stats_map, comm); - size_t num_files = all_gather_field(m_file_map, per_rank_file_map, comm); - - m_sample_list.clear(); - m_file_id_stats_map.clear(); - - m_sample_list.reserve(num_samples); - m_file_id_stats_map.reserve(num_ids); - m_file_map.reserve(num_files); - - for(int r = 0; r < num_ranks; r++) { - const samples_t& sample_list = 
per_rank_samples[r]; - const file_id_stats_v_t& file_id_stats_map = per_rank_file_id_stats_map[r]; - const std::unordered_map& file_map = per_rank_file_map[r]; - for (const auto& s : sample_list) { - sample_file_id_t index = s.first; - const std::string& filename = std::get<0>(file_id_stats_map[index]); - if(index >= m_file_id_stats_map.size() - || (std::get<0>(m_file_id_stats_map.back()) != filename)) { - index = m_file_id_stats_map.size(); - m_file_id_stats_map.emplace_back(std::make_tuple(filename, 0, std::deque>{})); - // Update the file map structure - if(m_file_map.count(filename) == 0) { - m_file_map[filename] = file_map.at(filename); - } - }else { - for(size_t i = 0; i < m_file_id_stats_map.size(); i++) { - if(filename == std::get<0>(m_file_id_stats_map[i])) { - index = i; - break; - } - } - } - m_sample_list.emplace_back(std::make_pair(index, s.second)); - } - } - - return; -} - -inline void sample_list_jag::compute_epochs_file_usage(const std::vector& shuffled_indices, int mini_batch_size, const lbann_comm& comm) { - for (auto&& e : m_file_id_stats_map) { - if(std::get<1>(e) > 0) { - conduit::relay::io::hdf5_close_file(std::get<1>(e)); - } - std::get<1>(e) = 0; - std::get<2>(e).clear(); - } - // Once all of the file handles are closed, clear the priority queue - m_open_fd_pq.clear(); - - for (size_t i = 0; i < shuffled_indices.size(); i++) { - int idx = shuffled_indices[i]; - const auto& s = m_sample_list[idx]; - sample_file_id_t index = s.first; - - if((i % mini_batch_size) % comm.get_procs_per_trainer() == static_cast(comm.get_rank_in_trainer())) { - /// Enqueue the iteration step when the sample will get used - int step = i / mini_batch_size; - int substep = (i % mini_batch_size) / comm.get_procs_per_trainer(); - std::get<2>(m_file_id_stats_map[index]).emplace_back(std::make_pair(step, substep)); - } - } -} - -inline void sample_list_jag::clear() { - m_sample_list.clear(); -} - -template void sample_list_jag::serialize( Archive & ar ) { - ar(m_header, m_sample_list, m_file_id_stats_map); -} - -inline void sample_list_jag::write_header(std::string& sstr, size_t num_files) const { - // The first line indicate if the list is exclusive or inclusive - // The next line contains the number of samples and the number of files, which are the same in this caes - // The next line contains the root data file directory - - sstr += (m_header.is_exclusive()? 
conduit_hdf5_exclusion_list + "\n" : conduit_hdf5_inclusion_list + "\n"); - /// Include the number of invalid samples, which for an inclusive index list is always 0 - sstr += std::to_string(m_sample_list.size()) + " 0 " + std::to_string(num_files) + '\n'; - sstr += m_header.get_file_dir() + '\n'; -} - - -inline bool sample_list_jag::to_string(std::string& sstr) const { - std::map> tmp_file_map; - for (const auto& s : m_sample_list) { - std::string filename = std::get<0>(m_file_id_stats_map[s.first]); - tmp_file_map[filename].emplace_back(s.second); - } - - samples_t::const_iterator it_begin = m_sample_list.cbegin(); - samples_t::const_iterator it_end = m_sample_list.cbegin(); - - sstr.clear(); - - // reserve the string to hold the entire sample lit - size_t estimated_len = 30 + 42 + m_header.get_file_dir().size() + 1; - if (it_begin < it_end) { - estimated_len += tmp_file_map.size(); - sstr.reserve(estimated_len); - } - - // write the list header - write_header(sstr, tmp_file_map.size()); - - // write the list body - for (const auto& f : tmp_file_map) { - // File name - sstr += f.first; - // Number of included samples - sstr += std::string(" ") + std::to_string(f.second.size()); - // Number of excluded samples - sstr += std::string(" ") + std::to_string(m_file_map.at(f.first) - f.second.size()); - // Inclusion sample list - for (const auto& s : f.second) { - sstr += ' ' + s; - } - sstr += '\n'; - } - - return true; -} - -inline void sample_list_jag::write(const std::string filename) const { - std::string dir, basename; - parse_path(filename, dir, basename); - if (!dir.empty() && !check_if_dir_exists(dir)) { - // The creation of a shared directory must be done once in a coordinated fashion - // among the entities that have access to it. Thus, it must be done in advance - std::cerr << "The sample list output directory (" + dir + ") does not exist" << std::endl; - return; - } - - std::fstream ofs(filename, std::fstream::out | std::fstream::binary); - - if (!ofs.good()) { - return; - } - - std::string buf; - to_string(buf); - - ofs.write(buf.data(), buf.size()*sizeof(std::string::value_type)); - ofs.close(); -} - -inline const sample_list_jag::samples_t& sample_list_jag::get_list() const { - return m_sample_list; -} - -inline const sample_list_header& sample_list_jag::get_header() const { - return m_header; -} - -inline const sample_list_jag::sample_t& sample_list_jag::operator[](size_t idx) const { - return m_sample_list[idx]; -} - -} // end of namespace lbann diff --git a/include/lbann/data_readers/sample_list_open_files.hpp b/include/lbann/data_readers/sample_list_open_files.hpp new file mode 100644 index 00000000000..57bfb89980e --- /dev/null +++ b/include/lbann/data_readers/sample_list_open_files.hpp @@ -0,0 +1,152 @@ +#ifndef __SAMPLE_LIST_OPEN_FILES_HPP__ +#define __SAMPLE_LIST_OPEN_FILES_HPP__ + +#include "sample_list.hpp" + +/// Number of system and other files that may be open during execution +#define LBANN_MAX_OPEN_FILE_MARGIN 128 +#define LBANN_MAX_OPEN_FILE_RETRY 3 + +namespace lbann { + +template +class sample_list_open_files : public sample_list { + public: + /// The type for the index assigned to each sample file + using sample_file_id_t = std::size_t; + /** To describe a sample as a pair of the file to which it belongs and its name + Each file may contain multiple samples. 
*/ + using sample_t = std::pair; + /// Information for each file used by the sample list: includes the file name, file descriptor, and + /// and a queue of each step and substep when data will be loaded from the file + using file_id_stats_t = std::tuple>>; + + /// Type for the list of samples + using samples_t = std::template vector< sample_t >; + /// Mapping of the file index to the statistics for each file + using file_id_stats_v_t = std::vector< file_id_stats_t >; // rename to sample_to_file_v or something + /// Type for the map of file descriptors to usage step and substep + using fd_use_map_t = std::template pair>; + + sample_list_open_files(); + virtual ~sample_list_open_files(); + /** Copy constructor repllicates all the member variables as they are except + * the file information vector, for which only the file name is copied. */ + sample_list_open_files(const sample_list_open_files& rhs); + /** assignemnt operation repllicates all the member variables as they are except + * the file information vector, for which only the file name is copied. */ + sample_list_open_files& operator=(const sample_list_open_files& rhs); + sample_list_open_files& copy(const sample_list_open_files& rhs); + + void copy_members(const sample_list_open_files& rhs); + + /// Tells how many samples in the list + size_t size() const override; + + /// Tells how many sample files are there + size_t get_num_files() const override; + + using sample_list::load; + /// Emit a serialized archive using the cereal library + template void save( Archive & ar ) const; + /// Restore the member variables from a given archrive serialized by the cereal library + template void load( Archive & ar ); + + /// Serialize this sample list into an std::string object + bool to_string(std::string& sstr) const override; + + /// Allow read-only access to the internal list data + const samples_t& get_list() const; + + /// Allow read-only access to the metadata of the idx-th sample in the list + const sample_t& operator[](size_t idx) const; + + const std::string& get_samples_filename(sample_file_id_t id) const override; + + file_handle_t get_samples_file_handle(sample_file_id_t id) const; + + void set_files_handle(const std::string& filename, file_handle_t h); + + void delete_file_handle_pq_entry(sample_file_id_t id); + + void manage_open_file_handles(sample_file_id_t id, bool pre_open_fd = false); + + file_handle_t open_samples_file_handle(const size_t i, bool pre_open_fd = false); + + virtual void close_if_done_samples_file_handle(const size_t i); + + void compute_epochs_file_usage(const std::vector& shufled_indices, int mini_batch_size, const lbann_comm& comm); + + virtual bool is_file_handle_valid(const file_handle_t& h) const = 0; + + void all_gather_packed_lists(lbann_comm& comm) override; + + protected: + + void set_samples_filename(sample_file_id_t id, const std::string& filename) override; + + /// Get the list of samples from a specific type of bundle file + virtual void obtain_sample_names(file_handle_t& h, std::vector& sample_names) const = 0; + + file_handle_t open_file_handle(std::string file_path); + + /// Get the list of samples that exist in a bundle file + file_handle_t get_bundled_sample_names(std::string file_path, std::vector& sample_names, size_t included_samples, size_t excluded_samples); + + /// Check that the list of samples given actually exist in a bundle file + void validate_implicit_bundles_sample_names(std::string file_path, std::string filename, std::vector& sample_names, size_t included_samples, size_t 
excluded_samples); + + /// read the body of exclusive sample list + void read_exclusive_list(std::istream& istrm, size_t stride=1, size_t offset=0); + + /// read the body of inclusive sample list + void read_inclusive_list(std::istream& istrm, size_t stride=1, size_t offset=0); + + /// read the body of a sample list + void read_sample_list(std::istream& istrm, size_t stride=1, size_t offset=0) override; + + void assign_samples_name() override {} + + /// Get the number of total/included/excluded samples + void get_num_samples(size_t& total, size_t& included, size_t& excluded) const override; + + static bool pq_cmp(fd_use_map_t left, fd_use_map_t right) { + return ((left.second).first < (right.second).first) || + (((left.second).first == (right.second).first) && + ((left.second).second < (right.second).second)); } + + virtual file_handle_t open_file_handle_for_read(const std::string& file_path) = 0; + virtual void close_file_handle(file_handle_t& h) = 0; + virtual void clear_file_handle(file_handle_t& h) = 0; + + private: + using sample_list::serialize; + template void serialize( Archive & ar ) = delete; + + protected: + using sample_list::m_header; + + /// Maps sample's file id to file names, file descriptors, and use counts + file_id_stats_v_t m_file_id_stats_map; + + private: + /// List of all samples with a file identifier and sample name for each sample + samples_t m_sample_list; + + /// Track the number of samples per file + std::unordered_map m_file_map; + + /// Track the number of open file descriptors and when they will be used next + std::deque m_open_fd_pq; + + size_t m_max_open_files; +}; + +template +inline T uninitialized_file_handle(); + +} // end of namespace + +#include "sample_list_open_files_impl.hpp" + +#endif // __SAMPLE_LIST_OPEN_FILES_HPP__ diff --git a/include/lbann/data_readers/sample_list_open_files_impl.hpp b/include/lbann/data_readers/sample_list_open_files_impl.hpp new file mode 100644 index 00000000000..565b016bd22 --- /dev/null +++ b/include/lbann/data_readers/sample_list_open_files_impl.hpp @@ -0,0 +1,719 @@ +namespace lbann { + +template +inline sample_list_open_files::sample_list_open_files() { + m_max_open_files = getdtablesize() - LBANN_MAX_OPEN_FILE_MARGIN; +} + +template +inline sample_list_open_files::~sample_list_open_files() { + m_open_fd_pq.clear(); +} + +template +inline sample_list_open_files +::sample_list_open_files(const sample_list_open_files& rhs) { + copy_members(rhs); +} + +template +inline sample_list_open_files& +sample_list_open_files +::operator=(const sample_list_open_files& rhs) { + // check for self-assignment + if (this == &rhs) { + return (*this); + } + + copy_members(rhs); + + return (*this); +} + +template +inline sample_list_open_files& +sample_list_open_files +::copy(const sample_list_open_files& rhs) { + // check for self-assignment + if (this == &rhs) { + return (*this); + } + + copy_members(rhs); + + return (*this); +} + +template +inline void sample_list_open_files +::copy_members(const sample_list_open_files& rhs) { + sample_list::copy_members(rhs); + m_sample_list = rhs.m_sample_list; + m_file_map = rhs.m_file_map; + m_max_open_files = rhs.m_max_open_files; + + /// Keep track of existing filenames but do not copy any file + /// descriptor information + m_file_id_stats_map.assign(rhs.m_file_id_stats_map.size(), + std::make_tuple("", + uninitialized_file_handle(), + std::deque>{})); + + for(size_t i = 0u; i < m_file_id_stats_map.size(); ++i) { + set_samples_filename(i, rhs.get_samples_filename(i)); + } + + /// Do not copy 
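sample_list_open_files caps how many file handles stay open (m_max_open_files is derived from getdtablesize() in the constructor above) and keeps the open handles in a heap ordered by the (step, substep) at which each file is needed next, as recorded by compute_epochs_file_usage. A simplified, self-contained sketch of the eviction choice encoded by pq_cmp (the types and names here are stand-ins, not the class's own):

#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

// (file id, (step, substep) of the file's next pending use)
using use_entry_t = std::pair<std::size_t, std::pair<int, int>>;

// Same ordering as pq_cmp: an earlier (step, substep) compares "less".
bool earlier_use(const use_entry_t& lhs, const use_entry_t& rhs) {
  return lhs.second < rhs.second;  // lexicographic pair comparison
}

// Precondition: entries already form a heap under earlier_use (built with
// std::make_heap/push_heap). The front is then the file whose next use lies
// furthest in the future; entries with no pending use carry a sentinel step
// of INT_MAX and float to the front, so they are closed first once the pool
// exceeds its open-file limit.
std::size_t pick_victim(std::vector<use_entry_t>& heap) {
  std::pop_heap(heap.begin(), heap.end(), earlier_use);
  const std::size_t victim = heap.back().first;
  heap.pop_back();
  return victim;
}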
the open file descriptor priority queue + /// File handle ownership is not transfered in the copy + m_open_fd_pq.clear(); +} + +template +inline size_t sample_list_open_files +::size() const { + return m_sample_list.size(); +} + +template +inline size_t sample_list_open_files +::get_num_files() const { + return m_file_id_stats_map.size(); +} + +template +inline void sample_list_open_files +::read_exclusive_list(std::istream& istrm, + size_t stride, size_t offset) { + const std::string whitespaces(" \t\f\v\n\r"); + size_t cnt_files = 0u; + std::string line; + + while (std::getline(istrm, line)) { + const size_t end_of_str = line.find_last_not_of(whitespaces); + if (end_of_str == std::string::npos) { // empty line + continue; + } + if (cnt_files++ >= m_header.get_num_files()) { + break; + } + // Check to see if there is a strided load and skip the lines that are not for this rank + if ((cnt_files-1)%stride != offset) { + continue; + } + + std::stringstream sstr(line.substr(0, end_of_str + 1)); // clear trailing spaces for accurate parsing + std::string filename; + size_t included_samples; + size_t excluded_samples; + std::unordered_set excluded_sample_indices; + + sstr >> filename >> included_samples >> excluded_samples; + + const std::string file_path = add_delimiter(m_header.get_file_dir()) + filename; + + if (filename.empty() || !check_if_file_exists(file_path)) { + LBANN_ERROR(std::string{} + " :: data file '" + file_path + "' does not exist."); + } + + excluded_sample_indices.reserve(excluded_samples); + + while(!sstr.eof()) { + std::string index; + sstr >> index; + excluded_sample_indices.insert(index); + } + + if(excluded_sample_indices.size() != excluded_samples) { + LBANN_ERROR(std::string("Index file does not contain the correct number of excluded samples: expected ") + + std::to_string(excluded_samples) + + std::string(" exclusions but found ") + + std::to_string(excluded_sample_indices.size())); + } + + std::vector sample_names; + file_handle_t file_hnd = get_bundled_sample_names(file_path, sample_names, included_samples, excluded_samples); + if (!is_file_handle_valid(file_hnd)) { + continue; // skipping the file + } + + if(m_file_map.count(filename) > 0) { + if(sample_names.size() != m_file_map[filename]) { + LBANN_ERROR(std::string("The same file ") + + filename + + " was opened multiple times and reported different sizes: " + + std::to_string(sample_names.size()) + + " and " + + std::to_string(m_file_map[filename])); + } + }else { + m_file_map[filename] = sample_names.size(); + } + + sample_file_id_t index = m_file_id_stats_map.size(); + m_file_id_stats_map.emplace_back(std::make_tuple(filename, uninitialized_file_handle(), std::deque>{})); + set_files_handle(filename, file_hnd); + + size_t valid_sample_count = 0u; + for(auto s : sample_names) { + std::unordered_set::const_iterator found = excluded_sample_indices.find(s); + if (found != excluded_sample_indices.cend()) { + continue; + } + m_sample_list.emplace_back(index, to_sample_name_t(s)); + valid_sample_count++; + } + + if(valid_sample_count != included_samples) { + LBANN_ERROR(std::string("Bundle file does not contain the correct number of included samples: expected ") + + std::to_string(included_samples) + + std::string(" samples, but found ") + + std::to_string(valid_sample_count)); + } + } + + if (m_header.get_num_files() != cnt_files) { + LBANN_ERROR(std::string("Sample list ") + + m_header.get_sample_list_filename() + + std::string(": number of files requested ") + + std::to_string(m_header.get_num_files()) + + 
std::string(" does not equal number of files loaded ") + + std::to_string(cnt_files)); + } + + m_header.m_is_exclusive = false; +} + + +template +inline void sample_list_open_files +::read_inclusive_list(std::istream& istrm, + size_t stride, size_t offset) { + const std::string whitespaces(" \t\f\v\n\r"); + size_t cnt_files = 0u; + std::string line; + + while (std::getline(istrm, line)) { + const size_t end_of_str = line.find_last_not_of(whitespaces); + if (end_of_str == std::string::npos) { // empty line + continue; + } + if (cnt_files++ >= m_header.get_num_files()) { + break; + } + // Check to see if there is a strided load and skip the lines that are not for this rank + if ((cnt_files-1)%stride != offset) { + continue; + } + + std::stringstream sstr(line.substr(0, end_of_str + 1)); // clear trailing spaces for accurate parsing + std::string filename; + size_t included_samples; + size_t excluded_samples; + + sstr >> filename >> included_samples >> excluded_samples; + + const std::string file_path = add_delimiter(m_header.get_file_dir()) + filename; + + if (filename.empty() || !check_if_file_exists(file_path)) { + throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + + " :: data file '" + filename + "' does not exist."); + } + + file_handle_t file_hnd = open_file_handle(file_path); + if (!is_file_handle_valid(file_hnd)) { + continue; // skipping the file + } + + sample_file_id_t index = m_file_id_stats_map.size(); + m_file_id_stats_map.emplace_back(std::make_tuple(filename, uninitialized_file_handle(), std::deque>{})); + set_files_handle(filename, file_hnd); + + size_t valid_sample_count = 0u; + //#define VALIDATE_SAMPLE_LIST +#ifdef VALIDATE_SAMPLE_LIST + std::vector sample_names; +#endif + while(!sstr.eof()) { + std::string sample_name_str; + sstr >> sample_name_str; + m_sample_list.emplace_back(index, to_sample_name_t(sample_name_str)); +#ifdef VALIDATE_SAMPLE_LIST + sample_names.emplace_back(sample_name_str); +#endif + valid_sample_count++; + } + if(valid_sample_count != included_samples) { + LBANN_ERROR(std::string("Bundle file does not contain the correct number of included samples: expected ") + + std::to_string(included_samples) + + std::string(" samples, but found ") + + std::to_string(valid_sample_count)); + } + + if(m_file_map.count(filename) > 0) { + if(valid_sample_count != m_file_map[filename]) { + LBANN_ERROR(std::string("The same file ") + + filename + + " was opened multiple times and reported different sizes: " + + std::to_string(valid_sample_count) + + " and " + + std::to_string(m_file_map[filename])); + } + }else { + m_file_map[filename] = /*valid_sample_count*/ included_samples + excluded_samples; + } +#ifdef VALIDATE_SAMPLE_LIST + validate_implicit_bundles_sample_names(file_path, filename, sample_names, included_samples, excluded_samples); +#endif + } + + if (m_header.get_num_files() != cnt_files) { + LBANN_ERROR(std::string("Sample list number of files requested ") + + std::to_string(m_header.get_num_files()) + + std::string(" does not equal number of files loaded ") + + std::to_string(cnt_files)); + } +} + + +template +inline void sample_list_open_files +::read_sample_list(std::istream& istrm, size_t stride, size_t offset) { + if (m_header.is_exclusive()) { + read_exclusive_list(istrm, stride, offset); + } else { + read_inclusive_list(istrm, stride, offset); + } +} + + +template +template +void sample_list_open_files +::save( Archive & ar ) const { + using ar_file_stats_t = std::tuple>>; + std::vector file_stats; + 
file_stats.reserve(m_file_id_stats_map.size()); + for(auto&& e : m_file_id_stats_map) { + file_stats.emplace_back(std::make_tuple(std::get<0>(e), std::get<2>(e))); + } + ar(m_header, m_sample_list, file_stats); +} + +template +template +void sample_list_open_files +::load( Archive & ar ) { + using ar_file_stats_t = std::tuple>>; + std::vector file_stats; + ar(m_header, m_sample_list, file_stats); + m_file_id_stats_map.reserve(file_stats.size()); + for(auto&& e : file_stats) { + //m_file_id_stats_map.emplace_back(std::make_tuple(std::get<0>(e), uninitialized_file_handle(), std::deque>{})); + m_file_id_stats_map.emplace_back(std::make_tuple(std::get<0>(e), uninitialized_file_handle(), std::get<1>(e))); + //m_file_id_stats_map.emplace_back(std::make_tuple(std::get<0>(e), file_handle_t(), std::get<1>(e))); + } +} + +template +inline bool sample_list_open_files +::to_string(std::string& sstr) const { + std::map> tmp_file_map; + for (const auto& s : m_sample_list) { + const std::string& filename = get_samples_filename(s.first); + tmp_file_map[filename].emplace_back(s.second); + } + + sstr.clear(); + + // reserve the string to hold the entire sample lit + size_t estimated_len = 30 + 42 + m_header.get_file_dir().size() + 1; + for (const auto& f : tmp_file_map) { + estimated_len += f.first.size() + + std::to_string(f.second.size()).size() + + std::to_string(m_file_map.at(f.first) - f.second.size()).size() + + 3u; + for (const auto& s : f.second) { + estimated_len += lbann::to_string(s).size() + 1u; + } + } + sstr.reserve(estimated_len); + + // write the list header + this->write_header(sstr, tmp_file_map.size()); + + // write the list body + for (const auto& f : tmp_file_map) { + // File name + sstr += f.first; + // Number of included samples + sstr += std::string(" ") + std::to_string(f.second.size()); + // Number of excluded samples + sstr += std::string(" ") + std::to_string(m_file_map.at(f.first) - f.second.size()); + // Inclusion sample list + for (const auto& s : f.second) { + sstr += ' ' + lbann::to_string(s); + } + sstr += '\n'; + } + + return true; +} + +template +inline void sample_list_open_files +::get_num_samples(size_t& total, size_t& included, size_t& excluded) const { + total = 0u; + for ( const auto f: m_file_map) { + total += f.second; + } + included = size(); + excluded = total - included; +} + +template +inline const typename sample_list_open_files::samples_t& +sample_list_open_files::get_list() const { + return m_sample_list; +} + +template +inline const typename sample_list_open_files::sample_t& +sample_list_open_files::operator[](size_t idx) const { + return m_sample_list[idx]; +} + +template +inline const std::string& sample_list_open_files +::get_samples_filename(sample_file_id_t id) const { + return std::get<0>(m_file_id_stats_map[id]); +} + +template +inline file_handle_t sample_list_open_files +::get_samples_file_handle(sample_file_id_t id) const { + file_handle_t h = std::get<1>(m_file_id_stats_map[id]); + return h; +} + +template +inline void sample_list_open_files +::set_samples_filename(sample_file_id_t id, const std::string& filename) { + std::get<0>(m_file_id_stats_map[id]) = filename; +} + +template +inline void sample_list_open_files +::set_files_handle(const std::string& filename, file_handle_t h) { + sample_file_id_t id = sample_file_id_t(0); + for (auto&& e : m_file_id_stats_map) { + if(std::get<0>(e) == filename) { + std::get<1>(e) = h; + break; + } + id++; + } + manage_open_file_handles(id, true); +} + +template +inline void sample_list_open_files 
+::obtain_sample_names(file_handle_t& h, std::vector& sample_names) const { + LBANN_ERROR(std::string{} + " :: abstract class does not implement this method"); +} + +template +inline file_handle_t sample_list_open_files +::open_file_handle(std::string file_path) { + file_handle_t file_hnd; + clear_file_handle(file_hnd); + bool retry = false; + int retry_cnt = 0; + do { + try { + file_hnd = open_file_handle_for_read( file_path ); + }catch (conduit::Error const& e) { + LBANN_WARNING(" :: trying to open the file " + file_path + " and got " + e.what()); + retry = true; + retry_cnt++; + } + }while(retry && retry_cnt < LBANN_MAX_OPEN_FILE_RETRY); + + return file_hnd; +} + +template +inline file_handle_t sample_list_open_files +::get_bundled_sample_names(std::string file_path, + std::vector& sample_names, + size_t included_samples, + size_t excluded_samples) { + file_handle_t file_hnd = open_file_handle(file_path); + + if (!is_file_handle_valid(file_hnd)) { + std::cout << "Opening the file didn't work" << std::endl; + return file_hnd; + } + + obtain_sample_names(file_hnd, sample_names); + + if(sample_names.size() != (included_samples + excluded_samples)) { + LBANN_ERROR(std::string("File does not contain the correct number of samples: found ") + + std::to_string(sample_names.size()) + + std::string(" -- this does not equal the expected number of samples that are marked for inclusion: ") + + std::to_string(included_samples) + + std::string(" and exclusion: ") + + std::to_string(excluded_samples)); + } + + return file_hnd; +} + +template +inline void sample_list_open_files +::validate_implicit_bundles_sample_names(std::string file_path, + std::string filename, + std::vector& sample_names, + size_t included_samples, + size_t excluded_samples) { + std::vector all_sample_names; + file_handle_t file_hnd = get_bundled_sample_names(file_path, all_sample_names, included_samples, excluded_samples); + if (!is_file_handle_valid(file_hnd)) { + return; // skipping the file + } + if(m_file_map.count(filename) > 0) { + if(all_sample_names.size() != m_file_map[filename]) { + LBANN_ERROR(std::string("The same file ") + + filename + + " was opened multiple times and reported different sizes: " + + std::to_string(all_sample_names.size()) + + " and " + + std::to_string(m_file_map[filename])); + } + }else { + m_file_map[filename] = all_sample_names.size(); + } + std::unordered_set set_of_samples(all_sample_names.begin(), all_sample_names.end()); + for(auto&& sample_name : sample_names) { + std::unordered_set::const_iterator found = set_of_samples.find(sample_name); + if (found == set_of_samples.cend()) { + LBANN_ERROR(std::string("Illegal request for a data ID that does not exist: ") + sample_name); + } + } + return; +} + +template +inline void sample_list_open_files +::all_gather_packed_lists(lbann_comm& comm) { + int num_ranks = comm.get_procs_per_trainer(); + typename std::vector per_rank_samples(num_ranks); + typename std::vector> per_rank_files(num_ranks); + std::vector my_files; + my_files.reserve(m_file_id_stats_map.size()); + std::vector> per_rank_file_map(num_ranks); + + // Close the existing open files + for(auto&& e : m_file_id_stats_map) { + auto& h = std::get<1>(e); + close_file_handle(h); + clear_file_handle(h); + std::get<2>(e).clear(); + my_files.emplace_back(std::get<0>(e)); + } + m_open_fd_pq.clear(); + + size_t num_samples = this->all_gather_field(m_sample_list, per_rank_samples, comm); + size_t num_ids = this->all_gather_field(my_files, per_rank_files, comm); + size_t num_files = 
this->all_gather_field(m_file_map, per_rank_file_map, comm); + + m_sample_list.clear(); + m_file_id_stats_map.clear(); + + m_sample_list.reserve(num_samples); + m_file_id_stats_map.reserve(num_ids); + m_file_map.reserve(num_files); + + std::unordered_map mp; + for(int r = 0; r < num_ranks; r++) { + const samples_t& s_list = per_rank_samples[r]; + const auto& files = per_rank_files[r]; + const std::unordered_map& file_map = per_rank_file_map[r]; + for (const auto& s : s_list) { + sample_file_id_t index = s.first; + const std::string& filename = files[index]; + if(index >= m_file_id_stats_map.size() + || (std::get<0>(m_file_id_stats_map.back()) != filename)) { + index = m_file_id_stats_map.size(); + m_file_id_stats_map.emplace_back(std::make_tuple(filename, uninitialized_file_handle(), std::deque>{})); + // Update the file map structure + if(m_file_map.count(filename) == 0) { + m_file_map[filename] = file_map.at(filename); + } + mp[filename] = index; + }else { + auto search_result = mp.find(filename); + if (search_result == mp.end()) { + LBANN_ERROR("mp.find(filename) == mp.end()"); + } + index = search_result->second; + } + m_sample_list.emplace_back(std::make_pair(index, s.second)); + } + } + + return; +} + +template +inline void sample_list_open_files +::compute_epochs_file_usage(const std::vector& shuffled_indices, + int mini_batch_size, + const lbann_comm& comm) { + for (auto&& e : m_file_id_stats_map) { + auto& h = std::get<1>(e); + close_file_handle(h); + clear_file_handle(h); + std::get<2>(e).clear(); + } + // Once all of the file handles are closed, clear the priority queue + m_open_fd_pq.clear(); + for (size_t i = 0; i < shuffled_indices.size(); i++) { + int idx = shuffled_indices[i]; + const auto& s = m_sample_list[idx]; + sample_file_id_t index = s.first; + + if((i % mini_batch_size) % comm.get_procs_per_trainer() == static_cast(comm.get_rank_in_trainer())) { + /// Enqueue the iteration step when the sample will get used + int step = i / mini_batch_size; + int substep = (i % mini_batch_size) / comm.get_procs_per_trainer(); + std::get<2>(m_file_id_stats_map[index]).emplace_back(std::make_pair(step, substep)); + } + } +} + +template +inline void sample_list_open_files +::delete_file_handle_pq_entry(sample_file_id_t id) { + for (std::deque::iterator it = m_open_fd_pq.begin(); it!=m_open_fd_pq.end(); ++it) { + if(it->first == id) { + it = m_open_fd_pq.erase(it); + break; + } + } + return; +} + +template +inline void sample_list_open_files +::manage_open_file_handles(sample_file_id_t id, bool pre_open_fd) { + /// When we enter this function the priority queue is either empty or a heap + if(!m_open_fd_pq.empty()) { + if(m_open_fd_pq.size() > m_max_open_files) { + auto& f = m_open_fd_pq.front(); + auto& victim = m_file_id_stats_map[f.first]; + auto& victim_fd = std::get<1>(victim); + std::pop_heap(m_open_fd_pq.begin(), m_open_fd_pq.end(), pq_cmp); + m_open_fd_pq.pop_back(); + close_file_handle(victim_fd); + clear_file_handle(victim_fd); + } + } + + /// Before we can enqueue the any new access times for this descriptor, remove any + /// earlier descriptor + std::sort_heap(m_open_fd_pq.begin(), m_open_fd_pq.end(), pq_cmp); + if(m_open_fd_pq.front().first == id) { + m_open_fd_pq.pop_front(); + } + std::make_heap(m_open_fd_pq.begin(), m_open_fd_pq.end(), pq_cmp); + + auto& e = m_file_id_stats_map[id]; + auto& file_access_queue = std::get<2>(e); + if(!file_access_queue.empty()) { + if(!pre_open_fd) { + file_access_queue.pop_front(); + } + } + if(!file_access_queue.empty()) { + 
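+ // Added descriptive note: the handle is re-enqueued keyed by its next
+ // scheduled (step, substep) access; because pq_cmp is a less-than ordering,
+ // the heap keeps the descriptor whose next use is furthest in the future
+ // (or the INT_MAX terminator below, meaning no future use) at the front,
+ // which is the entry evicted first once m_max_open_files is exceeded.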
m_open_fd_pq.emplace_back(std::make_pair(id,file_access_queue.front())); + }else { + /// If there are no future access of the file place a terminator entry to track + /// the open file, but is always sorted to the top of the heap + m_open_fd_pq.emplace_back(std::make_pair(id,std::make_pair(INT_MAX,id))); + } + std::push_heap(m_open_fd_pq.begin(), m_open_fd_pq.end(), pq_cmp); + return; +} + +template +inline file_handle_t sample_list_open_files +::open_samples_file_handle(const size_t i, bool pre_open_fd) { + const sample_t& s = m_sample_list[i]; + sample_file_id_t id = s.first; + file_handle_t h = get_samples_file_handle(id); + if (!is_file_handle_valid(h)) { + const std::string& file_name = get_samples_filename(id); + const std::string& file_dir = this->get_samples_dirname(); + const std::string file_path = add_delimiter(file_dir) + file_name; + if (file_name.empty() || !check_if_file_exists(file_path)) { + LBANN_ERROR(std::string{} + " :: data file '" + file_path + "' does not exist."); + } + + h = open_file_handle(file_path); + + if (!is_file_handle_valid(h)) { + LBANN_ERROR(std::string{} + " :: data file '" + file_path + "' could not be opened."); + } + auto& e = m_file_id_stats_map[id]; + std::get<1>(e) = h; + /// If a new file is opened, place it in the priority queue + manage_open_file_handles(id, pre_open_fd); + } + return h; +} + +template +inline void sample_list_open_files +::close_if_done_samples_file_handle(const size_t i) { + const sample_t& s = m_sample_list[i]; + sample_file_id_t id = s.first; + auto h = get_samples_file_handle(id); + if (!is_file_handle_valid(h)) { + auto& e = m_file_id_stats_map[id]; + auto& file_access_queue = std::get<2>(e); + if(file_access_queue.empty()) { + auto& fh = std::get<1>(e); + close_file_handle(fh); + clear_file_handle(fh); + delete_file_handle_pq_entry(id); + } + } +} + +template +inline bool sample_list_open_files +::is_file_handle_valid(const file_handle_t& h) const { + LBANN_ERROR(std::string{} + " :: abstract class does not implement this method"); + return false; +} + +template +inline file_handle_t sample_list_open_files +::open_file_handle_for_read(const std::string& file_path) { + LBANN_ERROR(std::string{} + " :: abstract class does not implement this method"); + return file_handle_t(); +} + +template +inline void sample_list_open_files +::close_file_handle(file_handle_t& h) { + LBANN_ERROR(std::string{} + " :: abstract class does not implement this method"); +} + +template +inline void sample_list_open_files +::clear_file_handle(file_handle_t& h) { + LBANN_ERROR(std::string{} + " :: abstract class does not implement this method"); +} + +} // end of namespace lbann diff --git a/include/lbann/data_store/data_store_conduit.hpp b/include/lbann/data_store/data_store_conduit.hpp index 91995485d83..df0f1ced1d4 100644 --- a/include/lbann/data_store/data_store_conduit.hpp +++ b/include/lbann/data_store/data_store_conduit.hpp @@ -30,13 +30,13 @@ #include "lbann_config.hpp" -#ifdef LBANN_HAS_CONDUIT - #include "lbann/base.hpp" #include "lbann/comm.hpp" +#include "lbann/utils/exception.hpp" #include "conduit/conduit_node.hpp" #include #include +#include namespace lbann { @@ -52,6 +52,13 @@ class data_store_conduit { public: + // need to quickly change from unordered_map to map for debugging + using map_ii_t = std::unordered_map; + using map_is_t = std::unordered_map; + + // not currently used; will be in the future + using map_ss_t = std::unordered_map; + //! 
ctor data_store_conduit(generic_data_reader *reader); @@ -69,36 +76,32 @@ class data_store_conduit { //! dtor ~data_store_conduit(); - /// normally not needed, since reader is passed to ctor. But may - /// be useful in some cases - void set_data_reader_ptr(generic_data_reader *reader) { m_reader = reader; } + void set_data_reader_ptr(generic_data_reader *reader); //! convenience handle - void set_shuffled_indices(const std::vector *indices) { m_shuffled_indices = indices; } - - void setup(int mini_batch_size); + void set_shuffled_indices(const std::vector *indices); - /* - * dah - may be needed in the future, but not needed for bare-bones squashing - void set_is_subsidiary_store() { - m_is_subsidiary_store = true; - } + /** @brief Returns the number of samples summed over all ranks */ + size_t get_num_global_indices() const; - bool is_subsidiary_store() const { - return m_is_subsidiary_store; - } - */ + void setup(int mini_batch_size); + // TODO FIXME void check_mem_capacity(lbann_comm *comm, const std::string sample_list_file, size_t stride, size_t offset); - /// returns the conduit node + /** @brief Returns the conduit Node associated with the data_id */ const conduit::Node & get_conduit_node(int data_id) const; - /// if 'already_have = true' then the passed 'node' was obtained by a call to - /// get_empty_node(). In some operating modes this saves us from copying the node - void set_conduit_node(int data_id, conduit::Node &node, bool already_have = false); + /** @brief Set a conduit node in the data store + * + * if 'already_have = true' then the passed 'node' was obtained by a call to + * get_empty_node(); note, we do this to prevent copying the node + */ + void set_conduit_node(int data_id, const conduit::Node &node, bool already_have = false); - void set_preloaded_conduit_node(int data_id, conduit::Node &node); + void set_preloaded_conduit_node(int data_id, const conduit::Node &node); + + void spill_preloaded_conduit_node(int data_id, const conduit::Node &node); const conduit::Node & get_random_node() const; @@ -107,21 +110,92 @@ class data_store_conduit { /// returns an empty node conduit::Node & get_empty_node(int data_id); - /// As of this writing, will be called if cmd line includes: --preload_data_store - /// This may change in the future; TODO revisit - void set_preload() { m_preload = true; } - - bool is_preloaded() { return m_preload; } - - void set_explicit_loading(bool flag) { m_explicit_loading = flag; } + //================================================================= + // methods for setting and querying the data store's mode + //================================================================= + /** @brief Returns true if preloading is turned on + * + * See notes in: is_explicitly_loading() + */ + bool is_preloading() const { return m_preloading; } + + /** @brief Returns true if explicitly loading is turned on + * + * 'explicitly loading' means that the data that will be owned + * by each rank is passed into the data store during the first epoch. + * This is in contrast to preloading, in which the data is passed into + * the data store prior to the first epoch. Explicit and preloading + * are exclusive: at most only one may be true, however, both will + * be set to false when all loading is complete. 
+ */
+ bool is_explicitly_loading() const { return m_explicitly_loading; }
+
+ /** @brief Returns true if all loading has been completed
+ *
+ * See notes in: set_loading_is_complete()
+ */
+ bool is_fully_loaded() const;
+
+ /** @brief Returns "true" if running in local cache mode
+ *
+ * In local cache mode, each node contains a complete copy
+ * of the data set. This is stored in a shared memory segment,
+ * but part of the set may be spilled to disk if memory is
+ * insufficient. Local cache mode is activated via the cmd line
+ * flag: --data_store_cache
+ */
+ bool is_local_cache() const { return m_is_local_cache; }
- bool is_explicitly_loading() { return m_explicit_loading; }
+ /** @brief Turn preloading on or off */
+ void set_is_preloading(bool flag);
+
+ /** @brief Turn on explicit loading */
+ void set_is_explicitly_loading(bool flag);
+
+ /** @brief Marks the data_store as fully loaded
+ *
+ * Fully loaded means that each rank has all the data that it
+ * is intended to own. When not running in local cache mode, this
+ * occurs (1) at the conclusion of preloading, prior to the beginning of
+ * the first epoch, or (2) at the conclusion of the first epoch, if
+ * explicitly loading. When running in local cache mode, this occurs
+ * (1) at the conclusion of preload_local_cache(), which is called prior
+ * to the first epoch, or (2) at the conclusion of exchange_local_caches(),
+ * at the conclusion of the first epoch, if explicitly loading.
+ */
+ void set_loading_is_complete();
+
+
+ /** @brief Turns local cache mode on or off */
+ void set_is_local_cache(bool flag = true) { m_is_local_cache = flag; }
+
+ /** @brief Check that explicit loading, preloading, and fully loaded flags are consistent */
+ void check_query_flags() const;
+
+ //=================================================================
+ // END methods for setting and querying the data store's mode
+ //=================================================================
+
+//XX void { m_owner_maps_were_exchanged = false; }
+ /// fills in m_owner, which maps index -> owning processor
+ void exchange_owner_maps(); /// fills in m_owner, which maps index -> owning processor
+ void build_preloaded_owner_map(const std::vector& per_rank_list_sizes);
- /// Removed nodes corresponding from the indices vector from the data store
- void purge_unused_samples(const std::vector& indices);
+ /// fills in m_owner, which maps index -> owning processor
+ void set_preloaded_owner_map(const std::unordered_map &owner) { m_owner = owner; }
+
+ /** @brief Special handling for ras_lipid_conduit_data_reader; may go away in the future */
+ void clear_owner_map();
+
+ void set_owner_map(const std::unordered_map &m) { m_owner = m; }
+
+ /** @brief Special handling for ras_lipid_conduit_data_reader; may go away in the future */
+ void add_owner(int data_id, int owner) { m_owner[data_id] = owner; }
+
+ /** @brief Special handling for ras_lipid_conduit_data_reader; may go away in the future */
+ void set_finished_building_map() { m_owner_maps_were_exchanged = true; } /// Recompact the nodes because they are not copied properly when instantiating /// using the copy constructor @@ -131,77 +205,239 @@ class data_store_conduit { /// with the index int get_index_owner(int idx);
- bool is_local_cache() const { return m_is_local_cache; }
- void exchange_mini_batch_data(size_t current_pos, size_t mb_size) {
- if (is_local_cache()) {
- return;
- }
- if (m_super_node) {
- exchange_data_by_super_node(current_pos, mb_size);
- } else { -
exchange_data_by_sample(current_pos, mb_size); - } - ++m_n; - } + /** @brief Read the data set into memory + * + * Each rank reads a portion of the data set, then + * bcasts to all other ranks. + */ + void preload_local_cache(); + + void exchange_mini_batch_data(size_t current_pos, size_t mb_size); + + void set_node_sizes_vary() { m_node_sizes_vary = true; } bool has_conduit_node(int data_id) const; -protected : + /// only used for debugging; pass --debug on cmd line to get + /// each data store to print to a different file. This is made + /// public so data readers can also print to the file + std::ofstream *m_debug = nullptr; + std::ofstream *m_profile = nullptr; - /// records the number of times exchange_mini_batch_data has been called - int m_n; + /// for use during development and debugging + int get_data_size() { return m_data.size(); } - bool m_is_setup; + /// made public for debugging during development + void copy_members(const data_store_conduit& rhs); - void copy_members(const data_store_conduit& rhs, const std::vector& = std::vector()); - generic_data_reader *m_reader; + /** @brief Closes then reopens the debug logging file + * + * Debug logging is enabled on all ranks via the cmd line flag: --data_store_debug + */ + void flush_debug_file(); - lbann_comm *m_comm; + /** @brief Closes then reopens the profile logging file + * + * Profile logging is enabled on P_0 via the cmd line flag: --data_store_profile + */ + void flush_profile_file() const; - /// rank in the trainer; convenience handle - int m_rank_in_trainer; + /** @brief Writes object's state to file */ + void write_checkpoint(std::string dir_name); + + /** @brief Loads object's state from file */ + void load_checkpoint(std::string dir_name, generic_data_reader *reader = nullptr); - /// number of procs in the trainer; convenience handle - int m_np_in_trainer; + /** @brief Add text to the profiling file, if it's opened */ + void set_profile_msg(std::string); - /// convenience handle - bool m_world_master; + /** @brief Runs an internal test to ensure the locally cached conduit data is correct + * + * For use during development and testing. This test is activated via + * the cmd line flag: --data_store_test_cache. 
Output may be written to + * cout, and the profile and debug files (if they are opened) + * @param n is the maximum number of samples to test; set to -1 to test all + * @return true, if all samples read from file match those constructed from + * the local shared memory segment (aka, cache) + */ + bool test_local_cache_imagenet(int n); - /// convenience handle - bool m_trainer_master; + void test_imagenet_node(int sample_id, bool dereference = true); - /// set to true if data_store is preloaded - bool m_preload; + size_t get_mem_usage(); - /// set to true if data_store is being explicitly loaded - bool m_explicit_loading; +private : - /// maps an index to the processor that owns the associated data - mutable std::unordered_map m_owner; + bool m_bcast_sample_size = true; - /// convenience handle - const std::vector *m_shuffled_indices; + // if not null, 'm_other' points from a train to a validation + // data store; this permits communication which is needed in + // special cases (e.g, see: data_reader_npz_ras_lipid.cpp) + data_store_conduit *m_other = nullptr; + + bool m_owner_maps_were_exchanged = false; + + bool m_run_checkpoint_test = false; + + /** @brief The number of samples that this processor owns */ + size_t m_my_num_indices = 0; + + /** @brief if true, then we are spilling (offloading) samples to disk */ + bool m_spill = false; + + /** @brief if true, then all samples have been spilled */ + bool m_is_spilled = false; + + /** During spilling, the conduit file pathnames are written to this file */ + std::ofstream m_metadata; + + /** @brief Base directory for spilling (offloading) conduit nodes */ + std::string m_spill_dir_base; + + /** @brief Used to form the directory path for spilling conduit nodes */ + int m_cur_spill_dir_integer = -1; + + /** @brief @brief Current directory for spilling (writing to file) conduit nodes + * + * m_cur_spill_dir = m_spill_dir_base/ + */ + std::string m_cur_spill_dir; + + /** @brief The directory to use for testing checkpointing + * + * Testing is activated by passing the cmd flag: --data_store_test_checkpoint= + */ + std::string m_test_dir; + + /** @brief Contains the number of conduit nodes that have been written to m_cur_dir + * + * When m_num_files_in_cur_spill_dir == m_max_files_per_directory, + * m_cur_spill_dir_integer is incremented and a new m_cur_dir is created + */ + int m_num_files_in_cur_spill_dir; + + /** @brief maps data_id to m_m_cur_spill_dir_integer. */ + map_ii_t m_spilled_nodes; + + /// used in set_conduit_node(...) 
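+ /// Added note: presumably guards concurrent insertions into m_data when
+ /// several I/O threads deliver samples at once; a minimal sketch of the
+ /// assumed pattern inside set_conduit_node() (illustrative only -- the real
+ /// locking lives in the .cpp file):
+ ///   std::lock_guard<std::mutex> guard(m_mutex);
+ ///   m_data[data_id] = node;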
+ std::mutex m_mutex; + std::mutex m_mutex_2; + + /// for use in local cache mode + char *m_mem_seg = 0; + size_t m_mem_seg_length = 0; + std::string m_seg_name; + + const std::string m_debug_filename_base = "debug"; + std::string m_debug_filename; + + const std::string m_profile_filename_base = "data_store_profile"; + std::string m_profile_filename; + + bool m_was_loaded_from_file = false; + const std::string m_cereal_fn = "data_store_cereal"; + + /// used in spill_to_file + /// (actually, conduit::Node.save() writes both a + /// json file and a binary file, so double this number + const int m_max_files_per_directory = 500; + + //=========================================================== + // timers for profiling exchange_data + //=========================================================== + + // applicable to imagenet; NA for JAG + double m_exchange_sample_sizes_time = 0; + + // time from beginning of exchange_data_by_sample to wait_all + double m_start_snd_rcv_time = 0; + + // time for wait_all + double m_wait_all_time = 0; + + // time to unpack nodes received from other ranks + double m_rebuild_time = 0; + + // total time for exchange_mini_batch_data + double m_exchange_time = 0; + + // sanity check: + // m_start_snd_rcv_time + m_wait_all_time + m_rebuild_time + // should be only slightly less than m_exchange_time; + // Note that, for imagenet, the first call to exchange_data_by_sample + // involves additional communication for exchanging sample sizes + + //=========================================================== + // END: timers for profiling exchange_data + //=========================================================== + + bool m_is_setup = false; + + /// set to true if data_store is preloaded + bool m_loading_is_complete = false; + + /** @brief True, if we are in preload mode */ + bool m_preloading = false; + + /** @brief True, if we are in explicit loading mode + * + * There is some redundancy here: m_preloading and m_explicitly_loading + * can not both be true, but both may be false. When m_loading_is_complete + * is true, both m_preloading and m_preloading should be false. + */ + bool m_explicitly_loading = false; /// The size of the mini-batch that was used to calculate ownership /// of samples when building the owner map. This size has to be /// used consistently when computing the indices that will be sent /// and received. - int m_owner_map_mb_size; + int m_owner_map_mb_size = 0; - /// if true, use exchange_data_by_super_node, else use - /// exchange_data_by_sample; default if false - bool m_super_node; + /// size of a compacted conduit::Node that contains a single sample + int m_compacted_sample_size = 0; - void exchange_data_by_super_node(size_t current_pos, size_t mb_size); - void exchange_data_by_sample(size_t current_pos, size_t mb_size); + bool m_is_local_cache = false; + + bool m_node_sizes_vary = false; + + /// used in exchange_data_by_sample, when sample sizes are non-uniform + bool m_have_sample_sizes = false; + + generic_data_reader *m_reader; + + lbann_comm *m_comm = nullptr; + + /// convenience handles + bool m_world_master; + bool m_trainer_master; + int m_rank_in_trainer; + int m_rank_in_world = -1; // -1 for debugging + int m_np_in_trainer; + + /** @brief Maps an index to the processor that owns the associated data */ + map_ii_t m_owner; + + /// convenience handle + const std::vector *m_shuffled_indices; + + /** @brief Contains the conduit nodes that are "owned" by this rank + * + * Maps data_id -> conduit::Node. 
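+ *
+ * Added note (inferred from the surrounding declarations): during
+ * exchange_data_by_sample() the owning rank looks a requested sample up in
+ * this map, compacts it with build_node_for_sending(), and ships it to the
+ * ranks recorded in m_indices_to_send; samples arriving from other ranks are
+ * staged separately for the current mini-batch rather than inserted here.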
+ */ + std::unordered_map m_data; + + /** @brief Contains the conduit nodes that are "owned" by this rank + * + * This differs from m_data in that this holds temporarily, + * during the first epoch, if we're running in local cache mode + * and explicitly loading + */ + std::unordered_map m_data_cache; /// Contains the list of data IDs that will be received std::vector m_recv_data_ids; - - /// contains the Nodes that this processor owns; - /// maps data_id to conduit::Node - mutable std::unordered_map m_data; + map_ii_t m_recv_sample_sizes; /// This vector contains Nodes that this processor needs for /// the current minibatch; this is filled in by exchange_data() @@ -213,47 +449,189 @@ protected : std::vector> m_send_requests; std::vector> m_recv_requests; std::vector m_recv_buffer; - std::vector m_outgoing_msg_sizes; - std::vector m_incoming_msg_sizes; + std::vector m_outgoing_msg_sizes; + std::vector m_incoming_msg_sizes; - /// size of a compacted conduit::Node that contains a single sample - int m_compacted_sample_size; + /** @brief Maps a data_id to its image size + * + * Used when conduit Nodes have non-uniform size, e.g, imagenet; + * see: set_node_sizes_vary() + */ + map_is_t m_sample_sizes; + + /** @brief Maps a data_id to the image location in a shared memory segment */ + map_is_t m_image_offsets; + + /// maps processor id -> set of indices (whose associated samples) + /// this proc needs to send. (formerly called "proc_to_indices); + /// this is filled in by build_indices_i_will_send() + std::vector> m_indices_to_send; + + /// maps processor id -> set of indices (whose associated samples) + /// this proc needs to recv from others. (formerly called "needed") + std::vector> m_indices_to_recv; - /// used in exchange_data_by_super_node(); contains the super_nodes, - /// after they have been converted from compacted format - std::vector m_reconstituted; + //========================================================================= + // methods follow + //========================================================================= + + void exchange_data_by_sample(size_t current_pos, size_t mb_size); void setup_data_store_buffers(); /// called by exchange_data - static void build_node_for_sending(const conduit::Node &node_in, conduit::Node &node_out); + void build_node_for_sending(const conduit::Node &node_in, conduit::Node &node_out); - /// fills in m_owner, which maps index -> owning processor - void build_owner_map(int mini_batch_size); - - /// maps processor id -> set of indices (whose associated samples) - /// this proc needs to send. (formerly called "proc_to_indices) - std::vector> m_indices_to_send; + /// for use when conduit Nodes have non-uniform size, e.g, imagenet + void exchange_sample_sizes(); /// fills in m_indices_to_send and returns the number of samples /// that will be sent int build_indices_i_will_send(int current_pos, int mb_size); - /// maps processor id -> set of indices (whose associated samples) - /// this proc needs to recv from others. (formerly called "needed") - std::vector> m_indices_to_recv; - /// fills in m_indices_to_recv and returns the number of samples /// that will be received int build_indices_i_will_recv(int current_pos, int mb_size); void error_check_compacted_node(const conduit::Node &nd, int data_id); - bool m_is_local_cache; + /** @brief All ranks exchange their cached data */ + void exchange_local_caches(); + + /// Currently only used for imagenet. 
On return, 'sizes' maps a sample_id to image size, and indices[p] contains the sample_ids that P_p owns + /// for use in local cache mode + void get_image_sizes(map_is_t &sizes, std::vector> &indices); + + /// for use in local cache mode + void allocate_shared_segment(map_is_t &sizes, std::vector> &indices); + + /// for use in local cache mode + void read_files(std::vector &work, map_is_t &sizes, std::vector &indices); + + /// fills in m_image_offsets for use in local cache mode + void compute_image_offsets(map_is_t &image_sizes, std::vector> &indices); + + /// for use in local cache mode + void exchange_images(std::vector &work, map_is_t &image_sizes, std::vector> &indices); + + /// for use in local cache mode + void build_conduit_nodes(map_is_t &sizes); + + + /// for use in local cache mode + void fillin_shared_images(char* images, size_t size, size_t offset); + + /** @brief For testing during development + * + * At the beginning of the 2nd epoch, calls write_checkpoint(), + * clears some variables, calls load_checkpoint then continues. + * To activate this test use cmd flag: --data_store_test_checkpoint= + */ + void test_checkpoint(const std::string&); + + /** @brief Called by test_checkpoint */ + void print_variables(); + + /** @brief Called by test_checkpoint + * + * For testing and development. Prints the first 'n' entries from + * the owner map * (which maps sample_id -> owning rank) to std::cout + */ + void print_partial_owner_map(int n); + + std::string get_conduit_dir() const; + std::string get_cereal_fn() const; + std::string get_metadata_fn() const; + + /** @brief Creates the directory if it does not already exist */ + void make_dir_if_it_doesnt_exist(const std::string &dir); + + /** @brief Writes conduit node to file */ + void spill_conduit_node(const conduit::Node &node, int data_id); + + /** @brief Loads conduit nodes from file into m_data */ + void load_spilled_conduit_nodes(); + + /** @brief Creates directory structure, opens metadata file for output, etc + * + * This method is called for both --data_store_spill and + * --data_store_test_checkpoint + */ + void setup_spill(std::string dir); + + /** @brief Saves this object's state to file + * + * Here, "state" is all data, except for conduit nodes, that is + * needed to reload from checkpoint + */ + void save_state(); + + /** @brief Optionally open debug and profiling files + * + * A debug file is opened for every pair; + * files are opened if the cmd flag --data_store_debug is passed. + * A profiling file is opened only be + * pairs; files are opened if the cmd flag --data_store_profile is passed. + */ + void open_informational_files(); + + /** @brief Creates a directory for spilling conduit nodes */ + void open_next_conduit_spill_directory(); + + /** @brief Write timing data for data exchange to the profile file, if it's opened */ + void profile_timing(); + + void setup_checkpoint_test(); + + std::string get_lassen_spill_dir(); + + void verify_sample_size(); + + //========================================================================= + // functions and templates for optional profiling and debug files follow + //========================================================================= + + void PROFILE() const { + if (!m_profile) { + return; + } + (*m_profile) << std::endl; + flush_profile_file(); + } + + template + void PROFILE(T var1, Types... var2) const { + if (!m_world_master) { + return; + } + if (!m_profile) { + return; + } + (*m_profile) << var1 << " "; + PROFILE(var2...) 
; + flush_profile_file(); + } + + void DEBUG_DS() { + if (!m_debug) { + return; + } + (*m_debug) << std::endl; + flush_debug_file(); + } + + template + void DEBUG_DS(T var1, Types... var2) { + if (!m_debug) { + return; + } + (*m_debug) << var1 << " "; + DEBUG_DS(var2...) ; + flush_debug_file(); + } }; } // namespace lbann -#endif //#ifdef LBANN_HAS_CONDUIT #endif // __DATA_STORE_JAG_HPP__ diff --git a/include/lbann/execution_contexts/CMakeLists.txt b/include/lbann/execution_contexts/CMakeLists.txt new file mode 100644 index 00000000000..79bd7243399 --- /dev/null +++ b/include/lbann/execution_contexts/CMakeLists.txt @@ -0,0 +1,8 @@ +# Add the headers for this directory +set_full_path(THIS_DIR_HEADERS + execution_context.hpp + sgd_execution_context.hpp + ) + +# Propagate the files up the tree +set(HEADERS "${HEADERS}" "${THIS_DIR_HEADERS}" PARENT_SCOPE) diff --git a/include/lbann/execution_contexts/execution_context.hpp b/include/lbann/execution_contexts/execution_context.hpp new file mode 100644 index 00000000000..f26ea0d21d4 --- /dev/null +++ b/include/lbann/execution_contexts/execution_context.hpp @@ -0,0 +1,177 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_EXECUTION_CONTEXT_HPP +#define LBANN_EXECUTION_CONTEXT_HPP + +#include "lbann/base.hpp" +#include "lbann/comm.hpp" +#include "lbann/io/persist.hpp" +#include "lbann/utils/threads/thread_pool.hpp" +#include + +namespace lbann { + +// Forward-declare this. +class trainer; +class training_algorithm; + +class termination_criteria { +public: + size_t num_steps; +}; + +class execution_context { +public: + /** Constructor. */ + execution_context(trainer& trainer, training_algorithm& training_alg, + lbann_comm *comm, execution_mode mode); + /** Destructor. */ + virtual ~execution_context() = default; + + /** Copy execution_context. 
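+ *
+ * Added note: this is the usual polymorphic clone idiom, so code holding only
+ * a base-class reference can duplicate a context without knowing its dynamic
+ * type, e.g. (illustrative): auto ctx_copy = ctx.copy_execution_context();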
*/ + virtual std::unique_ptr copy_execution_context() const { + // Use explicit construction of unique pointer since copy + // constructor is protected and cannot be accessed in make_unique + return std::unique_ptr{new execution_context(*this)}; + } + + /** Archive for checkpoint and restart */ + template void serialize( Archive & ar ) { + ar(CEREAL_NVP(m_execution_mode), + CEREAL_NVP(m_terminate_training), + CEREAL_NVP(m_step)); + } + + /** @brief Return the state of the execution context as a string */ + virtual std::string get_state_string() const noexcept { + return build_string("ec.", to_string(get_execution_mode()), + ".step.", get_step()); + } + + /** @brief Current step in the training algorithm + * @details Step counts the number of iterations in the training + * algorithm's internal state + */ + size_t get_step() const noexcept { return m_step; } + + /** @brief Increment the current step in the training algorithm + * @details Increment the step count in the training + * algorithm's internal state + */ + void inc_step() noexcept { ++m_step; } + + /** Get the mode that the trainer is currenting executing. */ + inline void set_execution_mode(execution_mode mode) noexcept { + m_execution_mode = mode; + } + + /** Get the mode that the trainer is currenting executing. */ + inline execution_mode get_execution_mode() const noexcept { + return m_execution_mode; + } + + /** Return true if the flag to stop training is set. */ + bool get_terminate_training() const { + return m_terminate_training; + } + /** Set the terminate training flag (on or off). */ + void set_terminate_training(bool f) { + m_terminate_training = f; + } + + /** Grab the trainer from the execution context */ + const trainer& get_trainer() const { + return m_trainer; + } + + trainer& get_trainer() { + return const_cast(static_cast(*this).get_trainer()); + } + + const training_algorithm& get_training_algorithm() const { + return m_training_algorithm; + } + + training_algorithm& get_training_algorithm() { + return const_cast(static_cast(*this).get_training_algorithm()); + } + + thread_pool& get_io_thread_pool() const; + + lbann_comm& get_comm() const { + if (!m_comm) { LBANN_ERROR("m_comm is null"); } + return *m_comm; + }; + + /** Are background I/O activities enabled by the input layers */ + bool background_io_activity_allowed(); + + /** Checkpoint training_algorithm to given file descriptor */ + virtual void save_to_checkpoint_shared(persist& p); + /** Restore training_algorithm by reading checkpoint from given file descriptor */ + virtual void load_from_checkpoint_shared(persist& p); + virtual void save_to_checkpoint_distributed(persist& p); + virtual void load_from_checkpoint_distributed(persist& p); + +protected: + /** Copy constructor. */ + execution_context(const execution_context& other) = default; + /** Copy assignment operator. */ + execution_context& operator=(const execution_context& other) = default; + /** Move constructor. */ + execution_context(execution_context&& other) = default; + /** Move assignment operator. */ + execution_context& operator=(execution_context&& other) = default; + +private: + /** Pointer to the training context (execution environment) for the training algorithm */ + trainer& m_trainer; + + training_algorithm& m_training_algorithm; + + /** LBANN communicator. */ + observer_ptr m_comm; + + /** The trainer's current execution mode. 
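+ *
+ * Added note: defaults to execution_mode::training (see the initializer just
+ * below) and is read and written through get_execution_mode() /
+ * set_execution_mode() above rather than being touched directly.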
*/ + execution_mode m_execution_mode = execution_mode::training; + + /** @brief Current step in the training algorithm + * @details Step counts the number of iterations in the training + * algorithm's internal state + */ + size_t m_step = 0; + + /** @brief Whether to terminate training. + * @details If true, training will terminate immediately before + * the next epoch. + */ + bool m_terminate_training = false; +}; + +} // namespace lbann + +#endif // LBANN_EXECUTION_CONTEXT_HPP diff --git a/include/lbann/execution_contexts/sgd_execution_context.hpp b/include/lbann/execution_contexts/sgd_execution_context.hpp new file mode 100644 index 00000000000..4d81ae68fbf --- /dev/null +++ b/include/lbann/execution_contexts/sgd_execution_context.hpp @@ -0,0 +1,127 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_SGD_EXECUTION_CONTEXT_HPP +#define LBANN_SGD_EXECUTION_CONTEXT_HPP + +#include "lbann/execution_contexts/execution_context.hpp" +#include +namespace lbann { + +class sgd_termination_criteria : public termination_criteria { +public: + size_t num_epochs; +}; + + +/** @brief SGD Uses the step to track the Current mini-batch step for + * execution mode. + * @details Step counts are not reset after each epoch. + */ +class sgd_execution_context final : public execution_context { +public: + /** Constructor. */ + sgd_execution_context(trainer& trainer, training_algorithm& training_alg, + lbann_comm *comm, execution_mode mode, size_t mini_batch_size); + /** Destructor. */ + virtual ~sgd_execution_context() = default; + + /** Copy constructor. */ + sgd_execution_context(const sgd_execution_context& other) = default; + /** Copy assignment operator. */ + sgd_execution_context& operator=(const sgd_execution_context& other) = default; + /** Move constructor. */ + sgd_execution_context(sgd_execution_context&& other) = default; + /** Move assignment operator. */ + sgd_execution_context& operator=(sgd_execution_context&& other) = default; + /** Copy sgd_execution_context. 
*/ + virtual std::unique_ptr copy_execution_context() const { return make_unique(*this); } + + /** Archive for checkpoint and restart */ + template void serialize( Archive & ar ) { + ar(cereal::base_class( this ), + CEREAL_NVP(m_epoch), + CEREAL_NVP(m_current_mini_batch_size), + CEREAL_NVP(m_effective_mini_batch_size)); + } + + /** @brief Return the state of the execution context as a string */ + std::string get_state_string() const noexcept override { + return build_string("sgd.", to_string(get_execution_mode()), + ".epoch.", get_epoch(), ".step.", get_step()); + } + + /** Number of times the training set has been traversed. */ + inline size_t get_epoch() const noexcept { return m_epoch; } + + /** @brief Increment the current epoch in the execution context + * @details Increment the counter tracking the number of times + * that the data set has been traversed. + */ + void inc_epoch() noexcept { ++m_epoch; } + + /** Set the trainer's current mini-batch size. */ + inline void set_current_mini_batch_size(size_t mini_batch_size) { + m_current_mini_batch_size = mini_batch_size; + } + /** Get the trainer's current mini-batch size. */ + inline size_t get_current_mini_batch_size() const { + return m_current_mini_batch_size; + } + /** Get the trainer's effective mini-batch size. */ + inline size_t get_effective_mini_batch_size() const { + return m_effective_mini_batch_size; + } + /** Set the trainer's effective mini-batch size. */ + inline void set_effective_mini_batch_size(size_t mini_batch_size) { + m_effective_mini_batch_size = mini_batch_size; + } + + /** Checkpoint training_algorithm to given file descriptor */ + virtual void save_to_checkpoint_shared(persist& p); + /** Restore training_algorithm by reading checkpoint from given file descriptor */ + virtual void load_from_checkpoint_shared(persist& p); + virtual void save_to_checkpoint_distributed(persist& p); + virtual void load_from_checkpoint_distributed(persist& p); + +private: + /** Number of times the training data set has been traversed. */ + size_t m_epoch = 0; + + /** Size of the current mini-batch in the model. */ + size_t m_current_mini_batch_size; + + /** The "effective" size of a minibatch. + * + * This is the size of the minibatch across all models and used for + * e.g. correctly averaging gradients from multiple models. 
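+ *
+ * Worked example (illustrative numbers, not taken from the source): with 4
+ * models each drawing a local mini-batch of 64 samples, the current
+ * mini-batch size is 64 while the effective mini-batch size is 4 * 64 = 256,
+ * so gradients summed across all models are averaged with a factor of 1/256
+ * rather than 1/64.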
+ */ + size_t m_effective_mini_batch_size; +}; + +} // namespace lbann + +#endif // LBANN_SGD_EXECUTION_CONTEXT_HPP diff --git a/include/lbann/io/data_buffers/generic_io_buffer.hpp b/include/lbann/io/data_buffers/generic_io_buffer.hpp index 1f0ebc807de..a8d4f7ecec0 100644 --- a/include/lbann/io/data_buffers/generic_io_buffer.hpp +++ b/include/lbann/io/data_buffers/generic_io_buffer.hpp @@ -36,11 +36,12 @@ namespace lbann { +template class fetch_data_functor { public: fetch_data_functor (data_reader_target_mode target_mode) : _target_mode(target_mode) {} - int operator() (CPUMat& samples, CPUMat& responses, El::Matrix& indices_fetched, generic_data_reader* data_reader) const { + int operator() (CPUMatDT& samples, CPUMatDT& responses, El::Matrix& indices_fetched, generic_data_reader* data_reader) const { int num_samples_fetched = data_reader->fetch_data(samples, indices_fetched); int num_responses_fetched; switch(_target_mode) { @@ -64,7 +65,7 @@ class fetch_data_functor { } return num_samples_fetched; } - int operator() (CPUMat& samples, El::Matrix& indices_fetched, generic_data_reader* data_reader) const { + int operator() (CPUMatDT& samples, El::Matrix& indices_fetched, generic_data_reader* data_reader) const { int num_samples_fetched = data_reader->fetch_data(samples, indices_fetched); switch(_target_mode) { case data_reader_target_mode::NA: @@ -89,9 +90,22 @@ class update_data_reader_functor { } }; +template class generic_io_buffer { public: - generic_io_buffer(lbann_comm *comm, int num_parallel_readers, std::map data_readers); + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The local tensor type expected for IO in this object. */ + using IODataType = DataType; + + ///@} + +public: + generic_io_buffer(lbann_comm *comm, int num_parallel_readers); generic_io_buffer( const generic_io_buffer&); generic_io_buffer& operator=( @@ -112,8 +126,8 @@ class generic_io_buffer { virtual void setup_data(El::Int num_neurons, El::Int num_targets, El::Int max_minibatch_size) = 0; virtual int fetch_to_local_matrix(generic_data_reader *data_reader, execution_mode mode) = 0; - virtual void distribute_from_local_matrix(generic_data_reader *data_reader, execution_mode mode, AbsDistMat& sample, AbsDistMat& response) {} - virtual void distribute_from_local_matrix(generic_data_reader *data_reader, execution_mode mode, AbsDistMat& sample) {} + virtual void distribute_from_local_matrix(generic_data_reader *data_reader, execution_mode mode, AbsDistMatrixType& sample, AbsDistMatrixType& response) {} + virtual void distribute_from_local_matrix(generic_data_reader *data_reader, execution_mode mode, AbsDistMatrixType& sample) {} virtual bool update_data_set(generic_data_reader *data_reader, execution_mode mode) = 0; virtual void set_fetch_data_in_background(bool flag, execution_mode mode) = 0; virtual bool is_data_fetched_in_background(execution_mode mode) = 0; @@ -122,17 +136,27 @@ class generic_io_buffer { virtual void set_data_fetch_future(std::future future, execution_mode mode) = 0; virtual std::future get_data_fetch_future(execution_mode mode) = 0; - virtual void calculate_num_iterations_per_epoch_spanning_models(int max_mini_batch_size, generic_data_reader *data_reader) = 0; - virtual void calculate_num_iterations_per_epoch_single_model(int max_mini_batch_size, generic_data_reader *data_reader) = 0; - - virtual int compute_max_num_parallel_readers(long data_set_size, int mini_batch_size, int 
requested_num_parallel_readers) const = 0; - // protected: public: lbann_comm *m_comm; - const fetch_data_functor *fetch_data_fn; + const fetch_data_functor *fetch_data_fn; const update_data_reader_functor *update_data_reader_fn; }; -} + +#ifndef LBANN_GENERIC_IO_BUFFER_INSTANTIATE + +#define PROTO(T) \ + extern template class generic_io_buffer + +#define LBANN_INSTANTIATE_CPU_HALF +#define LBANN_INSTANTIATE_GPU_HALF +#include "lbann/macros/instantiate.hpp" +#undef PROTO +#undef LBANN_INSTANTIATE_CPU_HALF +#undef LBANN_INSTANTIATE_GPU_HALF + +#endif // LBANN_GENERIC_IO_BUFFER_INSTANTIATE + +} // namespace lbann #endif // LBANN_GENERIC_IO_BUFFER_HPP_INCLUDED diff --git a/include/lbann/io/data_buffers/partitioned_io_buffer.hpp b/include/lbann/io/data_buffers/partitioned_io_buffer.hpp index 13a4a23f8b2..56a438fa1c0 100644 --- a/include/lbann/io/data_buffers/partitioned_io_buffer.hpp +++ b/include/lbann/io/data_buffers/partitioned_io_buffer.hpp @@ -31,12 +31,22 @@ namespace lbann { +template class data_buffer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} + public: /** Number of samples in the current mini-batch */ int m_num_samples_fetched; /** Distributed matrix used to stage local data to layer output */ - std::vector> m_input_buffers; + std::vector> m_input_buffers; std::atomic m_fetch_data_in_background; std::future m_data_fetch_future; /// 1-D Matrix of which indices were fetched in this mini-batch @@ -48,7 +58,7 @@ class data_buffer { m_input_buffers.clear(); m_input_buffers.resize(num_child_layers); for(int i = 0; i < num_child_layers; i++) { - m_input_buffers[i].reset(new StarVCMat(comm->get_trainer_grid())); + m_input_buffers[i].reset(new StarVCMatDT(comm->get_trainer_grid())); } } @@ -78,11 +88,24 @@ class data_buffer { /** * Parallel I/O routines for managing partitioned minibatches */ -class partitioned_io_buffer : public generic_io_buffer { +template +class partitioned_io_buffer : public generic_io_buffer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The local tensor type expected for IO in this object. 
*/ + using IODataType = DataType; + + ///@} + public: - typedef std::map data_buffer_map_t; + typedef std::map *> data_buffer_map_t; public: - partitioned_io_buffer(lbann_comm *comm, int num_parallel_readers, std::map data_readers, int num_child_layers); + partitioned_io_buffer(lbann_comm *comm, int num_parallel_readers, int num_child_layers); partitioned_io_buffer(const partitioned_io_buffer& other); partitioned_io_buffer& operator=(const partitioned_io_buffer& other); ~partitioned_io_buffer(); @@ -94,8 +117,8 @@ class partitioned_io_buffer : public generic_io_buffer { void setup_data(El::Int num_neurons, El::Int num_targets, El::Int max_mini_batch_size) override; int fetch_to_local_matrix(generic_data_reader *data_reader, execution_mode mode) override; - void distribute_from_local_matrix(generic_data_reader *data_reader, execution_mode mode, AbsDistMat& sample, AbsDistMat& response) override; - void distribute_from_local_matrix(generic_data_reader *data_reader, execution_mode mode, AbsDistMat& sample) override; + void distribute_from_local_matrix(generic_data_reader *data_reader, execution_mode mode, AbsDistMatrixType& sample, AbsDistMatrixType& response) override; + void distribute_from_local_matrix(generic_data_reader *data_reader, execution_mode mode, AbsDistMatrixType& sample) override; bool update_data_set(generic_data_reader *data_reader, execution_mode mode) override; void set_fetch_data_in_background(bool flag, execution_mode mode) override; bool is_data_fetched_in_background(execution_mode mode) override; @@ -104,14 +127,9 @@ class partitioned_io_buffer : public generic_io_buffer { void set_data_fetch_future(std::future future, execution_mode mode) override; std::future get_data_fetch_future(execution_mode mode) override; - void calculate_num_iterations_per_epoch_spanning_models(int max_mini_batch_size, generic_data_reader *data_reader) override; - void calculate_num_iterations_per_epoch_single_model(int max_mini_batch_size, generic_data_reader *data_reader) override; - int compute_max_num_parallel_readers(long data_set_size, int mini_batch_size, int requested_num_parallel_readers) const override; - static int compute_max_num_parallel_readers(long data_set_size, int mini_batch_size, int requested_num_parallel_readers, const lbann_comm* comm); - - data_buffer *get_data_buffer(const execution_mode mode) const { - data_buffer *data_buffer = nullptr; - data_buffer_map_t::const_iterator it = m_data_buffers.find(mode); + data_buffer *get_data_buffer(const execution_mode mode) const { + data_buffer *data_buffer = nullptr; + typename data_buffer_map_t::const_iterator it = m_data_buffers.find(mode); if (it != m_data_buffers.end()) data_buffer = it->second; switch(mode) { diff --git a/include/lbann/io/persist.hpp b/include/lbann/io/persist.hpp index 409dc5ddf89..e616019b806 100644 --- a/include/lbann/io/persist.hpp +++ b/include/lbann/io/persist.hpp @@ -30,41 +30,100 @@ #define LBANN_PERSIST_H #include "lbann/base.hpp" +#include "lbann/utils/exception.hpp" +#include "lbann/utils/enum_iterator.hpp" #include "El.hpp" +#include +#include +#include +#include +#include namespace lbann { enum class persist_type { train, // data should be saved in file with train data model, // data should be saved in file with model data - validate + metrics, + validate, + testing, + prediction_context, + training_context, + testing_context, + validation_context, }; +using persist_type_iterator = enum_iterator; + +inline persist_type execution_mode_to_persist_type(execution_mode m) { + switch(m) { + case 
execution_mode::training: + return persist_type::training_context; + case execution_mode::validation: + return persist_type::validation_context; + case execution_mode::testing: + return persist_type::testing_context; + case execution_mode::prediction: + return persist_type::prediction_context; + // case execution_mode::tournament: + // return persist_type::tournament; + case execution_mode::invalid: + default: + LBANN_ERROR("Invalid execution mode specified"); + } +} + +inline std::string to_string(persist_type pt) { + switch(pt) { + case persist_type::model: + return "model"; + case persist_type::metrics: + return "metrics"; + case persist_type::train: + return "train"; + case persist_type::validate: + return "validate"; + case persist_type::testing: + return "test"; + case persist_type::prediction_context: + return "prediction"; + case persist_type::training_context: + return "training"; + case persist_type::validation_context: + return "validation"; + case persist_type::testing_context: + return "testing"; + default: + LBANN_ERROR("Invalid persist type specified"); + } +} + +/// @todo Fix the callback types to properly track execution phases enum class callback_type { - batch, - epoch, - validation, - inference, + model_only, + weights_only, + execution_context_only, + full_checkpoint, invalid }; class persist { - protected: - uint64_t m_bytes; - int m_model_fd; - int m_train_fd; - int m_validate_fd; - char m_model_filename[1024]; - char m_train_filename[1024]; - char m_validate_filename[1024]; + private: + std::map m_bytes; + std::map m_filenames; callback_type ckpt_type; public: - char m_checkpoint_dir[1024]; + std::string m_checkpoint_dir; public: persist(); ~persist() {}; + /** Archive for checkpoint and restart */ + template void serialize(Archive & ar) { + ar(CEREAL_NVP(ckpt_type)); + } + callback_type get_cb_type() const { return ckpt_type; } @@ -73,77 +132,190 @@ class persist { ckpt_type = type; } - void open_checkpoint(const char *dir); + void open_checkpoint_dir(const std::string& dir, bool create_dir); + void open_checkpoint(const std::string& dir, bool create_dir); void close_checkpoint(); - void open_restart(const char *dir); + void open_restart(const std::string& dir); void close_restart(); + void set_restart_dir(const std::string& dir) { m_checkpoint_dir = dir; } uint64_t get_bytes() const { - return m_bytes; + uint64_t bytes = 0; + for(auto& pt : m_bytes) { + bytes += pt.second; + } + return bytes; } void reset_bytes() { - m_bytes = 0; + for(auto& pt : m_bytes) { + pt.second = 0; + } } - bool write_rank_distmat(persist_type type, const char *name, const AbsDistMat& M); - bool read_rank_distmat(persist_type type, const char *name, AbsDistMat& M); - - bool write_distmat(persist_type type, const char *name, AbsDistMat *M); - bool read_distmat (persist_type type, const char *name, AbsDistMat *M); - - bool write_bytes(persist_type type, const char *name, const void *buf, size_t size); - bool read_bytes(persist_type type, const char *name, void *buf, size_t size); - - bool write_uint32(persist_type type, const char *name, uint32_t val); - bool read_uint32 (persist_type type, const char *name, uint32_t *val); - - bool write_uint64(persist_type type, const char *name, uint64_t val); - bool read_uint64 (persist_type type, const char *name, uint64_t *val); - - bool write_int32_contig(persist_type type, const char *name, const int32_t *buf, uint64_t count); - bool read_int32_contig (persist_type type, const char *name, int32_t *buf, uint64_t count); - - bool 
write_float(persist_type type, const char *name, float val); - bool read_float (persist_type type, const char *name, float *val); - - bool write_string(persist_type type, const char *name, const char *val, int str_length); - bool read_string (persist_type type, const char *name, char *val, int str_length); + template + bool write_rank_distmat(persist_type type, const char *name, const El::AbstractDistMatrix& M); + template + bool read_rank_distmat(persist_type type, const char *name, El::AbstractDistMatrix& M); - bool write_double(persist_type type, const char *name, double val); - bool read_double (persist_type type, const char *name, double *val); + template + bool write_distmat(persist_type type, const char *name, El::AbstractDistMatrix *M); + template + bool read_distmat (persist_type type, const char *name, El::AbstractDistMatrix *M); - bool write_datatype(persist_type type, const char *name, DataType val); - bool read_datatype (persist_type type, const char *name, DataType *val); + const std::string& get_checkpoint_dir() const { return m_checkpoint_dir; } - private: - int get_fd(persist_type type) const; + std::string get_filename(persist_type type) const; }; -bool write_distmat(int fd, const char *name, DistMat *M, uint64_t *bytes); -bool read_distmat (int fd, const char *name, DistMat *M, uint64_t *bytes); - bool write_bytes(int fd, const char *name, const void *buf, size_t size); bool read_bytes(int fd, const char *name, void *buf, size_t size); -bool write_uint32(int fd, const char *name, uint32_t val); -bool read_uint32 (int fd, const char *name, uint32_t *val); - -bool write_uint64(int fd, const char *name, uint64_t val); -bool read_uint64 (int fd, const char *name, uint64_t *val); +bool write_string(int fd, const char *name, const char *buf, size_t size); +bool read_string(int fd, const char *name, char *buf, size_t size); -bool write_int32_contig(int fd, const char *name, const int32_t *buf, uint64_t count); -bool read_int32_contig (int fd, const char *name, int32_t *buf, uint64_t count); +class NonexistentArchiveFile : public std::runtime_error { +public: + NonexistentArchiveFile(std::string const& filename) : std::runtime_error(std::string("Archive file not found: ") + filename) {} +}; -bool write_float(int fd, const char *name, float val); -bool read_float (int fd, const char *name, float *val); +template +void write_cereal_archive(C& obj, const std::string& filename) { + std::ofstream os(filename); + if(!os.is_open()) { + throw NonexistentArchiveFile(filename); + } + cereal::XMLOutputArchive archive(os); + archive(obj); +} + +template +void write_cereal_archive(C& obj, persist& p, const std::string& filename) { + write_cereal_archive(obj, p.get_checkpoint_dir() + "/" + filename); +} + +template +void write_cereal_archive(C& obj, persist& p, persist_type pt, const std::string& suffix) { + write_cereal_archive(obj, p.get_filename(pt) + suffix); +} + +template +void write_cereal_archive(C& obj, persist& p, execution_mode mode, const std::string& suffix) { + const persist_type pt = execution_mode_to_persist_type(mode); + write_cereal_archive(obj, p, pt, suffix); +} + +template +void read_cereal_archive(C& obj, const std::string& filename) { + std::ifstream is(filename); + if(!is.is_open()) { + throw NonexistentArchiveFile(filename); + } + cereal::XMLInputArchive archive(is); + archive(obj); +} + +template +void read_cereal_archive(C& obj, persist& p, const std::string& filename) { + read_cereal_archive(obj, p.get_checkpoint_dir() + "/" + filename); +} + +template +void 
read_cereal_archive(C& obj, persist& p, persist_type pt, const std::string& suffix) { + read_cereal_archive(obj, p.get_filename(pt) + suffix); +} + +template +void read_cereal_archive(C& obj, persist& p, execution_mode mode, const std::string& suffix) { + const persist_type pt = execution_mode_to_persist_type(mode); + read_cereal_archive(obj, p, pt, suffix); +} + +template +std::string create_cereal_archive_binary_string(C& obj) { + std::ostringstream ss; + { + cereal::BinaryOutputArchive archive(ss); + archive(obj); + } // archive goes out of scope, ensuring all contents are flushed + return ss.str(); +} + +template +void unpack_cereal_archive_binary_string(C& obj, const std::string& buf) { + std::istringstream ss(buf); + { + cereal::BinaryInputArchive archive(ss); + archive(obj); + } // archive goes out of scope, ensuring all contents are flushed +} + +template +void load_from_shared_cereal_archive(C& obj, + lbann_comm& comm, + const std::string& filename) { + std::string buf; + if (comm.am_trainer_master()) { + read_cereal_archive(obj, filename); + buf = create_cereal_archive_binary_string(obj); + }else { + // If you are not the trainer master, still check to see if the file exists + std::ifstream is(filename); + if(!is.is_open()) { + throw NonexistentArchiveFile(filename); + } + } -bool write_double(int fd, const char *name, double val); -bool read_double (int fd, const char *name, double *val); + // TODO: this assumes homogeneous processors + // broadcast state from rank 0 + comm.trainer_broadcast(0, buf); -bool write_string(int fd, const char *name, const char *buf, size_t size); -bool read_string(int fd, const char *name, char *buf, size_t size); + if (!comm.am_trainer_master()) { + unpack_cereal_archive_binary_string(obj, buf); + } +} + +template +void load_from_shared_cereal_archive(C& obj, persist& p, + lbann_comm& comm, + const std::string& filename) { + load_from_shared_cereal_archive(obj, comm, p.get_checkpoint_dir() + filename); +} + +template +void load_from_shared_cereal_archive(C& obj, persist& p, persist_type pt, + lbann_comm& comm, + const std::string& suffix) { + load_from_shared_cereal_archive(obj, comm, p.get_filename(pt) + suffix); +} + +template +void load_from_shared_cereal_archive(C& obj, persist& p, execution_mode mode, + lbann_comm& comm, + const std::string& suffix) { + const persist_type pt = execution_mode_to_persist_type(mode); + load_from_shared_cereal_archive(obj, p, pt, comm, suffix); +} + +#ifndef LBANN_PERSIST_INSTANTIATE +#define PROTO(T) \ + extern template bool persist::write_rank_distmat( \ + persist_type type, const char *name, const El::AbstractDistMatrix& M); \ + extern template bool persist::read_rank_distmat( \ + persist_type type, const char *name, El::AbstractDistMatrix& M); \ + extern template bool persist::write_distmat( \ + persist_type type, const char *name, El::AbstractDistMatrix *M); \ + extern template bool persist::read_distmat( \ + persist_type type, const char *name, El::AbstractDistMatrix *M) + +#define LBANN_INSTANTIATE_CPU_HALF +#define LBANN_INSTANTIATE_GPU_HALF +#include "lbann/macros/instantiate.hpp" +#undef PROTO +#undef LBANN_INSTANTIATE_CPU_HALF +#undef LBANN_INSTANTIATE_GPU_HALF +#endif // LBANN_PERSIST_INSTANTIATE } // namespace lbann diff --git a/include/lbann/layers/CMakeLists.txt b/include/lbann/layers/CMakeLists.txt index 0cc71271bcb..ab56ae6f153 100644 --- a/include/lbann/layers/CMakeLists.txt +++ b/include/lbann/layers/CMakeLists.txt @@ -1,8 +1,16 @@ # Add the headers for this directory 
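The cereal helpers above reduce checkpointing a C++ object to giving it a serialize() method. A minimal usage sketch follows; the training_state struct and the file name are hypothetical, while write_cereal_archive, read_cereal_archive, and load_from_shared_cereal_archive are the helpers defined above.

#include "lbann/io/persist.hpp"

struct training_state {
  size_t epoch = 0;
  double learning_rate = 0.01;
  // Any object handed to the helpers only needs a cereal-compatible serialize().
  template <class Archive>
  void serialize(Archive& ar) {
    ar(CEREAL_NVP(epoch), CEREAL_NVP(learning_rate));
  }
};

void checkpoint(lbann::lbann_comm& comm, training_state& state) {
  // Only the trainer master writes the XML archive to disk.
  if (comm.am_trainer_master()) {
    lbann::write_cereal_archive(state, "training_state.xml");
  }
}

void restart(lbann::lbann_comm& comm, training_state& state) {
  // Rank 0 reads the archive, packs it into a binary string, and broadcasts
  // it to the rest of the trainer, as load_from_shared_cereal_archive does above.
  lbann::load_from_shared_cereal_archive(state, comm, "training_state.xml");
}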
set_full_path(THIS_DIR_HEADERS layer.hpp + data_type_layer.hpp ) +if (LBANN_HAS_DISTCONV) + list(APPEND THIS_DIR_HEADERS + "${CMAKE_CURRENT_SOURCE_DIR}/distconv_adapter.hpp") + list(APPEND THIS_DIR_HEADERS + "${CMAKE_CURRENT_SOURCE_DIR}/data_type_distconv_adapter.hpp") +endif () + # Add the subdirectories add_subdirectory(activations) add_subdirectory(image) diff --git a/include/lbann/layers/activations/CMakeLists.txt b/include/lbann/layers/activations/CMakeLists.txt index bbcb0179add..553c4b3cebf 100644 --- a/include/lbann/layers/activations/CMakeLists.txt +++ b/include/lbann/layers/activations/CMakeLists.txt @@ -3,6 +3,7 @@ set_full_path(THIS_DIR_HEADERS activations.hpp elu.hpp identity.hpp + relu.hpp leaky_relu.hpp log_softmax.hpp softmax.hpp diff --git a/include/lbann/layers/activations/activations.hpp b/include/lbann/layers/activations/activations.hpp index b36c8d61072..24d11fade27 100644 --- a/include/lbann/layers/activations/activations.hpp +++ b/include/lbann/layers/activations/activations.hpp @@ -31,14 +31,30 @@ namespace lbann { +// Convenience macros for ETI decls for unary layers + +#ifndef LBANN_ACTIVATIONS_LAYER_INSTANTIATE +#define UNARY_ETI_DECL_MACRO_DEV(LAYER_NAME, T, DEVICE) \ + extern template class LAYER_NAME; \ + extern template class LAYER_NAME +#else +#define UNARY_ETI_DECL_MACRO_DEV(...) +#endif // LBANN_UNARY_LAYER_INSTANTIATE + +#ifdef LBANN_HAS_GPU +#define UNARY_ETI_DECL_MACRO(LAYER_NAME, T) \ + UNARY_ETI_DECL_MACRO_DEV(LAYER_NAME, T, El::Device::CPU); \ + UNARY_ETI_DECL_MACRO_DEV(LAYER_NAME, T, El::Device::GPU) +#else +#define UNARY_ETI_DECL_MACRO(LAYER_NAME, T) \ + UNARY_ETI_DECL_MACRO_DEV(LAYER_NAME, T, El::Device::CPU) +#endif // LBANN_HAS_GPU + // Convenience macro to define an entry-wise unary layer class -#define DEFINE_ENTRYWISE_UNARY_LAYER(layer_name, layer_string) \ - struct layer_name##_name_struct { \ - inline operator std::string() { return layer_string; } \ - }; \ - template \ - using layer_name \ - = entrywise_unary_layer; +#define DEFINE_ENTRYWISE_UNARY_LAYER(layer_name, layer_string) \ + LBANN_DECLARE_ENTRYWISE_UNARY_LAYER(layer_name, layer_string); \ + UNARY_ETI_DECL_MACRO(layer_name, float); \ + UNARY_ETI_DECL_MACRO(layer_name, double) /** @class lbann::log_sigmoid_layer * @brief Logarithm of sigmoid function. @@ -46,15 +62,7 @@ namespace lbann { * @f[ \log(\sigma(x)) = -\log(1 + e^{-x}) @f] * See https://en.wikipedia.org/wiki/Sigmoid_function. */ -DEFINE_ENTRYWISE_UNARY_LAYER(log_sigmoid_layer, "log sigmoid") - -/** @class lbann::relu_layer - * @brief Rectified linear unit. - * - * @f[ \text{ReLU}(x) = \text{max}(x, 0) @f] - * See https://en.wikipedia.org/wiki/Rectifier_(neural_networks). - */ -DEFINE_ENTRYWISE_UNARY_LAYER(relu_layer, "ReLU") +DEFINE_ENTRYWISE_UNARY_LAYER(log_sigmoid_layer, "log sigmoid"); /** @class lbann::selu_layer * @brief Scaled exponential rectified linear unit. @@ -73,7 +81,7 @@ DEFINE_ENTRYWISE_UNARY_LAYER(relu_layer, "ReLU") * Hochreiter. "Self-normalizing neural networks." In Advances in * Neural Information Processing Systems, pp. 971-980. 2017. */ -DEFINE_ENTRYWISE_UNARY_LAYER(selu_layer, "SELU") +DEFINE_ENTRYWISE_UNARY_LAYER(selu_layer, "SELU"); /** @class lbann::sigmoid_layer * @brief Special case of logistic function. @@ -81,7 +89,7 @@ DEFINE_ENTRYWISE_UNARY_LAYER(selu_layer, "SELU") * @f[ \sigma(x) = \frac{1}{1 + e^{-x}} @f] * See https://en.wikipedia.org/wiki/Sigmoid_function. 
*/ -DEFINE_ENTRYWISE_UNARY_LAYER(sigmoid_layer, "sigmoid") +DEFINE_ENTRYWISE_UNARY_LAYER(sigmoid_layer, "sigmoid"); // Sigmoid function output is strictly in (0,1) // Note: Output is in the range [eps,1-eps], where 'eps' is machine // epsilon. This avoids denormalized floats and helps mitigate some @@ -94,16 +102,19 @@ DEFINE_ENTRYWISE_UNARY_LAYER(sigmoid_layer, "sigmoid") * @f[ \text{softplus}(x) = \log (e^x + 1) @f] * See https://en.wikipedia.org/wiki/Rectifier_(neural_networks) */ -DEFINE_ENTRYWISE_UNARY_LAYER(softplus_layer, "softplus") +DEFINE_ENTRYWISE_UNARY_LAYER(softplus_layer, "softplus"); /** @class lbann::softsign_layer * @brief Smooth approximation to sign function. * * @f[ \text{softsign}(x) = \frac{x}{1 + |x|} @f] */ -DEFINE_ENTRYWISE_UNARY_LAYER(softsign_layer, "softsign") +DEFINE_ENTRYWISE_UNARY_LAYER(softsign_layer, "softsign"); } // namespace lbann #undef DEFINE_ENTRYWISE_UNARY_LAYER +#undef UNARY_ETI_DECL_MACRO +#undef UNARY_ETI_DECL_MACRO_DEV + #endif // LBANN_LAYERS_ACTIVATIONS_ACTIVATIONS_HPP_INCLUDED diff --git a/include/lbann/layers/activations/elu.hpp b/include/lbann/layers/activations/elu.hpp index 52f797488be..c64846f3224 100644 --- a/include/lbann/layers/activations/elu.hpp +++ b/include/lbann/layers/activations/elu.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_ACTIVATIONS_ELU_HPP_INCLUDED #define LBANN_LAYERS_ACTIVATIONS_ELU_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -46,36 +46,45 @@ namespace lbann { * and accurate deep network learning by exponential linear units * (ELUs)." arXiv preprint arXiv:1511.07289 (2015). */ -template -class elu_layer : public Layer { +template +class elu_layer : public data_type_layer { public: - elu_layer(lbann_comm *comm, DataType alpha = 1) - : Layer(comm), m_alpha(alpha) {} + elu_layer(lbann_comm *comm, TensorDataType alpha = 1) + : data_type_layer(comm), m_alpha(alpha) {} elu_layer* copy() const override { return new elu_layer(*this); } std::string get_type() const override { return "ELU"; } data_layout get_data_layout() const override { return Layout; } El::Device get_device_allocation() const override { return Device; } description get_description() const override { - auto&& desc = Layer::get_description(); + auto desc = data_type_layer::get_description(); desc.add("alpha", m_alpha); return desc; } protected: - void setup_dims() override { - Layer::setup_dims(); - set_output_dims(get_input_dims()); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); } void fp_compute() override; void bp_compute() override; private: /** Scale parameter for negative region. 
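* For reference, ELU(x) = alpha * (exp(x) - 1) for x < 0 and x for x >= 0, * so with the default alpha = 1 negative inputs saturate at -1.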
*/ - DataType m_alpha; + TensorDataType m_alpha; }; +#ifndef LBANN_ELU_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class elu_layer; \ + extern template class elu_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_ELU_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_ACTIVATIONS_ELU_HPP_INCLUDED diff --git a/include/lbann/layers/activations/identity.hpp b/include/lbann/layers/activations/identity.hpp index e895ba44b99..ff59d2138dd 100644 --- a/include/lbann/layers/activations/identity.hpp +++ b/include/lbann/layers/activations/identity.hpp @@ -27,38 +27,73 @@ #ifndef LBANN_LAYERS_ACTIVATIONS_IDENTITY_HPP_INCLUDED #define LBANN_LAYERS_ACTIVATIONS_IDENTITY_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" +#include "lbann/utils/distconv.hpp" namespace lbann { +#ifdef LBANN_HAS_DISTCONV +template +class identity_distconv_adapter: public data_type_distconv_adapter { + public: + using TensorDevType = typename data_type_distconv_adapter::TensorDevType; + identity_distconv_adapter(Layer &layer): + data_type_distconv_adapter(layer) {} + virtual ~identity_distconv_adapter() = default; + void setup_distributions(tensor_overlap_constraints &constraints) override; + std::unique_ptr setup_activations_i(int index) const override; + std::unique_ptr setup_error_signals_i(int index) const override; +}; +#endif // LBANN_HAS_DISTCONV + + /** @brief Output a tensor view. * * Forward and backward prop simply involve setting up tensor views, * and hence are very cheap. */ -template -class identity_layer : public Layer { +template +class identity_layer : public data_type_layer { public: - identity_layer(lbann_comm *comm) : Layer(comm) {} + identity_layer(lbann_comm *comm) : data_type_layer(comm) {} identity_layer* copy() const override { return new identity_layer(*this); } std::string get_type() const override { return "identity"; } data_layout get_data_layout() const override { return Layout; } El::Device get_device_allocation() const override { return Device; } protected: - void setup_dims() override { - Layer::setup_dims(); - set_output_dims(get_input_dims()); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); } void fp_setup_outputs(El::Int mini_batch_size) override { - El::LockedView(get_activations(), get_prev_activations()); + El::LockedView(this->get_activations(), this->get_prev_activations()); } void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) override { - El::LockedView(get_error_signals(), get_prev_error_signals()); + El::LockedView(this->get_error_signals(), this->get_prev_error_signals()); } void fp_compute() override {} void bp_compute() override {} +#ifdef LBANN_HAS_DISTCONV + protected: + bool is_distconv_supported() const override { + return Device == El::Device::GPU && Layout == data_layout::DATA_PARALLEL; + } + void setup_distconv_adapter() override { + this->get_distconv_adapter_ptr() = make_unique>(*this); + } +#endif // LBANN_HAS_DISTCONV }; +#ifndef LBANN_IDENTITY_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class identity_layer; \ + extern template class identity_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_IDENTITY_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_ACTIVATIONS_IDENTITY_HPP_INCLUDED diff --git a/include/lbann/layers/activations/leaky_relu.hpp 
b/include/lbann/layers/activations/leaky_relu.hpp index 0e576117d3c..b936a5ac1b9 100644 --- a/include/lbann/layers/activations/leaky_relu.hpp +++ b/include/lbann/layers/activations/leaky_relu.hpp @@ -27,10 +27,27 @@ #ifndef LBANN_LAYERS_ACTIVATIONS_LEAKY_RELU_HPP_INCLUDED #define LBANN_LAYERS_ACTIVATIONS_LEAKY_RELU_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" +#include "lbann/utils/distconv.hpp" namespace lbann { +#ifdef LBANN_HAS_DISTCONV +template +class leaky_relu_distconv_adapter: public data_type_distconv_adapter { + public: + using TensorDevType = typename data_type_distconv_adapter::TensorDevType; + + leaky_relu_distconv_adapter(Layer& layer): data_type_distconv_adapter(layer) {} + virtual ~leaky_relu_distconv_adapter() = default; + + void setup_distributions(tensor_overlap_constraints &constraints) override; + void setup_layer(size_t workspace_capacity) override; + + std::unique_ptr m_leaky_relu; +}; +#endif // LBANN_HAS_DISTCONV + /** @brief * * @f[ @@ -46,36 +63,98 @@ namespace lbann { * nonlinearities improve neural network acoustic models." In * Proc. ICML, vol. 30, no. 1, p. 3. 2013. */ -template -class leaky_relu_layer : public Layer { +template +class leaky_relu_layer : public data_type_layer { public: - leaky_relu_layer(lbann_comm *comm, DataType negative_slope = 0.01) - : Layer(comm), m_negative_slope(negative_slope) {} + leaky_relu_layer(lbann_comm *comm, TensorDataType negative_slope = 0.01) + : data_type_layer(comm), m_negative_slope(negative_slope) {} leaky_relu_layer* copy() const override { return new leaky_relu_layer(*this); } std::string get_type() const override { return "leaky ReLU"; } data_layout get_data_layout() const override { return Layout; } El::Device get_device_allocation() const override { return Device; } description get_description() const override { - auto&& desc = Layer::get_description(); + auto desc = data_type_layer::get_description(); desc.add("Negative slope", m_negative_slope); return desc; } protected: - void setup_dims() override { - Layer::setup_dims(); - set_output_dims(get_input_dims()); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); } void fp_compute() override; void bp_compute() override; private: /** Function slope in negative region. 
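* With the default negative_slope of 0.01, for example, the layer maps * -5 to -0.05 while leaving 5 unchanged.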
*/ - DataType m_negative_slope; + TensorDataType m_negative_slope; +#ifdef LBANN_HAS_DISTCONV + protected: + bool is_distconv_supported() const override { + return Device == El::Device::GPU && Layout == data_layout::DATA_PARALLEL; + } + void setup_distconv_adapter() override { + this->get_distconv_adapter_ptr() = make_unique>(*this); + } + leaky_relu_distconv_adapter& get_distconv_adapter() override; + const leaky_relu_distconv_adapter& get_distconv_adapter() const override; +#endif // LBANN_HAS_DISTCONV }; +#ifdef LBANN_HAS_DISTCONV +template +leaky_relu_distconv_adapter& +leaky_relu_layer::get_distconv_adapter() { + return const_cast&>( + static_cast&>(*this).get_distconv_adapter()); +} + +template +const leaky_relu_distconv_adapter& +leaky_relu_layer::get_distconv_adapter() const { + return dynamic_cast&>( + data_type_layer::get_distconv_adapter()); +} + +template +void leaky_relu_distconv_adapter:: +setup_distributions(tensor_overlap_constraints &constraints) { + data_type_distconv_adapter::setup_distributions( + constraints); + + auto &x = this->get_prev_activations_dist(); + auto &y = this->get_activations_dist(); + auto &dx = this->get_error_signals_dist(); + auto &dy = this->get_prev_error_signals_dist(); + + // x == y + constraints.mark_equivalent(x, y); + // x == dx + constraints.mark_equivalent(x, dx); + // dx == dy + constraints.mark_equivalent(dx, dy); +} + +template +void leaky_relu_distconv_adapter::setup_layer( + size_t workspace_capacity) { + m_leaky_relu = make_unique(dc::get_backend()); +} +#endif // LBANN_HAS_DISTCONV + +#ifndef LBANN_LEAKY_RELU_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class leaky_relu_layer; \ + extern template class leaky_relu_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_LEAKY_RELU_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_ACTIVATIONS_LEAKY_RELU_HPP_INCLUDED diff --git a/include/lbann/layers/activations/log_softmax.hpp b/include/lbann/layers/activations/log_softmax.hpp index 136edf89600..669370f816a 100644 --- a/include/lbann/layers/activations/log_softmax.hpp +++ b/include/lbann/layers/activations/log_softmax.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_ACTIVATIONS_LOG_SOFTMAX_HPP_INCLUDED #define LBANN_LAYERS_ACTIVATIONS_LOG_SOFTMAX_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" #include "lbann/utils/cudnn.hpp" namespace lbann { @@ -36,19 +36,28 @@ namespace lbann { * * @f[ \log \text{softmax}(x)_i = x_i - \log \sum_j e^{x_j} @f] */ -template -class log_softmax_layer : public Layer { +template +class log_softmax_layer : public data_type_layer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} + public: log_softmax_layer(lbann_comm *comm) - : Layer(comm) + : data_type_layer(comm) #ifdef LBANN_HAS_CUDNN , m_tensors_cudnn_desc(this) #endif // LBANN_HAS_CUDNN {} log_softmax_layer(const log_softmax_layer& other) - : Layer(other), + : data_type_layer(other), m_workspace(other.m_workspace ? other.m_workspace->Copy() : nullptr) #ifdef LBANN_HAS_CUDNN @@ -61,7 +70,7 @@ class log_softmax_layer : public Layer { } log_softmax_layer& operator=(const log_softmax_layer& other) { - Layer::operator=(other); + data_type_layer::operator=(other); m_workspace.reset(other.m_workspace ? 
other.m_workspace->Copy() : nullptr); #ifdef LBANN_HAS_CUDNN @@ -78,16 +87,16 @@ class log_softmax_layer : public Layer { data_layout get_data_layout() const override { return Layout; } El::Device get_device_allocation() const override { return Device; } - void setup_dims() override { - Layer::setup_dims(); - set_output_dims(get_input_dims()); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); } void setup_matrices(const El::Grid& grid) override { - Layer::setup_matrices(grid); - auto dist = get_prev_activations().DistData(); + data_type_layer::setup_matrices(grid); + auto dist = this->get_prev_activations().DistData(); dist.colDist = El::STAR; - m_workspace.reset(AbsDistMat::Instantiate(dist)); + m_workspace.reset(AbsDistMatrixType::Instantiate(dist)); #ifdef HYDROGEN_HAVE_CUB if (m_workspace->GetLocalDevice() == El::Device::GPU) { m_workspace->Matrix().SetMemoryMode(1); // CUB memory pool @@ -96,8 +105,8 @@ class log_softmax_layer : public Layer { } void fp_setup_outputs(El::Int mini_batch_size) override { - Layer::fp_setup_outputs(mini_batch_size); - const auto& dist_data = get_prev_activations().DistData(); + data_type_layer::fp_setup_outputs(mini_batch_size); + const auto& dist_data = this->get_prev_activations().DistData(); m_workspace->Empty(false); m_workspace->AlignWith(dist_data); m_workspace->Resize(1, mini_batch_size); @@ -106,18 +115,32 @@ class log_softmax_layer : public Layer { void fp_compute() override; void bp_compute() override; + template + friend void fp_compute_impl(log_softmax_layer& l); + template + friend void bp_compute_impl(log_softmax_layer& l); + private: /** Workspace for column-wise reductions. */ - std::unique_ptr m_workspace; + std::unique_ptr m_workspace; #ifdef LBANN_HAS_CUDNN /** Tensor cuDNN descriptors. */ - cudnn::data_parallel_layer_tensor_manager m_tensors_cudnn_desc; + cudnn::data_parallel_layer_tensor_manager m_tensors_cudnn_desc; #endif // LBANN_HAS_CUDNN }; +#ifndef LBANN_LOG_SOFTMAX_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class log_softmax_layer; \ + extern template class log_softmax_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_LOG_SOFTMAX_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_ACTIVATIONS_LOG_SOFTMAX_HPP_INCLUDED diff --git a/include/lbann/layers/activations/relu.hpp b/include/lbann/layers/activations/relu.hpp new file mode 100644 index 00000000000..f95c663ac86 --- /dev/null +++ b/include/lbann/layers/activations/relu.hpp @@ -0,0 +1,131 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYER_ACTIVATION_RELU_HPP_INCLUDED +#define LBANN_LAYER_ACTIVATION_RELU_HPP_INCLUDED + +#include "lbann/layers/data_type_layer.hpp" +#include "lbann/utils/distconv.hpp" + +namespace lbann { + +#ifdef LBANN_HAS_DISTCONV +template +class relu_distconv_adapter: public data_type_distconv_adapter { + public: + using TensorDevType = typename data_type_distconv_adapter::TensorDevType; + relu_distconv_adapter(Layer& layer): data_type_distconv_adapter(layer) {} + virtual ~relu_distconv_adapter() = default; + void setup_distributions(tensor_overlap_constraints &constraints) override; + void setup_layer(size_t workspace_capacity) override; + std::unique_ptr m_relu; +}; +#endif // LBANN_HAS_DISTCONV + +/** Rectified linear unit activation function layer. + * \f[ ReLU(x) = \text{max}(x, 0) \f] + * See https://en.wikipedia.org/wiki/Rectifier_(neural_networks) + */ +template +class relu_layer : public data_type_layer { +public: + relu_layer(lbann_comm *comm) : data_type_layer(comm) {} + relu_layer* copy() const override { return new relu_layer(*this); } + std::string get_type() const override { return "ReLU"; } + data_layout get_data_layout() const override { return T_layout; } + El::Device get_device_allocation() const override { return Dev; } + +protected: + void fp_compute() override; + void bp_compute() override; +#ifdef LBANN_HAS_DISTCONV + bool is_distconv_supported() const override { + return Dev == El::Device::GPU && T_layout == data_layout::DATA_PARALLEL; + } + void setup_distconv_adapter() override { + this->get_distconv_adapter_ptr() = make_unique>(*this); + } + relu_distconv_adapter& get_distconv_adapter() override; + const relu_distconv_adapter& get_distconv_adapter() const override; +#endif // LBANN_HAS_DISTCONV +}; + +#ifdef LBANN_HAS_DISTCONV +template +relu_distconv_adapter& +relu_layer::get_distconv_adapter() { + return const_cast&>( + static_cast&>(*this).get_distconv_adapter()); +} + +template +const relu_distconv_adapter& +relu_layer::get_distconv_adapter() const { + return dynamic_cast&>( + data_type_layer::get_distconv_adapter()); +} + +template +void relu_distconv_adapter:: +setup_distributions(tensor_overlap_constraints &constraints) { + data_type_distconv_adapter::setup_distributions( + constraints); + + auto &x = this->get_prev_activations_dist(); + auto &y = this->get_activations_dist(); + auto &dx = this->get_error_signals_dist(); + auto &dy = this->get_prev_error_signals_dist(); + + // x == dx + constraints.mark_equivalent(x, dx); + // y == dy + constraints.mark_equivalent(y, dy); +} + +template +void relu_distconv_adapter::setup_layer( + size_t workspace_capacity) { + m_relu = make_unique(dc::get_backend()); + m_relu->setup(this->get_prev_activations(), + this->get_activations(), + this->get_error_signals(), + this->get_prev_error_signals()); +} +#endif // LBANN_HAS_DISTCONV + +#ifndef LBANN_RELU_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class relu_layer; \ + extern template class relu_layer + +#include 
"lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_RELU_LAYER_INSTANTIATE + +} // namespace lbann + +#endif // LBANN_LAYER_ACTIVATION_RELU_HPP_INCLUDED diff --git a/include/lbann/layers/activations/softmax.hpp b/include/lbann/layers/activations/softmax.hpp index 665323c3c14..0a3a4c9917a 100644 --- a/include/lbann/layers/activations/softmax.hpp +++ b/include/lbann/layers/activations/softmax.hpp @@ -27,35 +27,91 @@ #ifndef LBANN_LAYERS_ACTIVATIONS_SOFTMAX_HPP_INCLUDED #define LBANN_LAYERS_ACTIVATIONS_SOFTMAX_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" #include "lbann/utils/cudnn.hpp" +#include "lbann/utils/distconv.hpp" // Threshold outputs to a minimum value. + // If enabled, the minimum output value is sqrt(min), where min is the // minimum, normalized, positive value (~1e-19 for float and ~1e-154 -// for double). The gradients w.r.t. input will be inaccurate, on the -// order of the minimum output value. -#define LBANN_ENABLE_SOFTMAX_CUTOFF +// for double). During backprop, gradients are computed as if +// thresholding did not occur, so there will be a discrepancy for +// values that are thresholded. +#define LBANN_ENABLE_SOFTMAX_THRESHOLD namespace lbann { -/** @brief - * +/** @brief Which tensor dimensions to apply softmax over. */ +enum class softmax_mode { + INVALID, + /** @brief Sample-wise softmax. + * + * Slice tensor along the sample dimension (assuming data in NCHW + * format) and apply softmax independently to each slice (once per + * sample). + */ + INSTANCE, + /** @brief Position-wise softmax. + * + * Split tensor along all but the channel dimension (assuming data + * in NCHW format) and apply softmax independently to each piece + * (once per spatial position per sample). + * + * This is not to be confused with @c channelwise_softmax, which + * slices along the sample and channel dimensions. + */ + CHANNEL +}; + +#ifdef LBANN_HAS_DISTCONV +template +class softmax_distconv_adapter: public data_type_distconv_adapter { + public: + using TensorDevType = typename data_type_distconv_adapter::TensorDevType; + + softmax_distconv_adapter(Layer& layer): data_type_distconv_adapter(layer) {} + virtual ~softmax_distconv_adapter() = default; + + void setup_distributions(tensor_overlap_constraints &constraints) override; + void setup_layer(size_t workspace_capacity) override; + + std::unique_ptr m_softmax; +}; +#endif // LBANN_HAS_DISTCONV + +/** * @f[ \text{softmax}(x)_i = \frac{e^{x_i}}{\sum_j e^{x_j}} @f] */ -template -class softmax_layer : public Layer { +template +class softmax_layer : public data_type_layer { public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} - softmax_layer(lbann_comm *comm) - : Layer(comm) +public: + + softmax_layer(lbann_comm *comm, + softmax_mode mode) + : data_type_layer(comm), + m_mode(mode) #ifdef LBANN_HAS_CUDNN , m_tensors_cudnn_desc(this) #endif // LBANN_HAS_CUDNN - {} + { + if(mode == softmax_mode::INVALID) { + LBANN_ERROR("invalid softmax mode"); + } + } softmax_layer(const softmax_layer& other) - : Layer(other), + : data_type_layer(other), + m_mode(other.m_mode), m_workspace(other.m_workspace ? other.m_workspace->Copy() : nullptr) #ifdef LBANN_HAS_CUDNN @@ -67,17 +123,6 @@ class softmax_layer : public Layer { #endif // LBANN_HAS_CUDNN } - softmax_layer& operator=(const softmax_layer& other) { - Layer::operator=(other); - m_workspace.reset(other.m_workspace ? 
- other.m_workspace->Copy() : nullptr); -#ifdef LBANN_HAS_CUDNN - m_tensors_cudnn_desc = other.m_tensors_cudnn_desc; - m_tensors_cudnn_desc.set_layer(this); -#endif // LBANN_HAS_CUDNN - return *this; - } - ~softmax_layer() = default; softmax_layer* copy() const override { return new softmax_layer(*this); } @@ -85,16 +130,16 @@ class softmax_layer : public Layer { data_layout get_data_layout() const override { return Layout; } El::Device get_device_allocation() const override { return Device; } - void setup_dims() override { - Layer::setup_dims(); - set_output_dims(get_input_dims()); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); } void setup_matrices(const El::Grid& grid) override { - Layer::setup_matrices(grid); - auto dist = get_prev_activations().DistData(); + data_type_layer::setup_matrices(grid); + auto dist = this->get_prev_activations().DistData(); dist.colDist = El::STAR; - m_workspace.reset(AbsDistMat::Instantiate(dist)); + m_workspace.reset(AbsDistMatrixType::Instantiate(dist)); #ifdef HYDROGEN_HAVE_CUB if (m_workspace->GetLocalDevice() == El::Device::GPU) { m_workspace->Matrix().SetMemoryMode(1); // CUB memory pool @@ -103,8 +148,8 @@ class softmax_layer : public Layer { } void fp_setup_outputs(El::Int mini_batch_size) override { - Layer::fp_setup_outputs(mini_batch_size); - const auto& dist_data = get_prev_activations().DistData(); + data_type_layer::fp_setup_outputs(mini_batch_size); + const auto& dist_data = this->get_prev_activations().DistData(); m_workspace->Empty(false); m_workspace->AlignWith(dist_data); m_workspace->Resize(1, mini_batch_size); @@ -113,18 +158,114 @@ class softmax_layer : public Layer { void fp_compute() override; void bp_compute() override; + template + friend void fp_compute_impl(softmax_layer& l); + template + friend void bp_compute_impl(softmax_layer& l); + private: + /** Softmax mode. */ + const softmax_mode m_mode; + /** Workspace for column-wise reductions. */ - std::unique_ptr m_workspace; + std::unique_ptr m_workspace; #ifdef LBANN_HAS_CUDNN /** Tensor cuDNN descriptors. 
*/ - cudnn::data_parallel_layer_tensor_manager m_tensors_cudnn_desc; + cudnn::data_parallel_layer_tensor_manager m_tensors_cudnn_desc; #endif // LBANN_HAS_CUDNN +// Minimum output value to avoid denormalized floats +#ifdef LBANN_ENABLE_SOFTMAX_THRESHOLD + const TensorDataType threshold_val = static_cast(El::Sqrt(std::numeric_limits::min())); +#else + const TensorDataType threshold_val = El::TypeTraits::Zero(); +#endif // LBANN_ENABLE_SOFTMAX_THRESHOLD + +#ifdef LBANN_HAS_DISTCONV + friend class softmax_distconv_adapter; + protected: + bool is_distconv_supported() const override { + return Device == El::Device::GPU && Layout == data_layout::DATA_PARALLEL; + } + void setup_distconv_adapter() override { + this->get_distconv_adapter_ptr() = make_unique>(*this); + } + softmax_distconv_adapter& get_distconv_adapter() override; + const softmax_distconv_adapter& get_distconv_adapter() const override; +#endif // LBANN_HAS_DISTCONV }; +#ifdef LBANN_HAS_DISTCONV +template +softmax_distconv_adapter& +softmax_layer::get_distconv_adapter() { + return const_cast&>( + static_cast&>(*this).get_distconv_adapter()); +} + +template +const softmax_distconv_adapter& +softmax_layer::get_distconv_adapter() const { + return dynamic_cast&>( + data_type_layer::get_distconv_adapter()); +} + +template +void softmax_distconv_adapter:: +setup_distributions(tensor_overlap_constraints &constraints) { + data_type_distconv_adapter::setup_distributions( + constraints); + // No overlap supported yet + for (auto &d: this->m_prev_activations_dists) { + d.clear_overlap(); + constraints.mark_updated(d); + constraints.mark_invariant(d); + } + for (auto &d: this->m_activations_dists) { + d.clear_overlap(); + constraints.mark_updated(d); + constraints.mark_invariant(d); + } + for (auto &d: this->m_prev_error_signals_dists) { + d.clear_overlap(); + constraints.mark_updated(d); + constraints.mark_invariant(d); + } + for (auto &d: this->m_error_signals_dists) { + d.clear_overlap(); + constraints.mark_updated(d); + constraints.mark_invariant(d); + } +} + +template +void softmax_distconv_adapter::setup_layer( + size_t workspace_capacity) { + auto &l = dynamic_cast&>( + this->layer()); + m_softmax = make_unique(dc::get_backend()); + auto mode = l.m_mode == softmax_mode::INSTANCE ? + ::distconv::SoftmaxMode::INSTANCE : + ::distconv::SoftmaxMode::CHANNEL; + m_softmax->setup(this->get_prev_activations(), mode); +} +#endif // LBANN_HAS_DISTCONV + + +LBANN_DEFINE_LAYER_BUILDER(softmax); + +#ifndef LBANN_SOFTMAX_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class softmax_layer; \ + extern template class softmax_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_SOFTMAX_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_ACTIVATIONS_SOFTMAX_HPP_INCLUDED diff --git a/include/lbann/layers/data_type_distconv_adapter.hpp b/include/lbann/layers/data_type_distconv_adapter.hpp new file mode 100644 index 00000000000..a120965ad67 --- /dev/null +++ b/include/lbann/layers/data_type_distconv_adapter.hpp @@ -0,0 +1,163 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. 
For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_DATA_TYPE_DISTCONV_ADAPTER_HPP_INCLUDED +#define LBANN_LAYERS_DATA_TYPE_DISTCONV_ADAPTER_HPP_INCLUDED + +#include "lbann/layers/distconv_adapter.hpp" +#include "lbann/layers/layer.hpp" + +namespace lbann { + +template +class data_type_distconv_adapter: public distconv_adapter { +public: + using TensorDevType = dc::TensorDev; + using TensorShufflerType = dc::TensorShuffler; + + data_type_distconv_adapter(Layer& layer): distconv_adapter(layer) {} + virtual ~data_type_distconv_adapter() = default; + + /** Get activation tensor corresponding to child layer. */ + const TensorDevType& get_activations(const Layer& child) const override; + /** Get error signal tensor corresponding to parent layer. */ + const TensorDevType& get_error_signals(const Layer& parent) const override; + + /** Get activation tensor. */ + const TensorDevType& get_activations(int child_index = 0) const; + /** Get activation tensor. */ + TensorDevType& get_activations(int child_index = 0); + /** Get original activation tensor. */ + const TensorDevType& get_original_activations(int child_index = 0) const; + /** Get original activation tensor. */ + TensorDevType& get_original_activations(int child_index = 0); + + /** Get previous activation tensor. */ + const TensorDevType& get_prev_activations(int parent_index = 0) const; + /** Get previous activation tensor. */ + TensorDevType& get_prev_activations(int parent_index = 0); + /** Get original previous activation tensor. */ + const TensorDevType& get_original_prev_activations(int parent_index = 0) const; + /** Get original previous activation tensor. */ + TensorDevType& get_original_prev_activations(int parent_index = 0); + + /** Get error signal tensor. */ + const TensorDevType& get_error_signals(int parent_index = 0) const; + /** Get error signal tensor. */ + TensorDevType& get_error_signals(int parent_index = 0); + /** Get original error signal tensor. */ + const TensorDevType& get_original_error_signals(int parent_index = 0) const; + /** Get original error signal tensor. */ + TensorDevType& get_original_error_signals(int parent_index = 0); + + /** Get previous error siganl tensor. */ + const TensorDevType& get_prev_error_signals(int child_index = 0) const; + /** Get previous error siganl tensor. */ + TensorDevType& get_prev_error_signals(int child_index = 0); + /** Get original previous error signal tensor. */ + const TensorDevType& get_original_prev_error_signals(int child_index = 0) const; + /** Get original previous error signal tensor. 
*/ + TensorDevType& get_original_prev_error_signals(int child_index = 0); + + void fp_setup(El::Int mini_batch_size) override; + void fp_postprocess() override; + void bp_setup(El::Int mini_batch_size) override; + void bp_postprocess() override; + + void dump_activations() const override; + void dump_original_activations() override; + void dump_error_signals() const override; + void dump_original_error_signals() override; + + protected: + // Setup fp tensors + void setup_prev_activations() override; + virtual std::unique_ptr setup_prev_activations_i(int index) const; + void setup_original_prev_activations() override; + virtual std::unique_ptr setup_original_prev_activations_i(int index) const; + void setup_activations() override; + virtual std::unique_ptr setup_activations_i(int index) const; + void setup_original_activations() override; + virtual std::unique_ptr setup_original_activations_i(int index) const; + + // Setup bp tensors + void setup_prev_error_signals() override; + virtual std::unique_ptr setup_prev_error_signals_i(int index) const; + void setup_original_prev_error_signals() override; + virtual std::unique_ptr setup_original_prev_error_signals_i(int index) const; + void setup_error_signals() override; + virtual std::unique_ptr setup_error_signals_i(int index) const; + void setup_original_error_signals() override; + virtual std::unique_ptr setup_original_error_signals_i(int index) const; + + virtual dc::Shape get_prev_activations_shape(int input_index=0) const; + virtual dc::Shape get_prev_activations_local_shape(int input_index=0) const; + virtual dc::Shape get_activations_shape(int index=0) const; + virtual dc::Shape get_activations_local_shape(int index=0) const; + + virtual dc::Shape get_prev_error_signals_shape(int index=0) const; + virtual dc::Shape get_prev_error_signals_local_shape(int index=0) const; + virtual dc::Shape get_error_signals_shape(int index=0) const; + virtual dc::Shape get_error_signals_local_shape(int index=0) const; + + void ensure_prev_activations() override; + void copy_out_activations() override; + void ensure_prev_error_signals() override; + void copy_out_error_signals() override; + + TensorShufflerType& get_prev_activations_shuffler( + const TensorDevType &src, const TensorDevType &dst); + TensorShufflerType& get_activations_shuffler( + const TensorDevType &src, const TensorDevType &dst); + TensorShufflerType& get_prev_error_signals_shuffler( + const TensorDevType &src, const TensorDevType &dst); + TensorShufflerType& get_error_signals_shuffler( + const TensorDevType &src, const TensorDevType &dst); + + private: + std::vector> m_inputs; + std::vector> m_original_inputs; + std::vector> m_outputs; + std::vector> m_original_outputs; + + std::vector> m_gradient_wrt_inputs; + std::vector> m_original_gradient_wrt_inputs; + std::vector> m_gradient_wrt_outputs; + std::vector> m_original_gradient_wrt_outputs; + + // TODO: Use unique_ptr + std::array m_prev_activations_shufflers{ {nullptr, nullptr, nullptr, nullptr} }; + std::array m_activations_shufflers{ {nullptr, nullptr, nullptr, nullptr} }; + std::array m_prev_error_signals_shufflers{ {nullptr, nullptr, nullptr, nullptr} }; + std::array m_error_signals_shufflers{ {nullptr, nullptr, nullptr, nullptr} }; + + void set_activations_outermost_dimension(size_t dim); + void set_error_signals_outermost_dimension(size_t dim); +}; + +} // namespace lbann + +#endif // LBANN_LAYERS_DATA_TYPE_DISTCONV_ADAPTER_HPP_INCLUDED diff --git a/include/lbann/layers/data_type_layer.hpp b/include/lbann/layers/data_type_layer.hpp 
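The extern-template idiom used by the PROTO / PROTO_DEVICE macros throughout these headers keeps each layer's template definitions out of every translation unit and instantiates them once in a source file. A generic sketch of the same pattern, with made-up names (my_layer, MY_LAYER_INSTANTIATE), not the actual LBANN macros:

// my_layer.hpp
template <typename TensorDataType>
class my_layer {
public:
  TensorDataType scale(TensorDataType x) const { return x * TensorDataType(2); }
};

#ifndef MY_LAYER_INSTANTIATE
// Headers only declare the instantiations, so including this file is cheap.
extern template class my_layer<float>;
extern template class my_layer<double>;
#endif // MY_LAYER_INSTANTIATE

// my_layer.cpp
#define MY_LAYER_INSTANTIATE
#include "my_layer.hpp"
// The one place the template code is actually compiled.
template class my_layer<float>;
template class my_layer<double>;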
new file mode 100644 index 00000000000..2c363ccef21 --- /dev/null +++ b/include/lbann/layers/data_type_layer.hpp @@ -0,0 +1,394 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_DATA_TYPE_LAYER_HPP_INCLUDED +#define LBANN_LAYERS_DATA_TYPE_LAYER_HPP_INCLUDED + +#include "lbann/layers/layer.hpp" +#include "lbann/weights/weights_proxy.hpp" + +#include "lbann/utils/h2_tmp.hpp" + +#ifdef LBANN_HAS_DISTCONV +#include "lbann/layers/data_type_distconv_adapter.hpp" +#include +#include +#include +#endif // LBANN_HAS_DISTCONV + +namespace lbann { + +// Forward declarations +namespace cudnn { +template +class data_parallel_layer_tensor_manager; +template +class entrywise_layer_tensor_manager; +} + +using supported_layer_data_type = h2::meta::TL< +#ifdef LBANN_HAS_GPU_FP16 + fp16, +#endif +#ifdef LBANN_HAS_HALF + cpu_fp16, +#endif + float, double>; + +template +class data_type_layer : public Layer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The proxy tensor type expected in this object. */ + template + using AbsDistMatReadProxyType = El::AbstractDistMatrixReadDeviceProxy; + + /** @brief The local tensor type expected in this object. */ + using AbsMatrixType = El::AbstractMatrix; + + /** @brief The proxy type for weights used by this object. */ + using WeightsProxyType = weights_proxy; + + ///@} + +public: + static_assert( + h2::meta::tlist::MemberV(), + "Must use a supported type."); + + data_type_layer(lbann_comm *comm, bool persistent_error_signals=false) + : Layer(comm), m_persistent_error_signals{persistent_error_signals} {} + data_type_layer(const data_type_layer& other); + data_type_layer& operator=(const data_type_layer& other); + virtual ~data_type_layer() = default; + + /** Get a string representing the layer datatype + */ + std::string get_datatype_name() const override { + return TypeName(); + }; + + /** Forward propagation step. + * Apply a mathematical operation to input tensors to obtain output + * tensors. + */ + void forward_prop() final; + + void summarize_matrices(lbann_summary& summarizer, int step) override; + + /** Check that the setup is reasonable. 
*/ + void check_setup() override; + + // =========================================================== + // Public Tensor access functions + // =========================================================== + + /** Get activation tensor corresponding to child layer. */ + const BaseDistMat& get_activations(const Layer& child) const override; + /** Get error signal tensor corresponding to parent layer. */ + const BaseDistMat& get_error_signals(const Layer& parent) const override; + + /** Get activation tensor. */ + AbsDistMatrixType& get_activations(int child_index = 0); + /** Get error signal tensor. */ + AbsDistMatrixType& get_error_signals(int parent_index = 0); + /** Get activation tensor. */ + const AbsDistMatrixType& get_activations(int child_index = 0) const; + /** Get error signal tensor. */ + const AbsDistMatrixType& get_error_signals(int parent_index = 0) const; + + /** Get local portion of activation tensor. */ + AbsMatrixType& get_local_activations(int child_index = 0); + /** Get local portion of error signal tensor. */ + AbsMatrixType& get_local_error_signals(int parent_index = 0); + /** Get local portion of activation tensor. */ + const AbsMatrixType& get_local_activations(int child_index = 0) const; + /** Get local portion of error signal tensor. */ + const AbsMatrixType& get_local_error_signals(int parent_index = 0) const; + + /** @brief Set whether to keep or dynamically reallocate error signals. + * + * Passing a value of @c true means to keep the error signals; @c + * false means to dynamically reallocate them. + */ + void set_keep_error_signals(bool) override; + +protected: + + // =========================================================== + // Protected Tensor access functions + // =========================================================== + + /** Get previous activation tensor. */ + const AbsDistMatrixType& get_prev_activations(int parent_index = 0) const; + /** Get previous error signal tensor. */ + const AbsDistMatrixType& get_prev_error_signals(int child_index = 0) const; + + /** Get local portion of previous activation tensor. */ + const AbsMatrixType& get_local_prev_activations(int parent_index = 0) const; + /** Get local portion of previous error signal tensor. */ + const AbsMatrixType& get_local_prev_error_signals(int child_index = 0) const; + +protected: + + // =========================================================== + // Setup helper functions + // =========================================================== + + /** Setup distributed matrices. + * Called by the 'setup' function. Each column of these distributed + * matrices is interpreted as the flattened tensor for a mini-batch + * sample. The matrices themselves are constructed by calling the + * 'construct_matrix' function. If any matrices have already been + * setup, they are destroyed and reinstantiated. + */ + void setup_matrices(const El::Grid& grid) override; + + /** Setup layer data. + * Called by the 'setup' function. Memory is allocated for + * distributed matrices. + */ + void setup_data(size_t max_mini_batch_size) override; + + // =========================================================== + // Forward prop step helper functions + // =========================================================== + + /** Setup input tensors. + * Called by the 'forward_prop' function. Each input tensor is + * setup as a view or copy of the corresponding parent layer's + * output tensor. + */ + void fp_setup_inputs(El::Int mini_batch_size) override; + /** Setup output tensors. + * Called by the 'forward_prop' function. 
Each output tensor is + * resized to match the mini-batch size. + */ + void fp_setup_outputs(El::Int mini_batch_size) override; + + // =========================================================== + // Back prop step helper functions + // =========================================================== + + /** Setup gradient w.r.t. input tensors. + * Called by the 'back_prop' function. Each gradient w.r.t. input + * tensor is resized to match the mini-batch size. + */ + void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) override; + /** Compute objective funciton gradients. + * Called by the 'back_prop' function. Given the input, output, and + * gradient w.r.t. output tensors, the gradient w.r.t. input + * tensors are populated with the computed values and the gradients + * w.r.t. the weights are sent to the appropriate optimizers. + */ + void bp_compute() override; + + // =========================================================== + // Protected Weights access functions + // =========================================================== + + /** @brief Get the values matrix for a specific weights object */ + AbsDistMatrixType const& weights_values(size_t idx) const { + if (idx >= m_weights_proxy.size()) + LBANN_ERROR("Bad index ", idx, " " + "(size=" , m_weights_proxy.size(), ")"); + return m_weights_proxy[idx].values(); + } + + /** @brief Get a specific master weights object. + * + * This is sufficient for setting or accessing metadata about the + * weights class. + */ + weights& master_weights(size_t idx) { + return get_weights(idx); + } + weights const& master_weights(size_t idx) const { + return get_weights(idx); + } + +private: + + void setup_weights(size_t idx, weights& w) override; + + /** @brief Attempt to take ownership of the previous error signal. + * + * If the underlying matrix has the right datatype and + * distribution, the signal is moved explicitly. Otherwise a deep + * copy is made so that it has the correct datatype and + * distribution. + * + * This is valid if the child layer does not have persistent error + * signals. + * + * @param child The layer from which the error signal has come. + * @param signal The error signal from the layer. + */ + void move_or_copy_prev_error_signal_( + const Layer& child, + std::unique_ptr signal) final; + + /** @brief Attempt to view the previous error signal. + * + * If the underlying matrix has the right datatype and + * distribution, the signal can be viewed directly. Otherwise a + * deep copy is made so that it has the correct datatype and + * distribution. + * + * This is only valid if the child layer has persistent error + * signals. Otherwise, the viewed data my be invalidated. + * + * @param child The layer from which the error signal has come. + * @param signal The error signal from the layer. + */ + void view_or_copy_prev_error_signal_( + const Layer& child, + const El::BaseDistMatrix& signal) final; + + /** @brief Deep copy the error signal. + * + * In some cases, it can be determined that neither viewing nor + * moving is a possibility. In these cases, we must do a deep copy. + * + * @param child The layer from which the error signal has come. + * @param signal The error signal from the layer. + */ + void deep_copy_prev_error_signal_( + const Layer& child, + const El::BaseDistMatrix& signal) final; + + /** @brief Ensure that gradient matrices exist. + * + * This step is performed immediately prior to the bp_compute() + * work. 
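Aside on the error-signal ownership helpers declared above (move_or_copy_prev_error_signal_, view_or_copy_prev_error_signal_, deep_copy_prev_error_signal_): per the comments, the cheap path is taken only when the incoming matrix already has this layer's datatype and distribution, and otherwise a deep copy converts it. A generic, self-contained sketch of that decision follows; BaseT and DerivedT stand in for the patch's El::BaseDistMatrix and El::AbstractDistMatrix<TensorDataType>, and the deep-copy callable is left to the caller, so this is an illustration rather than the patch's implementation.

#include <memory>

// Move when the concrete type already matches; otherwise fall back to a
// caller-supplied deep copy that performs the datatype/distribution conversion.
template <typename DerivedT, typename BaseT, typename CopyFn>
std::unique_ptr<DerivedT>
take_or_copy_sketch(std::unique_ptr<BaseT> signal, CopyFn deep_copy) {
  if (auto* typed = dynamic_cast<DerivedT*>(signal.get())) {
    signal.release();                 // adopt the existing matrix, no data movement
    return std::unique_ptr<DerivedT>(typed);
  }
  return deep_copy(*signal);          // convert via a deep copy
}

In the patch itself there is the further wrinkle that even a type match may require a copy when the distribution differs or when the child layer keeps persistent error signals, as the surrounding comments describe.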
+ */ + void allocate_new_gradients_() final; + + /** @brief Send error signals computed by this layer to their + * respective parents. + * + * This step is performed immediately after the bp_compute() work + * and prior to clearing the previous error signals. This ordering + * is necessary in case this layer's error signals are views into + * the previous error signals. + */ + void propagate_error_signals_to_parents_() final; + + /** @brief Free previous error signals, if possible. + * + * This step is performed at the end of a layer's backprop phase. + */ + void clear_prev_error_signals_() final; + + /** Backward propagation step. + * Given the objective function gradients w.r.t. the output + * tensors, compute the gradients w.r.t. the input tensors and + * w.r.t. the weights. This is essentially an application of the + * chain rule. + */ + void back_prop_impl_() final; + + // =========================================================== + // Private class members + // =========================================================== + + /** @brief Persistent, read-only, proxied views of the weights + * values matrix. + * + * @note (trb 05/28/2020): These are kept as members out of + * consideration for the case where accessing them could require a + * deep copy. This is more out of my own concern about ways in + * which derived classes could abuse weights; in theory, I believe, + * you could just create these on the fly once during FP and once + * during BP. Then the question is: does the performance cost of + * (potentially) two(ish) copies or the memory cost of storing an + * additional copy of the weights hurt more? + */ + std::vector m_weights_proxy; + + /** Input tensors. + * Each matrix column corresponds to a flattened mini-batch sample. + */ + std::vector> m_inputs; + /** Output tensors. + * Each matrix column corresponds to a flattened mini-batch sample. + */ + std::vector> m_outputs; + /** Objective function gradients w.r.t. the output tensors. + * Each matrix column corresponds to a flattened mini-batch sample. + */ + std::vector> m_gradient_wrt_outputs; + /** Objective function gradients w.r.t. the input tensors. + * Each matrix column corresponds to a flattened mini-batch sample. + */ + std::vector> m_gradient_wrt_inputs; + + /** @brief Whether to keep persistent error signals or dynamically + * allocate/deallocate them. + * + * The default behavior is dynamic allocation. 
+ */ + bool m_persistent_error_signals = false; + +#ifdef LBANN_HAS_DISTCONV + friend class data_type_distconv_adapter; + public: + data_type_distconv_adapter& get_distconv_adapter() override; + const data_type_distconv_adapter& get_distconv_adapter() const override; + + protected: + void setup_distconv_adapter() override; +#endif // LBANN_HAS_DISTCONV + +#ifdef LBANN_HAS_CUDA + template + friend class cudnn::data_parallel_layer_tensor_manager; + template + friend class cudnn::entrywise_layer_tensor_manager; +#endif // LBANN_HAS_CUDA +}; + +#ifndef LBANN_DATA_TYPE_LAYER_INSTANTIATE +#define PROTO(T) \ + extern template class data_type_layer + +#define LBANN_INSTANTIATE_CPU_HALF +#define LBANN_INSTANTIATE_GPU_HALF +#include "lbann/macros/instantiate.hpp" +#undef PROTO +#undef LBANN_INSTANTIATE_CPU_HALF +#undef LBANN_INSTANTIATE_GPU_HALF + +#endif // LBANN_DATA_TYPE_LAYER_INSTANTIATE + +} // namespace lbann + +#endif // LBANN_LAYERS_DATA_TYPE_LAYER_HPP_INCLUDED diff --git a/include/lbann/layers/distconv_adapter.hpp b/include/lbann/layers/distconv_adapter.hpp new file mode 100644 index 00000000000..0b6175ec1e8 --- /dev/null +++ b/include/lbann/layers/distconv_adapter.hpp @@ -0,0 +1,141 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_DISTCONV_ADAPTER_HPP_INCLUDED +#define LBANN_LAYERS_DISTCONV_ADAPTER_HPP_INCLUDED + +#include "lbann/utils/distconv.hpp" + +#include +#include + +namespace lbann { + +class Layer; + +class tensor_overlap_constraints { + public: + using dist_set = std::unordered_set; + using const_dist_set = std::unordered_set; + + tensor_overlap_constraints() = default; + virtual ~tensor_overlap_constraints() = default; + + void mark_equivalent(dc::Dist &d1, dc::Dist &d2); + void mark_updated(const dc::Dist &d); + void mark_invariant(const dc::Dist &d); + + void find_valid_overlap(); + + private: + std::unordered_map m_equivalents; + const_dist_set m_updated; + const_dist_set m_invariants; +}; + +class distconv_adapter { + friend class Layer; + public: + distconv_adapter(Layer& layer); + virtual ~distconv_adapter() = default; + + /** Get activation tensor corresponding to child layer. */ + virtual const dc::AbsTensor& get_activations(const Layer& child) const = 0; + /** Get error signal tensor corresponding to parent layer. 
*/ + virtual const dc::AbsTensor& get_error_signals(const Layer& parent) const = 0; + + virtual void setup_distributions(tensor_overlap_constraints &constraints); + void impose_adjacent_overlap_constraints( + tensor_overlap_constraints &constraints); + + dc::Dist &get_prev_activations_dist(); + const dc::Dist &get_prev_activations_dist() const; + dc::Dist &get_activations_dist(); + const dc::Dist &get_activations_dist() const; + dc::Dist &get_prev_error_signals_dist(); + const dc::Dist &get_prev_error_signals_dist() const; + dc::Dist &get_error_signals_dist(); + const dc::Dist &get_error_signals_dist() const; + + virtual void setup_fp_tensors(); + virtual void setup_bp_tensors(); + + virtual void setup_layer(size_t workspace_capacity) {} + + virtual void fp_setup(El::Int mini_batch_size) = 0; + virtual void fp_postprocess() = 0; + virtual void bp_setup(El::Int mini_batch_size) = 0; + virtual void bp_postprocess() = 0; + + virtual bool parent_copy_required(size_t input_index) const; + virtual bool parent_shuffle_required(size_t input_index) const; + virtual bool child_copy_required(size_t output_index) const; + virtual bool child_shuffle_required(size_t output_index) const; + + virtual void dump_activations() const = 0; + virtual void dump_original_activations()= 0; + virtual void dump_error_signals() const = 0; + virtual void dump_original_error_signals()= 0; + + protected: + virtual Layer& layer(); + virtual const Layer& layer() const; + std::string get_name() const; + + virtual void setup_prev_activations() = 0; + virtual void setup_original_prev_activations() = 0; + virtual void setup_activations() = 0; + virtual void setup_original_activations() = 0; + + virtual void setup_prev_error_signals() = 0; + virtual void setup_original_prev_error_signals() = 0; + virtual void setup_error_signals() = 0; + virtual void setup_original_error_signals() = 0; + + virtual void ensure_prev_activations() = 0; + virtual void copy_out_activations() = 0; + virtual void ensure_prev_error_signals() = 0; + virtual void copy_out_error_signals() = 0; + + std::vector m_prev_activations_dists; + std::vector m_activations_dists; + std::vector m_prev_error_signals_dists; + std::vector m_error_signals_dists; + + private: + Layer& m_layer; + std::vector m_parent_copy_required; + std::vector m_parent_shuffle_required; + std::vector m_child_copy_required; + std::vector m_child_shuffle_required; + + void setup_tensor_shuffle(); + void adjust_parallel_strategy(); +}; + +} // namespace lbann + +#endif // LBANN_LAYERS_DISTCONV_ADAPTER_HPP_INCLUDED diff --git a/include/lbann/layers/image/bilinear_resize.hpp b/include/lbann/layers/image/bilinear_resize.hpp index 2e3e9e9da67..b77fba2e138 100644 --- a/include/lbann/layers/image/bilinear_resize.hpp +++ b/include/lbann/layers/image/bilinear_resize.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_IMAGE_BILINEAR_RESIZE_HPP_INCLUDED #define LBANN_LAYERS_IMAGE_BILINEAR_RESIZE_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -36,14 +36,14 @@ namespace lbann { * Tensors are assumed to be image data in CHW format. Gradients are * not propagated during backprop. 
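Stepping back to the distconv_adapter header that ends above: tensor_overlap_constraints exposes mark_equivalent, mark_updated, mark_invariant, and find_valid_overlap, and each adapter's setup_distributions() is expected to register its distributions through that interface. The call pattern below is inferred from the method names only and is purely illustrative (and assumes an LBANN build with Distconv enabled); the real implementations may differ.

#include "lbann/layers/distconv_adapter.hpp"

namespace lbann_sketch {

// Hypothetical registration for one parent/child connection.
void register_constraints(lbann::tensor_overlap_constraints& constraints,
                          lbann::distconv_adapter& self,
                          lbann::distconv_adapter& parent) {
  // The child's input overlap should agree with the parent's output overlap...
  constraints.mark_equivalent(self.get_prev_activations_dist(),
                              parent.get_activations_dist());
  // ...the parent's side is treated as fixed...
  constraints.mark_invariant(parent.get_activations_dist());
  // ...while the child's side may still be adjusted.
  constraints.mark_updated(self.get_prev_activations_dist());
}

} // namespace lbann_sketch

// Once every adapter has registered, a single constraints.find_valid_overlap()
// call would reconcile the collected requirements.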
*/ -template -class bilinear_resize_layer : public Layer { +template +class bilinear_resize_layer : public data_type_layer { + static_assert(Layout == data_layout::DATA_PARALLEL, + "bilinear_resize_layer only supports DATA_PARALLEL"); public: bilinear_resize_layer(lbann_comm *comm, El::Int height, El::Int width) - : Layer(comm), m_height(height), m_width(width) { - static_assert(Layout == data_layout::DATA_PARALLEL, - "bilinear_resize_layer only supports DATA_PARALLEL"); + : data_type_layer(comm), m_height(height), m_width(width) { } bilinear_resize_layer* copy() const override { @@ -57,17 +57,17 @@ class bilinear_resize_layer : public Layer { protected: - void setup_dims() override { - Layer::setup_dims(); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); // Get input dimensions - auto dims = get_input_dims(); + auto dims = this->get_input_dims(); const auto& num_dims = dims.size(); // Check that dimensions are valid std::stringstream err; if (num_dims < 2) { - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "expects input with at least two dimensions, " << "but input dimensions are "; for (size_t i = 0; i < num_dims; ++i) { @@ -75,12 +75,12 @@ class bilinear_resize_layer : public Layer { } LBANN_ERROR(err.str()); } else if (m_height <= 0) { - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "attempted to resize with " << "negative height (" << m_height << ")"; LBANN_ERROR(err.str()); } else if (m_width <= 0) { - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "attempted to resize with " << "negative width (" << m_width << ")"; LBANN_ERROR(err.str()); @@ -89,7 +89,7 @@ class bilinear_resize_layer : public Layer { // Resize output tensor dims[num_dims-2] = m_height; dims[num_dims-1] = m_width; - set_output_dims(dims); + this->set_output_dims(dims); } @@ -106,6 +106,14 @@ class bilinear_resize_layer : public Layer { }; +#ifndef LBANN_BILINEAR_RESIZE_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class bilinear_resize_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_BILINEAR_RESIZE_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_IMAGE_BILINEAR_RESIZE_HPP_INCLUDED diff --git a/include/lbann/layers/io/input/generic_input_layer.hpp b/include/lbann/layers/io/input/generic_input_layer.hpp index 3dfa79edb79..4c911ec255e 100644 --- a/include/lbann/layers/io/input/generic_input_layer.hpp +++ b/include/lbann/layers/io/input/generic_input_layer.hpp @@ -29,170 +29,129 @@ #include "lbann/layers/io/io_layer.hpp" //#include "lbann/utils/dataset.hpp" +#include "lbann/io/persist.hpp" #include "lbann/io/data_buffers/generic_io_buffer.hpp" #include "lbann/io/data_buffers/partitioned_io_buffer.hpp" #include "lbann/models/model.hpp" -#include "lbann/callbacks/callback_imcomm.hpp" +#include "lbann/callbacks/imcomm.hpp" #include "lbann/utils/omp_diagnostics.hpp" +#include +#include +#include +#include +#include #include namespace lbann { /** @todo Move functionality to input_layer. 
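One mechanical change that recurs throughout the bilinear_resize and input-layer hunks is that member calls such as get_name(), get_input_dims(), and set_output_dims() gain a this-> prefix once the base class becomes a class template (data_type_layer<TensorDataType> or the templated io_layer). A small self-contained example of why that qualification is needed; the names are illustrative only.

// Unqualified names from a dependent base are not found during the first
// phase of two-phase lookup, so they must be accessed through this->.
template <typename T>
struct base_sketch {
  int size() const { return 42; }
};

template <typename T>
struct derived_sketch : base_sketch<T> {
  int doubled() const {
    // return size() * 2;     // error: 'size' was not declared in this scope
    return this->size() * 2;  // OK: lookup is deferred until instantiation
  }
};

int main() { derived_sketch<float> d; return d.doubled() == 84 ? 0 : 1; }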
*/ -class generic_input_layer : public io_layer { +template +class generic_input_layer : public io_layer { public: - using data_reader_map_t = std::map; using io_buffer_map_t = std::map>; public: generic_input_layer(lbann_comm *comm, int num_parallel_readers, - std::map data_readers, - bool data_set_spans_models = true, data_reader_target_mode dr_mode = data_reader_target_mode::CLASSIFICATION) - : io_layer(comm, data_set_spans_models, dr_mode), - m_io_buffers(), - m_training_dataset(), - m_testing_dataset(), - m_validation_dataset(), - m_data_readers(data_readers), - m_data_set_processed(false) { + : io_layer(comm, dr_mode), + m_io_buffers() { //m_data_sets_span_models(data_sets_span_models) { // Input layers have no parents - m_expected_num_parent_layers = 0; + this->m_expected_num_parent_layers = 0; if(dr_mode == data_reader_target_mode::NA) { - m_expected_num_child_layers = 1; + this->m_expected_num_child_layers = 1; }else { // Input layers output a sample and target, which could be the // original value, categorical label, or regression value - m_expected_num_child_layers = 2; + this->m_expected_num_child_layers = 2; } - if(m_data_readers[execution_mode::training] != nullptr) { - m_training_dataset.total_samples() = m_data_readers[execution_mode::training]->get_num_data(); - } + this->m_active_buffer[execution_mode::training].store(-1); + this->m_active_buffer[execution_mode::validation].store(-1); + this->m_active_buffer[execution_mode::testing].store(-1); + } - if(m_data_readers[execution_mode::validation] != nullptr) { - m_validation_dataset.total_samples() = m_data_readers[execution_mode::validation]->get_num_data(); - } + ~generic_input_layer() override { - if(m_data_readers[execution_mode::testing] != nullptr) { - m_testing_dataset.total_samples() = m_data_readers[execution_mode::testing]->get_num_data(); + // Synchronize the I/O thread pool + // Note: The thread pool may still be running asynchronously if the + // trainer is destroyed in the middle of an epoch. The thread pool + // needs to interact with data readers, etc., so it needs to be + // synchronized before any of them are destroyed. + if (this->m_model != nullptr) { + if (this->m_model->has_valid_execution_context()) { + this->m_model->get_execution_context().get_io_thread_pool().reap_threads(); + } } - m_active_buffer[execution_mode::training].store(-1); - m_active_buffer[execution_mode::validation].store(-1); - m_active_buffer[execution_mode::testing].store(-1); - } - - ~generic_input_layer() override { for (auto& io_buffer : m_io_buffers) { delete io_buffer; } - // Input layer always frees data readers. - for (auto& dr : m_data_readers) { - delete dr.second; - } } // Input layers copy their datareaders. 
generic_input_layer(const generic_input_layer& other) - : io_layer(other), - m_io_buffers(other.m_io_buffers), - m_training_dataset(other.m_training_dataset), - m_testing_dataset(other.m_testing_dataset), - m_validation_dataset(other.m_validation_dataset), - m_data_readers(other.m_data_readers) { + : io_layer(other), + m_io_buffers(other.m_io_buffers) { for (auto& io_buffer : m_io_buffers) { io_buffer = io_buffer->copy(); } - for (auto& dr : m_data_readers) { - dr.second = dr.second->copy(); - } } generic_input_layer& operator=(const generic_input_layer& other) { - io_layer::operator=(other); + io_layer::operator=(other); for (auto& io_buffer : m_io_buffers) { io_buffer = io_buffer->copy(); } - for (auto& dr : m_data_readers) { - dr.second = dr.second->copy(); - } return *this; } + /** Archive for checkpoint and restart */ + template void serialize( Archive & ar ) { + // ar(CEREAL_NVP(m_io_buffer)); + } + template - inline void initialize_io_buffer(lbann_comm *comm, int num_parallel_readers, std::map data_readers); + inline void initialize_io_buffer(lbann_comm *comm, int num_parallel_readers) { + m_io_buffers.push_back(new T_io_buffer(comm, num_parallel_readers, this->m_expected_num_child_layers)); + } std::string get_type() const override { return "generic_input"; } description get_description() const override { - auto&& desc = io_layer::get_description(); + auto desc = io_layer::get_description(); desc.add("Buffer", m_io_buffers[0]->get_type()); - desc.add("Background I/O", this->m_model->background_io_activity_allowed()); return desc; } - void setup_dims() override { - io_layer::setup_dims(); - for (int i = 0; i < get_num_children(); ++i) { - set_output_dims(get_data_dims(i), i); + void setup_dims(DataReaderMetaData& dr_metadata) override { + io_layer::setup_dims(dr_metadata); + for (int i = 0; i < this->get_num_children(); ++i) { + this->set_output_dims(get_data_dims(dr_metadata, i), i); } } - void setup_data() override { - io_layer::setup_data(); + void setup_data(size_t max_mini_batch_size) override { + io_layer::setup_data(max_mini_batch_size); // Resize output to maximum mini-batch size - const auto& max_mb_size = this->m_model->get_max_mini_batch_size(); - for (int i = 0; i < get_num_children(); ++i) { - auto& output = get_activations(i); - output.Resize(output.Height(), max_mb_size); - } - - auto num_io_threads = this->m_model->get_io_thread_pool()->get_num_threads(); - /// BVE FIXME foreach data reader - // in case that target_layer gets initialized beforehand - if(m_data_readers[execution_mode::training] != nullptr) { - m_data_readers[execution_mode::training]->setup(num_io_threads, this->m_model->get_io_thread_pool()); - m_data_readers[execution_mode::training]->set_rank(Layer::m_comm->get_rank_in_trainer()); - } - if(m_data_readers[execution_mode::validation] != nullptr) { - m_data_readers[execution_mode::validation]->setup(num_io_threads, this->m_model->get_io_thread_pool()); - m_data_readers[execution_mode::validation]->set_rank(Layer::m_comm->get_rank_in_trainer()); - } - if(m_data_readers[execution_mode::testing] != nullptr) { - m_data_readers[execution_mode::testing]->setup(num_io_threads, this->m_model->get_io_thread_pool()); - m_data_readers[execution_mode::testing]->set_rank(Layer::m_comm->get_rank_in_trainer()); - } - - if(io_layer::m_data_set_spans_models) { - calculate_num_iterations_per_epoch_training_spans_models(max_mb_size); - } else { - calculate_num_iterations_per_epoch_training_unique_per_models(max_mb_size); + for (int i = 0; i < this->get_num_children(); 
++i) { + auto& output = this->get_activations(i); + output.Resize(output.Height(), max_mini_batch_size); } for (auto& io_buffer : m_io_buffers) { int linearized_target_size; - switch(m_data_reader_mode) { - case data_reader_target_mode::REGRESSION: - linearized_target_size = get_linearized_response_size(); - break; - case data_reader_target_mode::RECONSTRUCTION: - linearized_target_size = get_linearized_data_size(); - break; - case data_reader_target_mode::CLASSIFICATION: - linearized_target_size = get_linearized_label_size(); - break; - case data_reader_target_mode::NA: - default: + if(this->get_num_children() > 1) { + linearized_target_size = this->get_output_size(1); + }else { linearized_target_size = 0; } - io_buffer->setup_data(get_output_size(0), + io_buffer->setup_data(this->get_output_size(0), linearized_target_size, - max_mb_size); + max_mini_batch_size); } } @@ -200,29 +159,34 @@ class generic_input_layer : public io_layer { * Sets up the effective (global) mini-batch size. */ void fp_setup_outputs(El::Int mini_batch_size) override { - - // Determine model mini-batch size and effective mini-batch size - // Note: If inter-model communication is activated, the effective - // mini-batch is equal to the global mini-batch size. - /// @todo This functionality should probably be moved elsewhere - mini_batch_size = get_current_mini_batch_size(); - int effective_mini_batch_size = mini_batch_size; - for (auto&& cb : this->m_model->get_callbacks()) { - if (dynamic_cast(cb) != nullptr) { - effective_mini_batch_size = get_current_global_mini_batch_size(); - break; + /// During model setup there is no valid execution context, but + /// during execution there is a context + if(this->m_model->has_valid_execution_context()) { + // Determine model mini-batch size and effective mini-batch size + // Note: If inter-model communication is activated, the effective + // mini-batch is equal to the global mini-batch size. 
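Further down in this file, fp_compute() and fetch_data_in_background() rotate through two generic_io_buffer instances (see the m_active_buffer counters initialized to -1 in the constructor above), so the next mini-batch is fetched on an I/O thread while the current one is consumed. A minimal, self-contained sketch of that double-buffering idea follows; it is not the patch's implementation, just the shape of the pattern.

#include <array>
#include <cstddef>
#include <future>
#include <vector>

struct buffer_sketch { std::vector<float> samples; };

class double_buffered_fetch_sketch {
public:
  // Return the buffer for this step and start filling the other buffer in the
  // background so the next call can usually proceed without waiting.
  buffer_sketch& fetch(std::size_t step) {
    const std::size_t cur = step % 2;
    if (m_pending[cur].valid()) {
      m_pending[cur].get();   // wait for the fill queued on the previous step
    } else {
      fill(cur, step);        // first use of this buffer: fill synchronously
    }
    const std::size_t nxt = (step + 1) % 2;
    m_pending[nxt] = std::async(std::launch::async,
                                [this, nxt, step] { fill(nxt, step + 1); });
    return m_buffers[cur];
  }
private:
  void fill(std::size_t idx, std::size_t step) {
    // Stand-in for a data-reader fetch into the idx-th buffer.
    m_buffers[idx].samples.assign(16, static_cast<float>(step));
  }
  std::array<buffer_sketch, 2> m_buffers{};
  std::array<std::future<void>, 2> m_pending{};
};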
+ /// @todo This functionality should probably be moved elsewhere + mini_batch_size = get_current_mini_batch_size(); + + auto effective_mini_batch_size = mini_batch_size; + for (auto&& cb : this->m_model->get_callbacks()) { + if (dynamic_cast(cb) != nullptr) { + effective_mini_batch_size = get_current_global_mini_batch_size(); + break; + } } - } - // Set mini-batch size in model - this->m_model->set_current_mini_batch_size(mini_batch_size); - this->m_model->set_effective_mini_batch_size(effective_mini_batch_size); + auto& c = static_cast(this->m_model->get_execution_context()); + // Set mini-batch size in model + c.set_current_mini_batch_size(mini_batch_size); + c.set_effective_mini_batch_size(effective_mini_batch_size); + } // Initialize matrices - io_layer::fp_setup_outputs(mini_batch_size); + io_layer::fp_setup_outputs(mini_batch_size); for (auto& io_buffer : m_io_buffers) { - for (int i = 0; i < get_num_children(); ++i) { + for (int i = 0; i < this->get_num_children(); ++i) { io_buffer->fp_setup_data(mini_batch_size, i); } } @@ -230,8 +194,9 @@ class generic_input_layer : public io_layer { void fetch_data_in_background(int future_active_buffer, execution_mode mode) { int active_buffer = future_active_buffer % m_io_buffers.size(); - generic_io_buffer* io_buffer = m_io_buffers[active_buffer]; - std::lock_guard guard(dr_mutex); + generic_io_buffer* io_buffer = m_io_buffers[active_buffer]; + data_coordinator& dc = this->m_model->get_execution_context().get_trainer().get_data_coordinator(); + std::lock_guard guard(dc.dr_mutex); setup_next_io_buffer(io_buffer); io_buffer->fetch_to_local_matrix(get_data_reader(mode), mode); return; @@ -248,16 +213,16 @@ class generic_input_layer : public io_layer { } void fp_compute() override { - execution_mode mode = this->m_model->get_execution_mode(); + execution_mode mode = this->m_model->get_execution_context().get_execution_mode(); increment_active_buffer_idx(mode); - generic_io_buffer* io_buffer = m_io_buffers[get_active_buffer_idx(mode) % m_io_buffers.size()]; + generic_io_buffer* io_buffer = m_io_buffers[get_active_buffer_idx(mode) % m_io_buffers.size()]; // If there is no valid data and there is not already a background // thread to fetch the data, queue up the background thread if(io_buffer->num_samples_ready(mode) == 0 && !io_buffer->is_data_fetched_in_background(mode)) { - std::future background_fetch_done = this->m_model->get_io_thread_pool()->submit_job( + std::future background_fetch_done = this->m_model->get_execution_context().get_io_thread_pool().submit_job( std::bind(&generic_input_layer::fetch_data_in_background, this, get_active_buffer_idx(mode), mode)); io_buffer->set_data_fetch_future(std::move(background_fetch_done), mode); io_buffer->set_fetch_data_in_background(true, mode); @@ -280,36 +245,37 @@ class generic_input_layer : public io_layer { } } - if(dynamic_cast(io_buffer) != nullptr) { + if(dynamic_cast*>(io_buffer) != nullptr) { // Use the predetermined size of the mini-batch to set the current // batch size for the neural network num_samples_in_batch = get_current_mini_batch_size(); update_num_samples_processed(num_samples_in_batch); - if(m_expected_num_child_layers == 1) { - io_buffer->distribute_from_local_matrix(get_data_reader(), mode, get_activations(0)); + if(this->m_expected_num_child_layers == 1) { + io_buffer->distribute_from_local_matrix(get_data_reader(), mode, this->get_activations(0)); }else { - io_buffer->distribute_from_local_matrix(get_data_reader(), mode, get_activations(0), get_activations(1)); + 
io_buffer->distribute_from_local_matrix(get_data_reader(), mode, this->get_activations(0), this->get_activations(1)); } }else { - LBANN_ERROR("could not fp_compute for I/O layers : encoutered generic_io_buffer type"); + LBANN_ERROR("could not fp_compute for I/O layers : encoutered generic_io_buffer type"); } - m_data_set_processed = io_buffer->update_data_set(get_data_reader(mode), mode); + data_coordinator& dc = this->m_model->get_execution_context().get_trainer().get_data_coordinator(); + dc.m_data_set_processed = io_buffer->update_data_set(get_data_reader(mode), mode); - if(!m_data_set_processed && this->m_model->background_io_activity_allowed()) { + if(!dc.m_data_set_processed && this->m_model->get_execution_context().background_io_activity_allowed()) { int next_active_buffer = get_active_buffer_idx(mode) + 1; - std::future background_fetch_done = this->m_model->get_io_thread_pool()->submit_job( + std::future background_fetch_done = this->m_model->get_execution_context().get_io_thread_pool().submit_job( std::bind(&generic_input_layer::fetch_data_in_background, this, next_active_buffer, mode)); - generic_io_buffer* next_io_buffer = m_io_buffers[next_active_buffer % m_io_buffers.size()]; + generic_io_buffer* next_io_buffer = m_io_buffers[next_active_buffer % m_io_buffers.size()]; next_io_buffer->set_data_fetch_future(std::move(background_fetch_done), mode); next_io_buffer->set_fetch_data_in_background(true, mode); } } - void setup_next_io_buffer(generic_io_buffer* io_buffer) { + void setup_next_io_buffer(generic_io_buffer* io_buffer) { int mini_batch_size = get_current_mini_batch_size(); - for (int i = 0; i < get_num_children(); ++i) { + for (int i = 0; i < this->get_num_children(); ++i) { io_buffer->fp_setup_data(mini_batch_size, i); } } @@ -318,7 +284,8 @@ class generic_input_layer : public io_layer { * Once a mini-batch is processed, resuffle the data for the next batch if necessary */ bool update_compute() override { - return m_data_set_processed; + data_coordinator& dc = this->m_model->get_execution_context().get_trainer().get_data_coordinator(); + return dc.m_data_set_processed; } //************************************************************************ @@ -326,26 +293,11 @@ class generic_input_layer : public io_layer { //************************************************************************ generic_data_reader *get_data_reader(const execution_mode mode) const { - generic_data_reader *data_reader = nullptr; - - auto it = m_data_readers.find(mode); - if (it != m_data_readers.end()) data_reader = it->second; - - switch(mode) { - case execution_mode::training: - break; - case execution_mode::validation: - break; - case execution_mode::testing: - break; - default: - LBANN_ERROR("generic data distribution: invalid execution phase"); - } - return data_reader; + return this->m_model->get_execution_context().get_trainer().get_data_coordinator().get_data_reader(mode); } generic_data_reader *get_data_reader() const { - return get_data_reader(this->m_model->get_execution_mode()); + return get_data_reader(this->m_model->get_execution_context().get_execution_mode()); } virtual int get_num_parallel_readers(execution_mode mode) const { @@ -354,7 +306,7 @@ class generic_input_layer : public io_layer { } virtual int get_num_parallel_readers() const { - return get_num_parallel_readers(this->m_model->get_execution_mode()); + return get_num_parallel_readers(this->m_model->get_execution_context().get_execution_mode()); } virtual int get_num_iterations_per_epoch(execution_mode mode) const { @@ -363,7 
+315,7 @@ class generic_input_layer : public io_layer { } virtual int get_num_iterations_per_epoch() const { - return get_num_iterations_per_epoch(this->m_model->get_execution_mode()); + return get_num_iterations_per_epoch(this->m_model->get_execution_context().get_execution_mode()); } virtual int get_current_step_in_epoch(execution_mode mode) const { @@ -372,7 +324,7 @@ class generic_input_layer : public io_layer { } virtual int get_current_step_in_epoch() const { - return get_current_step_in_epoch(this->m_model->get_execution_mode()); + return get_current_step_in_epoch(this->m_model->get_execution_context().get_execution_mode()); } virtual int get_mini_batch_size(execution_mode mode) const { @@ -386,7 +338,7 @@ class generic_input_layer : public io_layer { } virtual int get_last_mini_batch_size() const { - return get_last_mini_batch_size(this->m_model->get_execution_mode()); + return get_last_mini_batch_size(this->m_model->get_execution_context().get_execution_mode()); } virtual int get_current_mini_batch_size(execution_mode mode) const { @@ -395,7 +347,7 @@ class generic_input_layer : public io_layer { } virtual int get_current_mini_batch_size() const { - return get_current_mini_batch_size(this->m_model->get_execution_mode()); + return get_current_mini_batch_size(this->m_model->get_execution_context().get_execution_mode()); } virtual int get_global_mini_batch_size(execution_mode mode) const { @@ -414,7 +366,7 @@ class generic_input_layer : public io_layer { } virtual int get_current_global_mini_batch_size() const { - return get_current_global_mini_batch_size(this->m_model->get_execution_mode()); + return get_current_global_mini_batch_size(this->m_model->get_execution_context().get_execution_mode()); } virtual int get_world_master_mini_batch_adjustment(execution_mode mode) const { @@ -423,7 +375,7 @@ class generic_input_layer : public io_layer { } virtual int get_world_master_mini_batch_adjustment() const { - return get_world_master_mini_batch_adjustment(this->m_model->get_execution_mode()); + return get_world_master_mini_batch_adjustment(this->m_model->get_execution_context().get_execution_mode()); } virtual int get_current_world_master_mini_batch_adjustment(execution_mode mode, int model_rank) const { @@ -432,110 +384,32 @@ class generic_input_layer : public io_layer { } virtual int get_current_world_master_mini_batch_adjustment(int model_rank) const { - return get_current_world_master_mini_batch_adjustment(this->m_model->get_execution_mode(), model_rank); - } - - /** Calculate how many iterations are required for training, testing, - * and validation given a specified mini-batch size and that the - * training data set is spanning all of the models. 
- */ - void calculate_num_iterations_per_epoch_training_spans_models(int mini_batch_size) { - - generic_data_reader *dr = get_data_reader(execution_mode::training); - if(dr != nullptr) { - /// Setup the training data set so that it spans all models - m_io_buffers[0]->calculate_num_iterations_per_epoch_spanning_models(mini_batch_size, dr); - } - - dr = get_data_reader(execution_mode::validation); - if(dr != nullptr) { - /// Each model uses the entire validation and testing data sets - m_io_buffers[0]->calculate_num_iterations_per_epoch_single_model(mini_batch_size, dr); - } - - dr = get_data_reader(execution_mode::testing); - if(dr != nullptr) { - m_io_buffers[0]->calculate_num_iterations_per_epoch_single_model(mini_batch_size, dr); - } - - } - - void calculate_num_iterations_per_epoch_training_unique_per_models(int mini_batch_size) { - - generic_data_reader *dr = get_data_reader(execution_mode::training); - if(dr != nullptr) { - /// Setup the training data set so that it spans all models - m_io_buffers[0]->calculate_num_iterations_per_epoch_single_model(mini_batch_size, dr); - } - - dr = get_data_reader(execution_mode::validation); - if(dr != nullptr) { - /// Each model uses the entire validation and testing data sets - m_io_buffers[0]->calculate_num_iterations_per_epoch_single_model(mini_batch_size, dr); - } - - dr = get_data_reader(execution_mode::testing); - if(dr != nullptr) { - m_io_buffers[0]->calculate_num_iterations_per_epoch_single_model(mini_batch_size, dr); - } - + return get_current_world_master_mini_batch_adjustment(this->m_model->get_execution_context().get_execution_mode(), model_rank); } //************************************************************************ // Helper functions to access the dataset statistics //************************************************************************ dataset& get_dataset(execution_mode m) override { - switch(m) { - case execution_mode::training: - return m_training_dataset; - break; - case execution_mode::validation: - return m_validation_dataset; - break; - case execution_mode::testing: - return m_testing_dataset; - break; - default: - LBANN_ERROR("get_dataset: invalid execution mode"); - } + return this->m_model->get_execution_context().get_trainer().get_data_coordinator().get_dataset(m); } const dataset& get_dataset(execution_mode m) const override { - switch(m) { - case execution_mode::training: - return m_training_dataset; - break; - case execution_mode::validation: - return m_validation_dataset; - break; - case execution_mode::testing: - return m_testing_dataset; - break; - default: - LBANN_ERROR("get_dataset: invalid execution mode"); - } + return this->m_model->get_execution_context().get_trainer().get_data_coordinator().get_dataset(m); } /** * Return the dataset associated with the current execution mode. */ - dataset& select_dataset() override { return get_dataset(m_model->get_execution_mode()); } - const dataset& select_dataset() const override { return get_dataset(m_model->get_execution_mode()); } + dataset& select_dataset() override { return get_dataset(this->m_model->get_execution_context().get_execution_mode()); } + const dataset& select_dataset() const override { return get_dataset(this->m_model->get_execution_context().get_execution_mode()); } /** * Return the first dataset with a valid (non-null) datareader. * Returns null if none are valid. 
*/ dataset* select_first_valid_dataset() override { - if (m_data_readers[execution_mode::training]) { - return &m_training_dataset; - } else if (m_data_readers[execution_mode::validation]) { - return &m_validation_dataset; - } else if (m_data_readers[execution_mode::testing]) { - return &m_testing_dataset; - } else { - return nullptr; - } + return this->m_model->get_execution_context().get_trainer().get_data_coordinator().select_first_valid_dataset(); } /** @@ -558,34 +432,21 @@ class generic_input_layer : public io_layer { * Return the sample indices fetched in the current mini-batch. */ El::Matrix* get_sample_indices_per_mb() override { - execution_mode mode = this->m_model->get_execution_mode(); - generic_io_buffer* io_buffer = m_io_buffers[get_active_buffer_idx(mode) % m_io_buffers.size()]; - return io_buffer->get_sample_indices_fetched_per_mb(this->m_model->get_execution_mode()); + execution_mode mode = this->m_model->get_execution_context().get_execution_mode(); + generic_io_buffer* io_buffer = m_io_buffers[get_active_buffer_idx(mode) % m_io_buffers.size()]; + return io_buffer->get_sample_indices_fetched_per_mb(this->m_model->get_execution_context().get_execution_mode()); } /** * Get the dimensions of the underlying data. */ - const std::vector get_data_dims(int child_index = 0) const override { - const generic_data_reader *dr = get_data_reader(); - // dataset* ds = select_first_valid_dataset(); - if (dr) { - if(child_index == 0) { - return dr->get_data_dims(); - }else if(child_index == 1) { - switch(m_data_reader_mode) { - case data_reader_target_mode::REGRESSION: - return std::vector(1, dr->get_num_responses()); - case data_reader_target_mode::RECONSTRUCTION: - return dr->get_data_dims(); - case data_reader_target_mode::CLASSIFICATION: - default: - return std::vector(1, dr->get_num_labels()); - } - // the correct value based on initialization - }else { - LBANN_ERROR("get_data_dims: Invalid child index"); - } + std::vector get_data_dims(DataReaderMetaData& dr_metadata, int child_index = 0) const override { + if(child_index == 0) { + return dr_metadata.data_dims[data_reader_target_mode::INPUT]; + }else if(child_index == 1) { + return dr_metadata.data_dims[this->m_data_reader_mode]; + }else { + LBANN_ERROR("get_data_dims: Invalid child index"); } return std::vector(1, 0); } @@ -596,26 +457,26 @@ class generic_input_layer : public io_layer { long get_linearized_data_size() const override { long linearized_data_size = -1; - data_reader_map_t::const_iterator it; + generic_data_reader *dr; - it = m_data_readers.find(execution_mode::training); - if ((it != m_data_readers.end()) && it->second) { - linearized_data_size = (it->second)->get_linearized_data_size(); - std::cerr << "XX >>>>>> linearized_data_size: " << linearized_data_size << "\n"; + auto& dc = this->m_model->get_execution_context().get_trainer().get_data_coordinator(); + dr = dc.get_data_reader(execution_mode::training); + if (dr != nullptr) { + linearized_data_size = dr->get_linearized_data_size(); } - it = m_data_readers.find(execution_mode::validation); - if ((it != m_data_readers.end()) && it->second) { - long tmp_data_size = (it->second)->get_linearized_data_size(); + dr = dc.get_data_reader(execution_mode::validation); + if (dr != nullptr) { + long tmp_data_size = dr->get_linearized_data_size(); if (linearized_data_size != -1 && linearized_data_size != tmp_data_size) { LBANN_ERROR("lbann_io_layer: validation data set size does not " "match the currently established data set size"); } } - it = 
m_data_readers.find(execution_mode::testing); - if ((it != m_data_readers.end()) && it->second) { - long tmp_data_size = (it->second)->get_linearized_data_size(); + dr = dc.get_data_reader(execution_mode::testing); + if (dr != nullptr) { + long tmp_data_size = dr->get_linearized_data_size(); if (linearized_data_size != -1 && linearized_data_size != tmp_data_size) { LBANN_ERROR("lbann_io_layer: testing data set size does not " "match the currently established data set size"); @@ -628,26 +489,27 @@ class generic_input_layer : public io_layer { * Get the linearized size of the labels for the underlying data. */ long get_linearized_label_size() const override { - if (is_for_regression()) { + if (this->is_for_regression()) { return static_cast(1); } long linearized_label_size = -1; - data_reader_map_t::const_iterator it; + generic_data_reader *dr; - it = m_data_readers.find(execution_mode::training); - if ((it != m_data_readers.end()) && it->second) { - linearized_label_size = (it->second)->get_linearized_label_size(); + auto& dc = this->m_model->get_execution_context().get_trainer().get_data_coordinator(); + dr = dc.get_data_reader(execution_mode::training); + if (dr != nullptr) { + linearized_label_size = dr->get_linearized_label_size(); } - it = m_data_readers.find(execution_mode::validation); - if ((it != m_data_readers.end()) && it->second) { - long tmp_label_size = (it->second)->get_linearized_label_size(); + dr = dc.get_data_reader(execution_mode::validation); + if (dr != nullptr) { + long tmp_label_size = dr->get_linearized_label_size(); if (linearized_label_size != -1 && linearized_label_size != tmp_label_size) { LBANN_ERROR("lbann_io_layer: validation label set size (" + std::to_string(tmp_label_size) + ") does not match the currently established data set size (" + std::to_string(linearized_label_size) + ")"); } } - it = m_data_readers.find(execution_mode::testing); - if ((it != m_data_readers.end()) && it->second) { - long tmp_label_size = (it->second)->get_linearized_label_size(); + dr = dc.get_data_reader(execution_mode::testing); + if (dr != nullptr) { + long tmp_label_size = dr->get_linearized_label_size(); if (linearized_label_size != -1 && linearized_label_size != tmp_label_size) { LBANN_ERROR("lbann_io_layer: testing label set size does not " "match the currently established data set size"); @@ -657,27 +519,28 @@ class generic_input_layer : public io_layer { } long get_linearized_response_size() const override { - if (!is_for_regression()) { + if (!this->is_for_regression()) { return static_cast(1); } long linearized_response_size = -1; - data_reader_map_t::const_iterator it; + generic_data_reader *dr; - it = m_data_readers.find(execution_mode::training); - if ((it != m_data_readers.end()) && it->second) { - linearized_response_size = (it->second)->get_linearized_response_size(); + auto& dc = this->m_model->get_execution_context().get_trainer().get_data_coordinator(); + dr = dc.get_data_reader(execution_mode::training); + if (dr != nullptr) { + linearized_response_size = dr->get_linearized_response_size(); } - it = m_data_readers.find(execution_mode::validation); - if ((it != m_data_readers.end()) && it->second) { - long tmp_response_size = (it->second)->get_linearized_response_size(); + dr = dc.get_data_reader(execution_mode::validation); + if (dr != nullptr) { + long tmp_response_size = dr->get_linearized_response_size(); if (linearized_response_size != -1 && linearized_response_size != tmp_response_size) { LBANN_ERROR("lbann_io_layer: validation response set size does not " 
"match the currently established data set size"); } } - it = m_data_readers.find(execution_mode::testing); - if ((it != m_data_readers.end()) && it->second) { - long tmp_response_size = (it->second)->get_linearized_response_size(); + dr = dc.get_data_reader(execution_mode::testing); + if (dr != nullptr) { + long tmp_response_size = dr->get_linearized_response_size(); if (linearized_response_size != -1 && linearized_response_size != tmp_response_size) { LBANN_ERROR("lbann_io_layer: testing response set size does not " "match the currently established data set size"); @@ -687,21 +550,21 @@ class generic_input_layer : public io_layer { } long get_num_samples_trained() const override { - return m_training_dataset.get_num_samples_processed(); + return this->m_model->get_execution_context().get_trainer().get_data_coordinator().get_num_samples_trained(); } long get_num_samples_tested() const override { - return m_testing_dataset.get_num_samples_processed(); + return this->m_model->get_execution_context().get_trainer().get_data_coordinator().get_num_samples_tested(); } long get_total_num_training_samples() const override { - return m_training_dataset.get_total_samples(); + return this->m_model->get_execution_context().get_trainer().get_data_coordinator().get_total_num_training_samples(); } long get_total_num_testing_samples() const override { - return m_testing_dataset.get_total_samples(); + return this->m_model->get_execution_context().get_trainer().get_data_coordinator().get_total_num_testing_samples(); } bool at_new_epoch() const override { - const data_reader_map_t::const_iterator it = m_data_readers.find(execution_mode::training); - return ((it != m_data_readers.end()) && it->second && (it->second)->at_new_epoch()); + const generic_data_reader *dr = this->m_model->get_execution_context().get_trainer().get_data_coordinator().get_data_reader(execution_mode::training); + return (dr != nullptr && dr->at_new_epoch()); } bool is_execution_mode_valid(execution_mode mode) const override { @@ -715,174 +578,59 @@ class generic_input_layer : public io_layer { // save state of IO to a checkpoint bool save_to_checkpoint_shared(persist& p) const override { // save state of data readers from input layer - data_reader_map_t::const_iterator it; - if(p.get_cb_type() != callback_type::validation){ - it = this->m_data_readers.find(execution_mode::training); - if ((it != this->m_data_readers.end()) && it->second) { - (it->second)->save_to_checkpoint_shared(p, "data_reader_training"); - } - it = this->m_data_readers.find(execution_mode::testing); - if ((it != this->m_data_readers.end()) && it->second) { - (it->second)->save_to_checkpoint_shared(p, "data_reader_testing"); - } - if (m_comm->am_trainer_master()) { - p.write_uint64(persist_type::train, "reader_train_processed", - (uint64_t) m_training_dataset.get_num_samples_processed()); - p.write_uint64(persist_type::train, "reader_train_total", - (uint64_t) m_training_dataset.get_total_samples()); + if(p.get_cb_type() == callback_type::execution_context_only + || p.get_cb_type() == callback_type::full_checkpoint){ - p.write_uint64(persist_type::train, "reader_test_processed", - (uint64_t) m_testing_dataset.get_num_samples_processed()); - p.write_uint64(persist_type::train, "reader_test_total", - (uint64_t) m_testing_dataset.get_total_samples()); + this->m_model->get_execution_context().get_trainer().get_data_coordinator().save_to_checkpoint_shared(p); + if (this->get_comm()->am_trainer_master()) { + write_cereal_archive(*this, p, execution_mode::training, 
"_io.xml"); } - } - if(p.get_cb_type() == callback_type::validation || p.get_cb_type() == callback_type::batch){ - if (m_comm->am_trainer_master()) { - p.write_uint64(persist_type::validate, "reader_validate_processed", - (uint64_t) m_validation_dataset.get_num_samples_processed()); - p.write_uint64(persist_type::validate, "reader_validate_total", - (uint64_t) m_validation_dataset.get_total_samples()); - } - it = this->m_data_readers.find(execution_mode::validation); - if ((it != this->m_data_readers.end()) && it->second) { - (it->second)->save_to_checkpoint_shared(p, "data_reader_validation"); - } + } return true; } - struct dataset_header { - uint64_t train_proc; - uint64_t train_total; - uint64_t test_proc; - uint64_t test_total; - uint64_t validate_proc; - uint64_t validate_total; - }; - // reload state of IO from a checkpoint bool load_from_checkpoint_shared(persist& p) override { - // save state of data readers from input layer - data_reader_map_t::const_iterator it; - - it = this->m_data_readers.find(execution_mode::training); - if ((it != this->m_data_readers.end()) && it->second) { - (it->second)->load_from_checkpoint_shared(p, "data_reader_training"); - } - it = this->m_data_readers.find(execution_mode::testing); - if ((it != this->m_data_readers.end()) && it->second) { - (it->second)->load_from_checkpoint_shared(p, "data_reader_testing"); - } - - // save our own state - // rank 0 reads the file - dataset_header header; - // Assume we are loading from a epoch end checkpoint - if (m_comm->am_trainer_master()) { - p.read_uint64(persist_type::train, "reader_train_processed", &header.train_proc); - p.read_uint64(persist_type::train, "reader_train_total", &header.train_total); - p.read_uint64(persist_type::train, "reader_test_processed", &header.test_proc); - p.read_uint64(persist_type::train, "reader_test_total", &header.test_total); - if(m_data_readers[execution_mode::validation] != nullptr){ - p.read_uint64(persist_type::validate, "reader_validate_processed", &header.validate_proc); - p.read_uint64(persist_type::validate, "reader_validate_total", &header.validate_total); + // save state of the input layer + if(p.get_cb_type() == callback_type::execution_context_only + || p.get_cb_type() == callback_type::full_checkpoint){ + + std::string buf; + if (this->get_comm()->am_trainer_master()) { + read_cereal_archive(*this, p, execution_mode::training, "_io.xml"); + buf = create_cereal_archive_binary_string(*this); + } + + // TODO: this assumes homogeneous processors + // broadcast state from rank 0 + this->get_comm()->trainer_broadcast(0, buf); + + if (!this->get_comm()->am_trainer_master()) { + unpack_cereal_archive_binary_string(*this, buf); } - } - it = this->m_data_readers.find(execution_mode::validation); - if ((it != this->m_data_readers.end()) && it->second) { - (it->second)->load_from_checkpoint_shared(p, "data_reader_validation"); - } - // TODO: assumes homogeneous hardware - // broadcast data from rank 0 - MPI_Bcast(&header, sizeof(header), MPI_BYTE, 0, MPI_COMM_WORLD); - // set our fields - m_training_dataset.num_samples_processed() = (long) header.train_proc; - m_training_dataset.total_samples() = (long) header.train_total; - m_testing_dataset.num_samples_processed() = (long) header.test_proc; - m_testing_dataset.total_samples() = (long) header.test_total; - if(m_data_readers[execution_mode::validation] != nullptr){ - m_validation_dataset.num_samples_processed() = (long) header.validate_proc; - m_validation_dataset.total_samples() = (long) header.validate_total; } return 
true; } bool save_to_checkpoint_distributed(persist& p) const override { // save state of data readers from input layer - data_reader_map_t::const_iterator it; - if(p.get_cb_type() != callback_type::validation){ - it = this->m_data_readers.find(execution_mode::training); - if ((it != this->m_data_readers.end()) && it->second) { - (it->second)->save_to_checkpoint_distributed(p, "data_reader_training"); - } - it = this->m_data_readers.find(execution_mode::testing); - if ((it != this->m_data_readers.end()) && it->second) { - (it->second)->save_to_checkpoint_distributed(p, "data_reader_testing"); - } - p.write_uint64(persist_type::train, "reader_train_processed", - (uint64_t) m_training_dataset.get_num_samples_processed()); - p.write_uint64(persist_type::train, "reader_train_total", - (uint64_t) m_training_dataset.get_total_samples()); - - p.write_uint64(persist_type::train, "reader_test_processed", - (uint64_t) m_testing_dataset.get_num_samples_processed()); - p.write_uint64(persist_type::train, "reader_test_total", - (uint64_t) m_testing_dataset.get_total_samples()); - - } - if(p.get_cb_type() == callback_type::validation || p.get_cb_type() == callback_type::batch){ - p.write_uint64(persist_type::validate, "reader_validate_processed", - (uint64_t) m_validation_dataset.get_num_samples_processed()); - p.write_uint64(persist_type::validate, "reader_validate_total", - (uint64_t) m_validation_dataset.get_total_samples()); - it = this->m_data_readers.find(execution_mode::validation); - if ((it != this->m_data_readers.end()) && it->second) { - (it->second)->save_to_checkpoint_distributed(p, "data_reader_validation"); - } + if(p.get_cb_type() == callback_type::execution_context_only || p.get_cb_type() == callback_type::full_checkpoint) { + this->m_model->get_execution_context().get_trainer().get_data_coordinator().save_to_checkpoint_distributed(p); + write_cereal_archive(*this, p, execution_mode::training, "_io.xml"); } return true; } bool load_from_checkpoint_distributed(persist& p) override { - // save state of data readers from input layer - data_reader_map_t::const_iterator it; - it = this->m_data_readers.find(execution_mode::training); - if ((it != this->m_data_readers.end()) && it->second) { - (it->second)->load_from_checkpoint_distributed(p, "data_reader_training"); - } - it = this->m_data_readers.find(execution_mode::testing); - if ((it != this->m_data_readers.end()) && it->second) { - (it->second)->load_from_checkpoint_distributed(p, "data_reader_testing"); - } - // save our own state - // rank 0 reads the file - dataset_header header; - p.read_uint64(persist_type::train, "reader_train_processed", &header.train_proc); - p.read_uint64(persist_type::train, "reader_train_total", &header.train_total); - p.read_uint64(persist_type::train, "reader_test_processed", &header.test_proc); - p.read_uint64(persist_type::train, "reader_test_total", &header.test_total); - if(m_data_readers[execution_mode::validation] != nullptr){ - p.read_uint64(persist_type::validate, "reader_validate_processed", &header.validate_proc); - p.read_uint64(persist_type::validate, "reader_validate_total", &header.validate_total); - } - it = this->m_data_readers.find(execution_mode::validation); - if ((it != this->m_data_readers.end()) && it->second) { - (it->second)->load_from_checkpoint_distributed(p, "data_reader_validation"); - } - - // set our fields - m_training_dataset.num_samples_processed() = (long) header.train_proc; - m_training_dataset.total_samples() = (long) header.train_total; - 
m_testing_dataset.num_samples_processed() = (long) header.test_proc; - m_testing_dataset.total_samples() = (long) header.test_total; - if(m_data_readers[execution_mode::validation] != nullptr){ - m_validation_dataset.num_samples_processed() = (long) header.validate_proc; - m_validation_dataset.total_samples() = (long) header.validate_total; - } + // load state of data readers for input layer + + this->m_model->get_execution_context().get_trainer().get_data_coordinator().load_from_checkpoint_distributed(p); + + read_cereal_archive(*this, p, execution_mode::training, "_io.xml"); return true; } @@ -894,24 +642,10 @@ class generic_input_layer : public io_layer { } protected: - std::vector m_io_buffers; + std::vector*> m_io_buffers; io_buffer_map_t m_active_buffer; - - dataset m_training_dataset; - dataset m_testing_dataset; - dataset m_validation_dataset; - // bool m_data_sets_span_models; - - data_reader_map_t m_data_readers; - // std::map m_dataset_stats; - bool m_data_set_processed; - std::mutex dr_mutex; }; -template inline void generic_input_layer::initialize_io_buffer(lbann_comm *comm, int num_parallel_readers, std::map data_readers) { - m_io_buffers.push_back(new T(comm, num_parallel_readers, data_readers, m_expected_num_child_layers)); -} - } // namespace lbann #endif // LBANN_LAYERS_GENERIC_INPUT_LAYER_HPP_INCLUDED diff --git a/include/lbann/layers/io/input/input_layer.hpp b/include/lbann/layers/io/input/input_layer.hpp index e2c144684b3..35bdee54a92 100644 --- a/include/lbann/layers/io/input/input_layer.hpp +++ b/include/lbann/layers/io/input/input_layer.hpp @@ -29,6 +29,7 @@ #include "lbann/layers/io/input/generic_input_layer.hpp" #include "lbann/utils/exception.hpp" +#include "lbann/utils/distconv.hpp" #include "lbann/models/model.hpp" #include #include @@ -37,23 +38,85 @@ namespace lbann { -template +#ifdef LBANN_HAS_DISTCONV +template +class input_distconv_adapter: public data_type_distconv_adapter { + public: + using TensorDevType = typename data_type_distconv_adapter::TensorDevType; + using TensorHost = dc::TensorHost; + using TensorHostShuffler = dc::TensorHostShuffler; + + input_distconv_adapter(Layer& layer); + virtual ~input_distconv_adapter() = default; + + TensorHostShuffler &get_shuffler(const TensorHost &src, const TensorHost &dst, + int mat_idx); + void setup_fp_tensors() override; + std::unique_ptr setup_activations_i(int index) const override; + dc::Shape get_activations_local_shape(int index) const override; + dc::Shape get_activations_shape(int index) const; + void setup_shuffler_buffers(const TensorHost &src, const TensorHost &dst); + + // No bp tensors needed for this layer. + void setup_prev_error_signals() override {} + void setup_original_prev_error_signals() override {} + void setup_error_signals() override {} + void setup_original_error_signals() override {} + void setup_bp_tensors() override {} + + bool child_copy_required(size_t output_index) const override; + bool child_shuffle_required(size_t output_index) const override; + + // Nothing to do here as everything is done in fp_compute_distconv. 
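The reworked checkpoint path above drops the hand-rolled dataset_header and instead has the trainer master read the cereal archive from disk, re-pack the layer state into a binary string, broadcast that string across the trainer, and let every other rank unpack it. A minimal sketch of the pack/unpack halves of that pattern, using plain cereal and hypothetical helper names (pack_to_binary_string and unpack_from_binary_string are illustrative stand-ins, not LBANN's create_cereal_archive_binary_string / unpack_cereal_archive_binary_string):

#include <cereal/archives/binary.hpp>
#include <sstream>
#include <string>

// Serialize any cereal-enabled object into a byte string suitable for
// shipping with an MPI-style broadcast.
template <typename T>
std::string pack_to_binary_string(const T& obj) {
  std::ostringstream os;
  {
    cereal::BinaryOutputArchive ar(os);  // archive flushes when it leaves scope
    ar(obj);
  }
  return os.str();
}

// Rebuild the object from the broadcast bytes on the receiving ranks.
template <typename T>
void unpack_from_binary_string(T& obj, const std::string& buf) {
  std::istringstream is(buf);
  cereal::BinaryInputArchive ar(is);
  ar(obj);
}

Rank 0 packs after reading the "_io.xml" archive, the string is broadcast within the trainer (trainer_broadcast in the diff), and the remaining ranks unpack it; as the TODO in the diff notes, this still assumes homogeneous processors.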
+ void fp_setup(El::Int mini_batch_size) override {} + void fp_compute(); + bool is_input_processed(size_t index) const; + + private: + std::vector m_is_input_processed; + std::vector> m_original_host_tensors; + std::vector> m_host_tensors; + + bool m_shuffle_required; + std::vector, 4>> m_shufflers; + std::unique_ptr m_shuffler_src_buf; + size_t m_shuffler_src_buf_size = 0; + std::unique_ptr m_shuffler_dst_buf; + size_t m_shuffler_dst_buf_size = 0; + + // TODO: Use pinned memory pool + TensorDataType *m_copy_pinned_buffer = nullptr; +}; +#endif // LBANN_HAS_DISTCONV /** @brief Interface with data reader. */ -class input_layer : public generic_input_layer { +template +class input_layer : public generic_input_layer { + static_assert(T_layout == data_layout::DATA_PARALLEL, + "input layer only supports DATA_PARALLEL data layout"); + public: + /** @name Public Types */ + ///@{ + + /** @brief The local tensor type expected for IO in this object. */ + using IODataType = DataType; + + ///@} public: /// @todo make the map and vector references - input_layer(lbann_comm *comm, int num_parallel_readers, std::map data_readers, bool data_set_spans_models = true, + input_layer(lbann_comm *comm, int num_parallel_readers, data_reader_target_mode target_mode = data_reader_target_mode::CLASSIFICATION) - : generic_input_layer(comm, num_parallel_readers, data_readers, data_set_spans_models, target_mode) { - validate_data_layout(); + : generic_input_layer(comm, num_parallel_readers, target_mode) { // Initialize two buffers - initialize_io_buffer(comm, std::min(num_parallel_readers, Layer::m_comm->get_procs_per_trainer()), data_readers); - initialize_io_buffer(comm, std::min(num_parallel_readers, Layer::m_comm->get_procs_per_trainer()), data_readers); - for (auto io_buffer : m_io_buffers) { - io_buffer->fetch_data_fn = new fetch_data_functor(target_mode); + initialize_io_buffer(comm, std::min(num_parallel_readers, data_type_layer::m_comm->get_procs_per_trainer())); + initialize_io_buffer(comm, std::min(num_parallel_readers, data_type_layer::m_comm->get_procs_per_trainer())); + for (auto io_buffer : this->m_io_buffers) { + io_buffer->fetch_data_fn = new fetch_data_functor(target_mode); io_buffer->update_data_reader_fn = new update_data_reader_functor(); } } @@ -63,42 +126,43 @@ class input_layer : public generic_input_layer { return new input_layer(*this); } - inline void validate_data_layout(); - - inline void initialize_io_buffer(lbann_comm *comm, int num_parallel_readers, std::map data_readers) { - generic_input_layer::initialize_io_buffer(comm, num_parallel_readers, data_readers); + inline void initialize_io_buffer(lbann_comm *comm, int num_parallel_readers) { + generic_input_layer::template initialize_io_buffer(comm, num_parallel_readers); } std::string get_type() const override { return "input"; } data_layout get_data_layout() const override { return T_layout; } El::Device get_device_allocation() const override { return Dev; } +#ifdef LBANN_HAS_DISTCONV + void fp_compute () override; + using distconv_adapter_type = input_distconv_adapter; + friend distconv_adapter_type; + protected: + bool is_distconv_supported() const override { + return Dev == El::Device::CPU && T_layout == data_layout::DATA_PARALLEL; + } + void setup_distconv_adapter() override { + this->get_distconv_adapter_ptr() = make_unique(*this); + } + distconv_adapter_type& get_distconv_adapter() override; + const distconv_adapter_type& get_distconv_adapter() const override; + bool keep_original_outputs(int index) const override; +#endif // 
LBANN_HAS_DISTCONV }; -template<> -inline void input_layer::validate_data_layout() { - std::stringstream err; - err << __FILE__ << " " << __LINE__ << " :: " - << "input_layer with partitioned_io_buffer does not supports MODEL_PARALLEL data layout"; - throw lbann_exception(err.str()); -} - -template<> -inline void input_layer::validate_data_layout() {} - -#ifdef LBANN_HAS_GPU -template<> -inline void input_layer::validate_data_layout() { - std::stringstream err; - err << __FILE__ << " " << __LINE__ << " :: " - << "input_layer with partitioned_io_buffer does not supports MODEL_PARALLEL data layout"; - throw lbann_exception(err.str()); -} - -template<> -inline void input_layer::validate_data_layout() {} -#endif // LBANN_HAS_GPU - -} +#ifndef LBANN_INPUT_LAYER_INSTANTIATE + +#define PROTO_DEVICE(T, Device) \ + extern template class input_layer< \ + T, partitioned_io_buffer, \ + data_layout::DATA_PARALLEL, Device> + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +#endif // LBANN_INPUT_LAYER_INSTANTIATE + +} // namespace lbann #endif // LBANN_LAYERS_INPUT_LAYER_HPP_INCLUDED diff --git a/include/lbann/layers/io/io_layer.hpp b/include/lbann/layers/io/io_layer.hpp index 4f0b22ec529..939010c0472 100644 --- a/include/lbann/layers/io/io_layer.hpp +++ b/include/lbann/layers/io/io_layer.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_IO_LAYER_HPP_INCLUDED #define LBANN_LAYERS_IO_LAYER_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" #include "lbann/data_readers/data_reader.hpp" #include "lbann/utils/dataset.hpp" #include "lbann/io/persist.hpp" @@ -43,17 +43,15 @@ namespace lbann { /** @todo Move functionality to input_layer. */ -class io_layer : public Layer { +template +class io_layer : public data_type_layer { protected: - bool m_data_set_spans_models; data_reader_target_mode m_data_reader_mode; public: io_layer(lbann_comm *comm, - bool data_set_spans_models = true, data_reader_target_mode data_reader_mode = data_reader_target_mode::CLASSIFICATION) - : Layer(comm), - m_data_set_spans_models(data_set_spans_models), + : data_type_layer(comm), m_data_reader_mode(data_reader_mode) { } @@ -93,7 +91,7 @@ class io_layer : public Layer { /** * Get the dimensions of the underlying data. */ - virtual const std::vector get_data_dims(int child_index = 0) const = 0; + virtual std::vector get_data_dims(DataReaderMetaData& dr_metadata, int child_index = 0) const = 0; /** * Get the linearized size of the underlying data. 
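The block guarded by LBANN_INPUT_LAYER_INSTANTIATE above relies on explicit template instantiation: consumers of the header see extern template declarations generated by PROTO_DEVICE, so the input layer is compiled once in a dedicated source file instead of being re-instantiated in every translation unit. A minimal stand-alone sketch of that extern-template idiom, with illustrative names rather than LBANN's PROTO_DEVICE / instantiate_device.hpp machinery:

// widget.hpp -- the template definition everyone can see
template <typename T>
struct widget {
  T run(T x) const { return x + T(1); }
};

// Suppress implicit instantiation in including translation units; they will
// link against symbols provided elsewhere.
extern template struct widget<float>;
extern template struct widget<double>;

// widget.cpp -- the one translation unit that actually emits the definitions
template struct widget<float>;
template struct widget<double>;

Presumably instantiate_device.hpp expands PROTO_DEVICE over each supported data type and device, so the same macro produces both the extern declarations in the header and the matching instantiations in the layer's source file.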
diff --git a/include/lbann/layers/layer.hpp b/include/lbann/layers/layer.hpp index 6ed9ecb096b..9778e2f433f 100644 --- a/include/lbann/layers/layer.hpp +++ b/include/lbann/layers/layer.hpp @@ -29,22 +29,131 @@ #include "lbann/base.hpp" #include "lbann/comm.hpp" -#include "lbann/utils/summary.hpp" +#include "lbann/data_coordinator/data_coordinator_metadata.hpp" +#include "lbann/io/persist.hpp" #include "lbann/optimizers/optimizer.hpp" +#include "lbann/utils/description.hpp" +#include "lbann/utils/distconv.hpp" #include "lbann/utils/exception.hpp" +#include "lbann/utils/memory.hpp" +#include "lbann/utils/summary.hpp" #include "lbann/utils/timer.hpp" -#include "lbann/utils/description.hpp" -#include "lbann/io/persist.hpp" -#include +#include "lbann/utils/typename.hpp" +#include "lbann/weights/weights.hpp" +#ifdef LBANN_HAS_DISTCONV +#include "lbann/layers/distconv_adapter.hpp" +#endif // LBANN_HAS_DISTCONV #include #include +/** @brief A utility macro for easily defining default-constructed sub-class + * builders.*/ +#define LBANN_DEFINE_LAYER_BUILDER(LAYER_NAME) \ + template \ + std::unique_ptr build_##LAYER_NAME##_layer_from_pbuf( \ + lbann_comm*, lbann_data::Layer const&) + +/** @brief A utility macro for easily defining "default" builders. + * @note Must be called inside lbann namespace. + */ +#define LBANN_LAYER_DEFAULT_BUILDER(LAYER_NAME) \ + template \ + std::unique_ptr build_##LAYER_NAME##_layer_from_pbuf( \ + lbann_comm* comm, lbann_data::Layer const&) { \ + using LayerType = LAYER_NAME##_layer; \ + return make_unique(comm); \ + } + +/** @brief A utility macro for easily adding ETI for layer builders + * @note Must be called inside lbann namespace. + */ +#define LBANN_LAYER_BUILDER_ETI(LAYER_NAME, T, Device) \ + template std::unique_ptr \ + build_##LAYER_NAME##_layer_from_pbuf( \ + lbann_comm*, lbann_data::Layer const&); \ + template std::unique_ptr \ + build_##LAYER_NAME##_layer_from_pbuf( \ + lbann_comm*, lbann_data::Layer const&) + +// Forward-declare protobuf classes +namespace lbann_data { +class Layer; +} + namespace lbann { // Forward declarations class model; -class weights; -class lbann_callback_sync_layers; +namespace callback { +class sync_layers; +} // namespace callback + +/** Represents a parallel strategy for a layer. */ +struct ParallelStrategy { + /** Number of process groups the sample dimension is split over. */ + int sample_groups = 0; + /** Number of groups the sample dimension is split over. */ + int sample_splits = 0; + /** Number of process groups the depth dimension is split over. */ + int depth_groups = 0; + /** Number of groups the depth dimension is split over. */ + int depth_splits = 0; + /** Number of process groups the height dimension is split over. */ + int height_groups = 0; + /** Number of groups the height dimension is split over. */ + int height_splits = 0; + /** Number of process groups the width dimension is split over. */ + int width_groups = 0; + /** Number of groups the width dimension is split over. */ + int width_splits = 0; + /** Number of process groups the channel dimension is split over. */ + int channel_groups = 0; + /** Number of groups the channel dimension is split over. */ + int channel_splits = 0; + /** Number of process groups the filter dimension is split over. */ + int filter_groups = 0; + /** Number of groups the filter dimension is split over. */ + int filter_splits = 0; + /** Number of times the layer is replicated (for FC layers right now). 
*/ + int replications = 0; + bool operator==(const ParallelStrategy &ps) const { + return sample_groups == ps.sample_groups && + sample_splits == ps.sample_splits && + depth_groups == ps.depth_groups && + depth_splits == ps.depth_splits && + height_groups == ps.height_groups && + height_splits == ps.height_splits && + width_groups == ps.width_groups && + width_splits == ps.width_splits && + channel_groups == ps.channel_groups && + channel_splits == ps.channel_splits && + filter_groups == ps.filter_groups && + filter_splits == ps.filter_splits && + replications == ps.replications; + } + bool operator!=(const ParallelStrategy &ps) const { + return !(*this == ps); + } +}; + +inline std::ostream &operator<<(std::ostream &os, + const ParallelStrategy &ps) { + os << "{" << ps.sample_groups + << "/" << ps.sample_splits + << ", " << ps.depth_groups + << "/" << ps.depth_splits + << ", " << ps.height_groups + << "/" << ps.height_splits + << ", " << ps.width_groups + << "/" << ps.width_splits + << ", " << ps.channel_groups + << "/" << ps.channel_splits + << ", " << ps.filter_groups + << "/" << ps.filter_splits + << ", " << ps.replications + << "}"; + return os; +} /** * @brief Neural network tensor operation. @@ -64,8 +173,7 @@ class lbann_callback_sync_layers; * the weights. */ class Layer { - friend class lbann_callback_sync_layers; - friend class lbann_callback_sync_selected; + friend class callback::sync_layers; public: @@ -96,22 +204,37 @@ class Layer { * human-readable, name. */ inline void set_name(const std::string name) { m_name = name; } + /** Get a string representing the layer datatype + */ + virtual std::string get_datatype_name() const { + return TypeName(); + }; /** Human-readable description. */ virtual description get_description() const; + /** Get the parallel strategy for the layer. */ + inline ParallelStrategy& get_parallel_strategy() { + return m_parallel_strategy; + } + /** Get the parallel strategy for the layer. */ + const ParallelStrategy& get_parallel_strategy() const { + return m_parallel_strategy; + } + /** Forward propagation step. * Apply a mathematical operation to input tensors to obtain output * tensors. */ - virtual void forward_prop(); + virtual void forward_prop() {}; /** Backward propagation step. * Given the objective function gradients w.r.t. the output * tensors, compute the gradients w.r.t. the input tensors and * w.r.t. the weights. This is essentially an application of the * chain rule. */ - virtual void back_prop(); + void back_prop(); + /** Update step. * Update the layer's internal members. Note that the optimization * step for the weights happens elsewhere. @@ -119,7 +242,7 @@ class Layer { virtual bool update(); virtual void summarize_stats(lbann_summary& summarizer, int step); - virtual void summarize_matrices(lbann_summary& summarizer, int step); + virtual void summarize_matrices(lbann_summary& summarizer, int step) = 0; /** Setup layer members. * This calls the 'setup_pointers', 'setup_dims', 'setup_matrices', @@ -127,7 +250,7 @@ class Layer { * assumed that pointers to parent/child layers have already been * initialized. */ - virtual void setup(); + virtual void setup(size_t max_mini_batch_size, DataReaderMetaData& dr_metadata); /** Check that the setup is reasonable. */ virtual void check_setup(); @@ -145,12 +268,6 @@ class Layer { * should override this function to return its template parameter. 
*/ virtual El::Device get_device_allocation() const = 0; - /** Get a human-readable description of the data_layout */ - std::string get_data_layout_string(data_layout d) const; - /** Get a human-readable description of the device allocation */ - std::string get_device_allocation_string(El::Device dev) const; - /** Get a short human-readable description of the device allocation */ - std::string get_device_allocation_string_short(El::Device dev) const; /** Reset layer stat counters. */ virtual void reset_counters(); @@ -198,6 +315,20 @@ class Layer { /** Get child layers. (const) */ inline const std::vector& get_child_layers() const { return m_child_layers; } + inline int find_child_layer_index(const Layer* l) const { + return std::distance(m_child_layers.begin(), + std::find(m_child_layers.begin(), + m_child_layers.end(), + l)); + } + + inline int find_parent_layer_index(const Layer* l) const { + return std::distance(m_parent_layers.begin(), + std::find(m_parent_layers.begin(), + m_parent_layers.end(), + l)); + } + /** Get number of parent layers. */ inline int get_num_parents() const { return get_parent_layers().size(); } /** Get number of child layers. */ @@ -241,14 +372,22 @@ class Layer { // Weights access functions // =========================================================== - /** Get references to weights. */ - inline std::vector& get_weights() { return m_weights; } - /** Get references to weights. (const) */ - inline const std::vector& get_weights() const { return m_weights; } /** Set list of pointers to weights. */ - inline void set_weights(std::vector w) { get_weights() = w; } + void set_weights(std::vector const& w) { + m_weights = w; + } + /** Replace weights with another Layer's weights*/ - void replace_weights(Layer* other_layer); + void replace_weights(Layer const& other_layer); + + // =========================================================== + // Tensor access functions + // =========================================================== + + /** Get activation tensor corresponding to child layer. */ + virtual const BaseDistMat& get_activations(const Layer& child) const = 0; + /** Get error signal tensor corresponding to parent layer. */ + virtual const BaseDistMat& get_error_signals(const Layer& parent) const = 0; // =========================================================== // Tensor dimension access functions @@ -266,34 +405,6 @@ class Layer { /** Set output tensor dimensions. */ void set_output_dims(std::vector dims, int output_index = 0); - // =========================================================== - // Tensor access functions - // =========================================================== - - /** Get activation tensor. */ - AbsDistMat& get_activations(int child_index = 0); - /** Get error signal tensor. */ - AbsDistMat& get_error_signals(int parent_index = 0); - /** Get previous activation tensor. */ - const AbsDistMat& get_prev_activations(int parent_index = 0) const; - /** Get activation tensor. */ - const AbsDistMat& get_activations(int child_index = 0) const; - /** Get previous error signal tensor. */ - const AbsDistMat& get_prev_error_signals(int child_index = 0) const; - /** Get error signal tensor. */ - const AbsDistMat& get_error_signals(int parent_index = 0) const; - /** Get local portion of activation tensor. */ - AbsMat& get_local_activations(int child_index = 0); - /** Get local portion of error signal tensor. */ - AbsMat& get_local_error_signals(int parent_index = 0); - /** Get local portion of previous activation tensor. 
*/ - const AbsMat& get_local_prev_activations(int parent_index = 0) const; - /** Get local portion of activation tensor. */ - const AbsMat& get_local_activations(int child_index = 0) const; - /** Get local portion of previous error signal tensor. */ - const AbsMat& get_local_prev_error_signals(int child_index = 0) const; - /** Get local portion of error signal tensor. */ - const AbsMat& get_local_error_signals(int parent_index = 0) const; /** Get reference to LBANN communicator. */ lbann_comm* get_comm() const { return m_comm; } @@ -320,8 +431,63 @@ class Layer { void unfreeze(); bool is_frozen() const; + /** @brief Set whether to keep or dynamically reallocate error signals. + * + * Passing a value of @c true means to keep the error signals; @c + * false means to dynamically reallocate them. + */ + virtual void set_keep_error_signals(bool) = 0; + protected: + /** @name Weights-related accessors */ + ///@{ + void add_weights(weights* w) { + m_weights.push_back(w); + } + size_t num_weights() const noexcept { return m_weights.size(); } + bool has_weights() const noexcept { return num_weights() > 0; } + bool has_weights(size_t idx) const noexcept { + return ((idx < this->num_weights()) && (m_weights[idx])); + } + void set_num_weights(size_t n) { m_weights.resize(n, nullptr); } + void set_weights(size_t idx, weights* w) { + m_weights.at(idx) = w; + } + weights const& get_weights(size_t idx) const { + if (idx >= num_weights()) { + LBANN_ERROR("Asked for weights index \"", idx, "\"; " + "however, this layer has ", num_weights(), + " weights associated with it."); + } + if (m_weights[idx] == nullptr) { + LBANN_ERROR("Logic error: Detected an in-bounds null weights pointer."); + } + return *(m_weights[idx]); + } + + weights& get_weights(size_t idx) { + return const_cast( + static_cast(*this).get_weights(idx)); + } + + void add_as_gradient_source() + { + for (auto&& w : this->m_weights) { + optimizer* opt = w->get_optimizer(); + if (opt != nullptr) { opt->add_gradient_source(this); } + } + } + + void remove_as_gradient_source() + { + for (auto&& w : this->m_weights) { + auto&& opt = w->get_optimizer(); + if (opt != nullptr) { opt->remove_gradient_source(this); } + } + } + ///@} + // =========================================================== // Setup helper functions // =========================================================== @@ -336,7 +502,7 @@ class Layer { * the base method sets all uninitialized output tensor dimensions * equal to the first input tensor dimensions. */ - virtual void setup_dims(); + virtual void setup_dims(DataReaderMetaData& dr_metadata); /** Setup distributed matrices. * Called by the 'setup' function. Each column of these distributed * matrices is interpreted as the flattened tensor for a mini-batch @@ -344,20 +510,12 @@ class Layer { * 'construct_matrix' function. If any matrices have already been * setup, they are destroyed and reinstantiated. */ - virtual void setup_matrices(const El::Grid& grid); - /** Construct distributed matrix. - * Called by the 'setup_matrices' function. 'type' is one of the - * following: "input", "output", "gradient_wrt_output", - * "gradient_wrt_input". - */ - virtual std::unique_ptr construct_matrix(const El::Grid& grid, - std::string type, - El::Int index); + virtual void setup_matrices(const El::Grid& grid) = 0; /** Setup layer data. * Called by the 'setup' function. Memory is allocated for * distributed matrices. */ - virtual void setup_data(); + virtual void setup_data(size_t max_mini_batch_size) {}; /** Setup GPU objects. 
* Called by the 'setup' function if the layer is on GPUs. */ @@ -372,12 +530,12 @@ class Layer { * setup as a view or copy of the corresponding parent layer's * output tensor. */ - virtual void fp_setup_inputs(El::Int mini_batch_size); + virtual void fp_setup_inputs(El::Int mini_batch_size) = 0; /** Setup output tensors. * Called by the 'forward_prop' function. Each output tensor is * resized to match the mini-batch size. */ - virtual void fp_setup_outputs(El::Int mini_batch_size); + virtual void fp_setup_outputs(El::Int mini_batch_size) = 0; /** Apply layer operation. * Called by the 'forward_prop' function. Given the input tensors, * the output tensors are populated with computed values. @@ -388,24 +546,18 @@ class Layer { // Back prop step helper functions // =========================================================== - /** Setup gradient w.r.t. output tensors. - * Called by the 'back_prop' function. Each gradient w.r.t. output - * tensor is setup as a view or copy of the corresponding child - * layer's gradient w.r.t. input tensor. - */ - virtual void bp_setup_gradient_wrt_outputs(El::Int mini_batch_size); /** Setup gradient w.r.t. input tensors. * Called by the 'back_prop' function. Each gradient w.r.t. input * tensor is resized to match the mini-batch size. */ - virtual void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size); + virtual void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) = 0; /** Compute objective funciton gradients. * Called by the 'back_prop' function. Given the input, output, and * gradient w.r.t. output tensors, the gradient w.r.t. input * tensors are populated with the computed values and the gradients * w.r.t. the weights are sent to the appropriate optimizers. */ - virtual void bp_compute(); + virtual void bp_compute() {}; // =========================================================== // Update step helper functions @@ -423,9 +575,6 @@ class Layer { /** Reference to LBANN communicator. */ lbann_comm *m_comm; - /** References to layer weights. */ - std::vector m_weights; - /** References to parent layers. */ std::vector m_parent_layers; /** References to child layers. */ @@ -465,39 +614,120 @@ class Layer { private: - // =========================================================== - // Private access functions - // =========================================================== - - /** Get activation tensor corresponding to child layer. */ - const AbsDistMat& get_activations(const Layer& child) const; - /** Get error signal tensor corresponding to parent layer. */ - const AbsDistMat& get_error_signals(const Layer& parent) const; + virtual void setup_weights(size_t idx, weights& w) = 0; + + /** @name Implementation details of back-prop. */ + ///@{ + + /** @brief Move error signals from a child to its parent. + * + * This is a hacky workaround to C++ rules for protected member + * functions. No error-checking is done, e.g., to assert that the + * two layers actually have a parent-child relationship because + * this is just an implementation detail. The symbol is never + * exposed to the public API. 
+ * + * @param parent The parent layer, into which the signal is moved + * @param child The child layer, from which the signal is moved + * @param signal The now-released error signal from the child layer + */ + friend void attempt_move_error_signal( + Layer& parent, Layer const& child, + std::unique_ptr signals); + friend void attempt_view_error_signal( + Layer& parent, Layer const& child, + const BaseDistMat& signals); + friend void deep_copy_error_signal( + Layer& parent, Layer const& child, + const BaseDistMat& signals); + + /** @brief Computes the core back-prop steps. */ + virtual void back_prop_impl_() = 0; + + /** @brief Allocates new storage for the gradients that this layer + * will compute. + * + * If the layer has persistent error signal information, this will + * simply clear the gradients. + */ + virtual void allocate_new_gradients_() = 0; + + /** @brief Moves all error signals to their respective parents. + * + * Error signals from this instances either are directly moved into + * the parent layer or, in cases in which a direct move is not + * possible, are deep-copied into a new tensor in the parent layer + * (e.g., into a different data type or data distribution). + */ + virtual void propagate_error_signals_to_parents_() = 0; + + /** @brief Releases the error signals propagated from the child + * layers. + * + * At the conclusion of back-prop, the error signals propagated + * from the child layers are no longer needed. This ensures that + * the memory is released. + * + * This function may do other work, but must respect the persistent + * error signal flag. + */ + virtual void clear_prev_error_signals_() = 0; + + /** @brief Assumes ownership of the error signals from the specified + * child layer. + * + * This is a simple pointer move when possible; otherwise it is a + * deep-copy of the signal data. + * + * @param child The layer whence the signal is coming. + * @param signal The error signals being sent to this layer. + */ + virtual void move_or_copy_prev_error_signal_( + const Layer& child, + std::unique_ptr signal) = 0; + + /** @brief Attempts to view the error signals from the specified + * child layer. + * + * This is a simple data view when possible; otherwise it is a + * deep-copy of the signal data. + * + * @param child The layer whence the signal is coming. + * @param signal The error signals being sent to this layer. + */ + virtual void view_or_copy_prev_error_signal_( + const Layer& child, + const El::BaseDistMatrix& signal) = 0; + + /** @brief Deep-copy the error signals from the specified child + * layer. + * + * @param child The layer whence the signal is coming. + * @param signal The error signals being sent to this layer. + */ + virtual void deep_copy_prev_error_signal_( + const Layer& child, + const El::BaseDistMatrix& signal) = 0; + + ///@} // =========================================================== // Private class members // =========================================================== + /** @brief References to layer weights. + * + * These are references to the base weights objects. The tensor + * data type for weights storage might differ from the tensor data + * type of this layer's tensors. To ensure consistency, we must + * only access weights values through the WeightsProxy class during + * training. + */ + std::vector m_weights; + /** Dimensions of output tensors. */ std::vector> m_output_dims_list; - /** Input tensors. - * Each matrix column corresponds to a flattened mini-batch sample. - */ - std::vector> m_inputs; - /** Output tensors. 
- * Each matrix column corresponds to a flattened mini-batch sample. - */ - std::vector> m_outputs; - /** Objective function gradients w.r.t. the output tensors. - * Each matrix column corresponds to a flattened mini-batch sample. - */ - std::vector> m_gradient_wrt_outputs; - /** Objective function gradients w.r.t. the input tensors. - * Each matrix column corresponds to a flattened mini-batch sample. - */ - std::vector> m_gradient_wrt_inputs; - /** Hint layer. * During setup, the output tensor dimensions are set to match the * first output tensor of the hint layer. Derived classes may do @@ -505,8 +735,60 @@ class Layer { */ const Layer* m_hint_layer = nullptr; + /** Parallel strategy for the layer. */ + ParallelStrategy m_parallel_strategy; + +private: + friend std::vector extract_weights(Layer const& l); + friend std::vector extract_weights(Layer& l); + +#ifdef LBANN_HAS_DISTCONV + friend class distconv_adapter; + public: + /** Indicate whether distconv is enabled. */ + bool distconv_enabled() const; + /** Indicate whether original input matrices need to be set up. */ + virtual bool keep_original_inputs(int index) const; + /** Indicate whether original output matrices need to be set up. */ + virtual bool keep_original_outputs(int index) const; + /** Indicate whether original gradient wrt input matrices need to be set up. */ + virtual bool keep_original_gradient_wrt_inputs(int index) const; + /** Indicate whether original gradient wrt output matrices need to be set up. */ + virtual bool keep_original_gradient_wrt_outputs(int index) const; + /** Retrievs distconv adapter. */ + virtual const distconv_adapter& get_distconv_adapter() const; + /** Retrievs distconv adapter. */ + virtual distconv_adapter& get_distconv_adapter(); + + protected: + /** Indicate whether distconv is supported. */ + virtual bool is_distconv_supported() const { return false; } + /** Pre-initialize distconv attributes needed for setup_data(). */ + void prepare_distconv(); + virtual void setup_distconv_adapter() = 0; + std::unique_ptr& get_distconv_adapter_ptr() { + return m_dc; }; + const std::unique_ptr& get_distconv_adapter_ptr() const { + return m_dc; }; + + private: + mutable bool m_distconv_enabled = false; + mutable bool m_distconv_enabled_set = false; + std::unique_ptr m_dc; +#endif // LBANN_HAS_DISTCONV }; +// FIXME (trb 05/28/2020): These should go away. They're used in +// "model.cpp" and "model_factory.cpp" but could be refactored +// out. Outside the scope of current PR. 
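The non-const Layer::get_weights(size_t) above uses the standard idiom of implementing the bounds and null checks once in the const overload and forwarding to it through const_cast. A small self-contained sketch of that idiom, with illustrative types (not LBANN's weights class):

#include <cstddef>
#include <stdexcept>
#include <vector>

class registry {
  std::vector<int*> m_items;
public:
  // All validation lives in the const overload.
  const int& get(std::size_t idx) const {
    if (idx >= m_items.size() || m_items[idx] == nullptr) {
      throw std::out_of_range("registry: bad item index");
    }
    return *m_items[idx];
  }
  // The non-const overload forwards to the const one and casts the result back.
  int& get(std::size_t idx) {
    return const_cast<int&>(static_cast<const registry&>(*this).get(idx));
  }
};

The cast is safe because the object is known to be non-const at the call site; only the checking logic is shared.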
+inline std::vector extract_weights(Layer& l) { + return l.m_weights; +} + +inline std::vector extract_weights(Layer const& l) { + return {l.m_weights.cbegin(), l.m_weights.cend()}; +} + } // namespace lbann #endif // LBANN_LAYERS_LAYER_HPP_INCLUDED diff --git a/include/lbann/layers/learning/CMakeLists.txt b/include/lbann/layers/learning/CMakeLists.txt index ac855e21023..71111d57435 100644 --- a/include/lbann/layers/learning/CMakeLists.txt +++ b/include/lbann/layers/learning/CMakeLists.txt @@ -1,8 +1,12 @@ # Add the headers for this directory set_full_path(THIS_DIR_HEADERS base_convolution.hpp + channelwise_scale_bias.hpp + channelwise_fully_connected.hpp convolution.hpp deconvolution.hpp + embedding.hpp + entrywise_scale_bias.hpp fully_connected.hpp fully_connected_cuda.hpp learning.hpp diff --git a/include/lbann/layers/learning/base_convolution.hpp b/include/lbann/layers/learning/base_convolution.hpp index afa3046086b..5f15e935ee0 100644 --- a/include/lbann/layers/learning/base_convolution.hpp +++ b/include/lbann/layers/learning/base_convolution.hpp @@ -27,23 +27,70 @@ #ifndef LBANN_LAYERS_LEARNING_BASE_CONVOLUTION_HPP_INCLUDED #define LBANN_LAYERS_LEARNING_BASE_CONVOLUTION_HPP_INCLUDED -#include -#include +#include "lbann/layers/data_type_layer.hpp" #include "lbann/layers/layer.hpp" -#include "lbann/weights/initializer.hpp" -#include "lbann/weights/variance_scaling_initializers.hpp" #include "lbann/utils/cudnn.hpp" -#include "lbann/utils/exception.hpp" -#include "lbann/utils/random.hpp" -#include "lbann/utils/timer.hpp" -#include "lbann/utils/im2col.hpp" +#include "lbann/utils/memory.hpp" + +#include namespace lbann { +#ifdef LBANN_HAS_DISTCONV +template +class base_convolution_adapter: public data_type_distconv_adapter { + public: + using TensorDevType = typename data_type_distconv_adapter::TensorDevType; + + base_convolution_adapter(Layer& layer): data_type_distconv_adapter(layer) {} + virtual ~base_convolution_adapter() = default; + + void setup_fp_tensors() override; + void setup_bp_tensors() override; + void setup_layer(size_t workspace_capacity) override; + + void fp_compute_convolution(); + void fp_apply_bias(); + + void bp_compute_convolution_data(); + void bp_compute_convolution_filter(); + + std::unique_ptr> m_conv; + std::unique_ptr m_kernel; + std::unique_ptr m_bias; + std::unique_ptr m_kernel_gradient; + std::unique_ptr m_bias_gradient; + + std::string m_fwd_algo; + std::string m_bwd_data_algo; + std::string m_bwd_filter_algo; +}; +#endif // LBANN_HAS_DISTCONV + /** @brief Computation kernels for convolution and deconvolution layers. */ -template -class base_convolution_layer : public Layer { +template +class base_convolution_layer : public data_type_layer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The concrete weights type used by this object. */ + using WeightsType = data_type_weights; + + /** @brief The concrete optimizer type used by this object. */ + using OptimizerType = data_type_optimizer; + + template + using DMatDT = El::Matrix; + +#ifdef LBANN_HAS_CUDNN + using ScalingType = cudnn::ScalingParamType; +#else + using ScalingType = TensorDataType; +#endif // LBANN_HAS_CUDNN + + ///@} protected: @@ -68,10 +115,15 @@ class base_convolution_layer : public Layer { /** Scaling factor for bias term. * If the scaling factor is zero, bias is not applied. */ - DataType m_bias_scaling_factor; + ScalingType m_bias_scaling_factor; #ifdef LBANN_HAS_CUDNN + /** @brief Math type to use inside cuDNN. + * @details Must be cached since it isn't used until setup. 
+ */ + cudnnMathType_t m_convolution_math_type = + cudnn::get_default_convolution_math_type(); /** Convolution kernel cuDNN descriptor. */ cudnnFilterDescriptor_t m_kernel_cudnn_desc = nullptr; /** Convolution cuDNN descriptor. */ @@ -79,7 +131,7 @@ class base_convolution_layer : public Layer { /** Bias tensor cuDNN descriptor. */ cudnnTensorDescriptor_t m_bias_cudnn_desc = nullptr; /** Tensor cuDNN descriptors. */ - cudnn::data_parallel_layer_tensor_manager m_tensors_cudnn_desc; + cudnn::data_parallel_layer_tensor_manager m_tensors_cudnn_desc; /** Forward algorithm cache (mini-batch size -> algo). */ std::unordered_map m_fwd_cudnn_algos; /** Backward data algorithm cache (mini-batch size -> algo). */ @@ -99,358 +151,28 @@ class base_convolution_layer : public Layer { std::vector strides, std::vector dilations, int groups, - bool has_bias) - : Layer(comm), - m_output_channels(output_channels), - m_conv_dims(std::move(conv_dims)), - m_pads(std::move(pads)), - m_strides(std::move(strides)), - m_dilations(std::move(dilations)), - m_groups(groups), - m_bias_scaling_factor(has_bias ? 1 : 0) -#ifdef LBANN_HAS_CUDNN - , m_tensors_cudnn_desc(this) -#endif // LBANN_HAS_CUDNN - {} - - base_convolution_layer(const base_convolution_layer& other) - : Layer(other), - m_output_channels(other.m_output_channels), - m_conv_dims(other.m_conv_dims), - m_pads(other.m_pads), - m_strides(other.m_strides), - m_dilations(other.m_dilations), - m_groups(other.m_groups), - m_bias_scaling_factor(other.m_bias_scaling_factor) -#ifdef LBANN_HAS_CUDNN - , m_tensors_cudnn_desc(other.m_tensors_cudnn_desc), - m_fwd_cudnn_algos(other.m_fwd_cudnn_algos), - m_bwd_data_cudnn_algos(other.m_bwd_data_cudnn_algos), - m_bwd_filter_cudnn_algos(other.m_bwd_filter_cudnn_algos) -#endif // LBANN_HAS_CUDNN - { -#ifdef LBANN_HAS_CUDNN - copy_kernel_cudnn_desc(other.m_kernel_cudnn_desc, - m_kernel_cudnn_desc); - copy_convolution_cudnn_desc(other.m_convolution_cudnn_desc, - m_convolution_cudnn_desc); - if (other.m_bias_scaling_factor != DataType(0)) { - cudnn::copy_tensor_desc(other.m_bias_cudnn_desc, - m_bias_cudnn_desc); - } - m_tensors_cudnn_desc.set_layer(this); -#endif // LBANN_HAS_CUDNN - } + bool has_bias); - base_convolution_layer& operator=(const base_convolution_layer& other) { - Layer::operator=(other); - m_output_channels = other.m_output_channels; - m_conv_dims = other.m_conv_dims; - m_pads = other.m_pads; - m_strides = other.m_strides; - m_dilations = other.m_dilations; - m_groups = other.m_groups; - m_bias_scaling_factor = other.m_bias_scaling_factor; + base_convolution_layer(const base_convolution_layer& other); -#ifdef LBANN_HAS_CUDNN - // Copy cuDNN objects - copy_kernel_cudnn_desc(other.m_kernel_cudnn_desc, - m_kernel_cudnn_desc); - copy_convolution_cudnn_desc(other.m_convolution_cudnn_desc, - m_convolution_cudnn_desc); - if (other.m_bias_scaling_factor != DataType(0)) { - cudnn::copy_tensor_desc(other.m_bias_cudnn_desc, - m_bias_cudnn_desc); - } - m_tensors_cudnn_desc = other.m_tensors_cudnn_desc; - m_tensors_cudnn_desc.set_layer(this); - m_fwd_cudnn_algos = other.m_fwd_cudnn_algos; - m_bwd_data_cudnn_algos = other.m_bwd_data_cudnn_algos; - m_bwd_filter_cudnn_algos = other.m_bwd_filter_cudnn_algos; -#endif // LBANN_HAS_CUDNN + base_convolution_layer& operator=(const base_convolution_layer& other); - return *this; - } + ~base_convolution_layer(); - ~base_convolution_layer() { #ifdef LBANN_HAS_CUDNN - if (m_kernel_cudnn_desc != nullptr) { - CHECK_CUDNN_DTOR(cudnnDestroyFilterDescriptor(m_kernel_cudnn_desc)); - } - if 
(m_convolution_cudnn_desc != nullptr) { - CHECK_CUDNN_DTOR(cudnnDestroyConvolutionDescriptor(m_convolution_cudnn_desc)); - } - if (m_bias_cudnn_desc != nullptr) { - CHECK_CUDNN_DTOR(cudnnDestroyTensorDescriptor(m_bias_cudnn_desc)); - } + void set_cudnn_math_mode(cudnnMathType_t math_type) noexcept; #endif // LBANN_HAS_CUDNN - } - - description get_description() const override { - auto&& desc = Layer::get_description(); - std::ostringstream ss; - - // Convolution dimensions - ss.str(std::string{}); - ss.clear(); - for (size_t i = 0; i < m_conv_dims.size(); ++i) { - ss << (i > 0 ? ", " : "" ) << m_conv_dims[i]; - } - desc.add("Convolution dimensions", ss.str()); - // Strides - ss.str(std::string{}); - ss.clear(); - for (size_t i = 0; i < m_strides.size(); ++i) { - ss << (i > 0 ? ", " : "" ) << m_strides[i]; - } - desc.add("Strides", ss.str()); + description get_description() const override; + void setup_dims(DataReaderMetaData& dr_metadata) override; - // Pads - ss.str(std::string{}); - ss.clear(); - for (size_t i = 0; i < m_pads.size(); ++i) { - ss << (i > 0 ? ", " : "" ) << m_pads[i]; - } - desc.add("Pads", ss.str()); - - // Dilation - ss.str(std::string{}); - ss.clear(); - for (size_t i = 0; i < m_dilations.size(); ++i) { - ss << (i > 0 ? ", " : "" ) << m_dilations[i]; - } - desc.add("Dilations", ss.str()); - - // Groups - desc.add("Groups", m_groups); - - // Bias - ss.str(std::string{}); - ss.clear(); - ss << (m_bias_scaling_factor == DataType(0) ? - "disabled" : "enabled"); - desc.add("Bias", ss.str()); - - // Result - return desc; - - } - - void setup_dims() override { - Layer::setup_dims(); - std::ostringstream err; - - // Check number of channels and channel groups - const auto& input_dims = get_input_dims(); - if (m_output_channels < 1) { - err << get_type() << " layer \"" << get_name() << "\" " - << "has an invalid number of output channels " - << "(" << m_output_channels << ")"; - LBANN_ERROR(err.str()); - } else if (m_groups < 1) { - err << get_type() << " layer \"" << get_name() << "\" " - << "has an invalid number of groups (" << m_groups << ")"; - LBANN_ERROR(err.str()); - } else if (input_dims[0] % m_groups != 0 - || m_output_channels % m_groups != 0) { - err << get_type() << " layer \"" << get_name() << "\" " - << "has " << m_groups << " groups, which does not divide " - << "the input channels (" << input_dims[0] << ") or " - << "the output channels (" << m_output_channels << ")"; - LBANN_ERROR(err.str()); - } - - // Check kernel dims, pads, stride, dilations - const auto& num_spatial_dims = input_dims.size() - 1; - if (m_conv_dims.size() != num_spatial_dims - || std::any_of(m_conv_dims.begin(), m_conv_dims.end(), - [](El::Int d) { return d < 1; })) { - err << get_type() << " layer \"" << get_name() << "\" " - << "has invalid spatial dimensions for convolution kernel ("; - if (m_conv_dims.empty()) { err << "no dimensions"; } - for (size_t i = 0; i < m_conv_dims.size(); ++i) { - err << (i > 0 ? "x" : "") << m_conv_dims[i]; - } - err << ", expected " << num_spatial_dims << " spatial dimensions)"; - LBANN_ERROR(err.str()); - } else if (m_pads.size() != num_spatial_dims) { - err << get_type() << " layer \"" << get_name() << "\" " - << "has invalid convolution pads (("; - for (size_t i = 0; i < m_pads.size(); ++i) { - err << (i > 0 ? 
"," : "") << m_pads[i]; - } - err << "), expected " << num_spatial_dims << " spatial dimensions)"; - LBANN_ERROR(err.str()); - } else if (m_strides.size() != num_spatial_dims - || std::any_of(m_strides.begin(), m_strides.end(), - [](El::Int d) { return d < 1; })) { - err << get_type() << " layer \"" << get_name() << "\" " - << "has invalid convolution strides (("; - for (size_t i = 0; i < m_strides.size(); ++i) { - err << (i > 0 ? "," : "") << m_strides[i]; - } - err << "), expected " << num_spatial_dims << " spatial dimensions)"; - LBANN_ERROR(err.str()); - } else if (m_dilations.size() != num_spatial_dims - || std::any_of(m_dilations.begin(), m_dilations.end(), - [](El::Int d) { return d < 1; })) { - err << get_type() << " layer \"" << get_name() << "\" " - << "has invalid convolution dilations (("; - for (size_t i = 0; i < m_dilations.size(); ++i) { - err << (i > 0 ? "," : "") << m_dilations[i]; - } - err << "), expected " << num_spatial_dims << " spatial dimensions)"; - LBANN_ERROR(err.str()); - } - - // Make sure that configuration is supported - if (Device == El::Device::CPU - && std::any_of(m_dilations.begin(), m_dilations.end(), - [](El::Int d) { return d != 1; })) { - err << get_type() << " layer \"" << get_name() << "\" " - << "has non-unit dilation, which is not yet supported on CPU"; - LBANN_ERROR(err.str()); - } - if (Device == El::Device::CPU && m_groups != 1) { - err << get_type() << " layer \"" << get_name() << "\" " - << "has " << m_groups << " groups, " - << "but only one group is currently supported on CPU"; - LBANN_ERROR(err.str()); - } - - } - - /** Setup layer data. + /** @brief Setup layer data. * The kernel weights are setup in the convolution and * deconvolution classes. */ - void setup_data() override { - Layer::setup_data(); - - // Tensor dimensions - const auto& input_dims = get_input_dims(); - const auto& output_dims = get_output_dims(); - const auto& kernel_dims = get_kernel_dims(); - const auto& kernel_size = std::accumulate(kernel_dims.begin(), - kernel_dims.end(), - 1, std::multiplies()); - - // Initialize default weights if none are provided - if (this->m_weights.size() > 2) { - std::stringstream err; - err << "attempted to setup layer \"" << get_name() << "\" " - << "with an invalid number of weights " - << "(expected at most 2, " - << "found " << this->m_weights.size() << ")"; - LBANN_ERROR(err.str()); - } - if (m_bias_scaling_factor != DataType(0)) { - this->m_weights.resize(2, nullptr); - } else { - this->m_weights.resize(1, nullptr); - } - if (this->m_weights[0] == nullptr) { - auto* w = new weights(get_comm()); - std::unique_ptr init(new he_initializer(probability_distribution::gaussian)); - std::unique_ptr opt(m_model->create_optimizer()); - w->set_name(get_name() + "_kernel"); - w->set_initializer(init); - w->set_optimizer(opt); - this->m_weights[0] = w; - this->m_model->add_weights(w); - } - auto& kernel_weights = *this->m_weights[0]; - - // Initialize variance scaling initialization - auto* cast_initializer - = dynamic_cast(kernel_weights.get_initializer()); - if (cast_initializer != nullptr) { - cast_initializer->set_fan_in(kernel_size / output_dims[0]); - cast_initializer->set_fan_out(kernel_size / input_dims[0]); - } - - // Initialize weight matrices - auto dist = get_prev_activations().DistData(); - dist.colDist = El::STAR; - dist.rowDist = El::STAR; - kernel_weights.set_dims(kernel_dims); - kernel_weights.set_matrix_distribution(dist); - - // Set up bias if needed. 
- if (m_bias_scaling_factor != DataType(0)) { - if (this->m_weights[1] == nullptr) { - auto* w = new weights(get_comm()); - std::unique_ptr opt(m_model->create_optimizer()); - w->set_name(get_name() + "_bias"); - w->set_optimizer(opt); - this->m_weights[1] = w; - this->m_model->add_weights(w); - } - auto& bias_weights = *this->m_weights[1]; - bias_weights.set_dims(output_dims[0]); - bias_weights.set_matrix_distribution(dist); - } + void setup_data(size_t max_mini_batch_size) override; - // Initialize freeze state - for (auto&& w : this->m_weights) { - if (m_frozen) { - w->freeze(); - } else { - w->unfreeze(); - } - } - for (auto&& w : this->m_weights) { - if (w->is_frozen() != m_frozen) { - std::stringstream err; - err << (m_frozen ? "" : "un") << "frozen " - << "layer \"" << get_name() << "\" has " - << (w->is_frozen() ? "" : "un") << "frozen " - << "weights \"" << w->get_name() << "\""; - LBANN_ERROR(err.str()); - } - } - - } - - /// Initialize GPU objects - void setup_gpu() override { - Layer::setup_gpu(); -#ifndef LBANN_HAS_CUDNN - LBANN_ERROR("cuDNN not detected"); -#else - - const auto& output_dims = get_output_dims(); - const auto& kernel_dims = get_kernel_dims(); - - // Set kernel descriptor - CHECK_CUDNN(cudnnCreateFilterDescriptor(&m_kernel_cudnn_desc)); - CHECK_CUDNN(cudnnSetFilterNdDescriptor(m_kernel_cudnn_desc, - cudnn::get_data_type(), - CUDNN_TENSOR_NCHW, - kernel_dims.size(), - kernel_dims.data())); - - // Set convolution descriptor - CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&m_convolution_cudnn_desc)); - CHECK_CUDNN(cudnnSetConvolutionNdDescriptor(m_convolution_cudnn_desc, - m_pads.size(), - m_pads.data(), - m_strides.data(), - m_dilations.data(), - CUDNN_CROSS_CORRELATION, - cudnn::get_data_type())); - CHECK_CUDNN(cudnnSetConvolutionGroupCount(m_convolution_cudnn_desc, - m_groups)); - - // Set bias tensor descriptor - if (m_bias_scaling_factor != DataType(0)) { - std::vector bias_dims(output_dims.size() + 1, 1); - bias_dims[1] = output_dims[0]; - cudnn::set_tensor_desc(m_bias_cudnn_desc, bias_dims); - } - -#endif // LBANN_HAS_CUDNN - } + /** @brief Initialize GPU objects */ + void setup_gpu() override; protected: @@ -458,564 +180,23 @@ class base_convolution_layer : public Layer { virtual std::vector get_kernel_dims() const = 0; /** Convolution with cuDNN. */ - void apply_convolution_cudnn(bool during_forward_prop) { -#ifndef LBANN_HAS_CUDNN - LBANN_ERROR("cuDNN not detected"); -#else - - // Useful constants - const DataType zero = DataType(0); - const DataType one = DataType(1); - - // Matrices - const auto& kernel = m_weights[0]->get_values(); - const auto& input = (during_forward_prop ? - get_local_prev_activations() : - get_local_prev_error_signals()); - auto& output = (during_forward_prop ? 
- get_local_activations() : - get_local_error_signals()); - - // Do nothing if there is no local data - if (input.Height() < 1 || input.Width() < 1 - || output.Height() < 1 || output.Width() < 1) { - return; - } - - // Initialize GPU workspace - GPUMat workspace; -#ifdef HYDROGEN_HAVE_CUB - workspace.SetMemoryMode(1); -#endif // HYDROGEN_HAVE_CUB - size_t workspace_size = 1 << 30; /// @todo Allocate largest free block - workspace.Resize(workspace_size / sizeof(DataType), 1); - workspace_size = workspace.Height() * sizeof(DataType); - - // Convolution parameters - std::vector input_dims, output_dims; - cudnnTensorDescriptor_t input_desc, output_desc; - if (during_forward_prop) { - input_dims = get_input_dims(); - output_dims = get_output_dims(); - input_desc = m_tensors_cudnn_desc.get_prev_activations(); - output_desc = m_tensors_cudnn_desc.get_activations(); - } - else { - input_dims = get_output_dims(); - output_dims = get_input_dims(); - input_desc = m_tensors_cudnn_desc.get_prev_error_signals(); - output_desc = m_tensors_cudnn_desc.get_error_signals(); - } - - // Perform convolution on the GPU - // Determine convolution algorithm - cudnnConvolutionFwdAlgo_t convolution_cudnn_algorithm - = get_forward_algo_cudnn(input.Width(), input_desc, input.LockedBuffer(), - m_kernel_cudnn_desc, kernel.LockedBuffer(), - m_convolution_cudnn_desc, - output_desc, output.Buffer(), - workspace_size, workspace.Buffer()); - - // Apply convolution - CHECK_CUDNN(cudnnConvolutionForward(cudnn::get_handle(), - &one, - input_desc, - input.LockedBuffer(), - m_kernel_cudnn_desc, - kernel.LockedBuffer(), - m_convolution_cudnn_desc, - convolution_cudnn_algorithm, - workspace.Buffer(), - workspace_size, - &zero, - output_desc, - output.Buffer())); - -#endif // LBANN_HAS_CUDNN - } + void apply_convolution_cudnn(bool during_forward_prop); /** Transposed convolution with cuDNN. */ - void apply_transposed_convolution_cudnn(bool during_forward_prop) { -#ifndef LBANN_HAS_CUDNN - LBANN_ERROR("cuDNN not detected"); -#else - - // Useful constants - const DataType zero = DataType(0); - const DataType one = DataType(1); - - // GPU data - const auto& kernel = m_weights[0]->get_values(); - const auto& input = (during_forward_prop ? - get_local_prev_activations() : - get_local_prev_error_signals()); - auto& output = (during_forward_prop ? 
- get_local_activations() : - get_local_error_signals()); - - // Do nothing if there is no local data - if (input.Height() < 1 || input.Width() < 1 - || output.Height() < 1 || output.Width() < 1) { - return; - } - - // Initialize GPU workspace - // Note: Use CUB GPU memory pool if possible - GPUMat workspace; -#ifdef HYDROGEN_HAVE_CUB - workspace.SetMemoryMode(1); -#endif // HYDROGEN_HAVE_CUB - size_t workspace_size = 1 << 30; /// @todo Allocate largest free block - workspace.Resize(workspace_size / sizeof(DataType), 1); - workspace_size = workspace.Height() * sizeof(DataType); - - // Convolution transpose parameters - std::vector input_dims, output_dims; - cudnnTensorDescriptor_t input_desc, output_desc; - if (during_forward_prop) { - input_dims = get_input_dims(); - output_dims = get_output_dims(); - input_desc = m_tensors_cudnn_desc.get_prev_activations(); - output_desc = m_tensors_cudnn_desc.get_activations(); - } - else { - input_dims = get_output_dims(); - output_dims = get_input_dims(); - input_desc = m_tensors_cudnn_desc.get_prev_error_signals(); - output_desc = m_tensors_cudnn_desc.get_error_signals(); - } - - // Perform transposed convolution on the GPU - // Determine transposed convolution algorithm - cudnnConvolutionBwdDataAlgo_t transposed_convolution_cudnn_algorithm - = get_backward_data_algo_cudnn(input.Width(), - m_kernel_cudnn_desc, kernel.LockedBuffer(), - input_desc, input.LockedBuffer(), - m_convolution_cudnn_desc, - output_desc, output.Buffer(), - workspace_size, workspace.Buffer()); - // Perform transposed convolution - CHECK_CUDNN(cudnnConvolutionBackwardData(cudnn::get_handle(), - &one, - m_kernel_cudnn_desc, - kernel.LockedBuffer(), - input_desc, - input.LockedBuffer(), - m_convolution_cudnn_desc, - transposed_convolution_cudnn_algorithm, - workspace.Buffer(), - workspace_size, - &zero, - output_desc, - output.Buffer())); - - - #endif // LBANN_HAS_CUDNN - } - - void apply_bias_cudnn() { -#ifndef LBANN_HAS_CUDNN - LBANN_ERROR("cuDNN not detected"); -#else - auto& local_output = get_local_activations(); - if (m_bias_scaling_factor != DataType(0) - && local_output.Height() > 0 - && local_output.Width() > 0) { - const DataType one = 1; - const auto& bias = m_weights[1]->get_values(); - CHECK_CUDNN(cudnnAddTensor(cudnn::get_handle(), - &m_bias_scaling_factor, - m_bias_cudnn_desc, - bias.LockedBuffer(), - &one, - m_tensors_cudnn_desc.get_activations(), - local_output.Buffer())); - } - #endif // LBANN_HAS_CUDNN - } - - void compute_gradients_cudnn(bool using_transposed_convolution) { -#ifndef LBANN_HAS_CUDNN - LBANN_ERROR("cuDNN not detected"); -#else + void apply_transposed_convolution_cudnn(bool during_forward_prop); - // Matrices - const auto& local_input = get_local_prev_activations(); - const auto& local_gradient_wrt_output = get_local_prev_error_signals(); - - // Useful constants - const int effective_mini_batch_size = this->m_model->get_effective_mini_batch_size(); - const bool has_local_data = (local_input.Height() > 0 - && local_input.Width() > 0 - && local_gradient_wrt_output.Height() > 0 - && local_gradient_wrt_output.Width() > 0); - - // Compute bias gradient - if (m_bias_scaling_factor != DataType(0) - && m_weights[1]->get_optimizer() != nullptr) { - optimizer* bias_optimizer = m_weights[1]->get_optimizer(); - DataType dst_scale = DataType(0), gradient_scale = DataType(0); - auto& bias_gradient = bias_optimizer->get_gradient_buffer( - dst_scale, gradient_scale, true); - gradient_scale /= effective_mini_batch_size; - if (has_local_data) { - 
CHECK_CUDNN(cudnnConvolutionBackwardBias( - cudnn::get_handle(), - &gradient_scale, - m_tensors_cudnn_desc.get_prev_error_signals(), - local_gradient_wrt_output.LockedBuffer(), - &dst_scale, - m_bias_cudnn_desc, - bias_gradient.Buffer())); - } else { - El::Scale(dst_scale, bias_gradient); - } - } - - // Compute kernel gradient - optimizer* kernel_optimizer = m_weights[0]->get_optimizer(); - if (kernel_optimizer != nullptr) { - DataType dst_scale = DataType(0), gradient_scale = DataType(0); - auto& kernel_gradient = kernel_optimizer->get_gradient_buffer( - dst_scale, gradient_scale, true); - gradient_scale /= effective_mini_batch_size; - if (has_local_data) { - // Initialize GPU workspace - GPUMat workspace; -#ifdef HYDROGEN_HAVE_CUB - workspace.SetMemoryMode(1); // CUB GPU memory pool -#endif // HYDROGEN_HAVE_CUB - size_t workspace_size = 1 << 30; /// @todo Allocate largest free block - workspace.Resize(workspace_size / sizeof(DataType), 1); - workspace_size = workspace.Height() * sizeof(DataType); - - // Initialize cuDNN objects - auto&& input_desc = m_tensors_cudnn_desc.get_prev_activations(); - auto&& gradient_wrt_output_desc = m_tensors_cudnn_desc.get_prev_error_signals(); - - // Determine algorithm and compute kernel gradient - if (using_transposed_convolution) { - cudnnConvolutionBwdFilterAlgo_t kernel_gradient_cudnn_algorithm - = get_backward_filter_algo_cudnn( - local_input.Width(), - gradient_wrt_output_desc, local_gradient_wrt_output.LockedBuffer(), - input_desc, local_input.LockedBuffer(), - m_convolution_cudnn_desc, - m_kernel_cudnn_desc, - workspace_size, workspace.Buffer()); - CHECK_CUDNN(cudnnConvolutionBackwardFilter( - cudnn::get_handle(), - &gradient_scale, - gradient_wrt_output_desc, - local_gradient_wrt_output.LockedBuffer(), - input_desc, - local_input.LockedBuffer(), - m_convolution_cudnn_desc, - kernel_gradient_cudnn_algorithm, - workspace.Buffer(), - workspace_size, - &dst_scale, - m_kernel_cudnn_desc, - kernel_gradient.Buffer())); - } else { - cudnnConvolutionBwdFilterAlgo_t kernel_gradient_cudnn_algorithm - = get_backward_filter_algo_cudnn( - local_input.Width(), - input_desc, local_input.LockedBuffer(), - gradient_wrt_output_desc, local_gradient_wrt_output.LockedBuffer(), - m_convolution_cudnn_desc, - m_kernel_cudnn_desc, - workspace_size, workspace.Buffer()); - CHECK_CUDNN(cudnnConvolutionBackwardFilter( - cudnn::get_handle(), - &gradient_scale, - input_desc, - local_input.LockedBuffer(), - gradient_wrt_output_desc, - local_gradient_wrt_output.LockedBuffer(), - m_convolution_cudnn_desc, - kernel_gradient_cudnn_algorithm, - workspace.Buffer(), - workspace_size, - &dst_scale, - m_kernel_cudnn_desc, - kernel_gradient.Buffer())); - } - } else { - El::Scale(dst_scale, kernel_gradient); - } - } - -#endif // LBANN_HAS_CUDNN - } + void apply_bias_cudnn(); + void compute_gradients_cudnn(bool using_transposed_convolution); /** Convolution with im2col GEMM algorithm. */ - void apply_convolution_im2col(bool during_forward_prop) { - - // Local matrices - const auto& local_kernel = this->m_weights[0]->get_values().LockedMatrix(); - const auto& local_input = (during_forward_prop ? - get_local_prev_activations() : - get_local_prev_error_signals()); - auto& local_output = (during_forward_prop ? 
- get_local_activations() : - get_local_error_signals()); - - // Matrix parameters - const int output_size = local_output.Height(); - const El::Int local_width = local_input.Width(); - std::vector input_dims, output_dims; - if (during_forward_prop) { - input_dims = get_input_dims(); - output_dims = get_output_dims(); - } - else { - input_dims = get_output_dims(); - output_dims = get_input_dims(); - } - const auto& kernel_dims = get_kernel_dims(); - const auto& kernel_size = std::accumulate(kernel_dims.begin(), - kernel_dims.end(), - 1, std::multiplies()); - - // Initialize matrices - const int m = output_size / output_dims[0]; - const int n = output_dims[0]; - const int k = kernel_size / output_dims[0]; - DMat input_col, output_col; - DMat im2col_matrix(k, m); - const DMat kernel_matrix(k, n, local_kernel.LockedBuffer(), k); - - // Iterate through input columns - for (El::Int col = 0; col < local_width; ++col) { - - // Construct im2col matrix from current input column - El::LockedView(input_col, local_input, El::ALL, El::IR(col)); - im2col(input_col, - im2col_matrix, - input_dims[0], - input_dims.size() - 1, - &input_dims[1], - m_pads.data(), - &kernel_dims[2], - m_strides.data()); - - // Apply convolution to current input column - output_col.Attach(m, n, local_output.Buffer(0, col), m); - El::Gemm(El::TRANSPOSE, El::NORMAL, - DataType(1), im2col_matrix, kernel_matrix, - DataType(0), output_col); - - } - - } + void apply_convolution_im2col(bool during_forward_prop); /** Transposed convolution with im2col GEMM algorithm. */ - void apply_transposed_convolution_im2col(bool during_forward_prop) { - - // Local matrices - const auto& local_kernel = this->m_weights[0]->get_values().LockedMatrix(); - const auto& local_input = (during_forward_prop ? - get_local_prev_activations() : - get_local_prev_error_signals()); - DMat& local_output = (during_forward_prop ? 
- get_local_activations() : - get_local_error_signals()); + void apply_transposed_convolution_im2col(bool during_forward_prop); - // Matrix parameters - const int input_size = local_input.Height(); - const El::Int local_width = local_input.Width(); - std::vector input_dims, output_dims; - if (during_forward_prop) { - input_dims = get_input_dims(); - output_dims = get_output_dims(); - } - else { - input_dims = get_output_dims(); - output_dims = get_input_dims(); - } - const auto& kernel_dims = get_kernel_dims(); - const auto& kernel_size = std::accumulate(kernel_dims.begin(), - kernel_dims.end(), - 1, std::multiplies()); + void apply_bias_cpu(); - // Initialize matrices - const int m = kernel_size / input_dims[0]; - const int n = input_size / input_dims[0]; - const int k = input_dims[0]; - DMat input_col, output_col; - DMat im2col_matrix(m, n); - const DMat kernel_matrix(m, k, local_kernel.LockedBuffer(), m); - - // Iterate through input columns - for (El::Int col = 0; col < local_width; ++col) { - - // Apply transposed convolution to current input column - input_col.LockedAttach(n, k, local_input.LockedBuffer(0, col), n); - El::Gemm(El::NORMAL, El::TRANSPOSE, - DataType(1), kernel_matrix, input_col, - DataType(0), im2col_matrix); - - // Perform col2im to accumulate contributions from each kernel - // position - El::View(output_col, local_output, El::ALL, El::IR(col)); - col2im(im2col_matrix, - output_col, - output_dims[0], - output_dims.size() - 1, - &output_dims[1], - m_pads.data(), - &kernel_dims[2], - m_strides.data()); - - } - - } - - void apply_bias_cpu() { - - // Return immediately if there is no bias - if (m_bias_scaling_factor == DataType(0)) return; - - // Local matrices - const auto& local_bias = m_weights[1]->get_values().LockedMatrix(); - auto& local_output = get_local_activations(); - - // Matrix parameters - const El::Int local_width = local_output.Width(); - const auto& output_dims = get_output_dims(); - const El::Int num_output_channels = output_dims[0]; - const El::Int num_per_output_channel = get_output_size() / num_output_channels; - - // Apply bias to each output channel - LBANN_OMP_PARALLEL_FOR - for (El::Int channel = 0; channel < num_output_channels; ++channel) { - const El::Int row_start = channel * num_per_output_channel; - const El::Int row_end = (channel+1) * num_per_output_channel; - const DataType bias_term = m_bias_scaling_factor * local_bias(channel, 0); - for (El::Int col = 0; col < local_width; ++col) { - for (El::Int row = row_start; row < row_end; ++row) { - local_output(row, col) += bias_term; - } - } - } - - } - - void compute_gradients_im2col(bool using_transposed_convolution) { - - // Local matrices - const DMat& local_input = get_local_prev_activations(); - const DMat& local_gradient_wrt_output = get_local_prev_error_signals(); - const bool has_local_data = (!local_input.IsEmpty() - && !local_gradient_wrt_output.IsEmpty()); - - // Get convolution parameters - const El::Int local_width = local_input.Width(); - const auto& input_dims = get_input_dims(); - const auto& output_dims = get_output_dims(); - const int num_input_channels = input_dims[0]; - const int num_output_channels = output_dims[0]; - const int num_per_output_channel = get_output_size() / num_output_channels; - const int effective_mini_batch_size = this->m_model->get_effective_mini_batch_size(); - const auto& kernel_dims = get_kernel_dims(); - const auto& kernel_size = std::accumulate(kernel_dims.begin(), - kernel_dims.end(), - 1, std::multiplies()); - - // Compute bias gradient - // Note: 
Sum is computed with Kahan summation - if (m_bias_scaling_factor != DataType(0) - && this->m_weights[1]->get_optimizer() != nullptr) { - optimizer* bias_optimizer = this->m_weights[1]->get_optimizer(); - DataType dst_scale = DataType(0), gradient_scale = DataType(0); - auto& bias_gradient = bias_optimizer->get_gradient_buffer( - dst_scale, gradient_scale, true); - gradient_scale /= effective_mini_batch_size; - if (has_local_data) { - auto& local_bias_gradient = bias_gradient.Matrix(); - LBANN_OMP_PARALLEL_FOR - for (int channel = 0; channel < num_output_channels; ++channel) { - const El::Int row_start = channel * num_per_output_channel; - const El::Int row_end = (channel+1) * num_per_output_channel; - DataType sum = 0; - DataType correction = 0; - for (El::Int col = 0; col < local_width; ++col) { - for (El::Int row = row_start; row < row_end; ++row) { - DataType term = local_gradient_wrt_output(row, col); - term += correction; - const DataType next_sum = sum + term; - correction = term - (next_sum - sum); - sum = next_sum; - } - } - local_bias_gradient(channel, 0) = dst_scale*local_bias_gradient(channel, 0) - + gradient_scale*sum; - } - } else { - El::Scale(dst_scale, bias_gradient); - } - } - - // Stop early if kernel is not being optimized - optimizer* kernel_optimizer = this->m_weights[0]->get_optimizer(); - if (kernel_optimizer == nullptr) { return; } - - // Initialize matrices - const int m = (using_transposed_convolution ? - kernel_size / num_input_channels : - kernel_size / num_output_channels); - const int n = (using_transposed_convolution ? - num_input_channels : - num_output_channels); - const int k = (using_transposed_convolution ? - get_input_size() / num_input_channels : - get_output_size() / num_output_channels); - DataType dst_scale = 0, gradient_scale = 0; - auto& kernel_gradient = kernel_optimizer->get_gradient_buffer( - dst_scale, gradient_scale, true); - El::Scale(dst_scale, kernel_gradient); - gradient_scale /= effective_mini_batch_size; - DMat im2col_matrix(m, k); - DMat kernel_gradient_matrix(m, n, kernel_gradient.Buffer(), m); - - // Compute kernel gradient contributions from each data sample - for (El::Int col = 0; col < local_width; ++col) { - if (using_transposed_convolution) { - const DMat input_col(k, n, local_input.LockedBuffer(0,col), k); - const DMat gradient_wrt_output_col = - El::LockedView(local_gradient_wrt_output, El::ALL, El::IR(col)); - im2col(gradient_wrt_output_col, - im2col_matrix, - num_output_channels, - output_dims.size() - 1, - &output_dims[1], - m_pads.data(), - &kernel_dims[2], - m_strides.data()); - El::Gemm(El::NORMAL, El::NORMAL, - gradient_scale, im2col_matrix, input_col, - DataType(1), kernel_gradient_matrix); - } - else { - const DMat input_col - = El::LockedView(local_input, El::ALL, El::IR(col)); - const DMat gradient_wrt_output_col(k, n, local_gradient_wrt_output.LockedBuffer(0,col), k); - im2col(input_col, - im2col_matrix, - num_input_channels, - input_dims.size() - 1, - &input_dims[1], - m_pads.data(), - &kernel_dims[2], - m_strides.data()); - El::Gemm(El::NORMAL, El::NORMAL, - gradient_scale, im2col_matrix, gradient_wrt_output_col, - DataType(1), kernel_gradient_matrix); - } - } - - } + void compute_gradients_im2col(bool using_transposed_convolution); private: @@ -1023,155 +204,37 @@ class base_convolution_layer : public Layer { /** Copy convolution kernel cuDNN descriptor. 
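// --- Illustrative aside (not part of the patch above) ---
// The bias-gradient loop in the removed compute_gradients_im2col uses Kahan
// (compensated) summation. A minimal standalone sketch of the same
// compensation scheme, assuming a plain std::vector<float> input:
#include <vector>
float kahan_sum(const std::vector<float>& vals) {
  float sum = 0.0f;
  float correction = 0.0f;  // running compensation for lost low-order bits
  for (const float v : vals) {
    const float term = v + correction;       // fold the correction back in
    const float next_sum = sum + term;
    correction = term - (next_sum - sum);    // recover what the add dropped
    sum = next_sum;
  }
  return sum;
}
// --- end aside ---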
*/ static void copy_kernel_cudnn_desc(const cudnnFilterDescriptor_t& src, - cudnnFilterDescriptor_t& dst) { - - // Create or destroy descriptor if needed - if(src != nullptr && dst == nullptr) { - CHECK_CUDNN(cudnnCreateFilterDescriptor(&dst)); - } - else if(src == nullptr && dst != nullptr) { - CHECK_CUDNN(cudnnDestroyFilterDescriptor(dst)); - dst = nullptr; - } - - // Copy descriptor data if needed - if(src != nullptr) { - cudnnDataType_t data_type; - cudnnTensorFormat_t format; - int num_dims; - std::vector dims(1); - CHECK_CUDNN(cudnnGetFilterNdDescriptor(src, - dims.size(), - &data_type, - &format, - &num_dims, - dims.data())); - dims.resize(num_dims); - CHECK_CUDNN(cudnnGetFilterNdDescriptor(src, - num_dims, - &data_type, - &format, - &num_dims, - dims.data())); - CHECK_CUDNN(cudnnSetFilterNdDescriptor(dst, - data_type, - format, - num_dims, - dims.data())); - } - - } - + cudnnFilterDescriptor_t& dst); /** Copy convolution cuDNN descriptor. */ - static void copy_convolution_cudnn_desc(const cudnnConvolutionDescriptor_t& src, - cudnnConvolutionDescriptor_t& dst) { - - // Create or destroy descriptor if needed - if(src != nullptr && dst == nullptr) { - CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&dst)); - } - else if(src == nullptr && dst != nullptr) { - CHECK_CUDNN(cudnnDestroyConvolutionDescriptor(dst)); - dst = nullptr; - } - - // Copy descriptor data if needed - if(src != nullptr) { - cudnnConvolutionMode_t mode; - cudnnDataType_t data_type; - int num_dims; - CHECK_CUDNN(cudnnGetConvolutionNdDescriptor(src, - 0, - &num_dims, - nullptr, - nullptr, - nullptr, - &mode, - &data_type)); - std::vector pads(num_dims), strides(num_dims), dilations(num_dims); - CHECK_CUDNN(cudnnGetConvolutionNdDescriptor(src, - num_dims, - &num_dims, - pads.data(), - strides.data(), - dilations.data(), - &mode, - &data_type)); - int num_groups; - CHECK_CUDNN(cudnnGetConvolutionGroupCount(src, - &num_groups)); - CHECK_CUDNN(cudnnSetConvolutionNdDescriptor(dst, - num_dims, - pads.data(), - strides.data(), - dilations.data(), - mode, - data_type)); - CHECK_CUDNN(cudnnSetConvolutionGroupCount(dst, - num_groups)); - } - - } + static void copy_convolution_cudnn_desc( + const cudnnConvolutionDescriptor_t& src, + cudnnConvolutionDescriptor_t& dst); /** Get the cuDNN algorithm to use for forward prop. */ cudnnConvolutionFwdAlgo_t get_forward_algo_cudnn( const int local_mini_batch_size, const cudnnTensorDescriptor_t& input_desc, - const DataType* input, + const TensorDataType* input, const cudnnFilterDescriptor_t& kernel_desc, - const DataType* kernel, + const TensorDataType* kernel, const cudnnConvolutionDescriptor_t& conv_desc, const cudnnTensorDescriptor_t& output_desc, - DataType* output, + TensorDataType* output, size_t ws_size, - DataType* ws) { - if (m_fwd_cudnn_algos.count(local_mini_batch_size) == 0) { -#ifdef LBANN_DETERMINISTIC - bool deterministic = true; -#else - bool deterministic = false; -#endif - m_fwd_cudnn_algos[local_mini_batch_size] = - cudnn::get_fwd_algorithm( - true, deterministic, - input_desc, input, - kernel_desc, kernel, - conv_desc, - output_desc, output, - ws_size, ws); - } - return m_fwd_cudnn_algos[local_mini_batch_size]; - } + TensorDataType* ws); /** Get the cuDNN algorithm to use for backward-data. 
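// --- Illustrative aside (not part of the patch above) ---
// The get_*_algo_cudnn helpers memoize the algorithm chosen by cuDNN's
// autotuner, keyed on the local mini-batch size, so the expensive search runs
// once per distinct batch size. A minimal sketch of that caching pattern with
// hypothetical types (the real code stores the result in m_fwd_cudnn_algos etc.):
#include <functional>
#include <unordered_map>
template <typename Algo>
class algo_cache {
public:
  // 'search' stands in for the cuDNN autotuning call.
  Algo get(int local_mini_batch_size, const std::function<Algo()>& search) {
    auto it = m_cache.find(local_mini_batch_size);
    if (it == m_cache.end()) {
      it = m_cache.emplace(local_mini_batch_size, search()).first;
    }
    return it->second;
  }
private:
  std::unordered_map<int, Algo> m_cache;
};
// --- end aside ---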
*/ cudnnConvolutionBwdDataAlgo_t get_backward_data_algo_cudnn( const int local_mini_batch_size, const cudnnFilterDescriptor_t& kernel_desc, - const DataType* kernel, + const TensorDataType* kernel, const cudnnTensorDescriptor_t& prev_error_signal_desc, - const DataType* prev_error_signal, + const TensorDataType* prev_error_signal, const cudnnConvolutionDescriptor_t& conv_desc, const cudnnTensorDescriptor_t& error_signal_desc, - DataType* error_signal, + TensorDataType* error_signal, size_t ws_size, - DataType* ws) { - if (m_bwd_data_cudnn_algos.count(local_mini_batch_size) == 0) { -#ifdef LBANN_DETERMINISTIC - bool deterministic = true; -#else - bool deterministic = false; -#endif - m_bwd_data_cudnn_algos[local_mini_batch_size] = - cudnn::get_bwd_data_algorithm( - true, deterministic, - kernel_desc, kernel, - prev_error_signal_desc, prev_error_signal, - conv_desc, - error_signal_desc, error_signal, - ws_size, ws); - } - return m_bwd_data_cudnn_algos[local_mini_batch_size]; - } + TensorDataType* ws); /** * Get the cuDNN algorithm to use for backward-filter. @@ -1180,42 +243,24 @@ class base_convolution_layer : public Layer { cudnnConvolutionBwdFilterAlgo_t get_backward_filter_algo_cudnn( const int local_mini_batch_size, const cudnnTensorDescriptor_t& input_desc, - const DataType* input, + const TensorDataType* input, const cudnnTensorDescriptor_t& prev_error_signal_desc, - const DataType* prev_error_signal, + const TensorDataType* prev_error_signal, const cudnnConvolutionDescriptor_t& conv_desc, const cudnnFilterDescriptor_t& kernel_gradient_desc, size_t ws_size, - DataType* ws) { - if (m_bwd_filter_cudnn_algos.count(local_mini_batch_size) == 0) { -#ifdef LBANN_DETERMINISTIC - bool deterministic = true; -#else - bool deterministic = false; -#endif - // Temporary filter gradient buffer. - GPUMat kernel_gradient; -#ifdef HYDROGEN_HAVE_CUB - kernel_gradient.SetMemoryMode(1); -#endif - kernel_gradient.Resize(this->m_weights[0]->get_matrix_height(), - this->m_weights[0]->get_matrix_width()); - m_bwd_filter_cudnn_algos[local_mini_batch_size] = - cudnn::get_bwd_filter_algorithm( - true, deterministic, - input_desc, input, - prev_error_signal_desc, prev_error_signal, - conv_desc, - kernel_gradient_desc, kernel_gradient.Buffer(), - ws_size, ws); - } - return m_bwd_filter_cudnn_algos[local_mini_batch_size]; - } - + TensorDataType* ws); #endif // LBANN_HAS_CUDNN +#ifdef LBANN_HAS_DISTCONV + friend class base_convolution_adapter; + protected: + using BaseConvAdapterType = base_convolution_adapter; + void setup_distconv_adapter() override; + BaseConvAdapterType& get_distconv_adapter() override; + const BaseConvAdapterType& get_distconv_adapter() const override; +#endif // LBANN_HAS_DISTCONV }; } // namespace lbann - #endif // LBANN_LAYERS_LEARNING_BASE_CONVOLUTION_HPP_INCLUDED diff --git a/include/lbann/layers/learning/channelwise_fully_connected.hpp b/include/lbann/layers/learning/channelwise_fully_connected.hpp new file mode 100644 index 00000000000..928ef732da2 --- /dev/null +++ b/include/lbann/layers/learning/channelwise_fully_connected.hpp @@ -0,0 +1,115 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. 
+// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_LEARNING_CHANNELWISE_FULLY_CONNECTED_HPP_INCLUDED +#define LBANN_LAYERS_LEARNING_CHANNELWISE_FULLY_CONNECTED_HPP_INCLUDED + +#include "lbann/layers/data_type_layer.hpp" + +namespace lbann { + +/** @brief Apply affine transformation to tensor channels. + * + * The input tensor is sliced along the first tensor dimension (the + * "channel" dimension for image data in CHW format) and the same + * affine transformation is applied to each slice. Following a + * row-vector convention: + * @f[ y(i,*) = \text{vec}( x(i,*) ) W^T + b @f] + * + * Two weights are required if bias is applied: the linearity and the + * bias. Only the linearity weights are required if bias is not + * applied. If weights aren't provided, the linearity weights are + * initialized with He normal initialization and the bias weights are + * initialized to zero. + * + */ +template +class channelwise_fully_connected_layer + : public data_type_layer { + + static_assert(Layout == data_layout::DATA_PARALLEL, + "channelwise_fully_connected layer " + "only supports data parallel layout"); + +public: + + /** @param comm LBANN communicator. + * @param output_channel_dims Output tensor dimensions, + * excluding the first dimension. + * @param bias Whether to apply bias. + * @param transpose Whether to apply transpose of + * weights matrix. + */ + channelwise_fully_connected_layer( + lbann_comm* comm, + std::vector output_channel_dims, + bool bias, + bool transpose); + + channelwise_fully_connected_layer( + const channelwise_fully_connected_layer& other) = default; + channelwise_fully_connected_layer& operator=( + const channelwise_fully_connected_layer& other) = default; + ~channelwise_fully_connected_layer() = default; + + channelwise_fully_connected_layer* copy() const override; + std::string get_type() const override; + data_layout get_data_layout() const override; + El::Device get_device_allocation() const override; + description get_description() const override; + +protected: + + void setup_dims(DataReaderMetaData& dr_metadata) override; + void setup_data(size_t max_mini_batch_size) override; + + void fp_compute() override; + void bp_compute() override; + +private: + + /** Whether to apply bias. */ + bool m_has_bias; + /** Whether to transpose linearity. 
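// --- Illustrative aside (not part of the patch above) ---
// The channel-wise fully-connected layer applies the same affine map to every
// channel slice, y(i,*) = vec(x(i,*)) W^T + b. A naive reference sketch with
// hypothetical raw-pointer arguments (channels x in_size input and
// channels x out_size output, both row-major); not the layer's actual kernel:
#include <cstddef>
void channelwise_affine(const float* x, const float* W, const float* b,
                        float* y, std::size_t channels,
                        std::size_t in_size, std::size_t out_size) {
  for (std::size_t c = 0; c < channels; ++c) {          // same W, b for every channel
    for (std::size_t o = 0; o < out_size; ++o) {
      float acc = b ? b[o] : 0.0f;                      // bias is optional
      for (std::size_t i = 0; i < in_size; ++i) {
        acc += x[c * in_size + i] * W[o * in_size + i]; // row o of W
      }
      y[c * out_size + o] = acc;
    }
  }
}
// --- end aside ---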
*/ + bool m_transpose; + +}; + +// Builder function +LBANN_DEFINE_LAYER_BUILDER(channelwise_fully_connected); + +// Explicit template instantiation +#ifndef LBANN_CHANNELWISE_FULLY_CONNECTED_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class channelwise_fully_connected_layer< \ + T, data_layout::DATA_PARALLEL, Device> +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_CHANNELWISE_FULLY_CONNECTED_LAYER_INSTANTIATE + +} // namespace lbann + +#endif // LBANN_LAYERS_LEARNING_CHANNELWISE_FULLY_CONNECTED_HPP_INCLUDED diff --git a/include/lbann/layers/learning/channelwise_scale_bias.hpp b/include/lbann/layers/learning/channelwise_scale_bias.hpp new file mode 100644 index 00000000000..6d38b6b19b8 --- /dev/null +++ b/include/lbann/layers/learning/channelwise_scale_bias.hpp @@ -0,0 +1,196 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYER_LEARNING_CHANNELWISE_SCALE_BIAS_HPP_INCLUDED +#define LBANN_LAYER_LEARNING_CHANNELWISE_SCALE_BIAS_HPP_INCLUDED + +#include "lbann/layers/data_type_layer.hpp" +#include "lbann/models/model.hpp" +#include "lbann/utils/exception.hpp" + +namespace lbann { + +/** @brief Apply scale and bias to tensor channels. + * + * The input tensor is sliced along the first tensor dimension (the + * "channel" dimension, assuming image data in CHW format) and scale + * and bias terms are applied independently to each slice. More + * precisely, given input and output tensors + * @f$ X,Y\in\mathbb{R}^{d_1\times\cdots\times d_n} @f$ + * and scale and bias vectors @f$ a,b\in\mathbb{R}^{d_1} @f$: + * @f[ + * Y_{i,j,\cdots} = a_i X_{i,j,\cdots} + b_i + * @f] + * + * The scale and bias vectors are fused into a single weights tensor + * to reduce the number of gradient allreduces during backprop. In + * particular, the weights tensor is a + * @f$ \text{num\_channels} \times 2 @f$ matrix, where the first + * column correspond to scale terms and the second column to bias + * terms. + */ +template +class channelwise_scale_bias_layer : public data_type_layer { + static_assert(Layout == data_layout::DATA_PARALLEL, + "channelwise_mean_layer only supports " + "data-parallel data layout"); +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. 
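// --- Illustrative aside (not part of the patch above) ---
// Channel-wise scale/bias applies Y[c][k] = a[c] * X[c][k] + b[c], with a and
// b fused into one num_channels x 2 weights matrix (first column scale,
// second column bias). A naive reference sketch with hypothetical raw-pointer
// arguments; the column-major layout of the fused weights is an assumption:
#include <cstddef>
void channelwise_scale_bias_ref(const float* x, const float* scale_bias,
                                float* y, std::size_t num_channels,
                                std::size_t channel_size) {
  for (std::size_t c = 0; c < num_channels; ++c) {
    const float a = scale_bias[c];                 // column 0: scale
    const float b = scale_bias[num_channels + c];  // column 1: bias
    for (std::size_t k = 0; k < channel_size; ++k) {
      y[c * channel_size + k] = a * x[c * channel_size + k] + b;
    }
  }
}
// --- end aside ---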
*/ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The concrete weights type used by this object. */ + using WeightsType = data_type_weights; + + /** @brief The concrete optimizer type used by this object. */ + using OptimizerType = data_type_optimizer; + + ///@} + +public: + + channelwise_scale_bias_layer(lbann_comm *comm); + channelwise_scale_bias_layer(const channelwise_scale_bias_layer& other); + channelwise_scale_bias_layer& operator=( + const channelwise_scale_bias_layer& other); + + channelwise_scale_bias_layer* copy() const override { + return new channelwise_scale_bias_layer(*this); + } + + std::string get_type() const override { return "channel-wise scale/bias"; } + data_layout get_data_layout() const override { return Layout; } + El::Device get_device_allocation() const override { return Device; } + + void setup_matrices(const El::Grid& grid) override; + void setup_data(size_t max_mini_batch_size) override; + +protected: + + void fp_compute() override; + void bp_compute() override; + +private: + + /** @brief Objective function gradient w.r.t. weights. */ + std::unique_ptr m_weights_gradient; + +}; + +// Implementation +template +channelwise_scale_bias_layer +::channelwise_scale_bias_layer(lbann_comm *comm) + : data_type_layer(comm) +{} + +template +channelwise_scale_bias_layer +::channelwise_scale_bias_layer(const channelwise_scale_bias_layer& other) + : data_type_layer(other), + m_weights_gradient(other.m_weights_gradient + ? other.m_weights_gradient->Copy() + : nullptr) +{} + +template +auto channelwise_scale_bias_layer +::operator=(const channelwise_scale_bias_layer& other) + -> channelwise_scale_bias_layer& { + data_type_layer::operator=(other); + m_weights_gradient.reset(other.m_weights_gradient + ? other.m_weights_gradient->Copy() + : nullptr); + return *this; +} + +template +void channelwise_scale_bias_layer +::setup_matrices(const El::Grid& grid) { + data_type_layer::setup_matrices(grid); + m_weights_gradient.reset(new StarMatDT(grid)); +} + +template +void channelwise_scale_bias_layer +::setup_data(size_t max_mini_batch_size) { + data_type_layer::setup_data(max_mini_batch_size); + const El::Int num_channels = this->get_output_dims()[0]; + + // Construct default weights if needed + // Note: Scale is initialized to 1 and bias to 0 + if (!this->has_weights()) { + auto w = make_unique(this->get_comm()); + std::vector vals(2*num_channels, + El::TypeTraits::Zero()); + std::fill(vals.begin(), vals.begin()+num_channels, + El::TypeTraits::One()); + auto init = make_unique>(vals); + auto opt = this->m_model->template create_optimizer(); + w->set_name(this->get_name() + "_weights"); + w->set_initializer(std::move(init)); + w->set_optimizer(std::move(opt)); + this->add_weights(w.get()); + this->m_model->add_weights(std::move(w)); + } + if (this->num_weights() != 1) { + LBANN_ERROR("attempted to setup ", + this->get_type()," layer \"",this->get_name(),"\" ", + "with an invalid number of weights ", + "(expected 1, found ",this->num_weights(),")"); + } + + // Setup weights + auto dist = this->get_prev_activations().DistData(); + dist.colDist = El::STAR; + dist.rowDist = El::STAR; + this->get_weights(0).set_dims({static_cast(num_channels)}, {2}); + this->get_weights(0).set_matrix_distribution(dist); + + // Setup gradient w.r.t. 
weights + m_weights_gradient->AlignWith(dist); + m_weights_gradient->Resize(num_channels, 2); +} + +LBANN_DEFINE_LAYER_BUILDER(channelwise_scale_bias); + +#ifndef LBANN_CHANNELWISE_SCALE_BIAS_LAYER_INSTANTIATE + +#define PROTO_DEVICE(T, Device) \ + extern template class channelwise_scale_bias_layer; + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +#endif // LBANN_CHANNELWISE_SCALE_BIAS_LAYER_INSTANTIATE + + +} // namespace lbann + +#endif // LBANN_LAYER_LEARNING_CHANNELWISE_SCALE_BIAS_HPP_INCLUDED diff --git a/include/lbann/layers/learning/convolution.hpp b/include/lbann/layers/learning/convolution.hpp index 9a7cf276a5d..19fb2daf248 100644 --- a/include/lbann/layers/learning/convolution.hpp +++ b/include/lbann/layers/learning/convolution.hpp @@ -29,20 +29,51 @@ #include "lbann/layers/learning/base_convolution.hpp" #include "lbann/utils/exception.hpp" +#include "lbann/utils/distconv.hpp" namespace lbann { +// Forward declaration. +namespace callback { +class imcomm; +} + +#ifdef LBANN_HAS_DISTCONV +template +class convolution_distconv_adapter + : public base_convolution_adapter { +public: + using TensorDevType = typename base_convolution_adapter::TensorDevType; + + convolution_distconv_adapter(Layer& layer) + : base_convolution_adapter(layer) + {} + virtual ~convolution_distconv_adapter() = default; + + void setup_distributions(tensor_overlap_constraints &constraints) override; + void setup_layer(size_t workspace_capacity) override; + dc::Shape get_activations_local_shape(int index=0) const override; +}; +#endif // LBANN_HAS_DISTCONV + /** @brief Standard deep learning convolution. * * Applies convolution (more precisely, cross-correlation) to input * tensors. This is primarily optimized for image data in NCHW * format. */ -template -class convolution_layer : public base_convolution_layer { +template +class convolution_layer + : public base_convolution_layer { + + static_assert(Layout == data_layout::DATA_PARALLEL, + "convolution layer only supports DATA_PARALLEL"); + private: - friend class lbann_callback_imcomm; + friend class callback::imcomm; public: @@ -54,16 +85,7 @@ class convolution_layer : public base_convolution_layer { int stride, int dilation, int groups, - bool has_bias = true) - : convolution_layer(comm, - num_data_dims, - num_output_channels, - std::vector(num_data_dims, conv_dim), - std::vector(num_data_dims, pad), - std::vector(num_data_dims, stride), - std::vector(num_data_dims, dilation), - groups, - has_bias) {} + bool has_bias = true); convolution_layer(lbann_comm *comm, int num_data_dims, @@ -73,21 +95,7 @@ class convolution_layer : public base_convolution_layer { std::vector strides, std::vector dilations, int groups, - bool has_bias = true) - : base_convolution_layer( - comm, - num_data_dims, - num_output_channels, - std::move(conv_dims), - std::move(pads), - std::move(strides), - std::move(dilations), - groups, - has_bias) { - static_assert(Layout == data_layout::DATA_PARALLEL, - "convolution layer only supports DATA_PARALLEL"); - - } + bool has_bias = true); convolution_layer* copy() const override { return new convolution_layer(*this); } @@ -99,62 +107,32 @@ class convolution_layer : public base_convolution_layer { protected: - void setup_dims() override { - base_convolution_layer::setup_dims(); - - // Get tensor dimensions - const auto& input_dims = this->get_input_dims(); - auto output_dims = input_dims; - - // Initialize output tensor dimensions - output_dims[0] = this->m_output_channels; - for (size_t i = 0; i < output_dims.size() - 1; 
++i) { - const auto& input_dim = input_dims[i+1]; - const auto& kernel_dim = this->m_conv_dims[i]; - const auto& stride = this->m_strides[i]; - const auto& pad = this->m_pads[i]; - const auto& dilation = this->m_dilations[i]; - const auto& effective_dim = (input_dim - + 2 * pad - - dilation * (kernel_dim-1)); - output_dims[i+1] = (effective_dim + stride - 1) / stride; - } - this->set_output_dims(output_dims); - - } - - std::vector get_kernel_dims() const { - std::vector dims; - dims.push_back(this->m_output_channels); - dims.push_back(this->get_input_dims()[0] / this->m_groups); - dims.insert(dims.end(), - this->m_conv_dims.begin(), - this->m_conv_dims.end()); - return dims; - } - - void fp_compute() override { - if(this->using_gpus()) { - base_convolution_layer::apply_convolution_cudnn(true); - base_convolution_layer::apply_bias_cudnn(); - } else { - base_convolution_layer::apply_convolution_im2col(true); - base_convolution_layer::apply_bias_cpu(); - } - } - - void bp_compute() override { - if(this->using_gpus()) { - base_convolution_layer::compute_gradients_cudnn(false); - base_convolution_layer::apply_transposed_convolution_cudnn(false); - } else { - base_convolution_layer::compute_gradients_im2col(false); - base_convolution_layer::apply_transposed_convolution_im2col(false); - } - } - + void setup_dims(DataReaderMetaData& dr_metadata) override; + std::vector get_kernel_dims() const override; + void fp_compute() override; + void bp_compute() override; + +#ifdef LBANN_HAS_DISTCONV + friend class convolution_distconv_adapter; + protected: + void setup_distconv_adapter() override; + bool is_distconv_supported() const override; +#endif // LBANN_HAS_DISTCONV }; +// Builder function +LBANN_DEFINE_LAYER_BUILDER(convolution); + +#ifndef LBANN_CONVOLUTION_LAYER_INSTANTIATE + +#define PROTO_DEVICE(T, Device) \ + extern template class convolution_layer; + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +#endif // LBANN_CONVOLUTION_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_LEARNING_CONVOLUTION_HPP_INCLUDED diff --git a/include/lbann/layers/learning/deconvolution.hpp b/include/lbann/layers/learning/deconvolution.hpp index f3c1f7bdd9e..6ebce704f9b 100644 --- a/include/lbann/layers/learning/deconvolution.hpp +++ b/include/lbann/layers/learning/deconvolution.hpp @@ -28,19 +28,38 @@ #define LBANN_LAYERS_LEARNING_DECONVOLUTION_HPP_INCLUDED #include "lbann/layers/learning/base_convolution.hpp" -#include "lbann/utils/exception.hpp" +#include "lbann/utils/distconv.hpp" namespace lbann { // Forward declaration. -class lbann_callback_imcomm; +namespace callback { +class imcomm; +} + +#ifdef LBANN_HAS_DISTCONV +template +class deconvolution_distconv_adapter: public base_convolution_adapter { + public: + using TensorDevType = typename base_convolution_adapter::TensorDevType; + + deconvolution_distconv_adapter(Layer& layer): base_convolution_adapter(layer) {} + virtual ~deconvolution_distconv_adapter() = default; + + void setup_distributions(tensor_overlap_constraints &constraints) override; + void setup_layer(size_t workspace_capacity) override; + dc::Shape get_activations_local_shape(int index=0) const override; +}; +#endif // LBANN_HAS_DISTCONV /** @brief Transpose of the convolution layer. 
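// --- Illustrative aside (not part of the patch above) ---
// The setup_dims logic moved out of these headers computes output shapes per
// dimension as in the removed code: convolution uses the ceiling of the
// effective extent over the stride, and the transposed (deconvolution) case
// inverts it. A standalone sketch of both formulas, assuming unit dilation
// for the transposed case as the removed code does:
inline int conv_output_dim(int input_dim, int kernel_dim, int pad,
                           int stride, int dilation) {
  const int effective_dim = input_dim + 2 * pad - dilation * (kernel_dim - 1);
  return (effective_dim + stride - 1) / stride;   // ceil(effective_dim / stride)
}
inline int deconv_output_dim(int input_dim, int kernel_dim, int pad, int stride) {
  return (input_dim - 1) * stride + kernel_dim - 2 * pad;
}
// --- end aside ---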
*/ -template -class deconvolution_layer : public base_convolution_layer { +template +class deconvolution_layer : public base_convolution_layer { + static_assert(Layout == data_layout::DATA_PARALLEL, + "deconvolution layer only supports DATA_PARALLEL"); private: - friend class lbann_callback_imcomm; + friend class callback::imcomm; public: @@ -52,16 +71,7 @@ class deconvolution_layer : public base_convolution_layer { int stride, int dilation, int groups, - bool has_bias = true) - : deconvolution_layer(comm, - num_data_dims, - num_output_channels, - std::vector(num_data_dims, conv_dim), - std::vector(num_data_dims, pad), - std::vector(num_data_dims, stride), - std::vector(num_data_dims, dilation), - groups, - has_bias) {} + bool has_bias = true); deconvolution_layer(lbann_comm *comm, int num_data_dims, @@ -71,108 +81,43 @@ class deconvolution_layer : public base_convolution_layer { std::vector strides, std::vector dilations, int groups, - bool has_bias = true) - : base_convolution_layer( - comm, - num_data_dims, - num_output_channels, - std::move(conv_dims), - std::move(pads), - std::move(strides), - std::move(dilations), - groups, - has_bias) { - static_assert(Layout == data_layout::DATA_PARALLEL, - "deconvolution layer only supports DATA_PARALLEL"); + bool has_bias = true); + deconvolution_layer* copy() const override { + return new deconvolution_layer(*this); } - deconvolution_layer* copy() const override { return new deconvolution_layer(*this); } - std::string get_type() const override { return "deconvolution"; } data_layout get_data_layout() const override { return Layout; } El::Device get_device_allocation() const override { return Device; } - void setup_dims() override { - base_convolution_layer::setup_dims(); - std::stringstream err; - - // Get tensor dimensions - const auto& input_dims = this->get_input_dims(); - auto output_dims = input_dims; - - // Check for unsupported features - /// @todo Implement dilated and grouped deconvolution - if (std::any_of(this->m_dilations.begin(), - this->m_dilations.end(), - [] (int d) { return d != 1; })) { - err << this->get_type() << " layer " - << "\"" << this->get_name() << "\" " - << "has non-unit dilations ("; - for (size_t i = 0; i < this->m_dilations.size(); ++i) { - err << (i > 0 ? 
", " : "") << this->m_dilations[i]; - } - err << ")"; - LBANN_ERROR(err.str()); - } - if (this->m_groups != 1) { - err << this->get_type() << " layer " - << "\"" << this->get_name() << "\" " - << "has non-unit groups " - << "(" << this->m_groups << ")"; - LBANN_ERROR(err.str()); - } - - // Initialize output tensor dimensions - /// @todo Dilated deconvolution - output_dims[0] = this->m_output_channels; - for (size_t i = 0; i < output_dims.size() - 1; ++i) { - const auto& input_dim = input_dims[i+1]; - const auto& kernel_dim = this->m_conv_dims[i]; - const auto& stride = this->m_strides[i]; - const auto& pad = this->m_pads[i]; - // const auto& dilation = this->m_dilations[i]; - output_dims[i+1] = (input_dim-1) * stride + kernel_dim - 2 * pad; - } - this->set_output_dims(output_dims); - - } + void setup_dims(DataReaderMetaData& dr_metadata) override; protected: - std::vector get_kernel_dims() const { - std::vector dims; - dims.push_back(this->get_input_dims()[0]); - dims.push_back(this->m_output_channels); - dims.insert(dims.end(), - this->m_conv_dims.begin(), - this->m_conv_dims.end()); - return dims; - } + std::vector get_kernel_dims() const override; + void fp_compute() override; + void bp_compute() override; - void fp_compute() override { - if(this->using_gpus()) { - base_convolution_layer::apply_transposed_convolution_cudnn(true); - base_convolution_layer::apply_bias_cudnn(); - } else { - base_convolution_layer::apply_transposed_convolution_im2col(true); - base_convolution_layer::apply_bias_cpu(); - } - } +#ifdef LBANN_HAS_DISTCONV + friend class deconvolution_distconv_adapter; + protected: + void setup_distconv_adapter() override; + bool is_distconv_supported() const override; +#endif // LBANN_HAS_DISTCONV +}; - void bp_compute() override { - if(this->using_gpus()) { - base_convolution_layer::compute_gradients_cudnn(true); - base_convolution_layer::apply_convolution_cudnn(false); - } else { - base_convolution_layer::compute_gradients_im2col(true); - base_convolution_layer::apply_convolution_im2col(false); - } - } +#ifndef LBANN_DECONVOLUTION_LAYER_INSTANTIATE -}; +#define PROTO_DEVICE(T, Device) \ + extern template class deconvolution_layer; + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +#endif // LBANN_DECONVOLUTION_LAYER_INSTANTIATE } // namespace lbann diff --git a/include/lbann/layers/learning/embedding.hpp b/include/lbann/layers/learning/embedding.hpp new file mode 100644 index 00000000000..5c26e1975f3 --- /dev/null +++ b/include/lbann/layers/learning/embedding.hpp @@ -0,0 +1,267 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_LEARNING_EMBEDDING_HPP_INCLUDED +#define LBANN_LAYERS_LEARNING_EMBEDDING_HPP_INCLUDED + +#include "lbann/layers/data_type_layer.hpp" +#include "lbann/models/model.hpp" +#include "lbann/utils/memory.hpp" + +namespace lbann { + +/** @brief Lookup table to vectors of fixed size. + * + * Each input value is interpreted as an index and the corresponding + * embedding vector is output. Thus, given an input vector of length + * @f$ \text{sequence\_length} @f$, the output is a + * @f$ \text{sequence\_length} \times \text{embedding\_dim} @f$ tensor. + * If an index is out-of-range, then corresponding output is a vector + * of zeros. + * + * The embedding vectors are stored in an + * @f$ \text{embedding\_dim} \times \text{num\_embeddings} @f$ + * weights matrix. Note that this is the transpose of the weights in + * the PyTorch embedding layer. + */ +template +class embedding_layer : public data_type_layer { + static_assert(Layout == data_layout::DATA_PARALLEL, + "embedding layer only supports data parallel layout"); +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The concrete weights type used by this object. */ + using WeightsType = data_type_weights; + + /** @brief The concrete optimizer type used by this object. */ + using OptimizerType = data_type_optimizer; + + ///@} + +public: + + /** + * @param comm LBANN communicator. + * @param num_embeddings Size of dictionary of embeddings. + * @param embedding_dim Size of embedding vectors. + * @param padding_idx If set, then the corresponding embedding + * vector is initialized with zeros. The + * objective function gradient w.r.t. this + * embedding vector is always zero. + */ + embedding_layer(lbann_comm* comm, + size_t num_embeddings, + size_t embedding_dim, + El::Int padding_idx=-1); + + embedding_layer(const embedding_layer& other); + embedding_layer& operator=(const embedding_layer& other); + ~embedding_layer() = default; + + embedding_layer* copy() const override; + std::string get_type() const override; + data_layout get_data_layout() const override; + El::Device get_device_allocation() const override; + + description get_description() const override; + +protected: + + void setup_matrices(const El::Grid& grid) override; + void setup_dims(DataReaderMetaData& dr_metadata) override; + void setup_data(size_t max_mini_batch_size) override; + + void fp_compute() override; + void bp_compute() override; + +private: + + /** Size of dictionary of embeddings. */ + size_t m_num_embeddings; + /** Size of embedding vectors. */ + size_t m_embedding_dim; + /** If the padding index is set, then the corresponding embedding + * vector is initialized with zeros. The objective function + * gradient w.r.t. this embedding vector is always zero. + */ + El::Int m_padding_idx; + + /** Gradient w.r.t. embedding weights. 
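// --- Illustrative aside (not part of the patch above) ---
// The embedding layer is a table lookup: each input value is an index into an
// embedding_dim x num_embeddings weights matrix (one embedding vector per
// column), and out-of-range indices produce zero vectors. A naive reference
// sketch assuming a column-major table; not the layer's actual implementation:
#include <cstddef>
#include <vector>
std::vector<float> embed(const std::vector<long>& indices,
                         const std::vector<float>& table,  // embedding_dim x num_embeddings
                         std::size_t embedding_dim,
                         std::size_t num_embeddings) {
  std::vector<float> out(indices.size() * embedding_dim, 0.0f);
  for (std::size_t i = 0; i < indices.size(); ++i) {
    const long idx = indices[i];
    if (idx < 0 || idx >= static_cast<long>(num_embeddings)) {
      continue;                                   // out-of-range index -> zero vector
    }
    for (std::size_t d = 0; d < embedding_dim; ++d) {
      out[i * embedding_dim + d] =
          table[static_cast<std::size_t>(idx) * embedding_dim + d];  // column idx
    }
  }
  return out;
}
// --- end aside ---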
*/ + std::unique_ptr m_embeddings_grad; + +}; + +// ========================================================= +// Implementation +// ========================================================= + +template +embedding_layer::embedding_layer( + lbann_comm* comm, + size_t num_embeddings, + size_t embedding_dim, + El::Int padding_idx) + : data_type_layer(comm), + m_num_embeddings{num_embeddings}, + m_embedding_dim{embedding_dim}, + m_padding_idx{padding_idx} {} + +template +embedding_layer::embedding_layer( + const embedding_layer& other) + : data_type_layer(other), + m_num_embeddings{other.m_num_embeddings}, + m_embedding_dim{other.m_embedding_dim}, + m_padding_idx{other.m_padding_idx}, + m_embeddings_grad(other.m_embeddings_grad + ? other.m_embeddings_grad->Copy() + : nullptr) {} + +template +embedding_layer& embedding_layer::operator=( + const embedding_layer& other) { + data_type_layer::operator=(other); + m_num_embeddings = other.m_num_embeddings; + m_embedding_dim = other.m_embedding_dim; + m_padding_idx = other.m_padding_idx; + m_embeddings_grad.reset(other.m_embeddings_grad + ? other.m_embeddings_grad->Copy() + : nullptr); + return *this; +} + +template +embedding_layer* embedding_layer::copy() const { + return new embedding_layer(*this); +} + +template +std::string embedding_layer::get_type() const { + return "embedding"; +} + +template +data_layout embedding_layer::get_data_layout() const { + return Layout; +} + +template +El::Device embedding_layer::get_device_allocation() const { + return Device; +} + +template +description embedding_layer::get_description() const { + auto desc = data_type_layer::get_description(); + desc.add("Num embeddings", m_num_embeddings); + desc.add("Embedding dim", m_embedding_dim); + desc.add("Padding index", m_padding_idx); + return desc; +} + +template +void embedding_layer::setup_dims(DataReaderMetaData& dr_metadata) { + data_type_layer::setup_dims(dr_metadata); + auto dims = this->get_input_dims(); + dims.push_back(static_cast(m_embedding_dim)); + this->set_output_dims(dims); +} + +template +void embedding_layer::setup_data(size_t max_mini_batch_size) { + data_type_layer::setup_data(max_mini_batch_size); + + // Construct default weights if needed + // Note: Randomly drawn from normal distribution with mean 0 and + // standard deviation 1. + if (!this->has_weights()) { + auto w = make_unique(this->get_comm()); + auto init = make_unique>(El::TypeTraits::Zero(), + El::TypeTraits::One()); + auto opt = this->m_model->template create_optimizer(); + w->set_name(this->get_name() + "_weights"); + w->set_initializer(std::move(init)); + w->set_optimizer(std::move(opt)); + this->add_weights(w.get()); + this->m_model->add_weights(std::move(w)); + } + if (this->num_weights() != 1) { + LBANN_ERROR("attempted to setup ", + this->get_type()," layer \"",this->get_name(),"\" ", + "with an invalid number of weights ", + "(expected 1, found ",this->num_weights(),")"); + } + + // Initialize dictionary + auto& embeddings = this->get_weights(0); + auto matrix_dist = this->get_prev_activations().DistData(); + matrix_dist.colDist = El::STAR; + matrix_dist.rowDist = El::STAR; + embeddings.set_dims({static_cast(m_embedding_dim)}, + {static_cast(m_num_embeddings)}); + embeddings.set_matrix_distribution(matrix_dist); + embeddings.setup(); + + // Zero out embedding vector for padding index + if (0 <= m_padding_idx + && m_padding_idx < static_cast(m_embedding_dim)) { + // FIXME (trb 06/01/2020): Assuming embedding values have data + // type that matches this layer. 
In future, we should abstract + // this or dynamically dispatch it. + auto& embedding_values = + dynamic_cast(embeddings.get_values()); + std::unique_ptr pad_embedding( + embedding_values.Construct(embedding_values.Grid(), + embedding_values.Root())); + El::View(*pad_embedding, embedding_values, El::ALL, El::IR(m_padding_idx)); + El::Zero(*pad_embedding); + } + + // Initialize gradient w.r.t. embeddings + m_embeddings_grad->Resize(m_embedding_dim, m_num_embeddings); + +} + +LBANN_DEFINE_LAYER_BUILDER(embedding); + +#ifndef LBANN_EMBEDDING_LAYER_INSTANTIATE + +#define PROTO_DEVICE(T, Device) \ + extern template class embedding_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +#endif // LBANN_EMBEDDING_LAYER_INSTANTIATE + +} // namespace lbann + +#endif // LBANN_LAYERS_LEARNING_EMBEDDING_HPP_INCLUDED diff --git a/include/lbann/layers/learning/entrywise_scale_bias.hpp b/include/lbann/layers/learning/entrywise_scale_bias.hpp new file mode 100644 index 00000000000..cd4535e059b --- /dev/null +++ b/include/lbann/layers/learning/entrywise_scale_bias.hpp @@ -0,0 +1,247 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYER_LEARNING_ENTRYWISE_SCALE_BIAS_HPP_INCLUDED +#define LBANN_LAYER_LEARNING_ENTRYWISE_SCALE_BIAS_HPP_INCLUDED + +#include "lbann/layers/data_type_layer.hpp" +#include "lbann/models/model.hpp" +#include "lbann/utils/exception.hpp" + +namespace lbann { + +/** @brief Apply scale and bias to tensor entries. + * + * Scale and bias terms are applied independently to each tensor + * entry. More precisely, given input, output, scale, and bias + * tensors @f$ X,Y,A,B\in\mathbb{R}^{d_1\times\cdots\times d_n} @f$: + * @f[ + * Y = A \circ X + B + * @f] + * + * The scale and bias terms are fused into a single weights tensor to + * reduce the number of gradient allreduces during backprop. In + * particular, the weights tensor is a + * @f$ \text{size} \times 2 @f$ matrix, where the first + * column correspond to scale terms and the second column to bias + * terms. + */ +template +class entrywise_scale_bias_layer : public data_type_layer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The concrete weights type used by this object. 
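// --- Illustrative aside (not part of the patch above) ---
// Entry-wise scale/bias applies y[i] = a[i] * x[i] + b[i], with a and b fused
// into a size x 2 weights matrix (first column scale, second column bias) to
// cut the number of gradient allreduces. A minimal sketch, again assuming a
// column-major layout for the fused weights:
#include <cstddef>
void entrywise_scale_bias_ref(const float* x, const float* scale_bias,
                              float* y, std::size_t size) {
  for (std::size_t i = 0; i < size; ++i) {
    y[i] = scale_bias[i] * x[i] + scale_bias[size + i];
  }
}
// --- end aside ---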
*/ + using WeightsType = data_type_weights; + + /** @brief The concrete optimizer type used by this object. */ + using OptimizerType = data_type_optimizer; + + ///@} + +public: + + entrywise_scale_bias_layer(lbann_comm *comm); + entrywise_scale_bias_layer(const entrywise_scale_bias_layer& other); + entrywise_scale_bias_layer& operator=( + const entrywise_scale_bias_layer& other); + + entrywise_scale_bias_layer* copy() const override { + return new entrywise_scale_bias_layer(*this); + } + + std::string get_type() const override { return "entry-wise scale/bias"; } + data_layout get_data_layout() const override { return Layout; } + El::Device get_device_allocation() const override { return Device; } + + void setup_matrices(const El::Grid& grid) override; + void setup_data(size_t max_mini_batch_size) override; + + void fp_setup_outputs(El::Int mini_batch_size) override; + void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) override; + +protected: + + void fp_compute() override; + void bp_compute() override; + +private: + + /** @brief Objective function gradient w.r.t. weights. */ + std::unique_ptr m_weights_gradient; + +}; + +// Implementation +template +entrywise_scale_bias_layer +::entrywise_scale_bias_layer(lbann_comm *comm) + : data_type_layer(comm) +{} + +template +entrywise_scale_bias_layer +::entrywise_scale_bias_layer(const entrywise_scale_bias_layer& other) + : data_type_layer(other), + m_weights_gradient(other.m_weights_gradient ? + other.m_weights_gradient->Copy() : nullptr) +{} + +template +auto entrywise_scale_bias_layer +::operator=(const entrywise_scale_bias_layer& other) + -> entrywise_scale_bias_layer& { + data_type_layer::operator=(other); + m_weights_gradient.reset(other.m_weights_gradient ? + other.m_weights_gradient->Copy() : + nullptr); + return *this; +} + +template +void +entrywise_scale_bias_layer +::setup_matrices(const El::Grid& grid) { + data_type_layer::setup_matrices(grid); + auto dist = this->get_prev_activations().DistData(); + dist.rowDist = El::STAR; + m_weights_gradient.reset(AbsDistMatrixType::Instantiate(dist)); +} + +template +void +entrywise_scale_bias_layer +::setup_data(size_t max_mini_batch_size) { + data_type_layer::setup_data(max_mini_batch_size); + + // Initialize output dimensions + this->set_output_dims(this->get_input_dims()); + const auto output_dims = this->get_output_dims(); + const El::Int output_size = this->get_output_size(); + + // Construct default weights if needed + // Note: Scale is initialized to 1 and bias to 0 + if (!this->has_weights()) { + auto w = make_unique(this->get_comm()); + std::vector vals(2*output_size, + El::TypeTraits::Zero()); + std::fill(vals.begin(), vals.begin()+output_size, + El::TypeTraits::One()); + auto init = make_unique>(vals); + auto opt = this->m_model->template create_optimizer(); + w->set_name(this->get_name() + "_weights"); + w->set_initializer(std::move(init)); + w->set_optimizer(std::move(opt)); + this->add_weights(w.get()); + this->m_model->add_weights(std::move(w)); + } + if (this->num_weights() != 1) { + LBANN_ERROR("attempted to setup ", + this->get_type()," layer \"",this->get_name(),"\" ", + "with an invalid number of weights ", + "(expected 1, found ",this->num_weights(),")"); + } + + // Setup weights + auto dist = this->get_prev_activations().DistData(); + dist.rowDist = El::STAR; + this->get_weights(0).set_dims(output_dims, + {static_cast(2)}); + this->get_weights(0).set_matrix_distribution(dist); + + // Setup gradient w.r.t. 
weights + m_weights_gradient->AlignWith(dist); + m_weights_gradient->Resize(output_size, 2); +} + +template +void +entrywise_scale_bias_layer +::fp_setup_outputs(El::Int mini_batch_size) { + data_type_layer::fp_setup_outputs(mini_batch_size); + +#if 0 /// @todo See https://github.com/LLNL/lbann/issues/1123 + + // Check that input and weights tensors are aligned + /// @todo Realign weights tensor if misaligned + bool aligned = true; + try { + const auto& x = this->get_prev_activations(); + const auto& w = m_weights[0]->get_values(); + aligned = (x.ColAlign() == w.ColAlign() + && x.RowAlign() == w.RowAlign()); + } + catch (const exception& e) { + // An exception is thrown if you try accessing weights values + // before they are initialized. We don't care if this case is + // aligned, so it's safe to ignore. + } + if (!aligned) { + std::ostringstream err; + err << this->get_type() << " layer \"" << this->get_name() << "\" " + << "has misaligned input and weights matrices"; + LBANN_ERROR(err.str()); + } + +#endif // 0 + +} + +template +void +entrywise_scale_bias_layer +::bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) { + data_type_layer::bp_setup_gradient_wrt_inputs(mini_batch_size); + m_weights_gradient->Empty(false); + m_weights_gradient->AlignWith(this->get_prev_activations()); + m_weights_gradient->Resize(this->get_input_size(), 2); +} + + +LBANN_DEFINE_LAYER_BUILDER(entrywise_scale_bias); + +#ifndef LBANN_ENTRYWISE_SCALE_BIAS_LAYER_INSTANTIATE + +#define PROTO_DEVICE(T, Device) \ + extern template class entrywise_scale_bias_layer< \ + T, data_layout::DATA_PARALLEL, Device>; \ + extern template class entrywise_scale_bias_layer< \ + T, data_layout::MODEL_PARALLEL, Device> + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +#endif // LBANN_ENTRYWISE_SCALE_BIAS_LAYER_INSTANTIATE + +} // namespace lbann + +#endif // LBANN_LAYER_LEARNING_ENTRYWISE_SCALE_BIAS_HPP_INCLUDED diff --git a/include/lbann/layers/learning/fully_connected.hpp b/include/lbann/layers/learning/fully_connected.hpp index f62c2318594..68b6ce23135 100644 --- a/include/lbann/layers/learning/fully_connected.hpp +++ b/include/lbann/layers/learning/fully_connected.hpp @@ -29,67 +29,55 @@ #include "lbann/layers/learning/learning.hpp" #include "lbann/models/model.hpp" -#include "lbann/weights/initializer.hpp" -#include "lbann/weights/variance_scaling_initializers.hpp" + #include -#include namespace lbann { -/** @brief Perform an affine transformation. */ -template -class fully_connected_layer : public learning_layer { +/** @brief Affine transformation + * + * Flattens the input tensor, multiplies with a weights matrix, and + * optionally applies an entry-wise bias. Following the + * column-vector convention: + * @f[ y = W * \text{vec}(x) + b @f] + * + * Two weights are required if bias is applied: the linearity and the + * bias. Only the linearity weights are required if bias is not + * applied. If weights aren't provided, the linearity weights are + * initialized with He normal initialization and the bias weights are + * initialized to zero. 
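// --- Illustrative aside (not part of the patch above) ---
// The fully-connected layer flattens its input and computes y = W * vec(x) + b,
// or uses W^T when the transpose option is set. A naive reference GEMV sketch
// with hypothetical raw-pointer arguments (W stored row-major as rows x cols);
// the real layer dispatches to Elemental/cuBLAS GEMMs instead:
#include <cstddef>
void affine_forward(const float* W, std::size_t rows, std::size_t cols,
                    bool transpose, const float* x, const float* b, float* y) {
  const std::size_t out_size = transpose ? cols : rows;
  const std::size_t in_size  = transpose ? rows : cols;
  for (std::size_t o = 0; o < out_size; ++o) {
    float acc = b ? b[o] : 0.0f;                 // optional bias
    for (std::size_t i = 0; i < in_size; ++i) {
      const float w = transpose ? W[i * cols + o] : W[o * cols + i];
      acc += w * x[i];
    }
    y[o] = acc;
  }
}
// --- end aside ---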
+ */ +template +class fully_connected_layer : public learning_layer { public: + /** @name Public Types */ + ///@{ - /** @todo Accept a vector for output_size */ - fully_connected_layer(lbann_comm *comm, - int output_size, - bool transpose = false, - weights* weight = nullptr, - bool has_bias = true) - : learning_layer(comm), - m_bias_gradient(nullptr), - m_transpose(transpose) { - - // Initialize output tensor dimensions - set_output_dims({output_size}); + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; - // Initialize bias - m_bias_scaling_factor = has_bias ? DataType(1) : DataType(0); + /** @brief The concrete weights type used by this object. */ + using WeightsType = data_type_weights; - } + /** @brief The concrete optimizer type used by this object. */ + using OptimizerType = data_type_optimizer; - fully_connected_layer(const fully_connected_layer& other) : - learning_layer(other), - m_bias_scaling_factor(other.m_bias_scaling_factor), - m_transpose(other.m_transpose) { + ///@} - // Deep matrix copies - m_bias_gradient = other.m_bias_gradient; - if (m_bias_gradient != nullptr) { - m_bias_gradient = m_bias_gradient->Copy(); - } - - } +public: - fully_connected_layer& operator=(const fully_connected_layer& other) { - learning_layer::operator=(other); - m_bias_scaling_factor = other.m_bias_scaling_factor; - m_transpose = other.m_transpose; + /** @todo Accept a vector for output_size */ + fully_connected_layer(lbann_comm *comm, + int output_size, + bool transpose = false, + WeightsType* weight = nullptr, + bool has_bias = true); - // Deep matrix copies - deallocate_matrices(); - m_bias_gradient = other.m_bias_gradient; - if (m_bias_gradient != nullptr) { - m_bias_gradient = m_bias_gradient->Copy(); - } + fully_connected_layer(const fully_connected_layer& other); - return *this; - } + fully_connected_layer& operator=(const fully_connected_layer& other); - ~fully_connected_layer() override { - deallocate_matrices(); - } + ~fully_connected_layer() override; fully_connected_layer* copy() const override { return new fully_connected_layer(*this); @@ -99,110 +87,12 @@ class fully_connected_layer : public learning_layer { data_layout get_data_layout() const override { return T_layout; } El::Device get_device_allocation() const override { return Dev; } - description get_description() const override { - auto&& desc = learning_layer::get_description(); - const auto& bias_str = (m_bias_scaling_factor == DataType(0) ? 
- "disabled" : "enabled"); - desc.add("Bias", bias_str); - return desc; - } + description get_description() const override; protected: void setup_matrices(const El::Grid& grid) override; - - void setup_data() override { - learning_layer::setup_data(); - - // Initialize default weights if none are provided - if (this->m_weights.size() > 2) { - std::stringstream err; - err << __FILE__ << " " << __LINE__ << " :: " - << "attempted to setup " << m_name << " with an invalid number of weights"; - throw lbann_exception(err.str()); - } - if (m_bias_scaling_factor != DataType(0)) { - this->m_weights.resize(2, nullptr); - } else { - this->m_weights.resize(1, nullptr); - } - if (this->m_weights[0] == nullptr) { - auto* w = new weights(get_comm()); - std::unique_ptr init(new he_initializer(probability_distribution::gaussian)); - std::unique_ptr opt(m_model->create_optimizer()); - w->set_name(get_name() + "_linearity_weights"); - w->set_initializer(init); - w->set_optimizer(opt); - this->m_weights[0] = w; - this->m_model->add_weights(w); - } - auto& linearity_weights = *this->m_weights[0]; - - // Initialize variance scaling initialization - auto* cast_initializer - = dynamic_cast(linearity_weights.get_initializer()); - if (cast_initializer != nullptr) { - cast_initializer->set_fan_in(get_input_size()); - cast_initializer->set_fan_out(get_output_size()); - } - - // Setup linearity weights - auto linearity_dist = get_prev_activations().DistData(); - if (linearity_dist.colDist != El::MC - || linearity_dist.rowDist != El::MR) { - linearity_dist.colDist = El::STAR; - linearity_dist.rowDist = El::STAR; - } - if (m_transpose) { - linearity_weights.set_dims(get_input_dims(), get_output_dims()); - } else { - linearity_weights.set_dims(get_output_dims(), get_input_dims()); - } - linearity_weights.set_matrix_distribution(linearity_dist); - - // Set up bias if needed. - if (m_bias_scaling_factor != DataType(0)) { - if (this->m_weights[1] == nullptr) { - auto* w = new weights(get_comm()); - std::unique_ptr opt(m_model->create_optimizer()); - w->set_name(get_name() + "_bias_weights"); - w->set_optimizer(opt); - this->m_weights[1] = w; - this->m_model->add_weights(w); - } - auto& bias_weights = *this->m_weights[1]; - // Setup bias weights - auto bias_dist = get_activations().DistData(); - bias_dist.rowDist = El::STAR; - bias_weights.set_dims(get_output_dims()); - bias_weights.set_matrix_distribution(bias_dist); - if (this->m_bias_gradient != nullptr) { - El::Zeros(*this->m_bias_gradient, - bias_weights.get_matrix_height(), - bias_weights.get_matrix_width()); - } - } - - // Initialize freeze state - for (auto&& w : this->m_weights) { - if (m_frozen) { - w->freeze(); - } else { - w->unfreeze(); - } - } - for (auto&& w : this->m_weights) { - if (w->is_frozen() != m_frozen) { - std::stringstream err; - err << (m_frozen ? "" : "un") << "frozen " - << "layer \"" << get_name() << "\" has " - << (w->is_frozen() ? "" : "un") << "frozen " - << "weights \"" << w->get_name() << "\""; - LBANN_ERROR(err.str()); - } - } - - } + void setup_data(size_t max_mini_batch_size) override; void fp_compute() override; void bp_compute() override; @@ -212,13 +102,13 @@ class fully_connected_layer : public learning_layer { /** Scaling factor for bias term. * If the scaling factor is zero, bias is not applied. */ - DataType m_bias_scaling_factor; + TensorDataType m_bias_scaling_factor; /** Bias weights gradient. * This is this layer's contribution to the objective function * gradient w.r.t. the bias weights. 
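For context on m_bias_gradient (standard calculus for an affine layer; the patch itself does not spell this out): with y = W vec(x) + b, each mini-batch sample contributes its output error signal directly to the bias gradient,

    \frac{\partial \mathcal{L}}{\partial b} = \sum_{s=1}^{m} \frac{\partial \mathcal{L}}{\partial y^{(s)}}

where m is the mini-batch size. In other words, the bias gradient is a row-wise sum of the previous error signals (up to the usual mini-batch scaling), and this buffer is where that sum is staged before being handed to the bias weights' optimizer.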
*/ - AbsDistMat* m_bias_gradient; + AbsDistMatrixType* m_bias_gradient; /** Whether the transpose of the linearity matrix is applied. */ bool m_transpose; @@ -228,8 +118,26 @@ class fully_connected_layer : public learning_layer { if (m_bias_gradient != nullptr) delete m_bias_gradient; } + template + friend void fp_compute_impl(fully_connected_layer& l); + template + friend void bp_compute_impl(fully_connected_layer& l); }; +// Builder function +LBANN_DEFINE_LAYER_BUILDER(fully_connected); + +#ifndef LBANN_FULLY_CONNECTED_LAYER_INSTANTIATE + +#define PROTO_DEVICE(T, Device) \ + extern template class fully_connected_layer; \ + extern template class fully_connected_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +#endif // LBANN_FULLY_CONNECTED_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_LEARNING_FULLY_CONNECTED_HPP_INCLUDED diff --git a/include/lbann/layers/learning/learning.hpp b/include/lbann/layers/learning/learning.hpp index 2f2ab120bc5..f3b0d5e451c 100644 --- a/include/lbann/layers/learning/learning.hpp +++ b/include/lbann/layers/learning/learning.hpp @@ -27,16 +27,18 @@ #ifndef LBANN_LAYER_LEARNING_HPP_INCLUDED #define LBANN_LAYER_LEARNING_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { /** @todo Remove. Layers should inherit directly from the base layer * class. */ -class learning_layer : public Layer { + +template +class learning_layer : public data_type_layer { public: - learning_layer(lbann_comm *comm) : Layer(comm) {} + learning_layer(lbann_comm *comm) : data_type_layer(comm) {} }; } // namespace lbann diff --git a/include/lbann/layers/loss/categorical_accuracy.hpp b/include/lbann/layers/loss/categorical_accuracy.hpp index 078abb6b2a4..aa5a5e0d006 100644 --- a/include/lbann/layers/loss/categorical_accuracy.hpp +++ b/include/lbann/layers/loss/categorical_accuracy.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_LOSS_CATEGORICAL_ACCURACY_HPP_INCLUDED #define LBANN_LAYERS_LOSS_CATEGORICAL_ACCURACY_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -42,11 +42,11 @@ namespace lbann { * This is primarily intended for use as a metric since it is not * differentiable. 
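As a quick illustration of what the categorical accuracy layer reports per sample (a stand-alone sketch with plain STL types, not LBANN's API): the prediction's largest entry is compared against the position of the one-hot label, yielding a 0/1 value that a metric can then average over the mini-batch.

    // Illustrative only: top-1 categorical accuracy for a single sample.
    #include <algorithm>
    #include <iostream>
    #include <iterator>
    #include <vector>

    int main() {
      std::vector<double> prediction   = {0.1, 0.7, 0.2};  // e.g. softmax output
      std::vector<double> ground_truth = {0.0, 1.0, 0.0};  // one-hot label
      const auto pred_idx =
        std::distance(prediction.begin(),
                      std::max_element(prediction.begin(), prediction.end()));
      const auto label_idx =
        std::distance(ground_truth.begin(),
                      std::max_element(ground_truth.begin(), ground_truth.end()));
      const double accuracy = (pred_idx == label_idx) ? 1.0 : 0.0;
      std::cout << accuracy << '\n';  // prints "1"
      return 0;
    }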
*/ -template -class categorical_accuracy_layer : public Layer { +template +class categorical_accuracy_layer : public data_type_layer { public: - categorical_accuracy_layer(lbann_comm *comm) : Layer(comm) { + categorical_accuracy_layer(lbann_comm *comm) : data_type_layer(comm) { this->m_expected_num_parent_layers = 2; } @@ -57,18 +57,18 @@ class categorical_accuracy_layer : public Layer { data_layout get_data_layout() const override { return T_layout; } El::Device get_device_allocation() const override { return Dev; } - void setup_dims() override { - Layer::setup_dims(); - set_output_dims({1}); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims({1}); // Check that input dimensions match - if (get_input_dims(0) != get_input_dims(1)) { - const auto& parents = get_parent_layers(); + if (this->get_input_dims(0) != this->get_input_dims(1)) { + const auto& parents = this->get_parent_layers(); std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "has input tensors with different dimensions ("; - for (int i = 0; i < get_num_parents(); ++i) { - const auto& dims = get_input_dims(i); + for (int i = 0; i < this->get_num_parents(); ++i) { + const auto& dims = this->get_input_dims(i); err << (i > 0 ? ", " : "") << "layer \"" << parents[i]->get_name() << "\" outputs "; for (size_t j = 0; j < dims.size(); ++j) { @@ -85,6 +85,19 @@ class categorical_accuracy_layer : public Layer { }; +#ifndef LBANN_CATEGORICAL_ACCURACY_LAYER_INSTANTIATE + +#define PROTO_DEVICE(T, Device) \ + extern template class categorical_accuracy_layer< \ + T, data_layout::DATA_PARALLEL, Device>; \ + extern template class categorical_accuracy_layer< \ + T, data_layout::MODEL_PARALLEL, Device> + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +#endif // LBANN_CATEGORICAL_ACCURACY_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_LOSS_CATEGORICAL_ACCURACY_HPP_INCLUDED diff --git a/include/lbann/layers/loss/cross_entropy.hpp b/include/lbann/layers/loss/cross_entropy.hpp index e2ee89e4350..238bda958a4 100644 --- a/include/lbann/layers/loss/cross_entropy.hpp +++ b/include/lbann/layers/loss/cross_entropy.hpp @@ -27,33 +27,59 @@ #ifndef LBANN_LAYERS_LOSS_CROSS_ENTROPY_HPP_INCLUDED #define LBANN_LAYERS_LOSS_CROSS_ENTROPY_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" +#include "lbann/utils/distconv.hpp" namespace lbann { +#ifdef LBANN_HAS_DISTCONV +template +class cross_entropy_distconv_adapter: public data_type_distconv_adapter { + public: + using TensorDevType = typename data_type_distconv_adapter::TensorDevType; + cross_entropy_distconv_adapter(Layer& layer): data_type_distconv_adapter(layer) {} + virtual ~cross_entropy_distconv_adapter() = default; + void setup_distributions(tensor_overlap_constraints &constraints) override; + dc::Shape get_prev_activations_shape(int index) const override; + dc::Shape get_activations_shape(int index) const override; + dc::Shape get_activations_local_shape(int index) const override; + void setup_layer(size_t workspace_capacity) override; + std::unique_ptr m_cross_entropy; +}; +#endif // LBANN_HAS_DISTCONV + /** @brief Cross entropy loss function. 
* * Given a predicted distribution @f$y@f$ and ground truth * distribution @f$\hat{y}@f$, * @f[ CE(y,\hat{y}) = - \sum\limits_{i} \hat{y}_i \log y_i @f] */ -template -class cross_entropy_layer : public Layer { +template +class cross_entropy_layer : public data_type_layer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} + public: - cross_entropy_layer(lbann_comm *comm) : Layer(comm) { + cross_entropy_layer(lbann_comm *comm) : data_type_layer(comm) { this->m_expected_num_parent_layers = 2; } cross_entropy_layer(const cross_entropy_layer& other) - : Layer(other) { + : data_type_layer(other) { m_workspace.reset(other.m_workspace ? other.m_workspace->Copy() : nullptr); } cross_entropy_layer& operator=(const cross_entropy_layer& other) { - Layer::operator=(other); + data_type_layer::operator=(other); m_workspace.reset(other.m_workspace ? other.m_workspace->Copy() : nullptr); @@ -65,18 +91,31 @@ class cross_entropy_layer : public Layer { data_layout get_data_layout() const override { return T_layout; } El::Device get_device_allocation() const override { return Dev; } - void setup_dims() override { - Layer::setup_dims(); - set_output_dims({1}); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims({1}); + +#ifdef LBANN_HAS_DISTCONV + // In the current implementation of cross entropy in Distconv, we + // do not use the reshape layer and just assumes both inputs have + // the matching shape. Therefore, the following check on the input + // dimensions would fail. We could address this by either 1) + // implementing the reshape layer, or 2) giving a proper shape to + // the ground-truth data. + // + if (this->distconv_enabled()) { + return; + } +#endif // Check that input dimensions match - if (get_input_dims(0) != get_input_dims(1)) { - const auto& parents = get_parent_layers(); + if (this->get_input_dims(0) != this->get_input_dims(1)) { + const auto& parents = this->get_parent_layers(); std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "has input tensors with different dimensions ("; - for (int i = 0; i < get_num_parents(); ++i) { - const auto& dims = get_input_dims(i); + for (int i = 0; i < this->get_num_parents(); ++i) { + const auto& dims = this->get_input_dims(i); err << (i > 0 ? 
", " : "") << "layer \"" << parents[i]->get_name() << "\" outputs "; for (size_t j = 0; j < dims.size(); ++j) { @@ -89,19 +128,21 @@ class cross_entropy_layer : public Layer { } - void setup_data() override { - Layer::setup_data(); + void setup_data(size_t max_mini_batch_size) override { + data_type_layer::setup_data(max_mini_batch_size); // Initialize workspace - const auto& prediction = get_prev_activations(0); - switch (get_data_layout()) { + const auto& prediction = this->get_prev_activations(0); + switch (this->get_data_layout()) { case data_layout::DATA_PARALLEL: - m_workspace.reset(new StarVCMat(prediction.Grid(), - prediction.Root())); + m_workspace.reset(new StarVCMatDT( + prediction.Grid(), + prediction.Root())); break; case data_layout::MODEL_PARALLEL: - m_workspace.reset(new StarMRMat(prediction.Grid(), - prediction.Root())); + m_workspace.reset(new StarMRMatDT( + prediction.Grid(), + prediction.Root())); break; default: LBANN_ERROR("invalid data layout"); } @@ -115,55 +156,193 @@ class cross_entropy_layer : public Layer { void fp_compute() override { +#ifdef LBANN_HAS_DISTCONV + if (this->distconv_enabled()) { + fp_compute_distconv(); + return; + } +#endif + // Initialize workspace - const auto& prediction = get_prev_activations(0); + const auto& prediction = this->get_prev_activations(0); m_workspace->AlignWith(prediction.DistData()); m_workspace->Resize(1, prediction.Width()); // Compute local contributions and accumulate /// @todo Consider reduce rather than allreduce - local_fp_compute(get_local_prev_activations(0), - get_local_prev_activations(1), - m_workspace->Matrix()); - m_comm->allreduce(*m_workspace, m_workspace->RedundantComm()); - El::Copy(*m_workspace, get_activations()); + local_fp_compute(); + this->get_comm()->allreduce(*m_workspace, m_workspace->RedundantComm()); + El::Copy(*m_workspace, this->get_activations()); } void bp_compute() override { +#ifdef LBANN_HAS_DISTCONV + if (this->distconv_enabled()) { + bp_compute_distconv(); + return; + } +#endif // LBANN_HAS_DISTCONV + // Initialize workspace - const auto& prediction = get_prev_activations(0); + const auto& prediction = this->get_prev_activations(0); m_workspace->AlignWith(prediction.DistData()); - El::Copy(get_prev_error_signals(), *m_workspace); + El::Copy(this->get_prev_error_signals(), *m_workspace); // Compute local gradients - local_bp_compute(get_local_prev_activations(0), - get_local_prev_activations(1), - m_workspace->LockedMatrix(), - get_local_error_signals(0), - get_local_error_signals(1)); - + local_bp_compute(); } private: /** Compute local contributions to cross entropy loss. */ - static void local_fp_compute(const AbsMat& local_prediction, - const AbsMat& local_ground_truth, - AbsMat& local_contribution); + void local_fp_compute(); /** Compute local gradients. */ - static void local_bp_compute(const AbsMat& local_prediction, - const AbsMat& local_ground_truth, - const AbsMat& local_gradient_wrt_output, - AbsMat& local_gradient_wrt_prediction, - AbsMat& local_gradient_wrt_ground_truth); + void local_bp_compute(); /** Workspace matrix. 
*/ - std::unique_ptr m_workspace; + std::unique_ptr m_workspace; + +#ifdef LBANN_HAS_DISTCONV + friend class cross_entropy_distconv_adapter; + protected: + bool is_distconv_supported() const override { + return Dev == El::Device::GPU && T_layout == data_layout::DATA_PARALLEL; + } + + void setup_distconv_adapter() override { + this->get_distconv_adapter_ptr() = make_unique< + cross_entropy_distconv_adapter>(*this); + } + + cross_entropy_distconv_adapter& get_distconv_adapter() override; + const cross_entropy_distconv_adapter& get_distconv_adapter() const override; + + void fp_compute_distconv() { + assert_always(this->distconv_enabled()); + get_distconv_adapter().m_cross_entropy->forward(this->get_distconv_adapter().get_prev_activations(0), + this->get_distconv_adapter().get_prev_activations(1), + this->get_distconv_adapter().get_activations()); + } + void bp_compute_distconv() { + assert_always(this->distconv_enabled()); + get_distconv_adapter().m_cross_entropy->backward(this->get_distconv_adapter().get_prev_activations(0), + this->get_distconv_adapter().get_prev_activations(1), + this->get_distconv_adapter().get_prev_error_signals(0), + this->get_distconv_adapter().get_error_signals(0), + this->get_distconv_adapter().get_error_signals(1)); + } +#endif // LBANN_HAS_DISTCONV }; +#ifdef LBANN_HAS_DISTCONV +template +const cross_entropy_distconv_adapter& +cross_entropy_layer::get_distconv_adapter() const { + return dynamic_cast&>(data_type_layer::get_distconv_adapter()); +} + +template +cross_entropy_distconv_adapter& +cross_entropy_layer::get_distconv_adapter() { + return const_cast&>( + static_cast&>(*this).get_distconv_adapter()); +} + +template +dc::Shape cross_entropy_distconv_adapter:: +get_prev_activations_shape(int index) const { + // Assumes both of the two input tensors have the equal shape. + return data_type_distconv_adapter::get_prev_activations_shape(0); +} + +template +dc::Shape cross_entropy_distconv_adapter:: +get_activations_shape(int output_index) const { + // NOTE: LBANN matrix is a 2-D matrix, while Distconv keeps the + // original spatial and channel dimensions, so + // get_output_tensor_shape() doesn't work here. 
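A concrete example of the shape bookkeeping described in the note above (the numbers are illustrative, not from the patch): if the prediction tensor arrives with the Distconv shape {W, H, C, N} = {16, 16, 8, 128}, the loop below collapses every dimension except the sample dimension and reports an activations shape of {1, 1, 1, 128}, i.e. one scalar loss contribution per sample; get_activations_local_shape applies the same collapse to the locally held portion of the tensor.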
+ dc::Shape shape = this->get_prev_activations_shape(0); + for (int i = 0; i < shape.num_dims() - 1; ++i) { + shape[i] = 1; + } + return shape; +} + +template +dc::Shape cross_entropy_distconv_adapter:: +get_activations_local_shape(int index) const { + assert_eq(index, 0); + auto input_shape = this->get_prev_activations().get_local_shape(); + for (int i = 0; i < input_shape.length() - 1; ++i) { + input_shape[i] = 1; + } + return input_shape; +} + +template +void cross_entropy_distconv_adapter:: +setup_distributions(tensor_overlap_constraints &constraints) { + data_type_distconv_adapter::setup_distributions( + constraints); + // Output tensors share all dimensions except for the sample dimension + auto activations_split = this->get_activations_dist().get_split_shape(); + auto prev_error_signals_split = this->get_prev_error_signals_dist().get_split_shape(); + for (int i = 0; i < activations_split.length() - 1; ++i) { + activations_split[i] = 1; + prev_error_signals_split[i] = 1; + } + this->get_activations_dist().set_split_shape(activations_split); + this->get_prev_error_signals_dist().set_split_shape(prev_error_signals_split); + + for (auto &d: this->m_prev_activations_dists) { + d.clear_overlap(); + constraints.mark_updated(d); + constraints.mark_invariant(d); + } + for (auto &d: this->m_activations_dists) { + d.clear_overlap(); + constraints.mark_updated(d); + constraints.mark_invariant(d); + } + for (auto &d: this->m_prev_error_signals_dists) { + d.clear_overlap(); + constraints.mark_updated(d); + constraints.mark_invariant(d); + } + for (auto &d: this->m_error_signals_dists) { + d.clear_overlap(); + constraints.mark_updated(d); + constraints.mark_invariant(d); + } +} + +template +void cross_entropy_distconv_adapter::setup_layer( + size_t workspace_capacity) { + m_cross_entropy = make_unique(dc::get_backend()); + m_cross_entropy->setup(this->get_prev_activations(0), + this->get_prev_activations(1), + this->get_activations(0)); +} +#endif // LBANN_HAS_DISTCONV + +#ifndef LBANN_CROSS_ENTROPY_LAYER_INSTANTIATE + +#define PROTO_DEVICE(T, Device) \ + extern template class cross_entropy_layer< \ + T, data_layout::DATA_PARALLEL, Device>; \ + extern template class cross_entropy_layer< \ + T, data_layout::MODEL_PARALLEL, Device> + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +#endif // LBANN_CROSS_ENTROPY_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_LOSS_CROSS_ENTROPY_HPP_INCLUDED diff --git a/include/lbann/layers/loss/entrywise.hpp b/include/lbann/layers/loss/entrywise.hpp index 6e55f58313e..6bb9ab1b15f 100644 --- a/include/lbann/layers/loss/entrywise.hpp +++ b/include/lbann/layers/loss/entrywise.hpp @@ -31,25 +31,46 @@ namespace lbann { +#ifndef LBANN_ENTRYWISE_LAYER_INSTANTIATE +#define BINARY_ETI_DECL_MACRO_DEV(LAYER_NAME, T, DEVICE) \ + extern template class LAYER_NAME; \ + extern template class LAYER_NAME +#else +#define BINARY_ETI_DECL_MACRO_DEV(...) 
+#endif // LBANN_BINARY_LAYER_INSTANTIATE + +#ifdef LBANN_HAS_GPU +#define BINARY_ETI_DECL_MACRO(LAYER_NAME, T) \ + BINARY_ETI_DECL_MACRO_DEV(LAYER_NAME, T, El::Device::CPU); \ + BINARY_ETI_DECL_MACRO_DEV(LAYER_NAME, T, El::Device::GPU) +#else +#define BINARY_ETI_DECL_MACRO(LAYER_NAME, T) \ + BINARY_ETI_DECL_MACRO_DEV(LAYER_NAME, T, El::Device::CPU) +#endif // LBANN_HAS_GPU + // Convenience macro to define an entry-wise binary layer class #define DEFINE_ENTRYWISE_BINARY_LAYER(layer_name, layer_string) \ - struct layer_name##_name_struct { \ - inline operator std::string() { return layer_string; } \ - }; \ - template \ - using layer_name \ - = entrywise_binary_layer; + LBANN_DECLARE_ENTRYWISE_BINARY_LAYER(layer_name, layer_string); \ + BINARY_ETI_DECL_MACRO(layer_name, float); \ + BINARY_ETI_DECL_MACRO(layer_name, double) // Cross entropy loss -DEFINE_ENTRYWISE_BINARY_LAYER(binary_cross_entropy_layer, "binary cross entropy"); -DEFINE_ENTRYWISE_BINARY_LAYER(sigmoid_binary_cross_entropy_layer, "sigmoid binary cross entropy"); +DEFINE_ENTRYWISE_BINARY_LAYER(binary_cross_entropy_layer, + "binary cross entropy"); +DEFINE_ENTRYWISE_BINARY_LAYER(sigmoid_binary_cross_entropy_layer, + "sigmoid binary cross entropy"); // Boolean loss functions DEFINE_ENTRYWISE_BINARY_LAYER(boolean_accuracy_layer, "Boolean accuracy"); -DEFINE_ENTRYWISE_BINARY_LAYER(boolean_false_negative_layer, "Boolean false negative rate"); -DEFINE_ENTRYWISE_BINARY_LAYER(boolean_false_positive_layer, "Boolean false positive rate"); +DEFINE_ENTRYWISE_BINARY_LAYER(boolean_false_negative_layer, + "Boolean false negative rate"); +DEFINE_ENTRYWISE_BINARY_LAYER(boolean_false_positive_layer, + "Boolean false positive rate"); } // namespace lbann #undef DEFINE_ENTRYWISE_BINARY_LAYER +#undef BINARY_ETI_DECL_MACRO +#undef BINARY_ETI_DECL_MACRO_DEV + #endif // LBANN_LAYERS_LOSS_ENTRYWISE_HPP_INCLUDED diff --git a/include/lbann/layers/loss/l1_norm.hpp b/include/lbann/layers/loss/l1_norm.hpp index 8ceb88c09c3..3315b4ec756 100644 --- a/include/lbann/layers/loss/l1_norm.hpp +++ b/include/lbann/layers/loss/l1_norm.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_LOSS_L1_NORM_HPP_INCLUDED #define LBANN_LAYERS_LOSS_L1_NORM_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -35,18 +35,27 @@ namespace lbann { * * @f[ \lVert x\rVert_1 = \sum\limits_{i} | x_i | @f] */ -template -class l1_norm_layer : public Layer { +template +class l1_norm_layer : public data_type_layer { public: + /** @name Public Types */ + ///@{ - l1_norm_layer(lbann_comm *comm) : Layer(comm) {} + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} + +public: + + l1_norm_layer(lbann_comm *comm) : data_type_layer(comm) {} l1_norm_layer(const l1_norm_layer& other) - : Layer(other), + : data_type_layer(other), m_workspace(other.m_workspace ? other.m_workspace->Copy() : nullptr) {} l1_norm_layer& operator=(const l1_norm_layer& other) { - Layer::operator=(other); + data_type_layer::operator=(other); m_workspace.reset(other.m_workspace ? 
other.m_workspace->Copy() : nullptr); return *this; @@ -57,18 +66,18 @@ class l1_norm_layer : public Layer { data_layout get_data_layout() const override { return T_layout; } El::Device get_device_allocation() const override { return Dev; } - void setup_dims() override { - Layer::setup_dims(); - set_output_dims({1}); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims({1}); } - void setup_data() override { - Layer::setup_data(); + void setup_data(size_t max_mini_batch_size) override { + data_type_layer::setup_data(max_mini_batch_size); // Initialize workspace - auto dist = get_prev_activations().DistData(); + auto dist = this->get_prev_activations().DistData(); dist.colDist = El::STAR; - m_workspace.reset(AbsDistMat::Instantiate(dist)); + m_workspace.reset(AbsDistMatrixType::Instantiate(dist)); #ifdef HYDROGEN_HAVE_CUB if (m_workspace->GetLocalDevice() == El::Device::GPU) { m_workspace->Matrix().SetMemoryMode(1); // CUB memory pool @@ -81,15 +90,14 @@ class l1_norm_layer : public Layer { // Initialize workspace m_workspace->Empty(); - m_workspace->AlignWith(get_prev_activations()); - m_workspace->Resize(1, get_prev_activations().Width()); + m_workspace->AlignWith(this->get_prev_activations()); + m_workspace->Resize(1, this->get_prev_activations().Width()); // Compute local contributions and accumulate /// @todo Consider reduce rather than allreduce - local_fp_compute(get_local_prev_activations(), - m_workspace->Matrix()); - m_comm->allreduce(*m_workspace, m_workspace->RedundantComm()); - El::Copy(*m_workspace, get_activations()); + local_fp_compute(); + this->get_comm()->allreduce(*m_workspace, m_workspace->RedundantComm()); + El::Copy(*m_workspace, this->get_activations()); // Clean up m_workspace->Empty(); @@ -100,13 +108,11 @@ class l1_norm_layer : public Layer { // Initialize workspace m_workspace->Empty(); - m_workspace->AlignWith(get_prev_activations()); - El::Copy(get_prev_error_signals(), *m_workspace); + m_workspace->AlignWith(this->get_prev_activations()); + El::Copy(this->get_prev_error_signals(), *m_workspace); // Compute local gradients - local_bp_compute(get_local_prev_activations(), - m_workspace->LockedMatrix(), - get_local_error_signals()); + local_bp_compute(); // Clean up m_workspace->Empty(); @@ -116,18 +122,28 @@ class l1_norm_layer : public Layer { private: /** Compute local contributions to L2 norm. */ - static void local_fp_compute(const AbsMat& local_input, - AbsMat& local_contribution); + void local_fp_compute(); /** Compute local gradients. */ - static void local_bp_compute(const AbsMat& local_input, - const AbsMat& local_gradient_wrt_output, - AbsMat& local_gradient_wrt_input); + void local_bp_compute(); /** Workspace matrix. 
*/ - std::unique_ptr m_workspace; + std::unique_ptr m_workspace; }; +#ifndef LBANN_L1_NORM_LAYER_INSTANTIATE + +#define PROTO_DEVICE(T, Device) \ + extern template class l1_norm_layer< \ + T, data_layout::DATA_PARALLEL, Device>; \ + extern template class l1_norm_layer< \ + T, data_layout::MODEL_PARALLEL, Device> + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +#endif // LBANN_L1_NORM_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_LOSS_L1_NORM_HPP_INCLUDED diff --git a/include/lbann/layers/loss/l2_norm2.hpp b/include/lbann/layers/loss/l2_norm2.hpp index 15ad24adbd0..0c2d897ba10 100644 --- a/include/lbann/layers/loss/l2_norm2.hpp +++ b/include/lbann/layers/loss/l2_norm2.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_LOSS_L2_NORM2_HPP_INCLUDED #define LBANN_LAYERS_LOSS_L2_NORM2_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -35,18 +35,27 @@ namespace lbann { * * @f[ \lVert x\rVert_2^2 = \sum\limits_{i} x_i^2 @f] */ -template -class l2_norm2_layer : public Layer { +template +class l2_norm2_layer : public data_type_layer { public: + /** @name Public Types */ + ///@{ - l2_norm2_layer(lbann_comm *comm) : Layer(comm) {} + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} + +public: + + l2_norm2_layer(lbann_comm *comm) : data_type_layer(comm) {} l2_norm2_layer(const l2_norm2_layer& other) - : Layer(other), + : data_type_layer(other), m_workspace(other.m_workspace ? other.m_workspace->Copy() : nullptr) {} l2_norm2_layer& operator=(const l2_norm2_layer& other) { - Layer::operator=(other); + data_type_layer::operator=(other); m_workspace.reset(other.m_workspace ? other.m_workspace->Copy() : nullptr); return *this; @@ -57,18 +66,18 @@ class l2_norm2_layer : public Layer { data_layout get_data_layout() const override { return T_layout; } El::Device get_device_allocation() const override { return Dev; } - void setup_dims() override { - Layer::setup_dims(); - set_output_dims({1}); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims({1}); } - void setup_data() override { - Layer::setup_data(); + void setup_data(size_t max_mini_batch_size) override { + data_type_layer::setup_data(max_mini_batch_size); // Initialize workspace - auto dist = get_prev_activations().DistData(); + auto dist = this->get_prev_activations().DistData(); dist.colDist = El::STAR; - m_workspace.reset(AbsDistMat::Instantiate(dist)); + m_workspace.reset(AbsDistMatrixType::Instantiate(dist)); #ifdef HYDROGEN_HAVE_CUB if (m_workspace->GetLocalDevice() == El::Device::GPU) { m_workspace->Matrix().SetMemoryMode(1); // CUB memory pool @@ -81,15 +90,14 @@ class l2_norm2_layer : public Layer { // Initialize workspace m_workspace->Empty(); - m_workspace->AlignWith(get_prev_activations()); - m_workspace->Resize(1, get_prev_activations().Width()); + m_workspace->AlignWith(this->get_prev_activations()); + m_workspace->Resize(1, this->get_prev_activations().Width()); // Compute local contributions and accumulate /// @todo Consider reduce rather than allreduce - local_fp_compute(get_local_prev_activations(), - m_workspace->Matrix()); - m_comm->allreduce(*m_workspace, m_workspace->RedundantComm()); - El::Copy(*m_workspace, get_activations()); + local_fp_compute(); + this->get_comm()->allreduce(*m_workspace, m_workspace->RedundantComm()); + El::Copy(*m_workspace, this->get_activations()); // Clean up 
m_workspace->Empty(); @@ -100,13 +108,11 @@ class l2_norm2_layer : public Layer { // Initialize workspace m_workspace->Empty(); - m_workspace->AlignWith(get_prev_activations()); - El::Copy(get_prev_error_signals(), *m_workspace); + m_workspace->AlignWith(this->get_prev_activations()); + El::Copy(this->get_prev_error_signals(), *m_workspace); // Compute local gradients - local_bp_compute(get_local_prev_activations(), - m_workspace->LockedMatrix(), - get_local_error_signals()); + local_bp_compute(); // Clean up m_workspace->Empty(); @@ -116,18 +122,28 @@ class l2_norm2_layer : public Layer { private: /** Compute local contributions to L2 norm. */ - static void local_fp_compute(const AbsMat& local_input, - AbsMat& local_contribution); + void local_fp_compute(); /** Compute local gradients. */ - static void local_bp_compute(const AbsMat& local_input, - const AbsMat& local_gradient_wrt_output, - AbsMat& local_gradient_wrt_input); + void local_bp_compute(); /** Workspace matrix. */ - std::unique_ptr m_workspace; + std::unique_ptr m_workspace; }; +#ifndef LBANN_L2_NORM2_LAYER_INSTANTIATE + +#define PROTO_DEVICE(T, Device) \ + extern template class l2_norm2_layer< \ + T, data_layout::DATA_PARALLEL, Device>; \ + extern template class l2_norm2_layer< \ + T, data_layout::MODEL_PARALLEL, Device> + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +#endif // LBANN_L2_NORM2_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_LOSS_L2_NORM2_HPP_INCLUDED diff --git a/include/lbann/layers/loss/mean_absolute_error.hpp b/include/lbann/layers/loss/mean_absolute_error.hpp index c136f1f6c72..4a9050e8733 100644 --- a/include/lbann/layers/loss/mean_absolute_error.hpp +++ b/include/lbann/layers/loss/mean_absolute_error.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_LOSS_MEAN_ABSOLUTE_ERROR_HPP_INCLUDED #define LBANN_LAYERS_LOSS_MEAN_ABSOLUTE_ERROR_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -39,23 +39,32 @@ namespace lbann { * = \frac{1}{n} \sum\limits_{i=1}^{n} | y_i - \hat{y}_i | * @f] */ -template -class mean_absolute_error_layer : public Layer { +template +class mean_absolute_error_layer : public data_type_layer { public: + /** @name Public Types */ + ///@{ - mean_absolute_error_layer(lbann_comm *comm) : Layer(comm) { + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} + +public: + + mean_absolute_error_layer(lbann_comm *comm) : data_type_layer(comm) { this->m_expected_num_parent_layers = 2; } mean_absolute_error_layer(const mean_absolute_error_layer& other) - : Layer(other) { + : data_type_layer(other) { m_workspace.reset(other.m_workspace ? other.m_workspace->Copy() : nullptr); } mean_absolute_error_layer& operator=(const mean_absolute_error_layer& other) { - Layer::operator=(other); + data_type_layer::operator=(other); m_workspace.reset(other.m_workspace ? 
other.m_workspace->Copy() : nullptr); @@ -67,18 +76,18 @@ class mean_absolute_error_layer : public Layer { data_layout get_data_layout() const override { return T_layout; } El::Device get_device_allocation() const override { return Dev; } - void setup_dims() override { - Layer::setup_dims(); - set_output_dims({1}); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims({1}); // Check that input dimensions match - if (get_input_dims(0) != get_input_dims(1)) { - const auto& parents = get_parent_layers(); + if (this->get_input_dims(0) != this->get_input_dims(1)) { + const auto& parents = this->get_parent_layers(); std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "has input tensors with different dimensions ("; - for (int i = 0; i < get_num_parents(); ++i) { - const auto& dims = get_input_dims(i); + for (int i = 0; i < this->get_num_parents(); ++i) { + const auto& dims = this->get_input_dims(i); err << (i > 0 ? ", " : "") << "layer \"" << parents[i]->get_name() << "\" outputs "; for (size_t j = 0; j < dims.size(); ++j) { @@ -91,12 +100,12 @@ class mean_absolute_error_layer : public Layer { } - void setup_data() override { - Layer::setup_data(); + void setup_data(size_t max_mini_batch_size) override { + data_type_layer::setup_data(max_mini_batch_size); // Initialize workspace - const auto& input_dist = get_prev_activations(0).DistData(); - m_workspace.reset(AbsDistMat::Instantiate(*input_dist.grid, + const auto& input_dist = this->get_prev_activations(0).DistData(); + m_workspace.reset(AbsDistMatrixType::Instantiate(*input_dist.grid, input_dist.root, El::STAR, input_dist.rowDist, @@ -116,17 +125,14 @@ class mean_absolute_error_layer : public Layer { // Initialize workspace m_workspace->Empty(); - m_workspace->AlignWith(get_prev_activations()); - m_workspace->Resize(1, get_prev_activations().Width()); + m_workspace->AlignWith(this->get_prev_activations()); + m_workspace->Resize(1, this->get_prev_activations().Width()); // Compute local contributions and accumulate /// @todo Consider reduce rather than allreduce - local_fp_compute(get_input_size(), - get_local_prev_activations(0), - get_local_prev_activations(1), - m_workspace->Matrix()); - m_comm->allreduce(*m_workspace, m_workspace->RedundantComm()); - El::Copy(*m_workspace, get_activations()); + local_fp_compute(); + this->get_comm()->allreduce(*m_workspace, m_workspace->RedundantComm()); + El::Copy(*m_workspace, this->get_activations()); // Clean up m_workspace->Empty(); @@ -137,16 +143,11 @@ class mean_absolute_error_layer : public Layer { // Initialize workspace m_workspace->Empty(); - m_workspace->AlignWith(get_prev_activations()); - El::Copy(get_prev_error_signals(), *m_workspace); + m_workspace->AlignWith(this->get_prev_activations()); + El::Copy(this->get_prev_error_signals(), *m_workspace); // Compute local gradients - local_bp_compute(get_input_size(), - get_local_prev_activations(0), - get_local_prev_activations(1), - m_workspace->LockedMatrix(), - get_local_error_signals(0), - get_local_error_signals(1)); + local_bp_compute(); // Clean up m_workspace->Empty(); @@ -156,23 +157,28 @@ class mean_absolute_error_layer : public Layer { private: /** Compute local contributions to mean absolute error loss. 
*/ - static void local_fp_compute(El::Int height, - const AbsMat& local_prediction, - const AbsMat& local_ground_truth, - AbsMat& local_contribution); + void local_fp_compute(); /** Compute local gradients. */ - static void local_bp_compute(El::Int height, - const AbsMat& local_prediction, - const AbsMat& local_ground_truth, - const AbsMat& local_gradient_wrt_output, - AbsMat& local_gradient_wrt_prediction, - AbsMat& local_gradient_wrt_ground_truth); + void local_bp_compute(); /** Workspace matrix. */ - std::unique_ptr m_workspace; + std::unique_ptr m_workspace; }; +#ifndef LBANN_MEAN_ABSOLUTE_ERROR_LAYER_INSTANTIATE + +#define PROTO_DEVICE(T, Device) \ + extern template class mean_absolute_error_layer< \ + T, data_layout::DATA_PARALLEL, Device>; \ + extern template class mean_absolute_error_layer< \ + T, data_layout::MODEL_PARALLEL, Device> + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +#endif // LBANN_MEAN_ABSOLUTE_ERROR_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_LOSS_MEAN_ABSOLUTE_ERROR_HPP_INCLUDED diff --git a/include/lbann/layers/loss/mean_squared_error.hpp b/include/lbann/layers/loss/mean_squared_error.hpp index 19ead85c346..d3de49a9580 100644 --- a/include/lbann/layers/loss/mean_squared_error.hpp +++ b/include/lbann/layers/loss/mean_squared_error.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_LOSS_MEAN_SQUARED_ERROR_HPP_INCLUDED #define LBANN_LAYERS_LOSS_MEAN_SQUARED_ERROR_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -39,23 +39,32 @@ namespace lbann { * = \frac{1}{n} \sum\limits_{i=1}^{n} (y_i - \hat{y}_i)^2 * @f] */ -template -class mean_squared_error_layer : public Layer { +template +class mean_squared_error_layer : public data_type_layer { public: + /** @name Public Types */ + ///@{ - mean_squared_error_layer(lbann_comm *comm) : Layer(comm) { + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} + +public: + + mean_squared_error_layer(lbann_comm *comm) : data_type_layer(comm) { this->m_expected_num_parent_layers = 2; } mean_squared_error_layer(const mean_squared_error_layer& other) - : Layer(other) { + : data_type_layer(other) { m_workspace.reset(other.m_workspace ? other.m_workspace->Copy() : nullptr); } mean_squared_error_layer& operator=(const mean_squared_error_layer& other) { - Layer::operator=(other); + data_type_layer::operator=(other); m_workspace.reset(other.m_workspace ? 
other.m_workspace->Copy() : nullptr); @@ -67,18 +76,18 @@ class mean_squared_error_layer : public Layer { data_layout get_data_layout() const override { return T_layout; } El::Device get_device_allocation() const override { return Dev; } - void setup_dims() override { - Layer::setup_dims(); - set_output_dims({1}); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims({1}); // Check that input dimensions match - if (get_input_dims(0) != get_input_dims(1)) { - const auto& parents = get_parent_layers(); + if (this->get_input_dims(0) != this->get_input_dims(1)) { + const auto& parents = this->get_parent_layers(); std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "has input tensors with different dimensions ("; - for (int i = 0; i < get_num_parents(); ++i) { - const auto& dims = get_input_dims(i); + for (int i = 0; i < this->get_num_parents(); ++i) { + const auto& dims = this->get_input_dims(i); err << (i > 0 ? ", " : "") << "layer \"" << parents[i]->get_name() << "\" outputs "; for (size_t j = 0; j < dims.size(); ++j) { @@ -91,12 +100,12 @@ class mean_squared_error_layer : public Layer { } - void setup_data() override { - Layer::setup_data(); + void setup_data(size_t max_mini_batch_size) override { + data_type_layer::setup_data(max_mini_batch_size); // Initialize workspace - const auto& input_dist = get_prev_activations(0).DistData(); - m_workspace.reset(AbsDistMat::Instantiate(*input_dist.grid, + const auto& input_dist = this->get_prev_activations(0).DistData(); + m_workspace.reset(AbsDistMatrixType::Instantiate(*input_dist.grid, input_dist.root, El::STAR, input_dist.rowDist, @@ -116,17 +125,14 @@ class mean_squared_error_layer : public Layer { // Initialize workspace m_workspace->Empty(); - m_workspace->AlignWith(get_prev_activations()); - m_workspace->Resize(1, get_prev_activations().Width()); + m_workspace->AlignWith(this->get_prev_activations()); + m_workspace->Resize(1, this->get_prev_activations().Width()); // Compute local contributions and accumulate /// @todo Consider reduce rather than allreduce - local_fp_compute(get_input_size(), - get_local_prev_activations(0), - get_local_prev_activations(1), - m_workspace->Matrix()); - m_comm->allreduce(*m_workspace, m_workspace->RedundantComm()); - El::Copy(*m_workspace, get_activations()); + local_fp_compute(); + this->get_comm()->allreduce(*m_workspace, m_workspace->RedundantComm()); + El::Copy(*m_workspace, this->get_activations()); // Clean up m_workspace->Empty(); @@ -137,16 +143,11 @@ class mean_squared_error_layer : public Layer { // Initialize workspace m_workspace->Empty(); - m_workspace->AlignWith(get_prev_activations()); - El::Copy(get_prev_error_signals(), *m_workspace); + m_workspace->AlignWith(this->get_prev_activations()); + El::Copy(this->get_prev_error_signals(), *m_workspace); // Compute local gradients - local_bp_compute(get_input_size(), - get_local_prev_activations(0), - get_local_prev_activations(1), - m_workspace->LockedMatrix(), - get_local_error_signals(0), - get_local_error_signals(1)); + local_bp_compute(); // Clean up m_workspace->Empty(); @@ -156,23 +157,28 @@ class mean_squared_error_layer : public Layer { private: /** Compute local contributions to mean squared error loss. 
*/ - static void local_fp_compute(El::Int height, - const AbsMat& local_prediction, - const AbsMat& local_ground_truth, - AbsMat& local_contribution); + void local_fp_compute(); /** Compute local gradients. */ - static void local_bp_compute(El::Int height, - const AbsMat& local_prediction, - const AbsMat& local_ground_truth, - const AbsMat& local_gradient_wrt_output, - AbsMat& local_gradient_wrt_prediction, - AbsMat& local_gradient_wrt_ground_truth); + void local_bp_compute(); /** Workspace matrix. */ - std::unique_ptr m_workspace; + std::unique_ptr m_workspace; }; +#ifndef LBANN_MEAN_SQUARED_ERROR_LAYER_INSTANTIATE + +#define PROTO_DEVICE(T, Device) \ + extern template class mean_squared_error_layer< \ + T, data_layout::DATA_PARALLEL, Device>; \ + extern template class mean_squared_error_layer< \ + T, data_layout::MODEL_PARALLEL, Device> + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +#endif // LBANN_MEAN_SQUARED_ERROR_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_LOSS_MEAN_SQUARED_ERROR_HPP_INCLUDED diff --git a/include/lbann/layers/loss/top_k_categorical_accuracy.hpp b/include/lbann/layers/loss/top_k_categorical_accuracy.hpp index 6e0389e5f73..a58362bc4ef 100644 --- a/include/lbann/layers/loss/top_k_categorical_accuracy.hpp +++ b/include/lbann/layers/loss/top_k_categorical_accuracy.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_LOSS_TOP_K_CATEGORICAL_ACCURACY_HPP_INCLUDED #define LBANN_LAYERS_LOSS_TOP_K_CATEGORICAL_ACCURACY_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -42,12 +42,12 @@ namespace lbann { * * @todo Gracefully handle case where label is not a one-hot vector. */ -template -class top_k_categorical_accuracy_layer : public Layer { +template +class top_k_categorical_accuracy_layer : public data_type_layer { public: top_k_categorical_accuracy_layer(lbann_comm *comm, El::Int k) - : Layer(comm), m_k(k) { + : data_type_layer(comm), m_k(k) { this->m_expected_num_parent_layers = 2; } @@ -59,25 +59,25 @@ class top_k_categorical_accuracy_layer : public Layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = Layer::get_description(); + auto desc = data_type_layer::get_description(); desc.add("k", m_k); return desc; } protected: - void setup_dims() override { - Layer::setup_dims(); - set_output_dims({1}); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims({1}); // Check that input dimensions match - if (get_input_dims(0) != get_input_dims(1)) { - const auto& parents = get_parent_layers(); + if (this->get_input_dims(0) != this->get_input_dims(1)) { + const auto& parents = this->get_parent_layers(); std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "has input tensors with different dimensions ("; - for (int i = 0; i < get_num_parents(); ++i) { - const auto& dims = get_input_dims(i); + for (int i = 0; i < this->get_num_parents(); ++i) { + const auto& dims = this->get_input_dims(i); err << (i > 0 ? 
", " : "") << "layer \"" << parents[i]->get_name() << "\" outputs "; for (size_t j = 0; j < dims.size(); ++j) { @@ -99,6 +99,18 @@ class top_k_categorical_accuracy_layer : public Layer { }; +#ifndef LBANN_TOP_K_CATEGORICAL_ACCURACY_LAYER_INSTANTIATE + +#define PROTO_DEVICE(T, Device) \ + extern template class top_k_categorical_accuracy_layer< \ + T, data_layout::DATA_PARALLEL, Device>; \ + extern template class top_k_categorical_accuracy_layer< \ + T, data_layout::MODEL_PARALLEL, Device> + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_TOP_K_CATEGORICAL_ACCURACY_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_LOSS_TOP_K_CATEGORICAL_ACCURACY_HPP_INCLUDED diff --git a/include/lbann/layers/math/CMakeLists.txt b/include/lbann/layers/math/CMakeLists.txt index a6d19112716..ca9b3d9461b 100644 --- a/include/lbann/layers/math/CMakeLists.txt +++ b/include/lbann/layers/math/CMakeLists.txt @@ -3,6 +3,7 @@ set_full_path(THIS_DIR_HEADERS unary.hpp binary.hpp clamp.hpp + matmul.hpp ) # Propagate the files up the tree diff --git a/include/lbann/layers/math/binary.hpp b/include/lbann/layers/math/binary.hpp index d389ccbaae8..cb462a21dff 100644 --- a/include/lbann/layers/math/binary.hpp +++ b/include/lbann/layers/math/binary.hpp @@ -27,68 +27,83 @@ #ifndef LBANN_LAYERS_MATH_BINARY_HPP_INCLUDED #define LBANN_LAYERS_MATH_BINARY_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { -/** @brief Templated class for entry-wise binary layers. - * @param Layout Parallelism scheme. - * @param Device Device allocation. - * @param Name Type that can be converted into a string. - */ -template -class entrywise_binary_layer : public Layer { -public: - - entrywise_binary_layer(lbann_comm *comm) : Layer(comm) { - this->m_expected_num_parent_layers = 2; - } - entrywise_binary_layer* copy() const override { - return new entrywise_binary_layer(*this); - } - std::string get_type() const override { return Name(); } - data_layout get_data_layout() const override { return Layout; } - El::Device get_device_allocation() const override { return Device; } - -protected: - - void setup_dims() override { - Layer::setup_dims(); - set_output_dims(get_input_dims()); - - // Check that input dimensions match - if (get_input_dims(0) != get_input_dims(1)) { - const auto& parents = get_parent_layers(); - std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " - << "has input tensors with different dimensions ("; - for (int i = 0; i < get_num_parents(); ++i) { - const auto& dims = get_input_dims(i); - err << (i > 0 ? ", " : "") - << "layer \"" << parents[i]->get_name() << "\" outputs "; - for (size_t j = 0; j < dims.size(); ++j) { - err << (j > 0 ? 
" x " : "") << dims[j]; - } - } - err << ")"; - LBANN_ERROR(err.str()); - } - +#define LBANN_DECLARE_ENTRYWISE_BINARY_LAYER(LAYER_NAME, LAYER_STRING) \ + template \ + class LAYER_NAME : public data_type_layer { \ + public: \ + LAYER_NAME(lbann_comm *comm) : data_type_layer(comm) { \ + this->m_expected_num_parent_layers = 2; \ + } \ + LAYER_NAME* copy() const override { \ + return new LAYER_NAME(*this); \ + } \ + std::string get_type() const override { return LAYER_STRING; } \ + data_layout get_data_layout() const override { return Layout; } \ + El::Device get_device_allocation() const override { return Device; } \ + protected: \ + void setup_dims(DataReaderMetaData& dr_metadata) override { \ + data_type_layer::setup_dims(dr_metadata); \ + this->set_output_dims(this->get_input_dims()); \ + /* Check that input dimensions match */ \ + if (this->get_input_dims(0) != this->get_input_dims(1)) { \ + const auto& parents = this->get_parent_layers(); \ + std::stringstream err; \ + err << this->get_type() << " layer \"" << this->get_name() << "\" " \ + << "has input tensors with different dimensions ("; \ + for (int i = 0; i < this->get_num_parents(); ++i) { \ + const auto& dims = this->get_input_dims(i); \ + err << (i > 0 ? ", " : "") \ + << "layer \"" << parents[i]->get_name() << "\" outputs "; \ + for (size_t j = 0; j < dims.size(); ++j) { \ + err << (j > 0 ? " x " : "") << dims[j]; \ + } \ + } \ + err << ")"; \ + LBANN_ERROR(err.str()); \ + } \ + } \ + void fp_compute() override; \ + void bp_compute() override; \ } - void fp_compute() override; - void bp_compute() override; - -}; +// Convenience macros for ETI decls for binary layers + +#ifndef LBANN_BINARY_LAYER_INSTANTIATE +#define BINARY_ETI_DECL_MACRO_DEV(LAYER_NAME, T, DEVICE) \ + extern template class LAYER_NAME; \ + extern template class LAYER_NAME +#else +#define BINARY_ETI_DECL_MACRO_DEV(...) 
+#endif // LBANN_BINARY_LAYER_INSTANTIATE + +// Instnatiate both data and model parallel layers +#define BINARY_ETI_INST_MACRO_DEV_DT(LAYER_NAME, T, DEVICE) \ + template class LAYER_NAME; \ + template class LAYER_NAME + +// Instantiate a DEVICE for each allowed tensor data type +#define BINARY_ETI_INST_MACRO_DEV(LAYER_NAME, DEVICE) \ + BINARY_ETI_INST_MACRO_DEV_DT(LAYER_NAME, float, DEVICE); \ + BINARY_ETI_INST_MACRO_DEV_DT(LAYER_NAME, double, DEVICE) + +#ifdef LBANN_HAS_GPU +#define BINARY_ETI_DECL_MACRO(LAYER_NAME, T) \ + BINARY_ETI_DECL_MACRO_DEV(LAYER_NAME, T, El::Device::CPU); \ + BINARY_ETI_DECL_MACRO_DEV(LAYER_NAME, T, El::Device::GPU) +#else +#define BINARY_ETI_DECL_MACRO(LAYER_NAME, T) \ + BINARY_ETI_DECL_MACRO_DEV(LAYER_NAME, T, El::Device::CPU) +#endif // LBANN_HAS_GPU // Convenience macro to define an entry-wise binary layer class #define DEFINE_ENTRYWISE_BINARY_LAYER(layer_name, layer_string) \ - struct layer_name##_name_struct { \ - inline operator std::string() { return layer_string; } \ - }; \ - template \ - using layer_name \ - = entrywise_binary_layer; + LBANN_DECLARE_ENTRYWISE_BINARY_LAYER(layer_name, layer_string); \ + BINARY_ETI_DECL_MACRO(layer_name, float); \ + BINARY_ETI_DECL_MACRO(layer_name, double) // Arithmetic operations DEFINE_ENTRYWISE_BINARY_LAYER(add_layer, "add"); @@ -118,4 +133,7 @@ DEFINE_ENTRYWISE_BINARY_LAYER(logical_xor_layer, "logical xor"); } // namespace lbann #undef DEFINE_ENTRYWISE_BINARY_LAYER +#undef BINARY_ETI_DECL_MACRO +#undef BINARY_ETI_DECL_MACRO_DEV + #endif // LBANN_LAYERS_MATH_BINARY_HPP_INCLUDED diff --git a/include/lbann/layers/math/clamp.hpp b/include/lbann/layers/math/clamp.hpp index 4b79dc06c09..69164b65da3 100644 --- a/include/lbann/layers/math/clamp.hpp +++ b/include/lbann/layers/math/clamp.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_MATH_CLAMP_HPP_INCLUDED #define LBANN_LAYERS_MATH_CLAMP_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -42,12 +42,18 @@ namespace lbann { * \end{cases} * @f] */ -template -class clamp_layer : public Layer { +template +class clamp_layer : public data_type_layer { +#ifdef LBANN_HAS_GPU_FP16 + using CompareType = typename std::conditional::value, float, TensorDataType>::type; +#else + using CompareType = TensorDataType; +#endif + public: - clamp_layer(lbann_comm *comm, DataType min, DataType max) - : Layer(comm), m_min(min), m_max(max) { - if (m_min > m_max) { + clamp_layer(lbann_comm *comm, TensorDataType min, TensorDataType max) + : data_type_layer(comm), m_min(min), m_max(max) { + if (CompareType(m_min) > CompareType(m_max)) { std::stringstream err; err << "[" << m_min << "," << m_max << "] is an invalid range"; LBANN_ERROR(err.str()); @@ -59,7 +65,7 @@ class clamp_layer : public Layer { El::Device get_device_allocation() const override { return Device; } description get_description() const override { - auto&& desc = Layer::get_description(); + auto desc = data_type_layer::get_description(); std::stringstream ss; ss << "[" << m_min << "," << m_max << "]"; desc.add("Range", ss.str()); @@ -67,21 +73,34 @@ class clamp_layer : public Layer { } protected: - void setup_dims() override { - Layer::setup_dims(); - set_output_dims(get_input_dims()); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); } void fp_compute() override; void bp_compute() override; private: /** Minimum output. 
*/ - DataType m_min; + TensorDataType m_min; /** Maximum output. */ - DataType m_max; + TensorDataType m_max; }; +#ifndef LBANN_CLAMP_LAYER_INSTANTIATE + +#define PROTO_DEVICE(T, Device) \ + extern template class clamp_layer< \ + T, data_layout::DATA_PARALLEL, Device>; \ + extern template class clamp_layer< \ + T, data_layout::MODEL_PARALLEL, Device> + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +#endif // LBANN_CLAMP_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_MATH_CLAMP_HPP_INCLUDED diff --git a/include/lbann/layers/math/matmul.hpp b/include/lbann/layers/math/matmul.hpp new file mode 100644 index 00000000000..1331bee921c --- /dev/null +++ b/include/lbann/layers/math/matmul.hpp @@ -0,0 +1,204 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYER_MATH_MATMUL_HPP_INCLUDED +#define LBANN_LAYER_MATH_MATMUL_HPP_INCLUDED + +#include "lbann/layers/data_type_layer.hpp" + +namespace lbann { + +/** @brief Matrix multiplication. + * + * Takes two 2D input tensors and outputs their matrix product. + * Matrix products are computed independently for each mini-batch + * sample, in a similar manner as NumPy's matmul function. + * + * @todo Support >2 dimensions, matvecs, and dot products + * + */ +template +class matmul_layer : public data_type_layer { + static_assert(Layout == data_layout::DATA_PARALLEL, + "matmul_layer only supports " + "data-parallel data layout"); + +public: + + matmul_layer(lbann_comm *comm, + bool transpose_a = false, + bool transpose_b = false); + matmul_layer(const matmul_layer& other) = default; + matmul_layer& operator=(const matmul_layer& other) = default; + matmul_layer* copy() const override; + + std::string get_type() const override; + data_layout get_data_layout() const override; + El::Device get_device_allocation() const override; + + description get_description() const override; + +protected: + + void setup_dims(DataReaderMetaData& dr_metadata) override; + void fp_compute() override; + void bp_compute() override; + +private: + + /** If true, matrices from the first input tensor are transposed + * before multiplication. */ + bool m_transpose_a; + /** If true, matrices from the second input tensor are transposed + * before multiplication. 
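To make the dimension rules that setup_dims enforces below concrete, here is a stand-alone sketch with made-up sizes (plain C++, not LBANN's API): for per-sample matrices A of size 3x4 and B of size 4x2 with no transposes, the contracted dimensions must agree and the output is 3x2.

    // Illustrative only: the shape rule checked by setup_dims(), here for
    // the NN (no transpose) case with hard-coded sizes.
    #include <array>
    #include <iostream>

    int main() {
      // Input 0 is 3x4, input 1 is 4x2 (height x width, per mini-batch sample).
      std::array<int, 2> dims0 = {3, 4}, dims1 = {4, 2};
      bool transpose_a = false, transpose_b = false;

      const int inner0 = transpose_a ? dims0[0] : dims0[1]; // contracted dim of A
      const int inner1 = transpose_b ? dims1[1] : dims1[0]; // contracted dim of B
      if (inner0 != inner1) {
        std::cerr << "incompatible matrix dimensions\n";
        return 1;
      }
      const int out_height = transpose_a ? dims0[1] : dims0[0];
      const int out_width  = transpose_b ? dims1[0] : dims1[1];
      std::cout << out_height << " x " << out_width << '\n'; // prints "3 x 2"
      return 0;
    }

Setting transpose_a or transpose_b flips which extent of that input participates in the compatibility check, exactly as in the NN/NT/TN/TT cases named in the error message below.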
*/ + bool m_transpose_b; + + template + friend void fp_compute_impl(matmul_layer&, bool, bool); + template + friend void bp_compute_impl(matmul_layer&, bool, bool); +}; + +// ========================================================= +// Implementation +// ========================================================= + +template +matmul_layer::matmul_layer(lbann_comm *comm, bool transpose_a, bool transpose_b) + : data_type_layer(comm), + m_transpose_a{transpose_a}, + m_transpose_b{transpose_b} { + this->m_expected_num_parent_layers = 2; +} + +template +matmul_layer* matmul_layer::copy() const { + return new matmul_layer(*this); +} + +template +std::string matmul_layer::get_type() const { + return "matrix multiply"; +} + +template +data_layout matmul_layer::get_data_layout() const { + return Layout; +} + +template +El::Device matmul_layer::get_device_allocation() const { + return Device; +} + +template +description matmul_layer::get_description() const { + auto desc = data_type_layer::get_description(); + desc.add("Transpose A", m_transpose_a); + desc.add("Transpose B", m_transpose_b); + return desc; +} + +template +void matmul_layer::setup_dims(DataReaderMetaData& dr_metadata) { + data_type_layer::setup_dims(dr_metadata); + + // Input dimensions + const auto& input0_dims = this->get_input_dims(0); + const auto& input1_dims = this->get_input_dims(1); + + // Lambdas to help print error messages + auto print_name = [this] () -> std::string { + return this->get_type() + " layer \"" + this->get_name() + "\""; + }; + auto print_inputs = [this, &input0_dims, &input1_dims] () -> std::string { + auto print_dims = [] (const decltype(input0_dims)& dims) -> std::string { + std::ostringstream ss; + for (size_t i = 0; i < dims.size(); ++i) { + ss << (i > 0 ? "x" : "") << dims[i]; + } + return ss.str(); + }; + const auto& parents = this->get_parent_layers(); + return lbann::build_string( + parents[0]->get_type()," layer \"",parents[0]->get_name(),"\" ", + "outputs ",print_dims(input0_dims),", ", + parents[1]->get_type()," layer \"",parents[1]->get_name(),"\" ", + "outputs ",print_dims(input1_dims)); + }; + + // Check input dimensions + if (input0_dims.size() != input1_dims.size()) { + LBANN_ERROR("input tensors in ",print_name()," " + "have different numbers of dimensions ", + "(",print_inputs(),")"); + } + if (input0_dims.size() != 2) { + LBANN_ERROR("input tensors in ",print_name()," are not 2D ", + "(",print_inputs(),")"); + } + + // Get matrix dimensions + const auto input0_height = *(input0_dims.rbegin()+1); + const auto input0_width = *(input0_dims.rbegin()); + const auto input1_height = *(input1_dims.rbegin()+1); + const auto input1_width = *(input1_dims.rbegin()); + if ((m_transpose_a ? input0_height : input0_width) + != (m_transpose_b ? input1_width : input1_height)) { + LBANN_ERROR("input tensors in ",print_name()," ", + "are not compatible with ", + (m_transpose_a ? "T" : "N"), (m_transpose_b ? "T" : "N"), + " matrix multiplication ", + "(",print_inputs(),")"); + } + + // Set output dimensions + std::vector output_dims(input0_dims); + *(output_dims.rbegin()+1) = (m_transpose_a ? input0_width : input0_height); + *(output_dims.rbegin()) = (m_transpose_b ? 
input1_height : input1_width); + this->set_output_dims(output_dims); + +} + +// ========================================================= +// Explicit template instantiation +// ========================================================= + +#ifndef LBANN_MATMUL_LAYER_INSTANTIATE + +#define PROTO_DEVICE(T, Device) \ + extern template class matmul_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +#endif // LBANN_MATMUL_LAYER_INSTANTIATE + +} // namespace lbann + +#endif // LBANN_LAYER_MATH_MATMUL_HPP_INCLUDED diff --git a/include/lbann/layers/math/unary.hpp b/include/lbann/layers/math/unary.hpp index 73034b0593f..550f7fa7b45 100644 --- a/include/lbann/layers/math/unary.hpp +++ b/include/lbann/layers/math/unary.hpp @@ -27,42 +27,62 @@ #ifndef LBANN_LAYERS_MATH_UNARY_HPP_INCLUDED #define LBANN_LAYERS_MATH_UNARY_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { -/** @brief Templated class for entry-wise unary layers. - * @param Layout Parallelism scheme. - * @param Device Device allocation. - * @param Name Type that can be converted into a string. - */ -template -class entrywise_unary_layer : public Layer { -public: - entrywise_unary_layer(lbann_comm *comm) : Layer(comm) {} - entrywise_unary_layer* copy() const override { - return new entrywise_unary_layer(*this); +#define LBANN_DECLARE_ENTRYWISE_UNARY_LAYER(LAYER_NAME, LAYER_STRING) \ + template \ + class LAYER_NAME : public data_type_layer { \ + public: \ + LAYER_NAME(lbann_comm *comm) : data_type_layer(comm) {} \ + LAYER_NAME* copy() const override { \ + return new LAYER_NAME(*this); \ + } \ + std::string get_type() const override { return LAYER_STRING; } \ + data_layout get_data_layout() const override { return Layout; } \ + El::Device get_device_allocation() const override { return Device; } \ + protected: \ + void setup_dims(DataReaderMetaData& dr_metadata) override { \ + data_type_layer::setup_dims(dr_metadata); \ + this->set_output_dims(this->get_input_dims()); \ + } \ + void fp_compute() override; \ + void bp_compute() override; \ } - std::string get_type() const override { return Name(); } - data_layout get_data_layout() const override { return Layout; } - El::Device get_device_allocation() const override { return Device; } -protected: - void setup_dims() override { - Layer::setup_dims(); - set_output_dims(get_input_dims()); - } - void fp_compute() override; - void bp_compute() override; -}; + +// Convenience macros for ETI decls for unary layers + +#ifndef LBANN_UNARY_LAYER_INSTANTIATE +#define UNARY_ETI_DECL_MACRO_DEV(LAYER_NAME, T, DEVICE) \ + extern template class LAYER_NAME; \ + extern template class LAYER_NAME +#else +#define UNARY_ETI_DECL_MACRO_DEV(...) 
+#endif // LBANN_UNARY_LAYER_INSTANTIATE + +#define UNARY_ETI_INST_MACRO_DEV_DT(LAYER_NAME, T, DEVICE) \ + template class LAYER_NAME; \ + template class LAYER_NAME + +#define UNARY_ETI_INST_MACRO_DEV(LAYER_NAME, DEVICE) \ + UNARY_ETI_INST_MACRO_DEV_DT(LAYER_NAME, float, DEVICE); \ + UNARY_ETI_INST_MACRO_DEV_DT(LAYER_NAME, double, DEVICE) + +#ifdef LBANN_HAS_GPU +#define UNARY_ETI_DECL_MACRO(LAYER_NAME, T) \ + UNARY_ETI_DECL_MACRO_DEV(LAYER_NAME, T, El::Device::CPU); \ + UNARY_ETI_DECL_MACRO_DEV(LAYER_NAME, T, El::Device::GPU) +#else +#define UNARY_ETI_DECL_MACRO(LAYER_NAME, T) \ + UNARY_ETI_DECL_MACRO_DEV(LAYER_NAME, T, El::Device::CPU) +#endif // LBANN_HAS_GPU // Convenience macro to define an entry-wise unary layer class -#define DEFINE_ENTRYWISE_UNARY_LAYER(layer_name, layer_string) \ - struct layer_name##_name_struct { \ - inline operator std::string() { return layer_string; } \ - }; \ - template \ - using layer_name \ - = entrywise_unary_layer; +#define DEFINE_ENTRYWISE_UNARY_LAYER(layer_name, layer_string) \ + LBANN_DECLARE_ENTRYWISE_UNARY_LAYER(layer_name, layer_string); \ + UNARY_ETI_DECL_MACRO(layer_name, float); \ + UNARY_ETI_DECL_MACRO(layer_name, double) // Logical operations DEFINE_ENTRYWISE_UNARY_LAYER(logical_not_layer, "logical not"); @@ -109,4 +129,7 @@ DEFINE_ENTRYWISE_UNARY_LAYER(atanh_layer, "hyperbolic arctangent"); } // namespace lbann #undef DEFINE_ENTRYWISE_UNARY_LAYER +#undef UNARY_ETI_DECL_MACRO +#undef UNARY_ETI_DECL_MACRO_DEV + #endif // LBANN_LAYERS_MATH_UNARY_HPP_INCLUDED diff --git a/include/lbann/layers/misc/CMakeLists.txt b/include/lbann/layers/misc/CMakeLists.txt index 2b5808fdfa7..06c9e2acfb7 100644 --- a/include/lbann/layers/misc/CMakeLists.txt +++ b/include/lbann/layers/misc/CMakeLists.txt @@ -5,6 +5,9 @@ set_full_path(THIS_DIR_HEADERS channelwise_mean.hpp mini_batch_index.hpp mini_batch_size.hpp + argmax.hpp + argmin.hpp + one_hot.hpp ) # Propagate the files up the tree diff --git a/include/lbann/layers/misc/argmax.hpp b/include/lbann/layers/misc/argmax.hpp new file mode 100644 index 00000000000..71c581dd19c --- /dev/null +++ b/include/lbann/layers/misc/argmax.hpp @@ -0,0 +1,86 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_MISC_ARGMAX_HPP_INCLUDED +#define LBANN_LAYERS_MISC_ARGMAX_HPP_INCLUDED + +#include "lbann/layers/data_type_layer.hpp" + +namespace lbann { + +/** @brief Get index of maximum-value tensor entry + * + * Expects a 1-D input tensor. If multiple entries have the same + * maximum value, outputs the index of the first one. + */ +template +class argmax_layer : public data_type_layer { + static_assert(Layout == data_layout::DATA_PARALLEL, + "argmax layer only supports data parallel layout"); + static_assert(Device == El::Device::CPU, + "argmax layer only supports CPU"); +public: + + argmax_layer(lbann_comm* comm) : data_type_layer(comm) { } + argmax_layer* copy() const override { return new argmax_layer(*this); } + std::string get_type() const override { return "argmax"; } + data_layout get_data_layout() const override { return Layout; } + El::Device get_device_allocation() const override { return Device; } + +protected: + + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims({1}); + + // Make sure input tensor is 1-D + const auto input_dims = this->get_input_dims(); + if (input_dims.size() != 1) { + LBANN_ERROR(get_type()," layer \"",this->get_name(),"\" ", + "expects a 1-D input tensor, ", + "but parent layer \"",this->m_parent_layers[0]->get_name(),"\" ", + "outputs a ",input_dims.size(),"-D tensor"); + } + + } + + void fp_compute() override; + +}; + +#ifndef LBANN_ARGMAX_LAYER_INSTANTIATE +#define PROTO(T) \ + extern template class argmax_layer + +#define LBANN_INSTANTIATE_CPU_HALF +#include "lbann/macros/instantiate.hpp" +#undef PROTO +#undef LBANN_INSTANTIATE_CPU_HALF +#endif // LBANN_ARGMAX_LAYER_INSTANTIATE + +} // namespace lbann + +#endif // LBANN_LAYERS_MISC_ARGMAX_HPP_INCLUDED diff --git a/include/lbann/layers/misc/argmin.hpp b/include/lbann/layers/misc/argmin.hpp new file mode 100644 index 00000000000..ccfe846bfc1 --- /dev/null +++ b/include/lbann/layers/misc/argmin.hpp @@ -0,0 +1,85 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_MISC_ARGMIN_HPP_INCLUDED +#define LBANN_LAYERS_MISC_ARGMIN_HPP_INCLUDED + +#include "lbann/layers/data_type_layer.hpp" + +namespace lbann { + +/** @brief Get index of minimum-value tensor entry + * + * Expects a 1-D input tensor. 
If multiple entries have the same + * minimum value, outputs the index of the first one. + */ +template +class argmin_layer : public data_type_layer { + static_assert(Layout == data_layout::DATA_PARALLEL, + "argmin layer only supports data parallel layout"); + static_assert(Device == El::Device::CPU, + "argmin layer only supports CPU"); +public: + + argmin_layer(lbann_comm* comm) : data_type_layer(comm) { } + argmin_layer* copy() const override { return new argmin_layer(*this); } + std::string get_type() const override { return "argmin"; } + data_layout get_data_layout() const override { return Layout; } + El::Device get_device_allocation() const override { return Device; } + +protected: + + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims({1}); + + // Make sure input tensor is 1-D + const auto input_dims = this->get_input_dims(); + if (input_dims.size() != 1) { + LBANN_ERROR(get_type()," layer \"",this->get_name(),"\" ", + "expects a 1-D input tensor, ", + "but parent layer \"",this->m_parent_layers[0]->get_name(),"\" ", + "outputs a ",input_dims.size(),"-D tensor"); + } + + } + + void fp_compute() override; + +}; + +#ifndef LBANN_ARGMIN_LAYER_INSTANTIATE +#define PROTO(T) \ + extern template class argmin_layer + +#define LBANN_INSTANTIATE_CPU_HALF +#include "lbann/macros/instantiate.hpp" +#undef PROTO +#undef LBANN_INSTANTIATE_CPU_HALF +#endif // LBANN_ARGMIN_LAYER_INSTANTIATE +} // namespace lbann + +#endif // LBANN_LAYERS_MISC_ARGMIN_HPP_INCLUDED diff --git a/include/lbann/layers/misc/channelwise_mean.hpp b/include/lbann/layers/misc/channelwise_mean.hpp index 5889b853256..aea45b04a6c 100644 --- a/include/lbann/layers/misc/channelwise_mean.hpp +++ b/include/lbann/layers/misc/channelwise_mean.hpp @@ -27,20 +27,22 @@ #ifndef LBANN_LAYERS_MISC_CHANNELWISE_MEAN_HPP_INCLUDED #define LBANN_LAYERS_MISC_CHANNELWISE_MEAN_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { /** @todo Replace with more general reduction layer. 
*/ -template -class channelwise_mean_layer : public Layer { +template +class channelwise_mean_layer : public data_type_layer { + static_assert(Layout == data_layout::DATA_PARALLEL, + "channelwise_mean_layer only supports " + "data-parallel data layout"); public: channelwise_mean_layer(lbann_comm *comm) - : Layer(comm) { - static_assert(Layout == data_layout::DATA_PARALLEL, - "channelwise_mean_layer only supports " - "data-parallel data layout"); + : data_type_layer(comm) { if (comm->am_trainer_master()) { LBANN_WARNING("channelwise_mean_layer is experimental " "and may be deprecated at any time"); @@ -54,10 +56,10 @@ class channelwise_mean_layer : public Layer { protected: - void setup_dims() override { - Layer::setup_dims(); - const auto& input_dims = get_input_dims(); - set_output_dims({input_dims[0]}); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + const auto& input_dims = this->get_input_dims(); + this->set_output_dims({input_dims[0]}); } void fp_compute() override; @@ -65,6 +67,14 @@ class channelwise_mean_layer : public Layer { }; +#ifndef LBANN_CHANNELWISE_MEAN_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class channelwise_mean_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_CHANNELWISE_MEAN_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_MISC_CHANNELWISE_MEAN_HPP_INCLUDED diff --git a/include/lbann/layers/misc/channelwise_softmax.hpp b/include/lbann/layers/misc/channelwise_softmax.hpp new file mode 100644 index 00000000000..41ae2e4f865 --- /dev/null +++ b/include/lbann/layers/misc/channelwise_softmax.hpp @@ -0,0 +1,127 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_REGULARIZERS_CHANNELWISE_SOFTMAX_HPP_INCLUDED +#define LBANN_LAYERS_REGULARIZERS_CHANNELWISE_SOFTMAX_HPP_INCLUDED + +#include "lbann/layers/data_type_layer.hpp" + +namespace lbann { + +/** @brief Apply softmax to tensor channels. 
+ * + * The input tensor is sliced along the first tensor dimension (the + * "channel" dimension for image data in CHW format) and the softmax + * function is applied to each slice: + * @f[ \text{softmax}(x)_i = \frac{e^{x_i}}{\sum_j e^{x_j}} @f] + * + * This is not to be confused with @c softmax_mode::CHANNEL for + * @c softmax_layer, which applies the softmax function to entries + * corresponding to the same spatial position. "Channel mode" softmax + * might be described as "position-wise softmax". + * + */ +template +class channelwise_softmax_layer : public data_type_layer { + static_assert(Layout == data_layout::DATA_PARALLEL, + "channelwise_softmax_layer only supports " + "data-parallel data layout"); + +public: + + channelwise_softmax_layer(lbann_comm* comm); + + channelwise_softmax_layer(const channelwise_softmax_layer& other) = default; + channelwise_softmax_layer& operator=(const channelwise_softmax_layer& other) = default; + channelwise_softmax_layer* copy() const override; + + std::string get_type() const override; + data_layout get_data_layout() const override; + El::Device get_device_allocation() const override; + +protected: + + void setup_dims(DataReaderMetaData& dr_metadata) override; + + void fp_compute() override; + void bp_compute() override; + +}; + +// Builder function +LBANN_DEFINE_LAYER_BUILDER(channelwise_softmax); + +// ========================================================= +// Implementation +// ========================================================= + +template +channelwise_softmax_layer::channelwise_softmax_layer( + lbann_comm* comm) + : data_type_layer(comm) +{} + +template +channelwise_softmax_layer* channelwise_softmax_layer::copy() const { + return new channelwise_softmax_layer(*this); +} + +template +std::string channelwise_softmax_layer::get_type() const { + return "channel-wise softmax"; +} + +template +data_layout channelwise_softmax_layer::get_data_layout() const { + return Layout; +} + +template +El::Device channelwise_softmax_layer::get_device_allocation() const { + return Device; +} + +template +void channelwise_softmax_layer::setup_dims(DataReaderMetaData& dr_metadata) { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); +} + +// ========================================================= +// Explicit template instantiation +// ========================================================= + +#ifndef LBANN_CHANNELWISE_SOFTMAX_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class channelwise_softmax_layer< \ + T, data_layout::DATA_PARALLEL, Device>; +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_CHANNELWISE_SOFTMAX_LAYER_INSTANTIATE + +} // namespace lbann + +#endif // LBANN_LAYERS_REGULARIZERS_CHANNELWISE_SOFTMAX_HPP_INCLUDED diff --git a/include/lbann/layers/misc/covariance.hpp b/include/lbann/layers/misc/covariance.hpp index 8f31d12d545..a370e71f58d 100644 --- a/include/lbann/layers/misc/covariance.hpp +++ b/include/lbann/layers/misc/covariance.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_MISC_COVARIANCE_HPP_INCLUDED #define LBANN_LAYERS_MISC_COVARIANCE_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -43,22 +43,31 @@ namespace lbann { * Scaling by @f$ 1/n @f$ instead of @f$ 1/(n-1) @f$ is a biased * estimator. 
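For reference, the quantity being described is the ordinary per-sample covariance of the two input tensors' entries. The following is the textbook definition, stated here for clarity rather than copied from the header, with the biased form scaling by 1/n and the unbiased form by 1/(n-1):

    \bar{x} = \frac{1}{n}\sum_{i=1}^{n} x_i, \qquad
    \bar{y} = \frac{1}{n}\sum_{i=1}^{n} y_i

    \operatorname{cov}(x,y) =
      \begin{cases}
        \dfrac{1}{n}\sum_{i=1}^{n}(x_i-\bar{x})(y_i-\bar{y}) & \text{(biased)} \\[1ex]
        \dfrac{1}{n-1}\sum_{i=1}^{n}(x_i-\bar{x})(y_i-\bar{y}) & \text{(unbiased)}
      \end{cases}
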
*/ -template -class covariance_layer : public Layer { +template +class covariance_layer : public data_type_layer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} + public: covariance_layer(lbann_comm *comm, bool biased) - : Layer(comm), m_biased(biased) { + : data_type_layer(comm), m_biased(biased) { this->m_expected_num_parent_layers = 2; } covariance_layer(const covariance_layer& other) - : Layer(other), + : data_type_layer(other), m_biased(other.m_biased), m_means(other.m_means ? other.m_means->Copy() : nullptr), m_workspace(other.m_workspace ? other.m_workspace->Copy() : nullptr) {} covariance_layer& operator=(const covariance_layer& other) { - Layer::operator=(other); + data_type_layer::operator=(other); m_biased = other.m_biased; m_means.reset(other.m_means ? other.m_means->Copy() : nullptr); m_workspace.reset(other.m_workspace ? @@ -72,7 +81,7 @@ class covariance_layer : public Layer { El::Device get_device_allocation() const override { return Device; } description get_description() const override { - auto&& desc = Layer::get_description(); + auto desc = data_type_layer::get_description(); desc.add("Biased", m_biased); return desc; } @@ -80,23 +89,23 @@ class covariance_layer : public Layer { protected: void setup_matrices(const El::Grid& grid) override { - Layer::setup_matrices(grid); - auto dist_data = get_prev_activations().DistData(); + data_type_layer::setup_matrices(grid); + auto dist_data = this->get_prev_activations().DistData(); dist_data.colDist = El::STAR; - m_means.reset(AbsDistMat::Instantiate(dist_data)); - m_workspace.reset(AbsDistMat::Instantiate(dist_data)); + m_means.reset(AbsDistMatrixType::Instantiate(dist_data)); + m_workspace.reset(AbsDistMatrixType::Instantiate(dist_data)); } - void setup_dims() override { - Layer::setup_dims(); - set_output_dims({1}); - if (get_input_dims(0) != get_input_dims(1)) { - const auto& parents = get_parent_layers(); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims({1}); + if (this->get_input_dims(0) != this->get_input_dims(1)) { + const auto& parents = this->get_parent_layers(); std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "has input tensors with different dimensions ("; - for (int i = 0; i < get_num_parents(); ++i) { - const auto& dims = get_input_dims(i); + for (int i = 0; i < this->get_num_parents(); ++i) { + const auto& dims = this->get_input_dims(i); err << (i > 0 ? ", " : "") << "layer \"" << parents[i]->get_name() << "\" outputs "; for (size_t j = 0; j < dims.size(); ++j) { @@ -117,12 +126,21 @@ class covariance_layer : public Layer { bool m_biased; /** Means for each mini-batch sample. */ - std::unique_ptr m_means; + std::unique_ptr m_means; /** Workspace. 
*/ - std::unique_ptr m_workspace; + std::unique_ptr m_workspace; }; +#ifndef LBANN_COVARIANCE_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class covariance_layer; \ + extern template class covariance_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_COVARIANCE_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_MISC_COVARIANCE_HPP_INCLUDED diff --git a/include/lbann/layers/misc/dist_embedding.hpp b/include/lbann/layers/misc/dist_embedding.hpp new file mode 100644 index 00000000000..9d7cb445521 --- /dev/null +++ b/include/lbann/layers/misc/dist_embedding.hpp @@ -0,0 +1,406 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_MISC_DIST_EMBEDDING_HPP_INCLUDED +#define LBANN_LAYERS_MISC_DIST_EMBEDDING_HPP_INCLUDED +#include "lbann/base.hpp" +#include "lbann/layers/layer.hpp" + +#if defined(LBANN_HAS_SHMEM) || defined(LBANN_HAS_NVSHMEM) +#include "lbann/layers/data_type_layer.hpp" +#include "lbann/models/model.hpp" +#include "lbann/optimizers/sgd.hpp" +#include "lbann/weights/weights_helpers.hpp" +#include "lbann/utils/memory.hpp" + +namespace lbann { + +/** @brief Embedding layer with distributed weights. + * + * This is similar to the embedding layer, which takes integer + * indices and returns embedding vectors from a lookup table. + * However, the embedding vectors are distributed between processes + * and one-sided inter-process communication is performed with + * OpenSHMEM (on CPU) or NVSHMEM (on GPU). + * + * The main benefit of this model-parallel approach is to handle + * cases where the embedding vectors don't fit on one process. It + * should also have better scaling properties when the mini-batch + * size is very large. + * + * To take advantage of sparse gradients, the distributed embedding + * layer provides the option to bypass the optimizer (which currently + * only supports dense gradients) and perform sparse SGD directly on + * the embedding weights. If enabled, SGD occurs during the layers + * "update" phase (i.e. in the virtual update_compute function). + * Otherwise, the layer converts sparse gradients to a dense tensor + * and passes it into the usual optimizer. This is a hack and will be + * deprecated once the optimizer class supports sparse gradients. + * + * @warning This is experimental. 
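A minimal sketch of the sparse SGD idea described above, assuming that only the embedding rows actually referenced in the current mini-batch carry gradients. The container types and the function name are illustrative placeholders, not LBANN's API, and the SHMEM/data-parallel details are omitted:

    #include <cstddef>
    #include <unordered_map>
    #include <vector>

    using Embeddings   = std::vector<std::vector<float>>;  // num_embeddings x embedding_dim
    using RowGradients = std::unordered_map<std::size_t, std::vector<float>>;  // row -> gradient

    // Update only the rows that were looked up; untouched rows are skipped,
    // which is the benefit of bypassing a dense-gradient optimizer.
    void sparse_sgd_step(Embeddings& embeddings, const RowGradients& grads, float lr) {
      for (const auto& [row, g] : grads) {
        auto& e = embeddings[row];
        for (std::size_t j = 0; j < e.size(); ++j) {
          e[j] -= lr * g[j];
        }
      }
    }
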
+ * + * @todo Sparse SGD with optimizer class + */ +template +class dist_embedding_layer : public data_type_layer { + static_assert(Layout == data_layout::DATA_PARALLEL, + "distributed embedding layer only supports data parallel layout"); + +public: + + dist_embedding_layer( + lbann_comm* comm, + size_t num_embeddings, + size_t embedding_dim, + bool sparse_sgd, + DataType learning_rate, + bool barrier_in_forward_prop); + + dist_embedding_layer(const dist_embedding_layer& other); + dist_embedding_layer& operator=(const dist_embedding_layer& other); + ~dist_embedding_layer(); + + dist_embedding_layer* copy() const override; + std::string get_type() const override; + data_layout get_data_layout() const override; + El::Device get_device_allocation() const override; + + description get_description() const override; + +protected: + + void setup_dims(DataReaderMetaData& dr_metadata) override; + void setup_data(size_t max_mini_batch_size) override; + + void fp_compute() override; + void bp_compute() override; + bool update_compute() override; + +public: + + /** Metadata for an embedding vector from a remote process. + * + * This should be treated as an internal implementation detail. It + * is only in public scope so it is available to CUDA kernels in an + * anonymous namespace. + */ + struct vector_metadata { + size_t source_rank{0}; + size_t source_index{0}; + size_t target_rank{0}; + size_t target_index{0}; + bool is_active{false}; + }; + +private: + + using LocalMat = El::Matrix; + + /** @brief Non-blocking barrier + * @todo Handle case with non-default CUDA stream. + * @todo Move to comm header. + */ + static void nb_barrier( + lbann_comm& comm, + const El::mpi::Comm& c, + Al::request& req); + + void attach_embeddings_to_shmem_buffer(); + void apply_sparse_sgd_step( + size_t num_gradients, + LocalMat& local_embeddings); + + /** SHMEM buffer for embedding vectors. + * + * If the embedding weights matrix is not already attached to a + * SHMEM buffer, then this layer allocates a SHMEM buffer and + * attaches it. In this case, the layer is responsible for managing + * the buffer. + */ + TensorDataType* m_embeddings_buffer{nullptr}; + /** Allocated size of @c m_embeddings_buffer. */ + size_t m_embeddings_buffer_size{0}; + + /** SHMEM buffer to communicate embedding vectors. */ + TensorDataType* m_workspace_buffer{nullptr}; + /** Allocated size of @c m_workspace_buffer. */ + size_t m_workspace_buffer_size{0}; + + /** SHMEM buffer to communicate metadata for embedding vectors. */ + vector_metadata* m_metadata_buffer{nullptr}; + /** Allocated size of @c m_metadata_buffer. */ + size_t m_metadata_buffer_size{0}; + + /** Request to synchronize non-blocking barriers. + * + * Careful synchronization is required to ensure the correctness of + * asynchronous, one-sided communication via SHMEM buffers. After + * any modification to a SHMEM buffer (local or remote), a + * non-blocking barrier is launched to signal that the local + * process has finished its work. Before the next access to the + * SHMEM buffer, the non-blocking barrier is synchronized to make + * sure that all remote processes have finished their work and that + * the buffers are safe to access. + */ + Al::request m_nb_barrier_request; + + /** Size of dictionary of embeddings. */ + size_t m_num_embeddings; + /** Size of embedding vectors. */ + size_t m_embedding_dim; + + /** Perform sparse SGD during backprop. + * + * Bypasses optimizer class. + */ + bool m_sparse_sgd; + /** SGD learning rate. 
*/ + DataType m_learning_rate; + + /** Perform a blocking barrier at the beginning of forward prop. + * + * This layer performs synchronization with non-blocking barriers + * to ensure the correctness of asynchronous communication. + * However, gradient checking changes the embedding values without + * performing any synchronization. The quickest fix is to do a + * blocking barrier at the beginning of forward prop to make sure + * that all the embeddings are ready to be accessed. + * + * @todo Think of a way to avoid this synchronization. + */ + bool m_barrier_in_forward_prop; + +}; + +// --------------------------------------------- +// Implementation +// --------------------------------------------- + +template +dist_embedding_layer::dist_embedding_layer( + lbann_comm* comm, + size_t num_embeddings, + size_t embedding_dim, + bool sparse_sgd, + DataType learning_rate, + bool barrier_in_forward_prop) + : data_type_layer(comm), + m_num_embeddings{num_embeddings}, + m_embedding_dim{embedding_dim}, + m_sparse_sgd{sparse_sgd}, + m_learning_rate{learning_rate}, + m_barrier_in_forward_prop{barrier_in_forward_prop} { + + // Learning rate is only used for sparse SGD + if (!m_sparse_sgd) { + m_learning_rate = -1.0; + } + +} + +template +dist_embedding_layer::dist_embedding_layer( + const dist_embedding_layer& other) + : data_type_layer(other) { + LBANN_ERROR("copy constructor is invalid for dist_embedding_layer"); +} + +template +dist_embedding_layer& dist_embedding_layer::operator=( + const dist_embedding_layer& other) { + LBANN_ERROR("copy assignment operator is invalid for dist_embedding_layer"); +} + +template +dist_embedding_layer* dist_embedding_layer::copy() const { + return new dist_embedding_layer(*this); +} + +template +std::string dist_embedding_layer::get_type() const { + return "distributed embedding"; +} + +template +data_layout dist_embedding_layer::get_data_layout() const { + return Layout; +} + +template +El::Device dist_embedding_layer::get_device_allocation() const { + return Device; +} + +template +description dist_embedding_layer::get_description() const { + auto desc = data_type_layer::get_description(); + desc.add("Num embeddings", m_num_embeddings); + desc.add("Embedding dim", m_embedding_dim); + desc.add("Using sparse SGD", m_sparse_sgd); + desc.add("SGD learning rate", m_learning_rate); + return desc; +} + +template +void dist_embedding_layer::setup_dims(DataReaderMetaData& dr_metadata) { + data_type_layer::setup_dims(dr_metadata); + auto dims = this->get_input_dims(); + dims.push_back(static_cast(m_embedding_dim)); + this->set_output_dims(dims); +} + +template +void dist_embedding_layer::setup_data(size_t max_mini_batch_size) { + data_type_layer::setup_data(max_mini_batch_size); + + // Synchronize non-blocking barrier + // Note: Make sure SHMEM buffers are safe to reset. + auto& comm = *this->get_comm(); + comm.wait(m_nb_barrier_request); + + // Construct default weights if needed + // Note: Randomly drawn from normal distribution with mean 0 and + // standard deviation 1. 
+ if (!this->has_weights()) { + auto w = make_unique>(&comm); + auto init = make_unique>(0,1); + auto opt = this->m_model->template create_optimizer(); + w->set_name(this->get_name() + "_weights"); + w->set_initializer(std::move(init)); + w->set_optimizer(std::move(opt)); + this->add_weights(w.get()); + this->m_model->add_weights(std::move(w)); + } + if (this->num_weights() != 1) { + LBANN_ERROR("attempted to setup ", + this->get_type()," layer \"",this->get_name(),"\" ", + "with an invalid number of weights ", + "(expected 1, found ",this->num_weights(),")"); + } + + // Configure embedding weights + auto& embeddings = this->get_weights(0); + { + auto dist = this->get_prev_activations().DistData(); + dist.colDist = El::STAR; + dist.rowDist = El::VC; + embeddings.set_dims( + {static_cast(m_embedding_dim)}, + {static_cast(m_num_embeddings)}); + embeddings.set_matrix_distribution(dist); + } + + // Destroy embedding optimizer and create dummy weights + // Note: This layer manually performs sparse SGD on embedding + // weights during backprop, so the embedding optimizer isn't needed. + // However, the layer must send gradients to some optimizer to + // prevent the model from optimizing the layer out of compute graph + // during backprop. We get around this by creating dummy weights + // with no entries. + if (m_sparse_sgd) { + embeddings.set_optimizer(nullptr); + auto w = make_unique>(&comm); + auto opt = make_unique>(0.); + w->set_name(this->get_name() + "_dummy_weights"); + w->set_optimizer(std::move(opt)); + w->set_dims(1); + w->set_matrix_distribution(embeddings.get_matrix_distribution()); + w->setup(); + this->add_weights(w.get()); + this->m_model->add_weights(std::move(w)); + } + + // Setup embedding weights + embeddings.setup(); + attach_embeddings_to_shmem_buffer(); + + // Non-blocking barrier + // Note: Embeddings have been initialized + nb_barrier(comm, comm.get_trainer_comm(), m_nb_barrier_request); + +} + +template +bool dist_embedding_layer::update_compute() { + + // Apply sparse SGD if needed + if (m_sparse_sgd) { + const size_t input_size = this->get_input_size(); + const size_t mini_batch_size = this->get_prev_activations().Width(); + using ValuesGetter = weights_details::SafeWeightsAccessor; + auto& embeddings = ValuesGetter::mutable_values(this->get_weights(0)); + auto& local_embeddings = dynamic_cast(embeddings.Matrix()); + apply_sparse_sgd_step(input_size * mini_batch_size, local_embeddings); + } + + // Non-blocking barrier + // Note: Embeddings are up-to-date. 
+ auto& comm = *this->get_comm(); + comm.wait(m_nb_barrier_request); + nb_barrier(comm, comm.get_trainer_comm(), m_nb_barrier_request); + + return true; +} + +template +void dist_embedding_layer::nb_barrier( + lbann_comm& comm, + const El::mpi::Comm& c, + Al::request& req) { + static El::Matrix buffer; + buffer.SetMemoryMode(0); // Don't use memory pool + buffer.Resize(1, 1); + comm.nb_allreduce(buffer, c, req); +} + +// --------------------------------------------- +// Explicit template instantiation +// --------------------------------------------- + +#ifdef LBANN_HAS_SHMEM +extern template class dist_embedding_layer< + float, data_layout::DATA_PARALLEL, El::Device::CPU>; +#endif // LBANN_HAS_SHMEM +#if defined(LBANN_HAS_GPU) && defined(LBANN_HAS_NVSHMEM) +extern template class dist_embedding_layer< + float, data_layout::DATA_PARALLEL, El::Device::GPU>; +#endif // defined(LBANN_HAS_GPU) && defined(LBANN_HAS_NVSHMEM) + +} // namespace lbann +#endif // defined(LBANN_HAS_SHMEM) || defined(LBANN_HAS_NVSHMEM) + +// --------------------------------------------- +// Builder function +// --------------------------------------------- + +namespace lbann +{ + +LBANN_DEFINE_LAYER_BUILDER(dist_embedding); + +} // namespace lbann + +#endif // LBANN_LAYERS_MISC_DIST_EMBEDDING_HPP_INCLUDED diff --git a/include/lbann/layers/misc/mini_batch_index.hpp b/include/lbann/layers/misc/mini_batch_index.hpp index 51538000dce..75f882ea1b9 100644 --- a/include/lbann/layers/misc/mini_batch_index.hpp +++ b/include/lbann/layers/misc/mini_batch_index.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_MISC_MINI_BATCH_INDEX_HPP_INCLUDED #define LBANN_LAYERS_MISC_MINI_BATCH_INDEX_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -37,11 +37,13 @@ namespace lbann { * mini-batch sample. Each sample in a model's mini-batch has a * unique index in [0, mini_batch_size). 
*/ -template -class mini_batch_index_layer : public Layer { +template +class mini_batch_index_layer : public data_type_layer { public: - mini_batch_index_layer(lbann_comm* comm) : Layer(comm) { + mini_batch_index_layer(lbann_comm* comm) : data_type_layer(comm) { this->m_expected_num_parent_layers = 0; } @@ -52,20 +54,21 @@ class mini_batch_index_layer : public Layer { protected: - void setup_dims() override { - Layer::setup_dims(); - set_output_dims({1}); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims({1}); } void fp_compute() override { + using CPUMatType = El::Matrix; // Get output matrix - auto& output = get_activations(); + auto& output = this->get_activations(); auto& local_output = output.Matrix(); const auto& local_width = local_output.Width(); // Create temporary matrix if output matrix is not on CPU - CPUMat local_output_v; + CPUMatType local_output_v; if (local_output.GetDevice() == El::Device::CPU) { El::View(local_output_v, local_output); } else { @@ -75,7 +78,7 @@ class mini_batch_index_layer : public Layer { // Populate matrix on CPU LBANN_OMP_PARALLEL_FOR for (El::Int col = 0; col < local_width; ++col) { - local_output_v(0, col) = DataType(output.GlobalCol(col)); + local_output_v(0, col) = El::To(output.GlobalCol(col)); } // Copy result from CPU if needed @@ -87,6 +90,15 @@ class mini_batch_index_layer : public Layer { }; +#ifndef LBANN_MINI_BATCH_INDEX_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class mini_batch_index_layer; \ + extern template class mini_batch_index_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_MINI_BATCH_INDEX_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_MISC_MINI_BATCH_INDEX_HPP_INCLUDED diff --git a/include/lbann/layers/misc/mini_batch_size.hpp b/include/lbann/layers/misc/mini_batch_size.hpp index 5a1445ef422..bd011a73ecd 100644 --- a/include/lbann/layers/misc/mini_batch_size.hpp +++ b/include/lbann/layers/misc/mini_batch_size.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_MISC_MINI_BATCH_SIZE_HPP_INCLUDED #define LBANN_LAYERS_MISC_MINI_BATCH_SIZE_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -36,11 +36,13 @@ namespace lbann { * Output tensor is a 1D tensor with a single entry containing the * model's current mini-batch size. 
*/ -template -class mini_batch_size_layer : public Layer { +template +class mini_batch_size_layer : public data_type_layer { public: - mini_batch_size_layer(lbann_comm* comm) : Layer(comm) { + mini_batch_size_layer(lbann_comm* comm) : data_type_layer(comm) { this->m_expected_num_parent_layers = 0; } @@ -51,18 +53,18 @@ class mini_batch_size_layer : public Layer { protected: - void setup_dims() override { - Layer::setup_dims(); - set_output_dims({1}); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims({1}); } void fp_setup_outputs(El::Int mini_batch_size) override { - Layer::fp_setup_outputs(mini_batch_size); + data_type_layer::fp_setup_outputs(mini_batch_size); m_mini_batch_size = mini_batch_size; } void fp_compute() override { - El::Fill(get_activations(), DataType(m_mini_batch_size)); + El::Fill(this->get_activations(), El::To(m_mini_batch_size)); } private: @@ -72,6 +74,15 @@ class mini_batch_size_layer : public Layer { }; +#ifndef LBANN_MINI_BATCH_SIZE_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class mini_batch_size_layer; \ + extern template class mini_batch_size_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_MINI_BATCH_SIZE_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_MISC_MINI_BATCH_SIZE_HPP_INCLUDED diff --git a/include/lbann/layers/misc/one_hot.hpp b/include/lbann/layers/misc/one_hot.hpp new file mode 100644 index 00000000000..a4b4ea0ecd4 --- /dev/null +++ b/include/lbann/layers/misc/one_hot.hpp @@ -0,0 +1,89 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_MISC_ONE_HOT_HPP_INCLUDED +#define LBANN_LAYERS_MISC_ONE_HOT_HPP_INCLUDED + +#include "lbann/layers/data_type_layer.hpp" + +namespace lbann { + +/** @brief Convert index to a one-hot vector + * + * Expects a scalar input tensor and outputs a 1-D output tensor with + * @c size entries. The input is interpreted as an index, and output + * entries are one if they correspond to that index and zero + * otherwise. If the input is outside @f$[0,\text{size})@f$, then the + * output is all zeros. 
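The mapping described above is simple enough to state directly. This is an illustrative standalone sketch of the per-sample behavior; the function name and types are hypothetical and not part of LBANN:

    #include <cstddef>
    #include <vector>

    std::vector<float> one_hot(long index, std::size_t size) {
      std::vector<float> out(size, 0.0f);
      if (index >= 0 && static_cast<std::size_t>(index) < size) {
        out[index] = 1.0f;  // entry matching the index is set to one
      }                     // an out-of-range index leaves the output all zeros
      return out;
    }
    // one_hot(3, 5) -> {0, 0, 0, 1, 0};  one_hot(7, 5) -> {0, 0, 0, 0, 0}
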
+ */ +template +class one_hot_layer : public data_type_layer { + static_assert(Layout == data_layout::DATA_PARALLEL, + "one-hot layer only supports data-parallel layout"); +public: + + one_hot_layer(lbann_comm* comm, size_t size) : data_type_layer(comm) { + this->set_output_dims({static_cast(size)}); + } + one_hot_layer* copy() const override { return new one_hot_layer(*this); } + std::string get_type() const override { return "one-hot"; } + data_layout get_data_layout() const override { return Layout; } + El::Device get_device_allocation() const override { return Device; } + +protected: + + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + + // Make sure input tensor is scalar + if (this->get_input_size() != 1) { + const auto input_dims = this->get_input_dims(); + std::ostringstream dim_ss; + for (size_t i = 0; i < input_dims.size(); ++i) { + dim_ss << (i > 0 ? "x" : "") << input_dims[i]; + } + LBANN_ERROR(get_type()," layer \"",this->get_name(),"\" ", + "received an input tensor with invalid dimensions ", + "(expected 1, got ",dim_ss.str(),")"); + } + + } + + void fp_compute() override; + +}; + +#ifndef LBANN_ONE_HOT_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class one_hot_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_ONE_HOT_LAYER_INSTANTIATE + +} // namespace lbann + +#endif // LBANN_LAYERS_MISC_ONE_HOT_HPP_INCLUDED diff --git a/include/lbann/layers/misc/variance.hpp b/include/lbann/layers/misc/variance.hpp index bc36581b73f..4006a161667 100644 --- a/include/lbann/layers/misc/variance.hpp +++ b/include/lbann/layers/misc/variance.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_MISC_VARIANCE_HPP_INCLUDED #define LBANN_LAYERS_MISC_VARIANCE_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -42,20 +42,29 @@ namespace lbann { * Scaling by @f$ 1/n @f$ instead of @f$ 1/(n-1) @f$ is a biased * estimator. */ -template -class variance_layer : public Layer { +template +class variance_layer : public data_type_layer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} + public: variance_layer(lbann_comm *comm, bool biased) - : Layer(comm), m_biased(biased) {} + : data_type_layer(comm), m_biased(biased) {} variance_layer(const variance_layer& other) - : Layer(other), + : data_type_layer(other), m_biased(other.m_biased), m_means(other.m_means ? other.m_means->Copy() : nullptr), m_workspace(other.m_workspace ? other.m_workspace->Copy() : nullptr) {} variance_layer& operator=(const variance_layer& other) { - Layer::operator=(other); + data_type_layer::operator=(other); m_biased = other.m_biased; m_means.reset(other.m_means ? other.m_means->Copy() : nullptr); m_workspace.reset(other.m_workspace ? 
@@ -69,7 +78,7 @@ class variance_layer : public Layer { El::Device get_device_allocation() const override { return Device; } description get_description() const override { - auto&& desc = Layer::get_description(); + auto desc = data_type_layer::get_description(); desc.add("Biased", m_biased); return desc; } @@ -77,21 +86,21 @@ class variance_layer : public Layer { protected: void setup_matrices(const El::Grid& grid) override { - Layer::setup_matrices(grid); - auto dist_data = get_prev_activations().DistData(); + data_type_layer::setup_matrices(grid); + auto dist_data = this->get_prev_activations().DistData(); dist_data.colDist = El::STAR; - m_means.reset(AbsDistMat::Instantiate(dist_data)); - m_workspace.reset(AbsDistMat::Instantiate(dist_data)); + m_means.reset(AbsDistMatrixType::Instantiate(dist_data)); + m_workspace.reset(AbsDistMatrixType::Instantiate(dist_data)); } - void setup_dims() override { - Layer::setup_dims(); - set_output_dims({1}); - if (get_input_size() <= 1) { + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims({1}); + if (this->get_input_size() <= 1) { std::stringstream err; - const auto& parents = get_parent_layers(); - const auto& dims = get_input_dims(); - err << get_type() << " layer \"" << get_name() << "\" " + const auto& parents = this->get_parent_layers(); + const auto& dims = this->get_input_dims(); + err << get_type() << " layer \"" << this->get_name() << "\" " << "expects an input tensor with at least two entries, " << "but parent layer \"" << parents[0]->get_name() << "\" " << "outputs a tensor with dimensions "; @@ -111,12 +120,21 @@ class variance_layer : public Layer { bool m_biased; /** Means for each mini-batch sample. */ - std::unique_ptr m_means; + std::unique_ptr m_means; /** Workspace. 
*/ - std::unique_ptr m_workspace; + std::unique_ptr m_workspace; }; +#ifndef LBANN_VARIANCE_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class variance_layer; \ + extern template class variance_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_VARIANCE_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_MISC_VARIANCE_HPP_INCLUDED diff --git a/include/lbann/layers/regularizers/CMakeLists.txt b/include/lbann/layers/regularizers/CMakeLists.txt index cd27df13645..15384770bc0 100644 --- a/include/lbann/layers/regularizers/CMakeLists.txt +++ b/include/lbann/layers/regularizers/CMakeLists.txt @@ -2,6 +2,8 @@ set_full_path(THIS_DIR_HEADERS batch_normalization.hpp dropout.hpp + entrywise_batch_normalization.hpp + layer_norm.hpp local_response_normalization.hpp regularizer.hpp selu_dropout.hpp diff --git a/include/lbann/layers/regularizers/batch_normalization.hpp b/include/lbann/layers/regularizers/batch_normalization.hpp index 2f896fcf081..4c5a3013eed 100644 --- a/include/lbann/layers/regularizers/batch_normalization.hpp +++ b/include/lbann/layers/regularizers/batch_normalization.hpp @@ -29,6 +29,7 @@ #include "lbann/layers/regularizers/regularizer.hpp" #include "lbann/models/model.hpp" +#include "lbann/utils/distconv.hpp" namespace lbann { @@ -41,6 +42,36 @@ enum class batch_normalization_stats_aggregation { global }; +#ifdef LBANN_HAS_DISTCONV +template +class batch_normalization_distconv_adapter: public data_type_distconv_adapter { + public: + using TensorDevType = typename data_type_distconv_adapter::TensorDevType; + batch_normalization_distconv_adapter(Layer& layer): + data_type_distconv_adapter(layer) {} + virtual ~batch_normalization_distconv_adapter() = default; + void setup_fp_tensors() override; + void setup_bp_tensors() override; + dc::Shape get_per_channel_stat_shape() const; + dc::Dist get_per_channel_stat_dist(const dc::Dist &input_dist) const; + void setup_layer(size_t workspace_capacity) override; + void fp_compute(); + void bp_compute(); + + TensorDevType m_mean; + TensorDevType m_var; + TensorDevType m_scale; + TensorDevType m_bias; + TensorDevType m_running_mean; + TensorDevType m_running_var; + TensorDevType m_mean_gradient; + TensorDevType m_var_gradient; + TensorDevType m_scale_gradient; + TensorDevType m_bias_gradient; + std::unique_ptr> m_bn; +}; +#endif // LBANN_HAS_DISTCONV + /** @brief * * Each input channel is normalized across the mini-batch to have @@ -54,35 +85,65 @@ enum class batch_normalization_stats_aggregation { * Shift." In International Conference on Machine Learning, * pp. 448-456. 2015. */ -template -class batch_normalization_layer : public regularizer_layer { +template +class batch_normalization_layer : public regularizer_layer { + static_assert(T_layout == data_layout::DATA_PARALLEL, + "batch normalization only supports DATA_PARALLEL"); +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The concrete weights type used by this object. */ + using WeightsType = data_type_weights; + + /** @brief The concrete optimizer type used by this object. */ + using OptimizerType = data_type_optimizer; + + ///@} private: /** Decay rate for the running statistics. */ - DataType m_decay; + TensorDataType m_decay; /** Small number to avoid division by zero. */ - DataType m_epsilon; - /** Type of statistics aggregation to use. 
*/ - batch_normalization_stats_aggregation m_stats_aggregation; + TensorDataType m_epsilon; + /** @brief Size of group to aggregate statistics over. + * + * If this is 1, the group consists of one process and aggregation + * is local. If it is 0, statistics are aggregated globally. + */ + int m_statistics_group_size; /** * Cache of node-local num_per_sum results for node-local stats. * Indexed by effective mini-batch size. */ std::unordered_map m_num_per_sum_cache; - /** Current minibatch means. */ - std::unique_ptr m_mean; - /** Current minibatch standard deviations. */ - std::unique_ptr m_var; - /** Gradient w.r.t. means. */ - std::unique_ptr m_mean_gradient; - /** Gradient w.r.t. standard deviations. */ - std::unique_ptr m_var_gradient; + /** @brief Current minibatch means and standard deviations. + * + * These are fused for performance when doing non-local batchnorm. + */ + std::unique_ptr m_mean_and_var; + /** View of current mini-batch means. */ + std::unique_ptr m_mean_v; + /** View of current mini-batch standard deviations. */ + std::unique_ptr m_var_v; + /** @brief Gradients w.r.t. means and standard deviations. + * + * These are fused for performance when doing non-local batchnorm. + */ + std::unique_ptr m_mean_and_var_gradient; + /** View of gradient w.r.t. means. */ + std::unique_ptr m_mean_gradient_v; + /** View of gradient w.r.t. standard deviations. */ + std::unique_ptr m_var_gradient_v; /** Gradient w.r.t. scaling terms. */ - std::unique_ptr m_scale_gradient; + std::unique_ptr m_scale_gradient; /** Gradient w.r.t. bias terms. */ - std::unique_ptr m_bias_gradient; + std::unique_ptr m_bias_gradient; public: /** @brief Set up batch normalization. @@ -91,56 +152,64 @@ class batch_normalization_layer : public regularizer_layer { * @param decay Controls the momentum of the running mean/standard * deviation averages. * @param epsilon A small number to avoid division by zero. - * @param stats_aggregation The type of statistics to use when training. + * @param statistics_group_size Number of processors to aggregate + * statistics over. Defaults to 1 (i.e. local aggregation). */ batch_normalization_layer(lbann_comm *comm, - DataType decay=0.9, - DataType epsilon=1e-5, - batch_normalization_stats_aggregation stats_aggregation = - batch_normalization_stats_aggregation::local) - : regularizer_layer(comm), + TensorDataType decay=0.9, + TensorDataType epsilon=1e-5, + int statistics_group_size=1) + : regularizer_layer(comm), m_decay(decay), m_epsilon(epsilon), - m_stats_aggregation(stats_aggregation) { - static_assert(T_layout == data_layout::DATA_PARALLEL, - "batch normalization only supports DATA_PARALLEL"); + m_statistics_group_size(statistics_group_size) { #ifdef LBANN_DETERMINISTIC // Force global computation. - m_stats_aggregation = batch_normalization_stats_aggregation::global; + m_statistics_group_size = 0; #endif } batch_normalization_layer(const batch_normalization_layer& other) - : regularizer_layer(other), + : regularizer_layer(other), m_decay(other.m_decay), m_epsilon(other.m_epsilon), - m_stats_aggregation(other.m_stats_aggregation), + m_statistics_group_size(other.m_statistics_group_size), m_num_per_sum_cache(other.m_num_per_sum_cache), - m_mean(other.m_mean ? other.m_mean->Copy() : nullptr), - m_var(other.m_var ? other.m_var->Copy() : nullptr), - m_mean_gradient(other.m_mean_gradient ? - other.m_mean_gradient->Copy() : nullptr), - m_var_gradient(other.m_var_gradient ? - other.m_var_gradient->Copy() : nullptr), + m_mean_and_var(other.m_mean_and_var ? 
+ other.m_mean_and_var->Copy() : nullptr), + m_mean_v(other.m_mean_v ? other.m_mean_v->Copy() : nullptr), + m_var_v(other.m_var_v ? other.m_var_v->Copy() : nullptr), + m_mean_and_var_gradient(other.m_mean_and_var_gradient ? + other.m_mean_and_var_gradient->Copy() : nullptr), + m_mean_gradient_v(other.m_mean_gradient_v ? + other.m_mean_gradient_v->Copy() : nullptr), + m_var_gradient_v(other.m_var_gradient_v ? + other.m_var_gradient_v->Copy() : nullptr), m_scale_gradient(other.m_scale_gradient ? other.m_scale_gradient->Copy() : nullptr), m_bias_gradient(other.m_bias_gradient ? other.m_bias_gradient->Copy() : nullptr) {} batch_normalization_layer& operator=(const batch_normalization_layer& other) { - regularizer_layer::operator=(other); + regularizer_layer::operator=(other); m_decay = other.m_decay; m_epsilon = other.m_epsilon; - m_stats_aggregation = other.m_stats_aggregation; + m_statistics_group_size = other.m_statistics_group_size; m_num_per_sum_cache = other.m_num_per_sum_cache; // Deep copy matrices - m_mean.reset(other.m_mean ? other.m_mean->Copy() : nullptr); - m_var.reset(other.m_var ? other.m_var->Copy() : nullptr); - m_mean_gradient.reset(other.m_mean_gradient ? - other.m_mean_gradient->Copy() : nullptr); - m_var_gradient.reset(other.m_var_gradient ? - other.m_var_gradient->Copy() : nullptr); + m_mean_and_var.reset(other.m_mean_and_var ? + other.m_mean_and_var->Copy() : nullptr); + m_mean_v.reset(other.m_mean_v ? + other.m_mean_v->Copy() : nullptr); + m_var_v.reset(other.m_var_v ? + other.m_var_v->Copy() : nullptr); + m_mean_and_var_gradient.reset(other.m_mean_and_var_gradient ? + other.m_mean_and_var_gradient->Copy() : nullptr); + m_mean_gradient_v.reset(other.m_mean_gradient_v ? + other.m_mean_gradient_v->Copy() : nullptr); + m_var_gradient_v.reset(other.m_var_gradient_v ? + other.m_var_gradient_v->Copy() : nullptr); m_scale_gradient.reset(other.m_scale_gradient ? other.m_scale_gradient->Copy() : nullptr); m_bias_gradient.reset(other.m_bias_gradient ? 
@@ -155,159 +224,154 @@ class batch_normalization_layer : public regularizer_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = regularizer_layer::get_description(); + auto desc = regularizer_layer::get_description(); desc.add("Decay", m_decay); desc.add("Epsilon", m_epsilon); - switch (m_stats_aggregation) { - case batch_normalization_stats_aggregation::local: - desc.add("Statistics aggregation", "local"); - break; - case batch_normalization_stats_aggregation::node_local: - desc.add("Statistics aggregation", "node-local"); - break; - case batch_normalization_stats_aggregation::global: - desc.add("Statistics aggregation", "global"); - break; - } + desc.add("Statistics group size", m_statistics_group_size); return desc; } protected: void setup_matrices(const El::Grid& grid) override { - regularizer_layer::setup_matrices(grid); - m_mean.reset(new StarMat(grid)); - m_var.reset(new StarMat(grid)); - m_mean_gradient.reset(new StarMat(grid)); - m_var_gradient.reset(new StarMat(grid)); - m_scale_gradient.reset(new StarMat(grid)); - m_bias_gradient.reset(new StarMat(grid)); + regularizer_layer::setup_matrices(grid); + m_mean_and_var.reset(new StarMatDT(grid)); + m_mean_v.reset(new StarMatDT(grid)); + m_var_v.reset(new StarMatDT(grid)); + m_mean_and_var_gradient.reset(new StarMatDT(grid)); + m_mean_gradient_v.reset(new StarMatDT(grid)); + m_var_gradient_v.reset(new StarMatDT(grid)); + m_scale_gradient.reset(new StarMatDT(grid)); + m_bias_gradient.reset(new StarMatDT(grid)); } - void setup_dims() override { - regularizer_layer::setup_dims(); - set_output_dims(get_input_dims()); + void setup_dims(DataReaderMetaData& dr_metadata) override { + regularizer_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); } - void setup_data() override { - regularizer_layer::setup_data(); - const auto& output_dims = get_output_dims(); + void setup_data(size_t max_mini_batch_size) override { + regularizer_layer::setup_data(max_mini_batch_size); + const auto& output_dims = this->get_output_dims(); const auto& num_channels = output_dims[0]; // Display warning if mini-batch size is small - const auto& output = get_activations(); + const auto& output = this->get_activations(); const auto& mini_batch_size = output.Width(); const auto& local_mini_batch_size = mini_batch_size / output.DistSize(); - if (m_stats_aggregation == batch_normalization_stats_aggregation::global - && mini_batch_size <= 4) { - std::stringstream err; - err << "LBANN warning: " - << get_type() << " layer \"" << get_name() << "\" " - << "is using global statistics and " - << "the mini-batch size (" << mini_batch_size << ") " - << "may be too small to get good statistics"; + if (m_statistics_group_size == 0 && mini_batch_size <= 4) { if (output.DistRank() == 0) { + std::stringstream err; + err << "LBANN warning: " + << get_type() << " layer \"" << this->get_name() << "\" " + << "is using global statistics and " + << "the mini-batch size (" << mini_batch_size << ") " + << "may be too small to get good statistics"; std::cerr << err.str() << std::endl; } - } else if (m_stats_aggregation == batch_normalization_stats_aggregation::node_local - && local_mini_batch_size*m_comm->get_procs_per_node() <= 4) { - std::stringstream err; - err << "LBANN warning: " - << get_type() << " layer \"" << get_name() << "\" " - << "is using node-local statistics and " - << "the node-local mini-batch size (" - << (local_mini_batch_size*m_comm->get_procs_per_node()) << 
") " - << "may be too small to get good statistics"; + } else if (m_statistics_group_size != 0 && + m_statistics_group_size*local_mini_batch_size <= 4) { + // This possibly underestimates the aggregation size for processors with + // smaller local mini-batch sizes. if (output.DistRank() == 0) { - std::cerr << err.str() << std::endl; - } - } else if (m_stats_aggregation == batch_normalization_stats_aggregation::local - && local_mini_batch_size <= 4) { - std::stringstream err; + std::stringstream err; err << "LBANN warning: " - << get_type() << " layer \"" << get_name() << "\" " - << "is using local statistics and " - << "the local mini-batch size (" << local_mini_batch_size << ") " + << get_type() << " layer \"" << this->get_name() << "\" " + << "is aggregating statistics over " + << m_statistics_group_size + << "processors and the aggregated mini-batch size (" + << (m_statistics_group_size*local_mini_batch_size) << ") " << "may be too small to get good statistics"; - if (output.DistRank() == 0) { std::cerr << err.str() << std::endl; } } // Initialize default weights if none are provided - if (this->m_weights.size() > 4) { + if (this->num_weights() > 4) { std::stringstream err; - err << "attempted to setup layer \"" << m_name << "\" " + err << "attempted to setup layer \"" << this->m_name << "\" " << "with an invalid number of weights"; LBANN_ERROR(err.str()); } - this->m_weights.resize(4, nullptr); - if (this->m_weights[0] == nullptr) { - this->m_weights[0] = new weights(get_comm()); - std::unique_ptr init(new constant_initializer(DataType(1))); - std::unique_ptr opt(m_model->create_optimizer()); - this->m_weights[0]->set_name(get_name() + "_scale"); - this->m_weights[0]->set_initializer(init); - this->m_weights[0]->set_optimizer(opt); - this->m_model->add_weights(this->m_weights[0]); + this->set_num_weights(4); + if (!this->has_weights(0)) { + auto w = make_unique(this->get_comm()); + auto init = make_unique>(El::TypeTraits::One()); + auto opt = this->m_model->template create_optimizer(); + w->set_name(this->get_name() + "_scale"); + w->set_initializer(std::move(init)); + w->set_optimizer(std::move(opt)); + this->set_weights(0, w.get()); + this->m_model->add_weights(std::move(w)); } - if (this->m_weights[1] == nullptr) { - this->m_weights[1] = new weights(get_comm()); - std::unique_ptr init(new constant_initializer(DataType(0))); - std::unique_ptr opt(m_model->create_optimizer()); - this->m_weights[1]->set_name(get_name() + "_bias"); - this->m_weights[1]->set_initializer(init); - this->m_weights[1]->set_optimizer(opt); - this->m_model->add_weights(this->m_weights[1]); + if (!this->has_weights(1)) { + auto w = make_unique(this->get_comm()); + auto init = make_unique>(El::TypeTraits::Zero()); + auto opt = this->m_model->template create_optimizer(); + w->set_name(this->get_name() + "_bias"); + w->set_initializer(std::move(init)); + w->set_optimizer(std::move(opt)); + this->set_weights(1, w.get()); + this->m_model->add_weights(std::move(w)); } - if (this->m_weights[2] == nullptr) { - this->m_weights[2] = new weights(get_comm()); - this->m_weights[2]->set_name(get_name() + "_running_mean"); - std::unique_ptr init(new constant_initializer(DataType(0))); - this->m_weights[2]->set_initializer(init); - this->m_model->add_weights(this->m_weights[2]); + if (!this->has_weights(2)) { + auto w = make_unique(this->get_comm()); + auto init = make_unique>(El::TypeTraits::Zero()); + w->set_name(this->get_name() + "_running_mean"); + w->set_initializer(std::move(init)); + this->set_weights(2, w.get()); + 
this->m_model->add_weights(std::move(w)); } - if (this->m_weights[3] == nullptr) { - this->m_weights[3] = new weights(get_comm()); - this->m_weights[3]->set_name(get_name() + "_running_variance"); - std::unique_ptr init(new constant_initializer(DataType(1))); - this->m_weights[3]->set_initializer(init); - this->m_model->add_weights(this->m_weights[3]); + if (!this->has_weights(3)) { + auto w = make_unique(this->get_comm()); + auto init = make_unique>(El::TypeTraits::One()); + w->set_name(this->get_name() + "_running_variance"); + w->set_initializer(std::move(init)); + this->set_weights(3, w.get()); + this->m_model->add_weights(std::move(w)); } // Setup weights - auto dist = get_prev_activations().DistData(); + auto dist = this->get_prev_activations().DistData(); dist.colDist = El::STAR; dist.rowDist = El::STAR; - for (auto* w : this->m_weights) { - w->set_dims(num_channels); - w->set_matrix_distribution(dist); + size_t const num_weights = this->num_weights(); + for (size_t ii = 0; ii < num_weights; ++ii) { + auto& w = this->get_weights(ii); + w.set_dims(num_channels); + w.set_matrix_distribution(dist); } // Initialize matrices - El::Zeros(*m_mean, num_channels, 1); - El::Zeros(*m_var, num_channels, 1); - El::Zeros(*m_mean_gradient, num_channels, 1); - El::Zeros(*m_var_gradient, num_channels, 1); + El::Zeros(*m_mean_and_var, num_channels, 2); + El::Zeros(*m_mean_and_var_gradient, num_channels, 2); El::Zeros(*m_scale_gradient, num_channels, 1); El::Zeros(*m_bias_gradient, num_channels, 1); + // Initialize views. + El::View(*m_mean_v, *m_mean_and_var, El::ALL, El::IR(0, 1)); + El::View(*m_var_v, *m_mean_and_var, El::ALL, El::IR(1, 2)); + El::View(*m_mean_gradient_v, *m_mean_and_var_gradient, + El::ALL, El::IR(0, 1)); + El::View(*m_var_gradient_v, *m_mean_and_var_gradient, + El::ALL, El::IR(1, 2)); + // Initialize freeze state - for (auto&& w : this->m_weights) { - if (m_frozen) { - w->freeze(); + for (size_t ii = 0; ii < num_weights; ++ii) { + auto& w = this->get_weights(ii); + if (this->m_frozen) { + w.freeze(); } else { - w->unfreeze(); + w.unfreeze(); } } - for (auto&& w : this->m_weights) { - if (w->is_frozen() != m_frozen) { - std::stringstream err; - err << (m_frozen ? "" : "un") << "frozen " - << "layer \"" << get_name() << "\" has " - << (w->is_frozen() ? "" : "un") << "frozen " - << "weights \"" << w->get_name() << "\""; - LBANN_ERROR(err.str()); + for (size_t ii = 0; ii < num_weights; ++ii) { + auto& w = this->get_weights(ii); + if (w.is_frozen() != this->m_frozen) { + LBANN_ERROR((this->m_frozen ? "" : "un"), "frozen layer " + "\"", this->get_name(), "\" has ", + (w.is_frozen() ? 
"" : "un"), "frozen weights " + "\"", w.get_name(), "\"");; } } @@ -316,8 +380,159 @@ class batch_normalization_layer : public regularizer_layer { void fp_compute() override; void bp_compute() override; +#ifdef LBANN_HAS_DISTCONV + friend class batch_normalization_distconv_adapter; + protected: + bool is_distconv_supported() const override { + return Dev == El::Device::GPU && T_layout == data_layout::DATA_PARALLEL; + } + void setup_distconv_adapter() override { + this->get_distconv_adapter_ptr() = make_unique< + batch_normalization_distconv_adapter>(*this); + } + batch_normalization_distconv_adapter& get_distconv_adapter() override; + const batch_normalization_distconv_adapter& get_distconv_adapter() const override; +#endif // LBANN_HAS_DISTCONV }; +#ifdef LBANN_HAS_DISTCONV +template +const batch_normalization_distconv_adapter& +batch_normalization_layer::get_distconv_adapter() const { + return dynamic_cast&>(data_type_layer::get_distconv_adapter()); +} + +template +batch_normalization_distconv_adapter& +batch_normalization_layer::get_distconv_adapter() { + return const_cast&>( + static_cast&>(*this).get_distconv_adapter()); +} + +template +dc::Shape batch_normalization_distconv_adapter:: +get_per_channel_stat_shape() const { + auto &l = dynamic_cast&>(this->layer()); + const int num_channels = this->get_activations_shape()[dc::get_channel_dim()]; + // Sanity check that the shared tensors have the correct shape + assert_ne(num_channels, 0); + assert_eq(l.m_mean_and_var->Matrix().Width() * + l.m_mean_and_var->Matrix().Height(), + num_channels * 2); + dc::Shape per_channel_stat_shape(dc::get_num_dims(l), 1); + per_channel_stat_shape[dc::get_channel_dim()] = num_channels; + return per_channel_stat_shape; +} + +template +dc::Dist batch_normalization_distconv_adapter:: +get_per_channel_stat_dist(const dc::Dist &input_dist) const { + auto shared_dist = dc::Dist::make_distribution( + input_dist.get_locale_shape()); + auto split_shape = input_dist.get_split_shape(); + // set all dimensions to be 1 except for the channel dimension + auto pc = split_shape[-2]; + // set all elements to 1 + split_shape = 1; + split_shape[-2] = pc; + shared_dist.set_split_shape(split_shape); + + return shared_dist; +} + +template +void batch_normalization_distconv_adapter:: +setup_fp_tensors() { + data_type_distconv_adapter::setup_fp_tensors(); + + auto &l = static_cast&>(this->layer()); + const auto &input_dist = this->get_prev_activations_dist(); + + const auto per_channel_stat_shape = get_per_channel_stat_shape(); + const auto shared_dist = get_per_channel_stat_dist(input_dist); + + const dc::LocaleMPI loc(dc::get_mpi_comm(), false); + + // mean + m_mean = TensorDevType(per_channel_stat_shape, loc, shared_dist); + assert0(dc::tensor::View(m_mean, l.m_mean_v->Buffer())); + // var + m_var = TensorDevType(per_channel_stat_shape, loc, shared_dist); + assert0(dc::tensor::View(m_var, l.m_var_v->Buffer())); + // scale: view to weights[0] + m_scale = TensorDevType(per_channel_stat_shape, loc, shared_dist); + // bias: view to weights[1] + m_bias = TensorDevType(per_channel_stat_shape, loc, shared_dist); + // running_mean: view to weights[2] + m_running_mean = TensorDevType(per_channel_stat_shape, loc, shared_dist); + // running_var: view to weights[3] + m_running_var = TensorDevType(per_channel_stat_shape, loc, shared_dist); +} + +template +void batch_normalization_distconv_adapter:: +setup_bp_tensors() { + data_type_distconv_adapter::setup_bp_tensors(); + + const auto &prev_error_signal_dist = 
this->get_prev_error_signals_dist(); + auto &l = static_cast&>(this->layer()); + + const auto per_channel_stat_shape = get_per_channel_stat_shape(); + const auto shared_dist = get_per_channel_stat_dist( + prev_error_signal_dist); + + const dc::LocaleMPI loc(dc::get_mpi_comm(), false); + + // scale_gradient + m_scale_gradient = TensorDevType(per_channel_stat_shape, loc, shared_dist); + assert0(dc::tensor::View( + m_scale_gradient, l.m_scale_gradient->Buffer())); + // bias_gradient + m_bias_gradient = TensorDevType(per_channel_stat_shape, loc, shared_dist); + assert0(dc::tensor::View( + m_bias_gradient, l.m_bias_gradient->Buffer())); + // mean_gradient + m_mean_gradient = TensorDevType(per_channel_stat_shape, loc, shared_dist); + assert0(dc::tensor::View( + m_mean_gradient, l.m_mean_gradient_v->Buffer())); + // var_gradient + m_var_gradient = TensorDevType(per_channel_stat_shape, loc, shared_dist); + assert0(dc::tensor::View( + m_var_gradient, l.m_var_gradient_v->Buffer())); +} + +template +void batch_normalization_distconv_adapter::setup_layer( + size_t workspace_capacity) { + auto &l = dynamic_cast&>(this->layer()); + bool global_stats; + if (l.m_statistics_group_size == 0) { + global_stats = true; + } else if (l.m_statistics_group_size == 1) { + global_stats = false; + } else { + LBANN_ERROR("statistics_group_size must be either 0 or 1 for now."); + } + + m_bn = make_unique>( + dc::get_backend(), dc::get_num_dims(l), + l.m_decay, l.m_epsilon, global_stats); +} +#endif // LBANN_HAS_DISTCONV + +#ifndef LBANN_BATCH_NORMALIZATION_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class batch_normalization_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_BATCH_NORMALIZATION_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_REGULARIZER_BATCH_NORMALIZATION_HPP_INCLUDED diff --git a/include/lbann/layers/regularizers/dropout.hpp b/include/lbann/layers/regularizers/dropout.hpp index d19f4be4125..35e72af956c 100644 --- a/include/lbann/layers/regularizers/dropout.hpp +++ b/include/lbann/layers/regularizers/dropout.hpp @@ -28,7 +28,9 @@ #define LBANN_LAYER_REGULARIZER_DROPOUT_HPP_INCLUDED #include "lbann/layers/regularizers/regularizer.hpp" +#include "lbann/models/model.hpp" #include "lbann/utils/cudnn.hpp" +#include "lbann/utils/random_number_generators.hpp" namespace lbann { @@ -43,13 +45,22 @@ namespace lbann { * prevent neural networks from overfitting." The Journal of Machine * Learning Research 15, no. 1 (2014): 1929-1958. */ -template -class dropout : public regularizer_layer { +template +class dropout : public regularizer_layer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} + public: /** Keep units with probabiliy keep_prob. 
*/ dropout(lbann_comm *comm, EvalType keep_prob = EvalType(0.5)) - : regularizer_layer(comm), + : regularizer_layer(comm), m_keep_prob(keep_prob) #ifdef LBANN_HAS_CUDNN , m_dropout_cudnn_desc(nullptr), @@ -58,7 +69,7 @@ class dropout : public regularizer_layer { { #if defined(LBANN_HAS_CUDNN) && defined(LBANN_DETERMINISTIC) /// @todo GPU implementation of dropout with sequential consistency - if (Dev == El::Device::GPU && get_comm()->am_trainer_master()) { + if (Dev == El::Device::GPU && this->get_comm()->am_trainer_master()) { std::cerr << "Warning: GPU dropout currently does not guarantee " << "sequential consistency" << std::endl; } @@ -66,7 +77,7 @@ class dropout : public regularizer_layer { } dropout(const dropout& other) - : regularizer_layer(other), + : regularizer_layer(other), m_keep_prob(other.m_keep_prob), m_mask(other.m_mask ? other.m_mask->Copy() : nullptr) #ifdef LBANN_HAS_CUDNN @@ -85,9 +96,9 @@ class dropout : public regularizer_layer { } dropout& operator=(const dropout& other) { - regularizer_layer::operator=(other); + regularizer_layer::operator=(other); m_keep_prob = other.m_keep_prob; - m_mask = other.m_mask ? other.m_mask->Copy() : nullptr; + m_mask = other.m_mask ? std::unique_ptr(other.m_mask->Copy()) : nullptr; #ifdef LBANN_HAS_CUDNN m_tensors_cudnn_desc = other.m_tensors_cudnn_desc; m_tensors_cudnn_desc.set_layer(this); @@ -117,25 +128,33 @@ class dropout : public regularizer_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = regularizer_layer::get_description(); + auto desc = regularizer_layer::get_description(); desc.add("Keep probability", m_keep_prob); return desc; } + /** @brief get prob for keep each unit. */ + EvalType get_keep_prob() const { + return m_keep_prob; + } + /** @brief set prob for keep each unit. 
*/ + void set_keep_prob(EvalType keep_prob) { + m_keep_prob = keep_prob; + } protected: - void setup_dims() override { - regularizer_layer::setup_dims(); - set_output_dims(get_input_dims()); + void setup_dims(DataReaderMetaData& dr_metadata) override { + regularizer_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); } void setup_matrices(const El::Grid& grid) override { - regularizer_layer::setup_matrices(grid); - m_mask = std::unique_ptr(get_activations().Copy()); + regularizer_layer::setup_matrices(grid); + m_mask = std::unique_ptr(this->get_activations().Copy()); } void setup_gpu() override { - regularizer_layer::setup_gpu(); + regularizer_layer::setup_gpu(); #ifndef LBANN_HAS_CUDNN LBANN_ERROR("cuDNN not detected"); #else @@ -147,7 +166,7 @@ class dropout : public regularizer_layer { } void fp_compute () override { - if (using_gpus()) { + if (this->using_gpus()) { fp_compute_gpu(); } else { fp_compute_cpu(); @@ -155,7 +174,7 @@ class dropout : public regularizer_layer { } void bp_compute () override { - if (using_gpus()) { + if (this->using_gpus()) { bp_compute_gpu(); } else { bp_compute_cpu(); @@ -167,31 +186,31 @@ class dropout : public regularizer_layer { void fp_compute_cpu() { // Matrices - const auto& input = get_prev_activations(); - auto& output = get_activations(); + const auto& input = this->get_prev_activations(); + auto& output = this->get_activations(); // Do nothing if dropout is disabled - const auto& mode = this->m_model->get_execution_mode(); + const auto& mode = this->m_model->get_execution_context().get_execution_mode(); if (mode != execution_mode::training || m_keep_prob < EvalType(0)) { El::Copy(input, output); return; } // Construct mask matrix - const DataType scale = 1 / m_keep_prob; + const TensorDataType scale = static_cast(1 / m_keep_prob); const auto& height = input.Height(); const auto& width = input.Width(); m_mask->Resize(height, width); #ifdef LBANN_DETERMINISTIC - bernoulli_fill_procdet(*m_mask, height, width, DataType(m_keep_prob)); + bernoulli_fill_procdet(*m_mask, height, width, TensorDataType(m_keep_prob)); El::Scale(scale, *m_mask); #else El::EntrywiseMap(*m_mask, - (std::function) - ([this,scale](const DataType& z)->DataType { + (std::function) + ([this,scale](const TensorDataType& z)->TensorDataType { auto& gen = get_fast_generator(); std::bernoulli_distribution dist(m_keep_prob); - return dist(gen) ? scale : DataType(0); + return dist(gen) ? scale : El::TypeTraits::Zero(); })); #endif // LBANN_DETERMINISTIC @@ -202,9 +221,9 @@ class dropout : public regularizer_layer { /** Adjust gradients for dropout in backprop. 
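Editor's note: the CPU forward pass above builds an inverted-dropout mask, so the scaling by 1/keep_prob happens at training time. A stand-alone sketch of that mask construction (plain standard-library code, not the LBANN generators):

#include <cstddef>
#include <iostream>
#include <random>
#include <vector>

// Kept entries are scaled by 1/keep_prob so the expectation of mask*x equals
// x, which is why no rescaling is needed at inference time.
int main() {
  const double keep_prob = 0.5;
  const double scale = 1.0 / keep_prob;
  std::mt19937 gen(20200101);                  // placeholder seed, not LBANN's RNG
  std::bernoulli_distribution dist(keep_prob);

  std::vector<double> x(8, 1.0), y(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    y[i] = dist(gen) ? scale * x[i] : 0.0;     // mask entry is scale or 0
  }
  for (double v : y) { std::cout << v << ' '; }  // roughly half 2s, half 0s
  std::cout << '\n';
  return 0;
}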
*/ void bp_compute_cpu() { - const auto& gradient_wrt_output = get_prev_error_signals(); - auto& gradient_wrt_input = get_error_signals(); - const auto& mode = this->m_model->get_execution_mode(); + const auto& gradient_wrt_output = this->get_prev_error_signals(); + auto& gradient_wrt_input = this->get_error_signals(); + const auto& mode = this->m_model->get_execution_context().get_execution_mode(); if (mode != execution_mode::training || m_keep_prob < EvalType(0)) { El::Copy(gradient_wrt_output, gradient_wrt_input); } else { @@ -218,13 +237,13 @@ class dropout : public regularizer_layer { #else // Matrices - const auto& input = get_prev_activations(); + const auto& input = this->get_prev_activations(); const auto& local_input = input.LockedMatrix(); - auto& output = get_activations(); + auto& output = this->get_activations(); auto& local_output = output.Matrix(); // Do nothing if dropout is disabled or there is no local data - const auto& mode = this->m_model->get_execution_mode(); + const auto& mode = this->m_model->get_execution_context().get_execution_mode(); if (mode != execution_mode::training || m_keep_prob < EvalType(0)) { El::Copy(input, output); return; @@ -236,7 +255,7 @@ class dropout : public regularizer_layer { auto&& output_desc = m_tensors_cudnn_desc.get_activations(); size_t size; CHECK_CUDNN(cudnnDropoutGetReserveSpaceSize(input_desc, &size)); - m_reserve_space.Resize((size + sizeof(DataType) - 1) / sizeof(DataType), 1); + m_reserve_space.Resize((size + sizeof(TensorDataType) - 1) / sizeof(TensorDataType), 1); // Apply dropout on the GPU CHECK_CUDNN(cudnnDropoutForward(cudnn::get_handle(), @@ -246,7 +265,7 @@ class dropout : public regularizer_layer { output_desc, local_output.Buffer(), m_reserve_space.Buffer(), - m_reserve_space.Height() * sizeof(DataType))); + m_reserve_space.Height() * sizeof(TensorDataType))); #endif // LBANN_HAS_CUDNN } @@ -257,13 +276,13 @@ class dropout : public regularizer_layer { #else // Matrices - const auto& gradient_wrt_output = get_prev_error_signals(); + const auto& gradient_wrt_output = this->get_prev_error_signals(); const auto& local_gradient_wrt_output = gradient_wrt_output.LockedMatrix(); - auto& gradient_wrt_input = get_error_signals(); + auto& gradient_wrt_input = this->get_error_signals(); auto& local_gradient_wrt_input = gradient_wrt_input.Matrix(); // Copy error signal if dropout is disabled - const auto& mode = this->m_model->get_execution_mode(); + const auto& mode = this->m_model->get_execution_context().get_execution_mode(); if (mode != execution_mode::training || m_keep_prob < EvalType(0)) { El::Copy(gradient_wrt_output, gradient_wrt_input); } else { @@ -276,7 +295,7 @@ class dropout : public regularizer_layer { m_tensors_cudnn_desc.get_error_signals(), local_gradient_wrt_input.Buffer(), m_reserve_space.Buffer(), - m_reserve_space.Height() * sizeof(DataType))); + m_reserve_space.Height() * sizeof(TensorDataType))); } } #endif // LBANN_HAS_CUDNN @@ -296,7 +315,7 @@ class dropout : public regularizer_layer { // Setup RNG state size_t size; CHECK_CUDNN(cudnnDropoutGetStatesSize(cudnn::get_handle(), &size)); - m_states.Resize((size + sizeof(DataType) - 1) / sizeof(DataType), 1); + m_states.Resize((size + sizeof(TensorDataType) - 1) / sizeof(TensorDataType), 1); // Setup dropout descriptor CHECK_CUDNN(cudnnCreateDropoutDescriptor(&m_dropout_cudnn_desc)); @@ -304,7 +323,7 @@ class dropout : public regularizer_layer { cudnn::get_handle(), float(1 - m_keep_prob), m_states.Buffer(), - m_states.Height() * sizeof(DataType), + 
m_states.Height() * sizeof(TensorDataType), get_generator()())); } @@ -313,21 +332,35 @@ class dropout : public regularizer_layer { /** Probability of keeping each unit. */ EvalType m_keep_prob; /** Current dropout mask (a scaled Bernoulli random matrix). */ - std::unique_ptr m_mask; + std::unique_ptr m_mask; #ifdef LBANN_HAS_CUDNN /** Dropout cuDNN descriptor. */ cudnnDropoutDescriptor_t m_dropout_cudnn_desc; /** Tensor cuDNN descriptors. */ - cudnn::entrywise_layer_tensor_manager m_tensors_cudnn_desc; + cudnn::entrywise_layer_tensor_manager m_tensors_cudnn_desc; /** RNG state for cuDNN dropout. */ - GPUMat m_states; + El::Matrix m_states; /** Work space for cuDNN dropout. */ - GPUMat m_reserve_space; + El::Matrix m_reserve_space; #endif // LBANN_HAS_CUDNN }; +template +using dropout_layer = dropout; + +LBANN_DEFINE_LAYER_BUILDER(dropout); + +#ifndef LBANN_DROPOUT_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class dropout; \ + extern template class dropout + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_DROPOUT_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_REGULARIZER_DROPOUT_HPP_INCLUDED diff --git a/include/lbann/layers/regularizers/entrywise_batch_normalization.hpp b/include/lbann/layers/regularizers/entrywise_batch_normalization.hpp new file mode 100644 index 00000000000..67ed575faf8 --- /dev/null +++ b/include/lbann/layers/regularizers/entrywise_batch_normalization.hpp @@ -0,0 +1,249 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_REGULARIZERS_ENTRYWISE_BATCH_NORMALIZATION_HPP_INCLUDED +#define LBANN_LAYERS_REGULARIZERS_ENTRYWISE_BATCH_NORMALIZATION_HPP_INCLUDED + +#include "lbann/layers/data_type_layer.hpp" +#include "lbann/models/model.hpp" +#include "lbann/utils/memory.hpp" + +namespace lbann { + +/** @brief + * + * Each input entry is normalized across the mini-batch to have zero + * mean and unit standard deviation. This uses the standard approach + * of maintaining the running mean and standard deviation (with + * exponential decay) for use at test time. See: + * + * Sergey Ioffe and Christian Szegedy. "Batch Normalization: + * Accelerating Deep Network Training by Reducing Internal Covariate + * Shift." In International Conference on Machine Learning, + * pp. 448-456. 2015. 
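Editor's note: a stand-alone sketch of the computation this comment describes, assuming the usual exponential-decay update (running = decay*running + (1-decay)*batch statistic) and the 0/1 initial values given to the running-mean/variance weights in this layer; names are illustrative, not LBANN API:

#include <cmath>
#include <iostream>
#include <vector>

// One input entry is normalized across the mini-batch, and its batch
// mean/variance are folded into the running statistics used at test time.
int main() {
  const double decay = 0.9, epsilon = 1e-5;
  std::vector<double> entry = {1.0, 2.0, 3.0, 4.0};   // one entry, 4 samples

  double mean = 0.0, sqmean = 0.0;
  for (double v : entry) { mean += v; sqmean += v * v; }
  mean /= entry.size();
  sqmean /= entry.size();
  const double var = sqmean - mean * mean;

  double running_mean = 0.0, running_var = 1.0;       // initial weights values
  running_mean = decay * running_mean + (1.0 - decay) * mean;
  running_var  = decay * running_var  + (1.0 - decay) * var;

  for (double& v : entry) { v = (v - mean) / std::sqrt(var + epsilon); }
  std::cout << running_mean << ' ' << running_var << '\n';   // 0.25 1.025
  return 0;
}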
+ */ +template +class entrywise_batch_normalization_layer : public data_type_layer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The concrete weights type used by this object. */ + using WeightsType = data_type_weights; + + ///@} + +public: + + entrywise_batch_normalization_layer(lbann_comm* comm, + TensorDataType decay=0.9, + TensorDataType epsilon=1e-5) + : data_type_layer(comm), m_decay(decay), m_epsilon(epsilon) {} + + entrywise_batch_normalization_layer(const entrywise_batch_normalization_layer& other) + : data_type_layer(other), + m_decay(other.m_decay), + m_epsilon(other.m_epsilon), + m_batch_statistics(other.m_batch_statistics ? + other.m_batch_statistics->Copy() : + nullptr), + m_batch_statistics_gradient(other.m_batch_statistics_gradient ? + other.m_batch_statistics_gradient->Copy() : + nullptr) {} + + entrywise_batch_normalization_layer& operator=(const entrywise_batch_normalization_layer& other) { + data_type_layer::operator=(other); + m_decay = other.m_decay; + m_epsilon = other.m_epsilon; + m_batch_statistics.reset(other.m_batch_statistics ? + other.m_batch_statistics->Copy() : + nullptr); + m_batch_statistics_gradient.reset(other.m_batch_statistics_gradient ? + other.m_batch_statistics_gradient->Copy() : + nullptr); + return *this; + } + + entrywise_batch_normalization_layer* copy() const override { return new entrywise_batch_normalization_layer(*this); } + std::string get_type() const override { return "entry-wise batch normalization"; } + data_layout get_data_layout() const override { return Layout; } + El::Device get_device_allocation() const override { return Device; } + + description get_description() const override { + auto desc = data_type_layer::get_description(); + desc.add("Decay", m_decay); + desc.add("Epsilon", m_epsilon); + return desc; + } + +protected: + + void setup_matrices(const El::Grid& grid) override { + data_type_layer::setup_matrices(grid); + auto dist = this->get_prev_activations().DistData(); + dist.rowDist = El::STAR; + m_batch_statistics.reset(AbsDistMatrixType::Instantiate(dist)); + m_batch_statistics_gradient.reset(AbsDistMatrixType::Instantiate(dist)); + } + + void setup_data(size_t max_mini_batch_size) override { + data_type_layer::setup_data(max_mini_batch_size); + + // Initialize output dimensions + this->set_output_dims(this->get_input_dims()); + const auto output_dims = this->get_output_dims(); + const auto output_size = this->get_output_size(); + + // Initialize default weights if none are provided + if (this->num_weights() > 2) { + std::stringstream err; + err << "attempted to setup layer \"" << this->get_name() << "\" " + << "with an invalid number of weights " + << "(found " << this->num_weights() << ", expected 2)"; + LBANN_ERROR(err.str()); + } + this->set_num_weights(2); + if (!this->has_weights(0)) { + auto w = make_unique(this->get_comm()); + auto init = make_unique>(El::TypeTraits::Zero()); + w->set_name(this->get_name() + "_running_mean"); + w->set_initializer(std::move(init)); + this->set_weights(0, w.get()); + this->m_model->add_weights(std::move(w)); + } + if (!this->has_weights(1)) { + auto w = make_unique(this->get_comm()); + auto init = make_unique>(El::TypeTraits::One()); + w->set_name(this->get_name() + "_running_variance"); + w->set_initializer(std::move(init)); + this->set_weights(1, w.get()); + this->m_model->add_weights(std::move(w)); + } + + // Setup weights + auto dist = 
this->get_prev_activations().DistData(); + dist.rowDist = El::STAR; + auto const num_weights = this->num_weights(); + for (size_t ii = 0; ii < num_weights; ++ii) { + auto& w = this->get_weights(ii); + w.set_dims(output_dims); + w.set_matrix_distribution(dist); + } + + // Initialize matrices + m_batch_statistics->AlignWith(dist); + m_batch_statistics->Resize(output_size, 2); + m_batch_statistics_gradient->AlignWith(dist); + m_batch_statistics_gradient->Resize(output_size, 2); + + } + + void fp_setup_outputs(El::Int mini_batch_size) override { + data_type_layer::fp_setup_outputs(mini_batch_size); + const auto& input = this->get_prev_activations(); + const auto input_size = this->get_input_size(); + + // Make sure batch statistics tensor is aligned with input tensor + m_batch_statistics->Empty(false); + m_batch_statistics->AlignWith(input); + m_batch_statistics->Resize(input_size, 2); + +#if 0 /// @todo See https://github.com/LLNL/lbann/issues/1123 + + // Check that weights tensors is aligned with input tensor + /// @todo Realign tensors if misaligned + bool aligned = true; + try { + const auto& running_mean = weights_values(0); + const auto& running_var = weights_values(1); + aligned = (input.ColAlign() == running_mean.ColAlign() + && input.RowAlign() == running_mean.RowAlign() + && input.ColAlign() == running_var.ColAlign() + && input.RowAlign() == running_var.RowAlign()); + } + catch (const exception& e) { + // An exception is thrown if you try accessing weights values + // before they are initialized. We don't care if this case is + // aligned, so it's safe to ignore. + } + if (!aligned) { + std::ostringstream err; + err << this->get_type() << " layer \"" << this->get_name() << "\" " + << "has misaligned input and weights matrices"; + LBANN_ERROR(err.str()); + } + +#endif // 0 + + } + + void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) override { + data_type_layer::bp_setup_gradient_wrt_inputs(mini_batch_size); + m_batch_statistics_gradient->Empty(false); + m_batch_statistics_gradient->AlignWith(this->get_prev_activations()); + m_batch_statistics_gradient->Resize(this->get_input_size(), 2); + } + + void fp_compute() override; + void bp_compute() override; + +private: + + /** Decay rate for the running statistics. */ + TensorDataType m_decay; + /** Small number to avoid division by zero. */ + TensorDataType m_epsilon; + + /** @brief Current mini-batch statistics. + * + * These are fused for performance when doing non-local batchnorm. + */ + std::unique_ptr m_batch_statistics; + /** @brief Gradients w.r.t. current mini-batch statistics. + * + * These are fused for performance when doing non-local batchnorm. 
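Editor's note: "fused" here means the means and variances share one (size x 2) matrix, so non-local batchnorm can aggregate both with a single reduction instead of two. A stand-alone illustration of that layout using plain column-major storage rather than the Elemental matrices used by the layer:

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  const std::size_t size = 3;                 // number of entries/channels
  std::vector<double> stats(size * 2, 0.0);   // column 0: means, column 1: variances
  double* means = stats.data();               // "view" of column 0
  double* vars  = stats.data() + size;        // "view" of column 1

  means[0] = 0.5;
  vars[0]  = 1.25;
  // One contiguous buffer: a single reduction over stats.data() of length
  // size*2 covers both statistics at once.
  std::cout << stats[0] << ' ' << stats[size] << '\n';   // prints "0.5 1.25"
  return 0;
}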
+ */ + std::unique_ptr m_batch_statistics_gradient; + +}; + +#ifndef LBANN_ENTRYWISE_BATCH_NORMALIZATION_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class entrywise_batch_normalization_layer< \ + T, data_layout::DATA_PARALLEL, Device>; \ + extern template class entrywise_batch_normalization_layer< \ + T, data_layout::MODEL_PARALLEL, Device> + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_ENTRYWISE_BATCH_NORMALIZATION_LAYER_INSTANTIATE + +} // namespace lbann + +#endif // LBANN_LAYERS_REGULARIZERS_ENTRYWISE_BATCH_NORMALIZATION_HPP_INCLUDED diff --git a/include/lbann/layers/regularizers/instance_norm.hpp b/include/lbann/layers/regularizers/instance_norm.hpp new file mode 100644 index 00000000000..f9f3c7f41fd --- /dev/null +++ b/include/lbann/layers/regularizers/instance_norm.hpp @@ -0,0 +1,148 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_REGULARIZERS_INSTANCE_NORM_HPP_INCLUDED +#define LBANN_LAYERS_REGULARIZERS_INSTANCE_NORM_HPP_INCLUDED + +#include "lbann/layers/data_type_layer.hpp" + +namespace lbann { + +/** @brief + * + * Each channel within a data sample is normalized to have zero mean + * and unit standard deviation. See: + * + * Dmitry Ulyanov, Andrea Vedaldi, and Victor Lempitsky. "Instance + * normalization: The missing ingredient for fast stylization." arXiv + * preprint arXiv:1607.08022 (2016). + * + * This is equivalent to applying layer normalization independently + * to each channel. Note that this layer does not apply a + * channel-wise scale and bias. Use the channel-wise scale/bias layer + * to reproduce that functionality. 
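Editor's note: a stand-alone sketch of the instance-norm computation described in the header below; within one data sample, every channel is normalized with its own mean and variance, and no learned scale/bias is applied (that is left to a separate layer). The per-channel sums and sums of squares mirror what the layer's workspace holds:

#include <cmath>
#include <iostream>
#include <vector>

int main() {
  const double epsilon = 1e-5;
  std::vector<std::vector<double>> sample = {   // 2 channels x 4 positions
      {1.0, 2.0, 3.0, 4.0},
      {10.0, 10.0, 20.0, 20.0}};

  for (auto& channel : sample) {
    double sum = 0.0, sumsq = 0.0;              // per-channel accumulators
    for (double v : channel) { sum += v; sumsq += v * v; }
    const double mean = sum / channel.size();
    const double var = sumsq / channel.size() - mean * mean;
    for (double& v : channel) { v = (v - mean) / std::sqrt(var + epsilon); }
  }
  std::cout << sample[0][0] << ' ' << sample[1][3] << '\n';
  return 0;
}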
+ * + */ +template +class instance_norm_layer : public data_type_layer { + static_assert(Layout == data_layout::DATA_PARALLEL, + "instance norm layer only supports data parallel layout"); +public: + + /** + * @param comm LBANN communicator + * @param epsilon Small number to avoid division by zero + */ + instance_norm_layer(lbann_comm* comm, TensorDataType epsilon=1e-5); + + instance_norm_layer(const instance_norm_layer& other) = default; + instance_norm_layer& operator=(const instance_norm_layer& other) = default; + instance_norm_layer* copy() const override; + + std::string get_type() const override; + data_layout get_data_layout() const override; + El::Device get_device_allocation() const override; + description get_description() const override; + +protected: + + void setup_dims(DataReaderMetaData& dr_metadata) override; + + void fp_compute() override; + void bp_compute() override; + +private: + + /** Small number to avoid division by zero. */ + TensorDataType m_epsilon; + + /** Contains per-channel sums and sums of squares. */ + El::Matrix m_workspace; + +}; + +// Builder function +LBANN_DEFINE_LAYER_BUILDER(instance_norm); + +// ========================================================= +// Implementation +// ========================================================= + +template +instance_norm_layer::instance_norm_layer( + lbann_comm* comm, + TensorDataType epsilon) + : data_type_layer(comm), m_epsilon(epsilon) +{} + +template +instance_norm_layer* instance_norm_layer::copy() const { + return new instance_norm_layer(*this); +} + +template +std::string instance_norm_layer::get_type() const { + return "instance norm"; +} + +template +data_layout instance_norm_layer::get_data_layout() const { + return Layout; +} + +template +El::Device instance_norm_layer::get_device_allocation() const { + return Device; +} + +template +description instance_norm_layer::get_description() const { + auto desc = data_type_layer::get_description(); + desc.add("Epsilon", m_epsilon); + return desc; +} + +template +void instance_norm_layer::setup_dims(DataReaderMetaData& dr_metadata) { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); +} + +// ========================================================= +// Explicit template instantiation +// ========================================================= + +#ifndef LBANN_INSTANCE_NORM_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class instance_norm_layer< \ + T, data_layout::DATA_PARALLEL, Device>; +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_INSTANCE_NORM_LAYER_INSTANTIATE + +} // namespace lbann + +#endif // LBANN_LAYERS_REGULARIZERS_INSTANCE_NORM_HPP_INCLUDED diff --git a/include/lbann/layers/regularizers/layer_norm.hpp b/include/lbann/layers/regularizers/layer_norm.hpp new file mode 100644 index 00000000000..19421e91085 --- /dev/null +++ b/include/lbann/layers/regularizers/layer_norm.hpp @@ -0,0 +1,222 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. 
+// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_REGULARIZERS_LAYER_NORM_HPP_INCLUDED +#define LBANN_LAYERS_REGULARIZERS_LAYER_NORM_HPP_INCLUDED + +#include "lbann/layers/data_type_layer.hpp" + +#include + +namespace lbann { + +/** @brief + * + * Each data sample is normalized to have zero mean and unit standard + * deviation. See: + * + * Jimmy Lei Ba, Jamie Ryan Kiros, and Geoffrey E. Hinton. "Layer + * normalization." arXiv preprint arXiv:1607.06450 (2016). + * + * Note that this layer does not apply an entry-wise scale and bias + * like in the paper. Use the entry-wise scale/bias layer to + * reproduce that functionality. + * + */ +template +class layer_norm_layer : public data_type_layer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} + +public: + + /** + * @param comm LBANN communicator + * @param epsilon Small number to avoid division by zero + */ + layer_norm_layer(lbann_comm* comm, TensorDataType epsilon=1e-5); + + layer_norm_layer(const layer_norm_layer& other); + layer_norm_layer& operator=(const layer_norm_layer& other); + layer_norm_layer* copy() const override; + + std::string get_type() const override; + data_layout get_data_layout() const override; + El::Device get_device_allocation() const override; + description get_description() const override; + +protected: + + void setup_dims(DataReaderMetaData& dr_metadata) override; + void setup_matrices(const El::Grid& grid) override; + void fp_setup_outputs(El::Int mini_batch_size) override; + void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) override; + + void fp_compute() override; + void bp_compute() override; + +private: + + using AbsDistMatType = El::AbstractDistMatrix; + + /** Small number to avoid division by zero. */ + TensorDataType m_epsilon; + + /** @brief Per-sample statistics. + * + * The means and variances are fused for performance. + */ + std::unique_ptr m_statistics; + /** @brief Gradients w.r.t. per-sample statistics. + * + * The means and variances are fused for performance. + */ + std::unique_ptr m_statistics_gradient; + +}; + +// ========================================================= +// Implementation +// ========================================================= + +template +layer_norm_layer::layer_norm_layer( + lbann_comm* comm, + TensorDataType epsilon) + : data_type_layer(comm), m_epsilon(epsilon) +{} + +template +layer_norm_layer::layer_norm_layer( + const layer_norm_layer& other) + : data_type_layer(other), + m_epsilon(other.m_epsilon), + m_statistics(other.m_statistics + ? other.m_statistics->Copy() + : nullptr), + m_statistics_gradient(other.m_statistics_gradient + ? 
other.m_statistics_gradient->Copy() + : nullptr) +{} + +template +layer_norm_layer& layer_norm_layer::operator=( + const layer_norm_layer& other) { + data_type_layer::operator=(other); + m_epsilon = other.m_epsilon; + m_statistics.reset(other.m_statistics + ? other.m_statistics->Copy() + : nullptr); + m_statistics_gradient.reset(other.m_statistics_gradient + ? other.m_statistics_gradient->Copy() + : nullptr); + return *this; +} + +template +layer_norm_layer* layer_norm_layer::copy() const { + return new layer_norm_layer(*this); +} + +template +std::string layer_norm_layer::get_type() const { + return "layer norm"; +} + +template +data_layout layer_norm_layer::get_data_layout() const { + return Layout; +} + +template +El::Device layer_norm_layer::get_device_allocation() const { + return Device; +} + +template +description layer_norm_layer::get_description() const { + auto desc = data_type_layer::get_description(); + desc.add("Epsilon", m_epsilon); + return desc; +} + +template +void layer_norm_layer::setup_dims(DataReaderMetaData& dr_metadata) { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); +} + +template +void layer_norm_layer::setup_matrices(const El::Grid& grid) { + data_type_layer::setup_matrices(grid); + auto dist = this->get_prev_activations().DistData(); + dist.colDist = El::STAR; + m_statistics.reset(AbsDistMatrixType::Instantiate(dist)); + m_statistics_gradient.reset(AbsDistMatrixType::Instantiate(dist)); +} + +template +void layer_norm_layer::fp_setup_outputs(El::Int mini_batch_size) { + data_type_layer::fp_setup_outputs(mini_batch_size); + const auto& input = this->get_prev_activations(); + m_statistics->Empty(false); + m_statistics->AlignWith(input); + m_statistics->Resize(2, input.Width()); +} + +template +void layer_norm_layer::bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) { + data_type_layer::bp_setup_gradient_wrt_inputs(mini_batch_size); + const auto& input = this->get_prev_activations(); + m_statistics_gradient->Empty(false); + m_statistics_gradient->AlignWith(input); + m_statistics_gradient->Resize(2, input.Width()); +} + +// ========================================================= +// Explicit template instantiation +// ========================================================= + +#ifndef LBANN_LAYER_NORM_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class layer_norm_layer< \ + T, data_layout::DATA_PARALLEL, Device>; \ + extern template class layer_norm_layer< \ + T, data_layout::MODEL_PARALLEL, Device> + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_LAYER_NORM_LAYER_INSTANTIATE + +} // namespace lbann + +#endif // LBANN_LAYERS_REGULARIZERS_LAYER_NORM_HPP_INCLUDED diff --git a/include/lbann/layers/regularizers/local_response_normalization.hpp b/include/lbann/layers/regularizers/local_response_normalization.hpp index 23ff7051fab..77c077c5ce3 100644 --- a/include/lbann/layers/regularizers/local_response_normalization.hpp +++ b/include/lbann/layers/regularizers/local_response_normalization.hpp @@ -43,28 +43,35 @@ namespace lbann { * Advances in Neural Information Processing Systems, * pp. 1097-1105. 2012. 
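Editor's note: the LRN CPU path below evaluates the usual cross-channel response, y_c = x_c * (k + alpha * sum of x_j^2 over the channel window)^(-beta), and special-cases the default beta = 0.75 because sqrt(s * sqrt(s)) equals pow(s, 0.75) and avoids a pow() call. A stand-alone sketch of that computation for one spatial position:

#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

int main() {
  const double alpha = 1e-4, beta = 0.75, k = 2.0;
  const int window_width = 5;
  const std::vector<double> x = {0.5, 1.0, -2.0, 3.0, 0.25, -1.0};  // channels at one position
  std::vector<double> y(x.size());

  const int num_channels = static_cast<int>(x.size());
  for (int c = 0; c < num_channels; ++c) {
    const int first = std::max(c - window_width / 2, 0);
    const int last  = std::min(c + window_width / 2, num_channels - 1);
    double sumsq = 0.0;
    for (int j = first; j <= last; ++j) { sumsq += x[j] * x[j]; }
    const double scale = 1.0 / (k + alpha * sumsq);
    y[c] = x[c] * std::pow(scale, beta);
    // for the default beta = 0.75 this equals x[c] * std::sqrt(scale * std::sqrt(scale))
  }
  std::cout << y[0] << ' ' << y[2] << '\n';
  return 0;
}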
*/ -template -class local_response_normalization_layer : public regularizer_layer { +template +class local_response_normalization_layer : public regularizer_layer { +#ifdef LBANN_HAS_CUDNN + using ScalingType = cudnn::ScalingParamType; +#else + using ScalingType = TensorDataType; +#endif // LBANN_HAS_CUDNN + + static_assert(T_layout == data_layout::DATA_PARALLEL, + "local_response_normalization only supports DATA_PARALLEL"); public: local_response_normalization_layer(lbann_comm *comm, int window_width, - DataType alpha, - DataType beta, - DataType k) - : regularizer_layer(comm), + TensorDataType alpha, + TensorDataType beta, + TensorDataType k) + : regularizer_layer(comm), m_window_width(window_width), m_alpha(alpha), m_beta(beta), m_k(k) #ifdef LBANN_HAS_CUDNN , m_lrn_cudnn_desc(nullptr), m_tensors_cudnn_desc(this) #endif // LBANN_HAS_CUDNN - { - static_assert(T_layout == data_layout::DATA_PARALLEL, - "local_response_normalization only supports DATA_PARALLEL"); - } + { } local_response_normalization_layer(const local_response_normalization_layer& other) - : regularizer_layer(other), + : regularizer_layer(other), m_window_width(other.m_window_width), m_alpha(other.m_alpha), m_beta(other.m_beta), @@ -87,7 +94,7 @@ class local_response_normalization_layer : public regularizer_layer { } local_response_normalization_layer& operator=(const local_response_normalization_layer& other) { - regularizer_layer::operator=(other); + regularizer_layer::operator=(other); m_window_width = other.m_window_width; m_alpha = other.m_alpha; m_beta = other.m_beta; @@ -110,6 +117,7 @@ class local_response_normalization_layer : public regularizer_layer { m_tensors_cudnn_desc = other.m_tensors_cudnn_desc; m_tensors_cudnn_desc.set_layer(this); #endif // LBANN_HAS_CUDNN + return *this; } ~local_response_normalization_layer() override { @@ -128,7 +136,7 @@ class local_response_normalization_layer : public regularizer_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = regularizer_layer::get_description(); + auto desc = regularizer_layer::get_description(); desc.add("alpha", m_alpha); desc.add("beta", m_beta); desc.add("k", m_k); @@ -137,14 +145,14 @@ class local_response_normalization_layer : public regularizer_layer { protected: - void setup_dims() override { - regularizer_layer::setup_dims(); - set_output_dims(get_input_dims()); + void setup_dims(DataReaderMetaData& dr_metadata) override { + regularizer_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); } /// Initialize GPU objects void setup_gpu() override { - regularizer_layer::setup_gpu(); + regularizer_layer::setup_gpu(); #ifndef LBANN_HAS_CUDNN LBANN_ERROR("cuDNN not detected"); #else @@ -178,17 +186,17 @@ class local_response_normalization_layer : public regularizer_layer { /** Normalization window width. */ int m_window_width; /** LRN alpha scaling parameter. */ - DataType m_alpha; + TensorDataType m_alpha; /** LRN beta power parameter. */ - DataType m_beta; + TensorDataType m_beta; /** LRN k parameter. */ - DataType m_k; + TensorDataType m_k; #ifdef LBANN_HAS_CUDNN /** LRN cuDNN descriptor. */ cudnnLRNDescriptor_t m_lrn_cudnn_desc; /** Tensor cuDNN descriptors. 
*/ - cudnn::data_parallel_layer_tensor_manager m_tensors_cudnn_desc; + cudnn::data_parallel_layer_tensor_manager m_tensors_cudnn_desc; #endif // LBANN_HAS_CUDNN /// GPU implementation of forward propagation @@ -196,11 +204,11 @@ class local_response_normalization_layer : public regularizer_layer { #ifndef LBANN_HAS_CUDNN LBANN_ERROR("cuDNN not detected"); #else - const auto& local_input = get_local_prev_activations(); - auto& local_output = get_local_activations(); + const auto& local_input = this->get_local_prev_activations(); + auto& local_output = this->get_local_activations(); if (local_input.Height() > 0 && local_input.Width() > 0) { - const DataType zero = DataType(0); - const DataType one = DataType(1); + const ScalingType zero = El::TypeTraits::Zero(); + const ScalingType one = El::TypeTraits::One(); CHECK_CUDNN(cudnnLRNCrossChannelForward(cudnn::get_handle(), m_lrn_cudnn_desc, CUDNN_LRN_CROSS_CHANNEL_DIM1, @@ -219,13 +227,13 @@ class local_response_normalization_layer : public regularizer_layer { #ifndef LBANN_HAS_CUDNN LBANN_ERROR("cuDNN not detected"); #else - const auto& local_input = get_local_prev_activations(); - const auto& local_output = get_local_activations(); - const auto& local_gradient_wrt_output = get_local_prev_error_signals(); - auto& local_gradient_wrt_input = get_local_error_signals(); + const auto& local_input = this->get_local_prev_activations(); + const auto& local_output = this->get_local_activations(); + const auto& local_gradient_wrt_output = this->get_local_prev_error_signals(); + auto& local_gradient_wrt_input = this->get_local_error_signals(); if (local_input.Height() > 0 && local_input.Width() > 0) { - const DataType zero = DataType(0); - const DataType one = DataType(1); + const ScalingType zero = El::TypeTraits::Zero(); + const ScalingType one = El::TypeTraits::One(); CHECK_CUDNN(cudnnLRNCrossChannelBackward(cudnn::get_handle(), m_lrn_cudnn_desc, CUDNN_LRN_CROSS_CHANNEL_DIM1, @@ -247,23 +255,24 @@ class local_response_normalization_layer : public regularizer_layer { void fp_compute_cpu() { // Local matrices - const auto& local_input = get_local_prev_activations(); - auto& local_output = get_local_activations(); + const auto& local_input = this->get_local_prev_activations(); + auto& local_output = this->get_local_activations(); // Matrix parameters const int local_width = local_input.Width(); - const DataType* input_buffer = local_input.LockedBuffer(); + const TensorDataType* input_buffer = local_input.LockedBuffer(); const int input_ldim = local_input.LDim(); - DataType* output_buffer = local_output.Buffer(); + TensorDataType* output_buffer = local_output.Buffer(); const int output_ldim = local_output.LDim(); // Get LRN parameters - const auto& output_dims = get_output_dims(); + const auto& output_dims = this->get_output_dims(); const int num_channels = output_dims[0]; - const int num_per_channel = get_output_size() / num_channels; + const int num_per_channel = this->get_output_size() / num_channels; // Check if LRN is using default beta parameter - const bool default_beta = (std::fabs((m_beta - 0.75) / 0.75) + const bool default_beta = (std::fabs((m_beta - El::To(0.75)) + / El::To(0.75)) < 2 * std::numeric_limits::epsilon()); //////////////////////////////////////////////////////////////// @@ -282,7 +291,7 @@ class local_response_normalization_layer : public regularizer_layer { block_start += max_block_size) { const int block_size = std::min(max_block_size, num_per_channel - block_start); - DataType workspace[max_block_size]; + TensorDataType 
workspace[max_block_size]; // Iterate through channels for (int channel = 0; channel < num_channels; ++channel) { @@ -290,32 +299,33 @@ class local_response_normalization_layer : public regularizer_layer { const int window_end = std::min(channel + m_window_width / 2, num_channels - 1); // Compute sum of squares in workspace - std::fill(workspace, workspace + block_size, DataType(0)); + std::fill(workspace, workspace + block_size, El::TypeTraits::Zero()); for (int window_pos = window_start; window_pos <= window_end; ++window_pos) { for (int block_pos = 0; block_pos < block_size; ++block_pos) { const int index = block_start + block_pos + window_pos * num_per_channel; - const DataType input_entry = input_buffer[index + sample * input_ldim]; + const TensorDataType input_entry = input_buffer[index + sample * input_ldim]; workspace[block_pos] += input_entry * input_entry; } } // Compute 1 / (k + alpha * sum(x^2) ) in workspace for (int block_pos = 0; block_pos < block_size; ++block_pos) { - workspace[block_pos] = 1 / (m_k + m_alpha * workspace[block_pos]); + workspace[block_pos] = El::TypeTraits::One() + / (m_k + m_alpha * workspace[block_pos]); } // Compute output for (int block_pos = 0; block_pos < block_size; ++block_pos) { const int index = block_start + block_pos + channel * num_per_channel; - const DataType scale_factor = workspace[block_pos]; - const DataType input_entry = input_buffer[index + sample * input_ldim]; - DataType& output_entry = output_buffer[index + sample * output_ldim]; + const TensorDataType scale_factor = workspace[block_pos]; + const TensorDataType input_entry = input_buffer[index + sample * input_ldim]; + TensorDataType& output_entry = output_buffer[index + sample * output_ldim]; if (default_beta) { // Special case when beta = 0.75 output_entry = (input_entry - * std::sqrt(scale_factor * std::sqrt(scale_factor))); + * El::Sqrt(scale_factor * El::Sqrt(scale_factor))); } else { - output_entry = input_entry * std::pow(scale_factor, m_beta); + output_entry = input_entry * El::Pow(scale_factor, m_beta); } } @@ -330,30 +340,31 @@ class local_response_normalization_layer : public regularizer_layer { void bp_compute_cpu() { // Get local matrices - const auto& local_input = get_local_prev_activations(); - const auto& local_output = get_local_activations(); - const auto& local_gradient_wrt_output = get_local_prev_error_signals(); - auto& local_gradient_wrt_input = get_local_error_signals(); + const auto& local_input = this->get_local_prev_activations(); + const auto& local_output = this->get_local_activations(); + const auto& local_gradient_wrt_output = this->get_local_prev_error_signals(); + auto& local_gradient_wrt_input = this->get_local_error_signals(); // Get matrix buffers const int local_width = local_input.Width(); - const DataType* input_buffer = local_input.LockedBuffer(); + const TensorDataType* input_buffer = local_input.LockedBuffer(); const int input_ldim = local_input.LDim(); - const DataType* output_buffer = local_output.LockedBuffer(); + const TensorDataType* output_buffer = local_output.LockedBuffer(); const int output_ldim = local_output.LDim(); - const DataType* gradient_wrt_output_buffer = local_gradient_wrt_output.LockedBuffer(); + const TensorDataType* gradient_wrt_output_buffer = local_gradient_wrt_output.LockedBuffer(); const int gradient_wrt_output_ldim = local_gradient_wrt_output.LDim(); - DataType* gradient_wrt_input_buffer = local_gradient_wrt_input.Buffer(); + TensorDataType* gradient_wrt_input_buffer = local_gradient_wrt_input.Buffer(); const int 
gradient_wrt_input_ldim = local_gradient_wrt_input.LDim(); // Get LRN parameters - const auto& output_dims = get_output_dims(); + const auto& output_dims = this->get_output_dims(); const int num_channels = output_dims[0]; - const int num_per_channel = get_output_size() / num_channels; + const int num_per_channel = this->get_output_size() / num_channels; // Check if LRN is using default beta parameter - const bool default_beta = (std::fabs((m_beta - 0.75) / 0.75) - < 2 * std::numeric_limits::epsilon()); + const bool default_beta = (std::fabs((m_beta - El::To(0.75)) + / El::To(0.75)) + < El::To(2) * std::numeric_limits::epsilon()); //////////////////////////////////////////////////////////////// // error_signal(i) @@ -375,7 +386,7 @@ class local_response_normalization_layer : public regularizer_layer { block_start += max_block_size) { const int block_size = std::min(max_block_size, num_per_channel - block_start); - DataType workspace[max_block_size]; + TensorDataType workspace[max_block_size]; // Iterate through channels for (int channel = 0; channel < num_channels; ++channel) { @@ -383,45 +394,46 @@ class local_response_normalization_layer : public regularizer_layer { const int window_end = std::min(channel + m_window_width / 2, num_channels - 1); // Compute sum of squares in workspace - std::fill(workspace, workspace + block_size, DataType(0)); + std::fill(workspace, workspace + block_size, El::TypeTraits::Zero()); for (int window_pos = window_start; window_pos <= window_end; ++window_pos) { for (int block_pos = 0; block_pos < block_size; ++block_pos) { const int index = block_start + block_pos + window_pos * num_per_channel; - const DataType input_entry = input_buffer[index + sample * input_ldim]; + const TensorDataType input_entry = input_buffer[index + sample * input_ldim]; workspace[block_pos] += input_entry * input_entry; } } // Compute 1 / (k + alpha * sum(x^2) ) in workspace for (int block_pos = 0; block_pos < block_size; ++block_pos) { - workspace[block_pos] = 1 / (m_k + m_alpha * workspace[block_pos]); + workspace[block_pos] = El::TypeTraits::One() + / (m_k + m_alpha * workspace[block_pos]); } // Compute error signal contribution for current entry for (int block_pos = 0; block_pos < block_size; ++block_pos) { const int index = block_start + block_pos + channel * num_per_channel; - const DataType scale_factor = workspace[block_pos]; - const DataType gradient_wrt_output_entry + const TensorDataType scale_factor = workspace[block_pos]; + const TensorDataType gradient_wrt_output_entry = gradient_wrt_output_buffer[index + sample * gradient_wrt_output_ldim]; - DataType& gradient_wrt_input_entry + TensorDataType& gradient_wrt_input_entry = gradient_wrt_input_buffer[index + sample * gradient_wrt_input_ldim]; if (default_beta) { // Special case when beta = 0.75 gradient_wrt_input_entry - = gradient_wrt_output_entry * std::sqrt(scale_factor * std::sqrt(scale_factor)); + = gradient_wrt_output_entry * El::Sqrt(scale_factor * El::Sqrt(scale_factor)); } else { gradient_wrt_input_entry - = gradient_wrt_output_entry * std::pow(scale_factor, m_beta); + = gradient_wrt_output_entry * El::Pow(scale_factor, m_beta); } } // Compute y * dy / (k + alpha * sum(x^2) ) in workspace for (int block_pos = 0; block_pos < block_size; ++block_pos) { const int index = block_start + block_pos + channel * num_per_channel; - const DataType output_entry = output_buffer[index + sample * output_ldim]; - const DataType gradient_wrt_output_entry + const TensorDataType output_entry = output_buffer[index + sample * 
output_ldim]; + const TensorDataType gradient_wrt_output_entry = gradient_wrt_output_buffer[index + sample * gradient_wrt_output_ldim]; - workspace[block_pos] = (-2 * m_alpha * m_beta * workspace[block_pos] + workspace[block_pos] = (El::To(-2) * m_alpha * m_beta * workspace[block_pos] * output_entry * gradient_wrt_output_entry); } @@ -429,7 +441,7 @@ class local_response_normalization_layer : public regularizer_layer { for (int window_pos = window_start; window_pos <= window_end; ++window_pos) { for (int block_pos = 0; block_pos < block_size; ++block_pos) { const int index = block_start + block_pos + window_pos * num_per_channel; - const DataType input_entry = input_buffer[index + sample * input_ldim]; + const TensorDataType input_entry = input_buffer[index + sample * input_ldim]; gradient_wrt_input_buffer[index + sample * gradient_wrt_input_ldim] += workspace[block_pos] * input_entry; } @@ -444,6 +456,17 @@ class local_response_normalization_layer : public regularizer_layer { }; +LBANN_DEFINE_LAYER_BUILDER(local_response_normalization); + +#ifndef LBANN_LOCAL_RESPONSE_NORMALIZATION_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class local_response_normalization_layer< \ + T, data_layout::DATA_PARALLEL, Device> + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_LOCAL_RESPONSE_NORMALIZATION_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_LOCAL_RESPONSE_NORMALIZATION_HPP_INCLUDED diff --git a/include/lbann/layers/regularizers/regularizer.hpp b/include/lbann/layers/regularizers/regularizer.hpp index c01b892c820..51966d28258 100644 --- a/include/lbann/layers/regularizers/regularizer.hpp +++ b/include/lbann/layers/regularizers/regularizer.hpp @@ -26,16 +26,17 @@ #ifndef LBANN_LAYER_REGULARIZER_HPP_INCLUDED #define LBANN_LAYER_REGULARIZER_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { /** @todo Remove. Layers should inherit directly from the base layer * class. */ -class regularizer_layer : public Layer { +template +class regularizer_layer : public data_type_layer { public: - regularizer_layer(lbann_comm *comm) : Layer(comm) {} + regularizer_layer(lbann_comm *comm) : data_type_layer(comm) {} }; } // namespace lbann diff --git a/include/lbann/layers/regularizers/selu_dropout.hpp b/include/lbann/layers/regularizers/selu_dropout.hpp index a2b3d6475a3..f62e6e509c4 100644 --- a/include/lbann/layers/regularizers/selu_dropout.hpp +++ b/include/lbann/layers/regularizers/selu_dropout.hpp @@ -28,6 +28,7 @@ #define LBANN_LAYER_REGULARIZER_SELU_DROPOUT_HPP_INCLUDED #include "lbann/layers/regularizers/regularizer.hpp" +#include "lbann/models/model.hpp" namespace lbann { @@ -39,15 +40,27 @@ namespace lbann { * Hochreiter. "Self-normalizing neural networks." In Advances in * Neural Information Processing Systems, pp. 971-980. 2017. */ -template -class selu_dropout : public regularizer_layer { +template +class selu_dropout : public regularizer_layer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The tensor type expected in this object. */ + using CPUMatrixType = El::Matrix; + + ///@} + public: /** Keep units with probabiliy keep_prob. 
*/ selu_dropout(lbann_comm *comm, - float keep_prob=0.95f, - DataType alpha = DataType(1.6732632423543772848170429916717), - DataType scale = DataType(1.0507009873554804934193349852946)) : - regularizer_layer(comm), + TensorDataType keep_prob = TensorDataType(0.95f), + TensorDataType alpha = TensorDataType(1.6732632423543772848170429916717), + TensorDataType scale = TensorDataType(1.0507009873554804934193349852946)) : + regularizer_layer(comm), m_keep_prob(keep_prob), m_mask(nullptr) { #ifdef LBANN_DETERMINISTIC @@ -56,13 +69,13 @@ class selu_dropout : public regularizer_layer { // Compute alpha' and the affine transform. m_alpha_prime = -scale*alpha; m_a = keep_prob + - m_alpha_prime*m_alpha_prime*keep_prob*(DataType(1) - keep_prob); - m_a = DataType(1) / std::sqrt(m_a); - m_b = -m_a * m_alpha_prime*(DataType(1) - keep_prob); + m_alpha_prime*m_alpha_prime*keep_prob*(El::TypeTraits::One() - keep_prob); + m_a = El::TypeTraits::One() / El::Sqrt(m_a); + m_b = -m_a * m_alpha_prime*(El::TypeTraits::One() - keep_prob); } selu_dropout(const selu_dropout& other) : - regularizer_layer(other), + regularizer_layer(other), m_alpha_prime(other.m_alpha_prime), m_a(other.m_a), m_b(other.m_b), @@ -72,7 +85,7 @@ class selu_dropout : public regularizer_layer { } selu_dropout& operator=(const selu_dropout& other) { - regularizer_layer::operator=(other); + regularizer_layer::operator=(other); m_alpha_prime = other.m_alpha_prime; m_a = other.m_a; m_b = other.m_b; @@ -95,35 +108,35 @@ class selu_dropout : public regularizer_layer { El::Device get_device_allocation() const override { return Dev; } - void setup_dims() override { - regularizer_layer::setup_dims(); - set_output_dims(get_input_dims()); + void setup_dims(DataReaderMetaData& dr_metadata) override { + regularizer_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); } void setup_matrices(const El::Grid& grid) override { - regularizer_layer::setup_matrices(grid); + regularizer_layer::setup_matrices(grid); if (m_mask != nullptr) { delete m_mask; } - m_mask = get_activations().Copy(); + m_mask = this->get_activations().Copy(); } protected: /** Drop out units in forward propagation. */ void fp_compute() override { - if (this->m_model->get_execution_mode() != execution_mode::training || + if (this->m_model->get_execution_context().get_execution_mode() != execution_mode::training || m_keep_prob < 0.0f) { // Do nothing if dropout is disabled - El::Copy(get_prev_activations(), get_activations()); + El::Copy(this->get_prev_activations(), this->get_activations()); } else { - const auto *input_acts = &get_prev_activations(); + const auto *input_acts = &this->get_prev_activations(); const El::Int height = input_acts->Height(); const El::Int width = input_acts->Width(); const El::Int local_height = input_acts->LocalHeight(); const El::Int local_width = input_acts->LocalWidth(); const auto& local_input_acts = input_acts->LockedMatrix(); - Mat& local_output_acts = get_local_activations(); - Mat& local_mask = m_mask->Matrix(); + CPUMatrixType& local_output_acts = this->get_local_activations(); + CPUMatrixType& local_mask = m_mask->Matrix(); // Construct and apply mask and the affine transform. // TODO: Optimize. 
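For reference, the affine parameters computed in the constructor above follow the alpha-dropout formulation: alpha' = -scale * alpha, a = 1 / sqrt(q + alpha'^2 * q * (1 - q)), and b = -a * alpha' * (1 - q), where q is the keep probability. A minimal standalone sketch of that arithmetic (double precision, default q = 0.95; illustrative only, not part of this patch):

// Recompute the SELU-dropout affine transform for one activation.
#include <cmath>
#include <cstdio>

int main() {
  const double alpha = 1.6732632423543772848170429916717;
  const double scale = 1.0507009873554804934193349852946;
  const double q = 0.95;                       // keep probability
  const double alpha_prime = -scale * alpha;   // value assigned to dropped units
  double a = q + alpha_prime * alpha_prime * q * (1.0 - q);
  a = 1.0 / std::sqrt(a);
  const double b = -a * alpha_prime * (1.0 - q);
  // Forward pass for one input x with Bernoulli mask m (1 = keep, 0 = drop):
  const double x = 0.3, m = 1.0;
  const double y = a * (x * m + alpha_prime * (1.0 - m)) + b;
  std::printf("a = %f, b = %f, y = %f\n", a, b, y);
  return 0;
}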
@@ -132,7 +145,7 @@ class selu_dropout : public regularizer_layer { for (El::Int row = 0; row < local_height; ++row) { local_output_acts(row, col) = m_a * (local_input_acts(row, col)*local_mask(row, col) + - m_alpha_prime*(1 - local_mask(row, col))) + m_b; + m_alpha_prime*(El::TypeTraits::One() - local_mask(row, col))) + m_b; } } @@ -141,14 +154,14 @@ class selu_dropout : public regularizer_layer { /** Adjust gradients for dropout in backprop. */ void bp_compute() override { - if (this->m_model->get_execution_mode() != execution_mode::training + if (this->m_model->get_execution_context().get_execution_mode() != execution_mode::training || m_keep_prob < 0.0f) { - El::Copy(get_prev_error_signals(), get_error_signals()); + El::Copy(this->get_prev_error_signals(), this->get_error_signals()); } else { - const auto& local_prev_error_signal = get_local_prev_error_signals(); - Mat& local_error_signal = get_local_error_signals(); - Mat& local_mask = m_mask->Matrix(); + const auto& local_prev_error_signal = this->get_local_prev_error_signals(); + CPUMatrixType& local_error_signal = this->get_local_error_signals(); + CPUMatrixType& local_mask = m_mask->Matrix(); const El::Int local_height = local_prev_error_signal.Height(); const El::Int local_width = local_prev_error_signal.Width(); // Reweight with the affine scale factor and the dropout mask. @@ -164,17 +177,26 @@ class selu_dropout : public regularizer_layer { private: /** Alpha prime, the low-variance saturation point. */ - DataType m_alpha_prime; + TensorDataType m_alpha_prime; /** Affine scaling parameter to keep mean/variance at desired value. */ - DataType m_a; + TensorDataType m_a; /** Affine additive parameter to keep mean/variance at desired value. */ - DataType m_b; + TensorDataType m_b; /** Probability of keeping each unit. */ - float m_keep_prob; + TensorDataType m_keep_prob; /** Current dropout mask (a scaled Bernoulli random matrix). */ - AbsDistMat *m_mask; + AbsDistMatrixType *m_mask; }; +#ifndef LBANN_SELU_DROPOUT_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class selu_dropout; \ + extern template class selu_dropout + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_SELU_DROPOUT_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_REGULARIZER_SELU_DROPOUT_HPP_INCLUDED diff --git a/include/lbann/layers/transform/CMakeLists.txt b/include/lbann/layers/transform/CMakeLists.txt index 645d1764511..874ed611b23 100644 --- a/include/lbann/layers/transform/CMakeLists.txt +++ b/include/lbann/layers/transform/CMakeLists.txt @@ -1,6 +1,6 @@ # Add the headers for this directory set_full_path(THIS_DIR_HEADERS - concatenation.hpp + concatenate.hpp pooling.hpp reshape.hpp slice.hpp diff --git a/include/lbann/layers/transform/bernoulli.hpp b/include/lbann/layers/transform/bernoulli.hpp index d3e827e6ee7..127cd581e52 100644 --- a/include/lbann/layers/transform/bernoulli.hpp +++ b/include/lbann/layers/transform/bernoulli.hpp @@ -28,6 +28,7 @@ #define LBANN_LAYER_BERNOULLI_HPP_INCLUDED #include "lbann/layers/transform/transform.hpp" +#include "lbann/models/model.hpp" #include "lbann/utils/random.hpp" namespace lbann { @@ -36,18 +37,20 @@ namespace lbann { * * During validation and testing, outputs are all zero. */ -template -class bernoulli_layer : public transform_layer { -private: - /** Probability of outputting 1. 
*/ - DataType m_prob; +template +class bernoulli_layer : public transform_layer { +public: + + using ProbabilityType = double; public: bernoulli_layer(lbann_comm *comm, std::vector dims, - DataType prob = DataType(0.5)) - : transform_layer(comm), m_prob(prob) { - set_output_dims(dims); + ProbabilityType prob = 0.5) + : transform_layer(comm), m_prob(prob) { + this->set_output_dims(dims); this->m_expected_num_parent_layers = 0; } bernoulli_layer* copy() const override { return new bernoulli_layer(*this); } @@ -56,7 +59,7 @@ class bernoulli_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); desc.add("Probability", m_prob); return desc; } @@ -64,16 +67,34 @@ class bernoulli_layer : public transform_layer { protected: void fp_compute() override { - auto& output = get_activations(); - if (this->m_model->get_execution_mode() == execution_mode::training) { + auto& output = this->get_activations(); + if (this->m_model->get_execution_context().get_execution_mode() == execution_mode::training) { bernoulli_fill(output, output.Height(), output.Width(), m_prob); } else { El::Zero(output); } } +private: + + /** Probability of outputting 1. */ + ProbabilityType m_prob; + }; +LBANN_DEFINE_LAYER_BUILDER(bernoulli); + +#ifndef LBANN_BERNOULLI_LAYER_INSTANTIATE + +#define PROTO_DEVICE(T, Device) \ + extern template class bernoulli_layer; \ + extern template class bernoulli_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +#endif // LBANN_BERNOULLI_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_BERNOULLI_HPP_INCLUDED diff --git a/include/lbann/layers/transform/categorical_random.hpp b/include/lbann/layers/transform/categorical_random.hpp index ac756dbeb5f..ef14c4ed4c1 100644 --- a/include/lbann/layers/transform/categorical_random.hpp +++ b/include/lbann/layers/transform/categorical_random.hpp @@ -28,6 +28,7 @@ #define LBANN_LAYER_CATEGORICAL_RANDOM_HPP_INCLUDED #include "lbann/layers/transform/transform.hpp" +#include "lbann/models/model.hpp" #include "lbann/utils/random.hpp" namespace lbann { @@ -40,16 +41,18 @@ namespace lbann { * * @todo Remove. 
*/ -template -class categorical_random_layer : public transform_layer { - +template +class categorical_random_layer : public transform_layer { + static_assert(Dev == El::Device::CPU, + "categorical random layer currently only supports CPU"); + static_assert(T_layout == data_layout::DATA_PARALLEL, + "categorical random layer currently only " + "supports DATA_PARALLEL"); public: categorical_random_layer(lbann_comm *comm) - : transform_layer(comm) { - static_assert(Dev == El::Device::CPU, - "categorical random layer currently only supports CPU"); - static_assert(T_layout == data_layout::DATA_PARALLEL, - "categorical random layer currently only supports DATA_PARALLEL"); + : transform_layer(comm) { } categorical_random_layer* copy() const override { return new categorical_random_layer(*this); } std::string get_type() const override { return "categorical random"; } @@ -61,19 +64,19 @@ class categorical_random_layer : public transform_layer { void fp_compute() override { // Input and output matrices - const auto& input = get_prev_activations(); + const auto& input = this->get_prev_activations(); const auto& local_input = input.LockedMatrix(); - auto& local_output = get_local_activations(); + auto& local_output = this->get_local_activations(); const auto& width = input.Width(); const auto& local_height = local_input.Height(); const auto& local_width = local_input.Width(); // Initialize output and random numbers - const auto& mode = this->m_model->get_execution_mode(); + const auto& mode = this->m_model->get_execution_context().get_execution_mode(); El::Zero(local_output); - StarVCMat rand_mat(input.Grid(), input.Root()); + StarVCMatDT rand_mat(input.Grid(), input.Root()); if (mode == execution_mode::training) { - uniform_fill(rand_mat, 1, width, DataType(0.5), DataType(0.5)); + uniform_fill(rand_mat, 1, width, TensorDataType(0.5), TensorDataType(0.5)); } // Process each mini-batch sample @@ -85,7 +88,7 @@ class categorical_random_layer : public transform_layer { if (mode == execution_mode::training) { // Choose first output with CDF above random number in (0,1) const auto& rand = rand_mat.GetLocal(0, col); - DataType cdf = DataType(0); + TensorDataType cdf = El::TypeTraits::Zero(); for (El::Int row = 0; row < local_height; ++row) { cdf += local_input(row, col); if (rand < cdf) { @@ -101,7 +104,7 @@ class categorical_random_layer : public transform_layer { } // Output a one-hot vector - local_output(index, col) = DataType(1); + local_output(index, col) = El::TypeTraits::One(); } @@ -109,6 +112,20 @@ class categorical_random_layer : public transform_layer { }; +LBANN_DEFINE_LAYER_BUILDER(categorical_random); + +#ifndef LBANN_CATEGORICAL_RANDOM_LAYER_INSTANTIATE + +#define PROTO(T) \ + extern template class categorical_random_layer + +#define LBANN_INSTANTIATE_CPU_HALF +#include "lbann/macros/instantiate.hpp" +#undef PROTO +#undef LBANN_INSTANTIATE_CPU_HALF + +#endif // LBANN_CATEGORICAL_RANDOM_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_CATEGORICAL_RANDOM_HPP_INCLUDED diff --git a/include/lbann/layers/transform/concatenate.hpp b/include/lbann/layers/transform/concatenate.hpp new file mode 100644 index 00000000000..2b3e5091436 --- /dev/null +++ b/include/lbann/layers/transform/concatenate.hpp @@ -0,0 +1,410 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) 
listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_TRANSFORM_CONCATENATE_HPP_INCLUDED +#define LBANN_LAYERS_TRANSFORM_CONCATENATE_HPP_INCLUDED + +#include "lbann/layers/data_type_layer.hpp" +#include "lbann/utils/exception.hpp" +#include "lbann/utils/distconv.hpp" + +#include +#include + +namespace lbann { + +#ifdef LBANN_HAS_DISTCONV +template +class concatenate_distconv_adapter : public data_type_distconv_adapter { + public: + using TensorDevType = typename data_type_distconv_adapter::TensorDevType; + concatenate_distconv_adapter(Layer& layer): + data_type_distconv_adapter(layer) {} + virtual ~concatenate_distconv_adapter() = default; + dc::Shape get_activations_local_shape(int index=0) const override; + void fp_compute(); + void bp_compute(); +}; +#endif // LBANN_HAS_DISTCONV + +/** @brief Concatenate tensors along specified dimension. */ +template +class concatenate_layer : public data_type_layer { +public: + + concatenate_layer(lbann_comm *comm, size_t concat_dim); + concatenate_layer(const concatenate_layer& other) = default; + concatenate_layer& operator=(const concatenate_layer& other) = default; + + concatenate_layer* copy() const override; + std::string get_type() const override; + data_layout get_data_layout() const override; + El::Device get_device_allocation() const override; + + description get_description() const override; + +protected: + + void setup_pointers() override; + void setup_dims(DataReaderMetaData& dr_metadata) override; + + void fp_setup_outputs(El::Int mini_batch_size) override; + void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) override; + void fp_compute() override; + void bp_compute() override; + +private: + + /** @brief Tensor dimension to concatenate along. */ + size_t m_concat_dim; + +#ifdef LBANN_HAS_GPU + /** @brief Workspace buffer. + * + * Parameters for CUDA kernels are copied into this buffer and + * asynchronously transferred to GPU. + */ + std::vector m_workspace; + /** @brief CUDA event for workspace buffer. + * + * Makes sure asynchronous GPU memory transfers are completed + * before modifying workspace buffer. 
+ */ + cuda::event_wrapper m_workspace_event; +#endif // LBANN_HAS_GPU + + template + friend void fp_compute_impl(concatenate_layer&, size_t); + template + friend void bp_setup_gradient_wrt_inputs_impl(concatenate_layer&); + template + friend void bp_compute_impl(concatenate_layer&, size_t); + +#ifdef LBANN_HAS_DISTCONV + friend class concatenate_distconv_adapter; + protected: + bool is_distconv_supported() const override { + // Only supported for the channel dimension + return Device == El::Device::GPU && Layout == data_layout::DATA_PARALLEL + && m_concat_dim == 0; + } + void setup_distconv_adapter() override { + this->get_distconv_adapter_ptr() = make_unique< + concatenate_distconv_adapter>(*this); + } + concatenate_distconv_adapter& get_distconv_adapter() override; + const concatenate_distconv_adapter& get_distconv_adapter() const override; +#endif // LBANN_HAS_DISTCONV +}; + +// ========================================================= +// Implementation +// ========================================================= + +template +concatenate_layer::concatenate_layer( + lbann_comm *comm, + size_t concat_dim) + : data_type_layer(comm), + m_concat_dim{concat_dim} { + this->m_expected_num_parent_layers = -1; // No limit on parents +} + +template +concatenate_layer* concatenate_layer::copy() const { + return new concatenate_layer(*this); +} + +template +std::string concatenate_layer::get_type() const { + return "concatenate"; +} + +template +data_layout concatenate_layer::get_data_layout() const { + return Layout; +} + +template +El::Device concatenate_layer::get_device_allocation() const { + return Device; +} + +template +description concatenate_layer::get_description() const { + auto desc = data_type_layer::get_description(); + desc.add("Concatenation dimension", m_concat_dim); + return desc; +} + +template +void concatenate_layer::setup_pointers() { + data_type_layer::setup_pointers(); + if (this->get_num_parents() < 1) { + LBANN_ERROR(get_type()," layer \"",this->get_name(),"\" ", + "has no parents"); + } +} + +template +void concatenate_layer::setup_dims(DataReaderMetaData& dr_metadata) { + data_type_layer::setup_dims(dr_metadata); + + // Dimensions of first input tensor + auto output_dims = this->get_input_dims(0); + if (m_concat_dim >= output_dims.size()) { + std::ostringstream err; + err << get_type() << " layer \"" << this->get_name() << "\" " + << "is concatenating along dimension " << m_concat_dim << ", " + << "but it has a " << output_dims.size() << "-D input tensor " + << "(parent layer \"" << this->get_parent_layers()[0]->get_name() << "\" " + << "outputs with dimensions "; + for (size_t d=0; d0 ? " x " : "") << output_dims[d]; + } + err << ")"; + LBANN_ERROR(err.str()); + } + + // Dimensions of remaining input tensors + for (int j=1; jget_num_parents(); ++j) { + const auto& input_dims = this->get_input_dims(j); + if (input_dims.size() != output_dims.size() + || !std::equal(input_dims.begin(), + input_dims.begin() + m_concat_dim, + output_dims.begin()) + || !std::equal(input_dims.begin() + m_concat_dim + 1, + input_dims.end(), + output_dims.begin() + m_concat_dim + 1)) { + std::ostringstream err; + err << get_type() << " layer \"" << this->get_name() << "\" " + << "expects input tensors with dimensions "; + for (size_t d=0; d0 ? 
" x " : ""); + if (d == m_concat_dim) { err << "X"; } + else { err << output_dims[d]; } + } + err << ", but parent layer " + << "\"" << this->get_parent_layers()[j]->get_name() << "\" " + << "outputs with dimensions "; + for (size_t d=0; d < input_dims.size(); ++d) { + err << (d>0 ? " x " : "") << input_dims[d]; + } + LBANN_ERROR(err.str()); + } + output_dims[m_concat_dim] += input_dims[m_concat_dim]; + } + + // Model-parallel implementation only supports flat data + if (Layout == data_layout::MODEL_PARALLEL + && std::accumulate(&output_dims[0], &output_dims[m_concat_dim], 1, std::multiplies()) > 1) { + LBANN_ERROR(this->get_type()," layer \"",this->get_name(),"\" ", + "attempted to concatenate along dimension ",m_concat_dim,", ", + "but model-parallel concatenate layer " + "only supports flat data"); + } + + // Update output dimensions + this->set_output_dims(output_dims); + +} + +template +void concatenate_layer::fp_setup_outputs(El::Int mini_batch_size) { +#ifdef LBANN_HAS_DISTCONV + if (!this->keep_original_outputs(0)) return; +#endif // LBANN_HAS_DISTCONV + const auto& input0 = this->get_prev_activations(0); + auto& output = this->get_activations(); + output.Empty(false); + if (this->get_num_parents() == 1) { + El::LockedView(output, input0); + } + else { + output.AlignWith(input0); + output.Resize(this->get_output_size(), input0.Width()); + } +} + +template +void concatenate_layer::fp_compute() { +#ifdef LBANN_HAS_DISTCONV + if (this->distconv_enabled()) { + get_distconv_adapter().fp_compute(); + return; + } +#endif + + // Just make a view if there is one input + if (this->get_num_parents() == 1) { + El::LockedView(this->get_activations(), this->get_prev_activations(0)); + return; + } + + // Perform concatenation + fp_compute_impl(*this, m_concat_dim); + +} + +template +void bp_setup_gradient_wrt_inputs_impl( + concatenate_layer& l) { +#ifdef LBANN_HAS_DISTCONV + if (l.distconv_enabled()) { + LBANN_ERROR("Model-parallel LBANN matrix not supported in distconv"); + } +#endif // LBANN_HAS_DISTCONV + + // Slice Elemental matrices + // Note: Assume each mini-batch sample is flat. 
+ const size_t num_inputs = l.get_num_parents(); + const auto& output_grad = l.get_prev_error_signals(); + size_t offset = 0; + for (size_t j=0; j +void bp_setup_gradient_wrt_inputs_impl( + concatenate_layer& l) { + + const size_t num_inputs = l.get_num_parents(); + const auto& output_grad = l.get_prev_error_signals(); + if (num_inputs == 1) { +#ifdef LBANN_HAS_DISTCONV + if (!l.keep_original_gradient_wrt_inputs(0)) return; +#endif + El::LockedView(l.get_error_signals(0), output_grad); + } + else { + for (size_t j=0; j +void concatenate_layer::bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) { + bp_setup_gradient_wrt_inputs_impl(*this); +} + +template +void concatenate_layer::bp_compute() { +#ifdef LBANN_HAS_DISTCONV + if (this->distconv_enabled()) { + get_distconv_adapter().bp_compute(); + return; + } +#endif + + // Just make a view if there is one input + if (this->get_num_parents() == 1) { + El::LockedView(this->get_error_signals(0), this->get_prev_error_signals()); + return; + } + + // Perform slice + bp_compute_impl(*this, m_concat_dim); + +} + +#ifdef LBANN_HAS_DISTCONV +template +concatenate_distconv_adapter& +concatenate_layer::get_distconv_adapter() { + return const_cast&>( + static_cast&>(*this).get_distconv_adapter()); +} + +template +const concatenate_distconv_adapter& +concatenate_layer::get_distconv_adapter() const { + return dynamic_cast&>( + data_type_layer::get_distconv_adapter()); +} + +template +dc::Shape concatenate_distconv_adapter:: +get_activations_local_shape(int index) const { + assert_eq(index, 0); + auto shape = this->get_prev_activations().get_local_shape(); + shape[-2] = this->get_activations_shape()[-2]; + return shape; +} + +template +void concatenate_distconv_adapter:: +fp_compute() { + assert_always(this->layer().get_num_parents() == 2); + dc::tensor::Concatenate(this->get_activations(0), + this->get_prev_activations(0), + this->get_prev_activations(1), + El::GPUManager::Stream()); +} + +template +void concatenate_distconv_adapter:: +bp_compute() { + dc::tensor::Slice(this->get_error_signals(0), + this->get_error_signals(1), + this->get_prev_error_signals(0), + El::GPUManager::Stream()); +} +#endif // LBANN_HAS_DISTCONV + +LBANN_DEFINE_LAYER_BUILDER(concatenate); + +#ifndef LBANN_CONCATENATE_LAYER_INSTANTIATE + +#define PROTO_DEVICE(T, Device) \ + extern template class concatenate_layer< \ + T, data_layout::DATA_PARALLEL, Device>; \ + extern template class concatenate_layer< \ + T, data_layout::MODEL_PARALLEL, Device> + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +#endif // LBANN_CONCATENATE_LAYER_INSTANTIATE + +} // namespace lbann + +#endif // LBANN_LAYERS_TRANSFORM_CONCATENATE_HPP_INCLUDED diff --git a/include/lbann/layers/transform/concatenation.hpp b/include/lbann/layers/transform/concatenation.hpp deleted file mode 100644 index 5355787269f..00000000000 --- a/include/lbann/layers/transform/concatenation.hpp +++ /dev/null @@ -1,288 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. 
-// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_LAYER_CONCATENATION_HPP_INCLUDED -#define LBANN_LAYER_CONCATENATION_HPP_INCLUDED - -#include "lbann/layers/transform/transform.hpp" -#include "lbann/utils/exception.hpp" - -namespace lbann { - -/** @brief Concatenate tensors along specified dimension. */ -template -class concatenation_layer : public transform_layer { -public: - - concatenation_layer(lbann_comm *comm, El::Int concat_dim) - : transform_layer(comm), m_concat_dim(concat_dim) { - this->m_expected_num_parent_layers = -1; // No limit on parents - } - - concatenation_layer(const concatenation_layer& other) - : transform_layer(other), - m_concat_dim(other.m_concat_dim), - m_concat_points(other.m_concat_points) { - m_input_v.reset(other.m_input_v ? other.m_input_v->Copy() : nullptr); - m_output_v.reset(other.m_output_v ? other.m_output_v->Copy() : nullptr); - } - - concatenation_layer& operator=(const concatenation_layer& other) { - transform_layer::operator=(other); - m_concat_dim = other.m_concat_dim; - m_concat_points = other.m_concat_points; - m_input_v.reset(other.m_input_v ? other.m_input_v->Copy() : nullptr); - m_output_v.reset(other.m_output_v ? 
other.m_output_v->Copy() : nullptr); - } - - concatenation_layer* copy() const override { return new concatenation_layer(*this); } - std::string get_type() const override { return "concatenation"; } - data_layout get_data_layout() const override { return T_layout; } - El::Device get_device_allocation() const override { return Dev; } - - description get_description() const override { - auto&& desc = transform_layer::get_description(); - desc.add("Concatenation dimension", m_concat_dim); - return desc; - } - -protected: - - void setup_pointers() override { - transform_layer::setup_pointers(); - if (get_num_parents() < 1) { - std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " - << "has no parents"; - LBANN_ERROR(err.str()); - } - } - - void setup_matrices(const El::Grid& grid) override { - transform_layer::setup_matrices(grid); - const auto& input = get_prev_activations(); - m_input_v.reset(input.Construct(input.Grid(), input.Root())); - m_output_v.reset(input.Construct(input.Grid(), input.Root())); - } - - void setup_dims() override { - transform_layer::setup_dims(); - - // Get concatenation points for first parent layer - auto output_dims = get_input_dims(0); - if (m_concat_dim < 0 - || m_concat_dim >= (El::Int) output_dims.size()) { - std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " - << "has " << output_dims.size() << " dimensions, " - << "but attempted to concatenate along " - << "dimension " << m_concat_dim; - LBANN_ERROR(err.str()); - } - m_concat_points.clear(); - m_concat_points.push_back(0); - m_concat_points.push_back(output_dims[m_concat_dim]); - - // Get concatenation points for remaining parent layers - for (int i = 1; i < get_num_parents(); ++i) { - const auto& input_dims = get_input_dims(i); - if (input_dims.size() != output_dims.size() - || !std::equal(input_dims.begin(), - input_dims.begin() + m_concat_dim, - output_dims.begin()) - || !std::equal(input_dims.begin() + m_concat_dim + 1, - input_dims.end(), - output_dims.begin() + m_concat_dim + 1)) { - std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " - << "expects input tensors with dimensions "; - for (size_t j = 0; j < output_dims.size(); ++j) { - err << (j > 0 ? " x " : ""); - if ((int) j == m_concat_dim) { - err << "X"; - } else { - err << output_dims[j]; - } - } - err << ", but parent layer " - << "\"" << m_parent_layers[i]->get_name() << "\" " - << "outputs with dimensions "; - for (size_t j = 0; j < input_dims.size(); ++j) { - err << (j > 0 ? " x " : "") << input_dims[j]; - } - LBANN_ERROR(err.str()); - } - output_dims[m_concat_dim] += input_dims[m_concat_dim]; - m_concat_points.push_back(output_dims[m_concat_dim]); - } - - // Update output dimensions - set_output_dims(output_dims); - - } - - void fp_setup_outputs(El::Int mini_batch_size) override { - const auto& num_inputs = get_num_parents(); - const auto& output_dims = get_output_dims(); - - // Initialize output tensor - auto& output = get_activations(); - output.Empty(false); - if (num_inputs > 1) { - output.AlignWith(get_prev_activations()); - output.Resize(get_output_size(), mini_batch_size); - } else { - El::LockedView(output, get_prev_activations()); - return; - } - - // Divide output tensor into unit slices along concat dimension - // Note: Each unit slice is divided into contiguous "unit blocks" - const auto& output_num_unit_slices = output_dims[m_concat_dim]; - const auto& blocks_per_slice - = (m_concat_dim > 0 ? 
- std::accumulate(&output_dims[0], &output_dims[m_concat_dim], - 1, std::multiplies()) : - 1); - const auto& unit_block_size - = std::accumulate(output_dims.begin() + m_concat_dim + 1, - output_dims.end(), - 1, std::multiplies()); - const auto& output_block_stride = (output_num_unit_slices - * unit_block_size); - - // Populate slices of output tensor with input tensors - for (int i = 0; i < num_inputs; ++i) { - const auto& input_dims = get_input_dims(i); - auto& input = get_prev_activations(i); - - // Divide input tensor into unit slices - const auto& input_num_unit_slices = input_dims[m_concat_dim]; - - // Merge unit slices - const auto& block_size = input_num_unit_slices * unit_block_size; - const auto& output_block_offset = m_concat_points[i] * unit_block_size; - - // Populate output tensor one block at a time - for (int block = 0; block < blocks_per_slice; ++block) { - const auto& input_offset = block * block_size; - const auto& output_offset = (output_block_offset - + block * output_block_stride); - El::LockedView(*m_input_v, input, - El::IR(input_offset, input_offset + block_size), - El::ALL); - El::View(*m_output_v, output, - El::IR(output_offset, output_offset + block_size), - El::ALL); - El::Copy(*m_input_v, *m_output_v); - } - - } - - } - - void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) override { - const auto& num_inputs = get_num_parents(); - const auto& output_dims = get_output_dims(); - - // Divide output tensor into unit slices along concat dimension - // Note: Each unit slice is divided into contiguous "unit blocks" - const auto& output_num_unit_slices = output_dims[m_concat_dim]; - const auto& blocks_per_slice - = (m_concat_dim > 0 ? - std::accumulate(&output_dims[0], &output_dims[m_concat_dim], - 1, std::multiplies()) : - 1); - const auto& unit_block_size - = std::accumulate(output_dims.begin() + m_concat_dim + 1, - output_dims.end(), - 1, std::multiplies()); - const auto& output_block_stride = (output_num_unit_slices - * unit_block_size); - - // Populate gradient w.r.t. input tensors - const auto& gradient_wrt_output = get_prev_error_signals(); - for (int i = 0; i < num_inputs; ++i) { - const auto& input_dims = get_input_dims(i); - const auto& input_size = get_input_size(i); - auto& gradient_wrt_input = get_error_signals(i); - - // Divide input tensor into unit slices - const auto& input_num_unit_slices = input_dims[m_concat_dim]; - - // Merge unit slices and get first contiguous output block - const auto& block_size = input_num_unit_slices * unit_block_size; - const auto& output_block_offset = m_concat_points[i] * unit_block_size; - El::LockedView(*m_output_v, gradient_wrt_output, - El::IR(output_block_offset, - output_block_offset + block_size), - El::ALL); - - // Populate gradient w.r.t. 
input tensor one block at a time - // Note: If there is only one block, the tensor can be a view - if (blocks_per_slice > 1) { - gradient_wrt_input.AlignWith(*m_output_v); - gradient_wrt_input.Resize(input_size, mini_batch_size); - for (int block = 0; block < blocks_per_slice; ++block) { - const auto& input_offset = block * block_size; - const auto& output_offset = (output_block_offset - + block * output_block_stride); - El::LockedView(*m_output_v, gradient_wrt_output, - El::IR(output_offset, output_offset + block_size), - El::ALL); - El::View(*m_input_v, gradient_wrt_input, - El::IR(input_offset, input_offset + block_size), - El::ALL); - El::Copy(*m_output_v, *m_input_v); - } - } else { - El::LockedView(gradient_wrt_input, *m_output_v); - } - - } - - } - - void fp_compute() override {} - void bp_compute() override {} - -private: - - /** Tensor dimension to concatenation. */ - El::Int m_concat_dim; - /** Concatenation points for each child layer. */ - std::vector m_concat_points; - - /** View into input tensor. */ - std::unique_ptr m_input_v; - /** View into output tensor. */ - std::unique_ptr m_output_v; - -}; - -} // namespace lbann - -#endif // LBANN_LAYER_CONCATENATION_HPP_INCLUDED diff --git a/include/lbann/layers/transform/constant.hpp b/include/lbann/layers/transform/constant.hpp index f4390884a56..b13737e424d 100644 --- a/include/lbann/layers/transform/constant.hpp +++ b/include/lbann/layers/transform/constant.hpp @@ -32,15 +32,17 @@ namespace lbann { /** @brief Constant output. */ -template -class constant_layer : public transform_layer { +template +class constant_layer : public transform_layer { public: constant_layer(lbann_comm *comm, - DataType value, + TensorDataType value, std::vector dims) - : transform_layer(comm), m_value(value) { - set_output_dims(dims); + : transform_layer(comm), m_value(value) { + this->set_output_dims(dims); this->m_expected_num_parent_layers = 0; } @@ -50,7 +52,7 @@ class constant_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); desc.add("Value", m_value); return desc; } @@ -59,19 +61,30 @@ class constant_layer : public transform_layer { void fp_compute() override { if (m_value == EvalType(0)) { - El::Zero(get_activations()); + El::Zero(this->get_activations()); } else { - El::Fill(get_activations(), m_value); + El::Fill(this->get_activations(), m_value); } } private: /** Constant value. */ - DataType m_value; + TensorDataType m_value; }; +LBANN_DEFINE_LAYER_BUILDER(constant); + +#ifndef LBANN_CONSTANT_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class constant_layer; \ + extern template class constant_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_CONSTANT_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_CONSTANT_HPP_INCLUDED diff --git a/include/lbann/layers/transform/crop.hpp b/include/lbann/layers/transform/crop.hpp index f0b37b293d3..e77ab06a18f 100644 --- a/include/lbann/layers/transform/crop.hpp +++ b/include/lbann/layers/transform/crop.hpp @@ -40,21 +40,37 @@ namespace lbann { * to the red-top-left corner and (1,1,1) to the blue-bottom-right * corner. The crop size is determined at setup. 
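 * (Worked example from the offset computation below: with a 3 x 32 x 32
 * input, output dimensions 3 x 24 x 24, and crop position (0.5, 0.5, 0.5),
 * each dimension has num_offsets = in - out + 1 candidate offsets, so the
 * selected window starts at offset (0, 4, 4), i.e. the centered 24 x 24
 * patch of every channel.)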
*/ -template -class crop_layer : public transform_layer { +template +class crop_layer : public transform_layer { + static_assert(T_layout == data_layout::DATA_PARALLEL, + "crop layer only supports DATA_PARALLEL"); +#ifdef LBANN_HAS_GPU_FP16 + using CompareType = typename std::conditional::value, float, TensorDataType>::type; +#else + using CompareType = TensorDataType; +#endif +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} + public: crop_layer(lbann_comm *comm, std::vector dims) - : transform_layer(comm) { - static_assert(T_layout == data_layout::DATA_PARALLEL, - "crop layer only supports DATA_PARALLEL"); - set_output_dims(dims); + : transform_layer(comm) { + this->set_output_dims(dims); this->m_expected_num_parent_layers = 2; } crop_layer(const crop_layer& other) - : transform_layer(other), + : transform_layer(other), m_input_v(other.m_input_v ? other.m_input_v->Copy() : nullptr), m_output_v(other.m_output_v ? @@ -62,7 +78,7 @@ class crop_layer : public transform_layer { m_crop_pos_v(other.m_crop_pos_v ? other.m_crop_pos_v->Copy() : nullptr){} crop_layer& operator=(const crop_layer& other) { - transform_layer::operator=(other); + transform_layer::operator=(other); m_input_v.reset(other.m_input_v ? other.m_input_v->Copy() : nullptr); m_output_v.reset(other.m_output_v ? @@ -78,14 +94,14 @@ class crop_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } void setup_matrices(const El::Grid& grid) override { - transform_layer::setup_matrices(grid); - const auto& input = get_prev_activations(); + transform_layer::setup_matrices(grid); + const auto& input = this->get_prev_activations(); const auto& dist = input.DistData(); m_input_v.reset(input.Construct(input.Grid(), input.Root())); m_output_v.reset(input.Construct(input.Grid(), input.Root())); /// @todo Setup the input tensor with this data distribution - m_crop_pos_v.reset(AbsDistMat::Instantiate(*dist.grid, + m_crop_pos_v.reset(AbsDistMatrixType::Instantiate(*dist.grid, dist.root, El::STAR, dist.rowDist, @@ -95,30 +111,30 @@ class crop_layer : public transform_layer { } - void setup_dims() override { - transform_layer::setup_dims(); + void setup_dims(DataReaderMetaData& dr_metadata) override { + transform_layer::setup_dims(dr_metadata); std::stringstream err; // Make sure input tensors have valid dimensions - const auto& input_dims = get_input_dims(0); - const auto& loc_dims = get_input_dims(1); - const auto& output_dims = get_output_dims(); + const auto& input_dims = this->get_input_dims(0); + const auto& loc_dims = this->get_input_dims(1); + const auto& output_dims = this->get_output_dims(); if (input_dims.size() != output_dims.size()) { - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "expects a crop input tensor with " << output_dims.size() << " dimensions, " << "but parent layer " - << "\"" << m_parent_layers[0]->get_name() << "\" " + << "\"" << this->get_parent_layers()[0]->get_name() << "\" " << "outputs a tensor with " << input_dims.size() << " dimensions"; LBANN_ERROR(err.str()); } if (loc_dims.size() != 1 || loc_dims[0] != (int) input_dims.size()) { - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "expects a 1D crop position tensor with " << output_dims.size() << " entries, " << "but parent layer " - << "\"" << 
m_parent_layers[1]->get_name() << "\" " + << "\"" << this->get_parent_layers()[1]->get_name() << "\" " << "outputs a tensor with dimensions "; for (size_t i = 0; i < loc_dims.size(); ++i) { err << (i > 0 ? " x " : "") << loc_dims[i]; @@ -131,14 +147,14 @@ class crop_layer : public transform_layer { protected: void fp_compute() override { - switch (get_input_dims().size()) { + switch (this->get_input_dims().size()) { case 3: fp_compute_3d(); break; default: fp_compute_nd(); } } void bp_compute() override { - switch (get_input_dims().size()) { + switch (this->get_input_dims().size()) { case 3: bp_compute_3d(); break; default: bp_compute_nd(); } @@ -146,22 +162,22 @@ class crop_layer : public transform_layer { private: /** View into input tensor. */ - std::unique_ptr m_input_v; + std::unique_ptr m_input_v; /** View into output tensor. */ - std::unique_ptr m_output_v; + std::unique_ptr m_output_v; /** View into crop positions. */ - std::unique_ptr m_crop_pos_v; + std::unique_ptr m_crop_pos_v; /** Forward prop implementation for n-dimensional tensors. */ void fp_compute_nd() { // Input and output tensors - const auto& input = get_prev_activations(0); - auto& output = get_activations(); + const auto& input = this->get_prev_activations(0); + auto& output = this->get_activations(); // Tensor dimensions - const auto& input_dims = get_input_dims(0); - const auto& output_dims = get_output_dims(); + const auto& input_dims = this->get_input_dims(0); + const auto& output_dims = this->get_output_dims(); const El::Int num_dims = output_dims.size(); const auto& local_width = input.LocalWidth(); const auto& region_size = output_dims.back(); @@ -169,7 +185,7 @@ class crop_layer : public transform_layer { // Get crop position m_crop_pos_v->Empty(false); m_crop_pos_v->AlignWith(input); - const auto& input1 = get_prev_activations(1); + const auto& input1 = this->get_prev_activations(1); if (m_crop_pos_v->DistData() == input1.DistData()) { El::LockedView(*m_crop_pos_v, input1); } else { @@ -186,7 +202,7 @@ class crop_layer : public transform_layer { std::vector crop_offsets; for (El::Int d = 0; d < num_dims; ++d) { const auto& pos = local_crop_pos(d, local_col); - if (pos < DataType(0) || pos > DataType(1)) { + if (CompareType(pos) < CompareType(0.0) || CompareType(pos) > CompareType(1.0)) { std::stringstream err; err << "crop position not in range [0,1] (pos=("; for (El::Int i = 0; i < local_crop_pos.Height(); ++i) { @@ -196,7 +212,7 @@ class crop_layer : public transform_layer { LBANN_ERROR(err.str()); } const El::Int num_offsets = input_dims[d] - output_dims[d] + 1; - crop_offsets.push_back(std::min(El::Int(pos * num_offsets), + crop_offsets.push_back(std::min(El::Int(static_cast(pos) * num_offsets), num_offsets - 1)); } @@ -241,17 +257,17 @@ class crop_layer : public transform_layer { void bp_compute_nd() { // Clear error signals - El::Zero(get_error_signals(0)); - El::Zero(get_error_signals(1)); + El::Zero(this->get_error_signals(0)); + El::Zero(this->get_error_signals(1)); // Input and gradient tensors - const auto& gradient_wrt_output = get_prev_error_signals(); - auto& gradient_wrt_input = get_error_signals(0); + const auto& gradient_wrt_output = this->get_prev_error_signals(); + auto& gradient_wrt_input = this->get_error_signals(0); const auto& local_crop_pos = m_crop_pos_v->LockedMatrix(); // Tensor dimensions - const auto& input_dims = get_input_dims(0); - const auto& output_dims = get_output_dims(); + const auto& input_dims = this->get_input_dims(0); + const auto& output_dims = 
this->get_output_dims(); const El::Int num_dims = output_dims.size(); const auto& local_width = gradient_wrt_input.LocalWidth(); const auto& region_size = output_dims.back(); @@ -265,7 +281,7 @@ class crop_layer : public transform_layer { std::vector crop_offsets; for (El::Int d = 0; d < num_dims; ++d) { const auto& pos = local_crop_pos(d, local_col); - if (pos < DataType(0) || pos > DataType(1)) { + if (CompareType(pos) < CompareType(0.0) || CompareType(pos) > CompareType(1.0)) { std::stringstream err; err << "crop position not in range [0,1] (pos=("; for (El::Int i = 0; i < local_crop_pos.Height(); ++i) { @@ -275,7 +291,7 @@ class crop_layer : public transform_layer { LBANN_ERROR(err.str()); } const El::Int num_offsets = input_dims[d] - output_dims[d] + 1; - crop_offsets.push_back(std::min(El::Int(pos * num_offsets), + crop_offsets.push_back(std::min(El::Int(static_cast(pos) * num_offsets), num_offsets - 1)); } @@ -327,6 +343,16 @@ class crop_layer : public transform_layer { }; +LBANN_DEFINE_LAYER_BUILDER(crop); + +#ifndef LBANN_CROP_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class crop_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_CROP_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_CROP_HPP_INCLUDED diff --git a/include/lbann/layers/transform/discrete_random.hpp b/include/lbann/layers/transform/discrete_random.hpp index c668971726f..18d30846e25 100644 --- a/include/lbann/layers/transform/discrete_random.hpp +++ b/include/lbann/layers/transform/discrete_random.hpp @@ -28,6 +28,7 @@ #define LBANN_LAYER_DISCRETE_RANDOM_HPP_INCLUDED #include "lbann/layers/transform/transform.hpp" +#include "lbann/models/model.hpp" #include "lbann/utils/random.hpp" namespace lbann { @@ -39,8 +40,14 @@ namespace lbann { * * @todo Remove. */ -template -class discrete_random_layer : public transform_layer { +template +class discrete_random_layer : public transform_layer { + static_assert(Dev == El::Device::CPU, + "discrete random layer currently only supports CPU"); + static_assert(T_layout == data_layout::DATA_PARALLEL, + "discrete random layer currently only supports DATA_PARALLEL"); private: /** Values in discrete distribution. 
*/ @@ -50,13 +57,9 @@ class discrete_random_layer : public transform_layer { discrete_random_layer(lbann_comm *comm, std::vector values, std::vector dims) - : transform_layer(comm), + : transform_layer(comm), m_values(values) { - static_assert(Dev == El::Device::CPU, - "discrete random layer currently only supports CPU"); - static_assert(T_layout == data_layout::DATA_PARALLEL, - "discrete random layer currently only supports DATA_PARALLEL"); - set_output_dims(dims); + this->set_output_dims(dims); } discrete_random_layer* copy() const override { return new discrete_random_layer(*this); } std::string get_type() const override { return "discrete random"; } @@ -65,9 +68,9 @@ class discrete_random_layer : public transform_layer { protected: - void setup_dims() override { - transform_layer::setup_dims(); - if (get_input_size() != (int) m_values.size()) { + void setup_dims(DataReaderMetaData& dr_metadata) override { + transform_layer::setup_dims(dr_metadata); + if (this->get_input_size() != (int) m_values.size()) { LBANN_ERROR("input tensor dimensions don't match number of " "values in discrete distribution"); } @@ -76,9 +79,9 @@ class discrete_random_layer : public transform_layer { void fp_compute() override { // Input and output matrices - const auto& input = get_prev_activations(); + const auto& input = this->get_prev_activations(); const auto& local_input = input.LockedMatrix(); - auto& output = get_activations(); + auto& output = this->get_activations(); auto& local_output = output.Matrix(); const int num_values = m_values.size(); const auto& num_outputs = local_output.Height(); @@ -86,9 +89,9 @@ class discrete_random_layer : public transform_layer { const auto& local_width = input.LocalWidth(); // Initialize random numbers - const auto& mode = this->m_model->get_execution_mode(); + const auto& mode = this->m_model->get_execution_context().get_execution_mode(); if (mode == execution_mode::training) { - uniform_fill(output, 1, width, DataType(0.5), DataType(0.5)); + uniform_fill(output, 1, width, TensorDataType(0.5), TensorDataType(0.5)); } // Process each mini-batch sample @@ -119,6 +122,16 @@ class discrete_random_layer : public transform_layer { }; +#ifndef LBANN_DISCRETE_RANDOM_LAYER_INSTANTIATE +#define PROTO(T) \ + extern template class discrete_random_layer< \ + T, data_layout::DATA_PARALLEL, El::Device::CPU> + +#define LBANN_INSTANTIATE_CPU_HALF +#include "lbann/macros/instantiate.hpp" +#undef PROTO +#undef LBANN_INSTANTIATE_CPU_HALF +#endif // LBANN_DISCRETE_RANDOM_LAYER_INSTANTIATE } // namespace lbann #endif // LBANN_LAYER_DISCRETE_RANDOM_HPP_INCLUDED diff --git a/include/lbann/layers/transform/dummy.hpp b/include/lbann/layers/transform/dummy.hpp index ec451fbe08e..4a3371752b9 100644 --- a/include/lbann/layers/transform/dummy.hpp +++ b/include/lbann/layers/transform/dummy.hpp @@ -36,10 +36,12 @@ namespace lbann { * Does no computation and is primarily intended as a placeholder for * unused layer outputs. 
*/ -template -class dummy_layer : public transform_layer { +template +class dummy_layer : public transform_layer { public: - dummy_layer(lbann_comm *comm) : transform_layer(comm) { + dummy_layer(lbann_comm *comm) : transform_layer(comm) { this->m_expected_num_child_layers = 0; } dummy_layer* copy() const override { return new dummy_layer(*this); } @@ -50,6 +52,17 @@ class dummy_layer : public transform_layer { void fp_compute() override {} }; +LBANN_DEFINE_LAYER_BUILDER(dummy); + +#ifndef LBANN_DUMMY_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class dummy_layer; \ + extern template class dummy_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_DUMMY_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_DUMMY_HPP_INCLUDED diff --git a/include/lbann/layers/transform/evaluation.hpp b/include/lbann/layers/transform/evaluation.hpp index 014ff9e3849..aaf5d3ea209 100644 --- a/include/lbann/layers/transform/evaluation.hpp +++ b/include/lbann/layers/transform/evaluation.hpp @@ -32,7 +32,16 @@ namespace lbann { /** @brief Interface with objective function and metrics. */ -class abstract_evaluation_layer : public transform_layer { +template +class abstract_evaluation_layer : public transform_layer { +public: +#ifdef LBANN_DETERMINISTIC + using EvalDataType = EvalType; +#else + using EvalDataType = TensorDataType; +#endif + using CPUMatType = El::Matrix; + public: /** Get scaling factor. */ @@ -51,8 +60,8 @@ class abstract_evaluation_layer : public transform_layer { protected: abstract_evaluation_layer(lbann_comm *comm); - void setup_dims() override; - void setup_data() override; + void setup_dims(DataReaderMetaData& dr_metadata) override; + void setup_data(size_t max_mini_batch_size) override; void fp_compute() override; void bp_compute() override; @@ -63,7 +72,7 @@ class abstract_evaluation_layer : public transform_layer { /** Evaluated value. * The value may be stored in pinned memory. */ - CPUMat m_value; + CPUMatType m_value; /** Non-blocking allreduce request. */ Al::request m_allreduce_req; #ifdef LBANN_HAS_GPU @@ -77,16 +86,39 @@ class abstract_evaluation_layer : public transform_layer { * Computes the average value across a mini-batch. If the input * tensor has multiple neurons, their values are added together. 
*/ -template -class evaluation_layer : public abstract_evaluation_layer { +template +class evaluation_layer : public abstract_evaluation_layer { public: - evaluation_layer(lbann_comm *comm) : abstract_evaluation_layer(comm) {} + evaluation_layer(lbann_comm *comm) : abstract_evaluation_layer(comm) {} evaluation_layer* copy() const override { return new evaluation_layer(*this); } std::string get_type() const override { return "evaluation"; } data_layout get_data_layout() const override { return T_layout; } El::Device get_device_allocation() const override { return Dev; } }; +LBANN_DEFINE_LAYER_BUILDER(evaluation); + +#ifndef LBANN_EVALUATION_LAYER_INSTANTIATE +#define PROTO(T) \ + extern template class abstract_evaluation_layer + +#define LBANN_INSTANTIATE_CPU_HALF +#define LBANN_INSTANTIATE_GPU_HALF +#include "lbann/macros/instantiate.hpp" +#undef PROTO +#undef LBANN_INSTANTIATE_CPU_HALF +#undef LBANN_INSTANTIATE_GPU_HALF + +#define PROTO_DEVICE(T, Device) \ + extern template class evaluation_layer; \ + extern template class evaluation_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_EVALUATION_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_EVALUATION_HPP_INCLUDED diff --git a/include/lbann/layers/transform/gaussian.hpp b/include/lbann/layers/transform/gaussian.hpp index 7ab43afc3a7..9123c2e1172 100644 --- a/include/lbann/layers/transform/gaussian.hpp +++ b/include/lbann/layers/transform/gaussian.hpp @@ -28,30 +28,38 @@ #define LBANN_LAYER_GAUSSIAN_HPP_INCLUDED #include "lbann/layers/transform/transform.hpp" +#include "lbann/models/model.hpp" #include "lbann/utils/random.hpp" namespace lbann { -/** @brief Random values with Gaussian distribution. - * - * During validation and testing, outputs are all equal to the - * distribution mean. - */ -template -class gaussian_layer : public transform_layer { +/** @brief Random values from Gaussian/normal distribution. */ +template +class gaussian_layer : public transform_layer { private: - /** Gaussian distribution mean. */ - DataType m_mean; - /** Gaussian distribution standard deviation. */ - DataType m_stdev; + /** @brief Gaussian distribution mean. */ + TensorDataType m_mean; + /** @brief Gaussian distribution standard deviation. */ + TensorDataType m_stdev; + /** @brief Whether to have deterministic output when not training. + * + * Applies to execution modes other than training, e.g. validation + * and inference. If true, outputs are all equal to the + * distribution mean when not training. 
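 * For instance, with mean 0, standard deviation 1, and training_only
 * enabled, the layer emits N(0,1) samples during training and an all-zero
 * tensor during validation and testing; with training_only disabled it
 * draws fresh samples in every execution mode.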
+ */ + bool m_training_only; public: gaussian_layer(lbann_comm *comm, const std::vector& dims, - DataType mean = DataType(0), - DataType stdev = DataType(1)) - : transform_layer(comm), m_mean(mean), m_stdev(stdev) { - set_output_dims(dims); + TensorDataType mean = El::TypeTraits::Zero(), + TensorDataType stdev = El::TypeTraits::One(), + bool training_only = false) + : transform_layer(comm), + m_mean(mean), m_stdev(stdev), m_training_only(training_only) { + this->set_output_dims(dims); this->m_expected_num_parent_layers = 0; } gaussian_layer* copy() const override { return new gaussian_layer(*this); } @@ -60,25 +68,37 @@ class gaussian_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); desc.add("Mean", m_mean); desc.add("Standard deviation", m_stdev); + desc.add("Training only", m_training_only); return desc; } protected: void fp_compute() override { - auto& output = get_activations(); - if (this->m_model->get_execution_mode() == execution_mode::training) { - gaussian_fill(output, output.Height(), output.Width(), m_mean, m_stdev); - } else { + auto& output = this->get_activations(); + const auto& mode = this->m_model->get_execution_context().get_execution_mode(); + if (m_training_only && (mode != execution_mode::training)) { El::Fill(output, m_mean); } + else { + gaussian_fill(output, output.Height(), output.Width(), m_mean, m_stdev); + } } }; +#ifndef LBANN_GAUSSIAN_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class gaussian_layer; \ + extern template class gaussian_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_GAUSSIAN_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_GAUSSIAN_HPP_INCLUDED diff --git a/include/lbann/layers/transform/hadamard.hpp b/include/lbann/layers/transform/hadamard.hpp index 04426334b91..7987ef0b72f 100644 --- a/include/lbann/layers/transform/hadamard.hpp +++ b/include/lbann/layers/transform/hadamard.hpp @@ -34,11 +34,13 @@ namespace lbann { /** @brief Entry-wise tensor product. 
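 * (With two parents a and b of matching dimensions the output is the
 * entry-wise product a(i) * b(i); with more parents the products are
 * accumulated pairwise in fp_compute, and in backprop each parent's
 * gradient is the output gradient multiplied entry-wise by the other
 * parents' activations.)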
*/ -template -class hadamard_layer : public transform_layer { +template +class hadamard_layer : public transform_layer { public: - hadamard_layer(lbann_comm *comm) : transform_layer(comm) { + hadamard_layer(lbann_comm *comm) : transform_layer(comm) { this->m_expected_num_parent_layers = -1; // No limit on parents } @@ -50,29 +52,29 @@ class hadamard_layer : public transform_layer { protected: void setup_pointers() override { - transform_layer::setup_pointers(); - if (get_num_parents() < 1) { + transform_layer::setup_pointers(); + if (this->get_num_parents() < 1) { std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "has no parent layers"; LBANN_ERROR(err.str()); } } - void setup_dims() override { - transform_layer::setup_dims(); - set_output_dims(get_input_dims()); + void setup_dims(DataReaderMetaData& dr_metadata) override { + transform_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); // Check that input dimensions match - const auto& output_dims = get_output_dims(); - for (int i = 0; i < get_num_parents(); ++i) { - if (get_input_dims(i) != output_dims) { - const auto& parents = get_parent_layers(); + const auto& output_dims = this->get_output_dims(); + for (int i = 0; i < this->get_num_parents(); ++i) { + if (this->get_input_dims(i) != output_dims) { + const auto& parents = this->get_parent_layers(); std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "has input tensors with incompatible dimensions ("; - for (int j = 0; j < get_num_parents(); ++j) { - const auto& dims = get_input_dims(j); + for (int j = 0; j < this->get_num_parents(); ++j) { + const auto& dims = this->get_input_dims(j); err << (j > 0 ? 
", " : "") << "layer \"" << parents[j]->get_name() << "\" outputs "; for (size_t k = 0; k < dims.size(); ++k) { @@ -87,35 +89,35 @@ class hadamard_layer : public transform_layer { } void fp_compute() override { - auto& output = get_activations(); - switch (get_num_parents()) { - case 0: El::Fill(output, DataType(1)); break; - case 1: El::LockedView(output, get_prev_activations()); break; + auto& output = this->get_activations(); + switch (this->get_num_parents()) { + case 0: El::Fill(output, El::TypeTraits::One()); break; + case 1: El::LockedView(output, this->get_prev_activations()); break; default: - El::Hadamard(get_prev_activations(0), - get_prev_activations(1), + El::Hadamard(this->get_prev_activations(0), + this->get_prev_activations(1), output); - for (int i = 2; i < get_num_parents(); ++i) { - El::Hadamard(get_prev_activations(i), output, output); + for (int i = 2; i < this->get_num_parents(); ++i) { + El::Hadamard(this->get_prev_activations(i), output, output); } } } void bp_compute() override { - const int num_parents = get_num_parents(); - const auto& gradient_wrt_output = get_prev_error_signals(); + const int num_parents = this->get_num_parents(); + const auto& gradient_wrt_output = this->get_prev_error_signals(); switch (num_parents) { case 0: break; case 1: - El::LockedView(get_error_signals(), gradient_wrt_output); + El::LockedView(this->get_error_signals(), gradient_wrt_output); break; default: for (int i = 0; i < num_parents; ++i) { - auto& gradient_wrt_input = get_error_signals(i); + auto& gradient_wrt_input = this->get_error_signals(i); El::Copy(gradient_wrt_output, gradient_wrt_input); for (int j = 0; j < num_parents; ++j) { if (i != j) { - El::Hadamard(get_prev_activations(j), + El::Hadamard(this->get_prev_activations(j), gradient_wrt_input, gradient_wrt_input); } @@ -126,6 +128,17 @@ class hadamard_layer : public transform_layer { }; +LBANN_DEFINE_LAYER_BUILDER(hadamard); + +#ifndef LBANN_HADAMARD_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class hadamard_layer; \ + extern template class hadamard_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_HADAMARD_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_HADAMARD_HPP_INCLUDED diff --git a/include/lbann/layers/transform/in_top_k.hpp b/include/lbann/layers/transform/in_top_k.hpp index 85abe8caba4..26852889c3c 100644 --- a/include/lbann/layers/transform/in_top_k.hpp +++ b/include/lbann/layers/transform/in_top_k.hpp @@ -38,12 +38,14 @@ namespace lbann { * one and the rest to zero. Ties are broken in favor of entries with * smaller indices. 
*/ -template -class in_top_k_layer : public transform_layer { +template +class in_top_k_layer : public transform_layer { public: in_top_k_layer(lbann_comm *comm, El::Int k) - : transform_layer(comm), m_k(k) { + : transform_layer(comm), m_k(k) { if (m_k < 0) { std::stringstream err; err << "invalid parameter for top-k search (k=" << m_k << ")"; @@ -57,16 +59,16 @@ class in_top_k_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); desc.add("k", m_k); return desc; } protected: - void setup_dims() override { - Layer::setup_dims(); - set_output_dims(get_input_dims()); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); } void fp_compute() override; @@ -78,6 +80,15 @@ class in_top_k_layer : public transform_layer { }; +#ifndef LBANN_IN_TOP_K_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class in_top_k_layer; \ + extern template class in_top_k_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_IN_TOP_K_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_IN_TOP_K_HPP_INCLUDED diff --git a/include/lbann/layers/transform/pooling.hpp b/include/lbann/layers/transform/pooling.hpp index abf6689aa82..35db88a633c 100644 --- a/include/lbann/layers/transform/pooling.hpp +++ b/include/lbann/layers/transform/pooling.hpp @@ -33,19 +33,42 @@ #include "lbann/utils/cudnn.hpp" #include "lbann/utils/exception.hpp" #include "lbann/utils/im2col.hpp" +#include "lbann/utils/distconv.hpp" namespace lbann { +#ifdef LBANN_HAS_DISTCONV +template +class pooling_distconv_adapter : public data_type_distconv_adapter { + public: + using TensorDevType = typename data_type_distconv_adapter::TensorDevType; + pooling_distconv_adapter(Layer& layer): data_type_distconv_adapter(layer) {} + virtual ~pooling_distconv_adapter() = default; + void setup_distributions(tensor_overlap_constraints &constraints) override; + dc::Shape get_activations_local_shape(int index=0) const override; + void setup_layer(size_t workspace_capacity) override; + void fp_compute(); + void bp_compute(); + std::unique_ptr> m_pooling; +}; +#endif // LBANN_HAS_DISTCONV + // Forward declaration -template +template class unpooling_layer; -template -class pooling_layer : public transform_layer { +template +class pooling_layer : public transform_layer { + static_assert(T_layout == data_layout::DATA_PARALLEL, + "pooling only supports DATA_PARALLEL"); private: /** Pooling mode. */ - const pool_mode m_pool_mode; + pool_mode m_pool_mode; /** Pooling window dimensions. */ std::vector m_pool_dims; @@ -67,10 +90,10 @@ class pooling_layer : public transform_layer { /** Pooling descriptor. */ cudnnPoolingDescriptor_t m_pooling_cudnn_desc; /** Tensor cuDNN descriptors. 
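A standalone illustration of the selection rule described in the in_top_k comment above: entries among the k largest map to one, everything else to zero, and ties go to the smaller index. This is a CPU sketch with an invented helper name, not the layer's actual fp_compute.

#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

std::vector<float> in_top_k_sketch(const std::vector<float>& x, std::size_t k) {
  std::vector<std::size_t> idx(x.size());
  std::iota(idx.begin(), idx.end(), 0);
  // A stable sort keeps equal values in their original order, so ties are
  // broken in favor of smaller indices, as the layer documents.
  std::stable_sort(idx.begin(), idx.end(),
                   [&x](std::size_t a, std::size_t b) { return x[a] > x[b]; });
  std::vector<float> y(x.size(), 0.0f);
  for (std::size_t i = 0; i < std::min(k, idx.size()); ++i) { y[idx[i]] = 1.0f; }
  return y;
}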
*/ - cudnn::data_parallel_layer_tensor_manager m_tensors_cudnn_desc; + cudnn::data_parallel_layer_tensor_manager m_tensors_cudnn_desc; #endif // LBANN_HAS_CUDNN - friend class unpooling_layer; + friend class unpooling_layer; public: @@ -93,7 +116,7 @@ class pooling_layer : public transform_layer { std::vector pads, std::vector strides, pool_mode mode) - : transform_layer(comm), + : transform_layer(comm), m_pool_mode(mode), m_pool_dims(pool_dims), m_pads(pads), @@ -103,9 +126,6 @@ class pooling_layer : public transform_layer { m_tensors_cudnn_desc(this) #endif // LBANN_HAS_CUDNN { - static_assert(T_layout == data_layout::DATA_PARALLEL, - "pooling only supports DATA_PARALLEL"); - // Initialize input dimensions and pooling parameters m_pool_size = std::accumulate(m_pool_dims.begin(), m_pool_dims.end(), @@ -115,7 +135,7 @@ class pooling_layer : public transform_layer { } pooling_layer(const pooling_layer& other) - : transform_layer(other), + : transform_layer(other), m_pool_mode(other.m_pool_mode), m_pool_dims(other.m_pool_dims), m_pool_size(other.m_pool_size), @@ -134,7 +154,7 @@ class pooling_layer : public transform_layer { } pooling_layer& operator=(const pooling_layer& other){ - transform_layer::operator=(other); + transform_layer::operator=(other); m_pool_mode = other.m_pool_mode; m_pool_dims = other.m_pool_dims; m_pool_size = other.m_pool_size; @@ -163,7 +183,7 @@ class pooling_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); std::stringstream ss; // Pool mode @@ -210,21 +230,21 @@ class pooling_layer : public transform_layer { protected: - void setup_dims() override { - transform_layer::setup_dims(); - const auto& input_dims = get_input_dims(); + void setup_dims(DataReaderMetaData& dr_metadata) override { + transform_layer::setup_dims(dr_metadata); + const auto& input_dims = this->get_input_dims(); auto output_dims = input_dims; for(size_t i = 0; i < output_dims.size() - 1; ++i) { const int effective_dim = (input_dims[i+1] + 2 * m_pads[i] - m_pool_dims[i] + 1); output_dims[i+1] = (effective_dim + m_strides[i] - 1) / m_strides[i]; } - set_output_dims(output_dims); + this->set_output_dims(output_dims); } /// Initialize GPU objects void setup_gpu() override { - transform_layer::setup_gpu(); + transform_layer::setup_gpu(); #ifndef LBANN_HAS_CUDNN LBANN_ERROR("cuDNN not detected"); #else @@ -262,6 +282,12 @@ class pooling_layer : public transform_layer { void fp_compute() override { if(this->using_gpus()) { +#ifdef LBANN_HAS_DISTCONV + if (this->distconv_enabled()) { + get_distconv_adapter().fp_compute(); + return; + } +#endif // LBANN_HAS_DISTCONV fp_compute_cudnn(); } else { fp_compute_im2col(); @@ -270,6 +296,12 @@ class pooling_layer : public transform_layer { void bp_compute() override { if(this->using_gpus()) { +#ifdef LBANN_HAS_DISTCONV + if (this->distconv_enabled()) { + get_distconv_adapter().bp_compute(); + return; + } +#endif // LBANN_HAS_DISTCONV bp_compute_cudnn(); } else { bp_compute_im2col(); @@ -283,11 +315,12 @@ class pooling_layer : public transform_layer { #ifndef LBANN_HAS_CUDNN LBANN_ERROR("cuDNN not detected"); #else - const auto& local_input = get_local_prev_activations(); - auto& local_output = get_local_activations(); + using ScalingType = cudnn::ScalingParamType; + const auto& local_input = this->get_local_prev_activations(); + auto& local_output = 
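The setup_dims arithmetic in the pooling layer above derives each spatial output extent from the input extent, padding, window size, and stride. As a worked check: input 7, pad 1, window 3, stride 2 gives an effective extent of 7 + 2 - 3 + 1 = 7 and an output extent of (7 + 2 - 1) / 2 = 4. The helper below just restates that formula.

// Output extent per spatial dimension, matching the computation in setup_dims:
// ceil((in + 2*pad - window + 1) / stride) via integer arithmetic.
inline int pooled_extent(int in, int pad, int window, int stride) {
  const int effective = in + 2 * pad - window + 1;
  return (effective + stride - 1) / stride;  // integer ceiling division
}
// e.g. pooled_extent(7, 1, 3, 2) == 4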
this->get_local_activations(); if (local_input.Height() > 0 && local_input.Width() > 0) { - const DataType zero = DataType(0); - const DataType one = DataType(1); + const auto zero = El::TypeTraits::Zero(); + const auto one = El::TypeTraits::One(); CHECK_CUDNN(cudnnPoolingForward(cudnn::get_handle(), m_pooling_cudnn_desc, &one, @@ -305,15 +338,16 @@ class pooling_layer : public transform_layer { #ifndef LBANN_HAS_CUDNN LBANN_ERROR("cuDNN not detected"); #else - const auto& local_input = get_local_prev_activations(); - const auto& local_output = get_local_activations(); - const auto& local_gradient_wrt_output = get_local_prev_error_signals(); - auto& local_gradient_wrt_input = get_local_error_signals(); + using ScalingType = cudnn::ScalingParamType; + const auto& local_input = this->get_local_prev_activations(); + const auto& local_output = this->get_local_activations(); + const auto& local_gradient_wrt_output = this->get_local_prev_error_signals(); + auto& local_gradient_wrt_input = this->get_local_error_signals(); if (local_input.Height() > 0 && local_input.Width() > 0) { // Useful constants - const DataType one = DataType(1); - const DataType zero = DataType(0); + const auto one = El::TypeTraits::One(); + const auto zero = El::TypeTraits::Zero(); // Perform backprop on GPU CHECK_CUDNN(cudnnPoolingBackward(cudnn::get_handle(), @@ -340,23 +374,23 @@ class pooling_layer : public transform_layer { } // Local matrices - const auto& local_input = get_local_prev_activations(); - auto& local_output = get_local_activations(); + const auto& local_input = this->get_local_prev_activations(); + auto& local_output = this->get_local_activations(); // Pool parameters const int local_width = local_input.Width(); - const auto& input_dims = get_input_dims(); + const auto& input_dims = this->get_input_dims(); const int num_channels = input_dims[0]; - const int num_per_output_channel = get_output_size() / num_channels; + const int num_per_output_channel = this->get_output_size() / num_channels; // Initialize max pool indices if needed if(m_pool_mode == pool_mode::max) { - m_max_pool_indices.assign(get_output_size() * local_width, 0); + m_max_pool_indices.assign(this->get_output_size() * local_width, 0); } // Initialize matrices - DMat im2col_mat(m_pool_size * num_channels, num_per_output_channel); - DMat input_mat; + El::Matrix im2col_mat(m_pool_size * num_channels, num_per_output_channel); + El::Matrix input_mat; // Iterate through data samples for(int sample = 0; sample < local_width; ++sample) { @@ -364,7 +398,7 @@ class pooling_layer : public transform_layer { // Construct im2col matrix from input El::LockedView(input_mat, local_input, El::ALL, El::IR(sample)); - im2col(input_mat, + im2col(input_mat, im2col_mat, num_channels, input_dims.size() - 1, @@ -375,16 +409,16 @@ class pooling_layer : public transform_layer { if(m_pool_mode == pool_mode::max) { // Apply max pooling - DataType *output_buffer = local_output.Buffer(0, sample); - int *indices_buffer = &m_max_pool_indices[sample * get_output_size()]; + TensorDataType *output_buffer = local_output.Buffer(0, sample); + int *indices_buffer = &m_max_pool_indices[sample * this->get_output_size()]; LBANN_OMP_PARALLEL_FOR for(int channel = 0; channel < num_channels; ++channel) { for(int j = 0; j < num_per_output_channel; ++j) { - DataType *im2col_buffer = im2col_mat.Buffer(channel*m_pool_size, j); - DataType max_entry = im2col_buffer[0]; + TensorDataType *im2col_buffer = im2col_mat.Buffer(channel*m_pool_size, j); + TensorDataType max_entry = im2col_buffer[0]; 
int max_index = 0; for(int i = 1; i < m_pool_size; ++i) { - const DataType current_entry = im2col_buffer[i]; + const TensorDataType current_entry = im2col_buffer[i]; if(current_entry > max_entry) { max_entry = current_entry; max_index = i; @@ -399,13 +433,13 @@ class pooling_layer : public transform_layer { if(m_pool_mode == pool_mode::average) { // Apply average pooling - DataType *output_buffer = local_output.Buffer(0, sample); + TensorDataType *output_buffer = local_output.Buffer(0, sample); LBANN_OMP_PARALLEL_FOR for(int channel = 0; channel < num_channels; ++channel) { for(int j = 0; j < num_per_output_channel; ++j) { - const DataType *im2col_buffer + const TensorDataType *im2col_buffer = im2col_mat.LockedBuffer(channel*m_pool_size, j); - DataType output_entry = 0; + TensorDataType output_entry = El::TypeTraits::Zero(); for(int i = 0; i < m_pool_size; ++i) { output_entry += im2col_buffer[i]; } @@ -422,23 +456,24 @@ class pooling_layer : public transform_layer { /// Pooling forward propagation with im2col void bp_compute_im2col() { + using CPUMatType = El::Matrix; if(m_pool_mode != pool_mode::max && m_pool_mode != pool_mode::average) { LBANN_ERROR("CPU pooling layer only supports max and average pooling"); } // Local matrices - const auto& local_gradient_wrt_output = get_local_prev_error_signals(); - auto& local_gradient_wrt_input = get_local_error_signals(); + const auto& local_gradient_wrt_output = this->get_local_prev_error_signals(); + auto& local_gradient_wrt_input = this->get_local_error_signals(); // Pool parameters const int local_width = local_gradient_wrt_output.Width(); - const auto& input_dims = get_input_dims(); + const auto& input_dims = this->get_input_dims(); const int num_channels = input_dims[0]; - const int num_per_input_channel = get_output_size() / num_channels; + const int num_per_input_channel = this->get_output_size() / num_channels; // Initialize matrices - CPUMat im2col_mat(m_pool_size * num_channels, num_per_input_channel); - CPUMat gradient_wrt_input_col; + CPUMatType im2col_mat(m_pool_size * num_channels, num_per_input_channel); + CPUMatType gradient_wrt_input_col; // Iterate through data samples for(int sample = 0; sample < local_width; ++sample) { @@ -451,16 +486,16 @@ class pooling_layer : public transform_layer { // Copy previous error signal to im2col matrix entries // corresponding to max - const DataType *gradient_wrt_output_buffer + const TensorDataType *gradient_wrt_output_buffer = local_gradient_wrt_output.LockedBuffer(0, sample); const int *indices_buffer - = &m_max_pool_indices[sample * get_output_size()]; + = &m_max_pool_indices[sample * this->get_output_size()]; LBANN_OMP_PARALLEL_FOR for(int channel = 0; channel < num_channels; ++channel) { for(int j = 0; j < num_per_input_channel; ++j) { const int input_index = j + channel * num_per_input_channel; const int max_index = indices_buffer[input_index]; - DataType *im2col_buffer = im2col_mat.Buffer(channel*m_pool_size, j); + TensorDataType *im2col_buffer = im2col_mat.Buffer(channel*m_pool_size, j); im2col_buffer[max_index] = gradient_wrt_output_buffer[input_index]; } @@ -470,15 +505,15 @@ class pooling_layer : public transform_layer { // Compute gradient w.r.t. 
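A compact sketch of the CPU max-pooling pass shown above: each pooling window (a column of the im2col matrix) yields its maximum, and the winning offset is recorded so bp_compute_im2col can route the incoming gradient back to that entry. Plain nested vectors stand in for the im2col matrix; the function name is illustrative.

#include <cstddef>
#include <vector>

// One pooling window per entry of `windows`. Returns the pooled values;
// `argmax` records the offset of the winner in each window for backprop.
std::vector<float>
max_pool_windows(const std::vector<std::vector<float>>& windows,
                 std::vector<int>& argmax) {
  std::vector<float> out(windows.size());
  argmax.assign(windows.size(), 0);
  for (std::size_t j = 0; j < windows.size(); ++j) {
    float best = windows[j][0];
    int best_i = 0;
    for (std::size_t i = 1; i < windows[j].size(); ++i) {
      if (windows[j][i] > best) { best = windows[j][i]; best_i = static_cast<int>(i); }
    }
    out[j] = best;
    argmax[j] = best_i;
  }
  return out;
}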
im2col matrix for average pooling if(m_pool_mode == pool_mode::average) { - const DataType *gradient_wrt_output_buffer + const TensorDataType *gradient_wrt_output_buffer = local_gradient_wrt_output.LockedBuffer(0, sample); LBANN_OMP_PARALLEL_FOR for(int channel = 0; channel < num_channels; ++channel) { for(int j = 0; j < num_per_input_channel; ++j) { - DataType *im2col_buffer = im2col_mat.Buffer(channel*m_pool_size, j); + TensorDataType *im2col_buffer = im2col_mat.Buffer(channel*m_pool_size, j); const int input_index = j + channel * num_per_input_channel; - const DataType output_entry - = gradient_wrt_output_buffer[input_index] / m_pool_size; + const TensorDataType output_entry + = gradient_wrt_output_buffer[input_index] / El::To(m_pool_size); for(int i = 0; i < m_pool_size; ++i) { im2col_buffer[i] = output_entry; } @@ -490,7 +525,7 @@ class pooling_layer : public transform_layer { // Compute error signal (i.e. gradient w.r.t. input) El::View(gradient_wrt_input_col, local_gradient_wrt_input, El::ALL, El::IR(sample)); - col2im(im2col_mat, + col2im(im2col_mat, gradient_wrt_input_col, num_channels, input_dims.size() - 1, @@ -503,6 +538,18 @@ class pooling_layer : public transform_layer { } +#ifdef LBANN_HAS_DISTCONV + friend class pooling_distconv_adapter; + protected: + bool is_distconv_supported() const override; + void setup_distconv_adapter() override { + this->get_distconv_adapter_ptr() = make_unique< + pooling_distconv_adapter>(*this); + } + pooling_distconv_adapter& get_distconv_adapter() override; + const pooling_distconv_adapter& get_distconv_adapter() const override; +#endif // LBANN_HAS_DISTCONV + #ifdef LBANN_HAS_CUDNN /** Copy pooling cuDNN descriptor. */ static void copy_pooling_cudnn_desc(const cudnnPoolingDescriptor_t& src, @@ -553,6 +600,187 @@ class pooling_layer : public transform_layer { }; +#ifdef LBANN_HAS_DISTCONV +template +pooling_distconv_adapter& +pooling_layer::get_distconv_adapter() { + return const_cast&>( + static_cast&>(*this).get_distconv_adapter()); +} + +template +const pooling_distconv_adapter& +pooling_layer::get_distconv_adapter() const { + return dynamic_cast&>( + data_type_layer::get_distconv_adapter()); +} + +template +bool pooling_layer::is_distconv_supported() const { + if (Dev != El::Device::GPU || T_layout != data_layout::DATA_PARALLEL) { + return false; + } + + bool cond = true; + for(int i = 0; i < dc::get_num_spatial_dims(*this); i++) { + cond &= (m_pool_dims[i] % 2 != 0) || + (m_pool_dims[i] == m_strides[i]); + } + if (!cond) { + dc::MPIPrintStreamDebug() << "pooling: unsupported due to window shape: " + << dc::util::join_xd_array(m_pool_dims); + return false; + } + + for (int i = 0; i < dc::get_num_spatial_dims(*this); i++) { + bool odd = m_pool_dims[i] % 2; + if (odd) { + int stencil = (m_pool_dims[i] - 1) / 2; + if (!(m_pads[i] == 0 || m_pads[i] == stencil)) { + dc::MPIPrintStreamDebug() << "pooling: unsupported due to padding: " + << m_pads[i]; + return false; + } + if (!(m_strides[i] == 1 || m_strides[i] == stencil + 1)) { + dc::MPIPrintStreamDebug() << "pooling: unsupported due to strides"; + return false; + } + } else { + if (m_pads[i] != 0) return false; + if (m_pool_dims[i] != m_strides[i]) return false; + } + } + + return true; +} + +template +void pooling_distconv_adapter:: +setup_distributions(tensor_overlap_constraints &constraints) { + data_type_distconv_adapter::setup_distributions( + constraints); + const auto &l = dynamic_cast&>( + this->layer()); + dc::IntVector overlap(dc::get_num_dims(l), 0); + const auto &ps = 
l.get_parallel_strategy(); + auto pool_dims = l.m_pool_dims; + std::reverse(pool_dims.begin(), pool_dims.end()); + for(int i = 0; i < dc::get_num_spatial_dims(l); i++) { + int splits = 0; + switch (i) { + case 0: splits = ps.width_splits; break; + case 1: splits = ps.height_splits; break; + case 2: splits = ps.depth_splits; break; + } + if(splits == 1) continue; + int ov = 0; + if (pool_dims[i] % 2) { + ov = (pool_dims[i] - 1) / 2; + } else { + // no halo dependency is assumed for now + ov = 0; + } + overlap[i] = ov; + } + auto &prev_activations_dist = this->get_prev_activations_dist(); + auto &activations_dist = this->get_activations_dist(); + auto &error_signals_dist = this->get_error_signals_dist(); + auto &prev_error_signals_dist = this->get_prev_error_signals_dist(); + prev_activations_dist.set_overlap(overlap); + constraints.mark_updated(prev_activations_dist); + constraints.mark_invariant(prev_activations_dist); + // cudnnPoolingBackward requires activations and + // prev_error_signals must have the same stride + constraints.mark_equivalent(activations_dist, prev_error_signals_dist); + // cudnnPoolingBackward requires prev_activations and + // error_signals must have the same stride + constraints.mark_equivalent(error_signals_dist, prev_activations_dist); +} + +template +dc::Shape pooling_distconv_adapter:: +get_activations_local_shape(int index) const { + assert_eq(index, 0); + const auto &layer = dynamic_cast&>(this->layer()); + auto filter_dims = layer.m_pool_dims; + std::reverse(std::begin(filter_dims), std::end(filter_dims)); + auto strides = layer.m_strides; + std::reverse(std::begin(strides), std::end(strides)); + const std::vector dilations( + dc::get_num_spatial_dims(layer), 1); + bool use_padding = layer.m_pads[0] != 0; + auto output_spatial_local_shape = + ::distconv::get_pooling_output_local_tensor_shape( + this->get_prev_activations(), filter_dims, strides, use_padding, dilations); + return output_spatial_local_shape; +} + +template +void pooling_distconv_adapter:: +setup_layer(size_t workspace_capacity) { + auto &l = dynamic_cast&>( + this->layer()); + + // Init the dc::Pooling layer + m_pooling = make_unique>( + dc::get_backend(), dc::get_num_dims(l), + dc::get_halo_exchange_method()); + + std::string mode; + switch(l.m_pool_mode) { + case pool_mode::max: + mode = "MAX"; break; + case pool_mode::average: + mode = "AVERAGE"; break; + case pool_mode::average_no_pad: + mode = "AVERAGE_NO_PAD"; break; + default: + LBANN_ERROR("pooling_layer: no DISTCONV implementation for pooling mode"); + } + + std::vector pool_dims = l.m_pool_dims; + std::reverse(pool_dims.begin(), pool_dims.end()); + std::vector pads = l.m_pads; + std::reverse(pads.begin(), pads.end()); + std::vector strides = l.m_strides; + std::reverse(strides.begin(), strides.end()); + + m_pooling->setup(this->get_prev_activations(), + this->get_activations(), + this->get_error_signals(), + this->get_prev_error_signals(), + pool_dims, pads, strides, + mode); +} + +template +void pooling_distconv_adapter:: +fp_compute() { + m_pooling->forward(TensorDataType{1}, this->get_prev_activations(), + TensorDataType{0}, this->get_activations()); +} + +template +void pooling_distconv_adapter:: +bp_compute() { + m_pooling->backward(TensorDataType{1}, this->get_activations(), + this->get_prev_error_signals(), + this->get_prev_activations(), TensorDataType{0}, + this->get_error_signals()); +} +#endif // LBANN_HAS_DISTCONV + +LBANN_DEFINE_LAYER_BUILDER(pooling); + +#ifndef LBANN_POOLING_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, 
Device) \ + extern template class pooling_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_POOLING_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_POOLING_HPP_INCLUDED diff --git a/include/lbann/layers/transform/reduction.hpp b/include/lbann/layers/transform/reduction.hpp index 15df56534e1..8cccc0ce13a 100644 --- a/include/lbann/layers/transform/reduction.hpp +++ b/include/lbann/layers/transform/reduction.hpp @@ -38,24 +38,26 @@ enum class reduction_mode {INVALID, SUM, AVERAGE}; * * @todo Reduction over specified dimensions. */ -template -class reduction_layer : public transform_layer { +template +class reduction_layer : public transform_layer { + static_assert(T_layout == data_layout::DATA_PARALLEL, + "reduction currently only supports DATA_PARALLEL"); private: /** Reduction mode. */ const reduction_mode m_mode; /** Vector composed of ones. */ - DMat m_ones; + El::Matrix m_ones; public: reduction_layer(lbann_comm *comm, reduction_mode mode) - : transform_layer(comm), + : transform_layer(comm), m_mode(mode) { - static_assert(T_layout == data_layout::DATA_PARALLEL, - "reduction currently only supports DATA_PARALLEL"); if (mode == reduction_mode::INVALID) { LBANN_ERROR("invalid reduction mode"); } @@ -67,7 +69,7 @@ class reduction_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); std::string mode_str; switch (m_mode) { case reduction_mode::SUM: mode_str = "sum"; break; @@ -82,16 +84,16 @@ class reduction_layer : public transform_layer { protected: - void setup_dims() override { - Layer::setup_dims(); - set_output_dims({1}); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims({1}); } void fp_compute() override { // Local matrices - const auto& local_input = get_local_prev_activations(); - auto& local_output = get_local_activations(); + const auto& local_input = this->get_local_prev_activations(); + auto& local_output = this->get_local_activations(); const El::Int input_size = local_input.Height(); // Apply reduction @@ -99,14 +101,15 @@ class reduction_layer : public transform_layer { case reduction_mode::SUM: El::Ones(m_ones, input_size, 1); El::Gemv(El::TRANSPOSE, - DataType(1), local_input, m_ones, - DataType(0), local_output); + El::TypeTraits::One(), local_input, m_ones, + El::TypeTraits::Zero(), local_output); break; case reduction_mode::AVERAGE: El::Ones(m_ones, input_size, 1); El::Gemv(El::TRANSPOSE, - DataType(1) / input_size, local_input, m_ones, - DataType(0), local_output); + El::TypeTraits::One() / El::To(input_size), + local_input, m_ones, + El::TypeTraits::Zero(), local_output); break; default: LBANN_ERROR("invalid reduction mode"); @@ -117,8 +120,8 @@ class reduction_layer : public transform_layer { void bp_compute() override { // Local matrices - const auto& local_gradient_wrt_output = get_local_prev_error_signals(); - auto& local_gradient_wrt_input = get_local_error_signals(); + const auto& local_gradient_wrt_output = this->get_local_prev_error_signals(); + auto& local_gradient_wrt_input = this->get_local_error_signals(); const El::Int input_size = local_gradient_wrt_input.Height(); // Compute gradients w.r.t. 
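The reduction layer above collapses each mini-batch sample to a single value by multiplying with a vector of ones (a sum), optionally scaled by one over the sample size (an average). A plain-C++ equivalent over a column-major buffer, with an illustrative name:

#include <cstddef>
#include <vector>

// Sum (or average) the entries of each column of X, i.e. ones^T * X per sample.
std::vector<double> reduce_columns(const std::vector<double>& X,
                                   std::size_t height, std::size_t width,
                                   bool average) {
  std::vector<double> out(width, 0.0);
  for (std::size_t col = 0; col < width; ++col) {
    for (std::size_t row = 0; row < height; ++row) {
      out[col] += X[col * height + row];
    }
    if (average) { out[col] /= static_cast<double>(height); }
  }
  return out;
}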
inputs @@ -126,14 +129,15 @@ class reduction_layer : public transform_layer { case reduction_mode::SUM: El::Ones(m_ones, input_size, 1); El::Gemm(El::NORMAL, El::NORMAL, - DataType(1), m_ones, local_gradient_wrt_output, - DataType(0), local_gradient_wrt_input); + El::TypeTraits::One(), m_ones, local_gradient_wrt_output, + El::TypeTraits::Zero(), local_gradient_wrt_input); break; case reduction_mode::AVERAGE: El::Ones(m_ones, input_size, 1); El::Gemm(El::NORMAL, El::NORMAL, - DataType(1) / input_size, m_ones, local_gradient_wrt_output, - DataType(0), local_gradient_wrt_input); + El::TypeTraits::One() / El::To(input_size), + m_ones, local_gradient_wrt_output, + El::TypeTraits::Zero(), local_gradient_wrt_input); break; default: LBANN_ERROR("invalid reduction mode"); @@ -143,6 +147,14 @@ class reduction_layer : public transform_layer { }; +#ifndef LBANN_REDUCTION_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class reduction_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_REDUCTION_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_REDUCTION_HPP_INCLUDED diff --git a/include/lbann/layers/transform/reshape.hpp b/include/lbann/layers/transform/reshape.hpp index 7770080ff69..55b933add64 100644 --- a/include/lbann/layers/transform/reshape.hpp +++ b/include/lbann/layers/transform/reshape.hpp @@ -36,13 +36,13 @@ namespace lbann { * Forward and backward prop simply involve setting up tensor views, * and hence are very cheap. */ -template -class reshape_layer : public transform_layer { +template +class reshape_layer : public transform_layer { public: reshape_layer(lbann_comm *comm, std::vector dims) - : transform_layer(comm) { - set_output_dims(dims); + : transform_layer(comm) { + this->set_output_dims(dims); } reshape_layer* copy() const override { return new reshape_layer(*this); } std::string get_type() const override { return "reshape"; } @@ -51,11 +51,11 @@ class reshape_layer : public transform_layer { protected: - void setup_dims() override { - transform_layer::setup_dims(); + void setup_dims(DataReaderMetaData& dr_metadata) override { + transform_layer::setup_dims(dr_metadata); - const auto& input_dims = get_input_dims(); - auto output_dims = get_output_dims(); + const auto& input_dims = this->get_input_dims(); + auto output_dims = this->get_output_dims(); // Determine any unspecified dimensions int unspecified_dim = -1; @@ -70,12 +70,12 @@ class reshape_layer : public transform_layer { output_dims.end(), 1, std::multiplies()); - output_dims[unspecified_dim] = get_input_size() / specified_size; - set_output_dims(output_dims); + output_dims[unspecified_dim] = this->get_input_size() / specified_size; + this->set_output_dims(output_dims); } // Check that reshape is valid - if (get_input_size() != get_output_size()) { + if (this->get_input_size() != this->get_output_size()) { std::stringstream err; err << "input tensor dimensions ("; for (size_t i = 0; i < input_dims.size(); ++i) { @@ -92,16 +92,25 @@ class reshape_layer : public transform_layer { } void fp_setup_outputs(El::Int mini_batch_size) override { - El::LockedView(get_activations(), get_prev_activations()); + El::LockedView(this->get_activations(), this->get_prev_activations()); } void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) override { - El::LockedView(get_error_signals(), get_prev_error_signals()); + El::LockedView(this->get_error_signals(), this->get_prev_error_signals()); } void fp_compute() override {} void bp_compute() override {} }; 
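The reshape layer's setup_dims above allows one output dimension to be left unspecified and infers it so that the total output size equals the input size. A standalone version of that inference, assuming (for this sketch) that a non-positive entry marks the unspecified dimension:

#include <cstddef>
#include <functional>
#include <numeric>
#include <stdexcept>
#include <vector>

std::vector<int> infer_reshape_dims(std::vector<int> dims, long long input_size) {
  int unspecified = -1;
  long long specified = 1;
  for (std::size_t i = 0; i < dims.size(); ++i) {
    if (dims[i] <= 0) { unspecified = static_cast<int>(i); }
    else { specified *= dims[i]; }
  }
  if (unspecified >= 0) {
    dims[unspecified] = static_cast<int>(input_size / specified);
  }
  const long long total = std::accumulate(dims.begin(), dims.end(), 1LL,
                                          std::multiplies<long long>());
  if (total != input_size) { throw std::runtime_error("invalid reshape"); }
  return dims;
}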
+#ifndef LBANN_RESHAPE_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class reshape_layer; \ + extern template class reshape_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_RESHAPE_LAYER_INSTANTIATE + } // namespace lbann #endif // RESHAPE_HPP_INCLUDED diff --git a/include/lbann/layers/transform/slice.hpp b/include/lbann/layers/transform/slice.hpp index 62143bc32b8..317b803ecc7 100644 --- a/include/lbann/layers/transform/slice.hpp +++ b/include/lbann/layers/transform/slice.hpp @@ -24,11 +24,14 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// -#ifndef LBANN_LAYER_SLICE_HPP_INCLUDED -#define LBANN_LAYER_SLICE_HPP_INCLUDED +#ifndef LBANN_LAYERS_TRANSFORM_SLICE_HPP_INCLUDED +#define LBANN_LAYERS_TRANSFORM_SLICE_HPP_INCLUDED -#include "lbann/layers/transform/transform.hpp" +#include "lbann/layers/data_type_layer.hpp" #include "lbann/utils/exception.hpp" +#include "lbann/data_readers/data_reader_jag_conduit.hpp" +#include "lbann/models/model.hpp" +#include "lbann/trainers/trainer.hpp" namespace lbann { @@ -44,248 +47,261 @@ namespace lbann { * \cdots\times D_n @f$ * tensor. */ -template -class slice_layer : public transform_layer { +template +class slice_layer : public data_type_layer { public: - slice_layer(lbann_comm *comm, - El::Int slice_dim, - std::vector slice_points) - : transform_layer(comm), - m_slice_dim(slice_dim), - m_slice_points(slice_points) { - this->m_expected_num_child_layers = -1; // No limit on children - } + slice_layer(lbann_comm *comm); + slice_layer(const slice_layer& other) = default; + slice_layer& operator=(const slice_layer& other) = default; - slice_layer(const slice_layer& other) - : transform_layer(other), - m_slice_dim(other.m_slice_dim), - m_slice_points(other.m_slice_points) { - m_input_v.reset(other.m_input_v ? other.m_input_v->Copy() : nullptr); - m_output_v.reset(other.m_output_v ? other.m_output_v->Copy() : nullptr); - } + slice_layer* copy() const override; + std::string get_type() const override; + data_layout get_data_layout() const override; + El::Device get_device_allocation() const override; - slice_layer& operator=(const slice_layer& other) { - transform_layer::operator=(other); - m_slice_dim = other.m_slice_dim; - m_slice_points = other.m_slice_points; - m_input_v.reset(other.m_input_v ? other.m_input_v->Copy() : nullptr); - m_output_v.reset(other.m_output_v ? other.m_output_v->Copy() : nullptr); + description get_description() const override; + + void setup_slice_points(size_t slice_dim, + std::vector slice_points) { + m_slice_dim = slice_dim; + m_slice_points = std::move(slice_points); } - slice_layer* copy() const override { return new slice_layer(*this); } - std::string get_type() const override { return "slice"; } - data_layout get_data_layout() const override { return T_layout; } - El::Device get_device_allocation() const override { return Dev; } - - /** Get slice points. */ - std::vector& get_slice_points() { return m_slice_points; } - /** Get slice points (const). */ - std::vector get_slice_points() const { return m_slice_points; } - - description get_description() const override { - auto&& desc = transform_layer::get_description(); - desc.add("Slice dimension", m_slice_dim); - std::stringstream ss; - for (size_t i = 0; i < m_slice_points.size(); ++i) { - ss << (i > 0 ? 
", " : "") << m_slice_points[i]; - } - desc.add("Slice points", ss.str()); - return desc; + void setup_slice_points(size_t slice_dim, + bool set_slice_points_from_data_reader, + const slice_points_mode var_category) { + m_slice_dim = slice_dim; + m_set_slice_points_from_data_reader = set_slice_points_from_data_reader; + m_var_category = var_category; } protected: - void setup_matrices(const El::Grid& grid) override { - transform_layer::setup_matrices(grid); - const auto& input = get_prev_activations(); - m_input_v.reset(input.Construct(input.Grid(), input.Root())); - m_output_v.reset(input.Construct(input.Grid(), input.Root())); - } + void setup_dims(DataReaderMetaData& dr_metadata) override; - void setup_dims() override { - transform_layer::setup_dims(); - const auto& input_dims = get_input_dims(); - const auto& num_outputs = get_num_children(); - - // Check that slice parameters are valid - std::stringstream err; - if (m_slice_dim < 0 || m_slice_dim >= (El::Int) input_dims.size()) { - err << get_type() << " layer \"" << get_name() << "\" " - << "has " << input_dims.size() << " dimensions, " - << "but attempted to slice along dimension " << m_slice_dim; - LBANN_ERROR(err.str()); - } - if ((int) m_slice_points.size() <= num_outputs) { - err << get_type() << " layer \"" << get_name() << "\" " - << "requires more slice points than output tensors " - << "(found " << m_slice_points.size() << " slice points " - << "and " << m_child_layers.size() << " output tensors)"; - LBANN_ERROR(err.str()); - } - if (!std::is_sorted(m_slice_points.begin(), m_slice_points.end())) { - err << get_type() << " layer \"" << get_name() << "\" " - << "has unsorted slice points"; - LBANN_ERROR(err.str()); - } - if (m_slice_points.front() < 0 - || m_slice_points.back() > input_dims[m_slice_dim]) { - err << get_type() << " layer \"" << get_name() << "\" " - << "expects slice points in the range " - << "[0, " << input_dims[m_slice_dim] << "], " - << "but found an invalid slice point "; - if (m_slice_points.front() < 0) { - err << "(" << m_slice_points.front() << ")"; - } else { - err << "(" << m_slice_points.back() << ")"; - } - LBANN_ERROR(err.str()); - } + void fp_setup_outputs(El::Int mini_batch_size) override; + void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) override; + void fp_compute() override; + void bp_compute() override; - // Set output tensor dimensions - auto output_dims = input_dims; - for (int i = 0; i < num_outputs; ++i) { - output_dims[m_slice_dim] = m_slice_points[i+1] - m_slice_points[i]; - set_output_dims(output_dims, i); - } +private: - } + /** Tensor dimension to slice. */ + size_t m_slice_dim; + /** Slice points for each child layer. */ + std::vector m_slice_points; + /** Slice points are automatically defined by the data reader */ + bool m_set_slice_points_from_data_reader; + /** Category for retrieving slice points from data reader */ + slice_points_mode m_var_category; + +#ifdef LBANN_HAS_GPU + /** @brief Workspace buffer. + * + * Parameters for CUDA kernels are copied into this buffer and + * asynchronously transferred to GPU. + */ + std::vector m_workspace; + /** @brief CUDA event for workspace buffer. + * + * Makes sure asynchronous GPU memory transfers are completed + * before modifying workspace buffer. 
+ */ + cuda::event_wrapper m_workspace_event; +#endif // LBANN_HAS_GPU + + template + friend void fp_setup_outputs_impl(slice_layer&); + template + friend void fp_compute_impl(slice_layer&); + template + friend void bp_compute_impl(slice_layer&); - void fp_setup_outputs(El::Int mini_batch_size) override { - const auto& num_outputs = get_num_children(); - const auto& input_dims = get_input_dims(); - - // Divide input tensor into unit slices along slice dimension - // Note: Each unit slice is divided into contiguous "unit blocks" - const auto& input_num_unit_slices = input_dims[m_slice_dim]; - const auto& blocks_per_slice - = std::accumulate(&input_dims[0], &input_dims[m_slice_dim], - 1, std::multiplies()); - const auto& unit_block_size - = std::accumulate(input_dims.begin() + m_slice_dim + 1, - input_dims.end(), - 1, std::multiplies()); - const auto& input_block_stride = (input_num_unit_slices - * unit_block_size); - - // Populate output tensors with slices of input tensor - const auto& input = get_prev_activations(); - for (int i = 0; i < num_outputs; ++i) { - const auto& output_dims = get_output_dims(i); - const auto& output_size = get_output_size(i); - auto& output = get_activations(i); - output.Empty(false); - - // Divide output tensor into unit slices - const auto& output_num_unit_slices = output_dims[m_slice_dim]; - - // Merge unit slices and get first contiguous input block - const auto& block_size = output_num_unit_slices * unit_block_size; - const auto& input_block_offset = m_slice_points[i] * unit_block_size; - El::LockedView(*m_input_v, input, - El::IR(input_block_offset, - input_block_offset + block_size), - El::ALL); - - // Populate output tensor one block at a time - // Note: If there is only one block, output can be a view - if (blocks_per_slice > 1) { - output.AlignWith(*m_input_v); - output.Resize(output_size, mini_batch_size); - for (int block = 0; block < blocks_per_slice; ++block) { - const auto& input_offset = (input_block_offset - + block * input_block_stride); - const auto& output_offset = block * block_size; - El::LockedView(*m_input_v, input, - El::IR(input_offset, input_offset + block_size), - El::ALL); - El::View(*m_output_v, output, - El::IR(output_offset, output_offset + block_size), - El::ALL); - El::Copy(*m_input_v, *m_output_v); - } - } else { - El::LockedView(output, *m_input_v); - } +}; +// ========================================================= +// Implementation +// ========================================================= + +template +slice_layer::slice_layer(lbann_comm *comm) + : data_type_layer(comm), + m_set_slice_points_from_data_reader(false), + m_var_category(slice_points_mode::NA) { + this->m_expected_num_child_layers = -1; // No limit on children +} + +template +slice_layer* slice_layer::copy() const { + return new slice_layer(*this); +} + +template +std::string slice_layer::get_type() const { + return "slice"; +} + +template +data_layout slice_layer::get_data_layout() const { + return Layout; +} + +template +El::Device slice_layer::get_device_allocation() const { + return Device; +} + +template +description slice_layer::get_description() const { + auto desc = data_type_layer::get_description(); + desc.add("Slice dimension", m_slice_dim); + std::ostringstream ss; + for (size_t i = 0; i < m_slice_points.size(); ++i) { + ss << (i > 0 ? 
", " : "") << m_slice_points[i]; + } + desc.add("Slice points", ss.str()); + return desc; +} + +template +void slice_layer::setup_dims(DataReaderMetaData& dr_metadata) { + data_type_layer::setup_dims(dr_metadata); + + // Setup the slice points if they are to be established by the data reader + if(m_set_slice_points_from_data_reader) { + std::vector slice_points; + std::string slice_point_method_name = "'get_slice_points_from_reader'"; + for (auto& slice_point + : dr_metadata.slice_points[m_var_category]) { + slice_points.push_back(slice_point); } + if (slice_points.size() < 2u) { + LBANN_ERROR(slice_point_method_name, " is not supported by the reader."); + return; + } + m_slice_points = std::move(slice_points); } - void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) override { - const auto& num_outputs = get_num_children(); - const auto& input_dims = get_input_dims(); - - // Initialize gradient w.r.t. input tensor - auto& gradient_wrt_input = get_error_signals(); - gradient_wrt_input.Empty(false); - gradient_wrt_input.AlignWith(get_prev_activations()); - gradient_wrt_input.Resize(get_input_size(), mini_batch_size); - if (m_slice_points[0] != 0 - || m_slice_points[num_outputs] != input_dims[m_slice_dim]) { - El::Zero(gradient_wrt_input); + // Check that slice parameters are valid + const auto& input_dims = this->get_input_dims(); + const size_t num_outputs = this->get_num_children(); + if (m_slice_dim >= input_dims.size()) { + std::ostringstream err; + err << this->get_type() << " layer \"" << this->get_name() << "\" " + << "is slicing along dimension " << m_slice_dim << ", " + << "but it has a " << input_dims.size() << "-D input tensor " + << "(parent layer \"" << this->get_parent_layers()[0]->get_name() << "\" " + << "outputs with dimensions "; + for (size_t d=0; d0 ? " x " : "") << input_dims[d]; } + err << ")"; + LBANN_ERROR(err.str()); + } + if (m_slice_points.size() <= num_outputs) { + LBANN_ERROR(this->get_type()," layer \"",this->get_name(),"\" ", + "has ",num_outputs," children, " + "but only ",m_slice_points.size()," slice points"); + } + if (!std::is_sorted(m_slice_points.begin(), m_slice_points.end())) { + LBANN_ERROR(this->get_type()," layer \"",this->get_name(),"\" ", + "has unsorted slice points"); + } + if (m_slice_points.back() > static_cast(input_dims[m_slice_dim])) { + LBANN_ERROR(this->get_type()," layer \"",this->get_name(),"\" ", + "has a slice point of ",m_slice_points.back(),", ", + "which is outside the expected range " + "[0 ",input_dims[m_slice_dim],"]"); + } - // Divide input tensor into unit slices along slice dimension - // Note: Each unit slice is divided into contiguous "unit blocks" - const auto& input_num_unit_slices = input_dims[m_slice_dim]; - const auto& blocks_per_slice - = std::accumulate(&input_dims[0], &input_dims[m_slice_dim], - 1, std::multiplies()); - const auto& unit_block_size - = std::accumulate(input_dims.begin() + m_slice_dim + 1, - input_dims.end(), - 1, std::multiplies()); - const auto& input_block_stride = (input_num_unit_slices - * unit_block_size); - - // Populate slices of gradient w.r.t. 
input tensor - for (int i = 0; i < num_outputs; ++i) { - const auto& output_dims = get_output_dims(i); - const auto& gradient_wrt_output = get_prev_error_signals(i); - - // Divide output tensor into unit slices - const auto& output_num_unit_slices = output_dims[m_slice_dim]; - - // Merge unit slices - const auto& block_size = output_num_unit_slices * unit_block_size; - const auto& input_block_offset = m_slice_points[i] * unit_block_size; - - // Populate gradient w.r.t. input tensor one block at a time - for (int block = 0; block < blocks_per_slice; ++block) { - const auto& input_offset = (input_block_offset - + block * input_block_stride); - const auto& output_offset = block * block_size; - El::LockedView(*m_output_v, gradient_wrt_output, - El::IR(output_offset, output_offset + block_size), - El::ALL); - El::View(*m_input_v, gradient_wrt_input, - El::IR(input_offset, input_offset + block_size), - El::ALL); - El::Copy(*m_output_v, *m_input_v); - } - - } + // Model-parallel implementation only supports flat data + if (Layout == data_layout::MODEL_PARALLEL && input_dims.size() != 1) { + LBANN_ERROR(this->get_type()," layer \"",this->get_name(),"\" ", + "attempted to slice along dimension ",m_slice_dim,", ", + "but model-parallel slice layer only supports flat data"); + } + // Set output tensor dimensions + auto output_dims = input_dims; + for (size_t i = 0; i < num_outputs; ++i) { + output_dims[m_slice_dim] = m_slice_points[i+1] - m_slice_points[i]; + this->set_output_dims(output_dims, i); } - void fp_compute() override {} - void bp_compute() override {} +} + +template +void fp_setup_outputs_impl( + slice_layer& l) { + + // Slice Elemental matrices + // Note: Assume each mini-batch sample is flat. + const size_t num_outputs = l.get_num_children(); + const auto& input = l.get_prev_activations(); + size_t offset = l.m_slice_points.front(); + for (size_t j=0; j m_slice_points; +template +void fp_setup_outputs_impl( + slice_layer& l) { - /** View into input tensor. */ - std::unique_ptr m_input_v; - /** View into output tensor. 
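For the flat (model-parallel) case handled by fp_setup_outputs_impl above, the slice points simply delimit contiguous ranges of the input, one range per child layer. A sketch of that partitioning over a plain vector; the real layer hands out views rather than copies.

#include <cstddef>
#include <vector>

// Slice points {0, 3, 5} over a length-5 sample yield pieces [0,3) and [3,5),
// so a layer with k children needs k+1 slice points.
std::vector<std::vector<float>>
slice_flat(const std::vector<float>& input,
           const std::vector<std::size_t>& points) {
  std::vector<std::vector<float>> outputs;
  for (std::size_t i = 0; i + 1 < points.size(); ++i) {
    outputs.emplace_back(input.begin() + points[i], input.begin() + points[i + 1]);
  }
  return outputs;
}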
*/ - std::unique_ptr m_output_v; + const size_t num_outputs = l.get_num_children(); + const auto& input = l.get_prev_activations(); + for (size_t j=0; j +void slice_layer::fp_setup_outputs(El::Int mini_batch_size) { + fp_setup_outputs_impl(*this); +} + +template +void slice_layer::fp_compute() { + fp_compute_impl(*this); +} + +template +void slice_layer::bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) { + const auto& output0_grad = this->get_prev_error_signals(0); + auto& input_grad = this->get_error_signals(); + input_grad.Empty(false); + input_grad.AlignWith(output0_grad); + El::Zeros(input_grad, this->get_input_size(), output0_grad.Width()); +} + +template +void slice_layer::bp_compute() { + bp_compute_impl(*this); +} + +#ifndef LBANN_SLICE_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class slice_layer< \ + T, data_layout::DATA_PARALLEL, Device>; \ + extern template class slice_layer< \ + T, data_layout::MODEL_PARALLEL, Device> + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_SLICE_LAYER_INSTANTIATE } // namespace lbann -#endif // LBANN_LAYER_SLICE_HPP_INCLUDED +#endif // LBANN_LAYERS_TRANSFORM_SLICE_HPP_INCLUDED diff --git a/include/lbann/layers/transform/sort.hpp b/include/lbann/layers/transform/sort.hpp index 8d04e25a795..91d10f8f5df 100644 --- a/include/lbann/layers/transform/sort.hpp +++ b/include/lbann/layers/transform/sort.hpp @@ -32,17 +32,19 @@ namespace lbann { /** @brief Sort tensor entries. */ -template -class sort_layer : public transform_layer { +template +class sort_layer : public transform_layer { + static_assert(T_layout == data_layout::DATA_PARALLEL, + "sort layer only supports DATA_PARALLEL"); public: sort_layer(lbann_comm *comm, bool descending = false) - : transform_layer(comm), m_descending(descending) { - static_assert(T_layout == data_layout::DATA_PARALLEL, - "sort layer only supports DATA_PARALLEL"); + : transform_layer(comm), m_descending(descending) { } sort_layer(const sort_layer& other) - : transform_layer(other), + : transform_layer(other), m_descending(other.m_descending) { if (other.m_indices) { switch (other.m_indices->GetDevice()) { @@ -60,7 +62,7 @@ class sort_layer : public transform_layer { } } sort_layer& operator=(const sort_layer& other) { - transform_layer::operator=(other); + transform_layer::operator=(other); m_descending = other.m_descending; if (!other.m_indices) { m_indices.reset(nullptr); @@ -87,21 +89,21 @@ class sort_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); desc.add("Descending", m_descending); return desc; } protected: - void setup_dims() override { - transform_layer::setup_dims(); - set_output_dims(get_input_dims()); + void setup_dims(DataReaderMetaData& dr_metadata) override { + transform_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); } void setup_matrices(const El::Grid& grid) override { - transform_layer::setup_matrices(grid); - const auto& dist = get_activations().DistData(); + transform_layer::setup_matrices(grid); + const auto& dist = this->get_activations().DistData(); switch (dist.device) { case El::Device::CPU: m_indices.reset(new El::Matrix()); @@ -117,8 +119,8 @@ class sort_layer : public transform_layer { } void fp_setup_outputs(El::Int mini_batch_size) override { - transform_layer::fp_setup_outputs(mini_batch_size); - 
const auto& output = get_activations(); + transform_layer::fp_setup_outputs(mini_batch_size); + const auto& output = this->get_activations(); m_indices->Resize(output.LocalHeight(), output.LocalWidth()); } @@ -138,6 +140,14 @@ class sort_layer : public transform_layer { }; +#ifndef LBANN_SORT_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class sort_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_SORT_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_SORT_HPP_INCLUDED diff --git a/include/lbann/layers/transform/split.hpp b/include/lbann/layers/transform/split.hpp index a7f151f7452..90248ce99a8 100644 --- a/include/lbann/layers/transform/split.hpp +++ b/include/lbann/layers/transform/split.hpp @@ -30,15 +30,32 @@ #include #include "lbann/layers/transform/transform.hpp" #include "lbann/utils/exception.hpp" +#include "lbann/utils/distconv.hpp" namespace lbann { +#ifdef LBANN_HAS_DISTCONV +template +class split_distconv_adapter: public data_type_distconv_adapter { + public: + using TensorDevType = typename data_type_distconv_adapter::TensorDevType; + split_distconv_adapter(Layer& layer): data_type_distconv_adapter(layer) {} + virtual ~split_distconv_adapter() = default; + void setup_distributions(tensor_overlap_constraints &constraints) override; + dc::Shape get_activations_local_shape(int index) const override; + std::unique_ptr setup_activations_i(int index) const override; + void bp_compute(); +}; +#endif // LBANN_HAS_DISTCONV + /** @brief Present input tensor to multiple outputs. */ -template -class split_layer : public transform_layer { +template +class split_layer : public transform_layer { public: - split_layer(lbann_comm *comm) : transform_layer(comm) { + split_layer(lbann_comm *comm) : transform_layer(comm) { this->m_expected_num_child_layers = -1; // No limit on children } @@ -49,37 +66,118 @@ class split_layer : public transform_layer { protected: - void setup_dims() override { - Layer::setup_dims(); - for (int i = 0; i < get_num_children(); ++i) { - set_output_dims(get_input_dims(), i); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + for (int i = 0; i < this->get_num_children(); ++i) { + this->set_output_dims(this->get_input_dims(), i); } } void fp_setup_outputs(El::Int mini_batch_size) override { - const auto& input = get_prev_activations(); - for (int i = 0; i < get_num_children(); ++i) { - El::LockedView(get_activations(i), input); + const auto& input = this->get_prev_activations(); + for (int i = 0; i < this->get_num_children(); ++i) { + El::LockedView(this->get_activations(i), input); } } void fp_compute() override {} void bp_compute() override { - auto& gradient_wrt_input = get_error_signals(); - if (get_num_children() > 0) { - El::Copy(get_prev_error_signals(0), gradient_wrt_input); +#ifdef LBANN_HAS_DISTCONV + if (this->distconv_enabled()) { + get_distconv_adapter().bp_compute(); + return; + } +#endif // LBANN_HAS_DISTCONV + auto& gradient_wrt_input = this->get_error_signals(); + if (this->get_num_children() > 0) { + El::Copy(this->get_prev_error_signals(0), gradient_wrt_input); } else { El::Zero(gradient_wrt_input); } - for (int i = 1; i < get_num_children(); ++i) { - El::Axpy(DataType(1), get_prev_error_signals(i), + for (int i = 1; i < this->get_num_children(); ++i) { + El::Axpy(DataType(1), this->get_prev_error_signals(i), gradient_wrt_input); } } +#ifdef LBANN_HAS_DISTCONV + protected: + bool is_distconv_supported() const 
override { + return Dev == El::Device::GPU && T_layout == data_layout::DATA_PARALLEL; + } + void setup_distconv_adapter() override { + this->get_distconv_adapter_ptr() = make_unique>(*this); + } + split_distconv_adapter& get_distconv_adapter() override; + const split_distconv_adapter& get_distconv_adapter() const override; +#endif // LBANN_HAS_DISTCONV }; +#ifdef LBANN_HAS_DISTCONV +template +split_distconv_adapter& +split_layer::get_distconv_adapter() { + return const_cast&>( + static_cast&>(*this).get_distconv_adapter()); +} + +template +const split_distconv_adapter& +split_layer::get_distconv_adapter() const { + return dynamic_cast&>( + data_type_layer::get_distconv_adapter()); +} + +template +void split_distconv_adapter:: +setup_distributions(tensor_overlap_constraints &constraints) { + data_type_distconv_adapter::setup_distributions( + constraints); + + auto &x = this->get_prev_activations_dist(); + auto &y = this->get_activations_dist(); + auto &dx = this->get_error_signals_dist(); + auto &dy = this->get_prev_error_signals_dist(); + + constraints.mark_equivalent(x, y); + constraints.mark_equivalent(dx, dy); +} + +template +dc::Shape split_distconv_adapter:: +get_activations_local_shape(int index) const { + return data_type_distconv_adapter::get_activations_local_shape(0); +} + +template +std::unique_ptr::TensorDevType> +split_distconv_adapter:: +setup_activations_i(int index) const { + return make_unique(this->get_prev_activations(0)); +} +#endif // LBANN_HAS_DISTCONV + +LBANN_DEFINE_LAYER_BUILDER(split); + +#ifndef LBANN_SPLIT_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class split_layer; \ + extern template class split_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#ifdef LBANN_HAS_DISTCONV +#define PROTO_DEVICE(T, Device) \ + extern template class split_distconv_adapter; \ + extern template class split_distconv_adapter + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_HAS_DISTCONV +#endif // LBANN_SPLIT_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_SPLIT_HPP_INCLUDED diff --git a/include/lbann/layers/transform/stop_gradient.hpp b/include/lbann/layers/transform/stop_gradient.hpp index 4adeafbb205..b3f49339753 100644 --- a/include/lbann/layers/transform/stop_gradient.hpp +++ b/include/lbann/layers/transform/stop_gradient.hpp @@ -39,27 +39,38 @@ namespace lbann { * means that computed gradients in preceeding layers are not exact * gradients of the objective function. 
*/ -template -class stop_gradient_layer : public transform_layer { +template +class stop_gradient_layer : public transform_layer { public: - stop_gradient_layer(lbann_comm *comm) : transform_layer(comm) {} + stop_gradient_layer(lbann_comm *comm) : transform_layer(comm) {} stop_gradient_layer* copy() const override { return new stop_gradient_layer(*this); } std::string get_type() const override { return "stop_gradient"; } data_layout get_data_layout() const override { return T_layout; } El::Device get_device_allocation() const override { return Dev; } protected: - void setup_dims() override { - transform_layer::setup_dims(); - set_output_dims(get_input_dims()); + void setup_dims(DataReaderMetaData& dr_metadata) override { + transform_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); } void fp_setup_outputs(El::Int mini_batch_size) override { - El::LockedView(get_activations(), get_prev_activations()); + El::LockedView(this->get_activations(), this->get_prev_activations()); } void fp_compute() override {} }; +LBANN_DEFINE_LAYER_BUILDER(stop_gradient); + +#ifndef LBANN_STOP_GRADIENT_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class stop_gradient_layer; \ + extern template class stop_gradient_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_STOP_GRADIENT_LAYER_INSTANTIATE + } // namespace lbann #endif // STOP_GRADIENT_HPP_INCLUDED diff --git a/include/lbann/layers/transform/sum.hpp b/include/lbann/layers/transform/sum.hpp index ab9ce9a4af6..7786f72f634 100644 --- a/include/lbann/layers/transform/sum.hpp +++ b/include/lbann/layers/transform/sum.hpp @@ -29,15 +29,30 @@ #include "lbann/layers/transform/transform.hpp" #include "lbann/utils/exception.hpp" +#include "lbann/utils/distconv.hpp" namespace lbann { -template -class sum_layer : public transform_layer { +#ifdef LBANN_HAS_DISTCONV +template +class sum_distconv_adapter: public data_type_distconv_adapter { + public: + using TensorDevType = typename data_type_distconv_adapter::TensorDevType; + sum_distconv_adapter(Layer& layer): data_type_distconv_adapter(layer) {} + virtual ~sum_distconv_adapter() = default; + std::unique_ptr setup_error_signals_i(int index) const override; + void fp_compute(); +}; +#endif // LBANN_HAS_DISTCONV + +template +class sum_layer : public transform_layer { public: sum_layer(lbann_comm *comm) - : transform_layer(comm) { + : transform_layer(comm) { this->m_expected_num_parent_layers = -1; // No limit on parents } @@ -49,29 +64,29 @@ class sum_layer : public transform_layer { protected: void setup_pointers() override { - transform_layer::setup_pointers(); - if (get_num_parents() < 1) { + transform_layer::setup_pointers(); + if (this->get_num_parents() < 1) { std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "has no parent layers"; LBANN_ERROR(err.str()); } } - void setup_dims() override { - transform_layer::setup_dims(); - set_output_dims(get_input_dims()); + void setup_dims(DataReaderMetaData& dr_metadata) override { + transform_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); // Check that input dimensions match - const auto& output_dims = get_output_dims(); - for (int i = 0; i < get_num_parents(); ++i) { - if (get_input_dims(i) != output_dims) { - const auto& parents = get_parent_layers(); + const auto& output_dims = this->get_output_dims(); + for (int i = 0; i < this->get_num_parents(); 
++i) { + if (this->get_input_dims(i) != output_dims) { + const auto& parents = this->get_parent_layers(); std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "has input tensors with incompatible dimensions ("; - for (int j = 0; j < get_num_parents(); ++j) { - const auto& dims = get_input_dims(j); + for (int j = 0; j < this->get_num_parents(); ++j) { + const auto& dims = this->get_input_dims(j); err << (j > 0 ? ", " : "") << "layer \"" << parents[j]->get_name() << "\" outputs "; for (size_t k = 0; k < dims.size(); ++k) { @@ -86,24 +101,83 @@ class sum_layer : public transform_layer { } void fp_compute() override { - auto& output = get_activations(); - El::Copy(get_prev_activations(0), output); - for (int i = 1; i < get_num_parents(); ++i) { - El::Axpy(DataType(1), get_prev_activations(i), output); +#ifdef LBANN_HAS_DISTCONV + if (this->distconv_enabled()) { + get_distconv_adapter().fp_compute(); + return; + } +#endif // LBANN_HAS_DISTCONV + auto& output = this->get_activations(); + El::Copy(this->get_prev_activations(0), output); + for (int i = 1; i < this->get_num_parents(); ++i) { + El::Axpy(DataType(1), this->get_prev_activations(i), output); } } void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) override { - const auto& gradient_wrt_output = get_prev_error_signals(); - for (int i = 0; i < get_num_parents(); ++i) { - El::LockedView(get_error_signals(i), gradient_wrt_output); + const auto& gradient_wrt_output = this->get_prev_error_signals(); + for (int i = 0; i < this->get_num_parents(); ++i) { + El::LockedView(this->get_error_signals(i), gradient_wrt_output); } } void bp_compute() override {} +#ifdef LBANN_HAS_DISTCONV + friend class sum_distconv_adapter; + protected: + bool is_distconv_supported() const override { + return Dev == El::Device::GPU && T_layout == data_layout::DATA_PARALLEL; + } + void setup_distconv_adapter() override { + this->get_distconv_adapter_ptr() = make_unique>(*this); + } + sum_distconv_adapter& get_distconv_adapter() override; + const sum_distconv_adapter& get_distconv_adapter() const override; +#endif // LBANN_HAS_DISTCONV }; +#ifdef LBANN_HAS_DISTCONV +template +sum_distconv_adapter& +sum_layer::get_distconv_adapter() { + return const_cast&>( + static_cast&>(*this).get_distconv_adapter()); +} + +template +const sum_distconv_adapter& +sum_layer::get_distconv_adapter() const { + return dynamic_cast&>( + data_type_layer::get_distconv_adapter()); +} + +template +std::unique_ptr::TensorDevType> +sum_distconv_adapter::setup_error_signals_i(int index) const { + return make_unique(this->get_prev_error_signals(0)); +} +#endif // LBANN_HAS_DISTCONV + +LBANN_DEFINE_LAYER_BUILDER(sum); + +#ifndef LBANN_SUM_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class sum_layer; \ + extern template class sum_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#ifdef LBANN_HAS_DISTCONV +#define PROTO_DEVICE(T, Device) \ + extern template class sum_distconv_adapter; \ + extern template class sum_distconv_adapter + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_HAS_DISTCONV +#endif // LBANN_SUM_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_SUM_HPP_INCLUDED diff --git a/include/lbann/layers/transform/tessellate.hpp b/include/lbann/layers/transform/tessellate.hpp index eafe02cb9df..6c6f92e3ffc 100644 --- a/include/lbann/layers/transform/tessellate.hpp +++ 
b/include/lbann/layers/transform/tessellate.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_TRANSFORM_TESSELLATE_HPP_INCLUDED #define LBANN_LAYERS_TRANSFORM_TESSELLATE_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -57,20 +57,34 @@ namespace lbann { * e_n@f$. Then, denoting the modulo operator with @f$ \% @f$, * @f[ Y_{i_1,\cdots,i_n} = X_{i_1\% d_1,\cdots,i_n\% d_n} @f] */ -template -class tessellate_layer : public Layer { +template +class tessellate_layer : public data_type_layer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The local tensor type expected in this object. */ + using AbsMatrixType = El::AbstractMatrix; + + ///@} + public: tessellate_layer(lbann_comm *comm, std::vector dims = {}) - : Layer(comm) { - set_output_dims(dims); + : data_type_layer(comm) { + this->set_output_dims(dims); } tessellate_layer(const tessellate_layer& other) - : Layer(other), + : data_type_layer(other), m_input_v(other.m_input_v ? other.m_input_v->Copy() : nullptr) {} tessellate_layer& operator=(const tessellate_layer& other) { - Layer::operator=(other); + data_type_layer::operator=(other); m_input_v.reset(other.m_input_v ? other.m_input_v->Copy() : nullptr); return *this; } @@ -80,15 +94,15 @@ class tessellate_layer : public Layer { data_layout get_data_layout() const override { return Layout; } El::Device get_device_allocation() const override { return Device; } - void setup_dims() override { - Layer::setup_dims(); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); std::stringstream err; // Check input and output dimensions - const auto input_dims = get_input_dims(); - const auto& output_dims = get_output_dims(); + const auto input_dims = this->get_input_dims(); + const auto& output_dims = this->get_output_dims(); if (input_dims.size() != output_dims.size()) { - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "attempted to tessellate a "; for (size_t i = 0; i < input_dims.size(); ++i) { err << (i > 0 ? "x" : "") << input_dims[i]; @@ -103,7 +117,7 @@ class tessellate_layer : public Layer { /// @todo Support tessellation with >3 dimensions if (input_dims.size() > 3) { - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "attempted to tessellate a "; for (size_t i = 0; i < input_dims.size(); ++i) { err << (i > 0 ? 
"x" : "") << input_dims[i]; @@ -115,10 +129,10 @@ class tessellate_layer : public Layer { } void setup_matrices(const El::Grid& grid) override { - Layer::setup_matrices(grid); - auto dist_data = get_prev_activations().DistData(); + data_type_layer::setup_matrices(grid); + auto dist_data = this->get_prev_activations().DistData(); dist_data.colDist = El::STAR; - m_input_v.reset(AbsDistMat::Instantiate(dist_data)); + m_input_v.reset(AbsDistMatrixType::Instantiate(dist_data)); } protected: @@ -126,14 +140,14 @@ class tessellate_layer : public Layer { void fp_compute() override { // Get input and output dimensions - auto input_dims = get_input_dims(); - auto output_dims = get_output_dims(); + auto input_dims = this->get_input_dims(); + auto output_dims = this->get_output_dims(); while (input_dims.size() < 3) { input_dims.insert(input_dims.begin(), 1); } while (output_dims.size() < 3) { output_dims.insert(output_dims.begin(), 1); } // Get input and output data - auto& output = get_activations(); - const auto& input = get_prev_activations(); + auto& output = this->get_activations(); + const auto& input = this->get_prev_activations(); m_input_v->Empty(false); m_input_v->AlignWith(output); if (m_input_v->DistData() == input.DistData()) { @@ -155,14 +169,14 @@ class tessellate_layer : public Layer { void bp_compute() override { // Get input and output dimensions - auto input_dims = get_input_dims(); - auto output_dims = get_output_dims(); + auto input_dims = this->get_input_dims(); + auto output_dims = this->get_output_dims(); while (input_dims.size() < 3) { input_dims.insert(input_dims.begin(), 1); } while (output_dims.size() < 3) { output_dims.insert(output_dims.begin(), 1); } // Get input and output data - const auto& gradient_wrt_output = get_prev_error_signals(); - auto& gradient_wrt_input = get_error_signals(); + const auto& gradient_wrt_output = this->get_prev_error_signals(); + auto& gradient_wrt_input = this->get_error_signals(); m_input_v->Empty(false); m_input_v->AlignWith(gradient_wrt_output); if (m_input_v->DistData() == gradient_wrt_input.DistData()) { @@ -180,7 +194,7 @@ class tessellate_layer : public Layer { // Accumulate local error signals, if needed if (m_input_v->DistData() != gradient_wrt_input.DistData()) { - m_comm->allreduce(*m_input_v, m_input_v->RedundantComm()); + this->m_comm->allreduce(*m_input_v, m_input_v->RedundantComm()); El::Copy(*m_input_v, gradient_wrt_input); } @@ -189,28 +203,37 @@ class tessellate_layer : public Layer { private: /** View into input tensor. */ - std::unique_ptr m_input_v; + std::unique_ptr m_input_v; /** Apply tessellation. * Columns of 'input' should be intact mini-batch samples. If the * data layout is not purely data-parallel, this means input data * is duplicated over the input matrix's column communicator. */ - static void fp_compute_3d(const std::vector& input_dims, - const std::vector& output_dims, - const AbsMat& input, - AbsDistMat& output); + void fp_compute_3d(const std::vector& input_dims, + const std::vector& output_dims, + const AbsMatrixType& input, + AbsDistMatrixType& output); /** Compute local contribution to tessellation back prop * The global gradient w.r.t. input can be obtained by performing * an allreduce over the input matrix's column communicator. 
*/ - static void bp_compute_3d(const std::vector& input_dims, - const std::vector& output_dims, - const AbsDistMat& gradient_wrt_output, - AbsMat& gradient_wrt_input); + void bp_compute_3d(const std::vector& input_dims, + const std::vector& output_dims, + const AbsDistMatrixType& gradient_wrt_output, + AbsMatrixType& gradient_wrt_input); }; +#ifndef LBANN_TESSELLATE_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class tessellate_layer; \ + extern template class tessellate_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_TESSELLATE_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_TRANSFORM_TESSELLATE_HPP_INCLUDED diff --git a/include/lbann/layers/transform/transform.hpp b/include/lbann/layers/transform/transform.hpp index 98b2a169ea3..23b579c4322 100644 --- a/include/lbann/layers/transform/transform.hpp +++ b/include/lbann/layers/transform/transform.hpp @@ -27,16 +27,17 @@ #ifndef LBANN_LAYER_TRANSFORM_HPP_INCLUDED #define LBANN_LAYER_TRANSFORM_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { /** @todo Remove. Layers should inherit directly from the base layer * class. */ -class transform_layer : public Layer { +template +class transform_layer : public data_type_layer { public: - transform_layer(lbann_comm *comm) : Layer(comm) {} + transform_layer(lbann_comm *comm) : data_type_layer(comm) {} }; } // namespace lbann diff --git a/include/lbann/layers/transform/uniform.hpp b/include/lbann/layers/transform/uniform.hpp index b10bbb03375..bcb0845138f 100644 --- a/include/lbann/layers/transform/uniform.hpp +++ b/include/lbann/layers/transform/uniform.hpp @@ -28,31 +28,39 @@ #define LBANN_LAYER_UNIFORM_HPP_INCLUDED #include "lbann/layers/transform/transform.hpp" +#include "lbann/models/model.hpp" #include "lbann/utils/random.hpp" namespace lbann { -/** @brief Random values with uniform distribution. - * - * During validation and testing, outputs are all equal to the - * distribution mean. - */ -template -class uniform_layer : public transform_layer { +/** @brief Random values from uniform distribution. */ +template +class uniform_layer : public transform_layer { private: - /** Uniform distribution mean. */ - DataType m_min; - /** Uniform distribution standard deviation. */ - DataType m_max; + /** @brief Uniform distribution minimum. */ + TensorDataType m_min; + /** @brief Uniform distribution maximum. */ + TensorDataType m_max; + /** @brief Whether to have deterministic output when not training. + * + * Applies to execution modes other than training, e.g. validation + * and inference. If true, outputs are all equal to the + * distribution mean when not training. 
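Aside, for illustration only (not part of the patch): the new m_training_only flag turns the old hard-coded behavior, where non-training modes always emitted the distribution mean, into an opt-in that defaults to off. The branch reduces to the following sketch, written against a toy fill routine so it stands alone; fill_uniform_toy and the Mode enum are made-up names.

#include <random>
#include <vector>

enum class Mode { training, validation, testing };

// Fill `out` with U(min, max) samples; if training_only is set and we are
// not in training mode, emit the distribution mean instead.
inline void fill_uniform_toy(std::vector<float>& out, float min, float max,
                             Mode mode, bool training_only) {
  const float mean = (max + min) / 2.0f;
  if (training_only && mode != Mode::training) {
    out.assign(out.size(), mean);
    return;
  }
  static std::mt19937 gen{20240101};
  std::uniform_real_distribution<float> dist(min, max);
  for (auto& x : out) { x = dist(gen); }
}

The patch expresses the same branch with El::Fill(output, mean) versus uniform_fill(output, ..., mean, radius), where mean = (max + min) / 2 and radius = (max - min) / 2.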
+ */ + bool m_training_only; public: uniform_layer(lbann_comm *comm, std::vector dims, - DataType min = DataType(0), - DataType max = DataType(1)) - : transform_layer(comm), m_min(min), m_max(max) { - set_output_dims(dims); + TensorDataType min = El::TypeTraits::Zero(), + TensorDataType max = El::TypeTraits::One(), + bool training_only = false) + : transform_layer(comm), + m_min(min), m_max(max), m_training_only(training_only) { + this->set_output_dims(dims); this->m_expected_num_parent_layers = 0; } uniform_layer* copy() const override { return new uniform_layer(*this); } @@ -61,28 +69,40 @@ class uniform_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); std::stringstream ss; ss << "[" << m_min << "," << m_max << ")"; desc.add("Range", ss.str()); + desc.add("Training only", m_training_only); return desc; } protected: void fp_compute() override { - const auto& mean = (m_max + m_min) / 2; - const auto& radius = (m_max - m_min) / 2; - auto& output = get_activations(); - if (this->m_model->get_execution_mode() == execution_mode::training) { - uniform_fill(output, output.Height(), output.Width(), mean, radius); - } else { + const auto& mean = (m_max + m_min) / El::To(2); + const auto& radius = (m_max - m_min) / El::To(2); + auto& output = this->get_activations(); + const auto& mode = this->m_model->get_execution_context().get_execution_mode(); + if (m_training_only && (mode != execution_mode::training)) { El::Fill(output, mean); } + else { + uniform_fill(output, output.Height(), output.Width(), mean, radius); + } } }; +#ifndef LBANN_UNIFORM_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class uniform_layer; \ + extern template class uniform_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_UNIFORM_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_UNIFORM_HPP_INCLUDED diff --git a/include/lbann/layers/transform/unpooling.hpp b/include/lbann/layers/transform/unpooling.hpp index 9a88eabcc1a..c414014f8c0 100644 --- a/include/lbann/layers/transform/unpooling.hpp +++ b/include/lbann/layers/transform/unpooling.hpp @@ -37,24 +37,24 @@ namespace lbann { /** @brief Transpose of pooling layer. * @todo GPU support. */ -template -class unpooling_layer : public transform_layer { +template +class unpooling_layer : public transform_layer { + static_assert(T_layout == data_layout::DATA_PARALLEL, + "unpooling only supports DATA_PARALLEL"); + static_assert(Dev == El::Device::CPU, + "unpooling only supports CPU"); private: /** Corresponding pooling layer. 
*/ - pooling_layer* m_pooling_layer; + pooling_layer* m_pooling_layer; public: unpooling_layer(lbann_comm *comm, - pooling_layer* pool = nullptr) - : transform_layer(comm), - m_pooling_layer(pool) { - static_assert(T_layout == data_layout::DATA_PARALLEL, - "unpooling only supports DATA_PARALLEL"); - static_assert(Dev == El::Device::CPU, - "unpooling only supports CPU"); - } + pooling_layer* pool = nullptr) + : transform_layer(comm), + m_pooling_layer(pool) { } unpooling_layer* copy() const override { return new unpooling_layer(*this); } std::string get_type() const override { return "unpooling"; } @@ -74,21 +74,21 @@ class unpooling_layer : public transform_layer { } } - void setup_dims() override { - transform_layer::setup_dims(); + void setup_dims(DataReaderMetaData& dr_metadata) override { + transform_layer::setup_dims(dr_metadata); // Check that input tensor is valid - const auto& input_dims = get_input_dims(); + const auto& input_dims = this->get_input_dims(); const auto& pool_output_dims = m_pooling_layer->get_output_dims(); if (input_dims != pool_output_dims) { std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "expects input tensors with dimensions "; for (size_t i = 0; i < pool_output_dims.size(); ++i) { err << (i > 0 ? " x " : "") << pool_output_dims[i]; } err << ", but parent layer " - << "\"" << m_parent_layers[0]->get_name() << "\" " + << "\"" << this->get_parent_layers()[0]->get_name() << "\" " << "outputs with dimensions "; for (size_t i = 0; i < input_dims.size(); ++i) { err << (i > 0 ? " x " : "") << input_dims[i]; @@ -97,22 +97,22 @@ class unpooling_layer : public transform_layer { } // Initialize output tensor based on corresponding pooling layer - set_output_dims(m_pooling_layer->get_input_dims()); + this->set_output_dims(m_pooling_layer->get_input_dims()); } - void set_pooling_layer(pooling_layer* pool) { + void set_pooling_layer(pooling_layer* pool) { m_pooling_layer = pool; } std::vector get_layer_pointers() override { - std::vector layers = transform_layer::get_layer_pointers(); + std::vector layers = transform_layer::get_layer_pointers(); layers.push_back((Layer*) m_pooling_layer); return layers; } void set_layer_pointers(std::vector layers) override { - m_pooling_layer = dynamic_cast*>(layers.back()); + m_pooling_layer = dynamic_cast*>(layers.back()); if (m_pooling_layer == nullptr) { std::stringstream err; err << __FILE__ << " " << __LINE__ @@ -120,7 +120,7 @@ class unpooling_layer : public transform_layer { throw lbann_exception(err.str()); } layers.pop_back(); - transform_layer::set_layer_pointers(layers); + transform_layer::set_layer_pointers(layers); } protected: @@ -146,19 +146,21 @@ class unpooling_layer : public transform_layer { /// Unpooling forward propagation with im2col void fp_compute_im2col() { + using DMatDT = El::Matrix; + // Get local matrices - const DMat& prev_activations_local = get_local_prev_activations(); - DMat& activations_local = get_local_activations(); + const DMatDT& prev_activations_local = this->get_local_prev_activations(); + DMatDT& activations_local = this->get_local_activations(); // Get parameters const int local_width = prev_activations_local.Width(); - const auto& output_dims = get_output_dims(); + const auto& output_dims = this->get_output_dims(); const int num_channels = output_dims[0]; - const int num_per_input_channel = get_input_size() / num_channels; + const int num_per_input_channel = this->get_input_size() / num_channels; 
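// Aside, for illustration only (not part of the patch): the unpooling
// forward pass hinges on m_max_pool_indices, which records, for each pooled
// value, the offset inside its pooling window where the max was taken. The
// standalone sketch below shows that scatter step for one sample and one
// channel on raw float buffers; scatter_unpool is a made-up name.
//
// pooled[j]  : the j-th pooled value (one per window)
// indices[j] : offset of the max within window j, in [0, pool_size)
// windows    : pre-zeroed buffer of num_windows * pool_size entries; window
//              j occupies windows[j*pool_size .. j*pool_size + pool_size)
inline void scatter_unpool(const float* pooled, const int* indices,
                           float* windows, int num_windows, int pool_size) {
  for (int j = 0; j < num_windows; ++j) {
    windows[j * pool_size + indices[j]] = pooled[j];  // other entries stay 0
  }
}
// The patch does the same thing into an im2col matrix (one window per
// column block, pre-zeroed with El::Zero) and then folds the windows back
// onto the output tensor with col2im, using a max reduction where windows
// overlap.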
const int pool_size = m_pooling_layer->m_pool_size; // Initialize im2col matrix - DMat im2col_mat(pool_size * num_channels, num_per_input_channel); + DMatDT im2col_mat(pool_size * num_channels, num_per_input_channel); // Iterate through data samples for(int sample = 0; sample < local_width; ++sample) { @@ -167,16 +169,16 @@ class unpooling_layer : public transform_layer { El::Zero(im2col_mat); // Populate im2col matrix - const DataType *prev_activations_buffer + const TensorDataType *prev_activations_buffer = prev_activations_local.LockedBuffer(0, sample); const int *indices_buffer - = &m_pooling_layer->m_max_pool_indices[sample * get_input_size()]; + = &m_pooling_layer->m_max_pool_indices[sample * this->get_input_size()]; LBANN_OMP_PARALLEL_FOR for(int channel = 0; channel < num_channels; ++channel) { for(int j = 0; j < num_per_input_channel; ++j) { const int input_index = j + channel * num_per_input_channel; const int max_index = indices_buffer[input_index]; - DataType *im2col_buffer + TensorDataType *im2col_buffer = im2col_mat.Buffer(channel * pool_size, j); im2col_buffer[max_index] = prev_activations_buffer[input_index]; @@ -184,8 +186,9 @@ class unpooling_layer : public transform_layer { } // Convert im2col matrix to output matrix - DMat output_mat = El::View(activations_local, El::ALL, El::IR(sample)); - col2im(im2col_mat, + DMatDT output_mat = + El::View(activations_local, El::ALL, El::IR(sample)); + col2im(im2col_mat, output_mat, num_channels, output_dims.size() - 1, @@ -193,36 +196,38 @@ class unpooling_layer : public transform_layer { m_pooling_layer->m_pads.data(), m_pooling_layer->m_pool_dims.data(), m_pooling_layer->m_strides.data(), - static_cast(&std::max)); - + [](TensorDataType const& a, TensorDataType const& b) { + return std::max(a, b); + }); } - } /// Unpooling backward propagation with im2col void bp_compute_im2col() { + using DMatDT = El::Matrix; + // Get local matrices - const DMat& prev_error_signal_local = get_local_prev_error_signals(); - DMat& error_signal_local = get_local_error_signals(); + const DMatDT& prev_error_signal_local = this->get_local_prev_error_signals(); + DMatDT& error_signal_local = this->get_local_error_signals(); // Get parameters const int local_width = prev_error_signal_local.Width(); - const auto& output_dims = get_output_dims(); + const auto& output_dims = this->get_output_dims(); const int num_channels = output_dims[0]; - const int num_per_output_channel = get_input_size() / num_channels; + const int num_per_output_channel = this->get_input_size() / num_channels; const int pool_size = m_pooling_layer->m_pool_size; // Initialize im2col matrix - DMat im2col_mat(pool_size * num_channels, num_per_output_channel); + DMatDT im2col_mat(pool_size * num_channels, num_per_output_channel); // Iterate through data samples for(int sample = 0; sample < local_width; ++sample) { // Construct im2col matrix from input - const DMat& input_mat = El::LockedView(prev_error_signal_local, - El::ALL, El::IR(sample)); - im2col(input_mat, + const DMatDT& input_mat = El::LockedView(prev_error_signal_local, + El::ALL, El::IR(sample)); + im2col(input_mat, im2col_mat, num_channels, output_dims.size() - 1, @@ -232,15 +237,15 @@ class unpooling_layer : public transform_layer { m_pooling_layer->m_strides.data()); // Propagate error signal based on pooling layer - DataType *output_buffer = error_signal_local.Buffer(0, sample); + TensorDataType *output_buffer = error_signal_local.Buffer(0, sample); const int *indices_buffer - = &m_pooling_layer->m_max_pool_indices[sample * 
get_input_size()]; + = &m_pooling_layer->m_max_pool_indices[sample * this->get_input_size()]; LBANN_OMP_PARALLEL_FOR for(int channel = 0; channel < num_channels; ++channel) { for(int j = 0; j < num_per_output_channel; ++j) { const int output_index = j + channel * num_per_output_channel; const int max_index = indices_buffer[output_index]; - DataType *im2col_buffer + TensorDataType *im2col_buffer = im2col_mat.Buffer(channel * pool_size, j); output_buffer[output_index] = im2col_buffer[max_index]; } @@ -252,6 +257,16 @@ class unpooling_layer : public transform_layer { }; +#ifndef LBANN_UNPOOLING_LAYER_INSTANTIATE +#define PROTO(T) \ + extern template class unpooling_layer + +#define LBANN_INSTANTIATE_CPU_HALF +#include "lbann/macros/instantiate.hpp" +#undef PROTO +#undef LBANN_INSTANTIATE_CPU_HALF +#endif // LBANN_UNPOOLING_LAYER_INSTANTIATE + } // namespace lbann -#endif // LBANN_LAYER_POOLING_HPP_INCLUDED +#endif // LBANN_LAYER_UNPOOLING_HPP_INCLUDED diff --git a/include/lbann/layers/transform/weighted_sum.hpp b/include/lbann/layers/transform/weighted_sum.hpp index 5f77caeaa9a..3bc8575b6a7 100644 --- a/include/lbann/layers/transform/weighted_sum.hpp +++ b/include/lbann/layers/transform/weighted_sum.hpp @@ -34,8 +34,10 @@ namespace lbann { /** @brief Add tensors with specified scaling factors. */ -template -class weighted_sum_layer : public transform_layer { +template +class weighted_sum_layer : public transform_layer { private: /** Scaling factors for weighted sum. */ @@ -44,7 +46,7 @@ class weighted_sum_layer : public transform_layer { public: weighted_sum_layer(lbann_comm *comm, std::vector scaling_factors) - : transform_layer(comm), + : transform_layer(comm), m_scaling_factors(scaling_factors) { this->m_expected_num_parent_layers = -1; // No limit on parents } @@ -55,7 +57,7 @@ class weighted_sum_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); std::stringstream ss; for (size_t i = 0; i < m_scaling_factors.size(); ++i) { ss << (i > 0 ? 
", " : "") << m_scaling_factors[i]; @@ -67,36 +69,36 @@ class weighted_sum_layer : public transform_layer { protected: void setup_pointers() override { - transform_layer::setup_pointers(); + transform_layer::setup_pointers(); std::stringstream err; - if (get_num_parents() < 1) { - err << get_type() << " layer \"" << get_name() << "\" " + if (this->get_num_parents() < 1) { + err << get_type() << " layer \"" << this->get_name() << "\" " << "has no parent layers"; LBANN_ERROR(err.str()); } - if ((int) m_scaling_factors.size() != get_num_parents()) { - err << get_type() << " layer \"" << get_name() << "\" " + if ((int) m_scaling_factors.size() != this->get_num_parents()) { + err << get_type() << " layer \"" << this->get_name() << "\" " << "has an invalid number of scaling factors " << "(found " << m_scaling_factors.size() << ", " - << "but there are " << get_num_parents() << " parent layers)"; + << "but there are " << this->get_num_parents() << " parent layers)"; LBANN_ERROR(err.str()); } } - void setup_dims() override { - transform_layer::setup_dims(); - set_output_dims(get_input_dims()); + void setup_dims(DataReaderMetaData& dr_metadata) override { + transform_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); // Check that input dimensions match - const auto& output_dims = get_output_dims(); - for (int i = 0; i < get_num_parents(); ++i) { - if (get_input_dims(i) != output_dims) { - const auto& parents = get_parent_layers(); + const auto& output_dims = this->get_output_dims(); + for (int i = 0; i < this->get_num_parents(); ++i) { + if (this->get_input_dims(i) != output_dims) { + const auto& parents = this->get_parent_layers(); std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "has input tensors with incompatible dimensions ("; - for (int j = 0; j < get_num_parents(); ++j) { - const auto& dims = get_input_dims(j); + for (int j = 0; j < this->get_num_parents(); ++j) { + const auto& dims = this->get_input_dims(j); err << (j > 0 ? 
", " : "") << "layer \"" << parents[j]->get_name() << "\" outputs "; for (size_t k = 0; k < dims.size(); ++k) { @@ -111,17 +113,17 @@ class weighted_sum_layer : public transform_layer { } void fp_compute() override { - auto& output = get_activations(); + auto& output = this->get_activations(); El::Zero(output); - for (int i = 0; i < get_num_parents(); ++i) { - El::Axpy(m_scaling_factors[i], get_prev_activations(i), output); + for (int i = 0; i < this->get_num_parents(); ++i) { + El::Axpy(m_scaling_factors[i], this->get_prev_activations(i), output); } } void bp_compute() override { - const auto& gradient_wrt_output = get_prev_error_signals(); - for (int i = 0; i < get_num_parents(); ++i) { - auto& gradient_wrt_input = get_error_signals(i); + const auto& gradient_wrt_output = this->get_prev_error_signals(); + for (int i = 0; i < this->get_num_parents(); ++i) { + auto& gradient_wrt_input = this->get_error_signals(i); El::Zero(gradient_wrt_input); El::Axpy(m_scaling_factors[i], gradient_wrt_output, gradient_wrt_input); @@ -130,6 +132,17 @@ class weighted_sum_layer : public transform_layer { }; +LBANN_DEFINE_LAYER_BUILDER(weighted_sum); + +#ifndef LBANN_WEIGHTED_SUM_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class weighted_sum_layer; \ + extern template class weighted_sum_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_WEIGHTED_SUM_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_WEIGHTED_SUM_HPP_INCLUDED diff --git a/include/lbann/layers/transform/weights.hpp b/include/lbann/layers/transform/weights.hpp index f6d74931347..fd48affe0c6 100644 --- a/include/lbann/layers/transform/weights.hpp +++ b/include/lbann/layers/transform/weights.hpp @@ -28,6 +28,7 @@ #define LBANN_LAYER_WEIGHTS_HPP_INCLUDED #include "lbann/layers/transform/transform.hpp" +#include "lbann/models/model.hpp" namespace lbann { @@ -35,46 +36,55 @@ namespace lbann { * * Interfaces with a @c weights object and outputs its tensor. */ -template -class weights_layer : public transform_layer { +template +class weights_layer : public transform_layer { + +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The local tensor type expected in this object. */ + using AbsMatrixType = El::AbstractMatrix; + + /** @brief The device-specific local tensor type. */ + using CPUMatType = El::Matrix; + +#ifdef LBANN_HAS_GPU + /** @brief The GPU device-specific local tensor type. */ + using GPUMatType = El::Matrix; +#endif + + /** @brief The concrete optimizer type used by this object. */ + using OptimizerType = data_type_optimizer; + + /** @brief The concrete weights type used by this object. */ + using WeightsType = data_type_weights; + + ///@} public: weights_layer(lbann_comm *comm, std::vector dims) - : transform_layer(comm) { + : transform_layer(comm) { std::vector dims_; for (const auto& d : dims) { dims_.push_back(d); } - set_output_dims(dims_); + this->set_output_dims(dims_); this->m_expected_num_parent_layers = 0; } weights_layer(const weights_layer& other) - : transform_layer(other), + : transform_layer(other), m_gradient(other.m_gradient ? 
other.m_gradient->Copy() : nullptr) { - if (other.m_workspace) { - switch (other.m_workspace->GetDevice()) { - case El::Device::CPU: m_workspace.reset(new CPUMat()); break; -#ifdef LBANN_HAS_GPU - case El::Device::GPU: m_workspace.reset(new GPUMat()); break; -#endif // LBANN_HAS_GPU - default: LBANN_ERROR("unknown device type"); - } - m_workspace->SetMemoryMode(other.m_workspace->MemoryMode()); - } + m_workspace.SetMemoryMode(other.m_workspace.MemoryMode()); } weights_layer& operator=(const weights_layer& other){ - transform_layer::operator=(other); + transform_layer::operator=(other); m_gradient.reset(other.m_gradient ? other.m_gradient->Copy() : nullptr); - m_workspace.reset(); - if (other.m_workspace) { - switch (other.m_workspace->GetDevice()) { - case El::Device::CPU: m_workspace.reset(new CPUMat()); break; -#ifdef LBANN_HAS_GPU - case El::Device::GPU: m_workspace.reset(new GPUMat()); break; -#endif // LBANN_HAS_GPU - default: LBANN_ERROR("unknown device type"); - } - m_workspace->SetMemoryMode(other.m_workspace->MemoryMode()); - } + m_workspace.SetMemoryMode(other.m_workspace.MemoryMode()); return *this; } weights_layer* copy() const override { return new weights_layer(*this); } @@ -85,70 +95,56 @@ class weights_layer : public transform_layer { protected: void setup_matrices(const El::Grid& grid) override { - transform_layer::setup_matrices(grid); + transform_layer::setup_matrices(grid); // Initialize weights gradient - auto dist = get_activations().DistData(); + auto dist = this->get_activations().DistData(); dist.rowDist = El::STAR; - m_gradient.reset(AbsDistMat::Instantiate(dist)); + m_gradient.reset(AbsDistMatrixType::Instantiate(dist)); // Initialize workspace - switch (Dev) { - case El::Device::CPU: m_workspace.reset(new CPUMat()); break; -#ifdef LBANN_HAS_GPU - case El::Device::GPU: - m_workspace.reset(new GPUMat()); -#ifdef HYDROGEN_HAVE_CUB - m_workspace->SetMemoryMode(1); // Use CUB GPU memory pool if possible -#endif // HYDROGEN_HAVE_CUB - break; -#endif // LBANN_HAS_GPU - default: LBANN_ERROR("unknown device type"); - } - +#if defined HYDROGEN_HAVE_CUB + if (Dev == El::Device::GPU) + m_workspace.SetMemoryMode(1); // Use CUB GPU memory pool if possible +#endif // defined HYDROGEN_HAVE_CUB } - void setup_data() override { - transform_layer::setup_data(); + void setup_data(size_t max_mini_batch_size) override { + transform_layer::setup_data(max_mini_batch_size); // Initialize default weights if none are provided - if (this->m_weights.size() > 1) { - std::stringstream err; - err << "attempted to setup " - << get_type() << " layer \"" << get_name() << "\" " - << "with an invalid number of weights " - << "(expected at most 1, " - << "but found " << this->m_weights.size() << ")"; - LBANN_ERROR(err.str()); + if (!this->has_weights()) { + auto w = make_unique(this->get_comm()); + auto init = make_unique>(DataType(0)); + auto opt = this->m_model->template create_optimizer(); + w->set_name(this->get_name() + "_weights"); + w->set_initializer(std::move(init)); + w->set_optimizer(std::move(opt)); + this->add_weights(w.get()); + this->m_model->add_weights(std::move(w)); } - this->m_weights.resize(1, nullptr); - auto& w = this->m_weights[0]; - if (w == nullptr) { - w = new weights(get_comm()); - std::unique_ptr init(new constant_initializer(DataType(0))); - std::unique_ptr opt(m_model->create_optimizer()); - w->set_name(get_name() + "_weights"); - w->set_initializer(init); - w->set_optimizer(opt); - this->m_model->add_weights(w); + if (this->num_weights() != 1) { + 
LBANN_ERROR("attempted to setup ", + this->get_type()," layer \"",this->get_name(),"\" ", + "with an invalid number of weights ", + "(expected at most 1, ", + "but found ",this->num_weights(),")"); } // Setup weights and weights gradient - m_gradient->AlignWith(get_activations()); - m_gradient->Resize(get_output_size(), 1); - w->set_dims(get_output_dims()); - w->set_matrix_distribution(m_gradient->DistData()); + m_gradient->AlignWith(this->get_activations()); + m_gradient->Resize(this->get_output_size(), 1); + this->get_weights(0).set_dims(this->get_output_dims()); + this->get_weights(0).set_matrix_distribution(m_gradient->DistData()); // Initialize freeze state - if (this->m_frozen) { w->freeze(); } - else { w->unfreeze(); } - if (w->is_frozen() != this->m_frozen) { - std::stringstream err; - err << (m_frozen ? "" : "un") << "frozen " - << "layer \"" << get_name() << "\" has " - << (w->is_frozen() ? "" : "un") << "frozen " - << "weights \"" << w->get_name() << "\""; - LBANN_ERROR(err.str()); + if (this->m_frozen) { this->get_weights(0).freeze(); } + else { this->get_weights(0).unfreeze(); } + if (this->get_weights(0).is_frozen() != this->m_frozen) { + LBANN_ERROR((this->m_frozen ? "" : "un"),"frozen ", + "layer \"",this->get_name(),"\" has ", + (this->get_weights(0).is_frozen() ? "" : "un"),"frozen ", + "weights \"",this->get_weights(0).get_name(),"\""); } } @@ -156,56 +152,65 @@ class weights_layer : public transform_layer { void fp_compute() override { // Matrices - const auto& local_weights = m_weights[0]->get_values().LockedMatrix(); - auto& local_output = get_local_activations(); - m_workspace->Resize(local_output.Width(), 1); - El::Fill(*m_workspace, DataType(1)); + const auto& local_weights = this->weights_values(0).LockedMatrix(); + auto& local_output = this->get_local_activations(); + El::Ones(m_workspace, local_output.Width(), 1); // Duplicate weights across matrix columns El::Gemm(El::NORMAL, El::TRANSPOSE, - DataType(1), local_weights, *m_workspace, - DataType(0), local_output); + El::TypeTraits::One(), local_weights, m_workspace, + El::TypeTraits::Zero(), local_output); // Clean up - m_workspace->Empty(); + m_workspace.Empty(); } void bp_compute() override { - constexpr DataType zero = 0; - constexpr DataType one = 1; // Get optimizer // Note: Nothing needs to be done if there is no optimizer - auto* opt = this->m_weights[0]->get_optimizer(); + auto* opt = this->get_weights(0).get_optimizer(); if (opt == nullptr) { return; } // Matrices - const auto& local_gradient_wrt_output = get_local_prev_error_signals(); - m_workspace->Resize(local_gradient_wrt_output.Width(), 1); - El::Fill(*m_workspace, one); + const auto& local_gradient_wrt_output = this->get_local_prev_error_signals(); + El::Ones(m_workspace, local_gradient_wrt_output.Width(), 1); - // Compute gradient contribution and accumulate - const auto& scale = one / this->m_model->get_effective_mini_batch_size(); El::Gemv(El::NORMAL, - scale, local_gradient_wrt_output, *m_workspace, - zero, m_gradient->Matrix()); - opt->add_to_gradient(*m_gradient, one, true); + El::TypeTraits::One(), + local_gradient_wrt_output, m_workspace, + El::TypeTraits::Zero(), + m_gradient->Matrix()); + + opt->add_to_gradient(*m_gradient, + El::TypeTraits::One(), + true); // Clean up - m_workspace->Empty(); + m_workspace.Empty(); } private: /** Weights gradient. */ - std::unique_ptr m_gradient; + std::unique_ptr m_gradient; /** Workspace. 
*/ - std::unique_ptr m_workspace; - + El::Matrix m_workspace; }; +LBANN_DEFINE_LAYER_BUILDER(weights); + +#ifndef LBANN_WEIGHTS_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class weights_layer; \ + extern template class weights_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_WEIGHTS_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_WEIGHTS_HPP_INCLUDED diff --git a/include/lbann/lbann.hpp b/include/lbann/lbann.hpp index 5b1ba94d470..a9dbdf2c553 100644 --- a/include/lbann/lbann.hpp +++ b/include/lbann/lbann.hpp @@ -27,6 +27,12 @@ #ifndef LBANN_LBANN_HPP_INCLUDED #define LBANN_LBANN_HPP_INCLUDED +/// Trainers +#include "lbann/trainers/trainer.hpp" + +/// Training Algorithms +#include "lbann/training_algorithms/training_algorithm.hpp" + /// Models #include "lbann/models/directed_acyclic_graph.hpp" @@ -45,6 +51,9 @@ #include "lbann/layers/learning/fully_connected.hpp" #include "lbann/layers/learning/convolution.hpp" #include "lbann/layers/learning/deconvolution.hpp" +#include "lbann/layers/learning/embedding.hpp" +#include "lbann/layers/learning/channelwise_scale_bias.hpp" +#include "lbann/layers/learning/entrywise_scale_bias.hpp" /// Loss layers #include "lbann/layers/loss/categorical_accuracy.hpp" @@ -69,7 +78,7 @@ #include "lbann/layers/transform/sum.hpp" #include "lbann/layers/transform/weighted_sum.hpp" #include "lbann/layers/transform/slice.hpp" -#include "lbann/layers/transform/concatenation.hpp" +#include "lbann/layers/transform/concatenate.hpp" #include "lbann/layers/transform/constant.hpp" #include "lbann/layers/transform/dummy.hpp" #include "lbann/layers/transform/hadamard.hpp" @@ -92,6 +101,9 @@ #include "lbann/layers/regularizers/dropout.hpp" #include "lbann/layers/regularizers/selu_dropout.hpp" #include "lbann/layers/regularizers/batch_normalization.hpp" +#include "lbann/layers/regularizers/entrywise_batch_normalization.hpp" +#include "lbann/layers/regularizers/layer_norm.hpp" +#include "lbann/layers/regularizers/instance_norm.hpp" /// Input layer #include "lbann/layers/io/input/input_layer.hpp" @@ -100,19 +112,19 @@ #include "lbann/layers/misc/covariance.hpp" #include "lbann/layers/misc/variance.hpp" #include "lbann/layers/misc/channelwise_mean.hpp" +#include "lbann/layers/misc/channelwise_softmax.hpp" #include "lbann/layers/misc/mini_batch_index.hpp" #include "lbann/layers/misc/mini_batch_size.hpp" +#include "lbann/layers/misc/argmax.hpp" +#include "lbann/layers/misc/argmin.hpp" +#include "lbann/layers/misc/one_hot.hpp" /// Data readers +#include "lbann/data_readers/data_reader_npz_ras_lipid.hpp" #include "lbann/data_readers/data_reader_imagenet.hpp" -#include "lbann/data_readers/data_reader_imagenet_patches.hpp" #include "lbann/data_readers/data_reader_cifar10.hpp" #include "lbann/data_readers/data_reader_mnist.hpp" -#include "lbann/data_readers/data_reader_multi_images.hpp" -#include "lbann/data_readers/data_reader_mnist_siamese.hpp" -#include "lbann/data_readers/data_reader_multihead_siamese.hpp" #include "lbann/data_readers/data_reader_synthetic.hpp" -#include "lbann/data_readers/data_reader_jag.hpp" #include "lbann/data_readers/data_reader_jag_conduit.hpp" #include "lbann/data_readers/data_reader_nci.hpp" #include "lbann/data_readers/data_reader_numpy.hpp" @@ -121,51 +133,53 @@ #include "lbann/data_readers/data_reader_csv.hpp" #include "lbann/data_readers/data_reader_merge_samples.hpp" #include "lbann/data_readers/data_reader_merge_features.hpp" -#include 
"lbann/data_readers/data_reader_ascii.hpp" #include "lbann/data_readers/data_reader_pilot2_molecular.hpp" #include "lbann/data_readers/data_reader_mesh.hpp" -#include "lbann/data_readers/data_reader_moving_mnist.hpp" #include "lbann/data_readers/data_reader_python.hpp" +#include "lbann/data_readers/data_reader_smiles.hpp" /// Data stores #include "lbann/data_store/data_store_conduit.hpp" /// Callbacks -#include "lbann/callbacks/callback_check_init.hpp" -#include "lbann/callbacks/callback_checknan.hpp" -#include "lbann/callbacks/callback_checksmall.hpp" -#include "lbann/callbacks/callback_check_dataset.hpp" -#include "lbann/callbacks/callback_print.hpp" -#include "lbann/callbacks/callback_timer.hpp" -#include "lbann/callbacks/callback_io.hpp" -#include "lbann/callbacks/callback_summary.hpp" -#include "lbann/callbacks/callback_learning_rate.hpp" -#include "lbann/callbacks/callback_debug.hpp" -#include "lbann/callbacks/callback_debug_io.hpp" -#include "lbann/callbacks/callback_imcomm.hpp" -#include "lbann/callbacks/callback_dump_weights.hpp" -#include "lbann/callbacks/callback_dump_outputs.hpp" -#include "lbann/callbacks/callback_dump_error_signals.hpp" -#include "lbann/callbacks/callback_dump_gradients.hpp" -#include "lbann/callbacks/callback_dump_minibatch_sample_indices.hpp" -#include "lbann/callbacks/callback_early_stopping.hpp" -#include "lbann/callbacks/callback_ltfb.hpp" -#include "lbann/callbacks/callback_save_images.hpp" -#include "lbann/callbacks/callback_save_model.hpp" +#include "lbann/callbacks/check_dataset.hpp" +#include "lbann/callbacks/check_gradients.hpp" +#include "lbann/callbacks/check_init.hpp" +#include "lbann/callbacks/check_metric.hpp" +#include "lbann/callbacks/check_nan.hpp" +#include "lbann/callbacks/check_small.hpp" +#include "lbann/callbacks/checkpoint.hpp" +#include "lbann/callbacks/confusion_matrix.hpp" +#include "lbann/callbacks/debug.hpp" +#include "lbann/callbacks/debug_io.hpp" +#include "lbann/callbacks/dump_error_signals.hpp" +#include "lbann/callbacks/dump_gradients.hpp" +#include "lbann/callbacks/dump_minibatch_sample_indices.hpp" +#include "lbann/callbacks/dump_outputs.hpp" +#include "lbann/callbacks/dump_weights.hpp" +#include "lbann/callbacks/early_stopping.hpp" +#include "lbann/callbacks/gpu_memory_usage.hpp" +#include "lbann/callbacks/hang.hpp" +#include "lbann/callbacks/imcomm.hpp" +#include "lbann/callbacks/learning_rate.hpp" +#include "lbann/callbacks/ltfb.hpp" +#include "lbann/callbacks/mixup.hpp" +#include "lbann/callbacks/monitor_io.hpp" +#include "lbann/callbacks/perturb_adam.hpp" +#include "lbann/callbacks/perturb_dropout.hpp" +#include "lbann/callbacks/print_model_description.hpp" +#include "lbann/callbacks/print_statistics.hpp" #include "lbann/callbacks/profiler.hpp" -#include "lbann/callbacks/callback_hang.hpp" -#include "lbann/callbacks/callback_variable_minibatch.hpp" -#include "lbann/callbacks/callback_timeline.hpp" -#include "lbann/callbacks/callback_checkpoint.hpp" -#include "lbann/callbacks/callback_save_model.hpp" -#include "lbann/callbacks/callback_replace_weights.hpp" -#include "lbann/callbacks/callback_gpu_memory_usage.hpp" -#include "lbann/callbacks/callback_sync_layers.hpp" -#include "lbann/callbacks/callback_sync_selected.hpp" -#include "lbann/callbacks/callback_confusion_matrix.hpp" -#include "lbann/callbacks/callback_check_gradients.hpp" -#include "lbann/callbacks/callback_check_metric.hpp" -#include "lbann/callbacks/callback_perturb_adam.hpp" +#include "lbann/callbacks/replace_weights.hpp" +#include 
"lbann/callbacks/save_images.hpp" +#include "lbann/callbacks/save_model.hpp" +#include "lbann/callbacks/load_model.hpp" +#include "lbann/callbacks/save_topk_models.hpp" +#include "lbann/callbacks/summary.hpp" +#include "lbann/callbacks/sync_layers.hpp" +#include "lbann/callbacks/timeline.hpp" +#include "lbann/callbacks/timer.hpp" +#include "lbann/callbacks/variable_minibatch.hpp" /// Weights and weight initializers #include "lbann/weights/weights.hpp" @@ -200,7 +214,5 @@ #include "lbann/utils/peek_map.hpp" #include "lbann/utils/stack_trace.hpp" #include "lbann/utils/stack_profiler.hpp" -#include "lbann/utils/threads/thread_pool.hpp" -#include "lbann/utils/threads/thread_utils.hpp" #endif // LBANN_LBANN_HPP_INCLUDED diff --git a/include/lbann/macros/CMakeLists.txt b/include/lbann/macros/CMakeLists.txt new file mode 100644 index 00000000000..45b42256ed3 --- /dev/null +++ b/include/lbann/macros/CMakeLists.txt @@ -0,0 +1,8 @@ +# Add the headers for this directory +set_full_path(THIS_DIR_HEADERS + instantiate.hpp + instantiate_device.hpp + ) + +# Propagate the files up the tree +set(HEADERS "${HEADERS}" "${THIS_DIR_HEADERS}" PARENT_SCOPE) diff --git a/include/lbann/macros/instantiate.hpp b/include/lbann/macros/instantiate.hpp new file mode 100644 index 00000000000..51e3f038476 --- /dev/null +++ b/include/lbann/macros/instantiate.hpp @@ -0,0 +1,40 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +PROTO(float); +PROTO(double); + +#ifdef LBANN_HAS_HALF +#ifdef LBANN_INSTANTIATE_CPU_HALF +PROTO(cpu_fp16); +#endif // LBANN_INSTANTIATE_CPU_HALF +#endif // LBANN_HAS_HALF + +#ifdef LBANN_HAS_GPU_FP16 +#ifdef LBANN_INSTANTIATE_GPU_HALF +PROTO(fp16); +#endif // LBANN_INSTANTIATE_GPU_HALF +#endif // LBANN_HAS_GPU_FP16 diff --git a/include/lbann/macros/instantiate_device.hpp b/include/lbann/macros/instantiate_device.hpp new file mode 100644 index 00000000000..9b3d8dcd5ee --- /dev/null +++ b/include/lbann/macros/instantiate_device.hpp @@ -0,0 +1,43 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. 
+// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#define LBANN_INSTANTIATE_CPU_HALF +#define PROTO(T) \ + PROTO_DEVICE(T, El::Device::CPU) + +#include "lbann/macros/instantiate.hpp" +#undef PROTO +#undef LBANN_INSTANTIATE_CPU_HALF + +#ifdef LBANN_HAS_GPU +#define LBANN_INSTANTIATE_GPU_HALF +#define PROTO(T) \ + PROTO_DEVICE(T, El::Device::GPU) + +#include "lbann/macros/instantiate.hpp" +#undef PROTO +#undef LBANN_INSTANTIATE_GPU_HALF +#endif // LBANN_HAS_GPU diff --git a/include/lbann/metrics/layer_metric.hpp b/include/lbann/metrics/layer_metric.hpp index f0f9c811504..d7700b735c9 100644 --- a/include/lbann/metrics/layer_metric.hpp +++ b/include/lbann/metrics/layer_metric.hpp @@ -47,6 +47,11 @@ class layer_metric : public metric { std::string name() const override; std::string get_unit() const override { return m_unit; } + /** Archive for checkpoint and restart */ + template void serialize( Archive & ar ) { + ar(cereal::base_class(this), CEREAL_NVP(m_name), CEREAL_NVP(m_unit)); + } + /** Set corresponding layer. */ void set_layer(Layer& l); /** Get corresponding layer. */ @@ -59,6 +64,14 @@ class layer_metric : public metric { /** Set list of pointers to layers. */ void set_layer_pointers(std::vector layers) override; + /** Save metric state to checkpoint. */ + bool save_to_checkpoint_shared(persist& p); + /** Load metric state from checkpoint. */ + bool load_from_checkpoint_shared(persist& p); + + bool save_to_checkpoint_distributed(persist& p); + bool load_from_checkpoint_distributed(persist& p); + protected: void setup(model& m) override; @@ -86,7 +99,7 @@ class layer_metric : public metric { Layer* m_layer; /** Get corresponding evaluation layer. */ - abstract_evaluation_layer& get_evaluation_layer(); + /*abstract_evaluation_*/Layer& get_evaluation_layer(); }; diff --git a/include/lbann/metrics/metric.hpp b/include/lbann/metrics/metric.hpp index d270c361bb5..4a8be6c5e98 100644 --- a/include/lbann/metrics/metric.hpp +++ b/include/lbann/metrics/metric.hpp @@ -31,6 +31,8 @@ #include "lbann/comm.hpp" #include "lbann/utils/exception.hpp" #include "lbann/io/persist.hpp" +#include +#include namespace lbann { @@ -56,6 +58,13 @@ struct metric_statistics { metric_statistics& operator=(const metric_statistics& other) = default; /** Destructor. */ ~metric_statistics() = default; + + /** Archive for checkpoint and restart */ + template void serialize( Archive & ar ) { + ar(CEREAL_NVP(m_sum), + CEREAL_NVP(m_num_samples)); + } + /** Add metric value to statistics. */ void add_value(EvalType value, int num_samples = 1); /** Get mean metric value. @@ -67,19 +76,6 @@ struct metric_statistics { int get_num_samples() const { return m_num_samples; } /** Reset statistics. 
*/ void reset(); - - //************************************************************************ - // Checkpointing - //************************************************************************ - /** struct used to serialize mode fields in file and MPI transfer */ - struct packing_header { - double sum; - uint64_t num_samples; - }; - bool pack_scalars(persist& p); - bool unpack_scalars(persist& p, struct packing_header *header); - void unpack_header(struct packing_header& header); - }; /** Abstract base class for metric functions. @@ -102,6 +98,11 @@ class metric { /** Copy function. */ virtual metric* copy() const = 0; + /** Archive for checkpoint and restart */ + template void serialize( Archive & ar ) { + ar(CEREAL_NVP(m_statistics)); + } + /** Return a string name for this metric. */ virtual std::string name() const = 0; /** Return a display unit for this metric. @@ -122,9 +123,13 @@ class metric { virtual EvalType evaluate(execution_mode mode, int mini_batch_size) = 0; /** Clear all statistics. */ - void reset_statistics() { m_statistics.clear(); } + void reset_statistics() { + for (auto& stats : m_statistics) { + stats.second.reset(); + } + } /** Clear statistics for an execution mode. */ - void reset_statistics(execution_mode mode) { m_statistics.erase(mode); } + void reset_statistics(execution_mode mode) { m_statistics[mode].reset(); } /** Get mean metric value. * If mini-batch sizes are not identical, the mean is over the @@ -150,12 +155,12 @@ class metric { } /** Save metric state to checkpoint. */ - virtual bool save_to_checkpoint_shared(persist& p); + virtual bool save_to_checkpoint_shared(persist& p) = 0; /** Load metric state from checkpoint. */ - virtual bool load_from_checkpoint_shared(persist& p); + virtual bool load_from_checkpoint_shared(persist& p) = 0; - virtual bool save_to_checkpoint_distributed(persist& p); - virtual bool load_from_checkpoint_distributed(persist& p); + virtual bool save_to_checkpoint_distributed(persist& p) = 0; + virtual bool load_from_checkpoint_distributed(persist& p) = 0; protected: diff --git a/include/lbann/models/directed_acyclic_graph.hpp b/include/lbann/models/directed_acyclic_graph.hpp index a47c6a8f123..8949bdf248d 100644 --- a/include/lbann/models/directed_acyclic_graph.hpp +++ b/include/lbann/models/directed_acyclic_graph.hpp @@ -30,6 +30,8 @@ #include "lbann/models/model.hpp" #include "lbann/layers/layer.hpp" +#include + namespace lbann { /** Neural network model with a DAG layer graph. 
*/ @@ -37,13 +39,12 @@ class directed_acyclic_graph_model : public model { public: directed_acyclic_graph_model(lbann_comm *comm, - El::Int max_mini_batch_size, - objective_function *obj_fn, - optimizer *default_optimizer); + std::unique_ptr obj_fn, + std::unique_ptr default_optimizer_msg); directed_acyclic_graph_model(const directed_acyclic_graph_model& other) = default; directed_acyclic_graph_model& operator=(const directed_acyclic_graph_model& other) = default; ~directed_acyclic_graph_model() override = default; - directed_acyclic_graph_model* copy() const override { return new directed_acyclic_graph_model(*this); } + std::unique_ptr copy_model() const override { return make_unique(*this); } std::string get_type() const override { return "directed acyclic graph"; } protected: diff --git a/include/lbann/models/model.hpp b/include/lbann/models/model.hpp index 7e8671a5289..de30914d0c8 100644 --- a/include/lbann/models/model.hpp +++ b/include/lbann/models/model.hpp @@ -30,24 +30,42 @@ #include "lbann/base.hpp" #include "lbann/comm.hpp" #include "lbann/layers/layer.hpp" +#include "lbann/data_coordinator/data_coordinator_metadata.hpp" +#include "lbann/execution_contexts/execution_context.hpp" #include "lbann/utils/summary.hpp" #include "lbann/utils/graph.hpp" #include "lbann/io/file_io.hpp" #include "lbann/io/persist.hpp" -#include "lbann/objective_functions/objective_function.hpp" #include "lbann/metrics/metric.hpp" -#include "lbann/weights/weights.hpp" +#include "lbann/objective_functions/objective_function.hpp" #include "lbann/optimizers/optimizer.hpp" +#include "lbann/proto/factories.hpp" +#include "lbann/weights/weights.hpp" #include "lbann/utils/threads/thread_pool.hpp" -#include +#include + +// Note (trb): There's what is, IMO, an STL error in GCC in which the +// dtor for unique_ptr is checking sizeof(T), so this must be a +// complete type. Sigh. (The greater implication of this is that you +// cannot have `unique_ptr` as a drop-in for +// `IncompleteType*`, which is annoying. +#include + #include #include #include +// Forward-declare protobuf class +namespace lbann_data { +class Model; +} + namespace lbann { // Forward declarations class lbann_callback; +class training_algorithm; +class callback_base; /** @brief Abstract base class for neural network models. */ class model { @@ -58,13 +76,17 @@ class model { // =========================================== model(lbann_comm* comm, - El::Int mini_batch_size, - objective_function* obj_fn, - optimizer* default_optimizer = nullptr); + std::unique_ptr obj_fn, + std::unique_ptr default_optimizer_msg = nullptr); model(const model& other); model& operator=(const model& other); virtual ~model(); - virtual model* copy() const = 0; + virtual std::unique_ptr copy_model() const = 0; + + /** Archive for checkpoint and restart */ + template void serialize(Archive & ar) { + ar(CEREAL_NVP(*m_objective_function)); + } // =========================================== // Access functions @@ -91,8 +113,8 @@ class model { virtual description get_description() const; /** @brief Mathematical function to be minimized during training. */ - objective_function* get_objective_function() const { - return m_objective_function; + observer_ptr get_objective_function() const { + return m_objective_function.get(); } /** @brief Return the model's metrics. */ @@ -120,63 +142,40 @@ class model { std::vector get_weights(); /** @brief Get the list of callbacks for the model. 
*/ - virtual std::vector& get_callbacks() { - return m_callbacks; + virtual std::vector> get_callbacks() { + std::vector> callback_list; + callback_list.reserve(m_callbacks.size()); + for (const auto& ptr : m_callbacks) { + callback_list.push_back(ptr.get()); + } + return callback_list; } - /** @brief Return the I/O thread pool */ - std::shared_ptr get_io_thread_pool() { return m_io_thread_pool; } + virtual std::vector>& get_callbacks_with_ownership() { + return m_callbacks; + } /** @brief Get the model's comm. */ - inline lbann_comm *get_comm() const { + lbann_comm *get_comm() const { return m_comm; } - void set_execution_mode(execution_mode mode); - execution_mode get_execution_mode() const noexcept; - - /** @brief Number of times the training set has been traversed. */ - inline El::Int get_epoch() const noexcept { return m_epoch; } - - /** @brief Current mini-batch step for current execution mode. - * @details Step counts are not reset after each epoch. - */ - El::Int get_step() const noexcept; - - /** @brief Current mini-batch step for given execution mode. - * @details Step counts are not reset after each epoch. - */ - El::Int get_step(execution_mode mode) const noexcept; - - /** @brief Set the model's current mini-batch size. */ - inline void set_current_mini_batch_size(int mini_batch_size) { - m_current_mini_batch_size = mini_batch_size; - } - /** @brief Get the model's current mini-batch size. */ - inline int get_current_mini_batch_size() const { - return m_current_mini_batch_size; - } - /** @brief Get the model's maximum mini-batch size. */ - inline int get_max_mini_batch_size() const { - return m_max_mini_batch_size; + /** Check to see if there is a valid training context for the model */ + bool has_valid_execution_context() const { + return (m_execution_context != nullptr); } - /** @brief Get the model's effective mini-batch size. */ - inline int get_effective_mini_batch_size() const { - return m_effective_mini_batch_size; - } - /** @brief Set the model's effective mini-batch size. */ - inline void set_effective_mini_batch_size(int mini_batch_size) { - m_effective_mini_batch_size = mini_batch_size; - } - int get_num_iterations_per_epoch(execution_mode mode) const; - /** @brief Return true if the flag to stop training is set. */ - bool get_terminate_training() const { - return m_terminate_training; + /** Grab the training context of the model */ + const execution_context& get_execution_context() const { + if(m_execution_context == nullptr) { + LBANN_ERROR("execution context is not set"); + } + return *m_execution_context; } - /** @brief Set the terminate training flag (on or off). */ - void set_terminate_training(bool f) { - m_terminate_training = f; + + /** Grab the training context of the model */ + execution_context& get_execution_context() { + return const_cast(static_cast(*this).get_execution_context()); } // =========================================== @@ -187,10 +186,13 @@ class model { virtual void add_layer(std::unique_ptr l); /** @brief Add weights to model. */ - void add_weights(weights *w); + void add_weights(std::unique_ptr w); + + /** @brief Register a new callback for the model. */ + void add_callback(std::shared_ptr cb); /** @brief Register a new callback for the model. */ - void add_callback(lbann_callback *cb); + // void add_callbacks(std::vector>& cb); /** @brief Register a new metric for the model. */ void add_metric(metric *m); @@ -209,7 +211,14 @@ class model { * * If there is no default optimizer, a null pointer is returned. 
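Aside, for illustration only (not part of the patch): the replacement factory sketched just below keeps the default optimizer as a stored message prototype and stamps out a typed, owned optimizer on demand, returning null when no default was configured. The sketch here uses made-up names (opt_message, toy_optimizer, toy_model) and a plain struct in place of the protobuf message; LBANN's real implementation delegates to proto::construct_optimizer, as the patch shows.

#include <memory>
#include <string>

// Stand-in for the stored optimizer protobuf message.
struct opt_message { std::string type = "sgd"; double learn_rate = 0.01; };

template <typename T>
struct toy_optimizer { T learn_rate; };

class toy_model {
public:
  explicit toy_model(std::unique_ptr<opt_message> msg = nullptr)
    : m_default_optimizer_msg(std::move(msg)) {}

  // Same shape as the patched create_optimizer(): templated on the data
  // type, returns ownership, and yields nullptr if there is no default.
  template <typename T>
  std::unique_ptr<toy_optimizer<T>> create_optimizer() const {
    if (m_default_optimizer_msg) {
      auto opt = std::make_unique<toy_optimizer<T>>();
      opt->learn_rate = static_cast<T>(m_default_optimizer_msg->learn_rate);
      return opt;
    }
    return nullptr;
  }

private:
  std::unique_ptr<opt_message> m_default_optimizer_msg;
};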
*/ - optimizer* create_optimizer() const; + template + std::unique_ptr create_optimizer() const + { + if (m_default_optimizer_msg) + return proto::construct_optimizer( + *m_default_optimizer_msg); + return nullptr; + } /** @brief Set a flag that can be used to enable / disable the * background I/O activities @@ -219,27 +228,15 @@ class model { /** @brief Are background I/O activities enabled by the input layers */ bool background_io_activity_allowed() { return m_background_io_allowed; } + size_t get_num_iterations_per_epoch(execution_mode mode) const; + // =========================================== // Setup // =========================================== /** @details Must be called after model specification and before * execution. */ - virtual void setup(std::shared_ptr io_thread_pool); - - // =========================================== - // Execution - // =========================================== - - /** @brief Evaluate model. */ - virtual void evaluate(execution_mode mode, int num_batches=0); - - /** @brief Train model. */ - virtual void train(int num_epochs, int num_batches=0); - - /** @brief Complete any background I/O data fetch for the execution - mode requested */ - virtual void collect_background_data_fetch(execution_mode mode); + virtual void setup(size_t max_mini_batch_size, DataReaderMetaData& dr_metadata); virtual void make_data_store_preloaded(execution_mode mode); @@ -287,9 +284,6 @@ class model { protected: - /** @brief Check if the model execution mode is valid. */ - virtual bool is_execution_mode_valid(execution_mode mode) const; - /** @brief Reorder layer list with a gather. * * The new layer list is the same length as @c gather_indices and @@ -339,7 +333,7 @@ class model { * * Called in setup function. */ - virtual void setup_layers(); + virtual void setup_layers(size_t max_mini_batch_size, DataReaderMetaData& dr_metadata); /** @brief Set up weights. * * Called in setup function. All weights being used by layers or @@ -348,19 +342,31 @@ class model { */ virtual void setup_weights(); +public: + // =========================================== + // Execution + // =========================================== + /** @brief Reset model pointer and execution mode. */ - virtual void reset_mode_and_model(execution_mode mode); + virtual void reset_mode(execution_context& context, execution_mode mode); /** @brief Reset model statistics for an epoch. */ virtual void reset_epoch_statistics(execution_mode mode); - /** @brief Evaluate model on a mini-batch */ - virtual bool evaluate_mini_batch(execution_mode mode); - /** @brief Train model on a mini-batch. */ - virtual bool train_mini_batch(); + + /** @brief Check if the trainer execution mode is valid for this model. + @todo this should be moved to the trainer when the data readers move. */ + virtual bool is_execution_mode_valid(execution_mode mode) const; + + /** @brief Complete any background I/O data fetch for the execution + mode requested */ + virtual void collect_background_data_fetch(execution_mode mode); /** @brief Forward propagation step. */ virtual void forward_prop(execution_mode mode); /** @brief Backward propagation step. */ virtual void backward_prop(); + /** Evaluate any metrics in the model */ + virtual void evaluate_metrics(execution_mode mode, + size_t current_mini_batch_size); /** @brief Clear each optimizer's gradient. 
* * This must be called before training forward prop since layers @@ -382,22 +388,8 @@ class model { // Callbacks // =========================================== - /** @brief Execute callbacks at start of training. */ - virtual void do_train_begin_cbs(); - /** @brief Execute callbacks at end of training. */ - virtual void do_train_end_cbs(); - /** @brief Execute callbacks at start of evaluation. */ - virtual void do_evaluate_begin_cbs(execution_mode mode); - /** @brief Execute callbacks at end of evaluation. */ - virtual void do_evaluate_end_cbs(execution_mode mode); - /** @brief Execute callbacks at start of epoch. */ - virtual void do_epoch_begin_cbs(); - /** @brief Execute callbacks at end of epoch. */ - virtual void do_epoch_end_cbs(); - /** @brief Execute callbacks at start of mini-batch. */ - virtual void do_batch_begin_cbs(execution_mode mode); - /** @brief Execute callbacks at end of mini-batch. */ - virtual void do_batch_end_cbs(execution_mode mode); + /** @brief Execute callbacks at end of setup. */ + virtual void do_setup_end_cbs(); /** @brief Execute callbacks at start of model forward propagation. */ virtual void do_model_forward_prop_begin_cbs(execution_mode mode); /** @brief Execute callbacks at end of model forward propagation. */ @@ -425,6 +417,9 @@ class model { private: + /** Pointer to the execution context object used for training or evaluating this model */ + observer_ptr m_execution_context; + /** @brief LBANN communicator. */ lbann_comm* m_comm; @@ -434,54 +429,23 @@ class model { */ std::string m_name; - /** @brief Current execution mode. */ - execution_mode m_execution_mode = execution_mode::training; - - /** @brief Number of times the training data set has been traversed. */ - El::Int m_epoch = 0; - - /** @brief Number of mini-batch steps performed. - * @details Step counts are not reset after each epoch. - */ - std::map m_step; - - /** @brief Whether to terminate training. - * @details If true, training will terminate immediately before - * the next epoch. - */ - bool m_terminate_training = false; - - /** @brief Size of the current mini-batch in the model. */ - int m_current_mini_batch_size; - /** @details Maximum possible minibatch size supported by layers in - * this model. Note that this is local to the particular model, - * not across multiple models. - */ - int m_max_mini_batch_size; - /** @brief The "effective" size of a minibatch. - * - * This is the size of the minibatch across all models and used for - * e.g. correctly averaging gradients from multiple models. - */ - int m_effective_mini_batch_size; - /** @brief Tensor operations. * @details The list is in execution order for forward propagation. */ std::vector> m_layers; /** @brief Trainable parameters. */ - std::vector m_weights; + std::vector> m_weights; /** @details If a layer needs to construct an optimizer during * setup, it will make a copy of the default optimizer. This object * is just used to create copies and is not actually used for * optimization. */ - optimizer* m_default_optimizer = nullptr; + std::unique_ptr m_default_optimizer_msg; /** @brief Mathematical function to be minimized during training. */ - objective_function* m_objective_function; + std::unique_ptr m_objective_function; /** @brief Numerical quantities to evaluate model performance. * @details Does not affect training. @@ -489,14 +453,16 @@ class model { std::vector m_metrics; /** @brief Current callbacks to process. 
*/ - std::vector m_callbacks; - - /** @brief Threads available for I/O */ - std::shared_ptr m_io_thread_pool; + std::vector> m_callbacks; /** @brief Flag that allows input layers to fetch data in the background */ bool m_background_io_allowed = true; + /** @brief Is the model setup + * @details Flag to indicate if the setup function has been called + */ + bool m_model_is_setup = false; + // =========================================== // Functions to add utility layers // =========================================== @@ -534,6 +500,11 @@ class model { */ void add_split_layers(std::unordered_set& layer_names); +#ifdef LBANN_HAS_DISTCONV + void setup_distconv(); + void setup_distributions(); + void print_distributions() const; +#endif // LBANN_HAS_DISTCONV }; } // namespace lbann diff --git a/include/lbann/objective_functions/layer_term.hpp b/include/lbann/objective_functions/layer_term.hpp index 7a3622537fe..7d8aa1508a5 100644 --- a/include/lbann/objective_functions/layer_term.hpp +++ b/include/lbann/objective_functions/layer_term.hpp @@ -58,7 +58,7 @@ class layer_term : public objective_function_term { private: /** Get corresponding evaluation layer. */ - abstract_evaluation_layer& get_evaluation_layer(); + /*abstract_evaluation_*/Layer& get_evaluation_layer(); }; diff --git a/include/lbann/objective_functions/objective_function.hpp b/include/lbann/objective_functions/objective_function.hpp index 9e0195bdb80..ad55f33cd13 100644 --- a/include/lbann/objective_functions/objective_function.hpp +++ b/include/lbann/objective_functions/objective_function.hpp @@ -48,6 +48,17 @@ class objective_function { /** Copy function. */ objective_function* copy() const { return new objective_function(*this); } + /** Archive for checkpoint and restart */ + template void serialize( Archive & ar ) { + ar(CEREAL_NVP(m_statistics)); + + // Serialized each objective function term object explicitly, not the pointer to + // the objective function term + for(auto&& t : m_terms) { + ar(CEREAL_NVP(*t)); + } + } + /** Add a term to the objective function. * The objective function takes ownership of the objective function * term and deallocates it during destruction. @@ -84,9 +95,13 @@ class objective_function { void compute_weight_regularization(); /** Clear all statistics. */ - void reset_statistics() { m_statistics.clear(); } + void reset_statistics() { + for (auto& stats : m_statistics) { + stats.second.reset(); + } + } /** Clear statistics for an execution mode. */ - void reset_statistics(execution_mode mode) { m_statistics.erase(mode); } + void reset_statistics(execution_mode mode) { m_statistics[mode].reset(); } /** Get mean objective function value. * This is a weighted average such that each mini-batch sample makes diff --git a/include/lbann/objective_functions/objective_function_term.hpp b/include/lbann/objective_functions/objective_function_term.hpp index 1fa13bff220..e5f4546729b 100644 --- a/include/lbann/objective_functions/objective_function_term.hpp +++ b/include/lbann/objective_functions/objective_function_term.hpp @@ -49,6 +49,11 @@ class objective_function_term { /** Copy function. */ virtual objective_function_term* copy() const = 0; + /** Archive for checkpoint and restart */ + template void serialize( Archive & ar ) { + ar(CEREAL_NVP(m_scale_factor)); + } + /** Get the name of the objective function term. 
*/ virtual std::string name() const = 0; diff --git a/include/lbann/objective_functions/weight_regularization/l2.hpp b/include/lbann/objective_functions/weight_regularization/l2.hpp index d8ef6fa47c0..75c4eece8f8 100644 --- a/include/lbann/objective_functions/weight_regularization/l2.hpp +++ b/include/lbann/objective_functions/weight_regularization/l2.hpp @@ -31,6 +31,9 @@ namespace lbann { +template class data_type_optimizer; +template class data_type_weights; + /** @class l2_weight_regularization * @brief Apply L2 regularization to a set of weights. * @@ -40,12 +43,29 @@ namespace lbann { */ class l2_weight_regularization : public objective_function_term { public: + using AccumulateDataType = DataType; + + using OptimizerType = data_type_optimizer; + + using WeightsType = data_type_weights; + + template + using DMatType = El::Matrix; + using CPUMatType = DMatType; + +public: /** @param scale_factor The objective function term is * @f$ \text{scale\_factor} \times \sum L2(w_i) @f$ */ l2_weight_regularization(EvalType scale_factor = 1); l2_weight_regularization* copy() const override { return new l2_weight_regularization(*this); } + + /** Archive for checkpoint and restart */ + template void serialize( Archive & ar ) { + ar(cereal::base_class(this)); + } + std::string name() const override { return "L2 weight regularization"; } void setup(model& m) override; void start_evaluation() override; @@ -69,7 +89,7 @@ class l2_weight_regularization : public objective_function_term { private: /** Contributions to evaluated value. */ - std::map m_contributions; + std::map m_contributions; /** For non-blocking allreduces. */ Al::request m_allreduce_req; @@ -85,8 +105,8 @@ class l2_weight_regularization : public objective_function_term { * accumulation variable. */ template - static void accumulate_contribution(const DMat& vals, - DMat& contribution); + static void accumulate_contribution(const DMatType& vals, + DMatType& contribution); }; diff --git a/include/lbann/optimizers/CMakeLists.txt b/include/lbann/optimizers/CMakeLists.txt index 877cc8bb815..3147d074337 100644 --- a/include/lbann/optimizers/CMakeLists.txt +++ b/include/lbann/optimizers/CMakeLists.txt @@ -2,6 +2,7 @@ set_full_path(THIS_DIR_HEADERS adagrad.hpp adam.hpp + data_type_optimizer.hpp hypergradient_adam.hpp optimizer.hpp rmsprop.hpp diff --git a/include/lbann/optimizers/adagrad.hpp b/include/lbann/optimizers/adagrad.hpp index 9a5cc8adbe6..c0255ed7eac 100644 --- a/include/lbann/optimizers/adagrad.hpp +++ b/include/lbann/optimizers/adagrad.hpp @@ -27,7 +27,9 @@ #ifndef LBANN_OPTIMIZERS_ADAGRAD_HPP_INCLUDED #define LBANN_OPTIMIZERS_ADAGRAD_HPP_INCLUDED -#include "lbann/optimizers/optimizer.hpp" +#include "lbann/optimizers/data_type_optimizer.hpp" +#include "lbann/io/persist.hpp" +#include namespace lbann { @@ -39,39 +41,63 @@ namespace lbann { * methods for online learning and stochastic optimization." Journal * of Machine Learning Research 12, no. Jul (2011): 2121-2159. */ -class adagrad : public optimizer { +template +class adagrad : public Cloneable, + data_type_optimizer> { + using BaseType = Cloneable, + data_type_optimizer>; public: + /** @name Public Types */ + ///@{ - adagrad(lbann_comm* comm, DataType learning_rate, DataType eps = 1e-8); + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The optimizer base type of this object. */ + using OptimizerType = data_type_optimizer; + + /** @brief The concrete weights type used by this object. 
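Several classes in this patch gain a serialize() member for checkpoint and restart; the pattern is plain Cereal, with derived classes archiving their base through cereal::base_class before their own members. A standalone sketch with toy types (not the LBANN classes):

    #include <cereal/archives/binary.hpp>
    #include <cereal/types/base_class.hpp>
    #include <iostream>
    #include <sstream>

    struct term {
      double scale_factor = 1.0;
      template <class Archive> void serialize(Archive& ar) {
        ar(CEREAL_NVP(scale_factor));
      }
    };

    struct l2_like_term : term {
      double eps = 1e-8;
      template <class Archive> void serialize(Archive& ar) {
        // Archive the base-class state first, then this class's members.
        ar(cereal::base_class<term>(this), CEREAL_NVP(eps));
      }
    };

    int main() {
      std::stringstream stream;
      l2_like_term saved; saved.scale_factor = 0.5; saved.eps = 1e-6;
      { cereal::BinaryOutputArchive ar(stream); ar(saved); }
      l2_like_term restored;
      { cereal::BinaryInputArchive ar(stream); ar(restored); }
      std::cout << restored.scale_factor << " " << restored.eps << "\n";  // 0.5 1e-06
    }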
*/ + using WeightsType = data_type_weights; + + ///@} + +public: + + adagrad(TensorDataType learning_rate, TensorDataType eps = 1e-8); adagrad(const adagrad& other); adagrad& operator=(const adagrad& other); ~adagrad() override = default; - adagrad* copy() const override { return new adagrad(*this); } + + /** Archive for checkpoint and restart */ + template void serialize(Archive & ar) { + ar(cereal::base_class>(this), + CEREAL_NVP(m_eps)); + } /** Human-readable type name. */ std::string get_type() const override { return "AdaGrad"; } /** Human-readable description. */ description get_description() const override; - void setup(weights* w = nullptr) override; + void setup(WeightsType* w = nullptr) override; protected: /** Computation for an optimization step. */ - void step_compute(AbsDistMat& values, const AbsDistMat& gradient) override; + void step_compute(AbsDistMatrixType& values, const AbsDistMatrixType& gradient) override; private: /** Small factor to avoid division by zero. */ - DataType m_eps; + TensorDataType m_eps; /** AdaGrad cache. */ - std::unique_ptr m_cache; + std::unique_ptr m_cache; /** CPU implementation of optimization step. */ - void step_compute_cpu(AbsDistMat& values, const AbsDistMat& gradient); + void step_compute_cpu(AbsDistMatrixType& values, const AbsDistMatrixType& gradient); #ifdef LBANN_HAS_CUDNN /** GPU implementation of optimization step. */ - void step_compute_gpu(AbsDistMat& values, const AbsDistMat& gradient); + void step_compute_gpu(AbsDistMatrixType& values, const AbsDistMatrixType& gradient); #endif // LBANN_HAS_CUDNN // =========================================== @@ -85,6 +111,11 @@ class adagrad : public optimizer { }; +template +std::unique_ptr +build_adagrad_optimizer_from_pbuf( + google::protobuf::Message const&); + } // namespace lbann #endif // LBANN_OPTIMIZERS_ADAGRAD_HPP_INCLUDED diff --git a/include/lbann/optimizers/adam.hpp b/include/lbann/optimizers/adam.hpp index 696c8416599..e64c67929f1 100644 --- a/include/lbann/optimizers/adam.hpp +++ b/include/lbann/optimizers/adam.hpp @@ -27,9 +27,16 @@ #ifndef LBANN_OPTIMIZERS_ADAM_HPP_INCLUDED #define LBANN_OPTIMIZERS_ADAM_HPP_INCLUDED -#include "lbann/optimizers/optimizer.hpp" +#include "lbann/optimizers/data_type_optimizer.hpp" +#include "lbann/io/persist.hpp" +#include +#include +//#include namespace lbann { +namespace callback { +class perturb_adam; +} // namespace callback /** @brief Adam optimizer. * @@ -38,22 +45,48 @@ namespace lbann { * Diederik P. Kingma and Jimmy Ba. "Adam: A method for stochastic * optimization." arXiv preprint arXiv:1412.6980 (2014). */ -class adam : public optimizer { +template +class adam : public Cloneable, + data_type_optimizer> { + using BaseType = Cloneable, + data_type_optimizer>; +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The optimizer base type of this object. */ + using OptimizerType = data_type_optimizer; + + /** @brief The concrete weights type used by this object. 
*/ + using WeightsType = data_type_weights; + + ///@} + public: /** @name Life cycle functions */ ///@{ - adam(lbann_comm* comm, - DataType learning_rate, - DataType beta1 = 0.9, - DataType beta2 = 0.99, - DataType eps = 1e-8); + adam(TensorDataType learning_rate, + TensorDataType beta1 = 0.9, + TensorDataType beta2 = 0.99, + TensorDataType eps = 1e-8); adam(const adam& other); adam& operator=(const adam& other); ~adam() = default; - adam* copy() const override { return new adam(*this); } + /** Archive for checkpoint and restart */ + template void serialize(Archive & ar) { + ar(cereal::base_class>(this), + CEREAL_NVP(m_beta1), + CEREAL_NVP(m_beta2), + CEREAL_NVP(m_eps), + CEREAL_NVP(m_current_beta1), + CEREAL_NVP(m_current_beta2)); + } ///@} /** @name Descriptions */ @@ -70,132 +103,91 @@ class adam : public optimizer { ///@{ /** Update factor for first moment estimate. */ - DataType get_beta1() const noexcept { return m_beta1; } + TensorDataType get_beta1() const noexcept { return m_beta1; } /** Update factor for first moment estimate. */ - void set_beta1(DataType beta1) { m_beta1 = beta1; } + void set_beta1(TensorDataType beta1) { m_beta1 = beta1; } /** Update factor for second moment estimate. */ - DataType get_beta2() const noexcept { return m_beta2; } + TensorDataType get_beta2() const noexcept { return m_beta2; } /** Update factor for second moment estimate. */ - void set_beta2(DataType beta2) { m_beta2 = beta2; } + void set_beta2(TensorDataType beta2) { m_beta2 = beta2; } /** Small factor to avoid division by zero. */ - DataType get_eps() const noexcept { return m_eps; } + TensorDataType get_eps() const noexcept { return m_eps; } /** Small factor to avoid division by zero. */ - void set_eps(DataType eps) { m_eps = eps; } + void set_eps(TensorDataType eps) { m_eps = eps; } /** First moment estimates. */ - const AbsDistMat& get_moment1() const; + const AbsDistMatrixType& get_moment1() const; /** First moment estimates. */ - AbsDistMat& get_moment1(); + AbsDistMatrixType& get_moment1(); /** Second moment estimates. */ - const AbsDistMat& get_moment2() const; + const AbsDistMatrixType& get_moment2() const; /** Second moment estimates. */ - AbsDistMat& get_moment2(); + AbsDistMatrixType& get_moment2(); /** beta1 ^ iteration. * @todo This probably shouldn't be exposed. */ - DataType get_current_beta1() const noexcept { return m_current_beta1; } + TensorDataType get_current_beta1() const noexcept { return m_current_beta1; } /** beta1 ^ iteration. * @todo This probably shouldn't be exposed. */ - void set_current_beta1(DataType current_beta1) { m_current_beta1 = current_beta1; } + void set_current_beta1(TensorDataType current_beta1) { m_current_beta1 = current_beta1; } /** beta2 ^ iteration. * @todo This probably shouldn't be exposed. */ - DataType get_current_beta2() const noexcept { return m_current_beta2; } + TensorDataType get_current_beta2() const noexcept { return m_current_beta2; } /** beta2 ^ iteration. * @todo This probably shouldn't be exposed. */ - void set_current_beta2(DataType current_beta2) { m_current_beta2 = current_beta2; } + void set_current_beta2(TensorDataType current_beta2) { m_current_beta2 = current_beta2; } ///@} /** @name Setup */ ///@{ - void setup(weights* w = nullptr) override; + void setup(WeightsType* w = nullptr) override; ///@} protected: /** Computation for an optimization step. 
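For reference, the hyperparameters above drive the usual Adam update. A hedged scalar sketch of one step, using the single-factor bias-correction form from the Adam paper rather than LBANN's actual kernel:

    #include <cmath>
    #include <cstdio>

    struct adam_state { double m = 0, v = 0, beta1_t = 1, beta2_t = 1; };

    double adam_step(double w, double g, adam_state& s, double lr,
                     double beta1 = 0.9, double beta2 = 0.99, double eps = 1e-8) {
      s.m = beta1 * s.m + (1 - beta1) * g;      // first moment estimate
      s.v = beta2 * s.v + (1 - beta2) * g * g;  // second moment estimate
      s.beta1_t *= beta1;                        // beta1 ^ iteration
      s.beta2_t *= beta2;                        // beta2 ^ iteration
      // Bias correction folded into a single step-size factor.
      const double correction = std::sqrt(1 - s.beta2_t) / (1 - s.beta1_t);
      return w - lr * correction * s.m / (std::sqrt(s.v) + eps);
    }

    int main() {
      adam_state s;
      double w = 1.0;
      for (int i = 0; i < 3; ++i) { w = adam_step(w, /*gradient=*/2.0 * w, s, 0.1); }
      std::printf("%f\n", w);
    }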
*/ - void step_compute(AbsDistMat& values, - const AbsDistMat& gradient) override; + void step_compute(AbsDistMatrixType& values, + const AbsDistMatrixType& gradient) override; private: /** Update factor for first moment estimate. */ - DataType m_beta1; + TensorDataType m_beta1; /** Update factor for second moment estimate. */ - DataType m_beta2; + TensorDataType m_beta2; /** Small factor to avoid division by zero. */ - DataType m_eps; + TensorDataType m_eps; /** beta1 ^ iteration. */ - DataType m_current_beta1 = 1; + TensorDataType m_current_beta1 = TensorDataType(1.); /** beta2 ^ iteration. */ - DataType m_current_beta2 = 1; + TensorDataType m_current_beta2 = TensorDataType(1.); /** First moment estimates. */ - std::unique_ptr m_moment1; + std::unique_ptr m_moment1; /** Second moment estimates. */ - std::unique_ptr m_moment2; + std::unique_ptr m_moment2; /** Hyperparameter exploration. */ - friend class lbann_callback_perturb_adam; + friend class callback::perturb_adam; /** CPU implementation of optimization step. */ - void step_compute_cpu(AbsDistMat& values, const AbsDistMat& gradient); + void step_compute_cpu(AbsDistMatrixType& values, const AbsDistMatrixType& gradient, + const TensorDataType& correction); #ifdef LBANN_HAS_CUDA /** GPU implementation of optimization step. */ - void step_compute_gpu(AbsDistMat& values, const AbsDistMat& gradient); + void step_compute_gpu(AbsDistMatrixType& values, const AbsDistMatrixType& gradient, + const TensorDataType& correction); #endif // LBANN_HAS_CUDA /** @name Checkpointing */ ///@{ - /* struct used to serialize mode fields in file and MPI transfer */ - struct packing_header { - DataType beta1; - DataType beta2; - DataType eps; - DataType current_beta1; - DataType current_beta2; - }; - - bool pack_scalars(persist& p) { - p.write_datatype(persist_type::train, "beta1", m_beta1); - p.write_datatype(persist_type::train, "beta2", m_beta2); - p.write_datatype(persist_type::train, "eps", m_eps); - p.write_datatype(persist_type::train, "current_beta1", m_current_beta1); - p.write_datatype(persist_type::train, "current_beta2", m_current_beta2); - return true; - } - - bool unpack_scalars(persist& p, struct packing_header *header) { - p.read_datatype(persist_type::train, "beta1", &m_beta1); - p.read_datatype(persist_type::train, "beta2", &m_beta2); - p.read_datatype(persist_type::train, "eps", &m_eps); - p.read_datatype(persist_type::train, "current_beta1", &m_current_beta1); - p.read_datatype(persist_type::train, "current_beta2", &m_current_beta2); - - if(header != nullptr) { - header->beta1 = m_beta1; - header->beta2 = m_beta2; - header->eps = m_eps; - header->current_beta1 = m_current_beta1; - header->current_beta2 = m_current_beta2; - } - return true; - } - - void unpack_header(struct packing_header& header) { - m_beta1 = header.beta1; - m_beta2 = header.beta2; - m_eps = header.eps; - m_current_beta1 = header.current_beta1; - m_current_beta2 = header.current_beta2; - } - bool save_to_checkpoint_shared(persist& p, std::string m_name) override; bool load_from_checkpoint_shared(persist& p, std::string m_name) override; bool save_to_checkpoint_distributed(persist& p, std::string m_name) override; @@ -205,6 +197,11 @@ class adam : public optimizer { }; +template +std::unique_ptr +build_adam_optimizer_from_pbuf( + google::protobuf::Message const&); + } // namespace lbann #endif // LBANN_OPTIMIZERS_ADAM_HPP_INCLUDED diff --git a/include/lbann/optimizers/data_type_optimizer.hpp b/include/lbann/optimizers/data_type_optimizer.hpp new file mode 100644 index 
00000000000..9aadbbffde3 --- /dev/null +++ b/include/lbann/optimizers/data_type_optimizer.hpp @@ -0,0 +1,180 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_OPTIMIZERS_DATA_TYPE_OPTIMIZER_HPP_INCLUDED +#define LBANN_OPTIMIZERS_DATA_TYPE_OPTIMIZER_HPP_INCLUDED + +#include "lbann/optimizers/optimizer.hpp" + +namespace lbann { + +// Forward declarations +template +class data_type_weights; + +template +class data_type_optimizer + : public Cloneable< + HasAbstractFunction>, + optimizer> { + + using BaseType = + Cloneable>, + optimizer>; + + friend class data_type_weights; + +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The concrete weights type used by this object. */ + using WeightsType = data_type_weights; + + ///@} + +public: + data_type_optimizer(TensorDataType learning_rate = 0); + virtual ~data_type_optimizer() = default; + + /** @brief Human-readable description. */ + virtual description get_description() const override; + + /** @brief Must be called before training. + * + * @param w Weights being optimized. If null, no change is made to + * the weights. + */ + virtual void setup(data_type_weights* w = nullptr); + + /** @name Weights management */ + ///@{ + + /** @brief Weights being optimized. */ + data_type_weights& get_weights(); + /** @brief Weights being optimized. */ + const data_type_weights& get_weights() const; + /** @brief Weights being optimized. */ + void set_weights(data_type_weights* w) { m_weights = w; } + + ///@} + /** @name Gradient update management */ + ///@{ + + /** @brief Objective function gradient w.r.t. the weights. + * + * An allreduce may be launched and/or synchronized if needed. + */ + AbsDistMatrixType& get_gradient(); + + /** @brief Optimization step. */ + void step() override; + ///@} + + /** @brief Access the scaling factor for optimization step sizes. */ + TensorDataType get_learning_rate() const; + /** @brief Set the scaling factor for optimization step sizes. 
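data_type_optimizer and the concrete optimizers in this patch replace hand-written copy() overrides with the Cloneable CRTP mixin from lbann/utils/cloneable.hpp. A generic sketch of the covariant-clone idiom under assumed names (the real mixin also supports abstract bases via HasAbstractFunction):

    #include <iostream>
    #include <memory>

    struct shape {
      virtual ~shape() = default;
      virtual std::unique_ptr<shape> clone() const = 0;
      virtual double area() const = 0;
    };

    // CRTP mixin: any concrete class deriving from cloneable<Derived, Base>
    // gets a correct clone() without writing it by hand.
    template <typename Derived, typename Base>
    struct cloneable : Base {
      std::unique_ptr<Base> clone() const override {
        return std::make_unique<Derived>(static_cast<const Derived&>(*this));
      }
    };

    struct square : cloneable<square, shape> {
      double side = 1.0;
      double area() const override { return side * side; }
    };

    int main() {
      square s; s.side = 3.0;
      std::unique_ptr<shape> copy = s.clone();
      std::cout << copy->area() << "\n";  // prints 9
    }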
*/ + void set_learning_rate(TensorDataType learning_rate); + + /** @name Checkpointing functionality */ + ///@{ + /** @brief Archive for checkpoint and restart */ + template + void serialize(Archive & ar) { + ar(cereal::base_class(this), CEREAL_NVP(m_learning_rate)); + } + ///@} + +protected: + + data_type_optimizer(const data_type_optimizer& other); + data_type_optimizer& operator=(const data_type_optimizer& other); + + /** @brief Computation for an optimization step. + * + * @c values and @c gradient can be assumed to have the same + * distribution. + */ + virtual void step_compute(AbsDistMatrixType& values, + const AbsDistMatrixType& gradient) = 0; + + /** @brief Get the info needed to construct a new gradient matrix. + * @return Tuple of height, width, and DistData. + */ + std::tuple get_matrix_info() const final; + +private: + + /** @brief Weights being optimized. */ + data_type_weights* m_weights = nullptr; + + /** @brief Objective function gradient w.r.t. weights. */ + std::unique_ptr m_gradient; + + /** @brief Workspace matrix. + * + * Helps ensure gradient contributions are in the right + * distribution. Most of the time, this should just be a matrix + * view. + */ + std::unique_ptr m_gradient_v; + + /** @brief Communication request object for gradient allreduce. + * + * Used to synchronize non-blocking allreduce. + */ + Al::request m_gradient_allreduce_req; + + /** @brief Scaling factor for optimization step sizes. + * + * This is not used by the base optimizer class, but is currently + * used by all derived optimizer classes. There are several cases + * where it is convenient to expose this in the base class, + * e.g. for variable learning rate schedules. + * + * @todo Consider moving this to the derived classes. + */ + TensorDataType m_learning_rate; +}; + +#ifndef LBANN_DATA_TYPE_OPTIMIZER_INSTANTIATE +#define PROTO(T) \ + extern template class data_type_optimizer + +#define LBANN_INSTANTIATE_CPU_HALF +#define LBANN_INSTANTIATE_GPU_HALF +#include "lbann/macros/instantiate.hpp" +#undef PROTO +#undef LBANN_INSTANTIATE_CPU_HALF +#undef LBANN_INSTANTIATE_GPU_HALF +#endif // LBANN_DATA_TYPE_OPTIMIZER_INSTANTIATE + +} // namespace lbann + +#endif // LBANN_OPTIMIZERS_DATA_TYPE_OPTIMIZER_HPP_INCLUDED diff --git a/include/lbann/optimizers/hypergradient_adam.hpp b/include/lbann/optimizers/hypergradient_adam.hpp index b0d362ad02e..57e9a7b7845 100644 --- a/include/lbann/optimizers/hypergradient_adam.hpp +++ b/include/lbann/optimizers/hypergradient_adam.hpp @@ -27,7 +27,9 @@ #ifndef LBANN_OPTIMIZERS_HYPERGRADIENT_ADAM_HPP_INCLUDED #define LBANN_OPTIMIZERS_HYPERGRADIENT_ADAM_HPP_INCLUDED -#include "lbann/optimizers/optimizer.hpp" +#include "lbann/optimizers/data_type_optimizer.hpp" +#include "lbann/io/persist.hpp" +#include namespace lbann { @@ -39,7 +41,27 @@ namespace lbann { * Baydin et al. "Online Learning Rate Adaptation with Hypergradient * Descent", 2017. */ -class hypergradient_adam : public optimizer { +template +class hypergradient_adam + : public Cloneable, + data_type_optimizer> { + using BaseType = Cloneable, + data_type_optimizer>; +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The concrete weights type used by this object. */ + using WeightsType = data_type_weights; + + /** @brief The base optimizer type for this class. 
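The #ifndef LBANN_DATA_TYPE_OPTIMIZER_INSTANTIATE block that closes data_type_optimizer.hpp above is the usual explicit-instantiation guard: includers see only extern template declarations, and a single source file defines the macro and instantiates the class for each supported type. A generic sketch of that arrangement with a toy template (names assumed, not LBANN's macros):

    // my_template.hpp
    template <typename T>
    struct accumulator {
      T sum = T(0);
      void add(T x) { sum += x; }
    };

    #ifndef MY_TEMPLATE_INSTANTIATE
    // Every includer promises not to instantiate these; the definitions
    // live in exactly one translation unit (see below).
    extern template struct accumulator<float>;
    extern template struct accumulator<double>;
    #endif

    // my_template.cpp (the one translation unit that instantiates)
    // #define MY_TEMPLATE_INSTANTIATE
    // #include "my_template.hpp"
    // template struct accumulator<float>;
    // template struct accumulator<double>;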
*/ + using OptimizerType = data_type_optimizer; + + ///@} + public: /** @brief Construct a Hypergradient Adam optimizer object @@ -55,105 +77,63 @@ class hypergradient_adam : public optimizer { * @param eps Small factor to avoid division by * zero. */ - hypergradient_adam(lbann_comm *comm, - DataType init_learning_rate = 1e-3, - DataType hyper_learning_rate = 1e-7, - DataType beta1 = 0.9, - DataType beta2 = 0.99, - DataType eps = 1e-8); + hypergradient_adam(TensorDataType init_learning_rate = 1e-3, + TensorDataType hyper_learning_rate = 1e-7, + TensorDataType beta1 = 0.9, + TensorDataType beta2 = 0.99, + TensorDataType eps = 1e-8); hypergradient_adam(const hypergradient_adam& other); hypergradient_adam& operator=(const hypergradient_adam& other); ~hypergradient_adam() override = default; - hypergradient_adam* copy() const override { return new hypergradient_adam(*this); } + + /** Archive for checkpoint and restart */ + template void serialize(Archive & ar) { + ar(cereal::base_class>(this), + CEREAL_NVP(m_hyper_learning_rate), + CEREAL_NVP(m_beta1), + CEREAL_NVP(m_beta2), + CEREAL_NVP(m_eps), + CEREAL_NVP(m_current_beta1), + CEREAL_NVP(m_current_beta2)); + } /** @brief Human-readable type name. */ std::string get_type() const override { return "hypergradient Adam"; } /** @brief Human-readable description. */ description get_description() const override; - void setup(weights* w = nullptr) override; + void setup(WeightsType* w = nullptr) override; protected: /** @brief Computation for an optimization step. */ - void step_compute(AbsDistMat& values, const AbsDistMat& gradient) override; + void step_compute(AbsDistMatrixType& values, const AbsDistMatrixType& gradient) override; private: /** @brief Hypergradient learning rate. */ - DataType m_hyper_learning_rate; + TensorDataType m_hyper_learning_rate; /** @brief Update factor for first moment estimate. */ - DataType m_beta1; + TensorDataType m_beta1; /** @brief Update factor for second moment estimate. */ - DataType m_beta2; + TensorDataType m_beta2; /** @brief Small factor to avoid division by zero. */ - DataType m_eps; + TensorDataType m_eps; /** @brief beta1 ^ iteration. */ - DataType m_current_beta1; + TensorDataType m_current_beta1; /** @brief beta2 ^ iteration. */ - DataType m_current_beta2; + TensorDataType m_current_beta2; /** @brief First moment estimates. */ - std::unique_ptr m_moment1; + std::unique_ptr m_moment1; /** @brief Second moment estimates. */ - std::unique_ptr m_moment2; + std::unique_ptr m_moment2; /** @brief Gradient estimate from the prior step (for hypergradient). 
*/ - std::unique_ptr m_old_gradient; + std::unique_ptr m_old_gradient; // =========================================== // Checkpointing // =========================================== - /** @struct packing_header - * @brief Used to serialize mode fields in file and MPI transfer - */ - struct packing_header { - DataType hyper_learning_rate; - DataType beta1; - DataType beta2; - DataType eps; - DataType current_beta1; - DataType current_beta2; - }; - - bool pack_scalars(persist& p) { - p.write_datatype(persist_type::train, "hyper_learning_rate", m_hyper_learning_rate); - p.write_datatype(persist_type::train, "beta1", m_beta1); - p.write_datatype(persist_type::train, "beta2", m_beta2); - p.write_datatype(persist_type::train, "eps", m_eps); - p.write_datatype(persist_type::train, "current_beta1", m_current_beta1); - p.write_datatype(persist_type::train, "current_beta2", m_current_beta2); - return true; - } - - bool unpack_scalars(persist& p, struct packing_header *header) { - p.read_datatype(persist_type::train, "hyper_learning_rate", &m_hyper_learning_rate); - p.read_datatype(persist_type::train, "beta1", &m_beta1); - p.read_datatype(persist_type::train, "beta2", &m_beta2); - p.read_datatype(persist_type::train, "eps", &m_eps); - p.read_datatype(persist_type::train, "current_beta1", &m_current_beta1); - p.read_datatype(persist_type::train, "current_beta2", &m_current_beta2); - - if(header != nullptr) { - header->hyper_learning_rate = m_hyper_learning_rate; - header->beta1 = m_beta1; - header->beta2 = m_beta2; - header->eps = m_eps; - header->current_beta1 = m_current_beta1; - header->current_beta2 = m_current_beta2; - } - - return true; - } - - void unpack_header(struct packing_header& header) { - m_hyper_learning_rate = header.hyper_learning_rate; - m_beta1 = header.beta1; - m_beta2 = header.beta2; - m_eps = header.eps; - m_current_beta1 = header.current_beta1; - m_current_beta2 = header.current_beta2; - } - bool save_to_checkpoint_shared(persist& p, std::string m_name) override; bool load_from_checkpoint_shared(persist& p, std::string m_name) override; bool save_to_checkpoint_distributed(persist& p, std::string m_name) override; @@ -161,6 +141,11 @@ class hypergradient_adam : public optimizer { }; +template +std::unique_ptr +build_hypergradient_adam_optimizer_from_pbuf( + google::protobuf::Message const&); + } // namespace lbann #endif // LBANN_OPTIMIZER_HYPERGRADIENT_ADAM_HPP_INCLUDED diff --git a/include/lbann/optimizers/optimizer.hpp b/include/lbann/optimizers/optimizer.hpp index 6e0e9ee6712..d433423b0b4 100644 --- a/include/lbann/optimizers/optimizer.hpp +++ b/include/lbann/optimizers/optimizer.hpp @@ -27,18 +27,23 @@ #ifndef LBANN_OPTIMIZERS_OPTIMIZER_HPP_INCLUDED #define LBANN_OPTIMIZERS_OPTIMIZER_HPP_INCLUDED -#include -#include -#include -#include "lbann/utils/compiler_control.hpp" #include "lbann/base.hpp" #include "lbann/comm.hpp" -#include "lbann/utils/exception.hpp" -#include "lbann/utils/description.hpp" -#include "lbann/weights/weights.hpp" +#include "lbann/utils/cloneable.hpp" +#include "lbann/utils/compiler_control.hpp" #ifdef LBANN_HAS_GPU #include "lbann/utils/cuda.hpp" #endif // LBANN_HAS_GPU +#include "lbann/utils/description.hpp" +#include "lbann/utils/exception.hpp" +#include "lbann/utils/memory.hpp" +#include "lbann/weights/weights.hpp" + +#include + +#include +#include +#include namespace lbann { @@ -56,14 +61,13 @@ enum class optimizer_gradient_status { * @details Non-blocking allreduce must be synchronized before * accessing. 
*/ - allreduce_started + allreduce_started, }; /** @brief Human-readable string for status of gradient in optimizer. */ std::string to_string(optimizer_gradient_status status); // Forward declarations -class weights; class persist; /** @brief Abstract base class for gradient-based optimization algorithms. @@ -74,37 +78,24 @@ class persist; * optimization step requires the objective function gradient * w.r.t. the weights. */ -class optimizer { +class optimizer : public Cloneable> { public: - optimizer(lbann_comm* comm, DataType learning_rate = 0); - optimizer(const optimizer& other); - optimizer& operator=(const optimizer& other); + /** @name Constructors and Destructor */ + ///@{ + + optimizer(); virtual ~optimizer() = default; - /** @brief Create a copy of the class instance. - * - * The caller is responsible for deallocating the returned object. - */ - virtual optimizer* copy() const = 0; + ///@} /** @brief Human-readable type name. */ virtual std::string get_type() const = 0; /** @brief Human-readable description. */ virtual description get_description() const; - /** @brief Weights being optimized. */ - weights& get_weights(); - /** @brief Weights being optimized. */ - const weights& get_weights() const; - /** @brief Weights being optimized. */ - void set_weights(weights* w) { m_weights = w; } - - /** @brief Objective function gradient w.r.t. the weights. - * - * An allreduce may be launched and/or synchronized if needed. - */ - AbsDistMat& get_gradient(); + /** @name Gradient update management */ + ///@{ /** @brief Add to the objective function gradient w.r.t. the weights. * @param gradient Contribution to gradient. @@ -118,36 +109,30 @@ class optimizer { * allreduce is performed lazily when the * gradient is accessed. */ - void add_to_gradient(const AbsDistMat& gradient, - DataType scale = DataType(1), - bool allreduce_needed = false); + template + void add_to_gradient(El::AbstractDistMatrix const& contrib, + TensorDataType scale = 1.f, + bool allreduce_needed = false) { + TensorDataType buf_scale, in_scale; + auto& grad = get_gradient_buffer(buf_scale, in_scale, allreduce_needed); + El::Scale(buf_scale, grad); + El::Axpy(in_scale*scale, contrib, grad); + } + /** @brief Zero out the objective function gradient w.r.t. the weights. */ - void clear_gradient(); - /** @brief Get the gradient buffer. - * - * This provides access to the underlying gradient buffer, which may be - * directly summed into. This buffer should be considered ephemeral and not - * stored. The caller must also ensure the buffer has an appropriate - * distribution. buf_scale provides the caller with a scale factor that must - * be applied to the gradient buffer before writing to it, and in_scale - * provides a scaling factor that must be applied to the user's data. - * Essentially, this enables computations of the form - * gradient = buf_scale*gradient + in_scale*new_gradient - * This is an expert-mode function and is intended to help eliminate copies - * and facilitate kernel fusion. - * - * @param buf_scale A scale factor provided to the caller to scale the - * returned buffer by. - * @param in_scale A scale factor provided to the caller to scale their - * gradient contributions by. - * @param allreduce_needed Whether this gradient contribution will need to - * be allreduced. 
- */ - AbsDistMat& get_gradient_buffer(DataType& buf_scale, - DataType& in_scale, - bool allreduce_needed = false); + void clear_gradient() { + for (auto& g : gradients_) { + if (g.second->get_status() == + optimizer_gradient_status::allreduce_started) { + g.second->complete_allreduce(*m_comm); + } + g.second->clear(); + } + this->get_gradient_sources().clear(); + } /** @brief Objects that are expected to contribute to the gradient. */ + El::Int get_num_gradient_sources() const; /** @brief Register a gradient source. * @@ -157,6 +142,7 @@ class optimizer { * forward prop. */ void add_gradient_source(const void* source); + /** @brief Unregister a gradient source. * * When an object adds its contribution to the objective function @@ -166,59 +152,196 @@ class optimizer { */ void remove_gradient_source(const void* source); - /** @brief Must be called before training. + /** @brief Perform optimization step. */ + virtual void step() = 0; + + /** @brief Get the gradient buffer. + * + * This provides access to the underlying gradient buffer, which + * may be directly summed into. This buffer should be considered + * ephemeral and not stored. The caller must also ensure the buffer + * has an appropriate distribution. buf_scale provides the caller + * with a scale factor that must be applied to the gradient buffer + * before writing to it, and in_scale provides a scaling factor + * that must be applied to the user's data. Essentially, this + * enables computations of the form + * @verbatim + * gradient = buf_scale*gradient + in_scale*new_gradient + * @endverbatim + * This is an expert-mode function and is intended to help + * eliminate copies and facilitate kernel fusion. * - * @param w Weights being optimized. If null, no change is made to - * the weights. + * @param buf_scale A scale factor provided to the caller to scale + * the returned buffer by. + * @param in_scale A scale factor provided to the caller to scale + * their gradient contributions by. + * @param allreduce_needed Whether this gradient contribution will need to + * be allreduced. */ - virtual void setup(weights* w = nullptr); + template + El::AbstractDistMatrix& get_gradient_buffer( + TensorDataType& buf_scale, + TensorDataType& in_scale, + bool allreduce_needed = false); - /** @brief Optimization step. */ - void step(); + ///@} + /** @brief Communicator access */ + ///@{ - /** @brief LBANN communicator. */ + /** @brief Access LBANN communicator. */ lbann_comm& get_comm() { return *m_comm; } - /** @brief LBANN communicator. */ + + /** @brief Access LBANN communicator. */ const lbann_comm& get_comm() const { return *m_comm; } - /** @brief Scaling factor for optimization step sizes. */ - DataType get_learning_rate() const; - /** @brief Scaling factor for optimization step sizes. */ - void set_learning_rate(DataType learning_rate); + ///@} + /** @brief Statistics access and management */ + ///@{ /** @brief Time spent in optimization step. */ EvalType get_step_time() const { return m_step_time; } + /** @brief Reset stats counters. 
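The buf_scale/in_scale contract described above is the same one add_to_gradient follows internally. A hedged sketch of an expert-mode caller, assuming opt is an lbann::optimizer and contrib is a distributed matrix holding this caller's local update (requires the optimizer header from this patch):

    template <typename TensorDataType>
    void add_scaled_update(lbann::optimizer& opt,
                           const El::AbstractDistMatrix<TensorDataType>& contrib,
                           TensorDataType scale) {
      TensorDataType buf_scale, in_scale;
      // Request an allreduce because this contribution is local to our rank(s).
      auto& grad = opt.get_gradient_buffer(buf_scale, in_scale,
                                           /*allreduce_needed=*/true);
      El::Scale(buf_scale, grad);                 // rescale existing contents
      El::Axpy(in_scale * scale, contrib, grad);  // accumulate our update
    }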
*/ virtual void reset_counters() { m_step_time = 0; } + ///@} + /** @name Checkpointing */ + ///@{ + + /** @brief Store state to archive for checkpoint and restart */ + template void serialize(Archive & ar) { + // Do not save the optimizer's step time + } + + virtual bool save_to_checkpoint_shared(persist& p, std::string m_name) = 0; + virtual bool load_from_checkpoint_shared(persist& p, std::string m_name) = 0; + virtual bool save_to_checkpoint_distributed(persist& p, std::string m_name) = 0; + virtual bool load_from_checkpoint_distributed(persist& p, std::string m_name) = 0; + ///@} + protected: + /** @brief Manage gradient information. */ + class GradientHelper { + public: + virtual ~GradientHelper() = default; + optimizer_gradient_status get_status() const noexcept { return status_; } + void set_status(optimizer_gradient_status s) noexcept { status_ = s; } + virtual El::BaseDistMatrix& gradient() noexcept = 0; + virtual El::BaseDistMatrix const& gradient() const noexcept = 0; + virtual void start_allreduce(lbann_comm&) = 0; + virtual void complete_allreduce(lbann_comm&) = 0; + virtual void clear() = 0; + private: + optimizer_gradient_status status_ = optimizer_gradient_status::cleared; + };// class GradientHelper + + template + class GradientHelperImpl : public GradientHelper { + public: + using AbsDistMatType = El::AbstractDistMatrix; + public: + GradientHelperImpl(El::Int height, El::Int width, El::DistData dist_data) + : gradient_{AbsDistMatType::Instantiate(dist_data)} + { + El::Zeros(*gradient_, height, width); + } + AbsDistMatType& gradient() noexcept override { return *gradient_; } + AbsDistMatType const& gradient() const noexcept override { + return *gradient_; + } + void start_allreduce(lbann_comm& comm) override { + switch (this->get_status()) { + case optimizer_gradient_status::allreduce_needed: + comm.nb_allreduce(*gradient_, + gradient_->RedundantComm(), + allreduce_req_); + this->set_status(optimizer_gradient_status::allreduce_started); + break; + case optimizer_gradient_status::ready: + case optimizer_gradient_status::cleared: + case optimizer_gradient_status::allreduce_started: + break; + default: LBANN_ERROR("unexpected gradient status " + "(" + to_string(this->get_status()) + ")"); + } + } + void complete_allreduce(lbann_comm& comm) override { + switch (this->get_status()) { + case optimizer_gradient_status::allreduce_started: + comm.wait(allreduce_req_); + this->set_status(optimizer_gradient_status::ready); + break; + case optimizer_gradient_status::ready: + case optimizer_gradient_status::cleared: + break; + case optimizer_gradient_status::allreduce_needed: + LBANN_ERROR("attempted to finish gradient allreduce " + "before starting it"); + break; + default: + LBANN_ERROR("unexpected gradient status " + "(" + to_string(this->get_status()) + ")"); + } + } + void clear() { + this->set_status(optimizer_gradient_status::cleared); + } + private: + std::unique_ptr gradient_; + Al::request allreduce_req_; + };// class GradientHelperImpl + + /** @brief Copy construct/copy assign */ + optimizer(const optimizer& other); + optimizer& operator=(const optimizer& other); - /** @brief Computation for an optimization step. - * - * @c values and @c gradient can be assumed to have the same - * distribution. 
- */ - virtual void step_compute(AbsDistMat& values, - const AbsDistMat& gradient) = 0; + /** @brief Return the current gradient status */ + optimizer_gradient_status get_gradient_status() const { + return m_gradient_status; + } + void set_gradient_status(const optimizer_gradient_status status) { + m_gradient_status = status; + } + std::unordered_set& get_gradient_sources() { + return m_gradient_sources; + } + void set_comm(lbann_comm& comm) { m_comm = &comm; } -private: + void set_step_time(EvalType time) { m_step_time = time; } - /** @brief LBANN communicator. */ - lbann_comm* m_comm; + void inc_step_time(EvalType time) { m_step_time += time; } - /** @brief Weights being optimized. */ - weights* m_weights = nullptr; + virtual std::tuple get_matrix_info() const = 0; - /** @brief Objective function gradient w.r.t. weights. */ - std::unique_ptr m_gradient; + template + void accumulate_all_gradient_contributions( + El::AbstractDistMatrix& gradient); - /** @brief Workspace matrix. + /** @brief Launch non-blocking allreduce on the gradient, if needed. * - * Helps ensure gradient contributions are in the right - * distribution. Most of the time, this should just be a matrix - * view. + * Does nothing if an allreduce is not needed or has already been + * started. */ - std::unique_ptr m_gradient_v; + void start_gradient_allreduce() { + for (auto& grad_mgr : gradients_) { + grad_mgr.second->start_allreduce(*m_comm); + } + } + + /** @brief Synchronize non-blocking allreduce on the gradient, if needed. + * + * Does nothing if an allreduce isn't needed. Throws an exception + * if an allreduce is needed but hasn't been started. + */ + void finish_gradient_allreduce() { + for (auto& grad_mgr : gradients_) { + grad_mgr.second->complete_allreduce(*m_comm); + } + } +private: + + /** @brief LBANN communicator. */ + lbann_comm* m_comm; /** @brief Sources of gradient contributions. * @@ -235,51 +358,156 @@ class optimizer { /** @brief Status of values in objective function gradient. */ optimizer_gradient_status m_gradient_status = optimizer_gradient_status::cleared; - /** @brief Communication request object for gradient allreduce. - * - * Used to synchronize non-blocking allreduce. - */ - Al::request m_gradient_allreduce_req; - - /** @brief Scaling factor for optimization step sizes. - * - * This is not used by the base optimizer class, but is currently - * used by all derived optimizer classes. There are several cases - * where it is convenient to expose this in the base class, - * e.g. for variable learning rate schedules. - * @todo Consider moving this to the derived classes. - */ - DataType m_learning_rate; - /** @brief Time spent in optimization step. */ EvalType m_step_time = 0; - /** @brief Launch non-blocking allreduce on the gradient, if needed. - * - * Does nothing if an allreduce is not needed or has already been - * started. - */ - void start_gradient_allreduce(); - - /** @brief Synchronize non-blocking allreduce on the gradient, if needed. - * - * Does nothing if an allreduce isn't needed. Throws an exception - * if an allreduce is needed but hasn't been started. + /** @brief Map from data types to gradient contributions. + * @todo Refactor this out. It's a hack. 
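Gradient state is now kept per tensor value type, in a map keyed by std::type_index that is filled lazily on first request (see the get_gradient_buffer definition below). A standalone sketch of that storage pattern with toy types:

    #include <iostream>
    #include <memory>
    #include <typeindex>
    #include <unordered_map>
    #include <vector>

    struct buffer { virtual ~buffer() = default; };
    template <typename T> struct typed_buffer : buffer { std::vector<T> data; };

    // One buffer per element type, created the first time that type is requested.
    std::unordered_map<std::type_index, std::unique_ptr<buffer>> buffers;

    template <typename T>
    typed_buffer<T>& get_buffer() {
      auto& slot = buffers[std::type_index(typeid(T))];
      if (!slot) { slot = std::make_unique<typed_buffer<T>>(); }
      return static_cast<typed_buffer<T>&>(*slot);
    }

    int main() {
      get_buffer<float>().data.push_back(1.0f);
      get_buffer<double>().data.push_back(2.0);
      std::cout << buffers.size() << "\n";  // prints 2
    }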
*/ - void finish_gradient_allreduce(); - -public: - - // =========================================== - // Checkpointing - // =========================================== - virtual bool save_to_checkpoint_shared(persist& p, std::string m_name); - virtual bool load_from_checkpoint_shared(persist& p, std::string m_name); - virtual bool save_to_checkpoint_distributed(persist& p, std::string m_name); - virtual bool load_from_checkpoint_distributed(persist& p, std::string m_name); + using gradient_manager_type = GradientHelper; + using gradient_manager_ptr = std::unique_ptr; + std::unordered_map gradients_; }; +template +El::AbstractDistMatrix& optimizer::get_gradient_buffer( + TensorDataType& buf_scale, + TensorDataType& in_scale, + bool allreduce_needed) { + + // Anon enum to clarify "get<#>" calls below. + enum { HEIGHT=0, WIDTH, DISTDATA }; + using GradMgrType = GradientHelperImpl; + + auto& grad_mgr_ptr = gradients_[std::type_index(typeid(TensorDataType))]; + // If the manager hasn't been created, let's make it. + if (!grad_mgr_ptr) { + auto mat_info = this->get_matrix_info(); + grad_mgr_ptr = make_unique( + std::get(mat_info), + std::get(mat_info), + std::get(mat_info)); + grad_mgr_ptr->set_status(optimizer_gradient_status::cleared); + } + // Get the underlying matrix back out. + auto& grad_mgr = static_cast(*grad_mgr_ptr); + // Complete outstanding allreduce, if needed. + if (grad_mgr.get_status() == optimizer_gradient_status::allreduce_started) { + grad_mgr.complete_allreduce(*(this->m_comm)); + } + auto& buffer = grad_mgr.gradient(); + + // Determine scaling factor and transition state. + switch (grad_mgr.get_status()) { + case optimizer_gradient_status::ready: + buf_scale = DataType(1); + in_scale = DataType(1); + if (allreduce_needed) { + buf_scale /= buffer.RedundantSize(); + grad_mgr.set_status(optimizer_gradient_status::allreduce_needed); + } + break; + case optimizer_gradient_status::cleared: + buf_scale = DataType(0); + in_scale = DataType(1); + grad_mgr.set_status(allreduce_needed ? + optimizer_gradient_status::allreduce_needed : + optimizer_gradient_status::ready); + break; + case optimizer_gradient_status::allreduce_needed: + buf_scale = DataType(1); + // Properly scale data that does not need to be allreduced. + in_scale = (allreduce_needed ? + DataType(1) : + DataType(1) / buffer.RedundantSize()); + break; + case optimizer_gradient_status::allreduce_started: + default: + LBANN_ERROR("unexpected gradient status (" + + to_string(grad_mgr.get_status()) + ")"); + } + return buffer; +} + +template +void optimizer::accumulate_all_gradient_contributions( + El::AbstractDistMatrix& gradient) +{ + using AbsDistMatType = El::AbstractDistMatrix; + static const TensorDataType one = TensorDataType(1.f); + + // There are a few cases to note here: + // 1. One update of the same type. + // 2. One update of a different type. + // 3. Multiple updates of multiple types. In this case, some work + // can be saved if one of the updates has the same type as + // "gradient". + + // Some general information + auto num_updates = this->gradients_.size(); + auto const this_type_idx = std::type_index(typeid(TensorDataType)); + + if (num_updates == 0UL) + return; + + // Handle the case that one of the updates is TensorDataType. In + // this case, the input gradients matrix can be made to "view" the + // update, rather than requiring a copy. + auto this_type_contrib = this->gradients_.find(this_type_idx); + if (this_type_contrib != this->gradients_.end()) { + // Check for invariant consistency. 
+ auto const& grad_mgr = *(this_type_contrib->second); + if (grad_mgr.get_status() != optimizer_gradient_status::ready) { + LBANN_ERROR("Expected ready status. Got: ", + to_string(grad_mgr.get_status())); + } + // Sync the input gradient with the contribution, one way or another. + auto const& contrib = + dynamic_cast(grad_mgr.gradient()); + if (contrib.DistData() == gradient.DistData()) { + El::LockedView(gradient, contrib); + } + else { + LBANN_ERROR("Should never need this copy."); + El::Copy(contrib, gradient); + } + --num_updates; + } + else { + // No sync possible; zero out the matrix instead + El::Zero(gradient); + } + + // Handle the case that only 1 update of a different type is needed. + if (num_updates == 1UL && this->gradients_.size() == 1UL) { + auto const& grad_mgr = *(this->gradients_.begin()->second); + if (grad_mgr.get_status() != optimizer_gradient_status::ready) { + LBANN_ERROR("Expected ready status. Got: ", + to_string(grad_mgr.get_status())); + } + El::Copy(grad_mgr.gradient(), gradient); + } + else if (this->gradients_.size() > 1UL) { + // Need a temporary matrix for the type-casted copy. + auto tmp = std::unique_ptr{ + gradient.Construct(gradient.Grid(), gradient.Root())}; + + for (auto const& grad_mgr_v : this->gradients_) { + if (grad_mgr_v.first == this_type_idx) + continue; + auto const& grad_mgr = *(grad_mgr_v.second); + if (grad_mgr.get_status() != optimizer_gradient_status::ready) { + LBANN_ERROR("Expected ready status. Got: ", + to_string(grad_mgr.get_status())); + } + auto const& grad_base = grad_mgr.gradient(); + El::Copy(grad_base, *tmp); + El::Axpy(one, *tmp, gradient); + } + } +} + } // namespace lbann #endif // LBANN_OPTIMIZERS_OPTIMIZER_HPP_INCLUDED diff --git a/include/lbann/optimizers/rmsprop.hpp b/include/lbann/optimizers/rmsprop.hpp index a8debaa076c..dd0b63b6ecd 100644 --- a/include/lbann/optimizers/rmsprop.hpp +++ b/include/lbann/optimizers/rmsprop.hpp @@ -27,8 +27,10 @@ #ifndef LBANN_OPTIMIZERS_RMSPROP_HPP_INCLUDED #define LBANN_OPTIMIZERS_RMSPROP_HPP_INCLUDED -#include "lbann/optimizers/optimizer.hpp" +#include "lbann/optimizers/data_type_optimizer.hpp" #include +#include "lbann/io/persist.hpp" +#include namespace lbann { @@ -37,74 +39,74 @@ namespace lbann { * See * https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf. */ -class rmsprop : public optimizer { +template +class rmsprop : public Cloneable, + data_type_optimizer> { + using BaseType = Cloneable, + data_type_optimizer>; public: + /** @name Public Types */ + ///@{ - rmsprop(lbann_comm* comm, - DataType learning_rate, - DataType decay_rate, - DataType eps = 1e-8); + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The optimizer base type of this object. */ + using OptimizerType = data_type_optimizer; + + /** @brief The concrete weights type used by this object. */ + using WeightsType = data_type_weights; + + ///@} + +public: + + rmsprop(TensorDataType learning_rate, + TensorDataType decay_rate, + TensorDataType eps = 1e-8); rmsprop(const rmsprop& other); rmsprop& operator=(const rmsprop& other); ~rmsprop() override = default; - rmsprop* copy() const override { return new rmsprop(*this); } + + /** Archive for checkpoint and restart */ + template void serialize(Archive & ar) { + ar(cereal::base_class>(this), + CEREAL_NVP(m_decay_rate)); + } /** Human-readable type name. */ std::string get_type() const override { return "RMSprop"; } /** Human-readable description. 
*/ description get_description() const override; - void setup(weights* w = nullptr) override; + void setup(WeightsType* w = nullptr) override; protected: /** Computation for an optimization step. */ - void step_compute(AbsDistMat& values, - const AbsDistMat& gradient) override; + void step_compute(AbsDistMatrixType& values, + const AbsDistMatrixType& gradient) override; private: /** Decay rate. */ - DataType m_decay_rate; + TensorDataType m_decay_rate; /** Small factor to avoid division by zero. */ - DataType m_eps; + TensorDataType m_eps; /** RMSprop cache. */ - std::unique_ptr m_cache; + std::unique_ptr m_cache; /** CPU implementation of optimization step. */ - void step_compute_cpu(AbsDistMat& values, const AbsDistMat& gradient); + void step_compute_cpu(AbsDistMatrixType& values, const AbsDistMatrixType& gradient); #ifdef LBANN_HAS_CUDA /** GPU implementation of optimization step. */ - void step_compute_gpu(AbsDistMat& values, const AbsDistMat& gradient); + void step_compute_gpu(AbsDistMatrixType& values, const AbsDistMatrixType& gradient); #endif // LBANN_HAS_CUDA // =========================================== // Checkpointing // =========================================== - struct packing_header { - DataType decay_rate; - }; - - bool pack_scalars(persist& p) { - p.write_datatype(persist_type::train, "decay_rate", m_decay_rate); - return true; - } - - bool unpack_scalars(persist& p, struct packing_header *header){ - p.read_datatype(persist_type::train, "momentum", &m_decay_rate); - - if(header != nullptr){ - header->decay_rate = m_decay_rate; - } - - return true; - } - - void unpack_header(struct packing_header& header){ - m_decay_rate = header.decay_rate; - } - bool save_to_checkpoint_shared(persist& p, std::string m_name) override; bool load_from_checkpoint_shared(persist& p, std::string m_name) override; bool save_to_checkpoint_distributed(persist& p, std::string m_name) override; @@ -112,6 +114,11 @@ class rmsprop : public optimizer { }; +template +std::unique_ptr +build_rmsprop_optimizer_from_pbuf( + google::protobuf::Message const&); + } // namespace lbann #endif // LBANN_OPTIMIZERS_RMSPROP_HPP_INCLUDED diff --git a/include/lbann/optimizers/sgd.hpp b/include/lbann/optimizers/sgd.hpp index 2d59b8c2ffe..3cfc66f952a 100644 --- a/include/lbann/optimizers/sgd.hpp +++ b/include/lbann/optimizers/sgd.hpp @@ -27,7 +27,9 @@ #ifndef LBANN_OPTIMIZERS_SGD_HPP_INCLUDED #define LBANN_OPTIMIZERS_SGD_HPP_INCLUDED -#include "lbann/optimizers/optimizer.hpp" +#include "lbann/optimizers/data_type_optimizer.hpp" +#include "lbann/io/persist.hpp" +#include namespace lbann { @@ -35,22 +37,44 @@ namespace lbann { * @details Supports momentum and Nesterov acceleration. * @todo Dedicated optimizers for momentum or Nesterov SGD. */ -class sgd : public optimizer { +template +class sgd : public Cloneable, + data_type_optimizer> { + using BaseType = Cloneable, + data_type_optimizer>; + +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The optimizer base type of this object. */ + using OptimizerType = data_type_optimizer; + + /** @brief The concrete weights type used by this object. 
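The m_decay_rate, m_eps, and m_cache members above implement the standard RMSprop rule from the lecture slides cited in the class comment. A hedged scalar sketch of one step (not LBANN's kernel):

    #include <cmath>
    #include <cstdio>

    double rmsprop_step(double w, double g, double& cache, double lr,
                        double decay_rate, double eps = 1e-8) {
      cache = decay_rate * cache + (1 - decay_rate) * g * g;  // running mean of g^2
      return w - lr * g / (std::sqrt(cache) + eps);
    }

    int main() {
      double w = 1.0, cache = 0.0;
      for (int i = 0; i < 3; ++i) { w = rmsprop_step(w, 2.0 * w, cache, 0.1, 0.9); }
      std::printf("%f\n", w);
    }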
*/ + using WeightsType = data_type_weights; + + ///@} public: /** @name Life cycle functions */ ///@{ - sgd(lbann_comm *comm, - DataType learning_rate, - DataType momentum = 0, + sgd(TensorDataType learning_rate, + TensorDataType momentum = 0, bool nesterov = false); sgd(const sgd& other); sgd& operator=(const sgd& other); ~sgd() override = default; - sgd* copy() const override { return new sgd(*this); } + /** Archive for checkpoint and restart */ + template void serialize(Archive & ar) { + ar(cereal::base_class>(this), + CEREAL_NVP(m_momentum)); + } ///@} /** @name Descriptions */ @@ -69,11 +93,11 @@ class sgd : public optimizer { /** @brief Decay rate for gradient accumulation. * @details A momentum of zero corresponds to vanilla SGD. */ - DataType get_momentum() const noexcept { return m_momentum; } + TensorDataType get_momentum() const noexcept { return m_momentum; } /** @brief Decay rate for gradient accumulation. * @details A momentum of zero corresponds to vanilla SGD. */ - void set_momentum(DataType momentum) { m_momentum = momentum; } + void set_momentum(TensorDataType momentum) { m_momentum = momentum; } /** Whether Nesterov acceleration is applied. */ bool using_nesterov() const noexcept { return m_nesterov; } @@ -81,70 +105,47 @@ class sgd : public optimizer { void set_nesterov(bool nesterov) { m_nesterov = nesterov; } /** Accumulated gradients for momentum optimizer. */ - const AbsDistMat& get_velocity() const; + const AbsDistMatrixType& get_velocity() const; /** Accumulated gradients for momentum optimizer. */ - AbsDistMat& get_velocity(); + AbsDistMatrixType& get_velocity(); ///@} /** @name Setup */ ///@{ - void setup(weights* w = nullptr) override; + void setup(WeightsType* w = nullptr) override; ///@} protected: /** Computation for an optimization step. */ - void step_compute(AbsDistMat& values, const AbsDistMat& gradient) override; + void step_compute(AbsDistMatrixType& values, const AbsDistMatrixType& gradient) override; private: /** @brief Decay rate for gradient accumulation. * @details A momentum of zero corresponds to vanilla SGD. */ - DataType m_momentum; + TensorDataType m_momentum; /** Whether Nesterov acceleration is used. */ bool m_nesterov; /** @brief Accumulated gradients. * @details Not used for vanilla SGD. */ - std::unique_ptr m_velocity; + std::unique_ptr m_velocity; /** CPU implementation of momentum or Nesterov step. */ - void momentum_step_cpu(AbsDistMat& values, const AbsDistMat& gradient); + void momentum_step_cpu(AbsDistMatrixType& values, const AbsDistMatrixType& gradient); #ifdef LBANN_HAS_CUDA /** GPU implementation of momentum or Nesterov step. 
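Similarly, the momentum and Nesterov variants implemented by momentum_step_cpu/gpu follow the conventional formulas; a scalar sketch for reference (not the production kernel):

    #include <cstddef>
    #include <vector>

    // One SGD step over a flat parameter vector; v is the accumulated velocity
    // (m_velocity above), g the current gradient.
    void sgd_step(std::vector<double>& w, std::vector<double>& v,
                  const std::vector<double>& g,
                  double learning_rate, double momentum, bool nesterov)
    {
      for (std::size_t i = 0; i < w.size(); ++i) {
        v[i] = momentum * v[i] + g[i];                              // velocity accumulation
        w[i] -= nesterov ? learning_rate * (g[i] + momentum * v[i]) // Nesterov look-ahead
                         : learning_rate * v[i];                    // classic momentum
      }
    }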
*/ - void momentum_step_gpu(AbsDistMat& values, const AbsDistMat& gradient); + void momentum_step_gpu(AbsDistMatrixType& values, const AbsDistMatrixType& gradient); #endif // LBANN_HAS_CUDA /** @name Checkpointing */ ///@{ - struct packing_header { - DataType momentum; - }; - - bool pack_scalars(persist& p) { - p.write_datatype(persist_type::train, "momentum", m_momentum); - return true; - } - - bool unpack_scalars(persist& p, struct packing_header *header){ - p.read_datatype(persist_type::train, "momentum", &m_momentum); - - if(header != nullptr){ - header->momentum = m_momentum; - } - - return true; - } - - void unpack_header(struct packing_header& header){ - m_momentum = header.momentum; - } - bool save_to_checkpoint_shared(persist& p, std::string m_name) override; bool load_from_checkpoint_shared(persist& p, std::string m_name) override; bool save_to_checkpoint_distributed(persist& p, std::string m_name) override; @@ -154,6 +155,11 @@ class sgd : public optimizer { }; +template +std::unique_ptr +build_sgd_optimizer_from_pbuf( + google::protobuf::Message const&); + } // namespace lbann #endif // LBANN_OPTIMIZERS_SGD_HPP_INCLUDED diff --git a/include/lbann/proto/CMakeLists.txt b/include/lbann/proto/CMakeLists.txt index 59dbee3097d..6ac2825f3e7 100644 --- a/include/lbann/proto/CMakeLists.txt +++ b/include/lbann/proto/CMakeLists.txt @@ -2,6 +2,8 @@ set_full_path(THIS_DIR_HEADERS init_image_data_readers.hpp proto_common.hpp + helpers.hpp + datatype_helpers.hpp ) # Propagate the files up the tree diff --git a/include/lbann/proto/datatype_helpers.hpp b/include/lbann/proto/datatype_helpers.hpp new file mode 100644 index 00000000000..0c91d878de5 --- /dev/null +++ b/include/lbann/proto/datatype_helpers.hpp @@ -0,0 +1,70 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_PROTO_DATATYPE_HELPERS_HPP_INCLUDED +#define LBANN_PROTO_DATATYPE_HELPERS_HPP_INCLUDED + +#include + +namespace lbann +{ +namespace proto +{ + +template +struct TypeToProtoDataType; + +template <> +struct TypeToProtoDataType +{ + static constexpr auto value = lbann_data::FLOAT; +}; + +template <> +struct TypeToProtoDataType +{ + static constexpr auto value = lbann_data::DOUBLE; +}; + +#ifdef LBANN_HAS_HALF +template <> +struct TypeToProtoDataType +{ + static constexpr auto value = lbann_data::FP16; +}; +#endif // LBANN_HAS_HALF + +#ifdef LBANN_HAS_GPU_FP16 +template <> +struct TypeToProtoDataType +{ + static constexpr auto value = lbann_data::FP16; +}; +#endif // LBANN_HAS_GPU_FP16 + +}// namespace proto +}// namespace lbann +#endif /* LBANN_PROTO_DATATYPE_HELPERS_HPP_INCLUDED */ diff --git a/include/lbann/proto/factories.hpp b/include/lbann/proto/factories.hpp index ca68f30975d..2b76613be66 100644 --- a/include/lbann/proto/factories.hpp +++ b/include/lbann/proto/factories.hpp @@ -27,81 +27,104 @@ #ifndef LBANN_PROTO_FACTORIES_HPP_INCLUDED #define LBANN_PROTO_FACTORIES_HPP_INCLUDED -#include "lbann/proto/proto_common.hpp" #include "lbann/data_readers/data_reader.hpp" +#include "lbann/proto/proto_common.hpp" +#include "lbann/transforms/transform.hpp" +#include "lbann/transforms/transform_pipeline.hpp" + +#include + +#include +#include + +namespace lbann_data { +class Layer; +class Model; +class ObjectiveFunction; +class Optimizer; +class Reader; +class Transform; +class Weights; +}// namespace lbann_data namespace lbann { + +// Forward declarations +class callback_base; +class Layer; +class lbann_summary; +class model; +class objective_function; +class optimizer; +class trainer; +class weights; + namespace proto { +/** Construct a trainer specified with a prototext. */ +std::unique_ptr construct_trainer(lbann_comm* comm, + const lbann_data::Trainer& proto_trainer); + /** Construct a model specified with a prototext. */ -model* construct_model(lbann_comm* comm, - const std::map& data_readers, - const lbann_data::Optimizer& proto_opt, - const lbann_data::Model& proto_model); +std::unique_ptr construct_model( + lbann_comm* comm, + int training_dr_linearized_data_size, + const lbann_data::Optimizer& proto_opt, + const lbann_data::Trainer& proto_trainer, + const lbann_data::Model& proto_model); /** Construct a layer graph specified with a prototext. */ std::vector> construct_layer_graph( lbann_comm* comm, - const std::map& data_readers, + int training_dr_linearized_data_size, + const lbann_data::Trainer& proto_trainer, const lbann_data::Model& proto_model); /** Construct a layer specified with prototext. */ -template +template std::unique_ptr construct_layer( lbann_comm* comm, - const std::map& data_readers, + int training_dr_linearized_data_size, int num_parallel_readers, const lbann_data::Layer& proto_layer); /** Construct weights specified with prototext. */ -weights* construct_weights(lbann_comm* comm, - const lbann_data::Optimizer& proto_opt, - const lbann_data::Weights& proto_weights); +std::unique_ptr construct_weights( + lbann_comm* comm, + const lbann_data::Optimizer& proto_opt, + const lbann_data::Weights& proto_weights); /** Construct a callback specified with prototext. 
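The TypeToProtoDataType trait added in datatype_helpers.hpp above maps a C++ element type to the generated lbann_data::DataType enum at compile time. A usage sketch; the lbann.pb.h include name is an assumption about the generated protobuf header:

    #include "lbann/proto/datatype_helpers.hpp"

    #include <lbann.pb.h>

    // Resolve the protobuf enum value for a tensor's element type.
    constexpr auto kFloatValue  = lbann::proto::TypeToProtoDataType<float>::value;  // lbann_data::FLOAT
    constexpr auto kDoubleValue = lbann::proto::TypeToProtoDataType<double>::value; // lbann_data::DOUBLE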
*/ -lbann_callback* construct_callback(lbann_comm* comm, - const lbann_data::Callback& proto_cb, - const std::map& data_readers, - std::vector layer_list, - std::vector weights_list, - lbann_summary* summarizer); +std::unique_ptr +construct_callback(const google::protobuf::Message& proto_cb); + +/** Construct a callback specified with prototext. */ +std::unique_ptr +construct_callback(const google::protobuf::Message& proto_cb, + std::shared_ptr const& summarizer); /** Construct a summarizer specified with prototext. * The summarizer is only constructed if the summarizer callback is * enabled. */ -lbann_summary* construct_summarizer(lbann_comm* comm, - const lbann_data::Model& m); +std::unique_ptr construct_summarizer(lbann_comm* comm, + const lbann_data::Model& m); /** Construct an optimizer specified with prototext. */ -optimizer* construct_optimizer(lbann_comm* comm, - const lbann_data::Optimizer& proto_opt); +template +std::unique_ptr construct_optimizer( + const lbann_data::Optimizer& proto_opt); /** Construct an objective function specified with prototext. */ -objective_function* construct_objective_function(const lbann_data::ObjectiveFunction& proto_obj); - -/** Parse a space-separated list. */ -template -std::vector parse_list(std::string str) { - std::vector list; - std::stringstream ss(str); - for (T entry; ss >> entry;) { - list.push_back(entry); - } - return list; -} -template <> -std::vector parse_list(std::string str); - -/** Parse a space-separated set. */ -template -std::set parse_set(std::string str) { - std::set set; - for (const auto& entry : parse_list(str)) { - set.insert(entry); - } - return set; -} +std::unique_ptr +construct_objective_function(const lbann_data::ObjectiveFunction& proto_obj); + +/** Construct a transform given a prototext. */ +std::unique_ptr construct_transform( + const lbann_data::Transform& trans); +/** Construct a transform pipeline given a data reader prototext. */ +transform::transform_pipeline construct_transform_pipeline( + const lbann_data::Reader& data_reader); } // namespace proto } // namespace lbann diff --git a/include/lbann/proto/helpers.hpp b/include/lbann/proto/helpers.hpp new file mode 100644 index 00000000000..0a8cf656409 --- /dev/null +++ b/include/lbann/proto/helpers.hpp @@ -0,0 +1,66 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_PROTO_HELPERS_HPP_INCLUDED +#define LBANN_PROTO_HELPERS_HPP_INCLUDED + +#include + +#include +#include +#include + +namespace lbann +{ +namespace proto +{ + +template +struct GenerateBuilderType_struct +{ + using type = std::function(Args...)>; +}; + +template +using generate_builder_type = + typename GenerateBuilderType_struct::type; + +namespace helpers +{ + +/** @brief Test whether the message has the oneof field. */ +bool has_oneof( + google::protobuf::Message const& msg, std::string const& oneof_name); + +/** @brief Get a "derived type" message from the given message. */ +google::protobuf::Message const& +get_oneof_message( + google::protobuf::Message const& msg_in, std::string const& oneof_name); + +}// namespace helpers +}// namespace proto +}// namespace lbann +#endif /* LBANN_PROTO_HELPERS_HPP_INCLUDED */ diff --git a/include/lbann/proto/init_image_data_readers.hpp b/include/lbann/proto/init_image_data_readers.hpp index f35a5797e2b..4b585998599 100644 --- a/include/lbann/proto/init_image_data_readers.hpp +++ b/include/lbann/proto/init_image_data_readers.hpp @@ -26,13 +26,18 @@ #ifndef LBANN_PROTO_INIT_IMAGE_DATA_READERS_HPP_INCLUDED #define LBANN_PROTO_INIT_IMAGE_DATA_READERS_HPP_INCLUDED + #include "lbann/proto/proto_common.hpp" #include "lbann/comm.hpp" +namespace lbann_data { +class Reader; +class DataSetMetaData; +} + namespace lbann { extern void init_image_data_reader(const lbann_data::Reader& pb_readme, const lbann_data::DataSetMetaData& pb_metadata, const bool master, generic_data_reader* &reader); -extern void init_generic_preprocessor(const lbann_data::Reader& pb_readme, const bool master, generic_data_reader* reader); extern void init_org_image_data_reader(const lbann_data::Reader& pb_readme, const bool master, generic_data_reader* &reader); } // namespace lbann diff --git a/include/lbann/proto/proto_common.hpp b/include/lbann/proto/proto_common.hpp index b9986dfcc99..8bb4d50fc25 100644 --- a/include/lbann/proto/proto_common.hpp +++ b/include/lbann/proto/proto_common.hpp @@ -1,16 +1,50 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
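The helpers declared above let factories inspect a protobuf message whose concrete payload lives in a oneof. A hypothetical dispatch fragment; the field name "layer_type" is illustrative and not taken from the .proto definitions:

    #include "lbann/proto/helpers.hpp"

    #include <google/protobuf/message.h>

    #include <string>

    void dispatch_on_oneof(google::protobuf::Message const& proto_layer)
    {
      using namespace lbann::proto::helpers;
      if (has_oneof(proto_layer, "layer_type")) {
        auto const& params = get_oneof_message(proto_layer, "layer_type");
        // A factory would branch on the concrete message's name here.
        std::string const concrete_name = params.GetDescriptor()->name();
        (void) concrete_name;
      }
      // Otherwise no layer parameters were set in the prototext.
    }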
+//////////////////////////////////////////////////////////////////////////////// + #ifndef LBANN_PROTO_PROTO_COMMON_HPP_INCLUDED #define LBANN_PROTO_PROTO_COMMON_HPP_INCLUDED -#include "lbann/lbann.hpp" -#include -#include "lbann/proto/factories.hpp" +#include "lbann/data_readers/data_reader.hpp" -namespace lbann { +#define LBANN_ASSERT_MSG_HAS_FIELD(MSG, FIELD) \ + do { \ + if (!MSG.has_##FIELD()) { \ + LBANN_ERROR("No field \"" #FIELD "\" in the given message:\n{\n", \ + MSG.DebugString(), "\n}\n"); \ + } \ + } \ + while(false) -/** @brief Returns true if the Model contains at least one MotifLayer */ -bool has_motifs(const lbann_comm& comm, const lbann_data::LbannPB& p); +// Forward declaration of protobuf classes +namespace lbann_data { +class LbannPB; +class Trainer; +} -void expand_motifs(const lbann_comm& comm, lbann_data::LbannPB& pb); +namespace lbann { /** @brief Customize the name of the index list * @@ -25,27 +59,31 @@ void expand_motifs(const lbann_comm& comm, lbann_data::LbannPB& pb); _t_. @endverbatim */ void customize_data_readers_index_list(const lbann_comm& comm, - lbann_data::LbannPB& p); + ::lbann_data::LbannPB& p); /** @brief instantiates one or more generic_data_readers and inserts * them in &data_readers */ void init_data_readers( lbann_comm *comm, - const lbann_data::LbannPB& p, + const ::lbann_data::LbannPB& p, std::map& data_readers, bool is_shareable_training_data_reader, bool is_shareable_testing_data_reader, bool is_shareable_validation_data_reader = false); /** @brief adjusts the number of parallel data readers */ -void set_num_parallel_readers(const lbann_comm& comm, lbann_data::LbannPB& p); +void set_num_parallel_readers(const lbann_comm& comm, ::lbann_data::LbannPB& p); /** @brief adjusts the values in p by querying the options db */ -void get_cmdline_overrides(const lbann_comm& comm, lbann_data::LbannPB& p); +void get_cmdline_overrides(const lbann_comm& comm, ::lbann_data::LbannPB& p); /** @brief print various params (learn_rate, etc) to cout */ -void print_parameters(const lbann_comm& comm, lbann_data::LbannPB& p); +void print_parameters(const lbann_comm& comm, + ::lbann_data::LbannPB& p, + std::vector& root_random_seeds, + std::vector& random_seeds, + std::vector& data_seq_random_seeds); /** @brief prints usage information */ void print_help(const lbann_comm& comm); @@ -56,18 +94,85 @@ void print_help(std::ostream& os); /** @brief prints prototext file, cmd line, etc to file */ void save_session(const lbann_comm& comm, const int argc, char * const* argv, - lbann_data::LbannPB& p); + ::lbann_data::LbannPB& p); /** @brief Read prototext from a file into a protobuf message. */ void read_prototext_file( const std::string& fn, - lbann_data::LbannPB& pb, + ::lbann_data::LbannPB& pb, const bool master); /** @brief Write a protobuf message into a prototext file. */ bool write_prototext_file( const std::string& fn, - lbann_data::LbannPB& pb); + ::lbann_data::LbannPB& pb); + +/** @brief Trim leading and trailing whitespace from a string. 
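The LBANN_ASSERT_MSG_HAS_FIELD macro defined at the top of this proto_common.hpp hunk gives builders a uniform way to reject incomplete messages. A hypothetical use with an illustrative field name; LBANN_ERROR is assumed to come from lbann/utils/exception.hpp:

    #include "lbann/proto/proto_common.hpp"
    #include "lbann/utils/exception.hpp"

    template <typename OptimizerMsg>
    double get_learn_rate_checked(OptimizerMsg const& proto_opt)
    {
      // Aborts with LBANN_ERROR, printing proto_opt.DebugString(), if the
      // message does not provide the field.
      LBANN_ASSERT_MSG_HAS_FIELD(proto_opt, learn_rate);
      return proto_opt.learn_rate();
    }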
*/ +std::string trim(std::string const& str); + +// These functions work on trimmed, nonempty strings +namespace details { + +template +std::vector parse_list_impl(std::string const& str) { +#ifdef LBANN_HAS_GPU_FP16 + using ParseType = typename std::conditional::value, float, T>::type; +#else + using ParseType = T; +#endif + ParseType entry; + std::vector list; + std::istringstream iss(str); + while (iss.good()) { + iss >> entry; + list.emplace_back(std::move(entry)); + } + return list; +} + +template +std::set parse_set_impl(std::string const& str) { +#ifdef LBANN_HAS_GPU_FP16 + using ParseType = typename std::conditional::value, float, T>::type; +#else + using ParseType = T; +#endif + ParseType entry; + std::set set; + std::istringstream iss(str); + while(iss.good()) { + iss >> entry; + set.emplace(std::move(entry)); + } + return set; +} + +// TODO (trb 07/25/19): we should think about what to do about bad +// input. That is, if a user calls parse_list("one two three"), +// the result is undefined (one test I did gave [0,0,0] and another +// gave [INT_MAX,INT_MAX,INT_MAX]). In most cases in LBANN, I would +// guess that this will result in a logic error further down the +// codepath, but we shouldn't count on it. + +}// namespace details + +/** @brief Parse a space-separated list. */ +template +std::vector parse_list(std::string const& str) { + auto trim_str = trim(str); + if (trim_str.size()) + return details::parse_list_impl(trim_str); + return {}; +} + +/** @brief Parse a space-separated set. */ +template +std::set parse_set(std::string const& str) { + auto trim_str = trim(str); + if (trim_str.size()) + return details::parse_set_impl(trim_str); + return {}; +} } // namespace lbann diff --git a/include/lbann/trainers/CMakeLists.txt b/include/lbann/trainers/CMakeLists.txt new file mode 100644 index 00000000000..827647c3c7a --- /dev/null +++ b/include/lbann/trainers/CMakeLists.txt @@ -0,0 +1,7 @@ +# Add the headers for this directory +set_full_path(THIS_DIR_HEADERS + trainer.hpp + ) + +# Propagate the files up the tree +set(HEADERS "${HEADERS}" "${THIS_DIR_HEADERS}" PARENT_SCOPE) diff --git a/include/lbann/trainers/trainer.hpp b/include/lbann/trainers/trainer.hpp new file mode 100644 index 00000000000..9d4ade530ba --- /dev/null +++ b/include/lbann/trainers/trainer.hpp @@ -0,0 +1,251 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
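The reworked parse_list and parse_set shown above now trim their input and return an empty container for whitespace-only strings; typical use:

    #include "lbann/proto/proto_common.hpp"

    #include <string>

    void parse_examples()
    {
      auto dims  = lbann::parse_list<int>(" 32 64 128 ");        // {32, 64, 128}
      auto modes = lbann::parse_set<std::string>("train test");  // {"test", "train"}
      auto none  = lbann::parse_list<double>("   ");             // {}
      (void) dims; (void) modes; (void) none;
    }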
+//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRAINER_HPP +#define LBANN_TRAINER_HPP + +#include "lbann/base.hpp" +#include "lbann/comm.hpp" +#include "lbann/data_coordinator/data_coordinator.hpp" +#include "lbann/models/model.hpp" +#include "lbann/execution_contexts/execution_context.hpp" +#include "lbann/io/persist.hpp" +#include "lbann/utils/threads/thread_pool.hpp" +#include "lbann/utils/hash.hpp" +#include +#include +#include +#include + +namespace lbann { + +// Forward-declare this. +class lbann_callback; +class training_algorithm; +class termination_criteria; + +/** Represents an LBANN trainer and its context. */ +class trainer { +public: + + /** Constructor. */ + trainer(lbann_comm *comm, + size_t mini_batch_size); + + /** Copy constructor. */ + trainer(const trainer& other); + /** Copy assignment operator. */ + trainer& operator=(const trainer& other); + /** Destructor. */ + ~trainer(); + + /** Archive for checkpoint and restart */ + template void serialize(Archive & ar) { + ar(CEREAL_NVP(m_persist), + CEREAL_NVP(m_max_mini_batch_size), + CEREAL_NVP(m_root_random_seed), + CEREAL_NVP(m_random_seed), + CEREAL_NVP(m_data_seq_random_seed)); + } + + /** Set the trainer's name; this is an arbitrary string + * that may be useful in multi-trainer scenarios, e.g, + * LTFB, jag + */ + void set_name(std::string const& name); + + /** Return the trainer's name; this is an arbitrary string + * that may be useful in multi-trainer scenarios, e.g, + * LTFB, jag + */ + std::string get_name() const { + return m_name; + } + + /** Human-readable description. */ + description get_description() const; + + /** Set the random seeds used for the trainer */ + void set_random_seeds(int root_random_seed, int random_seed, int data_seq_random_seed) { + m_root_random_seed = root_random_seed; + m_random_seed = random_seed; + m_data_seq_random_seed = data_seq_random_seed; + } + + int get_random_seed() const { return m_random_seed; } + int get_data_seq_random_seed() const { return m_data_seq_random_seed; } + + /** @brief Get the list of callbacks for the trainer. */ + std::vector> get_callbacks() { + std::vector> callback_list; + callback_list.reserve(m_callbacks.size()); + for (const auto& ptr : m_callbacks) { + callback_list.push_back(ptr.get()); + } + return callback_list; + } + + void add_callback(std::shared_ptr cb) { + if (cb == nullptr) { + throw lbann_exception("model: Attempted to add null pointer as a callback."); + } + m_callbacks.push_back(std::move(cb)); + } + + std::vector>& get_callbacks_with_ownership() { + return m_callbacks; + } + + /** Set up the trainer. 
*/ + void setup(std::unique_ptr io_thread_pool, std::map data_readers); + + using execution_context_key_pair_t = typename std::pair, execution_mode>; + + execution_context_key_pair_t + check_and_build_execution_context(training_algorithm& alg, + observer_ptr model, + execution_mode mode); + + execution_context_key_pair_t + check_and_build_execution_context(execution_context& c, + model& model, + execution_mode mode); + + execution_context& get_execution_context(observer_ptr model, + execution_mode mode); + + execution_context& get_execution_context(execution_context_key_pair_t key); + + void delete_execution_context(execution_context_key_pair_t key); + + void for_each_execution_context(std::function)>fn); + + data_coordinator& get_data_coordinator() { return m_data_coordinator; } + + void apply(training_algorithm& alg, + observer_ptr model, + execution_mode mode, + termination_criteria const& term_criteria); + + void train(observer_ptr model, El::Int num_epochs, El::Int num_batches=0); + + void evaluate(observer_ptr model, execution_mode mode, El::Int num_batches=0); + + /** Return the I/O thread pool */ + thread_pool& get_io_thread_pool() const { + if (!m_io_thread_pool) { LBANN_ERROR("m_io_thread_pool is null"); } + return *(m_io_thread_pool.get()); + } + + /** Get the trainer's comm. */ + inline lbann_comm *get_comm() const { + return m_comm; + } + + /** Get the trainer's persist object */ + inline persist& get_persist_obj() { + return m_persist; + } + + /** Get the trainer's maximum mini-batch size. */ + inline size_t get_max_mini_batch_size() const { + return m_max_mini_batch_size; + } + + /** Set a flag that can be used to enable / disable the background I/O activities */ + void allow_background_io_activity(bool enable) { m_background_io_allowed = enable; } + + /** Are background I/O activities enabled by the input layers */ + bool background_io_activity_allowed() { return m_background_io_allowed; } + + // =========================================== + // Checkpointing + // =========================================== + + /** @brief Checkpoint model to given file descriptor, return number of bytes written */ + bool save_to_checkpoint_shared(); + /** @brief Restore model by reading checkpoint from given file descriptor, return number of bytes read */ + bool load_from_checkpoint_shared(persist& p); + bool load_from_checkpoint_shared(model& m, execution_context& c); + + bool save_to_checkpoint_distributed(); + bool load_from_checkpoint_distributed(persist& p); + bool load_from_checkpoint_distributed(model& m, execution_context& c); + + /** @brief Write model to proto file */ + void write_proto(lbann_data::Trainer* proto); + +private: + + /** Give trainer a name. */ + std::string m_name; + + /** Communicator for the trainer. */ + lbann_comm *m_comm; + + /** @details Maximum possible minibatch size supported by models and + * layers in this trainer. Note that this field will eventually be + * local to the particular, instance of the training context.. 
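Pulling the public trainer interface above together, a rough driver fragment might look like the following. The thread-pool and data-reader map types are inferred (the template arguments are stripped in this hunk), so treat the exact signatures as assumptions:

    #include "lbann/trainers/trainer.hpp"

    #include <map>
    #include <memory>
    #include <utility>

    void run_training(lbann::lbann_comm* comm,
                      std::unique_ptr<lbann::thread_pool> io_pool,
                      std::map<lbann::execution_mode, lbann::generic_data_reader*> readers,
                      lbann::model& net)
    {
      lbann::trainer t(comm, /*mini_batch_size=*/128);
      t.set_name("trainer0");
      t.set_random_seeds(/*root_random_seed=*/42,
                         /*random_seed=*/43,
                         /*data_seq_random_seed=*/44);
      t.setup(std::move(io_pool), std::move(readers));
      t.train(&net, /*num_epochs=*/10);  // observer_ptr<model> is a non-owning pointer
    }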
+ */ + size_t m_max_mini_batch_size; + + // Root of the random seed tree: either default or user supplied + int m_root_random_seed; + // Random seed used for the general RNGs + int m_random_seed; + // Random seed used for the RNG used to fetch data + int m_data_seq_random_seed; + + /** Threads available for I/O */ + std::unique_ptr m_io_thread_pool; + + /** Flag that allows input layers to fetch data in the background */ + bool m_background_io_allowed; + + /** Persist object used for serializing LBANN classes */ + persist m_persist; + + /** Hash function for @c m_model_execution_context */ + using model_execution_context_hash_t = pair_hash, + execution_mode, + std::hash>, + enum_hash>; + + /** @brief Map from model and execution mode to its execution context */ + std::unordered_map, execution_mode>, + std::unique_ptr, + model_execution_context_hash_t> m_model_execution_context; + + /** @brief Current callbacks to process. */ + std::vector> m_callbacks; + + /** @brief Data Coordinator holding trainers data readers */ + data_coordinator m_data_coordinator; +}; + +} // namespace lbann + +#endif // LBANN_TRAINER_HPP diff --git a/include/lbann/training_algorithms/CMakeLists.txt b/include/lbann/training_algorithms/CMakeLists.txt new file mode 100644 index 00000000000..2240711572d --- /dev/null +++ b/include/lbann/training_algorithms/CMakeLists.txt @@ -0,0 +1,8 @@ +# Add the headers for this directory +set_full_path(THIS_DIR_HEADERS + training_algorithm.hpp + sgd_training_algorithm.hpp + ) + +# Propagate the files up the tree +set(HEADERS "${HEADERS}" "${THIS_DIR_HEADERS}" PARENT_SCOPE) diff --git a/include/lbann/training_algorithms/sgd_training_algorithm.hpp b/include/lbann/training_algorithms/sgd_training_algorithm.hpp new file mode 100644 index 00000000000..5721b00d670 --- /dev/null +++ b/include/lbann/training_algorithms/sgd_training_algorithm.hpp @@ -0,0 +1,111 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_SGD_TRAINING_ALGORITHM_HPP +#define LBANN_SGD_TRAINING_ALGORITHM_HPP + +#include "lbann/training_algorithms/training_algorithm.hpp" +#include "lbann/execution_contexts/sgd_execution_context.hpp" + +namespace lbann { + +/** @brief Base class for LBANN SGD-family training algorithms. */ +class sgd_training_algorithm : public training_algorithm { +public: + + /** Constructor. */ + sgd_training_algorithm() {}; + /** Copy constructor. 
*/ + sgd_training_algorithm(const sgd_training_algorithm& other) = default; + /** Copy assignment operator. */ + sgd_training_algorithm& operator=(const sgd_training_algorithm& other) = default; + /** Move constructor. */ + sgd_training_algorithm(sgd_training_algorithm&& other) = default; + /** Move assignment operator. */ + sgd_training_algorithm& operator=(sgd_training_algorithm&& other) = default; + /** Destructor. */ + virtual ~sgd_training_algorithm() = default; + /** Copy training_algorithm. */ + // virtual sgd_training_algorithm* copy() const = default; + + std::string get_name() const { return "sgd"; } + + // =========================================== + // Execution + // =========================================== + + /** Apply the training algorithm to the model with the provided + context and execution mode */ + void apply(execution_context& c, + model& model, + data_coordinator& dc, + execution_mode mode, + termination_criteria const& term_criteria) override; + + /** Train a model using an iterative SGD solver. */ + void train(sgd_execution_context& c, + model& model, + data_coordinator& dc, + size_t num_epochs, size_t num_batches=0); + + /** Evaluate a model using the forward pass of an SGD solver. */ + void evaluate(sgd_execution_context& c, + model& model, + data_coordinator& dc, + execution_mode mode, size_t num_batches=0); + +protected: + /** Train model on one step / mini-batch of an SGD forward pass */ + virtual bool train_mini_batch(sgd_execution_context& c, model& model, data_coordinator& dc); + + /** Evaluate model on one step / mini-batch of an SGD forward pass */ + virtual bool evaluate_mini_batch(sgd_execution_context& c, model& model, data_coordinator& dc, execution_mode mode); + + //////////////////////////////////////////////////////////// + // Callbacks + //////////////////////////////////////////////////////////// + + /** Execute callbacks at start of training. */ + virtual void do_train_begin_cbs(model& model); + /** Execute callbacks at end of training. */ + virtual void do_train_end_cbs(model& model); + /** Execute callbacks at start of evaluation. */ + virtual void do_evaluate_begin_cbs(model& model, execution_mode mode); + /** Execute callbacks at end of evaluation. */ + virtual void do_evaluate_end_cbs(model& model, execution_mode mode); + /** Execute callbacks at start of epoch. */ + virtual void do_epoch_begin_cbs(model& model); + /** Execute callbacks at end of epoch. */ + virtual void do_epoch_end_cbs(model& model); + /** Execute callbacks at start of mini-batch. */ + virtual void do_batch_begin_cbs(model& model, execution_mode mode); + /** Execute callbacks at end of mini-batch. */ + virtual void do_batch_end_cbs(model& model, execution_mode mode); +}; + +} // namespace lbann + +#endif // LBANN_SGD_TRAINING_ALGORITHM_HPP diff --git a/include/lbann/training_algorithms/training_algorithm.hpp b/include/lbann/training_algorithms/training_algorithm.hpp new file mode 100644 index 00000000000..dfec6bfeef1 --- /dev/null +++ b/include/lbann/training_algorithms/training_algorithm.hpp @@ -0,0 +1,73 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. 
For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRAINING_ALGORITHM_HPP +#define LBANN_TRAINING_ALGORITHM_HPP + +#include "lbann/base.hpp" +#include "lbann/execution_contexts/execution_context.hpp" +#include "lbann/models/model.hpp" +#include "lbann/data_coordinator/data_coordinator.hpp" + +namespace lbann { + +// Forward-declare this. +class execution_context; + +/** Base class for LBANN training_algorithms. */ +class training_algorithm { +public: + + /** Constructor. */ + training_algorithm() {}; + /** Copy constructor. */ + training_algorithm(const training_algorithm& other) = default; + /** Copy assignment operator. */ + training_algorithm& operator=(const training_algorithm& other) = default; + /** Move constructor. */ + training_algorithm(training_algorithm&& other) = default; + /** Move assignment operator. */ + training_algorithm& operator=(training_algorithm&& other) = default; + /** Destructor. */ + virtual ~training_algorithm() = default; + /** Copy training_algorithm. */ + // virtual training_algorithm* copy() const = default; + + virtual std::string get_name() const = 0; + + virtual void apply(execution_context& context, + model& model, + data_coordinator& dc, + execution_mode mode, + termination_criteria const& term_criteria) = 0; + + void setup_models(std::vector> models, size_t max_mini_batch_size, DataReaderMetaData& dr_metadata); + +}; + +} // namespace lbann + +#endif // LBANN_TRAINING_ALGORITHM_HPP diff --git a/include/lbann/transforms/CMakeLists.txt b/include/lbann/transforms/CMakeLists.txt new file mode 100644 index 00000000000..73511e8f331 --- /dev/null +++ b/include/lbann/transforms/CMakeLists.txt @@ -0,0 +1,15 @@ +# Add the headers for this directory +set_full_path(THIS_DIR_HEADERS + normalize.hpp + repack_HWC_to_CHW_layout.hpp + sample_normalize.hpp + scale.hpp + scale_and_translate.hpp + transform.hpp + transform_pipeline.hpp + ) + +add_subdirectory(vision) + +# Propagate the files up the tree +set(HEADERS "${HEADERS}" "${THIS_DIR_HEADERS}" PARENT_SCOPE) diff --git a/include/lbann/transforms/normalize.hpp b/include/lbann/transforms/normalize.hpp new file mode 100644 index 00000000000..77bfa649489 --- /dev/null +++ b/include/lbann/transforms/normalize.hpp @@ -0,0 +1,79 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. 
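training_algorithm above is a small abstract interface: a name plus an apply hook that drives a model under an execution context until the termination criteria are met; sgd_training_algorithm is the one concrete implementation in this patch. A skeleton of what another implementation would provide (hypothetical class, assuming termination_criteria is visible through the included headers):

    #include "lbann/training_algorithms/training_algorithm.hpp"

    #include <string>

    class my_search_algorithm : public lbann::training_algorithm {
    public:
      std::string get_name() const override { return "my_search"; }

      void apply(lbann::execution_context& context,
                 lbann::model& model,
                 lbann::data_coordinator& dc,
                 lbann::execution_mode mode,
                 lbann::termination_criteria const& term_criteria) override
      {
        // Drive forward/backward passes here; sgd_training_algorithm::apply is
        // the reference for how the context and criteria are consumed.
      }
    };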
+// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_NORMALIZE_HPP_INCLUDED +#define LBANN_TRANSFORMS_NORMALIZE_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" +#include "lbann/utils/exception.hpp" + +#include + +#include + +namespace lbann { +namespace transform { + +/** + * Normalize with mean and standard deviation. + * This is done channel-wise for images. If the input does not have channels, + * (e.g. it is not an image), it is treated as having one "channel". + * This is only applicable after conversion to an LBANN CPUMat. + */ +class normalize : public transform { +public: + /** Apply channel-wise means and standard deviations. */ + normalize(std::vector means, std::vector stds) : + transform(), m_means(means), m_stds(stds) { + if (m_means.size() != m_stds.size()) { + LBANN_ERROR("Normalize mean and std have different numbers of channels."); + } + } + + transform* copy() const override { return new normalize(*this); } + + std::string get_type() const override { return "normalize"; } + + bool supports_non_inplace() const override { return true; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; + void apply(utils::type_erased_matrix& data, CPUMat& out, + std::vector& dims) override; +private: + /** Channel-wise means. */ + std::vector m_means; + /** Channel-wise standard deviations. */ + std::vector m_stds; +}; + +// Builder function +std::unique_ptr +build_normalize_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_NORMALIZED_CENTER_CROP_HPP_INCLUDED diff --git a/include/lbann/transforms/repack_HWC_to_CHW_layout.hpp b/include/lbann/transforms/repack_HWC_to_CHW_layout.hpp new file mode 100644 index 00000000000..59a02fc78fe --- /dev/null +++ b/include/lbann/transforms/repack_HWC_to_CHW_layout.hpp @@ -0,0 +1,56 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. 
See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef LBANN_TRANSFORMS_REPACK_HWC_TO_CHW_LAYOUT_HPP_INCLUDED
+#define LBANN_TRANSFORMS_REPACK_HWC_TO_CHW_LAYOUT_HPP_INCLUDED
+
+#include "lbann/transforms/transform.hpp"
+
+namespace lbann {
+namespace transform {
+
+/**
+ * Convert data to LBANN's native data layout.
+ * Currently only supports converting from an interleaved channel format.
+ */
+class repack_HWC_to_CHW_layout : public transform {
+public:
+  transform* copy() const override { return new repack_HWC_to_CHW_layout(*this); }
+
+  std::string get_type() const override { return "to_lbann_layout"; }
+
+  bool supports_non_inplace() const override { return true; }
+
+  void apply(utils::type_erased_matrix& data, std::vector<size_t>& dims) override;
+
+  void apply(utils::type_erased_matrix& data, CPUMat& out,
+             std::vector<size_t>& dims) override;
+};
+
+} // namespace transform
+} // namespace lbann
+
+#endif // LBANN_TRANSFORMS_REPACK_HWC_TO_CHW_LAYOUT_HPP_INCLUDED
diff --git a/include/lbann/transforms/sample_normalize.hpp b/include/lbann/transforms/sample_normalize.hpp
new file mode 100644
index 00000000000..b6766d16915
--- /dev/null
+++ b/include/lbann/transforms/sample_normalize.hpp
@@ -0,0 +1,57 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef LBANN_TRANSFORMS_SAMPLE_NORMALIZE_HPP_INCLUDED
+#define LBANN_TRANSFORMS_SAMPLE_NORMALIZE_HPP_INCLUDED
+
+#include "lbann/transforms/transform.hpp"
+
+#include <google/protobuf/message.h>
+
+namespace lbann {
+namespace transform {
+
+/**
+ * Normalize to have mean 0, standard deviation 1.
+ * This only works after conversion to an LBANN CPUMat.
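Per the comment above, sample_normalize standardizes each sample to zero mean and unit standard deviation. A scalar reference of that computation under the same assumption, operating on a flat copy of one sample rather than a CPUMat (a real implementation would also guard against a zero standard deviation):

    #include <cmath>
    #include <vector>

    void sample_normalize_reference(std::vector<float>& x)
    {
      float mean = 0.f;
      for (float v : x) { mean += v; }
      mean /= x.size();

      float var = 0.f;
      for (float v : x) { var += (v - mean) * (v - mean); }
      const float stdev = std::sqrt(var / x.size());

      for (float& v : x) { v = (v - mean) / stdev; }  // now mean 0, std 1
    }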
+ */ +class sample_normalize : public transform { +public: + transform* copy() const override { return new sample_normalize(*this); } + + std::string get_type() const override { return "sample_normalize"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; +}; + +// Builder function +std::unique_ptr +build_sample_normalize_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_SAMPLE_NORMALIZE_HPP_INCLUDED diff --git a/include/lbann/transforms/scale.hpp b/include/lbann/transforms/scale.hpp new file mode 100644 index 00000000000..36ff3bad6ad --- /dev/null +++ b/include/lbann/transforms/scale.hpp @@ -0,0 +1,62 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_SCALE_HPP_INCLUDED +#define LBANN_TRANSFORMS_SCALE_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +#include + +namespace lbann { +namespace transform { + +/** Scale data by a constant. */ +class scale : public transform { +public: + /** Scale all data by scale_val. */ + scale(float scale_val) : transform(), m_scale(scale_val) {} + + transform* copy() const override { return new scale(*this); } + + std::string get_type() const override { return "scale"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; +private: + /** Amount to scale data by. */ + float m_scale; +}; + +// Builder function +std::unique_ptr +build_scale_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_SCALE_HPP_INCLUDED diff --git a/include/lbann/transforms/scale_and_translate.hpp b/include/lbann/transforms/scale_and_translate.hpp new file mode 100644 index 00000000000..42821168b33 --- /dev/null +++ b/include/lbann/transforms/scale_and_translate.hpp @@ -0,0 +1,57 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. 
For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_SCALE_AND_TRANSLATE_HPP_INCLUDED +#define LBANN_TRANSFORMS_SCALE_AND_TRANSLATE_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +namespace lbann { +namespace transform { + +/** Scale and Translate data by a constant pair of constants. */ +class scale_and_translate : public transform { +public: + /** Scale_And_Translate all data by scale_and_translate_val. */ + scale_and_translate(float scale_val, float translate_val) + : transform(), m_scale(scale_val), m_translate(translate_val) {} + + transform* copy() const override { return new scale_and_translate(*this); } + + std::string get_type() const override { return "scale"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; +private: + /** Amount to scale data by. */ + float m_scale; + /** Amount to translate data by. */ + float m_translate; +}; + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_SCALE_AND_TRANSLATE_HPP_INCLUDED diff --git a/include/lbann/transforms/transform.hpp b/include/lbann/transforms/transform.hpp new file mode 100644 index 00000000000..140028a6429 --- /dev/null +++ b/include/lbann/transforms/transform.hpp @@ -0,0 +1,112 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_TRANSFORM_HPP_INCLUDED +#define LBANN_TRANSFORMS_TRANSFORM_HPP_INCLUDED + +#include "lbann/base.hpp" +#include "lbann/utils/description.hpp" +#include "lbann/utils/random.hpp" +#include "lbann/utils/type_erased_matrix.hpp" +#include "lbann/utils/exception.hpp" + +namespace lbann { +namespace transform { + +/** + * Abstract base class for transforms on data. 
+ * + * A transform takes a CPUMat and modifies it in-place. Transforms should + * be thread-safe, as one instance of a transform may be called concurrently + * within multiple threads. + * + * Because transforms may switch between underlying data types throughout the + * pipeline, everything is done in terms of a type_erased_matrix, which can + * swap between underlying data types. + */ +class transform { +public: + transform() = default; + transform(const transform&) = default; + transform& operator=(const transform&) = default; + virtual ~transform() = default; + + /** Create a copy of the transform instance. */ + virtual transform* copy() const = 0; + + /** Human-readable type name. */ + virtual std::string get_type() const = 0; + /** Human-readable description. */ + virtual description get_description() const { + return description(get_type() + " transform"); + } + + /** True if the transform supports non-in-place apply. */ + virtual bool supports_non_inplace() const { + return false; + } + + /** + * Apply the transform to data. + * @param data The input data to transform, which is modified in-place. The + * matrix shuold be contiguous. + * @param dims The dimensions of the data tensor. For "plain data", dims + * should have one entry, giving its size. For images, dims should have three + * entries: channels, height, width. + * @note dims is a hack until we have proper tensors. + */ + virtual void apply(utils::type_erased_matrix& data, std::vector& dims) = 0; + + /** + * Apply the transform to data. + * This does not modify data in-place but places its output in out. + */ + virtual void apply(utils::type_erased_matrix& data, CPUMat& out, + std::vector& dims) { + LBANN_ERROR("Non-in-place apply not implemented."); + } +protected: + /** Return a value uniformly at random in [a, b). */ + static inline float get_uniform_random(float a, float b) { + fast_rng_gen& gen = get_fast_io_generator(); + std::uniform_real_distribution dist(a, b); + return dist(gen); + } + /** Return true with probability p. */ + static inline bool get_bool_random(float p) { + return get_uniform_random(0.0, 1.0) < p; + } + /** Return an integer uniformly at random in [a, b). */ + static inline El::Int get_uniform_random_int(El::Int a, El::Int b) { + fast_rng_gen& gen = get_fast_io_generator(); + return fast_rand_int(gen, b - a) + a; + } +}; + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_TRANSFORM_HPP_INCLUDED diff --git a/include/lbann/transforms/transform_pipeline.hpp b/include/lbann/transforms/transform_pipeline.hpp new file mode 100644 index 00000000000..50ffb91b799 --- /dev/null +++ b/include/lbann/transforms/transform_pipeline.hpp @@ -0,0 +1,95 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
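The transform interface above (copy, get_type, and an in-place apply on a type_erased_matrix plus a dims vector) is all a new transform has to implement. A hypothetical minimal example, assuming the sample has already been converted to LBANN's DataType and that dims is the std::vector<size_t> used by the other transforms in this patch:

    #include "lbann/transforms/transform.hpp"

    #include <string>
    #include <vector>

    class negate : public lbann::transform::transform {
    public:
      transform* copy() const override { return new negate(*this); }

      std::string get_type() const override { return "negate"; }

      void apply(lbann::utils::type_erased_matrix& data,
                 std::vector<size_t>& /*dims*/) override
      {
        // Flip the sign of every element of the underlying CPU matrix in-place.
        auto& mat = data.get<lbann::DataType>();
        El::Scale(lbann::DataType(-1), mat);
      }
    };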
You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_TRANSFORM_PIPELINE_HPP_INCLUDED +#define LBANN_TRANSFORMS_TRANSFORM_PIPELINE_HPP_INCLUDED + +#include "lbann/base.hpp" +#include "lbann/utils/description.hpp" +#include "lbann/transforms/transform.hpp" + +namespace lbann { +namespace transform { + +/** + * Applies a sequence of transforms to input data. + */ +class transform_pipeline { +public: + transform_pipeline() {} + transform_pipeline(const transform_pipeline&); + transform_pipeline(transform_pipeline&&) = default; + transform_pipeline& operator=(const transform_pipeline&); + transform_pipeline& operator=(transform_pipeline&&) = default; + ~transform_pipeline() {} + + transform_pipeline* copy() const { return new transform_pipeline(*this); } + + /** + * Add trans as the next transform to apply. + */ + void add_transform(std::unique_ptr&& trans) { + m_transforms.push_back(std::move(trans)); + } + + /** + * Set the expected dimensions of the data after applying the transforms. + * This is primarily meant as a debugging aid/sanity check. + */ + void set_expected_out_dims(std::vector expected_out_dims) { + m_expected_out_dims = expected_out_dims; + } + + /** + * Apply the transforms to data. + * @param data The data to transform. data will be modified in-place. + * @param dims Dimensions of data. Will be modified in-place. + */ + void apply(utils::type_erased_matrix& data, std::vector& dims); + /** Apply to CPUMat data, which will be modified in-place. */ + void apply(CPUMat& data, std::vector& dims); + /** + * Apply the transforms to data. + * @param data The data to transform. Will be modified in-place. + * @param out_data Output will be placed here. It will not be reallocated. + * @param dims Dimensions of data. Will be modified in-place. + */ + void apply(El::Matrix& data, CPUMat& out_data, + std::vector& dims); +private: + /** Ordered list of transforms to apply. */ + std::vector> m_transforms; + /** Expected dimensions after applying all transforms. */ + std::vector m_expected_out_dims; + + /** Assert dims matches expected_out_dims (if set). 
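A sketch of assembling a pipeline by hand from the transforms in this patch; in practice construct_transform_pipeline, declared in proto/factories.hpp earlier in this diff, builds the equivalent pipeline from a Reader prototext. The std::vector<size_t> dims type is assumed, as above:

    #include "lbann/transforms/sample_normalize.hpp"
    #include "lbann/transforms/scale.hpp"
    #include "lbann/transforms/transform_pipeline.hpp"

    #include <memory>

    lbann::transform::transform_pipeline make_pipeline()
    {
      lbann::transform::transform_pipeline tp;
      tp.add_transform(std::make_unique<lbann::transform::scale>(1.0f / 255.0f));
      tp.add_transform(std::make_unique<lbann::transform::sample_normalize>());
      tp.set_expected_out_dims({3, 224, 224});  // channels, height, width (sanity check only)
      return tp;
    }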
*/ + void assert_expected_out_dims(const std::vector& dims); +}; + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_TRANSFORM_PIPELINE_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/CMakeLists.txt b/include/lbann/transforms/vision/CMakeLists.txt new file mode 100644 index 00000000000..2bd30f178c3 --- /dev/null +++ b/include/lbann/transforms/vision/CMakeLists.txt @@ -0,0 +1,24 @@ +# Add the headers for this directory +set_full_path(THIS_DIR_HEADERS + adjust_brightness.hpp + adjust_contrast.hpp + adjust_saturation.hpp + center_crop.hpp + colorize.hpp + color_jitter.hpp + cutout.hpp + grayscale.hpp + horizontal_flip.hpp + normalize_to_lbann_layout.hpp + random_affine.hpp + random_crop.hpp + random_resized_crop.hpp + random_resized_crop_with_fixed_aspect_ratio.hpp + resize.hpp + resized_center_crop.hpp + to_lbann_layout.hpp + vertical_flip.hpp + ) + +# Propagate the files up the tree +set(HEADERS "${HEADERS}" "${THIS_DIR_HEADERS}" PARENT_SCOPE) diff --git a/include/lbann/transforms/vision/adjust_brightness.hpp b/include/lbann/transforms/vision/adjust_brightness.hpp new file mode 100644 index 00000000000..649c24c8feb --- /dev/null +++ b/include/lbann/transforms/vision/adjust_brightness.hpp @@ -0,0 +1,68 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_ADJUST_BRIGHTNESS_HPP_INCLUDED +#define LBANN_TRANSFORMS_ADJUST_BRIGHTNESS_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** Adjust the brightness of an image. */ +class adjust_brightness : public transform { +public: + /** + * Adjust brightness with given factor. + * @param factor A non-negative factor. 0 gives a black image, 1 the original. + */ + adjust_brightness(float factor) : transform(), m_factor(factor) { + if (factor < 0.0f) { + LBANN_ERROR("Brightness factor must be non-negative."); + } + } + + transform* copy() const override { return new adjust_brightness(*this); } + + std::string get_type() const override { return "adjust_brightness"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; + +private: + /** Factor to adjust brightness by. 
*/ + float m_factor; +}; + +// Builder function +std::unique_ptr +build_adjust_brightness_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_ADJUST_BRIGHTNESS_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/adjust_contrast.hpp b/include/lbann/transforms/vision/adjust_contrast.hpp new file mode 100644 index 00000000000..3c33a747289 --- /dev/null +++ b/include/lbann/transforms/vision/adjust_contrast.hpp @@ -0,0 +1,73 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_ADJUST_CONTRAST_HPP_INCLUDED +#define LBANN_TRANSFORMS_ADJUST_CONTRAST_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** + * Adjust the contrast of an image. + * + * This operates similarly to the contrast control on a television. + */ +class adjust_contrast : public transform { +public: + /** + * Adjust contrast with given factor. + * @param factor A non-negative factor. 0 gives a solid grey image, + * 1 the original. + */ + adjust_contrast(float factor) : transform(), m_factor(factor) { + if (factor < 0.0f) { + LBANN_ERROR("Contrast factor must be non-negative."); + } + } + + transform* copy() const override { return new adjust_contrast(*this); } + + std::string get_type() const override { return "adjust_contrast"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; + +private: + /** Factor to adjust contrast by. */ + float m_factor; +}; + +// Builder function +std::unique_ptr +build_adjust_contrast_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_ADJUST_CONTRAST_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/adjust_saturation.hpp b/include/lbann/transforms/vision/adjust_saturation.hpp new file mode 100644 index 00000000000..65fb2f9e636 --- /dev/null +++ b/include/lbann/transforms/vision/adjust_saturation.hpp @@ -0,0 +1,75 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. 
+// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_ADJUST_SATURATION_HPP_INCLUDED +#define LBANN_TRANSFORMS_ADJUST_SATURATION_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** + * Adjust the saturation of an image. + * + * This operates similarly to the controls on a color television + * (as opposed to a direct adjustment of saturation) by interpolating + * between the original value and its grayscale value. + */ +class adjust_saturation : public transform { +public: + /** + * Adjust saturation with given factor. + * @param factor A non-negative factor. 0 gives a grayscale image, + * 1 the original. + */ + adjust_saturation(float factor) : transform(), m_factor(factor) { + if (factor < 0.0f) { + LBANN_ERROR("Saturation factor must be non-negative."); + } + } + + transform* copy() const override { return new adjust_saturation(*this); } + + std::string get_type() const override { return "adjust_saturation"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; + +private: + /** Factor to adjust saturation by. */ + float m_factor; +}; + + +std::unique_ptr +build_adjust_saturation_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_ADJUST_SATURATION_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/center_crop.hpp b/include/lbann/transforms/vision/center_crop.hpp new file mode 100644 index 00000000000..9d4b2026a7e --- /dev/null +++ b/include/lbann/transforms/vision/center_crop.hpp @@ -0,0 +1,59 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
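adjust_brightness, adjust_contrast, and adjust_saturation all describe their factor the same way: 0 yields the degenerate image (black, solid grey, or grayscale, respectively), 1 yields the original, and values above 1 exaggerate the adjustment. One common way to realize that description, which the wording above suggests but the headers do not mandate, is a per-pixel blend against a reference value (0 for brightness, the image's mean grey for contrast, the pixel's own grey value for saturation):

    // Illustrative only; the actual kernels live in the corresponding .cpp files.
    inline float blend(float original, float reference, float factor) {
      // factor == 0 -> reference, factor == 1 -> original, factor > 1 -> overshoot.
      return factor * original + (1.0f - factor) * reference;
    }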
+//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_CENTER_CROP_HPP_INCLUDED +#define LBANN_TRANSFORMS_CENTER_CROP_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** Crop an image at the center. */ +class center_crop : public transform { +public: + /** Crop to an h x w image. */ + center_crop(size_t h, size_t w) : transform(), m_h(h), m_w(w) {} + + transform* copy() const override { return new center_crop(*this); } + + std::string get_type() const override { return "center_crop"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; +private: + /** Height and width of the crop. */ + size_t m_h, m_w; +}; + +std::unique_ptr +build_center_crop_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_CENTER_CROP_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/color_jitter.hpp b/include/lbann/transforms/vision/color_jitter.hpp new file mode 100644 index 00000000000..cd0ac8805a0 --- /dev/null +++ b/include/lbann/transforms/vision/color_jitter.hpp @@ -0,0 +1,85 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_COLOR_JITTER_HPP_INCLUDED +#define LBANN_TRANSFORMS_COLOR_JITTER_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** + * Randomly change brightness, contrast, and saturation. + * This randomly adjusts brightness, contrast, and saturation, in a random + * order. + */ +class color_jitter : public transform { +public: + /** + * Randomly adjust brightness, contrast, and saturation within given ranges. + * Set both min and max to 0 to disable that adjustment. + * @param min_brightness_factor Minimum brightness adjustment (>= 0). + * @param max_brightness_factor Maximum brightness adjustment. + * @param min_contrast_factor Minimum contrast adjustment (>= 0). + * @param max_contrast_factor Maximum contrast adjustment. + * @param min_saturation_factor Minimum saturation adjustment (>= 0). + * @param max_saturation_factor Maximum saturation adjustment. 
+ */ + color_jitter(float min_brightness_factor, float max_brightness_factor, + float min_contrast_factor, float max_contrast_factor, + float min_saturation_factor, float max_saturation_factor); + + transform* copy() const override { return new color_jitter(*this); } + + std::string get_type() const override { return "color_jitter"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; + +private: + /** Minimum brightness factor. */ + float m_min_brightness_factor; + /** Maximum brightness factor. */ + float m_max_brightness_factor; + /** Minimum contrast factor. */ + float m_min_contrast_factor; + /** Maximum contrast factor. */ + float m_max_contrast_factor; + /** Minimum saturation factor. */ + float m_min_saturation_factor; + /** Maximum saturation factor. */ + float m_max_saturation_factor; +}; + +std::unique_ptr +build_color_jitter_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_COLOR_JITTER_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/colorize.hpp b/include/lbann/transforms/vision/colorize.hpp new file mode 100644 index 00000000000..48864b0869f --- /dev/null +++ b/include/lbann/transforms/vision/colorize.hpp @@ -0,0 +1,53 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_COLORIZE_HPP_INCLUDED +#define LBANN_TRANSFORMS_COLORIZE_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** Convert an image from grayscale to color. */ +class colorize : public transform { +public: + transform* copy() const override { return new colorize(*this); } + + std::string get_type() const override { return "colorize"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; +}; + +std::unique_ptr +build_colorize_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_COLORIZE_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/cutout.hpp b/include/lbann/transforms/vision/cutout.hpp new file mode 100644 index 00000000000..b41c71f2800 --- /dev/null +++ b/include/lbann/transforms/vision/cutout.hpp @@ -0,0 +1,87 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. 
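For the color_jitter transform above, each of the three adjustments is drawn from its own [min, max] range, and setting both bounds of a range to 0 disables that adjustment entirely. A typical construction, with purely illustrative values:

    #include "lbann/transforms/vision/color_jitter.hpp"

    // Jitter brightness and contrast by up to +/-25%; leave saturation alone.
    lbann::transform::color_jitter jitter(0.75f, 1.25f,  // brightness range
                                          0.75f, 1.25f,  // contrast range
                                          0.0f,  0.0f);  // saturation disabled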
+// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_CUTOUT_HPP_INCLUDED +#define LBANN_TRANSFORMS_CUTOUT_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** + * Cutout data augmentation which randomly masks out square regions of input. + * + * See: + * + * DeVries and Taylor. "Improved Regularization of Convolutional Neural + * Networks with Cutout". arXiv preprint arXiv:1708.04552 (2017). + * + * This will randomly select a center pixel for each square and set all pixels + * within that square to 0. It is permissible for portions of the masks to lie + * outside of the image. + * + * Normalization about 0 should be applied after applying cutout. + */ +class cutout : public transform { +public: + /** + * Cutout with a given number of squares of a given size. + * @param num_holes Number of squares to mask out (must be positive). + * @param length Length of a side of the square (must be positive). + */ + cutout(size_t num_holes, size_t length) : + transform(), m_num_holes(num_holes), m_length(length) { + if (num_holes == 0) { + LBANN_ERROR("num_holes must be positive, got 0"); + } + if (length == 0) { + LBANN_ERROR("length must be positive, got 0"); + } + } + + transform* copy() const override { return new cutout(*this); } + + std::string get_type() const override { return "cutout"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; + +private: + /** Number of squares that will be masked out. */ + size_t m_num_holes; + /** Length of a side of each square that will be masked out. */ + size_t m_length; +}; + +std::unique_ptr +build_cutout_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_CUTOUT_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/grayscale.hpp b/include/lbann/transforms/vision/grayscale.hpp new file mode 100644 index 00000000000..a03b2b940cd --- /dev/null +++ b/include/lbann/transforms/vision/grayscale.hpp @@ -0,0 +1,53 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. 
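The cutout transform above picks a random center pixel for each of its num_holes squares and zeroes every pixel of that square that falls inside the image. A hypothetical sketch of the masking step for a single hole on a planar channels x height x width float buffer; the real implementation operates on LBANN's image representation, so treat the layout here as an assumption:

    #include <algorithm>
    #include <cstddef>

    // Zero a length x length square centered at (cy, cx), clamped to the image.
    void mask_square(float* img, std::size_t channels, std::size_t height,
                     std::size_t width, std::size_t cy, std::size_t cx,
                     std::size_t length) {
      const std::size_t half = length / 2;
      const std::size_t y0 = (cy > half) ? cy - half : 0;
      const std::size_t x0 = (cx > half) ? cx - half : 0;
      const std::size_t y1 = std::min(height, cy + (length - half));
      const std::size_t x1 = std::min(width,  cx + (length - half));
      for (std::size_t c = 0; c < channels; ++c) {
        for (std::size_t y = y0; y < y1; ++y) {
          for (std::size_t x = x0; x < x1; ++x) {
            img[(c * height + y) * width + x] = 0.0f;  // mask out the hole
          }
        }
      }
    }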
For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_GRAYSCALE_HPP_INCLUDED +#define LBANN_TRANSFORMS_GRAYSCALE_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** Convert an image to grayscale. */ +class grayscale : public transform { +public: + transform* copy() const override { return new grayscale(*this); } + + std::string get_type() const override { return "grayscale"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; +}; + +std::unique_ptr +build_grayscale_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_GRAYSCALE_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/horizontal_flip.hpp b/include/lbann/transforms/vision/horizontal_flip.hpp new file mode 100644 index 00000000000..0d7a640f698 --- /dev/null +++ b/include/lbann/transforms/vision/horizontal_flip.hpp @@ -0,0 +1,60 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_HORIZONTAL_FLIP_HPP_INCLUDED +#define LBANN_TRANSFORMS_HORIZONTAL_FLIP_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** Horizontally flip image data with given probability. */ +class horizontal_flip : public transform { +public: + /** Flip image with probability p. 
*/ + horizontal_flip(float p) : transform(), m_p(p) {} + + transform* copy() const override { return new horizontal_flip(*this); } + + std::string get_type() const override { return "horizontal_flip"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; + +private: + /** Probability that that the image is flipped. */ + float m_p; +}; + +std::unique_ptr +build_horizontal_flip_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_HORIZONTAL_FLIP_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/normalize_to_lbann_layout.hpp b/include/lbann/transforms/vision/normalize_to_lbann_layout.hpp new file mode 100644 index 00000000000..ef91c7fedaa --- /dev/null +++ b/include/lbann/transforms/vision/normalize_to_lbann_layout.hpp @@ -0,0 +1,79 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_NORMALIZE_TO_LBANN_LAYOUT_HPP_INCLUDED +#define LBANN_TRANSFORMS_NORMALIZE_TO_LBANN_LAYOUT_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** + * Normalize and convert data to LBANN's native data layout. + * Currently only supports converting from OpenCV layouts. + * This normalizes with provided channel-wise means and standard deviations, + * scales from [0, 255] to [0, 1], and converts to LBANN's data layout. + * Normalization is applied after the scaling to [0, 1]. + * This essentially fuses the to_lbann_layout and normalize transforms. + */ +class normalize_to_lbann_layout : public transform { +public: + /** Apply channel-wise means and standard deviations. */ + normalize_to_lbann_layout(std::vector means, std::vector stds) : + transform(), m_means(means), m_stds(stds) { + if (m_means.size() != m_stds.size()) { + LBANN_ERROR("Normalize mean and std have different numbers of channels."); + } + } + + transform* copy() const override { return new normalize_to_lbann_layout(*this); } + + std::string get_type() const override { return "normalize_to_lbann_layout"; } + + bool supports_non_inplace() const override { return true; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; + + void apply(utils::type_erased_matrix& data, CPUMat& out, + std::vector& dims) override; +private: + /** Channel-wise means. 
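normalize_to_lbann_layout above fuses three steps: rescale raw [0, 255] values to [0, 1], apply the per-channel statistics, and convert to LBANN's layout. Assuming the conventional (x - mean) / std form of normalization, the per-pixel arithmetic implied by the class description is:

    // Illustrative per-pixel computation; mean_c and std_c belong to the pixel's channel.
    inline float normalize_pixel(unsigned char raw, float mean_c, float std_c) {
      const float scaled = static_cast<float>(raw) / 255.0f;  // [0, 255] -> [0, 1]
      return (scaled - mean_c) / std_c;                       // channel-wise stats
    }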
*/ + std::vector m_means; + /** Channel-wise standard deviations. */ + std::vector m_stds; +}; + +std::unique_ptr +build_normalize_to_lbann_layout_transform_from_pbuf( + google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_NORMALIZE_TO_LBANN_LAYOUT_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/random_affine.hpp b/include/lbann/transforms/vision/random_affine.hpp new file mode 100644 index 00000000000..4ef0c587a24 --- /dev/null +++ b/include/lbann/transforms/vision/random_affine.hpp @@ -0,0 +1,82 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_RANDOM_AFFINE_HPP_INCLUDED +#define LBANN_TRANSFORMS_RANDOM_AFFINE_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** Apply a random affine transform to an image. */ +class random_affine : public transform { +public: + /** + * Set up the affine transform. + * Rotate a random number of degrees selected in [rotate_min, rotate_max]. + * Translate the vertical dimension in a random amount in [-h*translate_h, + * h*translate_h], and the horizontal dimension in [-w*translate_w, + * w*translate_w]. + * Scale by a random amount in [scale_min, scale_max]. + * Shear by a random number of degrees in [shear_min, shear_max]. + * Set arguments to 0 to disable that transform. + */ + random_affine(float rotate_min, float rotate_max, + float translate_h, float translate_w, + float scale_min, float scale_max, + float shear_min, float shear_max) : + transform(), + m_rotate_min(rotate_min), m_rotate_max(rotate_max), + m_translate_h(translate_h), m_translate_w(translate_w), + m_scale_min(scale_min), m_scale_max(scale_max), + m_shear_min(shear_min), m_shear_max(shear_max) {} + + transform* copy() const override { return new random_affine(*this); } + + std::string get_type() const override { return "random_affine"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; +private: + /** Range in degrees to rotate. */ + float m_rotate_min, m_rotate_max; + /** Fraction of height/width to translate. */ + float m_translate_h, m_translate_w; + /** Range for fraction to scale by. */ + float m_scale_min, m_scale_max; + /** Range for degrees to shear. 
*/ + float m_shear_min, m_shear_max; +}; + +std::unique_ptr +build_random_affine_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_RANDOM_AFFINED_CENTER_CROP_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/random_crop.hpp b/include/lbann/transforms/vision/random_crop.hpp new file mode 100644 index 00000000000..dce14b98111 --- /dev/null +++ b/include/lbann/transforms/vision/random_crop.hpp @@ -0,0 +1,59 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_RANDOM_CROP_HPP_INCLUDED +#define LBANN_TRANSFORMS_RANDOM_CROP_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** Crop an image at a random location. */ +class random_crop : public transform { +public: + /** Crop to an h x w image. */ + random_crop(size_t h, size_t w) : transform(), m_h(h), m_w(w) {} + + transform* copy() const override { return new random_crop(*this); } + + std::string get_type() const override { return "random_crop"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; +private: + /** Height and width of the crop. */ + size_t m_h, m_w; +}; + +std::unique_ptr +build_random_crop_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_RANDOM_CROP_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/random_resized_crop.hpp b/include/lbann/transforms/vision/random_resized_crop.hpp new file mode 100644 index 00000000000..8f957106303 --- /dev/null +++ b/include/lbann/transforms/vision/random_resized_crop.hpp @@ -0,0 +1,80 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_RANDOM_RESIZED_CROP_HPP_INCLUDED +#define LBANN_TRANSFORMS_RANDOM_RESIZED_CROP_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** + * Extract a crop of random size and aspect ratio, then crop to a size. + * This is commonly used for Inception-style networks and some other + * image classification networks. + */ +class random_resized_crop : public transform { +public: + /** + * Crop to a random size and aspect ratio, then resize to h x w. + * The random crop has area in [scale_min, scale_max] of the original image + * area, and aspect ratio in [ar_min, ar_max] of the original. This random + * crop is then resized to be h x w. + * These default to (0.08, 1.0) and (3/4, 4/3), respectively, which are the + * standard. + */ + random_resized_crop(size_t h, size_t w, + float scale_min=0.08, float scale_max=1.0, + float ar_min=0.75, float ar_max=4.0f/3.0f) : + transform(), + m_h(h), m_w(w), + m_scale_min(scale_min), m_scale_max(scale_max), + m_ar_min(ar_min), m_ar_max(ar_max) {} + + transform* copy() const override { return new random_resized_crop(*this); } + + std::string get_type() const override { return "random_resized_crop"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; +private: + /** Height and width of the final crop. */ + size_t m_h, m_w; + /** Range for the area of the random crop. */ + float m_scale_min, m_scale_max; + /** Range for the aspect ratio of the random crop. */ + float m_ar_min, m_ar_max; +}; + +std::unique_ptr +build_random_resized_crop_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_RANDOM_RESIZED_CROP_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/random_resized_crop_with_fixed_aspect_ratio.hpp b/include/lbann/transforms/vision/random_resized_crop_with_fixed_aspect_ratio.hpp new file mode 100644 index 00000000000..8290254aa82 --- /dev/null +++ b/include/lbann/transforms/vision/random_resized_crop_with_fixed_aspect_ratio.hpp @@ -0,0 +1,68 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
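random_resized_crop above samples a crop whose area is a random fraction (scale_min to scale_max) of the source image and whose aspect ratio lies in [ar_min, ar_max], then resizes that crop to h x w. A hypothetical sketch of how one candidate crop size could be derived from those samples; the shipped transform uses LBANN's fast I/O RNG rather than <random>, and how it handles candidates that do not fit inside the image is not shown in this header:

    #include <cmath>
    #include <cstddef>
    #include <random>

    void sample_crop(std::size_t img_h, std::size_t img_w,
                     float scale_min, float scale_max,
                     float ar_min, float ar_max,
                     std::mt19937& gen,
                     std::size_t& crop_h, std::size_t& crop_w) {
      std::uniform_real_distribution<float> scale_dist(scale_min, scale_max);
      std::uniform_real_distribution<float> ar_dist(ar_min, ar_max);
      const float area = scale_dist(gen) * static_cast<float>(img_h * img_w);
      const float ar = ar_dist(gen);  // aspect ratio = width / height
      crop_w = static_cast<std::size_t>(std::round(std::sqrt(area * ar)));
      crop_h = static_cast<std::size_t>(std::round(std::sqrt(area / ar)));
    }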
You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_RANDOM_RESIZED_CROP_WITH_FIXED_ASPECT_RATIO_HPP_INCLUDED +#define LBANN_TRANSFORMS_RANDOM_RESIZED_CROP_WITH_FIXED_ASPECT_RATIO_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** Resize an image then extract a random crop. */ +class random_resized_crop_with_fixed_aspect_ratio : public transform { +public: + /** Resize to h x w, then extract a random crop_h x crop_w crop. */ + random_resized_crop_with_fixed_aspect_ratio( + size_t h, size_t w, size_t crop_h, size_t crop_w) : + transform(), m_h(h), m_w(w), m_crop_h(crop_h), m_crop_w(crop_w) {} + + transform* copy() const override { + return new random_resized_crop_with_fixed_aspect_ratio(*this); + } + + std::string get_type() const override { + return "random_resized_crop_with_fixed_aspect_ratio"; + } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; +private: + /** Height and width of the resized image. */ + size_t m_h, m_w; + /** Height and width of the crop. */ + size_t m_crop_h, m_crop_w; +}; + +std::unique_ptr +build_random_resized_crop_with_fixed_aspect_ratio_transform_from_pbuf( + google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_RANDOM_RESIZED_CROP_WITH_FIXED_ASPECT_RATIO_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/resize.hpp b/include/lbann/transforms/vision/resize.hpp new file mode 100644 index 00000000000..668b925c9b9 --- /dev/null +++ b/include/lbann/transforms/vision/resize.hpp @@ -0,0 +1,59 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_RESIZE_HPP_INCLUDED +#define LBANN_TRANSFORMS_RESIZE_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** Resize an image. 
*/ +class resize : public transform { +public: + /** Resize to h x w. */ + resize(size_t h, size_t w) : transform(), m_h(h), m_w(w) {} + + transform* copy() const override { return new resize(*this); } + + std::string get_type() const override { return "resize"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; +private: + /** Height and width of the resized image. */ + size_t m_h, m_w; +}; + +std::unique_ptr +build_resize_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_RESIZE_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/resized_center_crop.hpp b/include/lbann/transforms/vision/resized_center_crop.hpp new file mode 100644 index 00000000000..0ccb0ef93e6 --- /dev/null +++ b/include/lbann/transforms/vision/resized_center_crop.hpp @@ -0,0 +1,62 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_RESIZED_CENTER_CROP_HPP_INCLUDED +#define LBANN_TRANSFORMS_RESIZED_CENTER_CROP_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** Resize an image and then crop its center. */ +class resized_center_crop : public transform { +public: + /** Resize to h x w, then extract a crop_h x crop_w crop from the center. */ + resized_center_crop(size_t h, size_t w, size_t crop_h, size_t crop_w) : + transform(), m_h(h), m_w(w), m_crop_h(crop_h), m_crop_w(crop_w) {} + + transform* copy() const override { return new resized_center_crop(*this); } + + std::string get_type() const override { return "resized_center_crop"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; +private: + /** Height and width of the resized image. */ + size_t m_h, m_w; + /** Height and width of the crop. 
*/ + size_t m_crop_h, m_crop_w; +}; + +std::unique_ptr +build_resized_center_crop_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_RESIZED_CENTER_CROP_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/to_lbann_layout.hpp b/include/lbann/transforms/vision/to_lbann_layout.hpp new file mode 100644 index 00000000000..5cbb81f699a --- /dev/null +++ b/include/lbann/transforms/vision/to_lbann_layout.hpp @@ -0,0 +1,62 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_TO_LBANN_LAYOUT_HPP_INCLUDED +#define LBANN_TRANSFORMS_TO_LBANN_LAYOUT_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** + * Convert data to LBANN's native data layout. + * Currently only supports converting from OpenCV layouts. + * This will also rescale data from [0, 255] to [0, 1]. + */ +class to_lbann_layout : public transform { +public: + transform* copy() const override { return new to_lbann_layout(*this); } + + std::string get_type() const override { return "to_lbann_layout"; } + + bool supports_non_inplace() const override { return true; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; + + void apply(utils::type_erased_matrix& data, CPUMat& out, + std::vector& dims) override; +}; + +std::unique_ptr +build_to_lbann_layout_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_TO_LBANN_LAYOUT_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/vertical_flip.hpp b/include/lbann/transforms/vision/vertical_flip.hpp new file mode 100644 index 00000000000..712547c733a --- /dev/null +++ b/include/lbann/transforms/vision/vertical_flip.hpp @@ -0,0 +1,60 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. 
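resize, resized_center_crop, and random_resized_crop_with_fixed_aspect_ratio cover the usual resize-then-crop combinations so a pipeline does not have to chain separate resize and crop transforms. A sketch of an evaluation-style pipeline built from them, assuming float statistics for normalize_to_lbann_layout (its element type is elided above) and using the well-known ImageNet means and standard deviations purely as example values:

    #include "lbann/transforms/transform_pipeline.hpp"
    #include "lbann/transforms/vision/resized_center_crop.hpp"
    #include "lbann/transforms/vision/normalize_to_lbann_layout.hpp"

    #include <memory>
    #include <vector>

    lbann::transform::transform_pipeline make_eval_pipeline() {
      lbann::transform::transform_pipeline p;
      // Resize to 256x256, then crop the central 224x224 region.
      p.add_transform(
        std::make_unique<lbann::transform::resized_center_crop>(256, 256, 224, 224));
      // Scale to [0, 1], normalize per channel, and convert to LBANN's layout.
      p.add_transform(
        std::make_unique<lbann::transform::normalize_to_lbann_layout>(
          std::vector<float>{0.485f, 0.456f, 0.406f},
          std::vector<float>{0.229f, 0.224f, 0.225f}));
      return p;
    }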
+// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_VERTICAL_FLIP_HPP_INCLUDED +#define LBANN_TRANSFORMS_VERTICAL_FLIP_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** Vertically flip image data with given probability. */ +class vertical_flip : public transform { +public: + /** Flip image with probability p. */ + vertical_flip(float p) : transform(), m_p(p) {} + + transform* copy() const override { return new vertical_flip(*this); } + + std::string get_type() const override { return "vertical_flip"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; + +private: + /** Probability that that the image is flipped. */ + float m_p; +}; + +std::unique_ptr +build_vertical_flip_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_VERTICAL_FLIP_HPP_INCLUDED diff --git a/include/lbann/utils/CMakeLists.txt b/include/lbann/utils/CMakeLists.txt index a07932b662f..d5b680771a3 100644 --- a/include/lbann/utils/CMakeLists.txt +++ b/include/lbann/utils/CMakeLists.txt @@ -1,6 +1,7 @@ # Add the headers for this directory set_full_path(THIS_DIR_HEADERS any.hpp + argument_parser.hpp compiler_control.hpp cublas.hpp cuda.hpp @@ -8,27 +9,50 @@ set_full_path(THIS_DIR_HEADERS dataset.hpp description.hpp entrywise_operator.hpp + enum_iterator.hpp + environment_variable.hpp + eti_macros.hpp exception.hpp factory.hpp factory_error_policies.hpp file_utils.hpp glob.hpp + hydrogen_utils.hpp im2col.hpp + image.hpp jag_utils.hpp lbann_library.hpp mild_exception.hpp number_theory.hpp + nvshmem.hpp omp_diagnostics.hpp + opencv.hpp options.hpp + nvshmem.hpp profiling.hpp prototext.hpp + python.hpp random.hpp + random_number_generators.hpp + serialization.hpp statistics.hpp summary.hpp + summary_impl.hpp timer.hpp + trainer_file_utils.hpp type_erased_matrix.hpp + typename.hpp ) +if (LBANN_HAS_HALF) + list(APPEND THIS_DIR_HEADERS + ${CMAKE_CURRENT_SOURCE_DIR}/serialization.hpp) +endif (LBANN_HAS_HALF) + +if (LBANN_HAS_DISTCONV) + list(APPEND THIS_DIR_HEADERS "${CMAKE_CURRENT_SOURCE_DIR}/distconv.hpp") +endif () + # Add the subdirectories add_subdirectory(threads) add_subdirectory(impl) diff --git a/include/lbann/utils/any.hpp b/include/lbann/utils/any.hpp index 6b55e7caf8b..ae95c6dee47 100644 --- a/include/lbann/utils/any.hpp +++ b/include/lbann/utils/any.hpp @@ -21,7 +21,8 @@ namespace lbann namespace utils { -#ifdef LBANN_HAS_STD_ANY +// Note (tym 4/8/20): CMake doesn't support NVCC with C++17 +#if defined(LBANN_HAS_STD_ANY) && !defined(__CUDACC__) // This case is simple symbol injection; don't feel great about this, // but it's not my fault they couldn't get this into C++11... 
@@ -64,7 +65,7 @@ class any ///@{ /** @brief Default construct an empty "any" */ - any() noexcept = default; + any() noexcept {} /** @brief Construct an object holding a T */ template diff --git a/include/lbann/utils/argument_parser.hpp b/include/lbann/utils/argument_parser.hpp new file mode 100644 index 00000000000..f213a13f690 --- /dev/null +++ b/include/lbann/utils/argument_parser.hpp @@ -0,0 +1,790 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_UTILS_ARGUMENT_PARSER_HPP_INCLUDED +#define LBANN_UTILS_ARGUMENT_PARSER_HPP_INCLUDED + +#include "lbann/utils/any.hpp" +#include "lbann/utils/environment_variable.hpp" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace lbann +{ +namespace utils +{ + +/** @class parse_error + * @brief std::exception subclass that is thrown if the parser + * can not parse the arguments. + */ +struct parse_error : std::runtime_error +{ + /** @brief Construct the exception with the string to be + * return by what() + */ + template + parse_error(T&& what_arg) + : std::runtime_error{std::forward(what_arg)} {} +};// parse_error + +/** @class strict_parsing + * + * Allows any valid subset of parameters. This will throw an + * exception for any error raised by the underlying parser. + */ +struct strict_parsing +{ + void handle_error(clara::detail::InternalParseResult result, + clara::Parser& parser, + std::vector& argv); +};// struct strict_parsing + +/** @class allow_extra_parameters + * + * Ignores "unknown token" errors raised by the parser and attempts + * to proceed until all tokens are processed or another error is + * detected. + */ +struct allow_extra_parameters +{ + void handle_error(clara::detail::InternalParseResult result, + clara::Parser& parser, + std::vector& argv); +};// struct allow_extra_parameters + +/** @class argument_parser + * @brief Basic argument parsing with automatic help messages. + * + * @section arg_parser_params Supported parameter types + * + * The argument parser supports 3 types of command line parameters: + * flags, options, and arguments. + * + * @subsection arg_parser_flags Flags + * + * Flags default to "false" and toggle to "true" when they are given + * on the command line. It is an error to provide a value to a flag + * on the command line (e.g., "-flag 0"). 
If a flag called "-v" is + * tied to a variable called `verbose`, `verbose` will have default + * value `false`. Passing "-v" on the command line, `a.out -v`, will + * result in `verbose` having post-parse value `true`. + * + * @subsection arg_parser_options Options + * + * Options represent key-value pairs. They must take only a single + * value (e.g. `a.out -key value`). It is an error to omit a value + * for a parameter of option type (e.g., `a.out -key`). Options are + * strongly typed to match their default values. The string passed on + * the command line must be convertible to the type of the default + * value provided by the developer programmatically. + * + * @subsection arg_parser_arguments Arguments + * + * Arguments (or "positional arguments") do not name a key on the + * command line and are implicitly keyed by their index in the + * argument list. A corollary to this is that required arguments must + * appear before optional arguments. Arguments with each category + * ("required" and "optional") are keyed in the order in which they + * are added. + * + * On command line, "optional" arguments are ordered after the + * "required" arguments, in the order in which they are added. For + * example, adding an (optional) argument called "A", then adding + * a required argument called "B", then adding an (optioinal) + * argument called "C" will require that these arguments be passed + * as `a.out B A C`. Since "A" and "C" are optional, it is also + * valid to pass `a.out B` or `a.out B A`. It is undefined + * behavior to pass `a.out B C`. + * + * Erroneously passing `a.out B C` might be accepted by the parser + * if "A" and "C" have the same (or sufficiently compatible) + * types, but the output will not be as unexpected (the variable + * bound to "A" will have the value expected in "C", and the + * variable bound to "C" will have its default value). If "A" and + * "C" are not compatible types, an exception will be thrown. In + * the first case, the parser cannot read your mind to know if you + * passed things in the right order; it is the application + * developer's responsibility to ensure that all arguments have + * been added before the help message is printed, and it is the + * user's responsibility to consult the help message for the + * runtime ordering of arguments. + * + * @section arg_parser_finalize Finalization + * + * To accomodate the presence of required arguments with the + * maintenance-intensive practice of adding arguments willy-nilly + * (because I don't believe a PR without said terrifying + * capability would ever make it through), parsing of the + * arguments can be done two ways: with or without finalization. + * + * If there are no required arguments registered in the parser, + * these should be equivalent. If there are required arguments, + * they must all have been registered with the parser and seen in + * the arguments given to the parse functions before + * finalization. Semantically, the parser must be finalized before + * attempting to use any of the required arguments. + */ +template +class argument_parser : ErrorHandler +{ +public: + + /** @name Public types */ + ///@{ + + /** @brief A proxy class representing the current value associated + * with an option. + * + * This class is best manipulated generically, through `auto` + * variables. + * + * @tparam T The type of the held object. 
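To make the flag semantics described above concrete, here is a sketch of tying "-v" to a boolean. It assumes the strict_parsing policy declared earlier and a parse(argc, argv)-style entry point; the parse functions are referenced in the discussion above but their declarations fall outside the hunk shown here, so treat that call as an assumption:

    #include "lbann/utils/argument_parser.hpp"

    int main(int argc, char* argv[]) {
      lbann::utils::argument_parser<lbann::utils::strict_parsing> parser;
      // Defaults to false; "a.out -v" or "a.out --verbose" flips it to true.
      auto verbose = parser.add_flag("verbose", {"-v", "--verbose"},
                                     "Print extra progress information.");
      parser.parse(argc, argv);  // assumed entry point; see the parse/finalize notes above
      if (verbose) {
        // readonly_reference converts to bool const&, so it reads naturally here.
      }
      return 0;
    }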
+ */ + template + class readonly_reference + { + public: + readonly_reference(T& val) noexcept : ref_(val) {} + T const& get() const noexcept { return ref_; } + operator T const& () const noexcept { return this->get(); } + + template + bool operator==(S const& y) const noexcept + { return this->get() == y; } + + private: + T& ref_; + };// class readonly_reference + + /** @class parse_error + * @brief std::exception subclass that is thrown if the parser + * can not parse the arguments. + */ + struct parse_error : std::runtime_error + { + /** @brief Construct the exception with the string to be + * return by what() + */ + template + parse_error(T&& what_arg) + : std::runtime_error{std::forward(what_arg)} {} + }; + + /** @class missing_required_arguments + * @brief std::exception subclass that is thrown if a required + * argument is not found. + */ + struct missing_required_arguments : std::runtime_error + { + /** @brief Construct the exception with a list of the missing + * argument names. + * + * @param[in] missing_args A container that holds the names + * of the missing arguments. + */ + template + missing_required_arguments(Container const& missing_args) + : std::runtime_error{build_what_string_(missing_args)} + {} + + private: + template + std::string build_what_string_(Container const& missing_args) + { + std::ostringstream oss; + oss << "The following required arguments are missing: {"; + for (auto const& x : missing_args) + oss << " \"" << x << "\""; + oss << " }"; + return oss.str(); + } + }; + + ///@} + +public: + + /** @name Constructors */ + ///@{ + + /** @brief Create the parser */ + argument_parser(); + + ///@} + /** @name Adding options and arguments */ + ///@{ + + /** @brief Add a flag (i.e. a boolean parameter that is "true" if + * given and "false" if not given). + * + * The value of a flag defaults to `false`. If, for some strange + * reason, users should be forced to type the boolean value on + * the command line, e.g., "my_exe -b 1", use add_option() + * instead. If a flag with default value `true` is desired, + * invert the logic and use this instead. + * + * @param[in] name The name to be used to refer to the argument. + * @param[in] cli_flags The valid command line flags to trigger + * this flag to `true`. At least one must be given. + * @param[in] description A brief description of the argument, + * used for the help message. + * + * @return A read-only reference to the value pointed to by this + * flag. + */ + readonly_reference + add_flag(std::string const& name, + std::initializer_list cli_flags, + std::string const& description); + + /** @brief Add a flag with environment variable override. + * + * The value of a flag defaults to `false`. The flag may be set to + * `true` by passing the flag on the command line. Alternatively, + * it may be set to `true` if the environment variable `env` is + * defined and has a value that converts to `true`. + * + * @tparam AccessPolicy The access method for the environment + * variable. (Deduced.) + * + * @param[in] name The name to be used to refer to the argument. + * @param[in] cli_flags The valid command line flags to trigger + * this flag to `true`. At least one must be given. + * @param[in] env The environment variable to prefer over the + * default parameter value. + * @param[in] description A brief description of the argument, + * used for the help message. + * + * @return A read-only reference to the value pointed to by this + * flag. 
+ */ + template + readonly_reference + add_flag(std::string const& name, + std::initializer_list cli_flags, + EnvVariable env, + std::string const& description) + { + if (env.exists() && env.template value()) + return add_flag_impl_(name, std::move(cli_flags), description, true); + else + return add_flag(name, std::move(cli_flags), description); + } + + /** @brief Add an additional named option. + * + * Currently, named options are all optional. This could be + * expanded if needed. + * + * @tparam T The type associated with the option. Deduced if a + * default value is given. If the default value is not + * given, the template parameter must be named explicitly + * and the default value will be default-constructed. + * + * @param[in] name The name to be used to refer to the argument. + * @param[in] cli_flags The valid command line flags to identify + * this option and its value. At least one must be + * given. + * @param[in] description A brief description of the argument, + * used for the help message. + * @param[in] default_value The default value to be returned if + * the option is not passed to the command line. + * + * @return A read-only reference to the value pointed to by this + * option. + */ + template + readonly_reference + add_option(std::string const& name, + std::initializer_list cli_flags, + std::string const& description, + T default_value = T()); + + /** @brief Add an additional named option. + * + * Currently, named options are all optional. This could be + * expanded if needed. + * + * @tparam T The type associated with the option. Deduced if a + * default value is given. If the default value is not + * given, the template parameter must be named explicitly + * and the default value will be default-constructed. + * @tparam AccessPolicy The access method for the environment + * variable. (Deduced.) + * + * @param[in] name The name to be used to refer to the argument. + * @param[in] cli_flags The valid command line flags to identify + * this option and its value. At least one must be + * given. + * @param[in] env The environment variable to prefer over the + * default parameter value. + * @param[in] description A brief description of the argument, + * used for the help message. + * @param[in] default_value The default value to be returned if + * the option is not passed to the command line. + * + * @return A read-only reference to the value pointed to by this + * option. + */ + template + readonly_reference + add_option(std::string const& name, + std::initializer_list cli_flags, + EnvVariable env, + std::string const& description, + T default_value = T()) + { + if (env.exists()) + return add_option(name, std::move(cli_flags), description, + env.template value()); + else + return add_option(name, std::move(cli_flags), description, + std::move(default_value)); + } + + /** @brief Add an additional named option; overloaded for "char + * const*" parameters. + * + * The value will be stored as an `std::string`. Its value must + * be extracted using `get(name)`. + * + * @param[in] name The name to be used to refer to the argument. + * @param[in] cli_flags The valid command line flags to trigger + * this flag to `true`. At least one must be given. + * @param[in] description A brief description of the argument, + * used for the help message. + * @param[in] default_value The default value to be returned if + * the option is not passed to the command line. + * + * @return A read-only reference to the value pointed to by this + * option. 
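The environment-variable overloads above prefer a defined environment variable over the programmatic default, while an explicit command-line value still takes precedence because the option itself is unchanged. The sketch below is hedged: it assumes an accessor type spelled utils::ENV from lbann/utils/environment_variable.hpp (that header is not shown in this excerpt), and the option and variable names are illustrative.

    // Assumption: utils::ENV provides the exists()/value<T>() interface the
    // EnvVariable template parameter requires; names here are hypothetical.
    auto io_threads = parser.add_option(
      "io threads", {"--io-threads"},
      lbann::utils::ENV("LBANN_IO_THREADS"),
      "Number of background I/O threads", 4);
    // Default resolution order: --io-threads on the command line, then
    // LBANN_IO_THREADS from the environment, then the literal 4.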
+ */ + readonly_reference + add_option(std::string const& name, + std::initializer_list cli_flags, + std::string const& description, + char const* default_value) + { + return add_option(name, std::move(cli_flags), description, + std::string(default_value)); + } + + /** @brief Add an additional named option; overloaded for "char + * const*" parameters. + * + * The value will be stored as an `std::string`. Its value must + * be extracted using `get(name)`. + * + * @param[in] name The name to be used to refer to the argument. + * @param[in] cli_flags The valid command line flags to trigger + * this flag to `true`. At least one must be given. + * @param[in] env The environment variable to prefer over the + * default parameter value. + * @param[in] description A brief description of the argument, + * used for the help message. + * @param[in] default_value The default value to be returned if + * the option is not passed to the command line. + * + * @return A read-only reference to the value pointed to by this + * option. + */ + template + readonly_reference + add_option(std::string const& name, + std::initializer_list cli_flags, + EnvVariable env, + std::string const& description, + char const* default_value) + { + return add_option(name, cli_flags, std::move(env), + description, std::string(default_value)); + } + + /** @brief Add an optional positional argument. + * + * These are essentially defaulted positional arguments. They must + * be given on the command line in the order in which they are + * added to the parser. If the arguments have all been added by the + * time the help message is produced, the help message will display + * the correct ordering. + * + * @tparam T The type to which the argument maps. + * + * @param[in] name The name to be used to refer to the argument. + * @param[in] description A brief description of the argument, + * used for the help message. + * @param[in] default_value The value to use for this argument if + * not detected in the formal argument list. + * + * @return A read-only reference to the value pointed to by this + * argument. + */ + template + readonly_reference add_argument( + std::string const& name, + std::string const& description, + T default_value = T()); + + /** @brief Add a positional argument; char const* overload + * + * The data is stored in an std::string object internally and + * must be accessed using `get(name)`. + * + * @param[in] name The name to be used to refer to the argument. + * @param[in] description A brief description of the argument, + * used for the help message. + * @param[in] default_value The value to use for this argument if + * not detected in the formal argument list. + * + * @return A read-only reference to the value pointed to by this + * argument. + */ + readonly_reference add_argument( + std::string const& name, + std::string const& description, + char const* default_value) + { + return add_argument( + name, description, std::string(default_value)); + } + + /** @brief Add a "required" positional argument. + * + * @tparam T The type to which the argument maps. + * + * @param[in] name The name to be used to refer to the argument. + * @param[in] description A brief description of the argument, + * used for the help message. + * + * @return A read-only reference to the value pointed to by this + * argument. 
+ */ + template + readonly_reference add_required_argument( + std::string const& name, + std::string const& description); + + ///@} + /** @name Command-line-like parsing */ + ///@{ + + /** @brief Parse the command line arguments and finalize the + * arguments. + * + * This is equivalent to calling parse_no_finalize() followed + * immediately by finalize(). + * + * @param[in] argc The number of arguments + * @param[in] argv The list of arguments + * + * @throws parse_error if an internal parsing error is detected. + */ + void parse(int argc, char const* const argv[]); + + /** @brief Parse the command line arguments but do not finalize + * the parser. + * + * This parses command-line-like arguments but does no checks for + * required arguments. Users should call finalize() before + * attempting to use the values associated with any required + * arguments. + * + * @param[in] argc The number of arguments + * @param[in] argv The list of arguments + * + * @throws parse_error if an internal parsing error is detected. + */ + void parse_no_finalize(int argc, char const* const argv[]); + + /** @brief Assert that all required components are set properly. + * + + * This should be called sometime after parse_no_finalize() and + * before using the values. This is implicitly called by parse(). + * + * @throws missing_required_arguments If a missing argument is + * detected. + */ + void finalize() const; + + ///@} + /** @name Queries */ + ///@{ + + /** @brief Get the executable name. + * + * This is only meaningful after calling either parse() or + * parse_no_finalize(). + * + * @return The name of the executable. + */ + std::string const& get_exe_name() const noexcept; + + /** @brief Test if an option exists in the parser. + * + * This only tests whether the argument or option is known to the + * parser, not whether it has been set or modified by the parser. + * + * @param[in] option_name The name of the option/argument. + */ + bool option_is_defined(std::string const& option_name) const; + + /** @brief Test if help has been requested. */ + bool help_requested() const; + + /** @brief Get the requested value from the argument list. + * @tparam T The type of the requested parameter. + * @param option_name The name given to the option or argument. + * @return A const-reference to the held value. + */ + template + T const& get(std::string const& option_name) const; + + ///@} + /** @name Output */ + ///@{ + + /** @brief Print a help string to a stream. + * @param[in] stream The ostream to print the help message to. + */ + void print_help(std::ostream& stream) const; + + ///@} + +private: + + /** @brief Implementation of add_flag */ + readonly_reference + add_flag_impl_(std::string const& name, + std::initializer_list cli_flags, + std::string const& description, + bool default_value); + +private: + /** @brief Dictionary of arguments to their values */ + std::unordered_map params_; + /** @brief Patch around in-progress clara limitation */ + std::unordered_set required_; + /** @brief The underlying clara object */ + clara::Parser parser_; + /** @brief The name of the executable. 
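Because finalize() only checks for unseen required arguments, and that check is skipped when help has been requested, a two-step parse lets an application print the help message even when a required argument is missing. A minimal fragment (inside main(), reusing the hypothetical parser from the earlier sketch):

    parser.parse_no_finalize(argc, argv);
    if (parser.help_requested()) {
      parser.print_help(std::cout);   // needs <iostream>
      return 0;
    }
    parser.finalize();  // throws missing_required_arguments if "model" was unseen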
*/ + std::string exe_name_ = ""; + +}; + +template +inline bool +argument_parser::option_is_defined(std::string const& option_name) const +{ + return params_.count(option_name); +} + +template +template +inline T const& argument_parser::get(std::string const& option_name) const +{ + return utils::any_cast(params_.at(option_name)); +} + +template +template +inline auto argument_parser::add_option( + std::string const& name, + std::initializer_list cli_flags, + std::string const& description, + T default_value) + -> readonly_reference +{ + params_[name] = std::move(default_value); + auto& param_ref = any_cast(params_[name]); + clara::Opt option(param_ref, name); + for (auto const& f : cli_flags) + option[f]; + parser_ |= option(description).optional(); + return param_ref; +} + +template +template +inline auto argument_parser::add_argument( + std::string const& name, + std::string const& description, + T default_value) + -> readonly_reference +{ + params_[name] = std::move(default_value); + auto& param_ref = utils::any_cast(params_[name]); + parser_ |= clara::Arg + (param_ref, name) + (description).optional(); + return param_ref; +} + +template +template +inline auto argument_parser::add_required_argument( + std::string const& name, + std::string const& description) + -> readonly_reference +{ + // Add the reference to bind to + params_[name] = T{}; + auto& param_any = params_[name]; + auto& param_ref = any_cast(param_any); + + required_.insert(name); + + // Make sure the required arguments are all grouped together. + auto iter = parser_.m_args.cbegin(), invalid = parser_.m_args.cend(); + while (iter != invalid && !iter->isOptional()) + ++iter; + + // Create the argument + auto ret = parser_.m_args.emplace( + iter, + [name,¶m_ref,this](std::string const& value) + { + auto result = clara::detail::convertInto(value, param_ref); + if (result) + required_.erase(name); + return result; + }, + name); + ret->operator() (description).required(); + return param_ref; +} + +template +argument_parser::argument_parser() +{ + params_["print help"] = false; + parser_ |= clara::ExeName(exe_name_); + parser_ |= clara::Help(utils::any_cast(params_["print help"])); + + // Work around a bug in Clara logic + parser_.m_exeName.set(exe_name_); +} + +template +void argument_parser::parse(int argc, char const* const argv[]) +{ + parse_no_finalize(argc, argv); + finalize(); +} + +template +void argument_parser::parse_no_finalize(int argc, char const* const argv[]) +{ + std::vector newargv(argv, argv+argc); + auto parse_result = + parser_.parse(clara::Args(newargv.size(), newargv.data())); + + if (!parse_result) + this->handle_error(parse_result, parser_, newargv); +} + +template +void argument_parser::finalize() const +{ + if (!help_requested() && required_.size()) + throw missing_required_arguments(required_); +} + +template +auto argument_parser::add_flag( + std::string const& name, + std::initializer_list cli_flags, + std::string const& description) + -> readonly_reference +{ + return add_flag_impl_(name, std::move(cli_flags), description, false); +} + +template +std::string const& argument_parser::get_exe_name() const noexcept +{ + return exe_name_; +} + +template +bool argument_parser::help_requested() const +{ + return utils::any_cast(params_.at("print help")); +} + +template +void argument_parser::print_help(std::ostream& out) const +{ + out << parser_ << std::endl; +} + +template +auto argument_parser::add_flag_impl_( + std::string const& name, + std::initializer_list cli_flags, + std::string const& 
description, + bool default_value) + -> readonly_reference +{ + params_[name] = default_value; + auto& param_ref = any_cast(params_[name]); + clara::Opt option(param_ref); + for (auto const& f : cli_flags) + option[f]; + parser_ |= option(description).optional(); + return param_ref; +} + +}// namespace utils + +using default_arg_parser_type = + utils::argument_parser; + +default_arg_parser_type& global_argument_parser(); + +}// namespace lbann + +/** @brief Write the parser's help string to the given @c ostream */ +template +std::ostream& operator<<( + std::ostream& os, + lbann::utils::argument_parser const& parser) +{ + parser.print_help(os); + return os; +} + +#endif /* LBANN_UTILS_ARGUMENT_PARSER_HPP_INCLUDED */ diff --git a/include/lbann/utils/beta.hpp b/include/lbann/utils/beta.hpp new file mode 100644 index 00000000000..eef834e2466 --- /dev/null +++ b/include/lbann/utils/beta.hpp @@ -0,0 +1,233 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_UTILS_BETA_HPP +#define LBANN_UTILS_BETA_HPP + +#include +#include +#include +#include + +#include "lbann/utils/random.hpp" +#include "lbann/utils/exception.hpp" + +namespace lbann { + +/** + * Produces random floating point values drawn from a Beta distribution with + * parameters a > 0 and b > 0. + * + * See: + * + * https://en.wikipedia.org/wiki/Beta_distribution + * + * for more details. 
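A short usage sketch for the distribution defined below; the engine, seed, and parameter values are illustrative only.

    #include <random>
    #include "lbann/utils/beta.hpp"

    float sample_beta()
    {
      // Any standard uniform random bit generator should work here.
      static thread_local std::mt19937 gen(13);
      lbann::beta_distribution<float> beta(0.5f, 2.0f);
      return beta(gen);  // in [0, 1]; takes the Gamma-based path since b > 1
    }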
+ */ +template +class beta_distribution { +public: + using result_type = RealType; + + class param_type { + public: + using distribution_type = beta_distribution; + + explicit param_type(RealType param_a, RealType param_b) : + m_a(param_a), m_b(param_b) { + if (param_a <= RealType(0) || param_b <= RealType(0)) { + LBANN_ERROR("Beta distribution parameters must be positive"); + } + } + + constexpr RealType a() const { return m_a; } + constexpr RealType b() const { return m_b; } + + bool operator==(const param_type& other) const { + return m_a == other.m_a && m_b == other.m_b; + } + bool operator!=(const param_type& other) const { + return m_a != other.m_a || m_b != other.m_b; + } + private: + RealType m_a, m_b; + }; + + explicit beta_distribution(RealType a, RealType b) : + m_params(a, b), m_gamma_a(a), m_gamma_b(b) {} + explicit beta_distribution(const param_type& p) : + m_params(p), m_gamma_a(p.a()), m_gamma_b(p.b()) {} + + result_type a() const { return m_params.a(); } + result_type b() const { return m_params.b(); } + + void reset() {} + + param_type param() const { return m_params; } + void param(const param_type& p) { + m_params = p; + m_gamma_a = gamma_dist(p.a()); + m_gamma_b = gamma_dist(p.b()); + } + + template + result_type operator()(Generator& g) { + return generate(g); + } + template + result_type operator()(Generator& g, const param_type& p) { + return generate(g, p); + } + + result_type min() const { return result_type(0); } + result_type max() const { return result_type(1); } + + bool operator==(const beta_distribution& other) const { + return param() == other.param(); + } + bool operator!=(const beta_distribution& other) const { + return param() != other.param(); + } + +private: + param_type m_params; + + using gamma_dist = std::gamma_distribution; + gamma_dist m_gamma_a, m_gamma_b; + + // Generator for when we use the distribution's parameters. + template + result_type generate(Generator& g) { + if (a() <= result_type(1) && b() <= result_type(1)) { + return generate_johnk(g, m_params.a(), m_params.b()); + } else { + return generate_gamma(g, m_gamma_a, m_gamma_b); + } + } + // Generator for when we use specified parameters. + template + result_type generate(Generator& g, const param_type& p) { + if (p.a() <= result_type(1) && p.b() <= result_type(1)) { + return generate_johnk(g, p.a(), p.b()); + } else { + gamma_dist gamma_a(p.a()), gamma_b(p.b()); + return generate_gamma(g, gamma_a, gamma_b); + } + } + + /** + * Generate Beta-distributed values using Johnk's algorithm. + * This is a rejection-sampling algorithm that only needs a few + * uniformly random values. + * + * See: + * + * Johnk, H. D. "Erzeugung von betaverteilten und gammaverteilten + * Zufallszahlen." Metrika 8, no. 1 (1964). + * + * For an English-language presentation, see: + * + * Atkinson, A. C. and M. C. Pearce. "The computer generation of beta, + * gamma and normal random variables." Journal of the Royal Statistical + * Society: Series A (General) 139, no. 4 (1976). + * + * This includes fixes for numerical stability when the parameters are small, + * see: + * + * https://github.com/numpy/numpy/issues/5851 + * + * for discussion there; and a catch for the (extremely rare) case of the RNG + * giving us U and V both exactly 0. + * + * Note: There should be an umlaut on the "o" in "Johnk", but blame poor + * unicode support. 
+ */ + template + result_type generate_johnk(Generator& g, result_type a, result_type b) { + while (true) { + const result_type U = random_uniform(g); + const result_type V = random_uniform(g); + const result_type X = std::pow(U, result_type(1) / a); + const result_type Y = std::pow(V, result_type(1) / b); + const result_type XplusY = X + Y; + if (XplusY <= result_type(1.0)) { + if (XplusY > result_type(0)) { + return X / XplusY; + } else if (U != result_type(0) && V != result_type(0)) { + // Work with logs instead if a/b is too small. + result_type logX = std::log(U) / a; + result_type logY = std::log(V) / b; + const result_type log_max = std::max(logX, logY); + logX -= log_max; + logY -= log_max; + return std::exp(logX - std::log(std::exp(logX) + std::exp(logY))); + } + } + } + } + + /** + * Generate Beta-distributed values based on Gamma distributions. + * See: + * https://en.wikipedia.org/wiki/Beta_distribution#Generating_beta-distributed_random_variates + * for details. + */ + template + result_type generate_gamma(Generator& g, gamma_dist& gamma_a, + gamma_dist& gamma_b) { + const result_type Ga = gamma_a(g); + const result_type Gb = gamma_b(g); + return Ga / (Ga + Gb); + } +}; + +template +std::basic_ostream& operator<<(std::basic_ostream& os, + const beta_distribution& d) { + os << "~Beta(" << d.a() << "," << d.b() << ")"; + return os; +} + +template +std::basic_istream& operator>>(std::basic_istream& is, + beta_distribution& d) { + std::string s; + RealType a, b; + if (std::getline(is, s, '(') && s == "~Beta" + && is >> a + && is.get() == ',' + && is >> b + && is.get() == ')') { + d = beta_distribution(a, b); + } else { + is.setstate(std::ios::failbit); + } + return is; +} + +} // namespace lbann + +#endif // LBANN_UTILS_BETA_HPP diff --git a/include/lbann/utils/cloneable.hpp b/include/lbann/utils/cloneable.hpp new file mode 100644 index 00000000000..7e5c825f2f3 --- /dev/null +++ b/include/lbann/utils/cloneable.hpp @@ -0,0 +1,234 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// +#ifndef LBANN_UTILS_CLONEABLE_HPP_INCLUDED +#define LBANN_UTILS_CLONEABLE_HPP_INCLUDED + +#include +#include + +/** @file + * + * This file implements covariant returns via smart pointers for a + * polymorphic @c clone function. The implementation largely follows + * the solution presented by + * the FluentC++ blog. 
Some class/tag names have been updated to + * be clearer, in my opinion. Additionally, a semi-useful predicate + * has been added to aid metaprogramming down the line. + */ + +namespace lbann { + +/** @brief Declare @c Base to be a virtual base. + * + * This metafunction adds @c Base as a virtual base + * class. Constructors of @c Base are added to this class. + * + * @tparam Base The class to be declared as a virtual base. + */ +template +struct AsVirtualBase : virtual Base +{ + using Base::Base; +}; + +/** @brief Declare that @c T has unimplemented virtual functions. + * + * Due to metaprogramming restrictions on CRTP interfaces, we rely on + * the user of these mechanisms to declare when a class has + * unimplemented virtual functions (or "is abstract"). + * + * @tparam T The type that has at least one unimplemented virtual + * function. + */ +template +struct HasAbstractFunction {}; + +/** @brief Alias for HasAbstractFunction. + * + * Good OO practice suggests that non-leaf classes should be abstract + * -- that is, have at least one unimplemented virtual + * function. LBANN fits this paradigm, so this alias is appropriate. +*/ +template +using NonLeafClass = HasAbstractFunction; + +/** @brief Inject polymorphic clone functions into hierarchies. + * + * This class uses CRTP to inject the derived class's clone() + * function directly into the class and uses + * the + * Template Method to virtualize it. + * + * @tparam T The concrete class to be cloned. + * @tparam Base The base class of T. + */ +template +class Cloneable + : public Base... +{ +public: + /** @brief Return an exception-safe, memory-safe copy of this object. */ + std::unique_ptr clone() const { + return std::unique_ptr{static_cast(this->do_clone_())}; + } +private: + /** @brief Implement the covariant raw-pointer-based clone operation. */ + virtual Cloneable* do_clone_() const override { + return new T(static_cast(*this)); + } +};// class Cloneable + +template +class Cloneable + : public Base +{ +public: + /** @brief Return an exception-safe, memory-safe copy of this object. */ + std::unique_ptr clone() const { + return std::unique_ptr{static_cast(this->do_clone_())}; + } +protected: + using Base::Base; +private: + /** @brief Implement the covariant raw-pointer-based clone operation. */ + virtual Cloneable* do_clone_() const override { + return new T(static_cast(*this)); + } +};// class Cloneable + +/** @brief Specialization of Cloneable to handle stand-alone classes. */ +template +class Cloneable +{ +public: + virtual ~Cloneable() = default; + + std::unique_ptr clone() const { + return std::unique_ptr{static_cast(this->do_clone_())}; + } +private: + Cloneable* do_clone_() const { + return new T(static_cast(*this)); + } +};// class Cloneable + +/** @brief Specialization of Cloneable for intermediate classes. + * + * Classes that are neither the top of the hierarchy nor a leaf of + * the class tree should be virtual. An unfortunate consequence of + * the CRTP method is that the target of the CRTP, @c T in this case, + * is not a complete class when this class is instantiated, so + * metaprogramming based on @c T is very restricted. Thus, users must + * tag the target class with HasAbstractFunction. Doing so will + * ensure that the @c do_clone_() function is declared pure virtual. + */ +template +class Cloneable, Base...> + : public Base... 
+{ +public: + std::unique_ptr clone() const { + return std::unique_ptr{static_cast(this->do_clone_())}; + } +private: + virtual Cloneable* do_clone_() const = 0; +}; + +template +class Cloneable, Base> + : public Base +{ +public: + std::unique_ptr clone() const { + return std::unique_ptr{static_cast(this->do_clone_())}; + } +protected: + using Base::Base; +private: + virtual Cloneable* do_clone_() const = 0; +}; + +/** @brief Specialization of Cloneable to handle the top of hierarchies. */ +template +class Cloneable> +{ +public: + virtual ~Cloneable() = default; + + std::unique_ptr clone() const { + return std::unique_ptr{static_cast(this->do_clone_())}; + } +private: + virtual Cloneable* do_clone_() const = 0; +};// class Cloneable + +/** @brief Predicate testing for Cloneable interface. + * + * This predicate determines whether a class supports the Cloneable + * interface. If true, this class will support a smart-pointer-to-T + * return from a @c clone() method. + * + * This predicate type suffers a deficiency that it can be fooled + * rather easily. It is generally not possible to determine from the + * specific Cloneable instantiation used for a given type. Thus, + * alternative strategies must be used. As it stands, any class that + * provides a @c clone() method that returns a @c std::unique_ptr + * will satisfy this predicate. + * + * @tparam T The type being tested. + */ +template +struct IsCloneableT; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS +// The obvious case; I'd be concerned if this were ever called. +template +struct IsCloneableT> : std::true_type {}; + +namespace details { + +struct definitely_not_a_unique_ptr; + +template +auto has_right_clone(T const& x) -> decltype(x.clone()); + +definitely_not_a_unique_ptr has_right_clone(...); + +}// namespace details + +template +struct IsCloneableT + : std::is_same())), + std::unique_ptr> +{}; +#endif // DOXYGEN_SHOULD_SKIP_THIS + +template +constexpr bool IsCloneable_v() { return IsCloneableT::value; }; + +}// namespace lbann +#endif // LBANN_UTILS_CLONEABLE_HPP_INCLUDED diff --git a/include/lbann/utils/commify.hpp b/include/lbann/utils/commify.hpp new file mode 100644 index 00000000000..d5c43ab1956 --- /dev/null +++ b/include/lbann/utils/commify.hpp @@ -0,0 +1,16 @@ +#ifndef LBANN_UTILS_COMMIFY_INCLUDED +#define LBANN_UTILS_COMMIFY_INCLUDED + +#include + +namespace lbann +{ +namespace utils +{ + +/** @brief Inserts commas large integers for pretty-printing */ +std::string commify(size_t n); + +}// namespace utils +}// namespace lbann +#endif // LBANN_UTILS_ANY_HPP_INCLUDED diff --git a/include/lbann/utils/cublas.hpp b/include/lbann/utils/cublas.hpp index 49225ff2336..e206b0e3813 100644 --- a/include/lbann/utils/cublas.hpp +++ b/include/lbann/utils/cublas.hpp @@ -29,6 +29,7 @@ #include "lbann/base.hpp" #include "lbann/utils/cuda.hpp" +#include "lbann/utils/exception.hpp" #ifdef LBANN_HAS_CUDA #include @@ -44,8 +45,9 @@ const cublasStatus_t status_FORCE_CHECK_CUBLAS = (cublas_call); \ if (status_FORCE_CHECK_CUBLAS != CUBLAS_STATUS_SUCCESS) { \ cudaDeviceReset(); \ - LBANN_ERROR(std::string("cuBLAS error: ") \ - + lbann::cublas::get_error_string(status_FORCE_CHECK_CUBLAS)); \ + LBANN_ERROR("cuBLAS error: ", \ + lbann::cublas::get_error_string( \ + status_FORCE_CHECK_CUBLAS)); \ } \ } \ { \ @@ -55,8 +57,8 @@ status_FORCE_CHECK_CUBLAS = cudaGetLastError(); \ if (status_FORCE_CHECK_CUBLAS != cudaSuccess) { \ cudaDeviceReset(); \ - LBANN_ERROR(std::string("CUDA error: ") \ - + cudaGetErrorString(status_FORCE_CHECK_CUBLAS)); \ + 
LBANN_ERROR("CUDA error: ", \ + cudaGetErrorString(status_FORCE_CHECK_CUBLAS)); \ } \ } \ } while (0) @@ -67,20 +69,19 @@ const cublasStatus_t status_FORCE_CHECK_CUBLAS = (cublas_call); \ if (status_FORCE_CHECK_CUBLAS != CUBLAS_STATUS_SUCCESS) { \ cudaDeviceReset(); \ - LBANN_ERROR(std::string("cuBLAS error: ") \ - + lbann::cublas::get_error_string(status_FORCE_CHECK_CUBLAS)); \ + LBANN_ERROR("cuBLAS error: ", \ + lbann::cublas::get_error_string( \ + status_FORCE_CHECK_CUBLAS)); \ } \ } \ } while (0) -#define FORCE_CHECK_CUBLAS_SYNC(cuda_call) \ - do { \ - const cudaError_t cuda_status = cuda_call; \ - if (cuda_status != cudaSuccess) { \ - std::cerr << "CUDA error: " << cudaGetErrorString(cuda_status) << "\n"; \ - std::cerr << "Error at " << __FILE__ << ":" << __LINE__ << "\n"; \ - cudaDeviceReset(); \ - throw lbann::lbann_exception("CUDA error"); \ - } \ +#define FORCE_CHECK_CUBLAS_SYNC(cuda_call) \ + do { \ + const cudaError_t cuda_status = cuda_call; \ + if (cuda_status != cudaSuccess) { \ + cudaDeviceReset(); \ + LBANN_ERROR("CUDA error: ", cudaGetErrorString(cuda_status)); \ + } \ } while (0) #ifdef LBANN_DEBUG #define CHECK_CUBLAS(cublas_call) \ @@ -99,61 +100,88 @@ namespace cublas { const std::string get_error_string(cublasStatus_t status); // BLAS Level-1 functions +template void axpy(cublasHandle_t const& handle, int n, - DataType alpha, - DataType const* x, int incx, - DataType * y, int incy); + TensorDataType alpha, + TensorDataType const* x, int incx, + TensorDataType * y, int incy); +template void dot(cublasHandle_t const& handle, int n, - DataType const* x, int incx, - DataType const* y, int incy, - DataType * result); -DataType dot(cublasHandle_t const& handle, + TensorDataType const* x, int incx, + TensorDataType const* y, int incy, + TensorDataType * result); +template +TensorDataType dot(cublasHandle_t const& handle, int n, - DataType const* x, int incx, - DataType const* y, int incy); + TensorDataType const* x, int incx, + TensorDataType const* y, int incy); +template void nrm2(cublasHandle_t const& handle, int n, - DataType const* x, int incx, - DataType * result); -DataType nrm2(cublasHandle_t const& handle, + TensorDataType const* x, int incx, + TensorDataType * result); +template +TensorDataType nrm2(cublasHandle_t const& handle, int n, - DataType const* x, int incx); + TensorDataType const* x, int incx); +template void scal(cublasHandle_t const& handle, int n, - DataType alpha, - DataType * x, int incx); + TensorDataType alpha, + TensorDataType * x, int incx); // BLAS Level-2 functions +template void gemv(cublasHandle_t const& handle, cublasOperation_t trans, int m, int n, - DataType alpha, - DataType const * A, int lda, - DataType const * x, int incx, - DataType beta, - DataType * y, int iny); + TensorDataType alpha, + TensorDataType const * A, int lda, + TensorDataType const * x, int incx, + TensorDataType beta, + TensorDataType * y, int iny); // BLAS Level-3 functions +template void gemm(cublasHandle_t const& handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, - DataType alpha, - DataType const * A, int lda, - DataType const * B, int ldb, - DataType beta, - DataType * C, int ldc); + TensorDataType alpha, + TensorDataType const * A, int lda, + TensorDataType const * B, int ldb, + TensorDataType beta, + TensorDataType * C, int ldc); // BLAS-like extension +template void geam(cublasHandle_t const& handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, - DataType alpha, - DataType const * A, int lda, - DataType beta, - 
DataType const * B, int ldb, - DataType * C, int ldc); + TensorDataType alpha, + TensorDataType const * A, int lda, + TensorDataType beta, + TensorDataType const * B, int ldb, + TensorDataType * C, int ldc); +template +void gemm_strided_batched(cublasHandle_t const& handle, + cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, + TensorDataType alpha, + TensorDataType const * A, int lda, + long long int strideA, + TensorDataType const * B, int ldb, + long long int strideB, + TensorDataType beta, + TensorDataType * C, int ldc, + long long int strideC, + int batchCount); + +/** @brief Set the default to use tensor core operations, allowing + * FP32->FP16 conversions. + */ +void default_to_tensor_ops(); } // namespace cublas } // namespace lbann diff --git a/include/lbann/utils/cuda.hpp b/include/lbann/utils/cuda.hpp index 87201c0fe8d..d124487df3e 100644 --- a/include/lbann/utils/cuda.hpp +++ b/include/lbann/utils/cuda.hpp @@ -110,6 +110,40 @@ namespace cuda { template __device__ __forceinline__ T atomic_add(T* address, T val); +/** @brief Sum over threads in CUDA block + * + * Every thread in a CUDA block must enter this function. The sum is + * returned on thread 0. + * + * @tparam bdimx x-dimension of CUDA block + * @tparam bdimy y-dimension of CUDA block + * @tparam bdimz z-dimension of CUDA block + * @tparam T Data type + * @param val Contribution from thread + * @returns On thread 0, the sum. Not meaningful on other threads. + */ +template +__device__ __forceinline__ +T block_reduce(T val); + +/** @brief Reduction over threads in CUDA block + * + * Every thread in a CUDA block must enter this function. The reduced + * value is returned on thread 0. + * + * @tparam bdimx x-dimension of CUDA block + * @tparam bdimy y-dimension of CUDA block + * @tparam bdimz z-dimension of CUDA block + * @tparam T Data type + * @tparam Op Functor for reduction operation + * @param val Contribution from each thread + * @returns On thread 0, the reduced value. Not meaningful on other + * threads. + */ +template +__device__ __forceinline__ +T block_reduce(T val); + // Unary math functions template __device__ __forceinline__ T abs(const T& x); template __device__ __forceinline__ T round(const T& x); @@ -146,6 +180,15 @@ template constexpr __device__ __forceinline__ T max(); template constexpr __device__ __forceinline__ T epsilon(); template __device__ __forceinline__ T infinity(); +/** @brief Array with fixed type and size. */ +template +struct array { + T vals[N]; + __host__ __device__ __forceinline__ size_t size() const; + __host__ __device__ __forceinline__ T& operator[](size_t i); + __host__ __device__ __forceinline__ const T& operator[](size_t i) const; +}; + #endif // __CUDACC__ // ------------------------------------------------------------- @@ -187,36 +230,40 @@ class event_wrapper { * The input and output data must be on GPU and must have the same * dimensions. */ -template -void apply_entrywise_unary_operator(const AbsMat& input, - AbsMat& output); +template
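Stepping back to the cloneable.hpp interface introduced above: the intended pattern is that the root of a hierarchy inherits from Cloneable<HasAbstractFunction<Root>> while each concrete class inherits from Cloneable<Derived, Base>, giving every level a covariant, smart-pointer-returning clone(). The hierarchy below is an illustrative sketch, not code from this patch.

    #include "lbann/utils/cloneable.hpp"
    #include <memory>

    // Hypothetical classes; only Cloneable and HasAbstractFunction come from
    // the header added above.
    class shape : public lbann::Cloneable<lbann::HasAbstractFunction<shape>>
    {
    public:
      virtual double area() const = 0;
    };

    class circle final : public lbann::Cloneable<circle, shape>
    {
    public:
      explicit circle(double r) : m_radius{r} {}
      double area() const override { return 3.141592653589793 * m_radius * m_radius; }
    private:
      double m_radius;
    };

    // clone() through the base yields std::unique_ptr<shape>; calling it on a
    // circle directly yields std::unique_ptr<circle>. No manual casts needed.
    std::unique_ptr<shape> copy_of(shape const& s) { return s.clone(); }

    static_assert(lbann::IsCloneableT<circle>::value,
                  "circle exposes a conforming clone()");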