Skip to content

Commit

Permalink
Merge branch 'support_hdfs_loading-jershi' into 'v3.5-integration'
Browse files Browse the repository at this point in the history
Support HDFS loading parameters

See merge request dl/hugectr/hugectr!674
  • Loading branch information
zehuanw committed Mar 1, 2022
2 parents 0bc1c8a + 9152f96 commit 5989232
Show file tree
Hide file tree
Showing 30 changed files with 1,228 additions and 245 deletions.
76 changes: 50 additions & 26 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,56 +11,64 @@ nightly_build_train:
variables:
REMOTE_REPO: https://github.com/NVIDIA-Merlin/Merlin.git
DST_IMAGE: ${IMAGE_TRAIN}
DOCKER_FILE: dockerfile.ctr
DOCKER_FILE: ./training/dockerfile.ctr
BUILD_ARGS: --build-arg HUGECTR_DEV_MODE=true

nightly_build_train_with_hdfs:
extends: .build_nightly
variables:
REMOTE_REPO: https://github.com/NVIDIA-Merlin/Merlin.git
DST_IMAGE: ${IMAGE_TRAIN_WITH_HDFS}
DOCKER_FILE: ./training/dockerfile.ctr
BUILD_ARGS: --build-arg HUGECTR_DEV_MODE=true --build-arg INSTALL_HDFS=true

nightly_build_inference:
extends: .build_nightly
variables:
REMOTE_REPO: https://github.com/NVIDIA-Merlin/Merlin.git
DST_IMAGE: ${IMAGE_INFER}
DOCKER_FILE: dockerfile.tri
DOCKER_FILE: ./inference/dockerfile.ctr
BUILD_ARGS: --build-arg HUGECTR_DEV_MODE=true

nightly_build_sok_tf2:
extends: .build_nightly
variables:
REMOTE_REPO: https://github.com/NVIDIA-Merlin/Merlin.git
DST_IMAGE: ${IMAGE_SOK_TF2}
DOCKER_FILE: dockerfile.tf
DOCKER_FILE: ./training/dockerfile.tf
BUILD_ARGS: --build-arg HUGECTR_DEV_MODE=true

nightly_build_sok_tf1:
extends: .build_nightly
variables:
REMOTE_REPO: https://github.com/NVIDIA-Merlin/Merlin.git
DST_IMAGE: ${IMAGE_SOK_TF1}
DOCKER_FILE: dockerfile.tf
DOCKER_FILE: ./training/dockerfile.tf
BUILD_ARGS: --build-arg HUGECTR_DEV_MODE=true --build-arg IMAGE=nvcr.io/nvidia/tensorflow:21.11-tf1-py3 --build-arg INSTALL_NVT=false

nightly_build_unified_container.ctr:
extends: .build_nightly
variables:
REMOTE_REPO: https://github.com/NVIDIA-Merlin/Merlin.git
DST_IMAGE: ${CI_REGISTRY}/dl/hugectr/hugectr:unified.ctr.latest
DOCKER_FILE: dockerfile.ctr
BUILD_ARGS: --build-arg _CI_JOB_TOKEN=${RD_CI_JOB_TOKEN} --build-arg _HUGECTR_REPO=gitlab-master.nvidia.com/dl/hugectr/hugectr.git --build-arg _HUGECTR_BRANCH=${CI_COMMIT_SHA}
DOCKER_FILE: ./training/dockerfile.ctr
BUILD_ARGS: --build-arg _CI_JOB_TOKEN=${RD_CI_JOB_TOKEN} --build-arg _HUGECTR_REPO=gitlab-master.nvidia.com/dl/hugectr/hugectr.git --build-arg HUGECTR_VER=${CI_COMMIT_SHA}

nightly_build_unified_container.tf:
extends: .build_nightly
variables:
REMOTE_REPO: https://github.com/NVIDIA-Merlin/Merlin.git
DST_IMAGE: ${CI_REGISTRY}/dl/hugectr/hugectr:unified.tf.latest
DOCKER_FILE: dockerfile.tf
BUILD_ARGS: --build-arg _CI_JOB_TOKEN=${RD_CI_JOB_TOKEN} --build-arg _HUGECTR_REPO=gitlab-master.nvidia.com/dl/hugectr/hugectr.git --build-arg _HUGECTR_BRANCH=${CI_COMMIT_SHA}
DOCKER_FILE: ./training/dockerfile.tf
BUILD_ARGS: --build-arg _CI_JOB_TOKEN=${RD_CI_JOB_TOKEN} --build-arg _HUGECTR_REPO=gitlab-master.nvidia.com/dl/hugectr/hugectr.git --build-arg HUGECTR_VER=${CI_COMMIT_SHA}

nightly_build_unified_container.tri:
extends: .build_nightly
variables:
REMOTE_REPO: https://github.com/NVIDIA-Merlin/Merlin.git
DST_IMAGE: ${CI_REGISTRY}/dl/hugectr/hugectr:unified.tri.latest
DOCKER_FILE: dockerfile.tri
BUILD_ARGS: --build-arg _CI_JOB_TOKEN=${RD_CI_JOB_TOKEN} --build-arg _HUGECTR_REPO=gitlab-master.nvidia.com/dl/hugectr/hugectr.git --build-arg _HUGECTR_BRANCH=${CI_COMMIT_SHA} --build-arg _HUGECTR_BACKEND_REPO=gitlab-master.nvidia.com/dl/hugectr/hugectr_inference_backend.git --build-arg _HUGECTR_BACKEND_BRANCH=hugectr_performance_test
DOCKER_FILE: ./inference/dockerfile.ctr
BUILD_ARGS: --build-arg _CI_JOB_TOKEN=${RD_CI_JOB_TOKEN} --build-arg _HUGECTR_REPO=gitlab-master.nvidia.com/dl/hugectr/hugectr.git --build-arg HUGECTR_VER=${CI_COMMIT_SHA} --build-arg _HUGECTR_BACKEND_REPO=gitlab-master.nvidia.com/dl/hugectr/hugectr_inference_backend.git --build-arg HUGECTR_BACKEND_VER=hugectr_performance_test

### Stage: build
build_train_single_node:
Expand All @@ -72,6 +80,16 @@ build_train_single_node:
BUILD_HUGECTR: 1
BUILD_HUGECTR2ONNX: 1

build_train_single_node_with_hdfs:
extends: .build
variables:
FROM_IMAGE: ${IMAGE_TRAIN_WITH_HDFS}
DST_IMAGE: $TRAIN_IMAGE_VERSIONED_WITH_HDFS
CMAKE_OPTION: "-DCMAKE_BUILD_TYPE=Release -DKEY_HIT_RATIO=ON -DSM=\"60;61;70;75;80\" -DCLANGFORMAT=OFF -DENABLE_HDFS=ON"
BUILD_HUGECTR: 1
BUILD_HUGECTR2ONNX: 1


build_train_multi_node:
extends: .build
variables:
Expand Down Expand Up @@ -401,21 +419,6 @@ data_generator_single_node:
DGXNNODES: 1
TEST_CMD: ./ci/integration_test/data_generator/data_generator.sub

#hdfs backend test
hdfs_backend_test:
extends: .dlcluster_job
allow_failure: false
stage: test
needs:
- build_train_single_node
script:
- export CONT=${INFER_IMAGE_VERSIONED}
- srun -N 1 -p dgx1v32g,v100pcie32g bash ./ci/integration_test/hdfs/hdfs_backend_test.sh
rules:
- if: $CI_PIPELINE_SOURCE =~ /^(web|merge_request_event)$/
when: always
- when: never

# # python interface single node
py_single_node:
extends: .cluster_test_job
Expand Down Expand Up @@ -533,6 +536,8 @@ inference_ps_test:
rules:
- if: $CI_PIPELINE_SOURCE =~ /^(web|merge_request_event)$/
when: always
- if: $TEST_NEW_IMAGE == "1"
when: always
- when: never

inference_embedding_cache_update_test:
Expand All @@ -547,8 +552,27 @@ inference_embedding_cache_update_test:
rules:
- if: $CI_PIPELINE_SOURCE =~ /^(web|merge_request_event)$/
when: always
- if: $TEST_NEW_IMAGE == "1"
when: always
- when: never

#hdfs backend test
hdfs_backend_test:
extends: .dlcluster_job
allow_failure: false
stage: test
needs:
- build_train_single_node_with_hdfs
script:
- export CONT=${TRAIN_IMAGE_VERSIONED_WITH_HDFS}
- srun -N 1 -p dgx1v32g,v100pcie32g bash ./ci/integration_test/hdfs/hdfs_backend_test.sh
rules:
- if: $CI_PIPELINE_SOURCE =~ /^(web|merge_request_event)$/
when: always
- if: $TEST_NEW_IMAGE == "1"
when: always
- when: never

wdl_check:
# Push logs to gitlab
extends: .cluster_post_test_job
Expand Down
7 changes: 7 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,13 @@ install(DIRECTORY ${CMAKE_BINARY_DIR}/bin DESTINATION /usr/local/hugectr)

add_subdirectory(gpu_cache/src)
add_subdirectory(gpu_cache/test)

option(ENABLE_HDFS "Enable HDFS" OFF)
if(ENABLE_HDFS)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DENABLE_HDFS")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DENABLE_HDFS")
endif()

option(ENABLE_INFERENCE "Enable Inference" OFF)
if(ENABLE_INFERENCE)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DENABLE_INFERENCE")
Expand Down
5 changes: 3 additions & 2 deletions HugeCTR/include/embedding.hpp
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ class IEmbedding {
virtual void backward() = 0;
virtual void update_params() = 0;
virtual void init_params() = 0;
virtual void load_parameters(std::string sparse_model) = 0;
virtual void load_parameters(std::string sparse_model, DataSourceParams data_source_params) = 0;
virtual void dump_parameters(std::string sparse_model,
DataSourceParams data_source_params) const = 0;
virtual void set_learning_rate(float lr) = 0;
Expand All @@ -54,7 +54,8 @@ class IEmbedding {

virtual void dump_opt_states(std::ofstream& stream, std::string write_path,
DataSourceParams data_source_params) = 0;
virtual void load_opt_states(std::ifstream& stream) = 0;
virtual void load_opt_states(std::ifstream& stream, std::string read_path,
DataSourceParams data_source_params) = 0;

virtual const SparseEmbeddingHashParams& get_embedding_params() const = 0;
virtual std::vector<TensorBag2> get_train_output_tensors() const = 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ class DistributedSlotSparseEmbeddingHash : public IEmbedding {
* upload it onto multi-GPUs global memory.
* @param sparse_model the folder name of sparse model.
*/
void load_parameters(std::string sparse_model) override;
void load_parameters(std::string sparse_model, DataSourceParams data_source_params) override;
void load_parameters(BufferBag &buf_bag, size_t num) override;

/**
Expand All @@ -278,7 +278,8 @@ class DistributedSlotSparseEmbeddingHash : public IEmbedding {

void dump_opt_states(std::ofstream &stream, std::string sparse_model,
DataSourceParams data_source_params) override;
void load_opt_states(std::ifstream &stream) override;
void load_opt_states(std::ifstream &stream, std::string read_path,
DataSourceParams data_source_params) override;
void reset_optimizer() override;

/**
Expand Down
5 changes: 3 additions & 2 deletions HugeCTR/include/embeddings/hybrid_sparse_embedding.hpp
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ class HybridSparseEmbedding : public IEmbedding {
void backward() override;
void update_params() override;
void init_params() override;
void load_parameters(std::string sparse_model) override;
void load_parameters(std::string sparse_model, DataSourceParams data_source_params) override;
void dump_parameters(std::string sparse_model,
DataSourceParams data_source_params) const override;
void set_learning_rate(float lr) override;
Expand All @@ -263,7 +263,8 @@ class HybridSparseEmbedding : public IEmbedding {

void dump_opt_states(std::ofstream& stream, std::string sparse_model,
DataSourceParams data_source_params) override {}
void load_opt_states(std::ifstream& stream) override {}
void load_opt_states(std::ifstream& stream, std::string read_path,
DataSourceParams data_source_params) override {}
void reset_optimizer() override {}
void reset() override {}

Expand Down
5 changes: 3 additions & 2 deletions HugeCTR/include/embeddings/localized_slot_sparse_embedding_hash.hpp
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -391,7 +391,7 @@ class LocalizedSlotSparseEmbeddingHash : public IEmbedding {
* upload it onto multi-GPUs global memory.
* @param sparse_model the folder name of sparse model.
*/
void load_parameters(std::string sparse_model) override;
void load_parameters(std::string sparse_model, DataSourceParams data_source_params) override;
void load_parameters(BufferBag &buf_bag, size_t num) override;
/**
 * Download the hash table from multi-GPU global memory to CPU memory
Expand All @@ -404,7 +404,8 @@ class LocalizedSlotSparseEmbeddingHash : public IEmbedding {

void dump_opt_states(std::ofstream &stream, std::string sparse_model,
DataSourceParams data_source_params) override;
void load_opt_states(std::ifstream &stream) override;
void load_opt_states(std::ifstream &stream, std::string read_path,
DataSourceParams data_source_params) override;
void reset_optimizer() override;

/**
Expand Down
5 changes: 3 additions & 2 deletions HugeCTR/include/embeddings/localized_slot_sparse_embedding_one_hot.hpp
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,7 @@ class LocalizedSlotSparseEmbeddingOneHot : public IEmbedding {
* upload it onto multi-GPUs global memory.
* @param sparse_model the folder name of sparse model.
*/
void load_parameters(std::string sparse_model) override;
void load_parameters(std::string sparse_model, DataSourceParams data_source_params) override;
void load_parameters(BufferBag &buf_bag, size_t num) override;
/**
 * Download the hash table from multi-GPU global memory to CPU memory
Expand All @@ -317,7 +317,8 @@ class LocalizedSlotSparseEmbeddingOneHot : public IEmbedding {

void dump_opt_states(std::ofstream &stream, std::string sparse_model,
DataSourceParams data_source_params) override {}
void load_opt_states(std::ifstream &stream) override {}
void load_opt_states(std::ifstream &stream, std::string read_path,
DataSourceParams data_source_params) override {}
void reset_optimizer() override {}

/**
Expand Down
6 changes: 4 additions & 2 deletions HugeCTR/include/embeddings/sparse_embedding_functors.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -534,8 +534,10 @@ class SparseEmbeddingFunctors {
DataSourceParams data_source_params, const ResourceManager &resource_manager,
std::vector<Tensors2<TypeEmbeddingComp>> &opt_states);
template <typename TypeEmbeddingComp>
void load_opt_states(std::ifstream &stream, const ResourceManager &resource_manager,
std::vector<Tensors2<TypeEmbeddingComp>> &opt_states);
void load_opt_states(std::ifstream &stream, std::string &read_path,
const ResourceManager &resource_manager,
std::vector<Tensors2<TypeEmbeddingComp>> &opt_states,
DataSourceParams data_source_params);
};

// TODO: consider to move them; they are currently only used for an utest
Expand Down
9 changes: 8 additions & 1 deletion HugeCTR/include/hdfs_backend.hpp
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@
#include <string>
#include <vector>

#ifdef ENABLE_HDFS
#include "hdfs.h"
#endif

namespace HugeCTR {

Expand Down Expand Up @@ -90,8 +92,10 @@ class HdfsService {
int batchCopyToLocal(const std::string& hdfs_dir_path, const std::string& local_path);

private:
#ifdef ENABLE_HDFS
hdfsFS fs_;
hdfsFS local_fs_;
#endif
std::string name_node_;
int hdfs_port_;

Expand All @@ -100,14 +104,17 @@ class HdfsService {
*
* @return hdfsFS The FS handler for HDFS.
*/
#ifdef ENABLE_HDFS
hdfsFS connect();

#endif
/**
* @brief Connect to local File system.
*
* @return hdfsFS
*/
#ifdef ENABLE_HDFS
hdfsFS connectToLocal();
#endif
/**
* @brief Disconnect to HDFS server.
*
Expand Down
29 changes: 19 additions & 10 deletions HugeCTR/include/pybind/model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -436,13 +436,18 @@ class Model {
return !embeddings_[0]->get_learning_rate_schedulers().empty();
}

void load_dense_weights(const std::string& dense_model_file);
void load_sparse_weights(const std::vector<std::string>& sparse_embedding_files);
void load_sparse_weights(const std::map<std::string, std::string>& sparse_embedding_files_map);
void load_dense_optimizer_states(const std::string& dense_opt_states_file);
void load_sparse_optimizer_states(const std::vector<std::string>& sparse_opt_states_files);
void load_dense_weights(const std::string& dense_model_file, DataSourceParams data_source_params);
void load_sparse_weights(const std::vector<std::string>& sparse_embedding_files,
DataSourceParams data_source_params);
void load_sparse_weights(const std::map<std::string, std::string>& sparse_embedding_files_map,
DataSourceParams data_source_params);
void load_dense_optimizer_states(const std::string& dense_opt_states_file,
DataSourceParams data_source_params);
void load_sparse_optimizer_states(const std::vector<std::string>& sparse_opt_states_files,
DataSourceParams data_source_params);
void load_sparse_optimizer_states(
const std::map<std::string, std::string>& sparse_opt_states_files_map);
const std::map<std::string, std::string>& sparse_opt_states_files_map,
DataSourceParams data_source_params);
void freeze_embedding() {
for (auto& one_embedding : embeddings_) {
one_embedding->freeze();
Expand Down Expand Up @@ -564,10 +569,14 @@ class Model {
const std::vector<std::string>& sparse_embedding_files,
const std::vector<std::string>& local_paths,
const std::vector<HMemCacheConfig>& hmem_cache_configs);
Error_t load_params_for_dense_(const std::string& model_file);
Error_t load_params_for_sparse_(const std::vector<std::string>& embedding_file);
Error_t load_opt_states_for_dense_(const std::string& dense_opt_states_file);
Error_t load_opt_states_for_sparse_(const std::vector<std::string>& sparse_opt_states_files);
Error_t load_params_for_dense_(const std::string& model_file,
DataSourceParams data_source_params);
Error_t load_params_for_sparse_(const std::vector<std::string>& embedding_file,
DataSourceParams data_source_params);
Error_t load_opt_states_for_dense_(const std::string& dense_opt_states_file,
DataSourceParams data_source_params);
Error_t load_opt_states_for_sparse_(const std::vector<std::string>& sparse_opt_states_files,
DataSourceParams data_source_params);
virtual void exchange_wgrad(size_t device_id);
virtual void train_overlapped();
virtual void add_dense_layer(DenseLayer& dense_layer);
Expand Down
Loading

0 comments on commit 5989232

Please sign in to comment.