This repository was archived by the owner on Jan 3, 2023. It is now read-only.

Langjian/distributed json #366

Merged
merged 41 commits on Jan 18, 2019

Changes from 4 commits

Commits (41)
3878e43
Fix new klocwork error reports on 11/20/2018
jianyinglang Nov 21, 2018
4decf57
Fix the FusedBatchNorm with the Bessel correction in variance
jianyinglang Dec 4, 2018
cfb0a83
Fix the format
jianyinglang Dec 4, 2018
76ab3fa
Merge remote-tracking branch 'origin/master' into langjian/BatchNorm_…
jianyinglang Dec 4, 2018
289a43d
Add distributed macro
jianyinglang Dec 4, 2018
2163ac1
Add multi-node .json file output
jianyinglang Dec 6, 2018
34c4ddf
Merge remote-tracking branch 'origin/master' into langjian/distribute…
jianyinglang Dec 6, 2018
2e70b97
Change CMake file to be consistent with master
jianyinglang Dec 6, 2018
1c34c3d
Format change
jianyinglang Dec 6, 2018
70449be
Add a simple distributed mnist model
jianyinglang Dec 10, 2018
5ea9070
Merge remote-tracking branch 'origin/master' into langjian/distribute…
jianyinglang Dec 19, 2018
73cf9d0
Add distributed option for Makefile
jianyinglang Dec 19, 2018
2c594ee
modify distributed example
jianyinglang Dec 19, 2018
d788247
Add distributed flags for multi-process graph dumps
jianyinglang Dec 20, 2018
0371935
Changes the Makefile to enable distributed build
jianyinglang Dec 21, 2018
d16f2d9
Merge remote-tracking branch 'origin/master' into langjian/distribute…
jianyinglang Dec 21, 2018
f205cb2
Format fix
jianyinglang Dec 21, 2018
0ecd43b
Fix the typo and delete debug comment and add run command
jianyinglang Dec 21, 2018
100c129
Fix the python file format
jianyinglang Dec 21, 2018
dda11aa
Fix python file format
jianyinglang Dec 21, 2018
6caf977
Add mnist data directory
jianyinglang Dec 21, 2018
e2ba875
Merge branch 'master' into langjian/distributed_json
avijit-nervana Dec 22, 2018
59ffb37
Fix the typo
jianyinglang Jan 4, 2019
7f6c498
Merge branch 'langjian/distributed_json' of https://github.com/Nervan…
jianyinglang Jan 4, 2019
6507c3a
Set the default distributed build as false
jianyinglang Jan 5, 2019
dbb302e
Add initialization if not initialized in MPI
jianyinglang Jan 7, 2019
b163c5c
Merge remote-tracking branch 'origin/master' into langjian/distribute…
jianyinglang Jan 7, 2019
35af774
Fix the format
jianyinglang Jan 7, 2019
97bb170
Fix the format using python2
jianyinglang Jan 7, 2019
7515794
Merge branch 'master' into langjian/distributed_json
avijit-nervana Jan 9, 2019
2f515af
Merge remote-tracking branch 'origin/master' into langjian/distribute…
jianyinglang Jan 10, 2019
e97fd82
Merge branch 'langjian/distributed_json' of https://github.com/Nervan…
jianyinglang Jan 10, 2019
1f9f611
Fix the build with no specified mpi library
jianyinglang Jan 11, 2019
7922d0b
Comment out the unused lines in CMakeLists.txt
jianyinglang Jan 11, 2019
e8553cf
Fix some errors
jianyinglang Jan 12, 2019
0499c73
Merge branch 'master' into langjian/distributed_json
avijit-nervana Jan 13, 2019
361c725
Change to if define
jianyinglang Jan 14, 2019
4533c46
Merge branch 'langjian/distributed_json' of https://github.com/Nervan…
jianyinglang Jan 14, 2019
e2c3009
Added the source reference for mnist_softmax_distributed.py
jianyinglang Jan 14, 2019
ae96c77
Merge branch 'master' into langjian/distributed_json
avijit-nervana Jan 16, 2019
8a941ee
Merge branch 'master' into langjian/distributed_json
avijit-nervana Jan 17, 2019
12 changes: 6 additions & 6 deletions CMakeLists.txt
@@ -226,12 +226,12 @@ message(STATUS "NGRAPH_PLAIDML_ENABLE: ${NGRAPH_PLAIDML_ENABLE}")
message(STATUS "NGRAPH_TARGET_ARCH: ${NGRAPH_TARGET_ARCH}")
message(STATUS "NGRAPH_TUNE_ARCH: ${NGRAPH_TUNE_ARCH}")

if(NGRAPH_DISTRIBUTED_ENABLE)
find_package(MPI REQUIRED)
add_definitions(-DNGRAPH_DISTRIBUTED)
include_directories(SYSTEM ${MPI_C_INCLUDE_PATH} ${MPI_CXX_INCLUDE_PATH})
link_directories(${MPI_C_LIBRARIES} ${MPI_CXX_LIBRARIES})
endif()
#if(NGRAPH_DISTRIBUTED_ENABLE)
Contributor:
Is this temporary? If not, please remove the commented-out code.

Contributor Author:
Changed back to include the MPI build.

# find_package(MPI REQUIRED)
# add_definitions(-DNGRAPH_DISTRIBUTED)
# include_directories(SYSTEM ${MPI_C_INCLUDE_PATH} ${MPI_CXX_INCLUDE_PATH})
# link_directories(${MPI_C_LIBRARIES} ${MPI_CXX_LIBRARIES})
#endif()

# Find and build ngraph - if not using pre-built one
if (NOT USE_PRE_BUILT_NGRAPH)
17 changes: 15 additions & 2 deletions build_ngtf.py
@@ -400,6 +400,11 @@ def main():
action="store_true"
)

parser.add_argument(
'--distributed_build',
help="Builds a distributed version of the nGraph components\n",
action="store_true")

arguments = parser.parse_args()

if (arguments.debug_build):
@@ -484,7 +489,6 @@ def main():

ngraph_cmake_flags = [
"-DNGRAPH_INSTALL_PREFIX=" + artifacts_location,
"-DNGRAPH_DISTRIBUTED_ENABLE=FALSE",
"-DNGRAPH_USE_CXX_ABI=" + cxx_abi,
"-DNGRAPH_UNIT_TEST_ENABLE=NO",
"-DNGRAPH_DEX_ONLY=TRUE",
@@ -502,6 +506,11 @@
if (arguments.debug_build):
ngraph_cmake_flags.extend(["-DCMAKE_BUILD_TYPE=Debug"])

if (arguments.distributed_build):
ngraph_cmake_flags.extend(["-DNGRAPH_DISTRIBUTED_ENABLE=TRUE"])
else:
ngraph_cmake_flags.extend(["-DNGRAPH_DISTRIBUTED_ENABLE=FALSE"])

build_ngraph("./ngraph", ngraph_cmake_flags, verbosity)

# Next build CMAKE options for the bridge
@@ -512,13 +521,17 @@
"-DNGRAPH_TUNE_ARCH=" + target_arch,
"-DNGRAPH_ARTIFACTS_DIR=" + artifacts_location,
"-DUNIT_TEST_ENABLE=ON",
"-DNGRAPH_DISTRIBUTED_ENABLE=TRUE",
"-DTF_SRC_DIR=" + tf_src_dir, "-DUNIT_TEST_TF_CC_DIR=" + os.path.join(
artifacts_location, "tensorflow")
]
if (arguments.debug_build):
ngraph_tf_cmake_flags.extend(["-DCMAKE_BUILD_TYPE=Debug"])

if (arguments.distributed_build):
ngraph_tf_cmake_flags.extend(["-DNGRAPH_DISTRIBUTED_ENABLE=TRUE"])
else:
ngraph_tf_cmake_flags.extend(["-DNGRAPH_DISTRIBUTED_ENABLE=FALSE"])

# Now build the bridge
ng_tf_whl = build_ngraph_tf(artifacts_location, "../", venv_dir,
ngraph_tf_cmake_flags, verbosity)
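For context, the new --distributed_build flag is plumbed straight through to CMake for both the nGraph and bridge builds. A minimal runnable sketch of the pattern (the argparse and flag lines mirror the diff; the rest of build_ngtf.py, including build_ngraph() and the other CMake flags, is elided):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    '--distributed_build',
    help="Builds a distributed version of the nGraph components\n",
    action="store_true")
arguments = parser.parse_args()

ngraph_cmake_flags = []
# The same toggle is applied to ngraph_tf_cmake_flags for the bridge build.
if arguments.distributed_build:
    ngraph_cmake_flags.extend(["-DNGRAPH_DISTRIBUTED_ENABLE=TRUE"])
else:
    ngraph_cmake_flags.extend(["-DNGRAPH_DISTRIBUTED_ENABLE=FALSE"])

print(ngraph_cmake_flags)

With the flag, the build is invoked as, for example, python3 build_ngtf.py --distributed_build; without it, NGRAPH_DISTRIBUTED_ENABLE stays FALSE, matching the "Set the default distributed build as false" commit.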
5 changes: 5 additions & 0 deletions examples/mnist/mnist_softmax_distributed.py
@@ -16,6 +16,11 @@

See extensive documentation at
https://www.tensorflow.org/get_started/mnist/beginners
Add distributed feature with Horovod:
1. hvd.init()
2. Wrap the optimizer with hvd.DistributedOptimizer
3. Broadcast the variables from the root rank to the other processes: hvd.BroadcastGlobalVariablesHook(0)
4. Print the output from the root rank only
"""
from __future__ import absolute_import
Contributor:
Please provide a summary of modifications made to this file and a reference to the source of this file.

Contributor Author:
Listed the changes I made from the source file and added a link to the reference source of this file.

from __future__ import division
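The four docstring items map onto Horovod's standard TensorFlow 1.x pattern. A minimal self-contained sketch of those steps (the toy model, step count, and variable names here are placeholders, not the actual mnist_softmax_distributed.py code):

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()  # 1. initialize Horovod (ranks, communicators)

# Toy stand-in for the MNIST softmax model.
x = tf.random_normal([32, 784])
w = tf.Variable(tf.random_normal([784, 10]))
loss = tf.reduce_mean(tf.square(tf.matmul(x, w)))

opt = tf.train.GradientDescentOptimizer(0.5)
opt = hvd.DistributedOptimizer(opt)  # 2. wrap optimizer for gradient averaging
train_op = opt.minimize(loss)

# 3. broadcast initial variables from rank 0 to the other processes
hooks = [hvd.BroadcastGlobalVariablesHook(0)]

with tf.train.MonitoredTrainingSession(hooks=hooks) as sess:
    for _ in range(100):
        sess.run(train_op)

if hvd.rank() == 0:  # 4. print output from the root rank only
    print("training done")

Jobs of this shape are typically launched with something like mpirun -np 2 python mnist_softmax_distributed.py (a run command was added in commit 0ecd43b).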
10 changes: 3 additions & 7 deletions src/ngraph_encapsulate_op.cc
@@ -37,7 +37,7 @@
#include "ngraph/runtime/interpreter/int_backend.hpp"

#ifdef NGRAPH_DISTRIBUTED
Contributor:
It's better to use #if defined(NGRAPH_DISTRIBUTED).

Contributor Author:
Changed to #if defined NGRAPH_DISTRIBUTED.

#include <mpi.h>
#include "ngraph/distributed.hpp"
#endif

using namespace std;
@@ -267,13 +267,9 @@ class NGraphEncapsulateOp : public OpKernel {
std::string file_name =
"tf_function_" + ctx->op_kernel().name() + ".json";
#ifdef NGRAPH_DISTRIBUTED
int flag = 0;
MPI_Initialized(&flag);
if (!flag) {
MPI_Init(NULL, NULL);
}
ngraph::Distributed dist;
int Rank_ID;
MPI_Comm_rank(MPI_COMM_WORLD, &Rank_ID);
Rank_ID = dist.get_rank();
NgraphSerialize("tf_function_" + ctx->op_kernel().name() + "_" +
to_string(Rank_ID) + ".json",
ng_function);
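The net effect of this hunk: rank discovery moves from raw MPI calls into nGraph's Distributed wrapper, and each process serializes the graph to a rank-suffixed .json so that concurrent multi-process dumps don't overwrite one another. The same idea in Python, using mpi4py purely for illustration (mpi4py is not used by the bridge, and "my_op" stands in for ctx->op_kernel().name()):

from mpi4py import MPI

rank = MPI.COMM_WORLD.Get_rank()
# One dump per process: tf_function_my_op_0.json, tf_function_my_op_1.json, ...
file_name = "tf_function_my_op_" + str(rank) + ".json"
with open(file_name, "w") as f:
    f.write("{}")  # stand-in for the serialized nGraph function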
20 changes: 5 additions & 15 deletions src/ngraph_rewrite_pass.cc
@@ -30,7 +30,7 @@
#include <iomanip>

#ifdef NGRAPH_DISTRIBUTED
#include <mpi.h>
#include "ngraph/distributed.hpp"
#endif

using namespace std;
@@ -107,13 +107,8 @@ class NGraphRewritePass : public GraphOptimizationPass {
std::stringstream ss;
ss << kind << "_" << std::setfill('0') << std::setw(4) << idx;
#ifdef NGRAPH_DISTRIBUTED
Contributor:
Consider a function that can be used instead of the code repetition below.

Contributor Author:
Used the distributed class defined in nGraph core.
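For reference, the repetition flagged above is the zero-padded rank suffix built in both GraphFilenamePrefix overloads; factored into a helper it is just this (a Python sketch of the formatting, mirroring std::setfill('0') << std::setw(4); the bridge code itself stays C++):

def rank_suffix(rank, width=4):
    # Matches: ss << "_" << std::setfill('0') << std::setw(4) << Rank_ID
    return "_{:0{w}d}".format(rank, w=width)

assert rank_suffix(3) == "_0003"
assert rank_suffix(12) == "_0012"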

int flag = 0;
MPI_Initialized(&flag);
if (!flag) {
MPI_Init(NULL, NULL);
}
int Rank_ID;
MPI_Comm_rank(MPI_COMM_WORLD, &Rank_ID);
ngraph::Distributed dist;
int Rank_ID = dist.get_rank();
ss << "_" << std::setfill('0') << std::setw(4) << Rank_ID;
#endif
return ss.str();
@@ -124,13 +119,8 @@
ss << GraphFilenamePrefix(kind, idx) << "_" << std::setfill('0')
<< std::setw(4) << sub_idx;
#ifdef NGRAPH_DISTRIBUTED
int flag = 0;
MPI_Initialized(&flag);
if (!flag) {
MPI_Init(NULL, NULL);
}
int Rank_ID;
MPI_Comm_rank(MPI_COMM_WORLD, &Rank_ID);
ngraph::Distributed dist;
int Rank_ID = dist.get_rank();
ss << "_" << std::setfill('0') << std::setw(4) << Rank_ID;
#endif
return ss.str();