Langjian/distributed json #366
Changes from 29 commits
mnist_softmax_distributed.py (new file)
@@ -0,0 +1,141 @@
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A very simple MNIST classifier.

See extensive documentation at
https://www.tensorflow.org/get_started/mnist/beginners
"""
from __future__ import absolute_import
Review comment: Please provide a summary of modifications made to this file and a reference to the source of this file.
Reply: Listed the changes I made from the source file and added the link to the reference source of this file.
from __future__ import division
from __future__ import print_function

import argparse
import sys
import time

from tensorflow.examples.tutorials.mnist import input_data

import tensorflow as tf
import ngraph_bridge
import horovod.tensorflow as hvd

learn = tf.contrib.learn

FLAGS = None

# Initialize Horovod; this must run before any other Horovod call.
hvd.init()
def main(_):
    run_mnist(_)


def run_mnist(_):
    # Import data; each Horovod rank reads its own copy of the dataset.
    mnist = learn.datasets.mnist.read_data_sets(
        FLAGS.data_dir + 'MNIST-data-%d' % hvd.rank(), one_hot=True)
    # Create the model
    with tf.name_scope("mnist_placeholder"):
        x = tf.placeholder(tf.float32, [None, 784])
        W = tf.Variable(tf.zeros([784, 10]))
        b = tf.Variable(tf.zeros([10]))
        y = tf.matmul(x, W) + b
||
# Define loss and optimizer | ||
y_ = tf.placeholder(tf.float32, [None, 10]) | ||
|
||
# The raw formulation of cross-entropy, | ||
# | ||
# tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.nn.softmax(y)), | ||
# reduction_indices=[1])) | ||
# | ||
# can be numerically unstable. | ||
# | ||
# So here we use tf.nn.softmax_cross_entropy_with_logits on the raw | ||
# outputs of 'y', and then average across the batch. | ||
cross_entropy = tf.reduce_mean( | ||
tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y)) | ||
#global_step = tf.train.get_or_create_global_step() | ||
global_step = tf.contrib.framework.get_or_create_global_step() | ||
opt = tf.train.GradientDescentOptimizer(0.5) | ||
# Add MPI Distributed Optimizer | ||
with tf.name_scope("horovod_opt"): | ||
opt = hvd.DistributedOptimizer(opt) | ||
train_step = opt.minimize(cross_entropy, global_step=global_step) | ||
|
||
# The StopAtStepHook handles stopping after running given steps. | ||
hooks = [ | ||
hvd.BroadcastGlobalVariablesHook(0), | ||
tf.train.StopAtStepHook(last_step=10) | ||
] | ||
|
||
# Test trained model | ||
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1)) | ||
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) | ||
|
||
# Enable soft placement and tracing as needed | ||
config = tf.ConfigProto( | ||
allow_soft_placement=True, | ||
log_device_placement=True, | ||
inter_op_parallelism_threads=1) | ||
|
||
#config.graph_options.optimizer_options.global_jit_level = jit_level | ||
run_metadata = tf.RunMetadata() | ||
|
||
#init_op = tf.global_variables_initializer() | ||
print("Variables initialized ...") | ||
|
||
# The MonitoredTrainingSession takes care of session initialization | ||
with tf.train.MonitoredTrainingSession( | ||
hooks=hooks, config=config) as mon_sess: | ||
start = time.time() | ||
train_writer = tf.summary.FileWriter(FLAGS.log_dir, mon_sess.graph) | ||
while not mon_sess.should_stop(): | ||
# Train | ||
batch_xs, batch_ys = mnist.train.next_batch(100) | ||
mon_sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys}) | ||
|
||
# Test trained model | ||
if not mon_sess.should_stop(): | ||
print("Accuracy: ", | ||
mon_sess.run( | ||
accuracy, | ||
feed_dict={ | ||
x: mnist.test.images, | ||
y_: mnist.test.labels | ||
})) | ||
|
||
end = time.time() | ||
|
||
if hvd.rank() == 0: | ||
print("Training time: %f seconds" % (end - start)) | ||
|
||
|
||
if __name__ == '__main__': | ||
parser = argparse.ArgumentParser() | ||
parser.add_argument( | ||
'--data_dir', | ||
type=str, | ||
default='/tmp/tensorflow/mnist/input_data', | ||
help='Directory for storing input data') | ||
parser.add_argument( | ||
'--log_dir', | ||
type=str, | ||
default='/tmp/tensorflow/mnist/logs/mnist_with_summaries', | ||
help='Summaries log directory') | ||
FLAGS, unparsed = parser.parse_known_args() | ||
tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) | ||
# run command for this distributed script | ||
# mpirun -np 2 python mnist_softmax_distributed.py --data_dir=/mnt/data/mnist |
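Run note: launched with mpirun -np 2, each rank initializes Horovod, reads its own per-rank copy of MNIST (the directory name is derived from --data_dir plus the rank), and trains until StopAtStepHook fires at step 10; hvd.DistributedOptimizer averages gradients across the two processes, and only rank 0 reports the training time. If the bridge is built with NGRAPH_DISTRIBUTED and NGRAPH_ENABLE_SERIALIZE is set, the C++ changes below additionally write one serialized JSON file per rank.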
Next file: the NGraphEncapsulateOp kernel (C++).
@@ -36,6 +36,10 @@

#include "ngraph/runtime/interpreter/int_backend.hpp"

#ifdef NGRAPH_DISTRIBUTED
#include <mpi.h>
#endif

using namespace std;
namespace ng = ngraph;

Review comment: It's better to use the …
Reply: Changed to …
@@ -260,8 +264,20 @@ class NGraphEncapsulateOp : public OpKernel {

    // Serialize to nGraph if needed
    if (std::getenv("NGRAPH_ENABLE_SERIALIZE") != nullptr) {
      std::string file_name =
          "tf_function_" + ctx->op_kernel().name() + ".json";
#ifdef NGRAPH_DISTRIBUTED
      // Initialize MPI lazily so the rank can be queried even if nothing
      // else has called MPI_Init yet.
      int flag = 0;
      MPI_Initialized(&flag);
      if (!flag) {
        MPI_Init(NULL, NULL);
      }
      int Rank_ID;
      MPI_Comm_rank(MPI_COMM_WORLD, &Rank_ID);
      // Each rank serializes to its own rank-suffixed JSON file.
      NgraphSerialize("tf_function_" + ctx->op_kernel().name() + "_" +
                          to_string(Rank_ID) + ".json",
                      ng_function);
#else
      // Non-distributed builds keep the original single file name.
      NgraphSerialize(file_name, ng_function);
#endif
    }

    m_ng_functions[signature] = ng_function;
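As a concrete example (the kernel name here is hypothetical): an encapsulated op named ngraph_cluster_0 run under mpirun -np 2 would produce tf_function_ngraph_cluster_0_0.json and tf_function_ngraph_cluster_0_1.json, since to_string(Rank_ID) appends the bare rank with no zero padding.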
Next file: the NGraphRewritePass graph-dumping pass (C++).
@@ -29,6 +29,10 @@

#include <iomanip>

#ifdef NGRAPH_DISTRIBUTED
#include <mpi.h>
#endif

using namespace std;

namespace tensorflow {
@@ -102,13 +106,33 @@ class NGraphRewritePass : public GraphOptimizationPass {

  static std::string GraphFilenamePrefix(std::string kind, int idx) {
    std::stringstream ss;
    ss << kind << "_" << std::setfill('0') << std::setw(4) << idx;
#ifdef NGRAPH_DISTRIBUTED
    int flag = 0;
    MPI_Initialized(&flag);
    if (!flag) {
      MPI_Init(NULL, NULL);
    }
    int Rank_ID;
    MPI_Comm_rank(MPI_COMM_WORLD, &Rank_ID);
    // Append the MPI rank so each process dumps to a distinct file.
    ss << "_" << std::setfill('0') << std::setw(4) << Rank_ID;
#endif
    return ss.str();
  }

  static std::string GraphFilenamePrefix(std::string kind, int idx,
                                         int sub_idx) {
    std::stringstream ss;
    ss << GraphFilenamePrefix(kind, idx) << "_" << std::setfill('0')
       << std::setw(4) << sub_idx;
#ifdef NGRAPH_DISTRIBUTED
    int flag = 0;
    MPI_Initialized(&flag);
    if (!flag) {
      MPI_Init(NULL, NULL);
    }
    int Rank_ID;
    MPI_Comm_rank(MPI_COMM_WORLD, &Rank_ID);
    ss << "_" << std::setfill('0') << std::setw(4) << Rank_ID;
#endif
    return ss.str();
  }

Review comment: Consider a function that can be used instead of the code repetition below.
Reply: Used the distributed class defined in nGraph core.
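With this change, a prefix such as mygraph_0001 (the kind string is hypothetical here) becomes mygraph_0001_0000 on rank 0 and mygraph_0001_0001 on rank 1; note that the two-argument overload ends up appending the rank twice, once inside the one-argument overload it calls and once itself. The review exchange above points at the repeated MPI boilerplate; a minimal sketch of the kind of helper that could replace it, assuming only <mpi.h> (the name GetMpiRank is hypothetical and not part of this PR — the author ultimately used the distributed class from nGraph core instead):

#ifdef NGRAPH_DISTRIBUTED
#include <mpi.h>

// Hypothetical helper: initialize MPI on first use, then return this
// process's rank in MPI_COMM_WORLD.
static int GetMpiRank() {
  int flag = 0;
  MPI_Initialized(&flag);
  if (!flag) {
    MPI_Init(nullptr, nullptr);
  }
  int rank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  return rank;
}
#endif

Each call site would then shrink to a single line, e.g. ss << "_" << std::setfill('0') << std::setw(4) << GetMpiRank();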
Review comment: The default should be false, i.e., for regular builds there is no need to enable distributed. Need to add a command line option to build_ngtf.py (e.g., --enable_distributed) which, when specified, will add this flag. See --debug_build as an example of how to extend the cmake options.
Reply: Added the option --distributed_build for distributed builds.
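Build note (an inference from the guards above, not spelled out in the diff): NGRAPH_DISTRIBUTED is a compile-time definition, so all of the MPI code in this PR is compiled out of regular builds; the --distributed_build flag added to build_ngtf.py presumably passes the corresponding definition through to cmake, mirroring how --debug_build extends the cmake options.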