
Commit 04aaf74

Prevent infinite recursion when processing instructions while finding tensor dataflow sources.
Don't process an instruction more than once during recursive calls.
1 parent 0ad8631 commit 04aaf74

File tree

3 files changed: +277 -24 lines changed


com.ibm.wala.cast.python.ml.test/source/com/ibm/wala/cast/python/ml/test/TestTensorflowModel.java

+1

@@ -258,6 +258,7 @@ public void testTf2()
     testTf2("tf2_test_add4.py", "f", 1, 1, 2);
     testTf2("tf2_test_add5.py", "f", 1, 1, 2);
     testTf2("tf2_test_add6.py", "f", 1, 1, 2);
+    testTf2("multigpu_training.py", "run_optimization", 2, 4, 2, 3);
   }
 
   private void testTf2(

com.ibm.wala.cast.python.ml/source/com/ibm/wala/cast/python/ml/client/PythonTensorAnalysisEngine.java

+41 -24
@@ -1,5 +1,6 @@
 package com.ibm.wala.cast.python.ml.client;
 
+import static com.google.common.collect.Sets.newHashSet;
 import static com.ibm.wala.cast.python.ml.types.TensorFlowTypes.DATASET;
 import static com.ibm.wala.cast.types.AstMethodReference.fnReference;
 
@@ -142,7 +143,8 @@ private static Set<PointsToSetVariable> getDataflowSources(
               src,
               sources,
               callGraph,
-              pointerAnalysis);
+              pointerAnalysis,
+              newHashSet());
         }
       } else if (inst instanceof PythonPropertyRead) {
         // We are potentially pulling a tensor out of a non-scalar tensor iterable.
@@ -160,7 +162,14 @@ private static Set<PointsToSetVariable> getDataflowSources(
         } else if (def instanceof EachElementGetInstruction
             || def instanceof PythonPropertyRead) {
           processInstruction(
-              def, du, localPointerKeyNode, src, sources, callGraph, pointerAnalysis);
+              def,
+              du,
+              localPointerKeyNode,
+              src,
+              sources,
+              callGraph,
+              pointerAnalysis,
+              newHashSet());
         }
       }
     }
@@ -181,6 +190,7 @@ private static Set<PointsToSetVariable> getDataflowSources(
    * @param callGraph The {@link CallGraph} containing the given {@link SSAInstruction}.
    * @param pointerAnalysis The {@link PointerAnalysis} corresponding to the given {@link
    *     CallGraph}.
+   * @param seen A {@link Set} of previously processed {@link SSAInstruction}.
    * @return True iff the given {@link PointsToSetVariable} was added to the given {@link Set} of
    *     {@link PointsToSetVariable} dataflow sources.
    */
@@ -191,28 +201,35 @@ private static boolean processInstruction(
       PointsToSetVariable src,
       Set<PointsToSetVariable> sources,
       CallGraph callGraph,
-      PointerAnalysis<InstanceKey> pointerAnalysis) {
-    logger.fine(() -> "Processing instruction: " + instruction + ".");
-
-    if (instruction != null && instruction.getNumberOfUses() > 0) {
-      int use = instruction.getUse(0);
-      SSAInstruction def = du.getDef(use);
-
-      // First try intraprocedural analysis.
-      if (definesTensorIterable(def, node, callGraph, pointerAnalysis)) {
-        sources.add(src);
-        logger.info("Added dataflow source from tensor iterable: " + src + ".");
-        return true;
-      } else {
-        // Use interprocedural analysis using the PA.
-        boolean added =
-            processInstructionInterprocedurally(
-                instruction, use, node, src, sources, pointerAnalysis);
-
-        if (added) return true;
-        else
-          // keep going up.
-          return processInstruction(def, du, node, src, sources, callGraph, pointerAnalysis);
+      PointerAnalysis<InstanceKey> pointerAnalysis,
+      Set<SSAInstruction> seen) {
+    if (seen.contains(instruction))
+      logger.fine(() -> "Skipping instruction: " + instruction + ". We've seen it before.");
+    else {
+      logger.fine(() -> "Processing instruction: " + instruction + ".");
+      seen.add(instruction);
+
+      if (instruction != null && instruction.getNumberOfUses() > 0) {
+        int use = instruction.getUse(0);
+        SSAInstruction def = du.getDef(use);
+
+        // First try intraprocedural analysis.
+        if (definesTensorIterable(def, node, callGraph, pointerAnalysis)) {
+          sources.add(src);
+          logger.info("Added dataflow source from tensor iterable: " + src + ".");
+          return true;
+        } else {
+          // Use interprocedural analysis using the PA.
+          boolean added =
+              processInstructionInterprocedurally(
+                  instruction, use, node, src, sources, pointerAnalysis);
+
+          if (added) return true;
+          else
+            // keep going up.
+            return processInstruction(
+                def, du, node, src, sources, callGraph, pointerAnalysis, seen);
+        }
       }
     }

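The guard above is the whole fix: every recursive call now shares a single seen set, so a cyclic def-use chain can no longer send processInstruction into unbounded recursion. The same pattern in a minimal, self-contained Python sketch (the toy graph, names, and predicate below are illustrative stand-ins, not the WALA API):

# Toy def-use chain with a cycle: "a" uses a value defined by "b", which uses
# one defined by "c", which in turn uses one defined by "a" again.
DEF_OF = {"a": "b", "b": "c", "c": "a"}
SOURCES = set()  # stand-in for "instructions that define a tensor iterable"

def process(instr, seen=None):
    """Walk up the def-use chain, visiting each instruction at most once."""
    if seen is None:
        seen = set()
    if instr is None or instr in seen:
        return False                          # seen before (or nothing left): stop
    seen.add(instr)
    if instr in SOURCES:                      # found a dataflow source
        return True
    return process(DEF_OF.get(instr), seen)   # otherwise keep going up

print(process("a"))  # prints False and terminates despite the cycle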
multigpu_training.py (new file)

+235

@@ -0,0 +1,235 @@
# From https://github.com/aymericdamien/TensorFlow-Examples/blob/6dcbe14649163814e72a22a999f20c5e247ce988/tensorflow_v2/notebooks/6_Hardware/multigpu_training.ipynb.
# %%
"""
# Multi-GPU Training Example

Train a convolutional neural network on multiple GPUs with TensorFlow 2.0+.

- Author: Aymeric Damien
- Project: https://github.com/aymericdamien/TensorFlow-Examples/
"""

# %%
"""
## Training with multiple GPU cards

In this example, we are using data parallelism to split the training across multiple GPUs. Each GPU has a full replica of the neural network model, and the weights (i.e. variables) are updated synchronously by waiting for each GPU to process its batch of data.

First, each GPU processes a distinct batch of data and computes the corresponding gradients; then all gradients are accumulated on the CPU and averaged. The model weights are finally updated with the averaged gradients, and the new model weights are sent back to each GPU to repeat the training process.

<img src="https://www.tensorflow.org/images/Parallelism.png" alt="Parallelism" style="width: 400px;"/>

## CIFAR10 Dataset Overview

The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes, with 6000 images per class. There are 50000 training images and 10000 test images.

![CIFAR10 Dataset](https://storage.googleapis.com/kaggle-competitions/kaggle/3649/media/cifar-10.png)

More info: https://www.cs.toronto.edu/~kriz/cifar.html
"""

# %%

import tensorflow as tf
from tensorflow.keras import Model, layers
import time
import numpy as np

# %%
# CIFAR-10 dataset parameters.
num_classes = 10 # total number of classes.
num_gpus = 4

# Training parameters.
learning_rate = 0.001
training_steps = 1000
# Split batch size equally between GPUs.
# Note: Reduce batch size if you encounter OOM Errors.
batch_size = 1024 * num_gpus
display_step = 20

# Network parameters.
conv1_filters = 64 # number of filters for 1st conv layer.
conv2_filters = 128 # number of filters for 2nd conv layer.
conv3_filters = 256 # number of filters for 3rd conv layer.
fc1_units = 2048 # number of neurons for 1st fully-connected layer.

# %%
# Prepare CIFAR-10 data.
from tensorflow.keras.datasets import cifar10
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
# Convert to float32.
x_train, x_test = np.array(x_train, np.float32), np.array(x_test, np.float32)
# Normalize image values from [0, 255] to [0, 1].
x_train, x_test = x_train / 255., x_test / 255.
y_train, y_test = np.reshape(y_train, (-1)), np.reshape(y_test, (-1))

# %%
# Use tf.data API to shuffle and batch data.
train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_data = train_data.repeat().shuffle(batch_size * 10).batch(batch_size).prefetch(num_gpus)


# %%
class ConvNet(Model):

    # Set layers.
    def __init__(self):
        super(ConvNet, self).__init__()

        # Convolution Layer with 64 filters and a kernel size of 3.
        self.conv1_1 = layers.Conv2D(conv1_filters, kernel_size=3, padding='SAME', activation=tf.nn.relu)
        self.conv1_2 = layers.Conv2D(conv1_filters, kernel_size=3, padding='SAME', activation=tf.nn.relu)
        # Max Pooling (down-sampling) with kernel size of 2 and strides of 2.
        self.maxpool1 = layers.MaxPool2D(2, strides=2)

        # Convolution Layer with 128 filters and a kernel size of 3.
        self.conv2_1 = layers.Conv2D(conv2_filters, kernel_size=3, padding='SAME', activation=tf.nn.relu)
        self.conv2_2 = layers.Conv2D(conv2_filters, kernel_size=3, padding='SAME', activation=tf.nn.relu)
        self.conv2_3 = layers.Conv2D(conv2_filters, kernel_size=3, padding='SAME', activation=tf.nn.relu)
        # Max Pooling (down-sampling) with kernel size of 2 and strides of 2.
        self.maxpool2 = layers.MaxPool2D(2, strides=2)

        # Convolution Layer with 256 filters and a kernel size of 3.
        self.conv3_1 = layers.Conv2D(conv3_filters, kernel_size=3, padding='SAME', activation=tf.nn.relu)
        self.conv3_2 = layers.Conv2D(conv3_filters, kernel_size=3, padding='SAME', activation=tf.nn.relu)
        self.conv3_3 = layers.Conv2D(conv3_filters, kernel_size=3, padding='SAME', activation=tf.nn.relu)

        # Flatten the data to a 1-D vector for the fully connected layer.
        self.flatten = layers.Flatten()

        # Fully connected layer.
        self.fc1 = layers.Dense(1024, activation=tf.nn.relu)
        # Apply Dropout (if is_training is False, dropout is not applied).
        self.dropout = layers.Dropout(rate=0.5)

        # Output layer, class prediction.
        self.out = layers.Dense(num_classes)

    # Set forward pass.
    @tf.function
    def call(self, x, is_training=False):
        x = self.conv1_1(x)
        x = self.conv1_2(x)
        x = self.maxpool1(x)
        x = self.conv2_1(x)
        x = self.conv2_2(x)
        x = self.conv2_3(x)
        x = self.maxpool2(x)
        x = self.conv3_1(x)
        x = self.conv3_2(x)
        x = self.conv3_3(x)
        x = self.flatten(x)
        x = self.fc1(x)
        x = self.dropout(x, training=is_training)
        x = self.out(x)
        if not is_training:
            # tf cross entropy expect logits without softmax, so only
            # apply softmax when not training.
            x = tf.nn.softmax(x)
        return x


# %%
# Cross-Entropy Loss.
# Note that this will apply 'softmax' to the logits.
@tf.function
def cross_entropy_loss(x, y):
    # Convert labels to int 64 for tf cross-entropy function.
    y = tf.cast(y, tf.int64)
    # Apply softmax to logits and compute cross-entropy.
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=x)
    # Average loss across the batch.
    return tf.reduce_mean(loss)


# Accuracy metric.
@tf.function
def accuracy(y_pred, y_true):
    # Predicted class is the index of highest score in prediction vector (i.e. argmax).
    correct_prediction = tf.equal(tf.argmax(y_pred, 1), tf.cast(y_true, tf.int64))
    return tf.reduce_mean(tf.cast(correct_prediction, tf.float32), axis=-1)


@tf.function
def backprop(batch_x, batch_y, trainable_variables):
    # Wrap computation inside a GradientTape for automatic differentiation.
    with tf.GradientTape() as g:
        # Forward pass.
        pred = conv_net(batch_x, is_training=True)
        # Compute loss.
        loss = cross_entropy_loss(pred, batch_y)
        # Compute gradients.
        gradients = g.gradient(loss, trainable_variables)
        return gradients


# Build the function to average the gradients.
@tf.function
def average_gradients(tower_grads):
    avg_grads = []
    for tgrads in zip(*tower_grads):
        grads = []
        for g in tgrads:
            expanded_g = tf.expand_dims(g, 0)
            grads.append(expanded_g)

        grad = tf.concat(axis=0, values=grads)
        grad = tf.reduce_mean(grad, 0)

        avg_grads.append(grad)

    return avg_grads


# %%
with tf.device('/cpu:0'):
    # Build convnet.
    conv_net = ConvNet()
    # Adam optimizer.
    optimizer = tf.optimizers.Adam(learning_rate)


# %%
# Optimization process.
def run_optimization(x, y):
    # Save gradients for all GPUs.
    tower_grads = []
    # Variables to update, i.e. trainable variables.
    trainable_variables = conv_net.trainable_variables

    with tf.device('/cpu:0'):
        for i in range(num_gpus):
            # Split data between GPUs.
            gpu_batch_size = int(batch_size / num_gpus)
            batch_x = x[i * gpu_batch_size: (i + 1) * gpu_batch_size]
            batch_y = y[i * gpu_batch_size: (i + 1) * gpu_batch_size]

            # Build the neural net on each GPU.
            with tf.device('/gpu:%i' % i):
                grad = backprop(batch_x, batch_y, trainable_variables)
                tower_grads.append(grad)

                # On the last GPU, average the gradients from all GPUs.
                if i == num_gpus - 1:
                    gradients = average_gradients(tower_grads)

        # Update vars following gradients.
        optimizer.apply_gradients(list(zip(gradients, trainable_variables)))


# %%
# Run training for the given number of steps.
ts = time.time()
for step, (batch_x, batch_y) in enumerate(train_data.take(training_steps), 1):
    # Run the optimization to update W and b values.
    run_optimization(batch_x, batch_y)

    if step % display_step == 0 or step == 1:
        dt = time.time() - ts
        speed = batch_size * display_step / dt
        pred = conv_net(batch_x)
        loss = cross_entropy_loss(pred, batch_y)
        acc = accuracy(pred, batch_y)
        print(("step: %i, loss: %f, accuracy: %f, speed: %f examples/sec" % (step, loss, acc, speed)))
        ts = time.time()

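For reference, the averaging that average_gradients performs reduces, per trainable variable, to an element-wise mean over the per-GPU ("tower") gradients. A small self-contained check of that equivalence, assuming TensorFlow 2.x (the tensor values are made up):

import tensorflow as tf

# Two towers, each holding gradients for the same two variables.
tower_grads = [
    [tf.constant([1.0, 2.0]), tf.constant([[1.0]])],  # gradients from GPU 0
    [tf.constant([3.0, 4.0]), tf.constant([[3.0]])],  # gradients from GPU 1
]

# Stack each variable's per-tower gradients and take the mean along the tower
# axis; this matches what average_gradients computes via expand_dims + concat
# + reduce_mean.
avg = [tf.reduce_mean(tf.stack(grads, axis=0), axis=0) for grads in zip(*tower_grads)]

print([g.numpy() for g in avg])  # [array([2., 3.], ...), array([[2.]], ...)]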
0 commit comments