#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
estimate_gradient_norm.py
A multithreaded gradient norm sampler
Copyright (C) 2017-2018, IBM Corp.
Copyright (C) 2017, Lily Weng <[email protected]>
and Huan Zhang <[email protected]>
This program is licensed under the Apache 2.0 licence,
contained in the LICENCE file in this directory.
"""
from __future__ import division
import numpy as np
import random
import ctypes
import time
import sys
import os
import tensorflow as tf
from multiprocessing import Pool, current_process, cpu_count
from shmemarray import ShmemRawArray, NpShmemArray
from functools import partial
from randsphere import randsphere
from tensorflow.python.ops import gradients_impl
class EstimateLipschitz(object):
def __init__(self, sess, seed = 1215, nthreads = 0):
"""
sess: tensorflow session
Nsamp: number of samples to take per iteration
Niters: number of iterations, each iteration we return a max L
"""
self.sess = sess
self.seed = seed
# create a pool of workers to compute samples in advance
if nthreads == 0:
self.n_processes = max(cpu_count() // 2, 1)
else:
self.n_processes = nthreads
# set up random seed during initialization
def initializer(s):
np.random.seed(s + current_process()._identity[0])
# using only 1 OpenMP thread
os.environ['OMP_NUM_THREADS'] = "1"
self.pool = Pool(processes = self.n_processes, initializer = initializer, initargs=(self.seed,))
def load_model(self, dataset = "mnist", model_name = "2-layer", activation = "relu", model = None, batch_size = 0, compute_slope = False, order = 1):
"""
model: if set to None, then load dataset with model_name. Otherwise use the model directly.
dataset: mnist, cifar and imagenet. recommend to use mnist and cifar as a starting point.
model_name: possible options are 2-layer, distilled, and normal
"""
from setup_cifar import CIFAR, CIFARModel, TwoLayerCIFARModel
from setup_mnist import MNIST, MNISTModel, TwoLayerMNISTModel
from nlayer_model import NLayerModel
from setup_imagenet import ImageNet, ImageNetModel
# if set this to true, we will use the logit layer output instead of probability
# the logit layer's gradients are usually larger and more stable
output_logits = True
self.dataset = dataset
self.model_name = model_name
if model is None:
print('Loading model...')
if dataset == "mnist":
self.batch_size = 1024
if model_name == "2-layer":
model = TwoLayerMNISTModel("models/mnist_2layer", self.sess, not output_logits)
elif model_name == "normal":
if activation == "relu":
model = MNISTModel("models/mnist", self.sess, not output_logits)
else:
print("actviation = {}".format(activation))
model = MNISTModel("models/mnist_cnn_7layer_"+activation, self.sess, not output_logits, activation= activation)
time.sleep(5)
elif model_name == "brelu":
model = MNISTModel("models/mnist_brelu", self.sess, not output_logits, use_brelu = True)
elif model_name == "distilled":
model = MNISTModel("models/mnist-distilled-100", self.sess, not output_logits)
else:
# specify model parameters as N,M,opts
model_params = model_name.split(",")
                    if len(model_params) < 3:
                        raise RuntimeError("incorrect model option " + model_name)
numlayer = int(model_params[0])
nhidden = int(model_params[1])
modelfile = "models/mnist_{}layer_relu_{}_{}".format(numlayer, nhidden, model_params[2])
print("loading", modelfile)
model = NLayerModel([nhidden] * (numlayer - 1), modelfile)
elif dataset == "cifar":
self.batch_size = 1024
if model_name == "2-layer":
model = TwoLayerCIFARModel("models/cifar_2layer", self.sess, not output_logits)
elif model_name == "normal":
if activation == "relu":
model = CIFARModel("models/cifar", self.sess, not output_logits)
else:
model = CIFARModel("models/cifar_cnn_7layer_"+activation, self.sess, not output_logits, activation = activation)
elif model_name == "brelu":
model = CIFARModel("models/cifar_brelu", self.sess, not output_logits, use_brelu = True)
elif model_name == "distilled":
model = CIFARModel("models/cifar-distilled-100", self.sess, not output_logits)
else:
# specify model parameters as N,M,opts
model_params = model_name.split(",")
                    if len(model_params) < 3:
                        raise RuntimeError("incorrect model option " + model_name)
numlayer = int(model_params[0])
nhidden = int(model_params[1])
modelfile = "models/cifar_{}layer_relu_{}_{}".format(numlayer, nhidden, model_params[2])
print("loading", modelfile)
model = NLayerModel([nhidden] * (numlayer - 1), modelfile, image_size=32, image_channel=3)
elif dataset == "imagenet":
self.batch_size = 32
model = ImageNetModel(self.sess, use_softmax = not output_logits, model_name = model_name, create_prediction = False)
else:
raise(RuntimeError("dataset unknown"))
#print("*** Loaded model successfully")
self.model = model
self.compute_slope = compute_slope
if batch_size != 0:
self.batch_size = batch_size
## placeholders: self.img, self.true_label, self.target_label
# img is the placeholder for image input
self.img = tf.placeholder(shape = [None, model.image_size, model.image_size, model.num_channels], dtype = tf.float32)
# output is the output tensor of the entire network
self.output = model.predict(self.img)
# create the graph to compute gradient
# get the desired true label and target label
self.true_label = tf.placeholder(dtype = tf.int32, shape = [])
self.target_label = tf.placeholder(dtype = tf.int32, shape = [])
true_output = self.output[:, self.true_label]
target_output = self.output[:, self.target_label]
# get the difference
self.objective = true_output - target_output
        # get the gradient of the objective with respect to the input image
self.grad_op = tf.gradients(self.objective, self.img)[0]
        # compute the gradient norms inside the computation graph (faster than fetching the full gradients into Python)
grad_op_rs = tf.reshape(self.grad_op, (tf.shape(self.grad_op)[0], -1))
self.grad_2_norm_op = tf.norm(grad_op_rs, axis = 1)
self.grad_1_norm_op = tf.norm(grad_op_rs, ord=1, axis = 1)
self.grad_inf_norm_op = tf.norm(grad_op_rs, ord=np.inf, axis = 1)
### Lily: added Hessian-vector product calculation here for 2nd order bound:
if order == 2:
            ## _hessian_vector_product(ys, xs, v): returns a list of tensors containing the product between the Hessian and v
            ## ys: a scalar value, or a tensor or a list of tensors to be summed to yield a scalar
            ## xs: a list of tensors that we should construct the Hessian over
            ## v: a list of tensors with the same shape as xs that we want to multiply by the Hessian
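            # Under the hood, _hessian_vector_product uses Pearlmutter's trick: it computes
            # grad(<grad(ys), v>), so Hv is obtained at roughly the cost of one extra backward
            # pass and the full Hessian is never materialized.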
# self.randv: shape = (Nimg,28,28,1) (the v in _hessian_vector_product)
self.randv = tf.placeholder(shape = [None, model.image_size, model.image_size, model.num_channels], dtype = tf.float32)
# hv_op_tmp: shape = (Nimg,28,28,1) for mnist, same as self.img (the xs in _hessian_vector_product)
hv_op_tmp = gradients_impl._hessian_vector_product(self.objective, [self.img], [self.randv])[0]
# hv_op_rs: reshape hv_op_tmp to hv_op_rs whose shape = (Nimg, 784) for mnist
hv_op_rs = tf.reshape(hv_op_tmp, (tf.shape(hv_op_tmp)[0],-1))
# self.hv_norm_op: norm of hessian vector product, keep shape = (Nimg,1) using keepdims
self.hv_norm_op = tf.norm(hv_op_rs, axis = 1, keepdims=True)
# hv_op_rs_normalize: normalize Hv to Hv/||Hv||, shape = (Nimg, 784)
hv_op_rs_normalize = hv_op_rs/self.hv_norm_op
# self.hv_op: reshape hv_op_rs_normalize to shape = (Nimg,28,28,1)
self.hv_op = tf.reshape(hv_op_rs_normalize, tf.shape(hv_op_tmp))
## reshape randv and compute its norm
# shape: (Nimg, 784)
randv_rs = tf.reshape(self.randv, (tf.shape(self.randv)[0],-1))
# shape: (Nimg,)
self.randv_norm_op = tf.norm(randv_rs, axis = 1)
## compute v'Hv: use un-normalized Hv (hv_op_tmp, hv_op_rs)
# element-wise multiplication and then sum over axis = 1 (now shape: (Nimg,))
self.vhv_op = tf.reduce_sum(tf.multiply(randv_rs,hv_op_rs),axis=1)
## compute Rayleigh quotient: v'Hv/v'v (estimated largest eigenvalue), shape: (Nimg,)
            # note: self.vhv_op and self.randv_norm_op must have the same shape (either (Nimg,) or (Nimg,1))
self.eig_est = self.vhv_op/tf.square(self.randv_norm_op)
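            ## For a symmetric Hessian, the Rayleigh quotient v'Hv/v'v always lies between the smallest
            ## and largest eigenvalues, and iterating v <- Hv/||Hv|| (power iteration, implemented in the
            ## tf.while_loop below) drives it toward the eigenvalue of largest magnitude.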
## Lily added the tf.while to compute the eigenvalue in computational graph later
# cond for computing largest abs/neg eigen-value
def cond(it, randv, eig_est, eig_est_prev, tfconst):
norm_diff = tf.norm(eig_est-eig_est_prev,axis=0)
return tf.logical_and(it < 500, norm_diff > 0.001)
# compute largest abs eigenvalue: tfconst = 0
# compute largest neg eigenvalue: tfconst = 10
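            # Shifting by tfconst runs power iteration on (H - c*I), whose spectrum is H's shifted down
            # by c; choosing c as the largest positive eigenvalue makes the most negative eigenvalue of H
            # dominant, and adding c back afterwards (see self.while_eig_1) recovers its value.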
def body(it, randv, eig_est, eig_est_prev, tfconst):
#hv_op_tmp = gradients_impl._hessian_vector_product(self.objective, [self.img], [randv])[0]-10*randv
hv_op_tmp = gradients_impl._hessian_vector_product(self.objective, [self.img], [randv])[0]-tf.multiply(tfconst,randv)
hv_op_rs = tf.reshape(hv_op_tmp, (tf.shape(hv_op_tmp)[0],-1))
hv_norm_op = tf.norm(hv_op_rs, axis = 1, keepdims=True)
hv_op_rs_normalize = hv_op_rs/hv_norm_op
hv_op = tf.reshape(hv_op_rs_normalize, tf.shape(hv_op_tmp))
randv_rs = tf.reshape(randv, (tf.shape(randv)[0],-1))
randv_norm_op = tf.norm(randv_rs, axis = 1)
vhv_op = tf.reduce_sum(tf.multiply(randv_rs,hv_op_rs),axis=1)
eig_est_prev = eig_est
eig_est = vhv_op/tf.square(randv_norm_op)
return (it+1, hv_op, eig_est, eig_est_prev, tfconst)
it = tf.constant(0)
# compute largest abs eigenvalue
result = tf.while_loop(cond, body, [it, self.randv, self.vhv_op, self.eig_est, tf.constant(0.0)])
# compute largest neg eigenvalue
self.shiftconst = tf.placeholder(shape = (), dtype = tf.float32)
result_1 = tf.while_loop(cond, body, [it, self.randv, self.vhv_op, self.eig_est, self.shiftconst])
# computing largest abs eig value and save result
self.it = result[0]
self.while_hv_op = result[1]
self.while_eig = result[2]
# computing largest neg eig value and save result
self.it_1 = result_1[0]
#self.while_eig_1 = tf.add(result_1[2], tfconst)
self.while_eig_1 = tf.add(result_1[2], result_1[4])
show_tensor_op = False
if show_tensor_op:
print("====================")
print("Define hessian_vector_product operator: ")
print("hv_op_tmp = {}".format(hv_op_tmp))
print("hv_op_rs = {}".format(hv_op_rs))
print("self.hv_norm_op = {}".format(self.hv_norm_op))
print("hv_op_rs_normalize = {}".format(hv_op_rs_normalize))
print("self.hv_op = {}".format(self.hv_op))
print("self.grad_op = {}".format(self.grad_op))
print("randv_rs = {}".format(randv_rs))
print("self.randv_norm_op = {}".format(self.randv_norm_op))
print("self.vhv_op = {}".format(self.vhv_op))
print("self.eig_est = {}".format(self.eig_est))
print("====================")
return self.img, self.output
def _estimate_Lipschitz_multiplerun(self, num, niters, input_image, target_label, true_label, sample_norm = "l2", transform=None, order = 1):
"""
num: number of samples per iteration
niters: number of iterations
input_image: original image (h*w*c)
"""
batch_size = self.batch_size
shape = (batch_size, self.model.image_size, self.model.image_size, self.model.num_channels)
dimension = self.model.image_size * self.model.image_size * self.model.num_channels
if num < batch_size:
print("Increasing num to", batch_size)
num = batch_size
"""
1. Compute input_image related quantities:
"""
# get the original prediction and gradient, gradient norms values on input image:
pred, grad_val, grad_2_norm_val, grad_1_norm_val, grad_inf_norm_val = self.sess.run(
[self.output, self.grad_op, self.grad_2_norm_op, self.grad_1_norm_op, self.grad_inf_norm_op],
feed_dict = {self.img: [input_image], self.true_label: true_label, self.target_label: target_label})
pred = np.squeeze(pred)
# print(pred)
# print(grad_val)
# class c and class j in Hein's paper. c is original class
c = true_label
j = target_label
# get g_x0 = f_c(x_0) - f_j(x_0)
g_x0 = pred[c] - pred[j]
# grad_z_norm should be scalar
g_x0_grad_2_norm = np.squeeze(grad_2_norm_val)
g_x0_grad_1_norm = np.squeeze(grad_1_norm_val)
g_x0_grad_inf_norm = np.squeeze(grad_inf_norm_val)
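        # These quantities drive the robustness lower bound from Hein & Andriushchenko (and CLEVER):
        # as long as ||delta||_p <= g(x_0) / max_x ||grad g(x)||_q with 1/p + 1/q = 1 (the dual norm),
        # g stays positive and the prediction cannot change from class c to class j.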
print("** Evaluating g_x0, grad_2_norm_val on the input image x0: ")
print("shape of input_image = {}".format(input_image.shape))
print("g_x0 = {:.3f}, grad_2_norm_val = {:3f}, grad_1_norm_val = {:.3f}, grad_inf_norm_val = {:3f}".format(g_x0, g_x0_grad_2_norm, g_x0_grad_1_norm, g_x0_grad_inf_norm))
##### Lily #####
if order == 2: # evaluate the hv and hv norm on input_image
# set randv as a random matrix with the same shape as input_image
print("** Evaluating hv and hv_norm on the input image x0:")
randv = np.random.randn(*input_image.shape)
hv, hv_norm = self.sess.run([self.hv_op, self.hv_norm_op],
feed_dict = {self.img: [input_image], self.randv:[randv], self.true_label: true_label, self.target_label: target_label})
print("hv shape = {}, hv_norm = {}".format(hv.shape, hv_norm))
"""
2. Prepare for sampling:
"""
def div_work_to_cores(njobs, nprocs):
process_item_list = []
while njobs > 0:
process_item_list.append(int(np.ceil(njobs / float(nprocs))))
njobs -= process_item_list[-1]
nprocs -= 1
return process_item_list
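        # A worked example of the split above (an illustration, not from the original code):
        # div_work_to_cores(10, 4) yields [3, 3, 2, 2]; the ceiling division keeps the per-process
        # counts within one item of each other while they sum to njobs.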
# n is the dimension
if self.dataset == "imagenet":
# for imagenet, generate random samples for this batch only
# array in shared memory storing results of all threads
total_item_size = batch_size
else:
# for cifar and mnist, generate random samples for this entire iteration
total_item_size = num
# divide the jobs evenly to all available threads
process_item_list = div_work_to_cores(total_item_size, self.n_processes)
self.n_processes = len(process_item_list)
# select random sample generation function
if sample_norm == "l2":
# the scaling constant in [a,b]: scale the L2 norm of each sample (has originally norm ~1)
a = 0; b = 3;
elif sample_norm == "li":
# for Linf we don't need the scaling
a = 0.1; b = 0.1;
elif sample_norm == "l1":
# TODO: make the sample ball radius adjustable
a = 0; b = 30;
else:
raise RuntimeError("Unknown sample_norm " + sample_norm)
print('Using sphere', sample_norm)
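        # randsphere (see randsphere.py) draws perturbations with roughly unit norm; the scale array
        # below then stretches each sample to a radius in [a, b] around the input image.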
        ## create the necessary shared array structures (saved in /dev/shm); they will be used (and written) in randsphere.py:
        # result_arr, scale, input_example, all_inputs
        # note: assign with scale[:] = ..., not scale = ..., otherwise the contents will not be saved to the shared array
# inputs_0 is the image x_0
inputs_0 = np.array(input_image)
tag_prefix = str(os.getpid()) + "_"
result_arr = NpShmemArray(np.float32, (total_item_size, dimension), tag_prefix + "randsphere")
# we have an extra batch_size to avoid overflow
scale = NpShmemArray(np.float32, (num+batch_size), tag_prefix + "scale")
        scale[:] = (b-a)*np.random.rand(num+batch_size)+a
input_example = NpShmemArray(np.float32, inputs_0.shape, tag_prefix + "input_example")
# this is a read-only array
input_example[:] = inputs_0
        # all_inputs is a shared memory array that randsphere writes into to save the samples
# all_inputs holds the perturbations for one batch or all samples
all_inputs = NpShmemArray(np.float32, (total_item_size,) + inputs_0.shape, tag_prefix + "all_inputs")
# holds the results copied from all_inputs
clipped_all_inputs = np.empty(dtype=np.float32, shape = (total_item_size,) + inputs_0.shape)
# prepare the argument list
offset_list = [0]
for item in process_item_list[:-1]:
offset_list.append(offset_list[-1] + item)
print(self.n_processes, "threads launched with parameter", process_item_list, offset_list)
## create multiple process to generate samples
# randsphere: generate samples (see randsphere.py); partial is a function similar to lambda, now worker_func is a function of idx only
worker_func = partial(randsphere, n = dimension, input_shape = inputs_0.shape, total_size = total_item_size, scale_size = num+batch_size, tag_prefix = tag_prefix, r = 1.0, norm = sample_norm, transform = transform)
worker_args = list(zip(process_item_list, offset_list, [0] * self.n_processes))
# sample_results is an object to monitor if the process has ended (meaning finish generating samples in randsphere.py)
# this line of code will initiate the worker_func to start working (like initiate the job)
sample_results = self.pool.map_async(worker_func, worker_args)
        # num: total number of samples to be evaluated per iteration
        # number of iterations
        Niters = niters
if order == 1:
# store the max L in each iteration
L2_max = np.zeros(Niters)
L1_max = np.zeros(Niters)
Li_max = np.zeros(Niters)
# store the max G in each iteration
G2_max = np.zeros(Niters)
G1_max = np.zeros(Niters)
Gi_max = np.zeros(Niters)
            # store computed Lipschitz constants in each iteration
L2 = np.zeros(num)
L1 = np.zeros(num)
Li = np.zeros(num)
# store computed gradient norm in each iteration
G2 = np.zeros(num)
G1 = np.zeros(num)
Gi = np.zeros(num)
elif order == 2:
# store the max H in each iteration
H2_max = np.zeros(Niters)
# store computed 2 norm of H in each iteration
H2 = np.zeros(num)
H2_neg = np.zeros(num)
# how many batches we have
Nbatches = num // batch_size
# timer
search_begin_time = time.time()
"""
3. Start performing sampling:
"""
## Start
# multiple runs: generating the samples
## use worker_func to generate x samples, and then use sess.run to evaluate the gradient norm operator
for iters in range(Niters):
iter_begin_time = time.time()
# shuffled index
# idx_shuffle = np.random.permutation(num);
# the scaling constant in [a,b]: scale the L2 norm of each sample (has originally norm ~1)
            scale[:] = (b-a)*np.random.rand(num+batch_size)+a
# number of L's we have computed
L_counter = 0
G_counter = 0
H_counter = 0
overhead_time = 0.0
overhead_start = time.time()
# for cifar and mnist, generate all the random input samples (x in the paper) at once
# for imagenet, generate one batch of input samples (x in the paper) for each iteration
if self.dataset != "imagenet":
                # get samples for this iteration: make sure randsphere has finished computing the samples and stored them in all_inputs
                # if the samples are not done generating yet, this call blocks until the worker processes finish, then returns
sample_results.get()
# copy the results to a buffer and do clipping
np.clip(all_inputs, -0.5, 0.5, out = clipped_all_inputs)
                # start a new job to generate samples for the next iteration: the sess.run below runs on the GPU and can be
                # slow, so we generate samples on the CPU while the GPU evaluates the current batch, to save time
sample_results = self.pool.map_async(worker_func, worker_args)
overhead_time += time.time() - overhead_start
## generate input samples "batch_inputs" and compute corresponding gradient norms samples "perturbed_grad_x_norm"
for i in range(Nbatches):
overhead_start = time.time()
# for imagenet, generate random samples for this batch only
if self.dataset == "imagenet":
# get samples for this batch
sample_results.get()
# copy the results to a buffer and do clipping
np.clip(all_inputs, -0.5, 0.5, out = clipped_all_inputs)
# create multiple threads to generate samples for next batch
worker_args = zip(process_item_list, offset_list, [(i + 1) * batch_size] * self.n_processes)
sample_results = self.pool.map_async(worker_func, worker_args)
if self.dataset == "imagenet":
# we generate samples for each batch at a time
batch_inputs = clipped_all_inputs
else:
# we generate samples for all batches
batch_inputs = clipped_all_inputs[i * batch_size: (i + 1) * batch_size]
# print(result_arr.shape, result_arr)
# print('------------------------')
# print(batch_inputs.shape, batch_inputs.reshape(result_arr.shape))
# print('------------------------')
overhead_time += time.time() - overhead_start
if order == 1:
# run inference and get the gradient
perturbed_predicts, perturbed_grad_2_norm, perturbed_grad_1_norm, perturbed_grad_inf_norm = self.sess.run(
[self.output, self.grad_2_norm_op, self.grad_1_norm_op, self.grad_inf_norm_op],
feed_dict = {self.img: batch_inputs, self.target_label: target_label, self.true_label: true_label})
if self.compute_slope:
s = batch_inputs.reshape(batch_size, -1)
                        # compute the distance between disjoint sample pairs (2k, 2k+1)
s12_2_norm = np.linalg.norm(s[0:batch_size-1:2] - s[1:batch_size:2], axis = 1)
s12_1_norm = np.linalg.norm(s[0:batch_size-1:2] - s[1:batch_size:2], ord=1, axis = 1)
s12_i_norm = np.linalg.norm(s[0:batch_size-1:2] - s[1:batch_size:2], ord=np.inf, axis = 1)
                        # compute function value differences over the same disjoint pairs
g_x1 = perturbed_predicts[0:batch_size-1:2, c] - perturbed_predicts[0:batch_size-1:2, j]
g_x2 = perturbed_predicts[1:batch_size:2, c] - perturbed_predicts[1:batch_size:2, j]
# estimated Lipschitz constants for this batch
# for slope estimate, we need the DUAL norm
batch_L2 = np.abs(g_x1 - g_x2) / s12_2_norm
batch_L1 = np.abs(g_x1 - g_x2) / s12_i_norm
batch_Li = np.abs(g_x1 - g_x2) / s12_1_norm
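                        # By Hölder's inequality, |g(x1)-g(x2)| <= ||grad g||_q * ||x1-x2||_p with
                        # 1/p + 1/q = 1: hence the L1 slope estimate divides by the Linf distance,
                        # the Linf estimate divides by the L1 distance, and L2 is self-dual.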
L2[L_counter : L_counter + batch_size//2] = batch_L2
L1[L_counter : L_counter + batch_size//2] = batch_L1
Li[L_counter : L_counter + batch_size//2] = batch_Li
G2[G_counter : G_counter + batch_size] = perturbed_grad_2_norm
G1[G_counter : G_counter + batch_size] = perturbed_grad_1_norm
Gi[G_counter : G_counter + batch_size] = perturbed_grad_inf_norm
L_counter += (batch_size//2)
G_counter += batch_size
elif order == 2:
##### Lily #####
randv_batch = np.random.randn(*batch_inputs.shape)
perturbed_hv, perturbed_hv_norm = self.sess.run([self.hv_op, self.hv_norm_op],
feed_dict = {self.img: batch_inputs, self.randv: randv_batch,
self.true_label: true_label, self.target_label: target_label})
show_tensor_dim = False
if show_tensor_dim:
print("====================")
print("** Evaluating perturbed_hv and perturbed_hv_norm in batch {}: ".format(iters))
print("pertubed_hv_prod shape = {}".format(perturbed_hv.shape))
print("randv_batch shape = {}".format(randv_batch.shape))
print("perturbed_hv_norm = {}".format(perturbed_hv_norm[:,0])) # size: (Nimg, 1)
print("perturbed_hv_norm shape = {}".format(perturbed_hv_norm.shape))
#print("perturbed_grad_2_norm= {}".format(perturbed_grad_2_norm))
#print("perturbed_grad_2_norm shape = {}".format(perturbed_grad_2_norm.shape))
pt_hvs = []
pt_hvs.append(perturbed_hv+0*randv_batch)
#print("************** Using tf.while_loop:********************")
# compute max eigenvalue
temp_hv, temp_eig, niter_eig = self.sess.run([self.while_hv_op, self.while_eig, self.it], feed_dict = {self.img: batch_inputs, self.randv: randv_batch, self.true_label: true_label, self.target_label: target_label})
##print("converge in {} steps, temp_eig = {}".format(niter_eig, temp_eig))
# if max eigenvalue is positive, compute the max neg eigenvalue by using the shiftconst
if max(temp_eig) > 0:
shiftconst = max(temp_eig)
temp_eig_1, niter_eig_1 = self.sess.run([self.while_eig_1, self.it_1], feed_dict = {self.img: batch_inputs, self.randv: randv_batch, self.true_label: true_label, self.target_label: target_label, self.shiftconst: shiftconst})
##print("converge in {} steps, temp_eig_1 = {}".format(niter_eig_1, temp_eig_1))
else:
temp_eig_1 = temp_eig
niter_eig_1 = -1
print("temp_eig (abs) converge in {} steps, temp_eig_1 (neg) converge in {} steps".format(niter_eig, niter_eig_1))
## use outer while_loop
#max_eig_iters = 10
#print_flag = True
#final_est_eig_1 = self._compute_max_abseig(pt_hvs, batch_inputs, true_label, target_label, max_eig_iters, print_flag)
#print("************** Using outer while_loop:********************")
#print("outer loop final_est_eig_1 = {}".format(final_est_eig_1))
## use tf while_loop
final_est_eig = temp_eig
final_est_eig_neg = temp_eig_1
H2[H_counter : H_counter + batch_size] = final_est_eig
H2_neg[H_counter : H_counter + batch_size] = final_est_eig_neg
H_counter += batch_size
if order == 1:
# at the end of each iteration: get the per-iteration max gradient norm
if self.compute_slope:
L2_max[iters] = np.max(L2)
L1_max[iters] = np.max(L1)
Li_max[iters] = np.max(Li)
G2_max[iters] = np.max(G2)
G1_max[iters] = np.max(G1)
Gi_max[iters] = np.max(Gi)
if self.compute_slope:
print('[STATS][L2] loop = {}, time = {:.5g}, iter_time = {:.5g}, overhead = {:.5g}, L2 = {:.5g}, L1 = {:.5g}, Linf = {:.5g}, G2 = {:.5g}, G1 = {:.5g}, Ginf = {:.5g}'.format(iters, time.time() - search_begin_time, time.time() - iter_begin_time, overhead_time, L2_max[iters], L1_max[iters], Li_max[iters], G2_max[iters], G1_max[iters], Gi_max[iters]))
else:
print('[STATS][L2] loop = {}, time = {:.5g}, iter_time = {:.5g}, overhead = {:.5g}, G2 = {:.5g}, G1 = {:.5g}, Ginf = {:.5g}'.format(iters, time.time() - search_begin_time, time.time() - iter_begin_time, overhead_time, G2_max[iters], G1_max[iters], Gi_max[iters]))
sys.stdout.flush()
# reset per iteration L and G by filling 0
if self.compute_slope:
L2.fill(0)
L1.fill(0)
Li.fill(0)
G2.fill(0)
G1.fill(0)
Gi.fill(0)
elif order == 2:
                ## consider -lambda_min: where the dominant eigenvalue is positive, substitute the estimate
                ## of the most negative eigenvalue obtained from the shifted power iteration
                idx = H2 > 0
                H2[idx] = H2_neg[idx]
idx_max = np.argmax(abs(H2))
H2_max[iters] = H2[idx_max]
print('[STATS][L2] loop = {}, time = {:.5g}, iter_time = {:.5g}, overhead = {:.5g}, H2 = {:.5g}'.format(iters, time.time() - search_begin_time, time.time() - iter_begin_time, overhead_time, H2_max[iters]))
if order == 1:
print('[STATS][L1] g_x0 = {:.5g}, L2_max = {:.5g}, L1_max = {:.5g}, Linf_max = {:.5g}, G2_max = {:.5g}, G1_max = {:.5g}, Ginf_max = {:.5g}'.format(
g_x0, np.max(L2_max), np.max(L1_max), np.max(Li_max), np.max(G2_max), np.max(G1_max), np.max(Gi_max)))
            # when computing the bound we need the DUAL norm
if self.compute_slope:
print('[STATS][L1] bnd_L2_max = {:.5g}, bnd_L1_max = {:.5g}, bnd_Linf_max = {:.5g}, bnd_G2_max = {:.5g}, bnd_G1_max = {:.5g}, bnd_Ginf_max = {:.5g}'.format(g_x0/np.max(L2_max), g_x0/np.max(Li_max), g_x0/np.max(L1_max), g_x0/np.max(G2_max), g_x0/np.max(Gi_max), g_x0/np.max(G1_max)))
else:
print('[STATS][L1] bnd_G2_max = {:.5g}, bnd_G1_max = {:.5g}, bnd_Ginf_max = {:.5g}'.format(g_x0/np.max(G2_max), g_x0/np.max(Gi_max), g_x0/np.max(G1_max)))
sys.stdout.flush()
# discard the last batch of samples
sample_results.get()
return [L2_max,L1_max,Li_max,G2_max,G1_max,Gi_max,g_x0,pred]
elif order == 2:
            # positive eigenvalues have already been substituted with their negative counterparts above,
            # so one pass over |H2_max| suffices to find the largest curvature magnitude
            #print("H2_max = {}".format(H2_max))
            H2_max_val = max(abs(H2_max))
print('[STATS][L1] g_x0 = {:.5g}, g_x0_grad_2_norm = {:.5g}, g_x0_grad_1_norm = {:.5g}, g_x0_grad_inf_norm = {:.5g}, H2_max = {:.5g}'.format(g_x0, g_x0_grad_2_norm, g_x0_grad_1_norm, g_x0_grad_inf_norm, H2_max_val))
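            # Second-order bound: from the quadratic model g(x_0 + delta) >= g(x_0) - ||grad g||*r - (H/2)*r^2
            # with r = ||delta||, setting the right-hand side to zero and taking the positive root of
            # (H/2)*r^2 + ||grad g||*r - g(x_0) = 0 gives r = (-||grad g|| + sqrt(||grad g||^2 + 2*g(x_0)*H)) / H.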
bnd = (-g_x0_grad_2_norm + np.sqrt(g_x0_grad_2_norm**2+2*g_x0*H2_max_val))/H2_max_val
print('[STATS][L1] bnd_H2_max = {:.5g}'.format(bnd))
sys.stdout.flush()
sample_results.get()
return [H2_max, g_x0, g_x0_grad_2_norm, g_x0_grad_1_norm, g_x0_grad_inf_norm, pred]
def _compute_max_abseig(self, pt_hvs, batch_inputs, true_label, target_label, max_eig_iters, print_flag):
## compute hv and est_eig:
i = 0
cond = False
pt_eigs = []
print("pt_hvs[0] shape = {}".format(pt_hvs[0].shape))
# perform power iteration loop outside tensorflow
        while i < max_eig_iters and not cond:
tmp_hv, tmp_hv_norm, tmp_vhv, tmp_vnorm, tmp_est_eig = self.sess.run([self.hv_op, self.hv_norm_op, self.vhv_op, self.randv_norm_op, self.eig_est], feed_dict = {self.img: batch_inputs, self.randv: pt_hvs[i], self.true_label: true_label, self.target_label: target_label})
tmp_vhv = np.squeeze(tmp_vhv)
tmp_vnorm = np.squeeze(tmp_vnorm)
tmp_est_eig = np.squeeze(tmp_est_eig)
if print_flag:
#print("current step = {}, norm = {}".format(i, tmp_hv_norm[:,0]))
#print("current step = {}, pt_hv_prod.shape = {}, pt_hvs_norm.shape = {}".format(i,tmp_hv.shape, tmp_hv_norm.shape))
print("current step = {}, est_eig = {}".format(i,tmp_est_eig-0))
#print("current step = {}, vhv = {}".format(i,tmp_vhv))
#print("current step = {}, vnorm (check: should be 1) = {}".format(i,tmp_vnorm))
pt_hvs.append(tmp_hv+0*pt_hvs[i])
pt_eigs.append(tmp_est_eig)
# conditions
if i > 0:
cond_element = abs(tmp_est_eig-pt_eigs[i-1]) < 1e-3
if print_flag:
print("cond = {}".format(cond_element))
cond = cond_element.all()
i+=1
if i == max_eig_iters:
print("==== Reach max iterations!!! ====")
return pt_eigs[-1]
def __del__(self):
# terminate the pool
self.pool.terminate()
def estimate(self, x_0, true_label, target_label, Nsamp, Niters, sample_norm, transform, order):
result = self._estimate_Lipschitz_multiplerun(Nsamp,Niters,x_0,target_label,true_label,sample_norm, transform, order)
return result
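
# ----------------------------------------------------------------------------
# Minimal usage sketch (an illustration, not part of the original module).
# It assumes a TensorFlow 1.x session, that pretrained weights exist under
# "models/mnist", and that setup_mnist.MNIST exposes test_data / test_labels
# (inputs scaled to [-0.5, 0.5]) as elsewhere in this repository.
if __name__ == "__main__":
    from setup_mnist import MNIST

    with tf.Session() as sess:
        estimator = EstimateLipschitz(sess, seed=1215, nthreads=0)
        estimator.load_model(dataset="mnist", model_name="normal")
        data = MNIST()
        x_0 = data.test_data[0]
        true_label = int(np.argmax(data.test_labels[0]))
        target_label = (true_label + 1) % 10
        # 1024 samples per iteration (one batch), 5 iterations, L2 sampling ball,
        # first-order estimate (gradient norms only, no slope estimate)
        result = estimator.estimate(x_0, true_label, target_label,
                                    Nsamp=1024, Niters=5, sample_norm="l2",
                                    transform=None, order=1)
        L2_max, L1_max, Li_max, G2_max, G1_max, Gi_max, g_x0, pred = result
        print("max gradient 2-norm over all samples:", np.max(G2_max))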