From a1c854242a88daf661a9fcedfaa83157ea4c3cf4 Mon Sep 17 00:00:00 2001
From: Ian Goodfellow
Date: Mon, 28 May 2012 13:38:15 -0400
Subject: [PATCH] added some options to jia's code

---
 jiayq/grafting/cifar_random_mb.py |   6 +-
 jiayq/grafting/grafting_mb.py     | 138 ++++++++++++++++--------
 2 files changed, 77 insertions(+), 67 deletions(-)

diff --git a/jiayq/grafting/cifar_random_mb.py b/jiayq/grafting/cifar_random_mb.py
index 061759cb..b4c8963a 100644
--- a/jiayq/grafting/cifar_random_mb.py
+++ b/jiayq/grafting/cifar_random_mb.py
@@ -21,6 +21,7 @@
 parser.add_argument('-l', '--read_local_cache', type=int, default=0, help='whether to read local cache or not')
 parser.add_argument('-c', '--nClass', type=int, default=10, help='number of classes')
 parser.add_argument('-t', '--random_iterations', type=int, default=1, help='number of random iterations')
+parser.add_argument('-s', '--skip_normalization', default=False, action='store_true', help='if set, skip mean/std normalization of the features')
 
 mpi.rootprint(str(sys.argv))
 args = parser.parse_args(sys.argv[1:])
@@ -43,10 +44,11 @@
 
 grafter.load_data_batch(args.data_root, args.batch_size, data_file, label_file,\
                         rootRead = True, \
-                        local_cache_root = args.local_cache_root, read_local_cache = args.read_local_cache)
+                        local_cache_root = args.local_cache_root, read_local_cache = args.read_local_cache,
+                        should_normalize = not args.skip_normalization)
 
 tester.load_data_batch(args.data_root, args.batch_size, test_data_file, test_label_file,\
                        rootRead = True, isTest=True, \
                        local_cache_root = args.local_cache_root, read_local_cache = args.read_local_cache)
 
-grafter.randomselecttest(tester, args.random_iterations)
+grafter.randomselecttest(tester, args.random_iterations, should_normalize = not args.skip_normalization)

diff --git a/jiayq/grafting/grafting_mb.py b/jiayq/grafting/grafting_mb.py
index 85d430c3..1836128a 100644
--- a/jiayq/grafting/grafting_mb.py
+++ b/jiayq/grafting/grafting_mb.py
@@ -62,17 +62,17 @@ def dot_Asafe(A, B, out, matmul_batch=1024):
 
 def exp_safe(x):
     '''
-    compute the safe exp 
+    compute the safe exp
     '''
     return np.exp(np.minimum(x,EXP_MAX))
-    
+
 def gL_bnll(y,f):
     '''
     The BNLL gradient
     '''
     expnyf = exp_safe(-y*f+1)
     return -y*expnyf / (1.0+expnyf)
-    
+
 def LgL_bnll(y,f):
     '''
     jointly computing the loss and gradient is usually faster
@@ -89,7 +89,7 @@ def ObjFuncIncrement(wb, X, y, currwxb, gamma):
     gwb[:-1] = np.dot(X, gL) / X.shape[1] + gamma*w
     gwb[-1] = np.mean(gL)
     return np.mean(L) + gamma/2.0*np.sum(w**2), gwb
-    
+
 class GrafterMPI:
     '''
     The main grafter class, implemented with MPI support
@@ -98,27 +98,27 @@ def __init__(self):
         self.comm = MPI.COMM_WORLD
         self.rank = self.comm.Get_rank()
         self.size = self.comm.Get_size()
-        
-        
+        self.rng = np.random.RandomState([1,2,3]) # seeded RNG, so results are replicable
+
     def safebarrier(self, tag=0, sleep=0.01):
         '''
         This is a better mpi barrier than MPI.comm.barrier(): the original barrier
         may cause idle processes to still occupy the CPU, while this barrier waits.
         '''
         comm = self.comm
-        size = comm.Get_size()
-        if size == 1:
-            return
-        rank = comm.Get_rank()
-        mask = 1
-        while mask < size:
-            dst = (rank + mask) % size
-            src = (rank - mask + size) % size
-            req = comm.isend(None, dst, tag)
-            while not comm.Iprobe(src, tag):
-                time.sleep(sleep)
-            comm.recv(None, src, tag)
-            req.Wait()
+        size = comm.Get_size()
+        if size == 1:
+            return
+        rank = comm.Get_rank()
+        mask = 1
+        while mask < size:
+            dst = (rank + mask) % size
+            src = (rank - mask + size) % size
+            req = comm.isend(None, dst, tag)
+            while not comm.Iprobe(src, tag):
+                time.sleep(sleep)
+            comm.recv(None, src, tag)
+            req.Wait()
             mask <<= 1
@@ -201,7 +201,7 @@ def compute_feature(self, codeLocalid, metabinid, normalize=True, target = None)
         fm.fastmaxm(self.featSlice[codeLocalid], np.nonzero(self.metabins[metabinid])[0], target)
         if normalize:
             fm.normalizev(target, self.mLocal[codeLocalid, metabinid], self.stdLocal[codeLocalid, metabinid])
-    
+
     def compute_feature_for_code(self, codeLocalid, normalize=True):
         '''
         compute all the features for codeLocalid and store them at featBufferPerCode
@@ -223,7 +223,7 @@ def dump_current_state(self, filename):
                     'weights': self.weights,\
                     'b': self.b}, oned_as = 'row')
 
-    def normalize_data(self, m = None, std = None):
+    def normalize_data(self, m = None, std = None, sabotage = False):
         if self.normalized:
             mpi.rootprint('Warning: you are re-normalizing.')
         if m is None or std is None:
@@ -233,18 +233,22 @@ def normalize_data(self, m = None, std = None):
                     self.compute_feature(i,j,normalize=False)
                     self.mLocal[i,j] = np.mean(self.featBuffer)
                     self.stdLocal[i,j] = np.std(self.featBuffer)+1e-8
+                    if sabotage:
+                        self.mLocal[i,j] *= 0.
+                        self.stdLocal[i,j] *= 0.
+                        self.stdLocal[i,j] += 1.
         else:
             self.mLocal[:] = m
             self.stdLocal[:] = std
         self.normalized = True
-    
+
     def load_data_batch(self, root, batch_size, file_template, labelfile, \
                         rootRead = True, isTest = False, \
-                        local_cache_root = None, read_local_cache = False):
+                        local_cache_root = None, read_local_cache = False, should_normalize = True):
         '''
-        load the data in batches. file_template should be 'filename_{}_{}.mat'
-        where the batch size and batch id will be filled. The mat file will
-        contain a variable called 'feat'. labelfile is the file for labels
+        load the data in batches. file_template should be 'filename_{}_{}.mat'
+        where the batch size and batch id will be filled. The mat file will
+        contain a variable called 'feat'. labelfile is the file for labels
         starting from either 0 or 1 (our code converts the labels to 0 ~ nLabel-1).
         '''
         from scipy import io
@@ -340,20 +344,20 @@ def load_data_batch(self, root, batch_size, file_template, labelfile, \
         matdata = None
         self.comm.Bcast(self.rawlabels, root=0)
         for i in range(self.nData):
-            # we need to make the label matrix a -1/1 matrix 
+            # we need to make the label matrix a -1/1 matrix
             self.labels[self.rawlabels[i],i] = 1
 
         if not isTest:
             mpi.rootprint('Normalizing training data')
             timer = Timer()
-            self.normalize_data()
+            self.normalize_data(sabotage = not should_normalize)
             mpi.nodeprint('Normalization took {} secs.'.format(timer.lap()))
-    
+
     def append_feature(self,codeid, metabinid):
         '''
-        find the owner of the feature, broadcast it to all the nodes, and append the 
+        find the owner of the feature, broadcast it to all the nodes, and append the
         feature to the currently selected features if necessary.
-        from the owner of the feature, broadcast this feature and append it to the 
-        current selected features. 
-        Each instance will update the slice of data it 
+        from the owner of the feature, broadcast this feature and append it to the
+        current selected features. Each instance will update the slice of data it
+        is responsible for
         '''
         # find the owner
@@ -367,7 +371,7 @@ def append_feature(self,codeid, metabinid):
         self.selCodeID[self.nSelFeats] = codeid
         self.selMetabinID[self.nSelFeats] = metabinid
         self.nSelFeats += 1
-    
+
     def append_multiple_features(self, codeidlist, metabinidlist, reset=True):
         '''
         Append a set of features in idxlist to the selected Features.
@@ -379,7 +383,7 @@ def append_multiple_features(self, codeidlist, metabinidlist, reset=True):
             self.nSelFeats = 0
         for i in range(len(codeidlist)):
             self.append_feature(codeidlist[i],metabinidlist[i])
-    
+
     def select_new_feature_by_grad(self, samplePerRun):
         '''
         the routine to select a new feature by the gradient magnitude
@@ -387,7 +391,7 @@ def select_new_feature_by_grad(self, samplePerRun):
         # compute the local gradient magnitude for feature selection
         # gL is a nData*nLabel matrix
         self.gL = np.ascontiguousarray(gL_bnll(self.labels, self.curr_wxb).T, dtype=self.dtype)
-        
+
         if samplePerRun == 1:
             # this might take some time: for each feature, we basically
             # need to regenerate features and compute the dot.
@@ -397,7 +401,7 @@ def select_new_feature_by_grad(self, samplePerRun):
                 self.compute_feature_for_code(codeLocalid, normalize=True)
                 #self.localGradMat is a [self.nCodeLocal, self.nMetabins, self.nLabel] matrix
                 self.localGradMat[codeLocalid] = np.dot(self.featBufferPerCode, self.gL)
-            
+
             # for those features that are selected, we need to add their regularizers
             my_features = np.nonzero((self.selCodeID[:self.nSelFeats] >= self.codeRange[0]) & \
                                      (self.selCodeID[:self.nSelFeats] < self.codeRange[1]))[0]
@@ -406,7 +410,7 @@ def select_new_feature_by_grad(self, samplePerRun):
                 self.localGradMat[self.selCodeID[feat]-self.codeRange[0],self.selMetabinID[feat]] \
                         += self.gamma * self.nData * curr_weight
             self.scoreVec[:] = np.sum(self.localGradMat**2, axis=2)
-            
+
             local_opt_feat_id = self.scoreVec.argmax()
             local_opt_feat_codeid = local_opt_feat_id / self.nMetabins
             local_opt_feat_metabinid = local_opt_feat_id % self.nMetabins
@@ -423,7 +427,7 @@ def select_new_feature_by_grad(self, samplePerRun):
             sampleSize = len(not_selected_ones[0])
             # shuffle
             randlist = np.array(range(len(not_selected_ones[0])), dtype=np.int)
-            np.random.shuffle(randlist)
+            self.rng.shuffle(randlist)
             temp_feat_codelocalid = not_selected_ones[0][randlist]
             temp_feat_metabinid = not_selected_ones[1][randlist]
             # do things in batches
@@ -444,17 +448,17 @@ def select_new_feature_by_grad(self, samplePerRun):
                     local_opt_feat_score = temp_opt_feat_score
                     local_opt_feat_codeid = temp_feat_codelocalid[temp_opt_feat_id+start]+self.codeRange[0]
                     local_opt_feat_metabinid = temp_feat_metabinid[temp_opt_feat_id+start]
-        
+
         self.safebarrier()
         [opt_feat_score, opt_feat_codeid, opt_feat_metabinid] = self.comm.allreduce(\
                 [local_opt_feat_score, local_opt_feat_codeid, local_opt_feat_metabinid], \
                 op=MPI.MAX)
-        
+
         return opt_feat_score, opt_feat_codeid, opt_feat_metabinid
 
     def retrain_model(self, nActiveSet=None, samplePerRun = 1.0, factr = 10, pgtol = 1e-08, iprint=-1):
         '''
-        train the current model. Since we often have multiple labels, we will ask 
+        train the current model. Since we often have multiple labels, we will ask
         each node to do one optimization
         '''
         loss = 0
@@ -464,7 +468,7 @@ def retrain_model(self, nActiveSet=None, samplePerRun = 1.0, factr = 10, pgtol =
         gw_reduced = np.zeros((self.nLabel, self.nSelFeats), dtype = self.dtype)
         my_features = np.nonzero((self.selCodeID[:self.nSelFeats] >= self.codeRange[0]) & \
                                  (self.selCodeID[:self.nSelFeats] < self.codeRange[1]))[0]
-        
+
         if samplePerRun == 1:
             # distributed gradient computation - actually, lookup.
             for feat in my_features:
@@ -479,7 +483,7 @@ def retrain_model(self, nActiveSet=None, samplePerRun = 1.0, factr = 10, pgtol =
         # over all the nodes so MPI.SUM will work.
         self.safebarrier()
         self.comm.Allreduce(gw_local, gw_reduced, op = MPI.SUM)
-        
+
         # do approximate model retraining
         for idx in range(self.nLabel):
             if idx % self.size != self.rank:
@@ -495,7 +499,7 @@ def retrain_model(self, nActiveSet=None, samplePerRun = 1.0, factr = 10, pgtol =
         else:
             # we will retrain the model involving the previous 'nActiveSet' number
             # of features.
-            # if nActiveSet == 0, this is equivalent to boosting 
+            # if nActiveSet == 0, this is equivalent to boosting
             # if nActiveSet > 0, we choose the latest features
             # if nActiveSet < 0, we choose the features with the largest gradients
             if nActiveSet >= 0:
@@ -538,14 +542,14 @@ def retrain_model(self, nActiveSet=None, samplePerRun = 1.0, factr = 10, pgtol =
                 self.b[idx] = self.comm.bcast(self.b[idx], root = idx % self.size)
         loss = self.comm.allreduce(loss, op=MPI.SUM)
         return loss
-    
+
     def compute_current_accuracy(self):
         '''
         Using the current w'x+b to predict the accuracy. The label is simply
         determined as the one with the largest wxb value.
-        ''' 
+        '''
        return np.sum( np.argmax(self.curr_wxb, axis=0) == self.rawlabels ) / float(self.nData)
-    
+
     def compute_test_accuracy(self, w, b, confMat = False):
         '''
         compute accuracy for test data
@@ -562,7 +566,7 @@ def compute_test_accuracy(self, w, b, confMat = False):
             return np.sum(predict==self.rawlabels)/float(self.nData), confMat
         else:
             return np.sum(predict == self.rawlabels) / float(self.nData)
-    
+
     def restore_from_dump_file(self, filename, tester=None, dataOnly = False):
         print 'Not implemented yet.'
         '''
@@ -579,7 +583,7 @@ def restore_from_dump_file(self, filename, tester=None, dataOnly = False):
         # debug code
         if self.nSelFeats != nSelFeatsDump:
             print 'Warning: {} != {}'.format(self.nSelFeats, nSelFeatsDump)
-        
+
         if not dataOnly:
             self.weights[:,:nSelFeatsDump] = matdata['weights'][:,:nSelFeatsDump]
             self.b[:] = matdata['b'].reshape(self.nLabel)
@@ -599,11 +603,11 @@ def train_whole_model(self, tester=None):
         mpi.rootprint('*'*46)
         mpi.rootprint('*'*15+'whole featureset'+'*'*15)
         mpi.rootprint('*'*46)
-        
+
         if tester is not None:
             # normalize the test data with the stats of the training data
             tester.normalize_data(self.mLocal, self.stdLocal)
-        
+
         timer = Timer()
         timer.reset()
         if self.maxGraftDim != self.nMetabins*self.nCodes:
@@ -625,22 +629,26 @@ def train_whole_model(self, tester=None):
         if tester is not None:
             mpi.rootprint('Current Testing accuracy: {}'.format(tester.compute_test_accuracy(self.weights, self.b)))
 
-    def randomselecttest(self, tester=None, random_iterations=1):
+    def randomselecttest(self, tester=None, random_iterations=1, should_normalize = True):
         '''
         test the performance of random selection
+        modified by Ian Goodfellow to use seeded random number generation so
+        that results are replicable
         '''
         self.comm.barrier()
         mpi.rootprint('*'*46)
         mpi.rootprint('*'*15+'random selection'+'*'*15)
         mpi.rootprint('*'*46)
-        
+
         trainaccu = np.zeros(random_iterations)
         testaccu = np.zeros(random_iterations)
-        
+
+        self.rng = np.random.RandomState([1,2,3])
+
         if tester is not None:
             # normalize the test data with the stats of the training data
-            tester.normalize_data(self.mLocal, self.stdLocal)
-        
+            tester.normalize_data(self.mLocal, self.stdLocal, sabotage = not should_normalize)
+
         itertimer = Timer()
         for iter in range(random_iterations):
             itertimer.reset()
@@ -648,7 +656,7 @@ def randomselecttest(self, tester=None, random_iterations=1):
            if self.rank == 0:
                #decide which features we are going to select
                allidx = np.array(range(self.nCodes*self.nMetabins),dtype=np.int)
-               np.random.shuffle(allidx)
+               self.rng.shuffle(allidx)
                codeidlist = allidx / self.nMetabins
                metabinidlist = allidx % self.nMetabins
            else:
@@ -656,7 +664,7 @@ def randomselecttest(self, tester=None, random_iterations=1):
                metabinidlist = None
            codeidlist = self.comm.bcast(codeidlist, root=0)
            metabinidlist = self.comm.bcast(metabinidlist, root=0)
-            
+
            self.append_multiple_features(codeidlist[:self.maxGraftDim], metabinidlist[:self.maxGraftDim])
            mpi.rootprint('Feature selection took {} secs'.format(itertimer.lap()))
            mpi.rootprint('Training...')
@@ -670,7 +678,7 @@ def randomselecttest(self, tester=None, random_iterations=1):
            mpi.rootprint('Current Testing accuracy: {}'.format(testaccu[iter]))
            mpi.rootprint('Testing selection took {} secs'.format(itertimer.lap()))
            self.safebarrier()
-        
+
        mpi.rootprint('*'*15+'Summary'+'*'*15)
        mpi.rootprint('Training accuracy: {} +- {}'.format(np.mean(trainaccu),np.std(trainaccu)))
        mpi.rootprint('Testing accuracy: {} +- {}'.format(np.mean(testaccu),np.std(testaccu)))
@@ -717,13 +725,13 @@ def graft(self, dump_every = 0, \
        mpi.rootprint('dump_every = {}\nnActiveSet={}\ntest_every={}\nsamplePerRun={}'.format(\
                dump_every, nActiveSet, test_every, samplePerRun))
        self.comm.barrier()
-        
+
        if tester is not None:
            # normalize the test data with the stats of the training data
            tester.normalize_data(self.mLocal, self.stdLocal)
        if fromDumpFile is not None:
            self.restore_from_dump_file(fromDumpFile, tester)
-        
+
        old_loss = 1e10
        timer = Timer()
        itertimer = Timer()
@@ -749,19 +757,19 @@ def graft(self, dump_every = 0, \
                    # print test accuracy
                    test_accuracy = tester.compute_test_accuracy(self.weights, self.b)
                    mpi.rootprint('Current Testing accuracy: {}'.format(test_accuracy))
-            
+
            self.safebarrier()
            mpi.rootprint('This round took {} secs, total {} secs'.format(timer.lap(), timer.total()))
            mpi.rootprint('ETA {} secs.'.format(timer.total() * (self.maxGraftDim-T)/(T+1.0e-5)))
-            
+
            if dump_every > 0 and (T+1) % dump_every == 0 and dump_file is not None:
                mpi.rootprint('*'*15 + 'Dumping' + '*'*15)
                self.dump_current_state(dump_file + str(T)+'.mat')
-        
+
        mpi.rootprint('*'*15+'Finalizing'.format(T)+'*'*15)
        if dump_file is not None:
            self.dump_current_state(dump_file + 'final.mat')
-    
+
 if __name__ == "__main__":
     # Let's moo if it works.
     import utils.moo
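
A note for reviewers on the new flag: -s/--skip_normalization uses argparse's
store_true action, so it defaults to False and existing command lines are
unaffected; both call sites forward it as
should_normalize = not args.skip_normalization. A minimal standalone sketch of
the flag's behavior (a hypothetical parser, not the one in cifar_random_mb.py):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('-s', '--skip_normalization', default=False,
                        action='store_true')

    print(parser.parse_args([]).skip_normalization)      # False -> normalize
    print(parser.parse_args(['-s']).skip_normalization)  # True  -> skip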
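For context on the exp_safe/gL_bnll lines that several hunks above touch:
clipping the exponent keeps the BNLL gradient finite even when the margin
-y*f+1 is huge. A standalone check; EXP_MAX = 100.0 is an assumed stand-in
here, the real constant is defined elsewhere in grafting_mb.py and does not
appear in this patch:

    import numpy as np

    EXP_MAX = 100.0  # assumed value; the patch does not show the real one

    def exp_safe(x):
        # clip the exponent so np.exp never overflows to inf
        return np.exp(np.minimum(x, EXP_MAX))

    def gL_bnll(y, f):
        # gradient of the BNLL loss log(1 + exp(-y*f + 1)) w.r.t. f
        expnyf = exp_safe(-y * f + 1)
        return -y * expnyf / (1.0 + expnyf)

    y = np.array([1., -1., 1.])
    f = np.array([-1e6, 1e6, 0.])  # two hugely wrong predictions
    print(gL_bnll(y, f))           # finite, roughly [-1, 1, -0.73];
                                   # a raw exp would give inf/inf = nan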
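The safebarrier body restored above is a dissemination barrier that polls
Iprobe and sleeps instead of blocking, so waiting ranks yield the CPU. It can
be tried in isolation; a self-contained sketch assuming mpi4py, saved under a
hypothetical name such as barrier_sketch.py and run with e.g.
mpirun -np 4 python barrier_sketch.py:

    # standalone version of GrafterMPI.safebarrier; polling with Iprobe +
    # sleep lets waiting ranks sleep rather than spin at 100% CPU
    import time
    from mpi4py import MPI

    def safebarrier(comm, tag=0, sleep=0.01):
        size = comm.Get_size()
        if size == 1:
            return
        rank = comm.Get_rank()
        mask = 1
        # dissemination barrier: ceil(log2(size)) rounds; in each round a
        # rank signals (rank + mask) % size and waits on (rank - mask) % size
        while mask < size:
            dst = (rank + mask) % size
            src = (rank - mask + size) % size
            req = comm.isend(None, dst, tag)
            while not comm.Iprobe(src, tag):
                time.sleep(sleep)
            comm.recv(None, src, tag)
            req.Wait()
            mask <<= 1

    if __name__ == '__main__':
        comm = MPI.COMM_WORLD
        safebarrier(comm)
        print('rank %d passed the barrier' % comm.Get_rank())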
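What --skip_normalization does end to end: load_data_batch passes
should_normalize through to normalize_data(sabotage=...), which still computes
the per-feature statistics but then forces the means to 0 and the stds to 1,
so the later (x - m) / std step becomes the identity. A NumPy-only sketch of
that identity (plain array arithmetic stands in for fm.normalizev, whose
implementation is not part of this patch):

    # with mean forced to 0 and std forced to 1, z-scoring is a no-op
    import numpy as np

    feat = np.random.rand(6, 4)    # stand-in for one feature buffer

    m = np.mean(feat, axis=0)
    std = np.std(feat, axis=0) + 1e-8
    normalized = (feat - m) / std  # sabotage=False: the usual z-scoring

    m *= 0.                        # the three sabotage lines from the patch
    std *= 0.
    std += 1.
    untouched = (feat - m) / std   # now exactly the input

    assert np.allclose(untouched, feat)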
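On the seeding: np.random.RandomState accepts a list of ints as a seed, and
two generators built from the same seed produce identical shuffle orders,
which is what makes repeated randomselecttest runs comparable across jobs.
A quick standalone check:

    # same seed list => identical shuffles, hence replicable feature selection
    import numpy as np

    a, b = np.arange(10), np.arange(10)
    rng1 = np.random.RandomState([1, 2, 3])  # the seed used in this patch
    rng2 = np.random.RandomState([1, 2, 3])
    rng1.shuffle(a)
    rng2.shuffle(b)
    print(a)
    assert (a == b).all()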