-
Notifications
You must be signed in to change notification settings - Fork 0
/
run.py
executable file
·169 lines (134 loc) · 5.51 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import sys
from sklearn.datasets import fetch_mldata
import random
import numpy as np
from unif_costgen import UnifCostGen
from bernoulli_costgen import BernoulliCostGen
from grad_desc import GradientDescent
from baseline import BaselineMech
from naive import NaiveMech
from rs12 import RS12Mech
from ours import OurMech
# Feature (column) names used for the dataset: each example is a
# 28*28 pixel image flattened into a 784-length vector, one name per pixel.
FEATURES = ["pixel%d" % i for i in range(28 * 28)]
# generate a seed to initialize randomness of anything we create
# for reproducibility and in case they all use system time to seed
def gen_seed():
    """Draw a random non-negative integer suitable for seeding an RNG.

    The value is taken from the module-level ``random`` generator, so the
    sequence of seeds returned is repeatable once ``random.seed(...)`` has
    been called with a known value.

    Returns:
        An integer in the inclusive range ``[0, sys.maxsize - 1]``.
    """
    # sys.maxint only exists on Python 2; sys.maxsize is available on both
    # Python 2.6+ and Python 3.
    return random.randint(0, sys.maxsize - 1)
#--------------------------------------------------------------
# Parameters
# print data into this file
OUTFILE = "plot.py"

# What fraction of the dataset to train on? (used as a multiplier on the
# number of examples, so 0.5 means half)
TRAIN_FRACTION = 0.5

# which digits to include
INCLUDE = [1, 4, 9, 8]
POS_LABELS = [8, 9]  # others have negative labels
EXPENSIVE = [1, 8]   # make these labels more expensive
CHEAP = [4, 9]

# number of trials to run
TRIALS = 10

# the budgets to test
# Kept as a real list of floats: the script later takes len(budgets) and
# iterates it several times, which a lazy map() object (what map() returns on
# Python 3) does not support.
budgets = [40.0, 80.0, 160.0, 320.0]

# the mechanisms to test
# make sure 'mech_names' correspond to 'mechs'
# Each mechanism is handed its own GradientDescent instance.
mech_names = ["baseline", "naive", "rs12 (unif)", "ours"]
mechs = [
    BaselineMech(GradientDescent(len(FEATURES), POS_LABELS)),
    NaiveMech(GradientDescent(len(FEATURES), POS_LABELS)),
    RS12Mech(GradientDescent(len(FEATURES), POS_LABELS), gen_seed()),
    OurMech(GradientDescent(len(FEATURES), POS_LABELS), gen_seed()),
]

# the costs to test
# 'cost_names' should correspond to 'costgens'
cost_names = ["unif_indep", "unif_corr", "bernoulli_indep_p=0.2", "bernoulli_corr_p=0.2"]
costgens = [
    UnifCostGen(gen_seed()),
    UnifCostGen(gen_seed(), cheap=CHEAP, expensive=EXPENSIVE),
    BernoulliCostGen(gen_seed(), p=0.2),
    BernoulliCostGen(gen_seed(), p=0.2, expensive=EXPENSIVE),
]
#--------------------------------------------------------------
# Useful method
def split_dataset(Xlist, Ylist, train_fraction=None):
    """Randomly partition parallel example/label sequences into train and test.

    The indices are shuffled with the module-level ``random`` generator; the
    first ``int(len(Xlist) * train_fraction)`` shuffled indices form the
    training set and all remaining indices form the test set.

    Args:
        Xlist: indexable sequence of examples.
        Ylist: indexable sequence of labels, where ``Ylist[i]`` belongs to
            ``Xlist[i]``.
        train_fraction: fraction of the points to place in the training set.
            When omitted, the module-level ``TRAIN_FRACTION`` is used.

    Returns:
        A tuple ``(Xtrain, Ytrain, Xtest, Ytest)`` of lists.
    """
    if train_fraction is None:
        train_fraction = TRAIN_FRACTION
    num_train_points = int(len(Xlist) * train_fraction)

    # random.shuffle needs a mutable sequence; a bare range() is not one on
    # Python 3.
    indices = list(range(len(Xlist)))
    random.shuffle(indices)

    train_indices = indices[:num_train_points]
    # Slice to the end of the list: stopping at -1 would silently drop the
    # last shuffled example from the test set.
    test_indices = indices[num_train_points:]

    Xtrain = [Xlist[i] for i in train_indices]
    Ytrain = [Ylist[i] for i in train_indices]
    Xtest = [Xlist[i] for i in test_indices]
    Ytest = [Ylist[i] for i in test_indices]
    return (Xtrain, Ytrain, Xtest, Ytest)
# ----------------------------------------------------
# start of main script
# Seed the module-level RNG with a value we can record, so the data splits
# below can be reproduced.
# NOTE(review): the seeds handed to the mechanisms and cost generators were
# drawn by gen_seed() when those objects were built, i.e. before this
# random.seed(my_seed) call, so they are not determined by my_seed.
my_seed = gen_seed()
random.seed(my_seed)

# NOTE(review): fetch_mldata comes from older scikit-learn releases; confirm
# the installed version still provides it.
print("Loading data (if mldata/mnist-original.mat does not yet exist, will download it)...")
mnist = fetch_mldata('MNIST original', data_home="./")
print("...loaded.")

# filter for the labels we want, keeping examples and labels aligned
Xlist = []
Ylist = []
for x, y in zip(mnist.data, mnist.target):
    label = int(y)
    if label in INCLUDE:
        Xlist.append(x)
        Ylist.append(label)

# num_examples[d] = how many of the kept examples carry label d
num_examples = [0] * 10
for y in Ylist:
    num_examples[y] += 1  # labels are 0,...,9

# save average error and average squared error (for calculating sample variance)
# both are indexed as [mechanism][cost type][budget]
errs = [[[0.0] * len(budgets) for _ in cost_names] for _ in mech_names]
squared_errs = [[[0.0] * len(budgets) for _ in cost_names] for _ in mech_names]

for trial in range(TRIALS):
    print("TRIAL " + str(trial))
    (Xtrain, Ytrain, Xtest, Ytest) = split_dataset(Xlist, Ylist)
    T = len(Xtrain)
    num_features = len(Xtrain[0])  # not used further in this script
    avg_data_norm = np.apply_along_axis(np.linalg.norm, 1, Xtrain).mean()
    eta = 0.1 / avg_data_norm  # rough heuristic, because the norm of the data is not normalized [0,1]

    for ci, costgen in enumerate(costgens):
        costgen.normalize(num_examples)
        # one cost per training point, drawn in training order and shared by
        # every budget and mechanism for this cost type
        costs = [costgen.draw_cost(Ytrain[i]) for i in range(T)]

        for bi, B in enumerate(budgets):
            for mi, mech in enumerate(mechs):
                mech.reset(eta, T, B, cmax=1.0)
                temp = mech.train_and_get_err(costs, Xtrain, Ytrain, Xtest, Ytest)
                # dividing by TRIALS here makes the totals running means
                errs[mi][ci][bi] += temp / float(TRIALS)
                squared_errs[mi][ci][bi] += temp * temp / float(TRIALS)
# ----------------------------------------------------
# writing out data
# note nothing below here is actually run in this script,
# it is all just written to OUTFILE
# write out the data into a python file that plots it
# Names of the module-level values that are copied into the generated file.
_RECORDED_NAMES = ["my_seed", "TRIALS", "INCLUDE", "POS_LABELS", "EXPENSIVE",
                   "CHEAP", "budgets", "cost_names", "mech_names", "errs",
                   "squared_errs"]

# Plotting code appended to the generated file after the recorded values.
# It only refers to names listed in _RECORDED_NAMES plus matplotlib's plt.
_PLOT_SCRIPT_BODY = """
def stddev(mi,ci,bi):
    # errs holds the mean and squared_errs the mean of squares over TRIALS
    # runs; the sample standard deviation is the square root of the
    # unbiased sample variance built from them.
    if TRIALS < 2:
        return 0.0  # undefined for a single trial; avoid dividing by zero
    variance = float(TRIALS)*(squared_errs[mi][ci][bi] - errs[mi][ci][bi]**2.0)/float(TRIALS - 1)
    return max(variance, 0.0) ** 0.5  # clamp tiny negative rounding errors

print("maximum sample std deviation: " + str(max([max([max([stddev(mi,ci,bi) for bi in range(len(budgets))]) for ci in range(len(cost_names))]) for mi in range(len(mech_names))])))

linewidth = 1.5

# compare mechanisms
for ci,costname in enumerate(cost_names):
    plt.figure()
    for mi in range(len(mech_names)):
        plt.plot(budgets, errs[mi][ci], linewidth = linewidth)
    plt.legend(mech_names)
    plt.title("Cost type = " + costname)
    plt.xlabel("Budget")
    plt.ylabel("risk")

# compare costtypes for our mechanism
if "ours" in mech_names:
    mi = mech_names.index("ours")
    plt.figure()
    for ci,costname in enumerate(cost_names):
        plt.plot(budgets,errs[mi][ci], linewidth=linewidth)
    plt.legend(cost_names)
    plt.title("Ours")
    plt.xlabel("Budget")
    plt.ylabel("risk")

plt.show()
"""

# 'with' guarantees the file is flushed and closed even if a write fails.
with open(OUTFILE, "w") as f:
    f.write("# auto-generated by " + sys.argv[0] + "\n")
    f.write("import matplotlib, matplotlib.pyplot as plt\n\n")

    # write down all of these variables
    for s in _RECORDED_NAMES:
        try:
            f.write(s + " = " + str(globals()[s]) + "\n")
        except Exception as exc:
            # if there was some error, keep going so we do not lose the rest
            # of the data, but report it instead of hiding it
            sys.stderr.write("warning: could not record %s: %r\n" % (s, exc))

    f.write(_PLOT_SCRIPT_BODY)