-
Notifications
You must be signed in to change notification settings - Fork 22
/
Copy pathdataset.py
304 lines (272 loc) · 12.6 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
# (C) William W. Cohen and Carnegie Mellon University, 2016
import sys
import re
import math
import os.path
import collections
import scipy.sparse as SS
import scipy.io as SIO
import numpy as NP
import numpy.random as NR
import logging
from tensorlog import config
from tensorlog import mutil
from tensorlog import matrixdb
from tensorlog import declare
from tensorlog import util
conf = config.Config()
conf.normalize_outputs = True; conf.help.normalize_outputs = "In .exam files, l1-normalize the weights of valid outputs"
#
# dealing with labeled training data
#
class Dataset(object):
def __init__(self,xDict,yDict):
# dict which maps mode declaration to X matrices for training
self.xDict = xDict
# likewise for Y matrices
self.yDict = yDict
def isSinglePredicate(self):
"""Returns true if all the examples are for a single predicate."""
return len(list(self.xDict.keys()))==1
def extractMode(self,mode):
"""Return a new Dataset that just contains this mode."""
assert mode in self.xDict, 'dataset does not contain mode %s' % str(mode)
return Dataset({mode:self.xDict[mode]}, {mode:self.yDict[mode]})
def modesToLearn(self):
"""Return list of modes associated with the data."""
return list(self.xDict.keys())
def hasMode(self,mode):
"""True if there are examples of the mode in the dataset."""
return (mode in self.yDict and mode in self.xDict)
def getX(self,mode):
"""Get a matrix of all inputs for the mode."""
return self.xDict[mode]
def getY(self,mode):
"""Get a matrix of all desired outputs for the mode."""
return self.yDict[mode]
def size(self):
return sum([m.nnz for m in list(self.xDict.values())]) + sum([m.nnz for m in list(self.yDict.values())])
def shuffle(self):
for mode in self.xDict:
shuffledRowNums = NP.arange(mutil.numRows(self.xDict[mode]))
NR.shuffle(shuffledRowNums)
self.xDict[mode] = mutil.shuffleRows(self.xDict[mode],shuffledRowNums)
self.yDict[mode] = mutil.shuffleRows(self.yDict[mode],shuffledRowNums)
def minibatchIterator(self,batchSize=100,shuffleFirst=True):
"""Iterate over triples (mode,X',Y') where X' and Y' are sets of
batchSize rows from the full data for mode, randomly selected
(without replacement) from the dataset."""
# randomize the order of the examples
if shuffleFirst: self.shuffle()
# then sample an ordering of the modes
modeList = self.modesToLearn()
modeSampleDict = {}
for modeIndex,mode in enumerate(modeList):
numBatches = int(math.ceil( mutil.numRows(self.getX(mode)) / float(batchSize) ))
modeSampleDict[mode] = NP.ones(numBatches,dtype='int')*modeIndex
modeSamples = NP.concatenate(list(modeSampleDict.values()))
NR.shuffle(modeSamples)
# finally produce the minibatches
currentOffset = [0] * len(modeList)
for modeIndex in modeSamples:
mode = modeList[modeIndex]
lo = currentOffset[modeIndex]
bX = mutil.selectRows(self.getX(mode),lo,lo+batchSize)
bY = mutil.selectRows(self.getY(mode),lo,lo+batchSize)
currentOffset[modeIndex] += batchSize
yield mode,bX,bY
def pprint(self):
return ['%s: X %s Y %s' % (str(mode),mutil.pprintSummary(self.xDict[mode]),mutil.pprintSummary(self.yDict[mode])) for mode in self.xDict]
#
# i/o and conversions
#
def serialize(self,dir):
"""Save the dataset on disk."""
if not os.path.exists(dir):
os.mkdir(dir)
dx = dict([(str(k_v[0]),k_v[1]) for k_v in list(self.xDict.items())])
dy = dict([(str(k_v[0]),k_v[1]) for k_v in list(self.yDict.items())])
SIO.savemat(os.path.join(dir,"xDict"),dx,do_compression=True)
SIO.savemat(os.path.join(dir,"yDict"),dy,do_compression=True)
@staticmethod
def deserialize(dir):
"""Recover a saved dataset."""
logging.info('deserializing dataset file '+ dir)
xDict = {}
yDict = {}
SIO.loadmat(os.path.join(dir,"xDict"),xDict)
SIO.loadmat(os.path.join(dir,"yDict"),yDict)
#serialization converts modes to strings so convert them
#back.... it also converts matrices to csr
for d in (xDict,yDict):
for stringKey,mat in list(d.items()):
del d[stringKey]
if not stringKey.startswith('__'):
d[declare.asMode(stringKey)] = SS.csr_matrix(mat)
dset = Dataset(xDict,yDict)
logging.info('deserialized dataset has %d modes and %d non-zeros' % (len(dset.modesToLearn()), dset.size()))
return dset
@staticmethod
def uncacheExamples(dsetFile,db,exampleFile,proppr=True):
"""Build a dataset file from an examples file, serialize it, and
return the de-serialized dataset. Or if that's not necessary,
just deserialize it.
"""
if not os.path.exists(dsetFile) or os.path.getmtime(exampleFile)>os.path.getmtime(dsetFile):
logging.info('serializing examples in %s to %s' % (exampleFile,dsetFile))
dset = Dataset.loadExamples(db,exampleFile,proppr=proppr)
dset.serialize(dsetFile)
os.utime(dsetFile,None) #update the modification time for the directory
return dset
else:
return Dataset.deserialize(dsetFile)
@staticmethod
def uncacheMatrix(dsetFile,db,functorToLearn,functorInDB):
"""Build a dataset file from a DB matrix as specified with loadMatrix
and serialize it. Or if that's not necessary, just
deserialize it.
"""
if not os.path.exists(dsetFile):
print(('preparing examples from',functorToLearn,'...'))
dset = Dataset.loadMatrix(db,functorToLearn,functorInDB)
print(('serializing dsetFile',dsetFile,'...'))
dset.serialize(dsetFile)
return dset
else:
print(('de-serializing dsetFile',dsetFile,'...'))
return Dataset.deserialize(dsetFile)
# TODO remove or make type-aware
@staticmethod
def loadMatrix(db,functorToLearn,functorInDB):
"""Convert a DB matrix containing pairs x,f(x) to training data for a
learner. For each row x with non-zero entries, copy that row
to Y, and and also append a one-hot representation of x to the
corresponding row of X.
"""
assert db.isTypeless(),'cannot run loadMatrix on database with defined types'
functorToLearn = declare.asMode(functorToLearn)
xrows = []
yrows = []
m = db.matEncoding[(functorInDB,2)].tocoo()
n = db.dim()
for i in range(len(m.data)):
x = m.row[i]
xrows.append(SS.csr_matrix( ([1.0],([0],[x])), shape=(1,n) ))
rx = m.getrow(x)
yrows.append(rx * (1.0/rx.sum()))
return Dataset({functorToLearn:mutil.stack(xrows)},{functorToLearn:mutil.stack(yrows)})
@staticmethod
def _parseLine(line,proppr=True):
#returns mode, x, positive y's where x and ys are symbols
if not line.strip() or line[0]=='#':
return None,None,None
parts = line.strip().split("\t")
if not proppr:
assert len(parts)>=2, 'bad line: %r parts %r' % (line,parts)
return declare.asMode(parts[0]+"/io"),parts[1],parts[2:]
else:
regex = re.compile('(\w+)\((\w+),(\w+)\)')
mx = regex.search(parts[0])
if not mx:
return None,None,None
else:
mode = declare.asMode(mx.group(1)+"/io")
x = mx.group(2)
pos = []
for ans in parts[1:]:
label = ans[0]
my = regex.search(ans[1:])
assert my,'problem at line '+line
assert my.group(1)==mode.functor,'mismatched modes %s %s at line %s' % (my.group(1),mode,line)
assert my.group(2)==x,'mismatched x\'s at line '+line
if label=='+':
pos.append(my.group(3))
return mode,x,pos
@staticmethod
def loadProPPRExamples(db,fileName):
"""Convert a proppr-style foo.examples file to a two dictionaries of
modename->matrix pairs, one for the Xs, one for the Ys"""
return Dataset.loadExamples(db,fileName,proppr=True)
@staticmethod
def loadExamples(db,fileName,proppr=False):
"""Convert foo.exam file, where each line is of the form
functor <TAB> x <TAB> y1 ... yk
to two dictionaries of modename->matrix pairs, one for the Xs,
one for the Ys.
"""
logging.info('loading examples from '+ str(fileName))
# map from relation to lists that buffer data,row
# index,colindex information for each of the X,Y matrices
xDatabuf = collections.defaultdict(list)
xRowbuf = collections.defaultdict(list)
xColbuf = collections.defaultdict(list)
yDatabuf = collections.defaultdict(list)
yRowbuf = collections.defaultdict(list)
yColbuf = collections.defaultdict(list)
xsResult = {}
ysResult = {}
def getId(typeName,symbol):
s = symbol if db.schema.hasId(typeName,symbol) else matrixdb.OOV_ENTITY_NAME
return db.schema.getId(typeName,s)
for line in util.linesIn(fileName):
pred,x,ys = Dataset._parseLine(line,proppr=proppr)
if pred:
xType = db.schema.getDomain(pred.getFunctor(),2)
yType = db.schema.getRange(pred.getFunctor(),2)
row_index = len(xDatabuf[pred])
xDatabuf[pred].append(1.0)
xRowbuf[pred].append(row_index)
xColbuf[pred].append(getId(xType,x))
for y in ys:
yDatabuf[pred].append( 1.0/len(ys) if conf.normalize_outputs else 1.0)
yRowbuf[pred].append(row_index)
yColbuf[pred].append(getId(yType,y))
for pred in list(xDatabuf.keys()):
xType = db.schema.getDomain(pred.getFunctor(),2)
yType = db.schema.getRange(pred.getFunctor(),2)
nrows = len(xDatabuf[pred])
coo_x = SS.coo_matrix((xDatabuf[pred],(xRowbuf[pred],xColbuf[pred])), shape=(nrows,db.dim(xType)))
xsResult[pred] = SS.csr_matrix(coo_x,dtype='float32')
coo_y = SS.coo_matrix((yDatabuf[pred],(yRowbuf[pred],yColbuf[pred])), shape=(nrows,db.dim(yType)))
ysResult[pred] = SS.csr_matrix(coo_y,dtype='float32')
dset = Dataset(xsResult,ysResult)
logging.info('loaded dataset has %d modes and %d non-zeros' % (len(dset.modesToLearn()), dset.size()))
logging.info('in loaded dataset, example normalization (so sum_{y} score[pred(x,y)] == 1) is %r' % conf.normalize_outputs)
return dset
#TODO refactor to also save examples in form: 'functor X Y1
#... Yk'
def saveProPPRExamples(self,fileName,db,append=False,mode=None):
"""Convert X and Y to ProPPR examples and store in a file."""
fp = open(fileName,'a' if append else 'w')
modeKeys = [mode] if mode else list(self.xDict.keys())
for mode in modeKeys:
assert mode in self.yDict, "No mode '%s' in yDict" % mode
functor,arity = mode.getFunctor(),mode.getArity()
dx = db.matrixAsSymbolDict(self.xDict[mode],db.schema.getDomain(functor,arity))
dy = db.matrixAsSymbolDict(self.yDict[mode],db.schema.getRange(functor,arity))
theoryPred = mode.functor
for i in range(max(dx.keys())+1):
dix = dx[i]
diy = dy[i]
assert len(list(dix.keys()))==1,'X row %d is not onehot: %r' % (i,dix)
x = list(dix.keys())[0]
fp.write('%s(%s,Y)' % (theoryPred,x))
for y in list(diy.keys()):
fp.write('\t+%s(%s,%s)' % (theoryPred,x,y))
fp.write('\n')
if __name__ == "__main__":
usage = 'usage: python -m dataset.py --serialize foo.cfacts|foo.db bar.exam|bar.examples glob.dset'
if sys.argv[1]=='--serialize':
assert len(sys.argv)==5,usage
dbFile = sys.argv[2]
examFile = sys.argv[3]
dsetFile = sys.argv[4]
if dbFile.endswith(".cfacts"):
db = matrixdb.MatrixDB.loadFile(dbFile)
elif dbFile.endswith(".db"):
db = matrixdb.MatrixDB.deserialize(dbFile)
else:
assert False,usage
assert examFile.endswith(".examples") or examFile.endswith(".exam"),usage
dset = Dataset.loadExamples(db,examFile,proppr=examFile.endswith(".examples"))
dset.serialize(dsetFile)