docs and some fixes to high-level apis

TeamCohen · Jul 1, 2016 · 619c695 · 619c695
1 parent 7021ec8
commit 619c695
Show file tree

Hide file tree

Showing 14 changed files with 211 additions and 84 deletions.
diff --git a/README.txt b/README.txt
@@ -1 +1,5 @@
-Theano-based version of ProPPR - see 'docs' directory
+TensorLog is outlined in a [http://arxiv.org/abs/1605.06523 technical
+paper].  There is documentation on the
+[https://github.com/TeamCohen/TensorLog/wiki GitHub wiki page].
+
+
diff --git a/datasets/Makefile b/datasets/Makefile
@@ -0,0 +1,11 @@
+test:
+	(cd cora; make)
+	(cd wordnet; make)
+	(cd grid; make)
+	(cd smokers; make)
+
+clean:
+	(cd cora; make clean)
+	(cd wordnet; make clean)
+	(cd grid; make clean)
+	(cd smokers; make clean)
diff --git a/datasets/family/Makefile b/datasets/family/Makefile
@@ -1,3 +1,4 @@
+#TODO: fix for new API
 
 VPATH= raw
 

diff --git a/datasets/fb15k/Makefile b/datasets/fb15k/Makefile
@@ -1,3 +1,5 @@
+#TODO: update for new API
+
 default:
 	echo make what?
 

diff --git a/datasets/smokers/Makefile b/datasets/smokers/Makefile
@@ -2,4 +2,4 @@ expt:
 	python smokers-expt.py 
 
 clean:
-	rm *~ *.pyc
+	rm -f *~ *.pyc
diff --git a/datasets/wordnet/wnet-expt.py b/datasets/wordnet/wnet-expt.py
@@ -1,28 +1,35 @@
 import sys
 
-import os.path
-import matrixdb
 import expt
-import dataset
 import declare
 import tensorlog
-import ops
 import learn
 
 if __name__=="__main__":
     #usage: [targetPredicate] [epochs]
+
+    #get the command-line options for this experiment
     pred = 'hypernym' if len(sys.argv)<=1 else sys.argv[1]
     epochs = 30 if len(sys.argv)<=2 else int(sys.argv[2])
+
+    # use tensorlog.parseCommandLine to set up the program, etc
     optdict,args = tensorlog.parseCommandLine([
             '--db', 'wnet.db|wnet.cfacts',
             '--prog','wnet-learned.ppr', '--proppr',
             '--train','%s-train.dset|%s-train.examples' % (pred,pred),
             '--test', '%s-test.dset|%s-test.examples' % (pred,pred)])
+
+    # prog is a shortcut for the program stored in the output optdict, for convenience.
     prog = optdict['prog']
+
+    # the weight vector is sparse - just the constants in the unary predicate rule
     prog.setWeights(prog.db.vector(declare.asMode("rule(i)")))
+
+    # use a non-default learner, overriding the tracing function,
+    # number of epochs, and regularizer
     learner = learn.FixedRateGDLearner(prog,regularizer=learn.L2Regularizer(),traceFun=learn.Learner.cheapTraceFun,epochs=epochs)
 
-    #ops.conf.trace = True
+    # configure the experiment
     params = {'prog':prog,
               'trainData':optdict['trainData'], 
               'testData':optdict['testData'],
@@ -32,4 +39,6 @@
               'savedTestExamples':'tmp-cache/%s-test.examples' % pred,
               'learner':learner
     }
+
+    # run the experiment
     expt.Expt(params).run()
diff --git a/doc/QUICKSTART.txt b/doc/QUICKSTART.txt
@@ -0,0 +1,107 @@
+BASICS
+
+A Tensorlog DATABASE holds a bunch of unary and binary relations,
+which are encoded as scipy sparse matrices.  The human-readable format
+for this is a set of files with the .cfacts extension.  Some examples,
+from src/test/textcattoy.cfacts:
+
+ hasWord	dh	a
+ hasWord	dh	pricy
+ hasWord	dh	doll
+ hasWord	dh	house
+ hasWord	ft	a
+ hasWord	ft	little
+ hasWord	ft	red
+ hasWord	ft	fire
+ hasWord	ft	truck
+ ...
+ label	pos	
+ label	neg
+
+An additional column can be added which holds a numeric weight (so, to
+avoid confusing the program, don't use any constant that parses to a
+number in a cfacts file).  A database can be SERIALIZED, and the
+serialized form should be stored in a directory with extension .db.
+
+A Tensorlog PROGRAM usually has extension .ppr.  Some examples:
+
+------------------------------------------------------------------------------
+ predict(X,Pos) :- assign(Pos,pos) {all(F): hasWord(X,W),posPair(W,F)}.
+ predict(X,Neg) :- assign(Neg,neg) {all(F): hasWord(X,W),negPair(W,F)}.
+
+ match(R,S) :- fname(R,FR),fmatch(FR,FS),fname(S,FS) {f}.
+ match(R,S) :- lname(R,LR),lmatch(LR,LS),lname(S,LS) {l}.
+ match(R,S) :- addr(R,AR),amatch(AR,AS),addr(S,AS) {a}.
+------------------------------------------------------------------------------
+
+If you use the ProPPR-style rule features (in the curly braces) you
+should 
+ - make sure any constants appearing there are in the database
+ - load the rule file as 'proppr' format, which is NOT the default.
+There's no serialized form of a program.  
+
+A Tensorlog DATASET is given in a file with extension .exam, with lines
+of the form "P <tab> X <tab> Y1 <tab> ... <tab> Yk" where
+
+ - P is the functor of some predicate defined in your program
+ - X is an input
+ - Y1...Yk are ALL the outputs for X that are considered correct
+
+E.g., the dataset below essentially labels match(r1,r2) and match(r1,r1)
+as positive, and any other fact match(r1,foo) as negative.  The constant
+r3 should only match itself:
+
+----------------------------------------
+ match	r1	r2	r1
+ match	r3	r3
+ ....
+----------------------------------------
+
+A serialized dataset has extension .dset.
+
+
+HOW TO RUN AN EXPERIMENT:
+
+Look at the sample main in src/expt.py, and the sample input files in
+src/test/textcattoy.cfacts and src/test/textcat.ppr.  Some other
+larger examples are in datasets/cora/cora-expt.py and
+datasets/wordnet/wnet-expt.py.
+
+
+HOW TO CONFIGURE TENSORLOG:
+
+Some of the modules have a config.Config() object, which is just an
+object that contains fields which can be used as options.  Any
+user-settable parameters should be in these objects.
+
+HOW TO SERIALIZE A .cfacts FILE AND CREATE A DB FILE:
+
+  % python matrixdb.py --serialize foo.cfacts foo.db
+
+HOW TO DEBUG A TENSORLOG PROGRAM:
+
+Start up an interpreter with the command
+
+  % python -i -m tensorlog --programFiles foo.db:foo.ppr:foo.cfacts:...
+
+You can then evaluate functions with commands like:
+
+ % ti.eval("foo/io", "input_constant")
+
+Try setting these config options before you do:
+
+  ops.trace = True
+  conf.trace = True
+
+You can also insert "printf literals" into a clause, eg
+
+  p(X,Z1):-printf(X,X1),spouse(X1,Y),printf(Y,Y1),sister(Y1,Z),printf(Z,Z1).
+
+These literals just copy the input to the output, but will echo the
+bindings of the variables when the message-passing happens.  (Make
+sure the output variable of the printf is used somewhere "downstream",
+otherwise it's undefined when the print will actually happen.)
+
+Finally there is the debug.py module, which contains a start at a
+graphical debugger.
+
diff --git a/doc/README.txt b/doc/README.txt
diff --git a/src/Makefile b/src/Makefile
@@ -9,6 +9,9 @@ benchmark-test:
 #fb15k-valid.db: matrixdb.py
 #	python $< --serialize test/fb15k-valid.cfacts:test/fb15k.tensorLog.cfacts $@
 
+textcattoy-expt:
+	python expt.py --prog test/textcat.ppr --db test/textcattoy.cfacts --train
+
 fb15k-valid.db:
 	python matrixdb.py --serialize test/fb15k-valid.cfacts  fb15k-valid.db
 

diff --git a/src/debug.py b/src/debug.py
@@ -161,6 +161,8 @@ def mainloop(self):
 
 if __name__ == "__main__":
 
+    #TODO more useful main?
+
     TRAINED = True
 
     if not TRAINED:

diff --git a/src/expt.py b/src/expt.py
@@ -9,6 +9,7 @@
 import logging
 import collections
 
+import tensorlog
 import dataset
 import matrixdb
 import tensorlog
@@ -140,34 +141,49 @@ def printStats(modelMsg,testSet,goldData,predictedData):
 
 if __name__=="__main__":
 
-    if len(sys.argv)<=1 or sys.argv[1]=='textcattoy':
-        db = matrixdb.MatrixDB.uncache('tlog-cache/textcat.db','test/textcattoy.cfacts')
-        trainData = dataset.Dataset.uncacheMatrix('tlog-cache/train.dset',db,'predict/io','train')
-        testData = dataset.Dataset.uncacheMatrix('tlog-cache/test.dset',db,'predict/io','test')
-        prog = tensorlog.ProPPRProgram.load(["test/textcat.ppr"],db=db)
-        initWeights = \
-            (prog.db.matrixPreimage(declare.asMode("posPair(o,i)")) + \
-                 prog.db.matrixPreimage(declare.asMode("negPair(o,i)"))) * 0.5
-    elif len(sys.argv)>1 and sys.argv=='matchtoy':
-        db = matrixdb.MatrixDB.loadFile('test/matchtoy.cfacts')
-        trainData = dataset.Dataset.loadExamples(db,'test/matchtoy-train.exam')
-        testData = trainData
-        prog = tensorlog.ProPPRProgram.load(["test/matchtoy.ppr"],db=db)
-        initWeights = prof.db.ones()
-    else:
-        assert False,'usage: python expt.py [textcattoy|matchtoy]'
-
-    prog.setWeights(initWeights)
-
-    myLearner = learn.FixedRateGDLearner(prog)
-    #myLearner = learn.FixedRateSGDLearner(prog)
-
-    params = {'prog':prog,
-              'trainData':trainData, 'testData':testData,
-              'savedModel':'toy-trained.db',
-              'savedTestPredictions':'tlog-cache/toy-test.solutions.txt',
-              'savedTrainExamples':'tlog-cache/toy-train.examples',
-              'savedTestExamples':'tlog-cache/toy-test.examples',
-              'learner':myLearner
-              }
-    Expt(params).run()
+    try: 
+        optdict,args = tensorlog.parseCommandLine(sys.argv[1:])
+        optdict['prog'].setWeights(optdict['prog'].db.ones())
+        params = {'prog':optdict['prog'],
+                  'trainData':optdict['trainData'],
+                  'testData':optdict['testData'],
+                  'savedModel':'expt-model.db'}
+        Expt(params).run()
+        print 'saved in expt-model.db'
+
+    except Exception:
+
+        def usage():
+            print 'usage: python expt.py --prog a --db b --trainData c --testData d'
+            print 'usage: python expt.py [textcattoy|matchtoy]'
+
+        if len(sys.argv)<2:
+            usage()
+        elif sys.argv[1]=='textcattoy':
+            db = matrixdb.MatrixDB.uncache('tlog-cache/textcat.db','test/textcattoy.cfacts')
+            trainData = dataset.Dataset.uncacheMatrix('tlog-cache/train.dset',db,'predict/io','train')
+            testData = dataset.Dataset.uncacheMatrix('tlog-cache/test.dset',db,'predict/io','test')
+            prog = tensorlog.ProPPRProgram.load(["test/textcat.ppr"],db=db)
+            initWeights = \
+                (prog.db.matrixPreimage(declare.asMode("posPair(o,i)")) + \
+                     prog.db.matrixPreimage(declare.asMode("negPair(o,i)"))) * 0.5
+        elif sys.argv[1]=='matchtoy':
+            db = matrixdb.MatrixDB.loadFile('test/matchtoy.cfacts')
+            trainData = dataset.Dataset.loadExamples(db,'test/matchtoy-train.exam')
+            testData = trainData
+            prog = tensorlog.ProPPRProgram.load(["test/matchtoy.ppr"],db=db)
+            initWeights = prog.db.ones()
+        else:
+            usage()
+
+        prog.setWeights(initWeights)
+
+        myLearner = learn.FixedRateGDLearner(prog)
+        #myLearner = learn.FixedRateSGDLearner(prog)
+
+        params = {'prog':prog,
+                  'trainData':trainData, 'testData':testData,
+                  'savedModel':'toy-trained.db',
+                  'learner':myLearner
+                  }
+        Expt(params).run()
diff --git a/src/tensorlog.py b/src/tensorlog.py
@@ -310,10 +310,10 @@ def usage():
 
     if '--help' in optdict: 
         usage()
-        sys.exit(0)
+        exit(0)
     if (not '--db' in optdict) or (not '--prog' in optdict):
         usage()
-        sys.exit(-1)
+        assert False,'--db and --prog are required options'
 
     db = parseDBSpec(optdict['--db'])
     optdict['--db'] = db
@@ -340,7 +340,7 @@ def parseDatasetSpec(spec,db):
         return dataset.Dataset.uncacheExamples(cache,db,src,proppr=src.endswith(".examples"))
     else:
         assert spec.endswith(".examples") or spec.endswith(".exam"), 'illegal --train or --test file'
-        return dataset.Dataset.loadExamples(cache,db,src,proppr=src.endswith(".examples"))
+        return dataset.Dataset.loadExamples(db,spec,proppr=spec.endswith(".examples"))
 
 def parseDBSpec(spec):
     """Parse a specification for a database, see usage() for parseCommandLine"""

diff --git a/src/test/toytest.exam b/src/test/toytest.exam
@@ -0,0 +1,7 @@
+predict	pb	pos
+predict	yc	pos
+predict	rb2	pos
+predict	rp	pos
+predict	bp	neg
+predict	he	neg
+predict	wt	neg
diff --git a/src/test/toytrain.exam b/src/test/toytrain.exam
@@ -0,0 +1,8 @@
+predict	pb	pos
+predict	yc	pos
+predict	rb2	pos
+predict	rp	pos
+predict	bp	neg
+predict	he	neg
+predict	wt	neg
+