-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrainModel.py
executable file
·176 lines (145 loc) · 6.53 KB
/
trainModel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
#!/usr/bin/env python3
# Take a python file defining a Pipeline, train it on a given data set,
# and pickle it to a file so the model can be used to predict new samples.
#
# The python file should define a variable 'pipelines'.
# this can be a Pipeline object
# or a list of Pipelines, in which case pipelines[0] is trained
#
# If the Pipeline has a method to get features and their weights/coefficients,
# also write a top weighted feature report after training.
#
import sys
import os.path
import argparse
import pickle
import sklearnHelperLib as skHelper
import utilsLib
import tuningReportsLib as trl
from sklearn.pipeline import Pipeline
NUM_TOP_FEATURES=50 # number of highly weighted features to report
PIPELINE_FILE = "goodPipelines.py"
OUTPUT_PICKLE_FILE = "goodModel.pkl"
DEFAULT_SAMPLEDATALIB = "sampleDataLib"
DEFAULT_SAMPLE_TYPE = "ClassifiedSample"
#-----------------------
def parseCmdLine():
    """Parse command line arguments for this training script.

    Returns an argparse.Namespace with: inputFiles, pipelineFile,
    preprocessors, outputPklFile, featureFile, numTopFeatures,
    sampleDataLib, sampleObjTypeName, verbose.
    """
    parser = argparse.ArgumentParser( \
        description='Train a model and pickle it so it can be used to predict.')

    parser.add_argument('inputFiles', nargs='+',
        help='files of samples or -, may be sklearn load_files dirs')

    parser.add_argument('-m', '--model', dest='pipelineFile',
        default=PIPELINE_FILE,
        # note trailing space inside the first fragment so the concatenated
        # help text reads "(.py). Expects" rather than "(.py).Expects"
        help='Pipeline source (.py). ' +
        'Expects "pipeline" a Pipeline object or list (trains the 0th). ' +
        'May be a Pipeline .pkl file. Default: %s' % PIPELINE_FILE)

    parser.add_argument('-p', '--preprocessor', metavar='PREPROCESSOR',
        dest='preprocessors', action='append', required=False, default=None,
        help='preprocessor, multiples are applied in order. Default is none.' )

    parser.add_argument('-o', '--output', dest='outputPklFile',
        default=OUTPUT_PICKLE_FILE,
        help='output pickle file for trained model. Default: "%s"' \
                % OUTPUT_PICKLE_FILE)

    parser.add_argument('-f', '--features', dest='featureFile', default=None,
        help='output file for top weighted features. Default: None')

    parser.add_argument('--numfeatures', dest='numTopFeatures',
        type=int, default=NUM_TOP_FEATURES,
        help='num of top weighted features to output. Default: %d' % \
                NUM_TOP_FEATURES)

    parser.add_argument('--sampledatalib', dest='sampleDataLib',
        default=DEFAULT_SAMPLEDATALIB,
        help="Module to import that defines python sample class. " +
        "Default: %s" % DEFAULT_SAMPLEDATALIB)

    parser.add_argument('--sampletype', dest='sampleObjTypeName',
        default=DEFAULT_SAMPLE_TYPE,
        help="Sample class name to use if not specified in sample file. " +
        "Default: %s" % DEFAULT_SAMPLE_TYPE)

    # -v and -q share dest='verbose'; verbose defaults to True, -q flips it off
    parser.add_argument('-v', '--verbose', dest='verbose', action='store_true',
        default=True, help="include helpful messages to stdout, default")

    parser.add_argument('-q', '--quiet', dest='verbose', action='store_false',
        help="skip helpful messages to stdout")

    return parser.parse_args()
#-----------------------
# Parse the command line at import time: 'args' is a module-level global
# read by main(), verbose(), getTrainingSet(), getPipeline() and
# writeFeaturesFile() below.
args = parseCmdLine()
# Dynamically import the module that defines the sample classes
# (configurable via --sampledatalib); also a module-level global.
sampleDataLib = utilsLib.importPyFile(args.sampleDataLib)
#-----------------------
def main():
    """Train the configured Pipeline on the input samples and pickle it.

    Steps: resolve the sample class named by --sampletype, load the
    Pipeline (from .py or .pkl), read the training samples, optionally
    run preprocessors, fit, pickle the trained model, and optionally
    write a top-weighted-features report.

    Exits with status 5 if the sample class name is invalid.
    """
    #-----------------------
    # get default sampleObjType
    if not hasattr(sampleDataLib, args.sampleObjTypeName):
        sys.stderr.write("invalid sample class name '%s'\n" \
                                                    % args.sampleObjTypeName)
        exit(5)

    sampleObjType = getattr(sampleDataLib, args.sampleObjTypeName)

    pipeline = getPipeline()
    trainSet = getTrainingSet(sampleObjType)

    if args.preprocessors:
        verbose("Running preprocessors %s\n" % str(args.preprocessors))
        # preprocess() returns rejected samples, but this script ignores them
        trainSet.preprocess(args.preprocessors)
        verbose("...done\n")

    verbose("Training...\n")
    pipeline.fit(trainSet.getDocuments(), trainSet.getKnownYvalues())
    verbose("Done\n")

    with open(args.outputPklFile, 'wb') as fp:
        pickle.dump(pipeline, fp)
    verbose("Trained model written to '%s'\n" % \
                                    os.path.abspath(args.outputPklFile))

    if args.featureFile:
        writeFeaturesFile(pipeline, args.featureFile)
#-----------------------
def writeFeaturesFile(pipeline, fileName):
    """Write a report of the top weighted features to fileName.

    Assumes the Pipeline has 'vectorizer' and 'classifier' named steps.
    If the classifier exposes no feature weights/coefficients, just
    reports that and writes nothing.
    """
    vectorizer = pipeline.named_steps['vectorizer']
    classifier = pipeline.named_steps['classifier']

    orderedFeatures = skHelper.getOrderedFeatures(vectorizer, classifier)
    if len(orderedFeatures) == 0:
        verbose("No feature weights/coefs are available for this Pipeline\n")
    else:
        # use a context manager so the file is closed (and flushed) reliably
        with open(fileName, 'w') as fp:
            fp.write(trl.getTopFeaturesReport(orderedFeatures,
                                                        args.numTopFeatures))
        verbose("Top weighted features written to '%s'\n" % \
                                                    os.path.abspath(fileName))
#-----------------------
def getTrainingSet(sampleObjType):
    """Read every input file (or stdin for '-') into one sample set.

    Returns a ClassifiedSampleSet holding all the training samples.
    """
    sampleSet = sampleDataLib.ClassifiedSampleSet(sampleObjType=sampleObjType)

    for inputFile in args.inputFiles:
        verbose("Reading '%s' ...\n" % os.path.abspath(inputFile))
        source = sys.stdin if inputFile == '-' else inputFile
        sampleSet.read(source)

    verbose("Sample type '%s'\n" % sampleSet.getSampleObjType().__name__ )
    verbose("...done %d total documents.\n" % sampleSet.getNumSamples())
    return sampleSet
#-----------------------
def getPipeline():
    """Load the Pipeline named by args.pipelineFile.

    A '.py' file must define 'pipeline' (a Pipeline or a list of
    Pipelines, in which case the 0th is used). A '.pkl' file is
    unpickled directly. Any other extension exits with status 5.

    Returns the Pipeline with its classifier's verbosity silenced.
    """
    fileName = args.pipelineFile
    ext = os.path.splitext(fileName)[1]

    if ext == '.py':
        verbose("Importing model source file '%s'\n" % \
                                                os.path.abspath(fileName))
        pipeline = utilsLib.importPyFile(fileName).pipeline
        if isinstance(pipeline, list):      # list of Pipelines: train the 0th
            pipeline = pipeline[0]

    elif ext == '.pkl':
        verbose("Loading model '%s'\n" % os.path.abspath(fileName))
        with open(fileName, 'rb') as fp:
            pipeline = pickle.load(fp)
        verbose("...done\n")

    else:
        sys.stderr.write("Invalid model file extension: '%s'\n" % ext)
        exit(5)

    # Some classifiers write process info to stdout, messing up our output,
    # so force the classifier's own verbosity off.
    # NOTE(review): "classifier__verbose" assumes the model is a Pipeline
    # with a step named "classifier" whose estimator takes a 'verbose'
    # param. If that assumption ever fails, set_params() will raise and
    # more logic would be needed to find the right parameter name.
    pipeline.set_params(classifier__verbose=0)

    return pipeline
#-----------------------
def verbose(text):
    """Write text to stdout (flushed immediately) unless --quiet was given."""
    if not args.verbose:
        return
    sys.stdout.write(text)
    sys.stdout.flush()
#-----------------------
if __name__ == "__main__": main()