-
Notifications
You must be signed in to change notification settings - Fork 1
/
plotsummary.py
executable file
·183 lines (140 loc) · 6.54 KB
/
plotsummary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#!/usr/bin/env python
# Plots various graphs for a series of plaintext files in a directory
# Copyright Martin Paul Eve 2015
"""plotsummary: Plots various graphs for a series of plaintext files in a directory
Usage:
plotsummary.py single <directory> <term_file> [options]
plotsummary.py hist <directory> <term_file> [options]
plotsummary.py group <directory> <term_file> <term_name> <second_term_file> <second_term_name> [options]
plotsummary.py overlap <directory> <first_term> <second_term> [options]
plotsummary.py rawcount <directory> <term_file> [options]
plotsummary.py search <directory> <term> <count> [options]
plotsummary.py (-h | --help)
plotsummary.py --version
Options:
-c, --caption <caption> Specify the output caption
-d, --debug Enable debug output
-h --help Show this screen.
-n, --nostem <nostem> Specify a path containing words that should not be stemmed
--version Show version.
-w, --words <words> Specify the word frequency to sample (default: 5000)
"""
import os
from os import listdir
from os.path import isfile, join
from text import Text
import re
from debug import Debug, Debuggable
from docopt import docopt
from interactive import Interactive
import subprocess
class KernelDensity(Debuggable):
    """Command-line driver that processes every ``.txt`` file in a directory.

    The command line is parsed with docopt against the module docstring.
    Each text file is loaded through ``Text.from_file``; depending on the
    chosen sub-command a graph is saved as a ``.png`` next to the source
    file, or (for ``search``) a list of correlated terms is printed.
    """

    # Sub-commands, in the order the original dispatch tested them.
    _ACTIONS = ('single', 'group', 'hist', 'rawcount', 'overlap', 'search')

    def __init__(self):
        """Parse the command line and copy its values onto the instance.

        Raises:
            IOError/OSError: if a term file cannot be opened.
            ValueError: if ``<count>`` or ``--words`` is not an integer.
        """
        # read command line arguments
        self.args = self.read_command_line()

        # absolute first priority is to initialize debugger so that anything
        # triggered here can be logged
        self.debug = Debug()

        Debuggable.__init__(self, 'plotsummary')

        self.in_dir = self.args['<directory>']

        # The terms come either from a file (one term per line) or directly
        # from positional arguments, depending on the sub-command.
        if self.args['<term_file>']:
            self.term_file = self.args['<term_file>']
            self.terms = self._read_terms(self.term_file)
        elif self.args["<first_term>"] and self.args["<second_term>"]:
            self.terms = [self.args["<first_term>"], self.args["<second_term>"]]
        elif self.args["<term>"]:
            self.terms = [self.args["<term>"]]

        if self.args["<count>"]:
            # Maximum number of correlated terms printed by ``search``.
            self.max = int(self.args["<count>"])

        # Directory that contains this script.
        self.dir = os.path.dirname(os.path.abspath(__file__))

        if self.args['--debug']:
            self.debug.enable_debug()
            self.debug.enable_prompt(Interactive(self.args['--debug']))

        self.caption = self.args['--caption'] if self.args['--caption'] else 'Term Plot'
        self.nostem = self.args['--nostem'] if self.args['--nostem'] else None

        if self.args['group']:
            # ``group`` needs a second term list and a display name per list.
            self.second_term_file = self.args['<second_term_file>']
            self.term_name = self.args['<term_name>']
            self.second_term_name = self.args['<second_term_name>']
            self.second_terms = self._read_terms(self.second_term_file)

        for command in self._ACTIONS:
            if self.args[command]:
                self.action = command
                break

        self.words = int(self.args['--words']) if self.args['--words'] else 5000

    @staticmethod
    def read_command_line():
        """Return the docopt argument dictionary for the module usage text."""
        return docopt(__doc__, version='kernel-density-estimation v0.1')

    @staticmethod
    def _read_terms(path):
        """Return the stripped, lower-cased lines of the term file at *path*.

        The file is closed before returning instead of being left to the
        garbage collector.
        """
        with open(path) as handle:
            return [line.strip().lower() for line in handle]

    def _report_stemming(self, term, nostem_words):
        """Emit a debug line saying whether *term* will be stemmed.

        A term is reported as stemmed only when it is not listed in
        *nostem_words* and its stem differs from the term itself.
        """
        # Words on the no-stem list are never passed to the stemmer.
        stemmed = term if term in nostem_words else Text.show_stem(term)

        if stemmed != term:
            self.debug.print_debug(self, u'{0} will be stemmed to {1}'.format(term, stemmed))
        else:
            self.debug.print_debug(self, u'{0} will not be stemmed'.format(term))

    def run(self):
        """Process every ``.txt`` file found directly inside the input directory.

        With ``--debug`` the stemming outcome of every term is reported first.
        """
        if self.args['--debug']:
            if self.nostem:
                with open(self.nostem) as f:
                    nostem_words = set(f.read().splitlines())
            else:
                nostem_words = set()

            for term in self.terms:
                self._report_stemming(term, nostem_words)

            if self.action == 'group':
                # Same rule as the first group, so both reports agree.
                for term in self.second_terms:
                    self._report_stemming(term, nostem_words)

        # listdir() order is arbitrary; sort for a reproducible run order.
        for file_name in sorted(listdir(self.in_dir)):
            if file_name.endswith(".txt"):
                self.plot(file_name)

    def plot(self, file_name):
        """Load *file_name* from the input directory and run the chosen action.

        For every action except ``search`` the resulting graph is written to
        the input directory under the same base name with a ``.png``
        extension. ``search`` only prints correlated terms.

        Raises:
            ValueError: if ``self.action`` is not a known sub-command.
        """
        self.debug.print_debug(self, u'Loading ' + file_name)

        textplot = Text.from_file(join(self.in_dir, file_name), self.debug, nostem=self.nostem)

        self.debug.print_debug(self, u'Plotting ' + file_name)

        if self.action == 'search':
            self._print_correlated_terms(textplot)
            return

        if self.action == 'single':
            graph = textplot.plot_terms(self.terms, self.caption)
        elif self.action == 'group':
            graph = textplot.plot_terms_two_groups(self.terms, self.term_name, self.second_terms,
                                                   self.second_term_name, self.caption)
        elif self.action == 'hist':
            graph = textplot.plot_terms_histogram(self.terms, self.caption, self.words)
        elif self.action == 'rawcount':
            graph = textplot.plot_terms_raw_count(self.terms, self.caption, self.words)
        elif self.action == 'overlap':
            graph = textplot.plot_kde_overlap(self.terms)
        else:
            raise ValueError('Unknown action: {0!r}'.format(self.action))

        # Swap only the extension; str.replace would also rewrite any '.txt'
        # that appears earlier in the file name.
        output_name = os.path.splitext(file_name)[0] + '.png'

        self.debug.print_debug(self, u'Saving ' + output_name)

        try:
            graph.savefig(join(self.in_dir, output_name))
        finally:
            # Close the graph even when saving fails.
            graph.close()

    def _print_correlated_terms(self, textplot):
        """Print at most ``self.max`` terms correlated with the search term.

        Terms with a single occurrence, and the stem of the search term
        itself, are skipped and do not count towards the limit.
        """
        anchor = self.terms[0]
        newterms = textplot.anchored_scores(anchor)

        self.debug.print_(self, u'Top {0} correlated terms (with more than one occurrence) for {1}: '.format(
            self.max, anchor))

        anchor_stem = textplot.stem(anchor)
        shown = 0

        for item in newterms:
            # Stop after exactly ``self.max`` printed items.
            if shown >= self.max:
                break

            if len(textplot.terms[item]) > 1 and item != anchor_stem:
                self.debug.print_(self, item)
                shown += 1
def main():
    """Script entry point: build the driver and process the directory."""
    # Constructing KernelDensity reads the command line; run() does the work.
    KernelDensity().run()


# Only run when executed as a script, not when imported.
if __name__ == '__main__':
    main()