-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathranker.py
482 lines (356 loc) · 15.3 KB
/
ranker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
#!/usr/bin/python
# The following section is necessary in order for the DocOpt command line argument parsing module to operate
# Do not change the following syntax unless you are familiar with DocOpt specifications.
"""Resume Ranker reviews resumes and assigns a score based on percentile matching file content. Valid
file types include .docx, .pdf, and .txt. Any other filetype will be skipped automatically.
Usage:
ranker.py [-h | --help]
ranker.py [-v | --verbose] [--rename=<rename>] --dir=<dir> --keyword-file=<keywordfile> [--output-type=<outputtype> --output-file=<outputfile>]
ranker.py --version
Options:
-h,--help : show this help message.
-v,--verbose : display more text output.
--rename : explicitly turn on/off the file renamer with yes/no option [default: yes].
--dir : set the directory for resume review.
--keyword-file : set the file path of the keyword file used in ranking each resume file contents.
--output-type : set the output file type (csv or txt). Must be used in conjunction with --output-file.
--output-file : set the directory and filename of the output file, including extension. Must be used in conjunction with --output-type.
--version : show version.
"""
__scriptname__ = "Resume Ranker"
__author__ = "Corey Farmer"
__copyright__ = "Copyright 2017"
__credits__ = []
__license__ = "GPL"
__version__ = "1.0.0"
__maintainer__ = "Corey Farmer"
__email__ = "[email protected]"
__status__ = "Development"
import os, PyPDF2, csv
from docopt import docopt
from os import sys, path
from docx import Document
from docx.document import Document as _Document
from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph
class Environment:
    """
    Validate the paths the script depends on before any real work starts.

    Every check returns ``self`` so the checks can be chained, and raises
    ``Exception`` with a user-facing message when the check fails.
    --
    :param string|None dir - string containing the directory for iteration containing files
    :param string|None keyword_file - string containing full file path to location of keyword file
    :return Object Environment
    """

    def __init__(self, dir=None, keyword_file=None):
        self.dir = dir
        self.keyword_file = keyword_file

    def is_directory_given(self):
        """
        Ensure the user provided a directory to iterate through.
        --
        :return Object Environment
        :raises Exception - when no directory was supplied
        """
        if not self.dir:
            raise Exception("A directory must be provided. See --help option for script usage details.")
        return self

    def is_valid_directory(self):
        """
        Ensure the supplied directory exists. A missing/empty directory value
        is accepted here; use is_directory_given() to require one.
        --
        :return Object Environment
        :raises Exception - when a directory was supplied but does not exist
        """
        # The test must be on self.dir: a bare `dir` is the builtin function,
        # which is always truthy and would let None reach os.path.isdir().
        if self.dir and not os.path.isdir(self.dir):
            raise Exception("The directory provided is not valid or found.")
        return self

    def is_valid_keyword_file(self):
        """
        Ensure the keyword file was supplied and exists on disk.
        --
        :return Object Environment
        :raises Exception - when the keyword file is missing or not a file
        """
        # Guard against None first so the caller gets the friendly message
        # rather than a TypeError from os.path.isfile(None).
        if not self.keyword_file or not os.path.isfile(self.keyword_file):
            raise Exception("The keyword file path provided is not valid or does not exist.")
        return self
class Parsing:
    """
    Parse a single file into plain text.

    The file is parsed at construction time; the extracted text (or None when
    the file is missing, unsupported, or empty) is stored in ``self.results``.
    --
    :param string path - string containing the full file path of the file to open and parse into variable text
    :return Object Parsing
    """

    def __init__(self, path):
        self.file = path
        self.results = self.parse_file()

    def parse_file(self):
        """
        Check the file extension against known/valid extensions and call
        the associated parsing method.
        --
        :return string|None - stripped file text, or None when the path is not
                              a file, the extension is unsupported, or the
                              file has no text
        """
        # nothing to parse if the path does not point at a real file
        if not os.path.isfile(self.file):
            return None

        # the extension decides the parsing methodology; compare lower-cased
        # so "Resume.PDF" is handled the same as "resume.pdf"
        extension = os.path.splitext(self.file)[1].lower()
        parsers = {
            '.docx': self.parse_word_doc,
            '.pdf': self.parse_pdf_doc,
            '.txt': self.parse_txt_doc,
        }

        # unsupported types are skipped (None) instead of raising KeyError
        parser = parsers.get(extension)
        return parser() if parser else None

    def parse_word_doc(self):
        """
        Open a word document filetype and parse contents to string variable
        for matching comparison.
        --
        :return string|None - concatenated paragraph and table-cell text
        """
        def iter_block_items(parent):
            """
            Generate a reference to each paragraph and table child within *parent*,
            in document order. Each returned value is an instance of either Table or
            Paragraph. *parent* would most commonly be a reference to a main
            Document object, but also works for a _Cell object, which itself can
            contain paragraphs and tables.
            """
            if isinstance(parent, _Document):
                parent_elm = parent.element.body
            elif isinstance(parent, _Cell):
                parent_elm = parent._tc
            else:
                # without this branch parent_elm would be unbound below
                raise ValueError("unsupported parent type: %r" % type(parent))

            for child in parent_elm.iterchildren():
                if isinstance(child, CT_P):
                    yield Paragraph(child, parent)
                elif isinstance(child, CT_Tbl):
                    yield Table(child, parent)

        # collect the text fragments in document order
        pieces = []
        document = Document(self.file)

        for block in iter_block_items(document):
            if isinstance(block, Paragraph):
                # paragraphs expose their text directly
                pieces.append(block.text)
            elif isinstance(block, Table):
                # tables must be walked row by row, cell by cell
                for row in block.rows:
                    for cell in row.cells:
                        pieces.append(cell.text)

        # fragments are joined without a separator, as before
        return "".join(pieces).strip() or None

    def parse_pdf_doc(self):
        """
        Open a pdf document filetype and parse contents to string variable
        for matching comparison.
        --
        :return string|None - concatenated text of every page
        """
        # NOTE(review): PdfFileReader/extractText are the PyPDF2 1.x/2.x names;
        # newer releases expose PdfReader/extract_text - confirm the pinned version.
        # The context manager closes the handle even if the reader raises.
        with open(self.file, 'rb') as f:
            pdf = PyPDF2.PdfFileReader(f)
            doc_text = "".join(page.extractText() for page in pdf.pages)

        return doc_text.strip() or None

    def parse_txt_doc(self):
        """
        Open a text document filetype and parse contents to string variable
        for matching comparison.
        --
        :return string|None - stripped file contents
        """
        with open(self.file, 'r') as f:
            doc_text = f.read()

        return doc_text.strip() or None
class Rank:
    """
    Score a block of text against a list of keywords.

    A keyword entry may carry a weight using the "<keyword> *<integer>"
    syntax (for example "python *3"); entries without a weight count once.
    --
    :param list keyword_list - list containing each keyword found in keyword_file
    :return Object Rank
    """

    def __init__(self, keyword_list):
        self.keywords = keyword_list
        self.total_keys = len(self.keywords)

    def get_rank(self, text):
        """
        Get the rank of the file based on the keywords found in the file
        contents. Matching is a case-insensitive substring search.
        --
        :param string text - the text to search
        :return tuple (rank, count) - rank is the percentage (0-100, two
                decimals) of usable keywords present in the text; count is the
                number of keyword occurrences, each scaled by its multiplier
        """
        # upper-case the text once instead of once per keyword
        haystack = text.upper()
        matched = count = usable = 0

        for entry in self.keywords:
            keyword, multiplier = self.get_multiplier(entry)

            # a blank keyword is a substring of everything and would add
            # len(text) + 1 to the count, so it is ignored entirely
            if not keyword.strip():
                continue
            usable += 1

            occurrences = haystack.count(keyword.upper())
            if occurrences:
                matched += 1
                count += occurrences * multiplier

        # compute the percentage from the totals; summing a pre-rounded
        # per-keyword share drifts (3 of 3 keywords gave 99.99, not 100)
        rank = round(100.0 * matched / usable, 2) if usable else 0.0

        return (rank, count)

    def get_multiplier(self, keyword):
        """
        Split the keyword on the multiplier delimiter if found. Otherwise
        provide 1 for the multiplier.
        --
        :param string keyword - raw keyword entry, optionally "<keyword> *<integer>"
        :return tuple (keyword, multiplier) - multiplier is always an int
        """
        # split on the LAST ' *' so keywords that themselves contain ' *'
        # do not break the unpacking
        head, delimiter, tail = keyword.rpartition(' *')

        if delimiter:
            try:
                return (head, int(tail))
            except ValueError:
                # not a numeric weight; treat the whole entry as the keyword
                pass

        return (keyword, 1)
class File:
    """
    Hold every method related to file interaction: loading the keyword list,
    gathering the list of valid files, ranking them, and writing/renaming
    the results. The pipeline methods return ``self`` for chaining.
    --
    :param string|None dir - string containing the directory for iteration containing files
    :param string keyword_file - string containing full file path to location of keyword file
    :return Object File
    """

    def __init__(self, dir=None, keyword_file=None):
        self.dir = dir
        self.keyword_file = keyword_file
        self.keywords_list = []
        self.file_buf = []
        self.files = None

    def get_keyword_list(self, keyword_file=None):
        """
        Create the list of keywords from the keywords file defined by user.
        Each line is one keyword; surrounding whitespace is removed and blank
        lines are ignored.
        --
        :param string keyword_file|None - string containing full file path to location of keyword file
        :return Object File
        :raises Exception - when the file contains no keywords
        """
        # allow keyword file override
        self.keyword_file = keyword_file or self.keyword_file

        with open(self.keyword_file, 'r') as f:
            stripped = [line.strip() for line in f]

        # a blank line is not a keyword; keeping it would produce an empty
        # search term downstream
        keywords = [word for word in stripped if word]

        if not keywords:
            raise Exception("No keywords found for ranking, in %s." % self.keyword_file)

        self.keywords_list = keywords

        # return self for method chaining
        return self

    def get_files(self, valid_types):
        """
        Get a list of valid files found in iteration directory. The keyword
        file itself and names containing "~$" are excluded.
        --
        :param list valid_types - list containing valid file extensions for parsing.
        :return Object File
        :raises Exception - when the directory holds no valid files
        """
        keyword_name = os.path.basename(self.keyword_file) if self.keyword_file else None

        # sorted so the result does not depend on os.listdir() ordering
        self.files = sorted(
            f for f in os.listdir(self.dir)
            if os.path.isfile(os.path.join(self.dir, f))
            and os.path.splitext(f)[1] in valid_types
            and f != keyword_name
            and "~$" not in f
        )

        # throw error if no valid files are found in directory
        if not self.files:
            raise Exception("The directory provided has no valid files. Valid types include: .docx, .pdf, .txt")

        # return self for method chaining
        return self

    @staticmethod
    def _strip_rank_prefix(name):
        """Remove a leading "<percentile>% [<count>] - " prefix left by an earlier run."""
        import re

        # only a prefix in the exact shape this script writes is removed, so a
        # name such as "Resume [v2] - John.pdf" is left intact
        stripped = re.sub(r'^(?:-?\d+(?:\.\d+)?% \[-?\d+\] - )+', '', name)
        return stripped or name

    def file_iterator(self):
        """
        Parse and rank every file gathered by get_files(), appending one
        record per file with parseable text to ``self.file_buf``.
        --
        :return Object File
        """
        # the keyword list does not change between files, so build the ranker once
        ranker = Rank(self.keywords_list)

        for f in self.files:
            # os.path.join copes with a trailing separator on self.dir, so no
            # manual stripping is needed (stripping turned "/" into "")
            full_path = os.path.join(self.dir, f)

            parsed = Parsing(full_path)

            # files with no extractable text are skipped
            if not parsed.results:
                continue

            # get_rank returns a tuple of (rank, total_count)
            rank, total_count = ranker.get_rank(parsed.results)

            # add the file information to the file buffer to be used for the last iteration
            self.file_buf.append({
                'orig_path': full_path,
                'orig_name': self._strip_rank_prefix(f),
                'dir': self.dir,
                'percent_rank': rank,
                'total_count': total_count,
            })

        # return self for method chaining
        return self

    def calc_percentile(self):
        """
        Sort the ranked files by total_count (descending) into ``self.files``
        and attach 'percentile' and 'new_name' to each record.
        --
        :return Object File
        """
        # always replace self.files with the ranked records (possibly empty) so
        # later steps never see the raw filename strings from get_files()
        self.files = sorted(self.file_buf, key=lambda k: k['total_count'], reverse=True)

        for d in self.files:
            # the first element always carries the highest count
            percentile = self.get_percentile(d, self.files[0])
            # set the new filename with percentile and count included in filename
            d['new_name'] = "%s%% [%s] - %s" % (percentile, d['total_count'], d['orig_name'])
            d['percentile'] = percentile

        # return self for method chaining
        return self

    def get_percentile(self, d, f):
        """
        Express d's total_count as a percentage of f's total_count.
        --
        :param dict d - record being scored
        :param dict f - reference record (the highest count)
        :return float - percentage rounded to two decimals; 0.0 when the
                        reference count is zero
        """
        top = float(f['total_count'])

        # no file matched any keyword: avoid dividing by zero
        if not top:
            return 0.0

        return round(float(d['total_count']) / top * 100, 2)

    def finish_output(self, output_type=None, output_file=None, rename=None, verbose=None):
        """
        Finally output the results in the preferred method specified by the user. Or defaulted to file renaming.
        --
        :param string|None output_type - user defined string containing the intended output file extension type from available values
        :param string|None output_file - user defined string of full path location and filename of output file
        :param bool|None rename - boolean flag defining whether original file names should be renamed with new filenames including percentile
        :param bool|None verbose - boolean flag defining whether output resulting filenames should be printed to console
        """
        wanted = output_type.upper() if output_type else None
        f = None
        writer = None

        try:
            # open the file pointer if output_file is specified
            if output_file:
                f = open(output_file, 'w')

                # if the file type is csv then initialize the csv writer object
                if wanted == 'CSV':
                    writer = csv.writer(f)
                    # write the header row to the csv file
                    writer.writerow(('Percentile', 'Total Count', 'File Name'))

            for d in self.files or []:
                if verbose:
                    # print the new filename to the console for the user
                    print(os.path.basename(d['new_name']))

                # only rename the files if the rename option is set to true
                if rename:
                    self.rename_file(d['orig_path'], os.path.join(d['dir'], d['new_name']))

                # write only when a file was actually opened; an output type
                # without an output file is silently ignored
                if f is not None and wanted == 'TXT':
                    f.write("%s\n" % d['new_name'])

                if writer is not None:
                    writer.writerow((d['percentile'], d['total_count'], d['new_name']))

        finally:
            # close the file pointer if it exists
            if f:
                f.close()

    def rename_file(self, opath, npath):
        """Rename the file at opath to npath (the name carrying the new rank)."""
        os.rename(opath, npath)
if __name__ == "__main__":
    # Docopt will check all arguments, and exit with the Usage string if they don't pass.
    # If you simply want to pass your own modules documentation then use __doc__,
    # otherwise, you would pass another docopt-friendly usage string here.
    # You could also pass your own arguments instead of sys.argv with: docopt(__doc__, argv=[your, args])
    docopt_args = docopt(__doc__, version='Resume Ranker 1.0.0')

    verbosity = docopt_args["-v"]
    rename = docopt_args["--rename"] or 'YES'
    # named `directory` so the builtin dir() is not shadowed
    directory = docopt_args["--dir"]
    keyword_file = docopt_args["--keyword-file"]
    output_type = docopt_args["--output-type"]
    output_file = docopt_args["--output-file"]

    ## -------------- CLI Argument Normalization ---------------- ##

    # normalize the rename option text to True/False
    rename = bool(rename) and rename.upper() == 'YES'

    # ensure the user input conforms to the available output types
    if output_type and output_type.upper() not in ['CSV', 'TXT']:
        raise Exception("Invalid value supplied to --output-type argument. See --help for details.")

    # the two output options are documented as only meaningful together
    if bool(output_type) != bool(output_file):
        raise Exception("--output-type and --output-file must be used together. See --help for details.")

    # ensure the output file lives in a valid directory; skipped when no output
    # file was requested. abspath makes a bare filename resolve to the current
    # working directory instead of an empty string.
    if output_file:
        Environment(path.dirname(path.abspath(output_file))).is_valid_directory()

    # instantiate the environment object where we will check that all environment paths and file names are valid
    # begin checking the paths and files, throw exception if something is not right
    e = Environment(directory, keyword_file).is_directory_given()\
                                            .is_valid_directory()\
                                            .is_valid_keyword_file()

    # set a list of the valid file types we can use, .docx, .pdf, .txt
    valid_types = ['.docx', '.txt', '.pdf']

    # finish_output() returns nothing, so the chain result is not kept
    File(e.dir, e.keyword_file).get_keyword_list()\
                               .get_files(valid_types)\
                               .file_iterator()\
                               .calc_percentile()\
                               .finish_output(output_type, output_file, rename, verbosity)