Skip to content

Commit

Permalink
added ability to turn stemming on/off
Browse files Browse the repository at this point in the history
by using the -t or --stem flag, it's now possible to turn stemming on or off
  • Loading branch information
zbsimon committed May 26, 2015
1 parent 26ef3b7 commit 7d03be0
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 8 deletions.
12 changes: 7 additions & 5 deletions claims_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,15 @@


def map_claims(input=sys.stdin, output=sys.stdout,
               kv_delim=INPUT_KV_DELIM, stop_words_file=None, stem=True):
    """
    reads lines of the form <patent id><kv_delim><claims text> from
    input, cleans the claims text of each line with mru.clean_text and
    emits one record per line through mru.reducer_emit. The emitted key
    is {'filename': patent_id} and the emitted value is
    {'words': [<cleaned words>]}.

    stop_words_file, when not None, is the path to a JSON file; its
    decoded contents are handed to mru.clean_text as the stop word list.
    When it is None, clean_text is called without a stop word list.
    stem is forwarded to mru.clean_text to turn stemming on or off.
    """
    # The stop words are the same for every line, so read and parse the
    # file once up front instead of once per input line; the with-block
    # also guarantees the file handle is closed.
    stop_words = None
    if stop_words_file is not None:
        with open(stop_words_file) as stop_words_fh:
            stop_words = json.load(stop_words_fh)

    for line in input:
        key, value = line.strip().split(kv_delim)
        patent_id = key.strip()
        if stop_words_file is not None:
            contents = mru.clean_text(value, stop_words, stem)
        else:
            contents = mru.clean_text(value, stem=stem)
        key = {'filename': patent_id}
        # materialize the cleaned words so they can be serialized
        contents = {'words': list(contents)}
        mru.reducer_emit(key, contents, output)
Expand All @@ -27,9 +27,11 @@ def map_claims(input=sys.stdin, output=sys.stdout,
def str_to_bool(value):
    """
    converts a command line value such as 'true', 'False', 'yes', 'no',
    '1' or '0' (case-insensitive, surrounding whitespace ignored) to a
    bool. Values that are already bools are returned unchanged.

    argparse's type=bool cannot be used for this: bool('False') is True
    because every non-empty string is truthy.
    """
    if isinstance(value, bool):
        return value
    normalized = str(value).strip().lower()
    if normalized in ('true', 't', 'yes', 'y', 'on', '1'):
        return True
    if normalized in ('false', 'f', 'no', 'n', 'off', '0'):
        return False
    raise argparse.ArgumentTypeError(
        'expected a boolean value (true/false), got {!r}'.format(value))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', '--stop-words', dest='stop_words_file')
    parser.add_argument('-t', '--stem', dest='stem', type=str_to_bool,
                        default=True,
                        help='whether to stem words: true or false '
                             '(default: true)')
    args = parser.parse_args()
    # stop_words_file defaults to None, which map_claims treats as
    # "no custom stop word list", so a single call covers both cases
    map_claims(stop_words_file=args.stop_words_file, stem=args.stem)
5 changes: 3 additions & 2 deletions map_reduce_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ def run_map_reduce_job(mapper, reducer, input_dir, output_dir, files='',
raise MapReduceError(err_msg.format(mapper, reducer), e)


def clean_text(text, stop_word_list=stop_words):
def clean_text(text, stop_word_list=stop_words, stem=True):
"""
returns a 'cleaned' version of text by filtering out all words
that don't contain strictly alphabetic characters, converting
Expand All @@ -163,7 +163,8 @@ def clean_text(text, stop_word_list=stop_words):
is_alpha = re.compile('^[a-z]+$')
result = filter(lambda word: is_alpha.match(word), result)

result = [stemmer.stem(word) for word in result]
if stem:
result = [stemmer.stem(word) for word in result]
return filter(lambda word: word not in stop_word_list, result)


Expand Down
6 changes: 5 additions & 1 deletion mapred_tfidf.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,12 +75,16 @@ def get_output_dir(sub_dir=''):
parser.add_argument('-s', '--stop-words', default=None,
help=stop_words_help, dest='stop_words')

stemmer_help = ('if true, use nltk PorterStemmer to stem ngrams; '
                'accepts true/false, yes/no or 1/0 (default: true)')


def _parse_stem_flag(raw):
    # type=bool would treat every non-empty string, including 'False',
    # as True, so the text is interpreted explicitly. A ValueError is
    # reported by argparse as an invalid argument value.
    normalized = str(raw).strip().lower()
    if normalized in ('true', 't', 'yes', 'y', 'on', '1'):
        return True
    if normalized in ('false', 'f', 'no', 'n', 'off', '0'):
        return False
    raise ValueError('not a boolean value: {!r}'.format(raw))


parser.add_argument('-t', '--stem', type=_parse_stem_flag, dest='stem',
                    default=True, help=stemmer_help)

args = parser.parse_args()
input_dir = args.input_dir
output_dir = args.output_dir
force = args.force
n = args.n
stop_words = args.stop_words
stem = args.stem
directories.append(output_dir)

# whether or not we're working in hdfs
Expand Down Expand Up @@ -125,7 +129,7 @@ def get_output_dir(sub_dir=''):

# do an MR job to clean/stem file contents
# contents_mapper_cmd = 'contents_mapper.py'
contents_mapper_cmd = 'claims_mapper.py'
contents_mapper_cmd = 'claims_mapper.py -t {}'.format(stem)
if stop_words is not None:
contents_mapper_cmd += ' -s {}'.format(stop_words)
# need to tell yarn to send stop words file using -files
Expand Down

0 comments on commit 7d03be0

Please sign in to comment.