Skip to content

Commit

Permalink
basic mapper
Browse files Browse the repository at this point in the history
  • Loading branch information
bmyerz committed Aug 24, 2014
0 parents commit 6e1d744
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 0 deletions.
14 changes: 14 additions & 0 deletions hadoop-tag.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@

# Launch the NLTK part-of-speech tagger as a map-only Hadoop streaming job.
#
# Usage: hadoop-tag.sh <corpus>
#   <corpus>  input path, relative to /sampa/home/bdmyers/nltk-apps
#
# Results are written to /sampa/home/bdmyers/nltk-apps/out, which is wiped
# at the start of every run.

# Presumably defines HADOOP_HOME and RELATIVE_PATH_JAR used below -- confirm
# against hadoop-streaming-env.sh.
source ./hadoop-streaming-env.sh

# Fail fast with a usage message when no corpus is given; otherwise the job
# would silently take the whole nltk-apps directory as its input.
corpus=${1:?"usage: $0 <corpus>"}

output=/sampa/home/bdmyers/nltk-apps/out
# Clear the results of any previous run before starting a new one.
rm -rf -- "$output"

# mapred.reduce.tasks=0 makes this a map-only job: mapper output is the
# final output. All expansions are quoted so paths containing spaces or glob
# characters reach hadoop as single, unmodified arguments.
"$HADOOP_HOME/bin/hadoop" jar "$HADOOP_HOME/$RELATIVE_PATH_JAR" \
    -D mapred.reduce.tasks=0 \
    -verbose \
    -input "/sampa/home/bdmyers/nltk-apps/$corpus" \
    -output "$output" \
    -mapper /sampa/home/bdmyers/nltk-apps/tagger_map.py
17 changes: 17 additions & 0 deletions tagger_map.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/sampa/home/bdmyers/escience/python/install/bin/python
import sys
import os
import nltk
import re

# Sentence boundaries: a period or a semicolon.
SENTENCE_DELIMS = re.compile(r'[.;]')

# Hadoop streaming exports job configuration to the mapper's environment with
# dots turned into underscores. The input-file property is
# `mapreduce.map.input.file` on current releases and `map.input.file` on
# older ones, so both spellings are tried, newest first.
INPUT_FILE_ENV_KEYS = ("mapreduce_map_input_file", "map_input_file")


def input_document_name(environ):
    """Return the name of the input file this map task is reading.

    Args:
        environ: mapping of environment variables (normally ``os.environ``).

    Returns:
        The value of the first variable in ``INPUT_FILE_ENV_KEYS`` that is
        present in ``environ``.

    Raises:
        KeyError: if none of the variables is set, e.g. when the script is
            run outside of Hadoop streaming.
    """
    for key in INPUT_FILE_ENV_KEYS:
        if key in environ:
            return environ[key]
    raise KeyError(INPUT_FILE_ENV_KEYS[0])


def split_sentences(text):
    """Split ``text`` into sentences on periods and semicolons.

    Newlines are turned into spaces first so that a sentence wrapped over
    several lines stays in one piece. Segments that are empty or contain
    only whitespace are dropped. This removes the empty tail after a final
    delimiter without losing the last sentence when the text does not end
    in a delimiter. The delimiters themselves are not included in the
    returned sentences, and surrounding whitespace is left untouched.

    Args:
        text: the raw document text.

    Returns:
        A list of non-blank sentence strings, in document order.
    """
    flattened = text.replace('\n', ' ')
    return [part for part in SENTENCE_DELIMS.split(flattened) if part.strip()]


def main():
    """Tag every sentence read from stdin.

    Emits one ``<document name>TAB<tagged tokens>`` line per sentence on
    stdout.
    """
    # Resolve the document name before consuming stdin so that a
    # misconfigured environment fails immediately.
    docname = input_document_name(os.environ)

    for sentence in split_sentences(sys.stdin.read()):
        tokens = nltk.word_tokenize(sentence)
        tagged = nltk.pos_tag(tokens)
        # Single-argument, parenthesised print behaves the same on
        # Python 2 and Python 3.
        print("%s\t%s" % (docname, tagged))


if __name__ == "__main__":
    main()

0 comments on commit 6e1d744

Please sign in to comment.