Skip to content

Commit

Permalink
great! we can read avro in mrjob now
Browse files Browse the repository at this point in the history
  • Loading branch information
nelsonje committed Jun 9, 2015
1 parent 54c6de7 commit ecd77af
Showing 1 changed file with 12 additions and 14 deletions.
26 changes: 12 additions & 14 deletions mrjob/AvroReader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,16 @@
# Test reading Avro-encoded files
#
# run like
# python AvroReader.py --runner=hadoop --no-output --setup='source /shared/patents/settings.sh' hdfs:///patents/output/tfidf_normalized/part-00000.avro --output-dir=avro-test1
# but that currently fails; the invocation below also passes the Avro jars via -libjars:
# python AvroReader.py --runner=hadoop --no-output --setup='source /shared/patents/settings.sh' hdfs:///patents/output/tfidf_normalized/part-00000.avro --hadoop-arg -libjars --hadoop-arg /shared/patents/nltk_hadoop/lib/avro-1.7.7.jar,/shared/patents/nltk_hadoop/lib/avro-mapred-1.7.7.jar --output-dir=avro-test


from mrjob.job import MRJob
from mrjob.step import MRStep

from mrjob.protocol import RawProtocol
from mrjob.protocol import RawValueProtocol
from mrjob.protocol import JSONProtocol
from mrjob.protocol import JSONValueProtocol

import sys
import string
Expand All @@ -23,21 +25,17 @@
# MRJob subclass used to test reading Avro-encoded input files.
#
# NOTE(review): this text comes from a rendered commit diff (the page reports
# 12 additions and 14 deletions) with no +/- markers and no indentation, so
# lines from before and after the commit appear together. INPUT_PROTOCOL and
# mapper are each defined twice below; presumably the first definitions are
# the removed version and the later ones the added version -- confirm against
# the actual file before relying on either.
class AvroReader(MRJob):

# Hadoop input format class used to read the job's input.
# NOTE(review): presumably this hands each Avro record to the job as JSON
# text, which would explain the JSON input protocols below -- confirm against
# the Avro documentation.
HADOOP_INPUT_FORMAT = "org.apache.avro.mapred.AvroAsTextInputFormat"
# NOTE(review): the output format is set to an *Input*Format class name;
# this looks like a mistake -- confirm.
HADOOP_OUTPUT_FORMAT = "org.apache.avro.mapred.AvroAsTextInputFormat"
#HADOOP_OUTPUT_FORMAT = "org.apache.avro.mapred.AvroTextOutputFormat"

# read initial input as tab-delimited key/value, and
# use JSON for intermediate and final output.
# NOTE(review): the comment above does not match the assignments that follow,
# which use a JSON protocol for input and RawProtocol for internal/output.
INPUT_PROTOCOL = JSONProtocol
INTERNAL_PROTOCOL = RawProtocol
OUTPUT_PROTOCOL = RawProtocol

# Identity mapper: emits each incoming key/value pair unchanged.
def mapper(self, key, value):
yield key, value

# Declares a single-step job consisting only of the mapper.
def steps(self):
return [
MRStep( mapper = self.mapper ),
]
# Second INPUT_PROTOCOL assignment (see the diff note at the top of the class).
INPUT_PROTOCOL = JSONValueProtocol
#INPUT_PROTOCOL = RawValueProtocol
#INTERNAL_PROTOCOL = RawProtocol
#OUTPUT_PROTOCOL = RawProtocol

# Second mapper definition: ignores the incoming key and emits the constant
# key "argh" with the record's nested 'key' -> 'ngram' entry as the value.
# NOTE(review): assumes every decoded record supports value['key']['ngram'];
# a record without those keys would raise -- confirm the input schema.
def mapper(self, _, value):
yield "argh", value['key']['ngram']


if __name__ == '__main__':
Expand Down

0 comments on commit ecd77af

Please sign in to comment.