-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathrdd_show_ex.py
26 lines (23 loc) · 1.13 KB
/
rdd_show_ex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
## simple program to parse text and map the text into RDD
## parse lines of text based on ","
## example to run with taxi data, assume the data is in text
# spark-submit --master yarn rdd_show_ex.py --master yarn --input_file hdfs:///opt/data/rawdata/nytaxi2019-1000.csv --output_dir hdfs:///tmp/spark7
import argparse
from pyspark import SparkConf, SparkContext
parser = argparse.ArgumentParser()
parser.add_argument("--master", help="Spark Master")
parser.add_argument("--input_file", help="Spark Master")
parser.add_argument("--output_dir", help="output dir")
args = parser.parse_args()
conf = SparkConf().setAppName("cse4640-rddshow").setMaster(args.master)
sc = SparkContext(conf=conf)
##modify the input data but we assume that the input data is csv
## however, this program considers the input file as a text file
rdd = sc.textFile(args.input_file)
## if there is a header we can filter it otherwise comment two lines
csvheader = rdd.first()
rdd = rdd.filter(lambda csventry: csventry != csvheader)
## using map to parse text entries as csv
rdd = rdd.map(lambda csventry: csventry.split(","))
rdd.repartition(1)
rdd.saveAsTextFile(args.output_dir)