spark1_6_0_kudu.py
# Dated : Aug 25, 2017
# About : Sample PySpark (1.6.0) code to count the number of rows in a Kudu table
# PySpark Version : 1.6.0
# Kudu Version : 1.2.0
# Coder : Ankit Sarraf
import ConfigParser
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SQLContext
# Initialize the configuration
conf = (SparkConf().setMaster("yarn-client").setAppName("KuduSpark_1_6_0"))
# Initialize the SparkContext using the above conf
sc = SparkContext(conf = conf)
# Initialize the SQLContext using the above SparkContext
sqlContext = SQLContext(sc)
# Use the ConfigParser
configParser = ConfigParser.ConfigParser()
# Load the file that contains the Configuration Parameters
configParser.read('kudu_configuration.conf')
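# A minimal sketch of what kudu_configuration.conf is assumed to look like.
# The [Kudu] section and the KuduMaster/KuduTable keys come from the get()
# calls below; the host and table values are placeholders, not from the
# original script:
#
#   [Kudu]
#   KuduMaster = kudu-master-host:7051
#   KuduTable = my_kudu_table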
# Determine the kudu_master as provided in the Config File
kudu_master = configParser.get('Kudu', 'KuduMaster')
# Provide the table to be read
kudu_table = configParser.get('Kudu', 'KuduTable')
# Display the data read from the config file
print('KUDU MASTER: ' + kudu_master)
print('KUDU TABLE: ' + kudu_table)
# Load the Kudu table into a DataFrame
kudu_events_df = sqlContext.read.format('org.apache.kudu.spark.kudu'). \
    option('kudu.master', kudu_master). \
    option('kudu.table', kudu_table). \
    load()
# Display the count of the rows in the table
print('My Count: ' + str(kudu_events_df.count()))
# Register the DataFrame as a temporary table
kudu_events_df.registerTempTable("myTab")
# Query the Temporary table using SparkSQL Queries
sqlContext.sql("SELECT count(*) FROM myTab").show()
# Sample output:
# +----+
# | _c0|
# +----+
# |1055|
# +----+
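# A possible way to submit this script (an assumption, not part of the original
# file): the Kudu-Spark connector matching Kudu 1.2.0 / Scala 2.10 must be on the
# driver and executor classpaths, e.g. via --packages or --jars. The package
# coordinates below are an assumed example:
#
#   spark-submit --master yarn-client \
#       --packages org.apache.kudu:kudu-spark_2.10:1.2.0 \
#       spark1_6_0_kudu.py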