Skip to content

Commit cb0502d

Browse files
pyspark rdd to dataframe
1 parent 89e450c commit cb0502d

File tree

1 file changed

+45
-0
lines changed

1 file changed

+45
-0
lines changed

pyspark-rdd-to-dataframe.py

+45
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-
"""
Created on Sat Jan 11 19:38:27 2020

@author: sparkbyexamples.com

Build a PySpark DataFrame from the same department data in four ways:

1. ``rdd.toDF()`` with no column names.
2. ``rdd.toDF(deptColumns)`` with a list of column names.
3. ``spark.createDataFrame(data=dept, schema=deptColumns)`` with the same
   list of column names.
4. ``spark.createDataFrame(data=dept, schema=deptSchema)`` with an explicit
   ``StructType`` schema.

After each construction the script prints the DataFrame's schema and rows.
"""

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

# Sample rows: (department name, department id).
dept = [
    ("Finance", 10),
    ("Marketing", 20),
    ("Sales", 30),
    ("IT", 40),
]
rdd = spark.sparkContext.parallelize(dept)

# 1. RDD -> DataFrame without supplying column names.
df = rdd.toDF()
df.printSchema()
df.show(truncate=False)

# 2. RDD -> DataFrame with explicit column names.
deptColumns = ["dept_name", "dept_id"]
df2 = rdd.toDF(deptColumns)
df2.printSchema()
df2.show(truncate=False)

# 3. Local list -> DataFrame, passing the column names as the schema.
deptDF = spark.createDataFrame(data=dept, schema=deptColumns)
deptDF.printSchema()
deptDF.show(truncate=False)

# 4. Local list -> DataFrame with an explicit StructType schema.
# dept_id is declared as IntegerType so the declared schema matches the int
# values stored in `dept` above.
deptSchema = StructType([
    StructField('dept_name', StringType(), True),
    StructField('dept_id', IntegerType(), True),
])

deptDF1 = spark.createDataFrame(data=dept, schema=deptSchema)
deptDF1.printSchema()
deptDF1.show(truncate=False)

0 commit comments

Comments
 (0)