Skip to content

Commit cd4e657

Browse files
pyspark union
1 parent 106101b commit cd4e657

File tree

2 files changed

+41
-1
lines changed

2 files changed

+41
-1
lines changed

Diff for: pyspark-collect.py

-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
deptDF.printSchema()
1919
deptDF.show(truncate=False)
2020

21-
2221
dataCollect = deptDF.collect()
2322

2423
print(dataCollect)

Diff for: pyspark-union.py

+41
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
# -*- coding: utf-8 -*-
"""
PySpark example: combining two DataFrames with union(), distinct() and unionAll().

Two small employee DataFrames with the same column layout are built, printed,
and then merged three ways so the results can be compared side by side.

author SparkByExamples.com
"""

import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

# First data set: four employees.
simpleData = [
    ("James", "Sales", "NY", 90000, 34, 10000),
    ("Michael", "Sales", "NY", 86000, 56, 20000),
    ("Robert", "Sales", "CA", 81000, 30, 23000),
    ("Maria", "Finance", "CA", 90000, 24, 23000),
]
columns = ["employee_name", "department", "state", "salary", "age", "bonus"]

df = spark.createDataFrame(data=simpleData, schema=columns)
df.printSchema()
df.show(truncate=False)

# Second data set: five employees. The "James" and "Maria" rows repeat the
# first data set exactly, so the merged result contains duplicate rows.
simpleData2 = [
    ("James", "Sales", "NY", 90000, 34, 10000),
    ("Maria", "Finance", "CA", 90000, 24, 23000),
    ("Jen", "Finance", "NY", 79000, 53, 15000),
    ("Jeff", "Marketing", "CA", 80000, 25, 18000),
    ("Kumar", "Marketing", "NY", 91000, 50, 21000),
]
columns2 = ["employee_name", "department", "state", "salary", "age", "bonus"]

df2 = spark.createDataFrame(data=simpleData2, schema=columns2)
df2.printSchema()
df2.show(truncate=False)

# union() appends the rows of df2 to df. Per the PySpark docs it matches
# columns by position and does not remove duplicates.
unionDF = df.union(df2)
unionDF.show(truncate=False)

# Chaining distinct() onto the union drops the repeated rows.
disDF = unionDF.distinct()
disDF.show(truncate=False)

# unionAll() is documented as equivalent to union(); shown for comparison.
unionAllDF = df.unionAll(df2)
unionAllDF.show(truncate=False)

0 commit comments

Comments
 (0)