
Commit c00a13b

PySpark Date Functions
1 parent 00b2c1e commit c00a13b

8 files changed: +379 -0 lines changed

pyspark-date-string.py

+29
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""

from pyspark.sql import SparkSession
from pyspark.sql.functions import current_date, current_timestamp, date_format

# Create SparkSession
spark = SparkSession.builder \
    .appName('SparkByExamples.com') \
    .getOrCreate()

df = spark.createDataFrame([["1"]], ["id"])

# Format the current date and timestamp with date_format()
df.select(current_date().alias("current_date"),
      date_format(current_date(), "yyyy MM dd").alias("yyyy MM dd"),
      date_format(current_timestamp(), "MM/dd/yyyy hh:mm").alias("MM/dd/yyyy hh:mm"),
      date_format(current_timestamp(), "yyyy MMM dd").alias("yyyy MMM dd"),
      date_format(current_timestamp(), "yyyy MMMM dd E").alias("yyyy MMMM dd E")
   ).show()

# SQL
spark.sql("select current_date() as current_date, " +
      "date_format(current_timestamp(),'yyyy MM dd') as yyyy_MM_dd, " +
      "date_format(current_timestamp(),'MM/dd/yyyy hh:mm') as MM_dd_yyyy_hh_mm, " +
      "date_format(current_timestamp(),'yyyy MMM dd') as yyyy_MMM_dd, " +
      "date_format(current_timestamp(),'yyyy MMMM dd E') as yyyy_MMMM_dd_E").show()

pyspark-datediff.py

+40
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_date, datediff, months_between, round, lit, to_date

# Create SparkSession
spark = SparkSession.builder \
    .appName('SparkByExamples.com') \
    .getOrCreate()

data = [("1","2019-07-01"), ("2","2019-06-24"), ("3","2019-08-24")]
df = spark.createDataFrame(data=data, schema=["id","date"])

# Difference in days between today and each date
df.select(
      col("date"),
      current_date().alias("current_date"),
      datediff(current_date(), col("date")).alias("datediff")
  ).show()

# Day, month and year differences
df.withColumn("datesDiff", datediff(current_date(), col("date"))) \
  .withColumn("monthsDiff", months_between(current_date(), col("date"))) \
  .withColumn("monthsDiff_round", round(months_between(current_date(), col("date")), 2)) \
  .withColumn("yearsDiff", months_between(current_date(), col("date"))/lit(12)) \
  .withColumn("yearsDiff_round", round(months_between(current_date(), col("date"))/lit(12), 2)) \
  .show()

# Dates in a non-default format need to_date() first
data2 = [("1","07-01-2019"), ("2","06-24-2019"), ("3","08-24-2019")]
df2 = spark.createDataFrame(data=data2, schema=["id","date"])
df2.select(
      to_date(col("date"), "MM-dd-yyyy").alias("date"),
      current_date().alias("endDate")
  ).show()

# SQL
spark.sql("select round(months_between(current_date(),'2019-07-01')/12,2) as years_diff").show()

pyspark-join-two-dataframes.py

+74
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""

from pyspark.sql import SparkSession

# Create SparkSession
spark = SparkSession.builder \
    .appName('SparkByExamples.com') \
    .getOrCreate()

# EMP DataFrame
empData = [(1,"Smith",10), (2,"Rose",20),
           (3,"Williams",10), (4,"Jones",30)
          ]
empColumns = ["emp_id","name","emp_dept_id"]
empDF = spark.createDataFrame(empData, empColumns)
empDF.show()

# DEPT DataFrame
deptData = [("Finance",10), ("Marketing",20),
            ("Sales",30), ("IT",40)
           ]
deptColumns = ["dept_name","dept_id"]
deptDF = spark.createDataFrame(deptData, deptColumns)
deptDF.show()

# Address DataFrame
addData = [(1,"1523 Main St","SFO","CA"),
           (2,"3453 Orange St","SFO","NY"),
           (3,"34 Warner St","Jersey","NJ"),
           (4,"221 Cavalier St","Newark","DE"),
           (5,"789 Walnut St","Sandiago","CA")
          ]
addColumns = ["emp_id","addline1","city","state"]
addDF = spark.createDataFrame(addData, addColumns)
addDF.show()

# Join two DataFrames
empDF.join(addDF, empDF["emp_id"] == addDF["emp_id"]).show()

# Joining on a column-name list drops the duplicate column
empDF.join(addDF, ["emp_id"]).show()

# Join multiple DataFrames
empDF.join(addDF, ["emp_id"]) \
     .join(deptDF, empDF["emp_dept_id"] == deptDF["dept_id"]) \
     .show()

# Using where() for the join condition
empDF.join(deptDF).where(empDF["emp_dept_id"] == deptDF["dept_id"]) \
     .join(addDF).where(empDF["emp_id"] == addDF["emp_id"]) \
     .show()

# SQL
empDF.createOrReplaceTempView("EMP")
deptDF.createOrReplaceTempView("DEPT")
addDF.createOrReplaceTempView("ADD")

spark.sql("select * from EMP e, DEPT d, ADD a " +
          "where e.emp_dept_id == d.dept_id and e.emp_id == a.emp_id") \
     .show()

# Join on multiple column conditions
df1 = spark.createDataFrame(
    [(1, "A"), (2, "B"), (3, "C")],
    ["A1", "A2"])

df2 = spark.createDataFrame(
    [(1, "F"), (2, "B")],
    ["B1", "B2"])

df = df1.join(df2, (df1.A1 == df2.B1) & (df1.A2 == df2.B2))
df.show()
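All of the joins above use the default inner join; join() also accepts a how argument for the other join types. A minimal sketch with the same DataFrames (a left join from addDF keeps the fifth address, with nulls for the missing employee):

addDF.join(empDF, ["emp_id"], "left").show()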

pyspark-string-date.py

+27
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date

# Create SparkSession
spark = SparkSession.builder \
    .appName('SparkByExamples.com') \
    .getOrCreate()

# Convert strings in MM-dd-yyyy format to DateType
df = spark.createDataFrame([["02-03-2013"],["05-06-2023"]], ["input"])
df.select(col("input"), to_date(col("input"), "MM-dd-yyyy").alias("date")) \
  .show()

# SQL
spark.sql("select to_date('02-03-2013','MM-dd-yyyy') as date").show()

pyspark-string-timestamp.py

+39
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""

from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, to_timestamp

# Create SparkSession
spark = SparkSession.builder \
    .appName('SparkByExamples.com') \
    .getOrCreate()

df = spark.createDataFrame(
    data=[("1","2019-06-24 12:01:19.000")],
    schema=["id","input_timestamp"])
df.printSchema()

# Timestamp string (default format) to TimestampType
df.withColumn("timestamp", to_timestamp("input_timestamp")) \
  .show(truncate=False)

# Using cast to convert TimestampType back to StringType
df.withColumn('timestamp',
        to_timestamp('input_timestamp').cast('string')) \
  .show(truncate=False)

# Custom format string to TimestampType
df.select(to_timestamp(lit('06-24-2019 12:01:19.000'), 'MM-dd-yyyy HH:mm:ss.SSS')) \
  .show(truncate=False)

# SQL string to TimestampType
spark.sql("select to_timestamp('2019-06-24 12:01:19.000') as timestamp").show()
# SQL cast timestamp string to TimestampType
spark.sql("select timestamp('2019-06-24 12:01:19.000') as timestamp").show()
# SQL custom format string to TimestampType
spark.sql("select to_timestamp('06-24-2019 12:01:19.000','MM-dd-yyyy HH:mm:ss.SSS') as timestamp").show()

pyspark-time-diff.py

+73
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp, current_timestamp, unix_timestamp, round

# Create SparkSession
spark = SparkSession.builder \
    .appName('SparkByExamples.com') \
    .getOrCreate()

dates = [("1","2019-07-01 12:01:19.111"),
         ("2","2019-06-24 12:01:19.222"),
         ("3","2019-11-16 16:44:55.406"),
         ("4","2019-11-16 16:50:59.406")
        ]
df = spark.createDataFrame(data=dates, schema=["id","from_timestamp"])

# Difference in seconds by casting timestamps to long (epoch seconds)
df2 = df.withColumn('from_timestamp', to_timestamp(col('from_timestamp'))) \
        .withColumn('end_timestamp', current_timestamp()) \
        .withColumn('DiffInSeconds', col('end_timestamp').cast("long") - col('from_timestamp').cast("long"))
df2.show(truncate=False)

# Same difference using unix_timestamp()
df.withColumn('from_timestamp', to_timestamp(col('from_timestamp'))) \
  .withColumn('end_timestamp', current_timestamp()) \
  .withColumn('DiffInSeconds', unix_timestamp('end_timestamp') - unix_timestamp('from_timestamp')) \
  .show(truncate=False)

df2.withColumn('DiffInMinutes', round(col('DiffInSeconds')/60)) \
   .show(truncate=False)

df2.withColumn('DiffInHours', round(col('DiffInSeconds')/3600)) \
   .show(truncate=False)

# Difference between two timestamps when the input has only a time part
data = [("12:01:19.000","13:01:19.000"),
        ("12:01:19.000","12:02:19.000"),
        ("16:44:55.406","17:44:55.406"),
        ("16:50:59.406","16:44:59.406")]
df3 = spark.createDataFrame(data=data, schema=["from_timestamp","to_timestamp"])

df3.withColumn("from_timestamp", to_timestamp(col("from_timestamp"), "HH:mm:ss.SSS")) \
   .withColumn("to_timestamp", to_timestamp(col("to_timestamp"), "HH:mm:ss.SSS")) \
   .withColumn("DiffInSeconds", col("to_timestamp").cast("long") - col("from_timestamp").cast("long")) \
   .withColumn("DiffInMinutes", round(col("DiffInSeconds")/60)) \
   .withColumn("DiffInHours", round(col("DiffInSeconds")/3600)) \
   .show(truncate=False)

# Difference from the current time in seconds, minutes, hours and days
df4 = spark.createDataFrame(
    data=[("1","07-01-2019 12:01:19.406")],
    schema=["id","input_timestamp"]
)
df4.withColumn("input_timestamp", to_timestamp(col("input_timestamp"), "MM-dd-yyyy HH:mm:ss.SSS")) \
   .withColumn("current_timestamp", current_timestamp()) \
   .withColumn("DiffInSeconds", col("current_timestamp").cast("long") - col("input_timestamp").cast("long")) \
   .withColumn("DiffInMinutes", round(col("DiffInSeconds")/60)) \
   .withColumn("DiffInHours", round(col("DiffInSeconds")/3600)) \
   .withColumn("DiffInDays", round(col("DiffInSeconds")/(24*3600))) \
   .show(truncate=False)

# SQL
spark.sql("select unix_timestamp('2019-07-02 12:01:19') - unix_timestamp('2019-07-01 12:01:19') DiffInSeconds").show()
spark.sql("select (unix_timestamp('2019-07-02 12:01:19') - unix_timestamp('2019-07-01 12:01:19'))/60 DiffInMinutes").show()
spark.sql("select (unix_timestamp('2019-07-02 12:01:19') - unix_timestamp('2019-07-01 12:01:19'))/3600 DiffInHours").show()

pyspark-timestamp-date.py

+55
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, to_timestamp, to_date, current_timestamp

# Create SparkSession
spark = SparkSession.builder \
    .appName('SparkByExamples.com') \
    .getOrCreate()

df = spark.createDataFrame(
    data=[("1","2019-06-24 12:01:19.000")],
    schema=["id","input_timestamp"])
df.printSchema()

# Using cast to convert a timestamp string to DateType
df.withColumn('date_type', col('input_timestamp').cast('date')) \
  .show(truncate=False)

# Using cast to convert TimestampType to DateType
df.withColumn('date_type', to_timestamp('input_timestamp').cast('date')) \
  .show(truncate=False)

# Custom format string to DateType
df.select(to_date(lit('06-24-2019 12:01:19.000'), 'MM-dd-yyyy HH:mm:ss.SSS')) \
  .show()

# Timestamp string to DateType
df.withColumn("date_type", to_date("input_timestamp")) \
  .show(truncate=False)

# TimestampType to DateType
df.withColumn("date_type", to_date(current_timestamp())) \
  .show(truncate=False)

df.withColumn("ts", to_timestamp(col("input_timestamp"))) \
  .withColumn("datetype", to_date(col("ts"))) \
  .show(truncate=False)

# SQL TimestampType to DateType
spark.sql("select to_date(current_timestamp) as date_type").show()
# SQL cast TimestampType to DateType
spark.sql("select date(to_timestamp('2019-06-24 12:01:19.000')) as date_type").show()
# SQL cast timestamp string to DateType
spark.sql("select date('2019-06-24 12:01:19.000') as date_type").show()
# SQL timestamp string (default format) to DateType
spark.sql("select to_date('2019-06-24 12:01:19.000') as date_type").show()
# SQL custom time format to DateType
spark.sql("select to_date('06-24-2019 12:01:19.000','MM-dd-yyyy HH:mm:ss.SSS') as date_type").show()

pyspark-unix-time.py

+42
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, unix_timestamp, from_unixtime

# Create SparkSession
spark = SparkSession.builder \
    .appName('SparkByExamples.com') \
    .getOrCreate()

inputData = [("2019-07-01 12:01:19",
              "07-01-2019 12:01:19",
              "07-01-2019")]
columns = ["timestamp_1","timestamp_2","timestamp_3"]
df = spark.createDataFrame(data=inputData, schema=columns)
df.printSchema()
df.show(truncate=False)

# Convert timestamp strings to unix time (seconds since epoch)
df2 = df.select(
      unix_timestamp(col("timestamp_1")).alias("timestamp_1"),
      unix_timestamp(col("timestamp_2"), "MM-dd-yyyy HH:mm:ss").alias("timestamp_2"),
      unix_timestamp(col("timestamp_3"), "MM-dd-yyyy").alias("timestamp_3"),
      unix_timestamp().alias("timestamp_4")  # current unix time
   )
df2.printSchema()
df2.show(truncate=False)

# Convert unix time back to timestamp strings
df3 = df2.select(
      from_unixtime(col("timestamp_1")).alias("timestamp_1"),
      from_unixtime(col("timestamp_2"), "MM-dd-yyyy HH:mm:ss").alias("timestamp_2"),
      from_unixtime(col("timestamp_3"), "MM-dd-yyyy").alias("timestamp_3"),
      from_unixtime(col("timestamp_4")).alias("timestamp_4")
   )
df3.printSchema()
df3.show(truncate=False)

# SQL
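# Illustrative sketch, not part of the original commit (the SQL section above
# was left empty): the same round-trip via Spark SQL.
spark.sql("select unix_timestamp('2019-07-01 12:01:19') as unix_time").show()
spark.sql("select from_unixtime(unix_timestamp('2019-07-01 12:01:19')) as timestamp").show()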
