-
Notifications
You must be signed in to change notification settings - Fork 0
/
MATH0216_A3.R
105 lines (78 loc) · 3.68 KB
/
MATH0216_A3.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
## ----include=FALSE---------------------------------------------------------------------------------------
library(tidyverse)
library(ggthemes)
## --------------------------------------------------------------------------------------------------------
# Data source for the analysis. Two alternatives:
#   (a) take the data straight from the `nycflights13` package -- make sure
#       you have run install.packages("nycflights13") first, then uncomment
#       the assignment below;
#   (b) read it from a local .csv file (the option currently in effect).
library(nycflights13)
# tblflights <- tbl_df(flights)
tblflights <- read.csv(file = "tblflights.csv")
## --------------------------------------------------------------------------------------------------------
# Look up the airline codes by printing the `airlines` table as a tibble.
# `as_tibble()` is used instead of `tbl_df()`, which dplyr has deprecated.
as_tibble(airlines)
## ---- results=FALSE--------------------------------------------------------------------------------------
# Build a `route` column of the form "<origin>-<dest>" for every flight.
# The result is not assigned back, so `tblflights` itself is left unchanged.
tblflights %>%
  mutate(route = paste(origin, dest, sep = "-"))
## --------------------------------------------------------------------------------------------------------
# How many carriers are there? Tabulate the carrier column and count the
# number of entries in the resulting table.
tblflights$carrier %>%
  table() %>%
  length()
## --------------------------------------------------------------------------------------------------------
# Mean departure delay per carrier. Flights with a missing departure delay
# are dropped first; otherwise mean() would return NA for their carrier.
tblflights %>%
  filter(!is.na(dep_delay)) %>%
  group_by(carrier) %>%
  summarize(mean.delay = mean(dep_delay))
## --------------------------------------------------------------------------------------------------------
# The ten flights with the largest departure delays, restricted to the
# date, schedule, carrier and route columns.
tblflights %>%
  select(
    year, month, day, sched_dep_time, dep_time,
    carrier, origin, dest, dep_delay
  ) %>%
  arrange(desc(dep_delay)) %>%
  head(n = 10)
## --------------------------------------------------------------------------------------------------------
# Histogram of departure delays, drawn on a log10 x axis in the WSJ theme.
# Only flights with a recorded, non-negative departure delay are kept.
# Because the x scale is transformed, the bin width of 0.25 is measured in
# log10 units, not in minutes.
tblflights %>%
  filter(!is.na(dep_delay), dep_delay >= 0) %>%
  ggplot(aes(dep_delay)) +
  geom_histogram(binwidth = 0.25, na.rm = TRUE) +
  scale_x_log10() +
  labs(
    title = "Departure Delays",
    x = "Departure Delay (in minutes)",
    y = "Frequency",
    caption = "source: tblflights dataset"
  ) +
  theme_wsj()
# Histogram of arrival delays, drawn on a log10 x axis in the Economist theme.
# The filter is applied to `arr_delay`, the variable that is plotted, in the
# same way the departure-delay histogram filters on `dep_delay`: missing
# values are removed and negative delays (which have no log10) are excluded.
# Because the x scale is transformed, the bin width of 0.25 is measured in
# log10 units, not in minutes.
tblflights %>%
  filter(!is.na(arr_delay), arr_delay >= 0) %>%
  ggplot(aes(arr_delay)) +
  geom_histogram(binwidth = 0.25, na.rm = TRUE) +
  theme_economist() +
  labs(
    title = "Arrival Delays",
    x = "Arrival Delay (in minutes)",
    y = "Frequency",
    caption = "source: tblflights dataset"
  ) +
  scale_x_log10()
## --------------------------------------------------------------------------------------------------------
# Box plots of arrival delay by carrier for the JFK -> LAX route.
# Only late arrivals are shown (delay > 0), capped at 400 minutes.
tblflights %>%
  filter(
    origin == "JFK" & dest == "LAX" &
      arr_delay > 0 & arr_delay <= 400
  ) %>%
  ggplot(aes(x = carrier, y = arr_delay)) +
  labs(
    title = "Arrival Delays Across Carriers",
    x = "Carrier",
    y = "Delay (Minutes)",
    caption = "source: tblflights dataset"
  ) +
  geom_boxplot(fill = "steelblue", outlier.size = 1)
## --------------------------------------------------------------------------------------------------------
# Number of flights per carrier and month, reshaped to wide form: one row per
# carrier and one column per month.
# `pivot_wider()` replaces the superseded `spread()`; `names_sort = TRUE`
# keeps the month columns in sorted order, as `spread()` did, rather than in
# order of first appearance. `n()` counts the rows in each group.
carrier_flights_over_time <- tblflights %>%
  group_by(carrier, month) %>%
  summarize(sum = n()) %>%
  pivot_wider(names_from = month, values_from = sum, names_sort = TRUE)
carrier_flights_over_time
## --------------------------------------------------------------------------------------------------------
# Monthly number of flights from the three NYC-area airports (JFK, LGA, EWR)
# to the three SF Bay Area airports (SFO, SJC, OAK), one coloured line with
# point markers per carrier, on a dark theme.
# NOTE(review): the month labels rely on `month` holding the values 1-12 so
# that the twelve limits line up with them -- confirm against the data.
tblflights %>%
  filter(
    origin %in% c("JFK", "LGA", "EWR"),
    dest %in% c("SFO", "SJC", "OAK")
  ) %>%
  group_by(carrier, month) %>%
  summarize(sum = n()) %>%
  ggplot(aes(x = month, y = sum, col = carrier)) +
  geom_point(size = 2) +
  geom_line() +
  theme_dark() +
  scale_x_discrete(
    limits = c(
      "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
      "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"
    )
  ) +
  labs(
    title = "Flights from NYC Area to SF Bay Area by Carrier",
    x = "Month",
    y = "Number of Flights",
    caption = "source: tblflights dataset"
  )