-
Notifications
You must be signed in to change notification settings - Fork 0
/
datasets.R
124 lines (95 loc) · 3.04 KB
/
datasets.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# Sat Sep 01 07:53:36 2018 ------------------------------
# library() stops with an error when ggplot2 is not installed; require()
# would only return FALSE and let the script fail later at the first ggplot().
library(ggplot2)
# Generate the Facebook Like data
library(truncnorm)
# 30 draws from a normal distribution (mean 50, sd 15) truncated to [0, 100]
# and rounded to whole numbers. The fixed seed makes the sample reproducible.
n <- 30
avg <- 50
stdev <- 15
set.seed(20)
likes <- round(
  rtruncnorm(n, a = 0, b = 100, mean = avg, sd = stdev),
  digits = 0
)
fblikes <- data.frame(likes)
# Histogram of the simulated likes in bins of width 10, with the count axis
# labelled "Count" and ticks at 0, 2, ..., 10.
ggplot(fblikes, aes(x = likes)) +
  geom_histogram(color = "black", fill = "gray", binwidth = 10) +
  scale_y_continuous(name = "Count", breaks = seq(0, 10, by = 2))
# Generate the daily internet usage data
library(lubridate)
# One row per calendar day from start_date to end_date (inclusive).
start_date <- "2018-01-01"
end_date <- "2018-08-31"
dates <- seq(as.Date(start_date), as.Date(end_date), by = "days")
# Build the weekday factor from the numeric weekday index (0 = Sunday) rather
# than from weekdays(), whose names follow the session locale and would not
# match these English levels elsewhere (every value would become NA).
day_levels <- c(
  "Sunday", "Monday", "Tuesday", "Wednesday",
  "Thursday", "Friday", "Saturday"
)
days <- factor(day_levels[as.POSIXlt(dates)$wday + 1L], levels = day_levels)
# Log-normal usage values (meanlog 5, sdlog 0.3) rounded to 2 decimals, plus a
# randomly assigned sex label per day. The seed fixes both random draws.
n <- length(dates)
avg <- 5
stdev <- 0.3
set.seed(20)
usage <- round(rlnorm(n, meanlog = avg, sdlog = stdev), digits = 2)
sex <- sample(c("Male", "Female"), size = n, replace = TRUE)
internet_usage <- data.frame(dates, days, sex, usage)
# Box plot of usage for each day of the week
p <- ggplot(
  data = internet_usage,
  mapping = aes(x = as.factor(days), y = usage)
) +
  geom_boxplot() +
  xlab("Day of the week") +
  ylab("Internet usage")
p
# Per-capita expenditure on students in public universities
# Source: http://data.gov.bd/dataset/annual-expenditure-student-financial-year-basis/resource/602d73d4-8e21-4544-bc37
library(tidyverse)
# Workbook to read; the path is relative to the working directory.
fname <- "annual_expenditure_per_student_2018.xlsx"
# df <- readxl::read_xlsx(fname, sheet = 1, range = "B9:S24", col_names = F)
# Read sheets 1 to 9, each from row 9 onwards and without a header row, then
# stack them in sheet order. The sheets are collected in a list and bound
# once instead of growing `df` with rbind() on every loop iteration.
sheets <- lapply(
  1:9,
  function(i) {
    openxlsx::read.xlsx(fname, sheet = i, startRow = 9, colNames = FALSE)
  }
)
df <- do.call(rbind, sheets)
dim(df)
head(df)
# Clean the stacked sheets and give the columns meaningful names.
# Drop every column whose values are all NA (columns with only some NAs are kept).
# NOTE(review): map_df(~.x) presumably rebuilds the result as a tibble -- confirm
# for the installed purrr/dplyr versions.
df <- df %>%
discard(~all(is.na(.x))) %>%
map_df(~.x)
names(df)
# Drop col X7, X14, X15
# NOTE(review): these column names are hard-coded for this workbook's layout;
# why they are unwanted is not visible here -- confirm against the spreadsheet.
df <- df %>%
select(- c(X7, X14, X15))
# Remove rows in which every column is NA
df <- df[!apply(is.na(df), 1, all),]
# Drop the rows where column X1 is NA
df <- df %>%
filter(!is.na(X1 ))
dim(df)
head(df)
# Replacement names for the 12 remaining columns, applied by position.
# NOTE(review): presumably sl = serial number, univ = university, and
# stud_YYYY / expn_YYYY = student count / expenditure per student for each of
# 2012-2016 -- verify against the sheet headers.
vnames = c("sl", "univ", "stud_2012", "expn_2012",
"stud_2013", "expn_2013",
"stud_2014", "expn_2014",
"stud_2015", "expn_2015",
"stud_2016", "expn_2016")
colnames(df) <- vnames
head(df)
# wide to tall: step by step
# NOTE(review): gather() is superseded by pivot_longer(); it is kept here
# because switching would presumably change the stacked row order and the
# handling of mixed column types -- confirm before migrating.

# Student counts: stack the stud_<year> columns into (year, students) pairs.
# The separator is given explicitly so only the "_" in "stud_2012" splits the
# name into its prefix and year.
df_students <- df %>%
  select(sl, univ, starts_with("stud")) %>%
  gather(var, students, starts_with("stud")) %>%
  separate(var, into = c("name", "year"), sep = "_") %>%
  select(-name)
head(df_students)

# Expenses: same reshaping for the expn_<year> columns.
df_expenses <- df %>%
  select(sl, univ, starts_with("expn")) %>%
  gather(var, expenses, starts_with("expn")) %>%
  separate(var, into = c("name", "year"), sep = "_") %>%
  select(-name)
head(df_expenses)

# Combine both long tables. The join keys are spelled out instead of relying
# on the implicit natural join over whatever columns the tables share.
df <- left_join(df_students, df_expenses, by = c("sl", "univ", "year"))
# Coerce both measures to numeric (expenses rounded to 2 decimals), then
# derive the total as students times expenses.
df <- mutate(
  df,
  students = as.numeric(students),
  expenses = round(as.numeric(expenses), digits = 2)
)
df <- mutate(df, total = students * expenses)
head(df)
summary(df)
# Quick exploratory look at students, expenses and the derived total.
with(df, plot(students, expenses))
boxplot(df$students)
boxplot(df$expenses)
boxplot(df$total)
summary(df$total)
plot(df$total)
# Histogram restricted to totals at or below a fixed cut-off.
# NOTE(review): the origin of 120514122 is not visible here -- confirm.
hist(df$total[df$total <= 120514122])
# na.rm = TRUE: the as.numeric() coercions that feed `total` yield NA for any
# non-numeric cell, and a single NA would otherwise make the median NA.
median(df$total, na.rm = TRUE)