-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.R
565 lines (417 loc) · 18.8 KB
/
main.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
# Created by Piotr Prostko
# Presented on 04/07/2023
# email contact: [email protected] or [email protected]
# Disclaimer: I prepared this script to share the experience of working in
# RStudio throughout my PhD. Apologies in advance for any unintended mistakes in
# the following codes and for presenting potentially obvious-by-now commands (I
# started learning R in the pre-tidyverse era)
library(tidyverse)
# https://www.tidyverse.org/packages/
# RStudio keyboard shortcuts ----------------------------------------------
# KS - keyboard shortcut
# References:
# my own exploration of ALT + SHIFT + K
# part1: https://appsilon.com/r-studio-shortcuts-and-tips-2/
# part2: https://appsilon.com/r-studio-shortcuts-and-tips-part-2/
# That source notes: Although the gifs have both PC and Mac shortcuts, I am
# using the former in the text. If you are a Mac user, most of the shortcuts
# fall under this dependence: Ctrl == ⌘ Command && Alt == ⌥ Option
# but keep in mind that in some cases Ctrl will also be a Control key on Mac
# which can be confusing. You can always look up the proper shortcut in R Studio
# with ⌥ Option+Shift+K (Alt+Shift+K)
# KS1: PG UP/DN - move vertically through your document
# KS2: CTRL + HOME/END - move to the beginning/end of the entire script
# KS3: CTRL + LEFT/RIGHT ARROW - move to the left/right by ONE 'WORD' instead of one cursor position
# KS4: HOME/END - move to the beginning or the end of the current line
# KS5: SHIFT + ARROWS - select a piece of code within (left/right arr, try adding HOME or END) or across lines (up/down arr)
# KS6: ALT + UP/DN ARROW - move the selected lines of code
# KS7: CTRL + SHIFT + M - pipe operator from package magrittr, named after the
# famous Belgian surrealist painter René Magritte
# %>% %>% %>% %>% %>%
# KS8: ALT + minus sign - the '<-' assignment operator
# <- <- <- <- <- <-
# KS10: CTRL + SHIFT + C - (un)comment the line(s)
# comment me or I will throw an error
# KS11: ALT + SHIFT + DOWN ARR - copy the line(s) [sometimes while doing this you
# may change your keyboard language]
head(iris)
# KS12: ALT + ENTER - run the current line of code without proceeding to the next
# line BTW, I was raised on CTRL + R and ALT + R shortcuts for running things,
# but this is a matter of taste
head(starwars)
tail(starwars)
# KS13: CTRL + ALT + B(E) - run everything above (or below) your current position in the code
# KS14: TAB: code completion or path completion, also the built-in code snippets are handy
# autocomplete the lines below
lib
for
mat
app
fun
# this allows for easy path specification within or outside the working
# directory (but the WD is the starting point)
path1 <- ''
path2 <- '../'
# NAVIGATING THROUGH R STUDIO
# KS15: moving the cursor to
# the code editor: CTRL + 1
# the console: CTRL + 2
# the history tab: CTRL + 4
# the environment tab (default): CTRL + 8
# the files tab (default): CTRL + 5
# the plots tab (default): CTRL + 6
# add SHIFT to any of the above and it will maximise the selected pane (repeat
# this combination to undo)
# KS16: CTRL + TAB + (shift) - switch between tabs with open R programs
# press CTRL + SHFIT + N to create untitled R scripts and to be able to toggle
# between tabs
# KS17: CTRL + W - close a tab
# KS18: CTRL + SHIFT + . - search among open tabs
# NEAT CODE ORGANISATION
# KS19: CTRL + I - nice code indentation (though not always work, KS20 is better)
for(i in 1:10){
x=i %% 2
if (x==0) print('div by 2')
else print('not div by 2')}
# KS20: CTRL + SHIFT + A - nice code formatting
for(i in 1:10){
x=i %% 2
if (x==0) print('div by 2')
else print('not div by 2')}
# KS21: CTRL + SHIFT + / - nice comment formatting a way too long comment: Lorem
# ipsum dolor sit amet, consectetur adipiscing elit. Proin vulputate luctus
# commodo. Phasellus fermentum dolor nec urna
# KS22: CTRL + SHIFT + R - insert structured code headers!!
# KS23: CTRL + SHIFT + O - open and close the pane with code headers
# KS24: ALT + L (+SHIFT) - collapse (or open) LOCAL code section embraced with curly parentheses
# KS25: ALT + O (+SHIFT) - collapse (or open) EVERY code section embraced with
# curly parentheses. Mnemonic: L - local, O - 'OLL~ALL'
# OTHER SHORTCUTS
# KS26 press F1 while having the cursor on a function/R object to get its
# documentation
?lm
# KS27 F2 - similar but shows the source code (if available). F2 on
# data frames is equivalent to View(df)
iris
# click on the wizard wand for more info
# KS28: CTRL + L - clear console
# KS29: CTRL + SHIFT + F10 - restart the RStudio session
# KS30: CTRL + .(i.e., dot) - go to a file/function
# KS31: CTRL + shift + F - search in multiple files inside a specified directory directories
# KS32: CTRL + P - jump between brackets
set.seed(123)
LETTERS[sample(seq_along(LETTERS), size = 5)]
# KS33: CTRL + up/down arr - do this in the console to browse through the relevant command history
# KS34: CTRL + 4, and then browse through the relevant command history, press ENTER or SHIFT + ENTER
# KS35: CTRL + F - search functionality
# F3 - to go the next hit
# SHIFT + F3 - go to the previous hit
# KS36: CTRL + ALT + UP/DN ARROW - cursor selection across multiple lines;
# create a vector from the following values copied from an xlsx files
0.2
0.2
0.2
0.2
0.2
0.1
0.1
# KS37: CTRL + SHIFT + S - source the current file
# Cool addins:
# datapasta - addin for copying from websites and pasting as R objects (tribbles)
install.packages('datapasta')
# go to https://cran.r-project.org/web/packages/datapasta/vignettes/how-to-datapasta.html
# select the first table and copy it,
# go to RStudio --> Tools --> Addins --> Browse addins --> data pasta paste as a data frame
# click and drag ggplot generation
install.packages('esquisse')
p1 <- starwars %>%
filter(!(hair_color %in% c("blond", "auburn, white", "grey", "unknown")) | is.na(hair_color)) %>%
ggplot() +
aes(x = mass, y = height, colour = sex) +
geom_point(shape = "circle", size = 2.65) +
geom_smooth(span = 0.29) +
scale_color_viridis_d(option = "viridis", direction = 1) +
theme_classic() +
theme(legend.position = "none")
# manual modifications of theme elements
install.packages('ggThemeAssist')
# addin for the nice formatting of the entire scripts
install.packages("styler")
# Addins --> Style active file or Style selection
# Optional
{
# KS38: CTRL + ALT + X - function extraction,
# KS39: CTRL + ALT + V for variable extraction
# KS40: pipe + ALT - renaming in scope
# change rnorm to runif
x1 <- rnorm(0, 1, 5)
x2 <- rnorm(0, 1, 10)
}
# Selected processing operations with tidyverse ---------------------------
# References:
# https://r4ds.had.co.nz/
# ggplot, tidyr, dplyr documentation
# cool tidyr functions
# complete(), drop_na(), replace_na(), fill(), expand_grid(),
# pivot_, unnest(), unnest_wider(), unnest_longer(), hoist()
## ggplot ------------------------------------------------------------------
# Data and mappings specified in the ggplot() call are fed into the subsequent
# geoms; however 'local' values (i.e. data/mappings specified within individual geoms) have a higher
# priority. Thus it's possible to specify different datasets and/or mappings for different geoms.
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
geom_point(mapping = aes(color = class)) +
geom_smooth(data = filter(mpg, class == "subcompact"), se = FALSE)
df <- data.frame(displ2 = 6, hwy2 = 40)
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
geom_point(aes(colour = class)) +
geom_point(data = df, aes(x = displ2, y = hwy2), size = 5)
# several tricks for manipulating plot legend using guides() function:
ggplot(mpg, aes(x = displ, y = hwy, colour = fl, shape = drv)) +
geom_point() +
theme(legend.position = "bottom") +
guides(colour = guide_legend(nrow = 1, override.aes = list(size = 8), order = 2, title = ''),
shape = guide_legend(ncol = 1, override.aes = list(size = 0.5), order = 1, title.position = 'right', label.position = 'bottom'))
ggplot(mpg, aes(x = displ, y = hwy, colour = fl, shape = drv)) +
geom_point() +
theme(legend.position = "bottom") +
guides(colour = 'none',
shape = guide_legend(ncol = 1, override.aes = list(size = 0.5), order = 1, title.position = 'right', label.position = 'bottom'))
# order - control order of legends
# nrow, ncol - how legend elements should be displayed
## dplyr -------------------------------------------------------------------
starwars %>%
group_by(species) %>%
summarise(mean_height = mean(height, na.rm = TRUE))
# group_by() + mutate() computes a given 'summary statistic' and joins it to the
# input dataset
starwars %>%
group_by(species) %>%
mutate(mean_height = mean(height, na.rm = TRUE), .before = everything())
starwars %>%
group_by(species) %>%
transmute(mean_height = mean(height, na.rm = TRUE))
mtcars_w_rownames <- mtcars %>% rownames_to_column("model")
# pull and create a named vector where the names come from another variable
mtcars_w_rownames %>% pull(hp)
mtcars_w_rownames %>% pull(hp, name = model)
# rename within select
mtcars %>% select(mpg2 = mpg, everything())
# use predicate functions (those returning TRUE or FALSE) to select columns (e.g. useful when you need only numeric vars)
mtcars_w_rownames %>% select(where(is.numeric))
mtcars_w_rownames %>% summarise(across(where(is.numeric), mean))
# you can modify the group by variables on the fly
group_by(starwars, new = case_when(eye_color == "black" ~ "black",
eye_color == "blue" ~ "blue",
TRUE ~ "other")) %>%
summarise(mean_height = mean(height, na.rm = TRUE))
# proper SQL would not allow for this but in tidyverse RHS or LHS in filter can be a summary statistic value
mtcars %>% filter(disp == max(disp))
mtcars %>% group_by(cyl) %>% filter(n() > 10)
# Intro to tidyverse::purrr -----------------------------------------------
# References:
# https://r4ds.had.co.nz/
# purrr documentation
# https://www.monicathieu.com/posts/2020-04-08-tidy-multilevel/
# https://github.com/jennybc/row-oriented-workflows
## basics of purrr ---------------------------------------------------------
{
# this won't work cause the new variable has two values per each observation.
mtcars_w_rownames %>%
group_by(cyl) %>%
mutate(quan = quantile(mpg, c(0.25, 0.75)))
# we must wrap it into a list
(temp <- mtcars_w_rownames %>%
group_by(cyl) %>%
mutate(quan = list(quantile(mpg, c(
0.25, 0.75
)))))
# working with such a list-column may be awkward at times
temp$quan[[1]]
temp$quan[[1]][1]
temp$quan[[2]]
# since this is a list-column, we could continue using standard tools for
# processing such as for loops or lapply
new_vec <- temp$quan
for (i in seq_along(temp$quan)) {
new_vec[[i]] <- temp$quan[[i]] + 10
}
# quite lengthy and repetitive
lapply(temp$quan, function(x) x = x + 10)
# this is ok, but doesn't have cool options as map() does
# map() doc says:
# map(.x, .f, ..., .progress = FALSE)
# .x A list or atomic vector.
# .f A function, specified in one of the following ways:
# A named function, e.g. mean.
# An anonymous function, e.g. \(x) x + 1 or function(x) x + 1.
# A formula, e.g. ~ .x + 1. You must use .x to refer to the first argument. Only
# recommended if you require backward compatibility with older versions of R.
# output: a list
1:10 %>%
map(rnorm, n = 10)
# Pros of map()
# 1) convenient syntax for anonnymous functions
map(temp$quan, ~. + 10)
# 2) easier element extraction from complex lists
# .default specifies value for elements that are missing or NULL
l1 <- list(list(a = 1L), list(a = NULL, b = 2L), list(b = 3L))
# from each element of l1, extract an element named 'a'; return NA if such
# doesn't exist
l1 %>% map("a", .default = NA)
# from each element of l1, extract its second value
l1 %>% map(2, .default = NA)
l2 <- list(
list(num = 1:3, letters[1:3]),
list(num = 101:103, letters[4:6]),
list()
)
l2 %>% map(c(2, 2))
x <- list(1, 1, 1)
y <- list(10, 20, 30)
map2(x, y, ~.x + .y)
x <- list(a = list(foo = 1:2, bar = 3:4), b = list(baz = 5:6))
str(x)
pluck_depth(x)
x %>%
map_depth(2, ~ paste(.x, collapse = "/")) %>%
str()
l1 <- list(
obj1 = list(
prop1 = list(param1 = 1:2, param2 = 3:4),
prop2 = list(param1 = 5:6, param2 = 7:8)
),
obj2 = list(
prop1 = list(param1 = 9:10, param2 = 11:12),
prop2 = list(param1 = 12:14, param2 = 15:17)
)
)
str(l1)
pluck_depth(l1)
# In the above list, "obj" is level 1, "prop" is level 2 and "param"
# is level 3. To apply sum() on all params, we map it at depth 3:
tmp <- l1 %>%
modify_depth(3, sum)
str(tmp)
# ?map_dbl, ...
# ?map2,
# ?pmap,
# imap, walk, ...
# cheatsheet is your friend
# https://github.com/rstudio/cheatsheets/blob/master/purrr.pdf
# useful funs for list processing
l1 <- list(a = "a", b = NULL, c = integer(0), d = NA, e = list())
str(l1)
compact(l1)
x <- list(1, list(2, 3), list(4, list(5)))
str(x)
x %>% list_flatten() %>% str()
x <- list(list(a = 1, b = 2), list(a = 3, b = 4), list(a = 5, b = 6))
x %>% list_transpose()
}
## nest + map + unnest workflow -----------------------------------------------------
{
mtcars_w_rownames_nested <- mtcars_w_rownames %>%
group_by(cyl) %>%
nest()
lin_mod_fun <- function(x) lm(data = x, mpg ~ hp)
mtcars_w_rownames_nested2 <- mtcars_w_rownames_nested %>%
mutate(lin_mod = map(data, lin_mod_fun),
rsq = map_dbl(lin_mod, ~summary(.)$r.squared),
res = map(lin_mod, residuals)) %>%
select(-lin_mod) %>%
unnest(cols = c(data, res))
# the workflow above resembles a lot BY statement in various SAS procedures
}
## lists <--> df -----------------------------------------------------------
{
# list --> df
tmp_list <- list(a = 1:2, b = 3:4, c = 5:6)
list2DF(tmp_list)
df_from_list <- enframe(tmp_list, name = 'var_name', value = 'var_value') %>%
unnest(var_value) %>%
mutate(var_value2 = var_value * 2)
tmp_list <- list(a = 1:2, b = 3:4, c = 5:8)
# list2DF(tmp_list) will fail
# df --> list
deframe(df_from_list[,1:2])
}
# Wrappers on tidyverse functions -----------------------------------------
# References:
# https://dplyr.tidyverse.org/articles/programming.html
# Metaprogramming chapter from https://adv-r.hadley.nz/index.html
# https://raw.githubusercontent.com/rstudio/cheatsheets/master/tidyeval.pdf
## Non-Standard Evaluation -------------------------------------------------
# Why should we even bother?
# Because computers need to be told what to do.
# Because Hadley Wickham wants so ;-) (aes_string() is deprecated in ggplot)
# Because writing code like
starwars[starwars$homeworld == "Naboo" & starwars$species == "Human", ,]
# is extremely tiring, and even base R functions such as ave(), with() also allow for using
# 'bare' variable names
with(mtcars, mpg[cyl == 8 & disp > 350])
# instead of
mtcars$mpg[mtcars$cyl == 8 & mtcars$disp > 350]
# This shows that non-standard evaluation is also a part of base R
# Also look at SAS macroprogramming and its variety of quoting functions: %Str,
# %Nrstr, %Bquote, %Nrbquote, %Superq !!!
# https://dplyr.tidyverse.org/articles/programming.html
# Docu says:
# Tidy evaluation is a special type of non-standard evaluation used throughout
# the tidyverse. There are two basic forms found in dplyr:
#
# arrange(), count(), filter(), group_by(), mutate(), and summarise() use data
# masking so that you can use data variables as if they were variables in the
# environment (i.e. you write my_variable not df$my_variable).
# across(), relocate(), rename(), select(), and pull() use tidy selection so you
# can easily choose variables based on their position, name, or type (e.g.
# starts_with("x") or is.numeric).
# Data masking - blurs the line between two meanings of the word “variable”:
# env-variables are “programming” variables that live in an environment. They are usually created with <-
# data-variables are “statistical” variables that live in a data frame.
df <- data.frame(x = runif(3), y = runif(3))
# df - env var; df$x - data var
# PROBLEM: indirection
# 1) When you have the data-variable in a function argument, embrace the argument in
# doubled braces, like filter(df, {{ var }})
# my interpretation within function environment, var is env variable, causing
# indirection in data-masking functions expecting data variables
var_summary <- function(data, var) {
data %>%
summarise(n = n(), min = min({{ var }}), max = max({{ var }}))
}
mtcars %>%
group_by(cyl) %>%
var_summary(mpg)
# 2) env-variable that is a character vector, you need to index into the .data
# pronoun with [[, like summarise(df, mean = mean(.data[[var]])).
#
# The following example uses .data to count the number of unique values in each
# variable of mtcars:
# }
# Note that .data is not a data frame; it’s a special construct, a pronoun, that
# allows you to access the current variables either directly, with .data$x or
# indirectly with .data[[var]]. Don’t expect other functions to work with it.
# Tidy selection
# PROBLEM: indirection
# When you have the data-variable in an env-variable that is a function
# argument, you use the same technique as data masking: you embrace the argument
# by surrounding it in doubled braces. The following function summarises a data
# frame by computing the mean of all variables selected by the user:
summarise_mean <- function(data, vars) {
data %>% summarise(n = n(), across({{ vars }}, mean))
}
mtcars %>%
group_by(cyl) %>%
summarise_mean(where(is.numeric))
# When you have an env-variable that is a character vector, you need to use
# all_of() or any_of() depending on whether you want the function to error if a
# variable is not found. The following code uses all_of() to select all of the
# variables found in a character vector; then ! plus all_of() to select all of
# the variables not found in a character vector:
vars <- c("mpg", "vs")
mtcars %>% select(all_of(vars))
mtcars %>% select(!all_of(vars))
# Structuring your analysis and reporting pipeline ------------------------
# under construction
# code versioning definitely plays an important role
# https://gjbex.github.io/Version-control-with-git/