diff --git a/DataWork/Code/1-intro-to-R-solutions.R b/DataWork/Code/1-intro-to-R-solutions.R
new file mode 100644
index 0000000..2f7d7b7
--- /dev/null
+++ b/DataWork/Code/1-intro-to-R-solutions.R
@@ -0,0 +1,44 @@
+## R for Stata Users
+## March 2023
+## Exercise solutions
+## Session: Introduction to R
+
+## Exercise 1 ====
+whr <- read.csv("/path/to/data/file")
+# note that this was executed through point-and-click
+# during the actual session
+
+## Exercise 2 ====
+# Subset data (the result is only printed, whr itself is not modified)
+subset(whr, year == 2016)
+# Check first 6 observations of whr
+head(whr)
+
+## Exercise 3 ====
+# Subset data and store result in a new df
+whr2016 <- subset(whr, year == 2016)
+# Display head of new df
+head(whr2016)
+# Display head of original df
+head(whr)
+
+## Exercise 4 ====
+# Create vector of strings
+str_vec <- c("R", "Python", "SAS", "Excel", "Stata")
+# Create string "scalar"
+str_scalar <- "can be an option to"
+# Concatenation
+paste(str_vec[1], str_scalar, str_vec[5])
+
+## Exercise 5 ====
+# Create boolean vector
+inc_below_avg <- whr$economy_gdp_per_capita < mean(whr$economy_gdp_per_capita)
+# See head of vector
+head(inc_below_avg)
+
+## Exercise 6 ====
+# Create new column (vector) of zeros
+whr$rank_low <- 0
+# Subset obs with income below average
+# and replace values of rank_low with 1 for those obs
+whr$rank_low[inc_below_avg] <- 1
\ No newline at end of file
diff --git a/DataWork/Code/2-intro-to-R-programming-solutions.R b/DataWork/Code/2-intro-to-R-programming-solutions.R
new file mode 100644
index 0000000..634cfa9
--- /dev/null
+++ b/DataWork/Code/2-intro-to-R-programming-solutions.R
@@ -0,0 +1,53 @@
+## R for Stata Users
+## March 2023
+## Exercise solutions
+## Session: Introduction to R programming
+
+## Exercise 1 ====
+# (no coding needed for exercise)
+
+## Exercise 2 ====
+# (no coding needed for exercise)
+
+## Exercise 3 ====
+library(here)
+whr <- read.csv(here("DataWork", "DataSets", "Final", "whr_panel.csv"))
+# note that this will only work if exercise 2
+# was executed correctly
+
+## Exercise 4 ====
+#install.packages("dplyr") # uncomment installation if needed
+#install.packages("purrr") # uncomment installation if needed
+library(dplyr)
+library(purrr)
+
+## Exercise 5 ====
+# Create dataframe
+df <- data.frame(replicate(50000, sample(1:100, 400, replace = TRUE)))
+# Create empty vector
+col_means_loop <- c()
+# Loop and append means to vector (will take a few seconds)
+for (column in df) {
+  col_means_loop <- append(col_means_loop, mean(column))
+}
+
+## Exercise 6 ====
+col_means_map <- map(df, mean)
+# this will only work if you defined df in exercise 5
+
+## Exercise 7 ====
+zscore <- function(x) {
+  x_mean <- mean(x, na.rm = TRUE)
+  x_sd <- sd(x, na.rm = TRUE)
+  z <- (x - x_mean) / x_sd
+  return(z)
+}
+
+## Exercise 8 ====
+z_scores <- whr %>%
+  select(health_life_expectancy, freedom) %>%
+  map(zscore)
+whr$hle_st <- z_scores[[1]]
+whr$freedom_st <- z_scores[[2]]
+# this will only run if you created the function
+# zscore() in exercise 7
diff --git a/DataWork/Code/main.R b/DataWork/Code/main.R
new file mode 100644
index 0000000..5efcd70
--- /dev/null
+++ b/DataWork/Code/main.R
@@ -0,0 +1,90 @@
+# ------------------------------------------------------------------------------ #
+#                                                                                #
+#                                      DIME                                      #
+#                       Introduction to R for Stata users                        #
+#                                  MAIN SCRIPT                                   #
+#                                                                                #
+# ------------------------------------------------------------------------------ #
+
+# PURPOSE: Set-up configurations and run scripts
+
+# NOTES: Version 2
+
+# WRITTEN BY: Luiza Cardoso de Andrade, Leonardo Viotti
+
+# Last modified in Mar 2023
+
+# PART 1: Select sections to run ----------------------------------------------
+
+Lab2 <- 0
+Lab3 <- 0
+Lab4 <- 0
+Lab5 <- 0
+Lab6 <- 0
+
+# PART 2: Load packages -----------------------------------------------------
+
+packages <- c("readstata13", "foreign",
+              "doBy", "broom", "dplyr",
+              "stargazer",
+              "ggplot2", "plotly", "ggrepel",
+              "RColorBrewer", "wesanderson",
+              "sp", "rgdal", "rgeos", "raster", "velox",
+              "ggmap", "rasterVis", "leaflet",
+              "htmlwidgets", "geosphere")
+
+# Install every listed package that is missing (matched on installed package names)
+invisible(lapply(packages, function(x) {
+  if (!(x %in% rownames(installed.packages()))) {
+    install.packages(x, dependencies = TRUE)
+  }
+}
+))
+
+# Load all packages -- this is equivalent to using library(package) for each
+# package listed before
+invisible(lapply(packages, library, character.only = TRUE))
+
+# PART 3: Set folder paths ---------------------------------------------------
+
+#-------------#
+# Root folder #
+#-------------#
+
+# Add your username and folder path here (for Windows computers)
+# To find out what your username is, type Sys.getenv("USERNAME")
+if (Sys.getenv("USERNAME") == "luiza") {
+
+  projectFolder <- "C:/Users/luiza/Documents/GitHub/dime-r-training"
+
+}
+
+# If you're using Mac, just add your folder path, without the if statement
+
+#--------------------#
+# Project subfolders #
+#--------------------#
+
+rawData <- file.path(projectFolder, "Data", "Raw")
+finalData <- file.path(projectFolder, "Data", "Final")
+Code <- file.path(projectFolder, "Codes")
+Output <- file.path(projectFolder, "Output")
+
+
+# PART 4: Run selected sections -----------------------------------------------
+
+if (Lab2 == 1) {
+  source(file.path(Code, "Lab 2 - Coding for Reproducible Research"))
+}
+if (Lab3 == 1) {
+  source(file.path(Code, "Lab 3 - Data Processing"))
+}
+if (Lab4 == 1) {
+  source(file.path(Code, "Lab 4 - Descriptive Analysis"))
+}
+if (Lab5 == 1) {
+  source(file.path(Code, "Lab 5 - Data Visualization"))
+}
+if (Lab6 == 1) {
+  source(file.path(Code, "Lab 6 - Spatial Data"))
+}
\ No newline at end of file
diff --git a/Presentations/01-intro-to-R.Rmd b/Presentations/01-intro-to-R.Rmd
index 5824351..984fc3e 100644
--- a/Presentations/01-intro-to-R.Rmd
+++ b/Presentations/01-intro-to-R.Rmd
@@ -1,8 +1,8 @@
 ---
 title: "Session 1 - Introduction to R"
 subtitle: "R for Stata Users"
-author: "Luiza Andrade, Rob Marty, Rony Rodriguez-Ramirez, 
Luis Eduardo San Martin, Leonardo Viotti" -date: "The World Bank | [WB Github](https://github.com/worldbank)
May 2022" +author: "Luiza Andrade, Marc-Andrea Fiorina, Rob Marty, Rony Rodriguez-Ramirez, Luis Eduardo San Martin, Leonardo Viotti" +date: "The World Bank | [WB Github](https://github.com/worldbank)
March 2023" output: xaringan::moon_reader: css: ["libs/remark-css/default.css", "libs/remark-css/metropolis.css", "libs/remark-css/metropolis-fonts.css"] @@ -65,7 +65,7 @@ xaringanExtra::use_logo( ``` ```{r echo = FALSE, include = FALSE, eval = TRUE} -whr <- read_csv(here("DataWork", "DataSets", "Final", "whr_panel.csv")) +whr <- read.csv(here("DataWork", "DataSets", "Final", "whr_panel.csv")) ``` # Table of contents @@ -99,9 +99,9 @@ We're glad you're joining us today! ## Format -- Every session has two TAs. For this session, our TAs are __Luiza Cardoso De Andrade__ and __Rony Rodriguez-Ramirez__ +- Every session has two TAs. For this session, our TAs are __Luiza Cardoso De Andrade__ and __Marc-Andrea Fiorina__ -- The TAs will help you troubleshooting __particular issues__ which make you unable to follow along the presentation. Send them a message over the chat whenever you need help +- The TAs will help you troubleshoot __particular issues__ that keep you from following along with the presentation. Send a message over the chat whenever you need help --- @@ -118,7 +118,7 @@ We're glad you're joining us today! 
- The materials of each session will be shared in the OSF page of the course by the end of each session: https://osf.io/86g3b/ -- The recordings will be shared in a WB internal link +- The recordings will be shared each day after the session --- @@ -165,7 +165,7 @@ knitr::include_graphics("img/Interface.png") # Getting started - RStudio interface ```{r echo = FALSE, out.width = "70%"} -knitr::include_graphics("img/scritpt1.png") +knitr::include_graphics("img/script1.png") ``` --- @@ -173,14 +173,14 @@ knitr::include_graphics("img/scritpt1.png") # Getting started - RStudio interface ```{r echo = FALSE, out.width = "70%"} -knitr::include_graphics("img/scritpt2.png") +knitr::include_graphics("img/script2.png") ``` --- # Getting started - RStudio interface ```{r echo = FALSE, out.width = "70%"} -knitr::include_graphics("img/scritpt3.png") +knitr::include_graphics("img/script3.png") ``` --- @@ -188,24 +188,25 @@ knitr::include_graphics("img/scritpt3.png") # Getting started - RStudio interface ```{r echo = FALSE, out.width = "70%"} -knitr::include_graphics("img/scritpt4.png") +knitr::include_graphics("img/script4.png") ``` --- # Getting started - Importing data -Let's start by loading the data set we'll be using: +Let's start by loading the data we'll be using: ## Exercise 1: Import data manually (`r fa("clock")` 3 min) -1. Go to the OSF page of the course (https://osf.io/86g3b/) and download the file located in `R for Stata Users - 2022 May` > `Data` > `whr_panel.csv` +1. Go to the OSF page of the course (https://osf.io/86g3b/) and download the file located in `R for Stata Users - 2023 March` > `Data` > `whr_panel.csv` 2. In RStudio, go to `File` > `Import Dataset` > `From Text (base)` and open the `whr_panel.csv` file. + Depending on your Rstudio version, it might be `File` > `Import Dataset` > `From CSV` -3. Assign the name `whr` to the dataset on the import window. +3. Assign the name `whr` to the data on the import window. 
+ + If you solved the exercise correctly, you'll see that RStudio opens a tab with a viewer of the dataframe --- @@ -258,11 +259,11 @@ name: data-in-r ## In R: -__R__ works in a completely different way: +Datasets are called __dataframes__. R works with them in a different way: -* You can load __as many datasets as you wish__ or your computer's memory allows +* You can load __as many dataframes as you wish__ or your computer's memory allows -* Operations will have lasting effects __only if you store them__ +* Operations will have lasting effects __only if you store their results__ --- @@ -270,7 +271,7 @@ __R__ works in a completely different way: ## In R: -* Everything that exists in R's memory -- variables, datasets, functions -- __is an object__ +* Everything that exists in R's memory -variables, dataframes, functions- __is an object__ * You could think of an object like a chunk of data with some properties that has a name by which you call it @@ -294,7 +295,7 @@ View(whr) # <--- Note that the first letter is uppercase ``` -```{r echo = FALSE, out.width = "45%"} +```{r echo = FALSE, out.width = "50%"} knitr::include_graphics("img/View.png") ``` @@ -305,7 +306,7 @@ knitr::include_graphics("img/View.png") Alternatively we can print the first 6 obs. with `head()`: ```{r echo=FALSE} -knitr::include_app("https://rrmaximiliano.shinyapps.io/learnr-app/") +knitr::include_app("https://luizaandrade.shinyapps.io/learnr/") ``` --- @@ -316,7 +317,7 @@ Now, let's try some simple manipulations. First, assume we're only interested in ## Exercise 2: Subset the data (`r fa("clock")` 1 min) -- Subset the data set, keeping only observations where variable `year` equals `2016`. +- Subset the dataframe, keeping only observations where variable `year` equals `2016`. 
```{r, eval=FALSE} # To do that we'll use the subset() function @@ -341,7 +342,7 @@ head(whr) ``` ```{r echo=FALSE} -knitr::include_app("https://rrmaximiliano.shinyapps.io/learnr-app/") +knitr::include_app("https://luizaandrade.shinyapps.io/learnr/") ``` --- @@ -369,7 +370,7 @@ x <- 42 ``` ```{r echo=FALSE} -knitr::include_app("https://rrmaximiliano.shinyapps.io/learnr-app/") +knitr::include_app("https://luizaandrade.shinyapps.io/learnr/") ``` --- @@ -388,7 +389,7 @@ knitr::include_graphics("img/x_42.png") ## Exercise 3: Create an object (`r fa("clock")` 1 min) -Create a new dataset, called `whr2016`, that is a subset of the `whr` data set containing only data from the year 2016. +Create a new dataframe, called `whr2016`, that is a subset of the `whr` dataframe containing only data from the year 2016. ```{r, include = T, results = "hide"} # Using the same function but now assigning it to an object @@ -397,30 +398,16 @@ whr2016 <- subset(whr, year == 2016) # Display the 6 first obs. of the new data head(whr2016) -# Notice that we still have the original data set intact -head(whr) - -``` - ---- - -# Data in R - -```{r, include = T, results = "hide"} -whr2016 <- subset(whr, year == 2016) -head(whr2016) +# Notice that we still have the original dataframe intact head(whr) -``` -```{r echo=FALSE} -knitr::include_app("https://rrmaximiliano.shinyapps.io/learnr-app/") ``` --- # Data in R -You can also see that your environment panel now has two `Data` objects: +You can also see that your environment panel now has two data objects: ```{r echo = FALSE, out.width = "60%"} knitr::include_graphics("img/environment_2vars_2021.png") @@ -511,7 +498,7 @@ knitr::include_graphics("img/subset_arguments.png") * Usually the first argument is the object you want to use the function on, e.g. `subset(whr, ...)` -* Functions usually return values that you can store in an object, print or use directly as an argument of another function. 
+* Functions usually return values that you can store in an object, print or use directly as an argument of another function. __They rarely modify an object in-place__ We will explore these ideas in depth in a later session. @@ -542,7 +529,7 @@ This will give you the foundation to explore your data and construct analytical * An object is like a global or local in Stata, it's __something you can refer to later__ in your code to get a value -* But while you can only put a number or a string in a global, __you can put anything into an object__: scalars, strings, datasets, vectors, plots, functions, etc. +* But while you can only put a number or a string in a global, __you can put anything into an object__: scalars, strings, dataframes, vectors, plots, functions * Objects also have attributes that can be used to manipulate them @@ -556,7 +543,7 @@ Here are the object classes we will cover in this first session: * __Vectors:__ an uni-dimensional object that __stores a sequence of values of the same class__ -* __Data frames:__ a combination of different vectors of the same length (the same as your dataset in Stata) +* __Dataframes:__ a combination of different vectors of the same length (the same as a dataset in Stata) * __Lists:__ a multidimensional object that can store several objects __of different classes and dimensions__ @@ -588,7 +575,7 @@ v2 <- 1:5 # Alternative way to create an evenly spaced vector ``` ```{r echo=FALSE} -knitr::include_app("https://rrmaximiliano.shinyapps.io/learnr-app/") +knitr::include_app("https://luizaandrade.shinyapps.io/learnr/") ``` --- @@ -602,54 +589,18 @@ v2[1:3] # Prints from the 1st to the 3rd element ``` ```{r echo=FALSE} -knitr::include_app("https://rrmaximiliano.shinyapps.io/learnr-app/") +knitr::include_app("https://luizaandrade.shinyapps.io/learnr/") ``` --- -# R objects - Vectors - -## Vectors +# R objects - Dataframes -To R, each of the columns of the object `whr` is a vector. 
+## Dataframes -### Calling a vector from a `data.frame` column: - -We use the `$` character (operator) to extract vectors (variables) by their names in a `data.frame` - -For example: +The `whr` and `whr2016` objects are both dataframes. You can also construct a new dataframe from scratch by __combining vectors with the same number of elements__ with the command `data.frame()`. -```{r} -# Create a vector with the values of the "year" variable -year_vector <- whr$year - -# See the 3 first elements of the year column -whr$year[1:3] - -``` - ---- - -# R objects - Vectors - -```{r, eval=F} -year_vector <- whr$year # creates a vector with the values of the "year" variable -whr$year[1:3] # see the 3 first elements of the year column -``` - -```{r echo=FALSE} -knitr::include_app("https://rrmaximiliano.shinyapps.io/learnr-app/") -``` - ---- - -# R objects - Data frames - -## Data frames - -The `whr` and `whr2016` objects are both data frames. You can also construct a new data frame from scratch by __combining vectors with the same number of elements__. - -#### Now, type the following code to create a new data frame +#### Now, type the following code to create a new dataframe ```{r} # Dataframe created by biding vectors df1 <- data.frame(v1,v2) @@ -658,7 +609,7 @@ df1 --- -# R objects - Data frames +# R objects - Dataframes ```{r, eval=F} df1 <- data.frame(v1,v2) #creates a df by binding to existing vectors @@ -666,16 +617,16 @@ df1 ``` ```{r echo=FALSE} -knitr::include_app("https://rrmaximiliano.shinyapps.io/learnr-app/") +knitr::include_app("https://luizaandrade.shinyapps.io/learnr/") ``` --- -# R objects - Data frames +# R objects - Dataframes -## Data frames +## Dataframes -Since a data frame has two dimensions, you can use indices for both. The first index indicates the row selection and the second indicates the column. +Since a dataframe has two dimensions, you can use indices for both. The first index indicates the row selection and the second indicates the column. 
### Numeric indexing ```{r, eval = F} @@ -692,7 +643,7 @@ whr[45,1] --- -# R objects - Data frames +# R objects - Dataframes ```{r, eval = F} whr[,1] # The first column of whr @@ -701,14 +652,14 @@ whr[45,1] # Or the 45th element of the first column ``` ```{r echo=FALSE} -knitr::include_app("https://rrmaximiliano.shinyapps.io/learnr-app/") +knitr::include_app("https://luizaandrade.shinyapps.io/learnr/") ``` --- -# R objects - Data frames +# R objects - Dataframes -## Data frames +## Dataframes Alternatively, you can use the column names for indexing, which is the same as using the `$` sign. @@ -716,12 +667,11 @@ Alternatively, you can use the column names for indexing, which is the same as u ```{r} # The 22th element of the country column whr[22,"country"] # The same as whr$country[22] - ``` --- -# R objects - Data frames +# R objects - Dataframes ```{r, eval=F} # The 22th element of the country column @@ -729,7 +679,38 @@ whr[22,"country"] # The same as whr$country[22] ``` ```{r echo=FALSE} -knitr::include_app("https://rrmaximiliano.shinyapps.io/learnr-app/") +knitr::include_app("https://luizaandrade.shinyapps.io/learnr/") +``` + +--- + +# R objects - Dataframes + +## Vectors in dataframes + +To R, each of the columns of the object `whr` is a vector. + +### Calling a vector from a dataframe: + +We use the `$` character to extract vectors (variables) by their names in a dataframe + +For example: + +```{r} +# Create a vector with the values of the "year" variable +year_vector <- whr$year +``` + +--- + +# R objects - Dataframes + +```{r, eval=F} +year_vector <- whr$year # creates a vector with the values of the "year" variable +``` + +```{r echo=FALSE} +knitr::include_app("https://luizaandrade.shinyapps.io/learnr/") ``` --- @@ -740,9 +721,9 @@ knitr::include_app("https://rrmaximiliano.shinyapps.io/learnr-app/") Lists are more complex objects that can contain many objects of __different classes and dimensions__. 
-The outputs of many functions, a regression for example, are similar to lists. +The outputs of many functions, a regression for example, are similar to lists (more on this in a later session). -It would be beyond the scope of this introduction to go deep into them, but here's a quick example: +Here's a quick example: ### Combine several objects of different types in a list ```{r, include = T, results = "hide"} @@ -761,7 +742,7 @@ print(lst) # checking the content of lst ``` ```{r echo=FALSE} -knitr::include_app("https://rrmaximiliano.shinyapps.io/learnr-app/") +knitr::include_app("https://luizaandrade.shinyapps.io/learnr/") ``` --- @@ -780,13 +761,13 @@ You can subset lists using single brackets (`[]`) or double brackets (`[[]]`) # R objects - Lists ```{r, eval=F} +lst <- list(v1, df1, 45) lst[[3]] # returns 45 lst[3] # returns a list of one element (45) -lst[c(1,2)] # returns a list with the first two elements of "lst" ``` ```{r echo=FALSE} -knitr::include_app("https://rrmaximiliano.shinyapps.io/learnr-app/") +knitr::include_app("https://luizaandrade.shinyapps.io/learnr/") ``` --- @@ -815,45 +796,18 @@ As in Stata, there are two different ways to store numbers. They are different b # Basic types of data - Strings -Now we'll use string data to practice some basic object manipulations in R. - -### Exercise 4: Create a vector of strings (`r fa("clock")` 2 min) -Create a string vector containing the names of commonly used statistical software: -```{r, include = T, results = "hide"} - -# Creating string vector -str_vec <- c("R", "Python", "SAS", "Excel", "Stata") -``` - -Now print them to check them out. 
- ---- - -# Basic types of data - Strings - -```{r, eval=F} -# Creating string vector -str_vec <- c("R", "Python", "SAS", "Excel", "Stata") -``` - -```{r echo=FALSE} -knitr::include_app("https://rrmaximiliano.shinyapps.io/learnr-app/") -``` - ---- - -# Basic types of data - Strings +### Exercise 4: Concatenate strings (`r fa("clock")` 3 min) -### Exercise 5: Concatenate strings (`r fa("clock")` 3 min) +1. Create the following vector of strings: `str_vec <- c("R", "Python", "SAS", "Excel", "Stata")` - 1. Create a scalar (a vector of one element) containing the phrase "can be an option to" and call it `str_scalar`. Your code will be similar to this: `str_scalar <- "can be an option to"` +1. Create a scalar (a vector of one element) containing the phrase "can be an option to" and call it `str_scalar`. Your code will be similar to this: `str_scalar <- "can be an option to"` - 2. Use the function `paste()` with 3 arguments separated by commas: +1. Use the function `paste()` with 3 arguments separated by commas: + The first argument as the 1st element of `str_vec`. + The second argument as the `str_scalar`. + The third argument as the 5th element of `str_vec`. - 3. If you're not sure where to start, type: +1. If you're not sure where to start, type: ```{r, eval=FALSE} help(paste) ``` @@ -863,12 +817,13 @@ help(paste) # Basic types of data - Strings ```{r, eval=F} +str_vec <- c("R", "Python", "SAS", "Excel", "Stata") str_scalar <- "can be an option to" # creating str_scalar paste(str_vec[1], str_scalar, str_vec[5]) # using paste() ``` ```{r echo=FALSE} -knitr::include_app("https://rrmaximiliano.shinyapps.io/learnr-app/") +knitr::include_app("https://luizaandrade.shinyapps.io/learnr/") ``` --- @@ -898,20 +853,6 @@ Booleans are __logical binary variables__, accepting either `TRUE` or `FALSE` as # Advanced types of data -## Factors - -We'll learn more about factors in a later session, since they are important for the kind of analysis we usually do. 
For now, here are two important things to keep in mind when using them. - -Unlike Stata, in R: - -1. __You use the labels to refer to factors__ - -2. __You cannot choose the underlying values__ - ---- - -# Advanced types of data - ## Booleans Boolean data is the result of logical conditions. It can take two possible values: `TRUE` or `FALSE`. @@ -941,7 +882,7 @@ boolean_false <- FALSE ``` ```{r echo=FALSE} -knitr::include_app("https://rrmaximiliano.shinyapps.io/learnr-app/") +knitr::include_app("https://luizaandrade.shinyapps.io/learnr/") ``` --- @@ -950,7 +891,7 @@ knitr::include_app("https://rrmaximiliano.shinyapps.io/learnr-app/") ## Booleans -### Exercise 6 (`r fa("clock")` 3 min) +### Exercise 5 (`r fa("clock")` 3 min) Create a boolean vector with the condition of annual income below average: @@ -972,7 +913,7 @@ head(inc_below_avg) # See the 6 first elements of the vector ``` ```{r echo=FALSE} -knitr::include_app("https://rrmaximiliano.shinyapps.io/learnr-app/") +knitr::include_app("https://luizaandrade.shinyapps.io/learnr/") ``` --- @@ -1008,7 +949,7 @@ boolean2 <- c(FALSE, TRUE, TRUE, TRUE, TRUE) # And this to select every elemen ``` ```{r echo=FALSE} -knitr::include_app("https://rrmaximiliano.shinyapps.io/learnr-app/") +knitr::include_app("https://luizaandrade.shinyapps.io/learnr/") ``` --- @@ -1017,9 +958,9 @@ knitr::include_app("https://rrmaximiliano.shinyapps.io/learnr-app/") ## Booleans -Now let's use the boolean vector `inc_below_avg` to add a dummy variable in the `whr` data set for the same condition. +Now let's use the boolean vector `inc_below_avg` to add a dummy variable in the `whr` dataframe for the same condition. -### Exercise 7 (`r fa("clock")` 3 min) +### Exercise 6 (`r fa("clock")` 3 min) * Create a column in `whr` containing zeros and call it `rank_low`. 
You can do this by typing: @@ -1046,7 +987,7 @@ whr$rank_low[inc_below_avg] <- 1 ``` ```{r echo=FALSE} -knitr::include_app("https://rrmaximiliano.shinyapps.io/learnr-app/") +knitr::include_app("https://luizaandrade.shinyapps.io/learnr/") ``` --- @@ -1138,26 +1079,26 @@ Some possible disadvantages of Stata: Here are some advantages of R: -* R is a free and open source software! +* R is free and open-source software, a huge advantage for open science -* It allows you to have several datasets open simultaneously. +* It allows you to have several dataframes open simultaneously + No need to use `keep`, `preserve`, `restore` -* It can run complex Geographic Information System (GIS) analyses. +* It can run complex Geographic Information System (GIS) analyses -* You can use it for web scrapping. +* You can use it for web scraping and working with APIs -* You can run machine learning algorithms with it. +* You can easily run machine learning algorithms with it -* You can create complex Markdown documents. This presentation, for example, is entirely done in R. +* You can create complex Markdown documents. This presentation, for example, is entirely done in R -* You can create interactive dashboards and online applications with the Shiny package. +* You can create interactive dashboards and online applications with the Shiny package --- # Appendix - Syntax -R's syntax is a bit heavier than Stata's: +R's syntax is heavier than Stata's: * Parentheses to separate function names from its arguments. * Commas to separate arguments. @@ -1322,7 +1263,7 @@ round(num) Help in R works very much like in Stata: the help files usually start with a brief description of the function, explain its syntax and arguments and list a few examples. 
There are two ways to access help files: -## Exercise 8: Use help +## Exercise 7: Use help ```{r, eval=FALSE} # You can use the help() function diff --git a/Presentations/01-intro-to-R.html b/Presentations/01-intro-to-R.html index 5de57a4..a2b927d 100644 --- a/Presentations/01-intro-to-R.html +++ b/Presentations/01-intro-to-R.html @@ -3,7 +3,7 @@ Session 1 - Introduction to R - + @@ -31,10 +31,10 @@ ## R for Stata Users ] .author[ -### Luiza Andrade, Rob Marty, Rony Rodriguez-Ramirez, Luis Eduardo San Martin, Leonardo Viotti +### Luiza Andrade, Marc-Andrea Fiorina, Rob Marty, Rony Rodriguez-Ramirez, Luis Eduardo San Martin, Leonardo Viotti ] .date[ -### The World Bank | WB Github
May 2022 +### The World Bank | WB Github
March 2023 ] --- @@ -84,9 +84,9 @@ ## Format -- Every session has two TAs. For this session, our TAs are __Luiza Cardoso De Andrade__ and __Rony Rodriguez-Ramirez__ +- Every session has two TAs. For this session, our TAs are __Luiza Cardoso De Andrade__ and __Marc-Andrea Fiorina__ -- The TAs will help you troubleshooting __particular issues__ which make you unable to follow along the presentation. Send them a message over the chat whenever you need help +- The TAs will help you troubleshooting __particular issues__ which make you unable to follow along the presentation. Send a message over the chat whenever you need help --- @@ -103,7 +103,7 @@ - The materials of each session will be shared in the OSF page of the course by the end of each session: https://osf.io/86g3b/ -- The recordings will be shared in a WB internal link +- The recordings will be shared each day after the session --- @@ -147,40 +147,41 @@ # Getting started - RStudio interface -<img src="img/scritpt1.png" width="70%" style="display: block; margin: auto;" /> +<img src="img/script1.png" width="70%" style="display: block; margin: auto;" /> --- # Getting started - RStudio interface -<img src="img/scritpt2.png" width="70%" style="display: block; margin: auto;" /> +<img src="img/script2.png" width="70%" style="display: block; margin: auto;" /> --- # Getting started - RStudio interface -<img src="img/scritpt3.png" width="70%" style="display: block; margin: auto;" /> +<img src="img/script3.png" width="70%" style="display: block; margin: auto;" /> --- # Getting started - RStudio interface -<img src="img/scritpt4.png" width="70%" style="display: block; margin: auto;" /> +<img src="img/script4.png" width="70%" style="display: block; margin: auto;" /> --- # Getting started - Importing data -Let's start by loading the data set we'll be using: +Let's start by loading the data we'll be using: -## Exercise 1: Import data manually <font size="5">( 3 min)</font> +## Exercise 1: Import data manually <font size="5">( 
3 min)</font> -1. Go to the OSF page of the course (https://osf.io/86g3b/) and download the file located in `R for Stata Users - 2022 May` > `Data` > `whr_panel.csv` +1. Go to the OSF page of the course (https://osf.io/86g3b/) and download the file located in `R for Stata Users - 2023 March` > `Data` > `whr_panel.csv` 2. In RStudio, go to `File` > `Import Dataset` > `From Text (base)` and open the `whr_panel.csv` file. + Depending on your Rstudio version, it might be `File` > `Import Dataset` > `From CSV` -3. Assign the name `whr` to the dataset on the import window. +3. Assign the name `whr` to the data on the import window. + + If you solved the exercise correctly, you'll see that RStudio opens a tab with a viewer of the dataframe --- @@ -227,11 +228,11 @@ ## In R: -__R__ works in a completely different way: +Datasets are called __dataframes__. R works with them in a different way: -* You can load __as many datasets as you wish__ or your computer's memory allows +* You can load __as many dataframes as you wish__ or your computer's memory allows -* Operations will have lasting effects __only if you store them__ +* Operations will have lasting effects __only if you store their results__ --- @@ -239,7 +240,7 @@ ## In R: -* Everything that exists in R's memory -- variables, datasets, functions -- __is an object__ +* Everything that exists in R's memory -variables, dataframes, functions- __is an object__ * You could think of an object like a chunk of data with some properties that has a name by which you call it @@ -262,7 +263,7 @@ View(whr) # <--- Note that the first letter is uppercase ``` -<img src="img/View.png" width="45%" style="display: block; margin: auto;" /> +<img src="img/View.png" width="50%" style="display: block; margin: auto;" /> --- @@ -270,7 +271,7 @@ Alternatively we can print the first 6 obs. 
with `head()`: -<iframe src="https://rrmaximiliano.shinyapps.io/learnr-app/?showcase=0" width="100%" height="400px" data-external="1"></iframe> +<iframe src="https://luizaandrade.shinyapps.io/learnr/?showcase=0" width="100%" height="400px" data-external="1"></iframe> --- @@ -278,9 +279,9 @@ Now, let's try some simple manipulations. First, assume we're only interested in data of the year 2016. -## Exercise 2: Subset the data <font size="5">( 1 min)</font> +## Exercise 2: Subset the data <font size="5">( 1 min)</font> -- Subset the data set, keeping only observations where variable `year` equals `2016`. +- Subset the dataframe, keeping only observations where variable `year` equals `2016`. ```r @@ -307,7 +308,7 @@ head(whr) ``` -<iframe src="https://rrmaximiliano.shinyapps.io/learnr-app/?showcase=0" width="100%" height="400px" data-external="1"></iframe> +<iframe src="https://luizaandrade.shinyapps.io/learnr/?showcase=0" width="100%" height="400px" data-external="1"></iframe> --- @@ -335,7 +336,7 @@ x <- 42 ``` -<iframe src="https://rrmaximiliano.shinyapps.io/learnr-app/?showcase=0" width="100%" height="400px" data-external="1"></iframe> +<iframe src="https://luizaandrade.shinyapps.io/learnr/?showcase=0" width="100%" height="400px" data-external="1"></iframe> --- @@ -349,9 +350,9 @@ # Data in R -## Exercise 3: Create an object <font size="5">( 1 min)</font> +## Exercise 3: Create an object <font size="5">( 1 min)</font> -Create a new dataset, called `whr2016`, that is a subset of the `whr` data set containing only data from the year 2016. +Create a new dataframe, called `whr2016`, that is a subset of the `whr` dataframe containing only data from the year 2016. ```r @@ -361,7 +362,7 @@ # Display the 6 first obs. 
of the new data head(whr2016) -# Notice that we still have the original data set intact +# Notice that we still have the original dataframe intact head(whr) ``` @@ -369,20 +370,7 @@ # Data in R - -```r -whr2016 <- subset(whr, year == 2016) -head(whr2016) -head(whr) -``` - -<iframe src="https://rrmaximiliano.shinyapps.io/learnr-app/?showcase=0" width="100%" height="400px" data-external="1"></iframe> - ---- - -# Data in R - -You can also see that your environment panel now has two `Data` objects: +You can also see that your environment panel now has two data objects: <img src="img/environment_2vars_2021.png" width="60%" style="display: block; margin: auto;" /> --- @@ -468,7 +456,7 @@ * Usually the first argument is the object you want to use the function on, e.g. `subset(whr, ...)` -* Functions usually return values that you can store in an object, print or use directly as an argument of another function. +* Functions usually return values that you can store in an object, print or use directly as an argument of another function. __They rarely modify an object in-place__ We will explore these ideas in depth in a later session. @@ -499,7 +487,7 @@ * An object is like a global or local in Stata, it's __something you can refer to later__ in your code to get a value -* But while you can only put a number or a string in a global, __you can put anything into an object__: scalars, strings, datasets, vectors, plots, functions, etc. 
+* But while you can only put a number or a string in a global, __you can put anything into an object__: scalars, strings, dataframes, vectors, plots, functions * Objects also have attributes that can be used to manipulate them @@ -513,7 +501,7 @@ * __Vectors:__ an uni-dimensional object that __stores a sequence of values of the same class__ -* __Data frames:__ a combination of different vectors of the same length (the same as your dataset in Stata) +* __Dataframes:__ a combination of different vectors of the same length (the same as a dataset in Stata) * __Lists:__ a multidimensional object that can store several objects __of different classes and dimensions__ @@ -545,7 +533,7 @@ v2 <- 1:5 # Alternative way to create an evenly spaced vector ``` -<iframe src="https://rrmaximiliano.shinyapps.io/learnr-app/?showcase=0" width="100%" height="400px" data-external="1"></iframe> +<iframe src="https://luizaandrade.shinyapps.io/learnr/?showcase=0" width="100%" height="400px" data-external="1"></iframe> --- @@ -558,56 +546,17 @@ v2[1:3] # Prints from the 1st to the 3rd element ``` -<iframe src="https://rrmaximiliano.shinyapps.io/learnr-app/?showcase=0" width="100%" height="400px" data-external="1"></iframe> +<iframe src="https://luizaandrade.shinyapps.io/learnr/?showcase=0" width="100%" height="400px" data-external="1"></iframe> --- -# R objects - Vectors - -## Vectors +# R objects - Dataframes -To R, each of the columns of the object `whr` is a vector. +## Dataframes -### Calling a vector from a `data.frame` column: - -We use the `$` character (operator) to extract vectors (variables) by their names in a `data.frame` - -For example: +The `whr` and `whr2016` objects are both dataframes. You can also construct a new dataframe from scratch by __combining vectors with the same number of elements__ with the command `data.frame()`. 
- -```r -# Create a vector with the values of the "year" variable -year_vector <- whr$year - -# See the 3 first elements of the year column -whr$year[1:3] -``` - -``` -## [1] 2015 2015 2015 -``` - ---- - -# R objects - Vectors - - -```r -year_vector <- whr$year # creates a vector with the values of the "year" variable -whr$year[1:3] # see the 3 first elements of the year column -``` - -<iframe src="https://rrmaximiliano.shinyapps.io/learnr-app/?showcase=0" width="100%" height="400px" data-external="1"></iframe> - ---- - -# R objects - Data frames - -## Data frames - -The `whr` and `whr2016` objects are both data frames. You can also construct a new data frame from scratch by __combining vectors with the same number of elements__. - -#### Now, type the following code to create a new data frame +#### Now, type the following code to create a new dataframe ```r # Dataframe created by biding vectors @@ -626,7 +575,7 @@ --- -# R objects - Data frames +# R objects - Dataframes ```r @@ -634,15 +583,15 @@ df1 ``` -<iframe src="https://rrmaximiliano.shinyapps.io/learnr-app/?showcase=0" width="100%" height="400px" data-external="1"></iframe> +<iframe src="https://luizaandrade.shinyapps.io/learnr/?showcase=0" width="100%" height="400px" data-external="1"></iframe> --- -# R objects - Data frames +# R objects - Dataframes -## Data frames +## Dataframes -Since a data frame has two dimensions, you can use indices for both. The first index indicates the row selection and the second indicates the column. +Since a dataframe has two dimensions, you can use indices for both. The first index indicates the row selection and the second indicates the column. 
### Numeric indexing @@ -659,7 +608,7 @@ --- -# R objects - Data frames +# R objects - Dataframes ```r @@ -668,13 +617,13 @@ whr[45,1] # Or the 45th element of the first column ``` -<iframe src="https://rrmaximiliano.shinyapps.io/learnr-app/?showcase=0" width="100%" height="400px" data-external="1"></iframe> +<iframe src="https://luizaandrade.shinyapps.io/learnr/?showcase=0" width="100%" height="400px" data-external="1"></iframe> --- -# R objects - Data frames +# R objects - Dataframes -## Data frames +## Dataframes Alternatively, you can use the column names for indexing, which is the same as using the `$` sign. @@ -686,15 +635,12 @@ ``` ``` -## # A tibble: 1 × 1 -## country -## <chr> -## 1 Oman +## [1] "Oman" ``` --- -# R objects - Data frames +# R objects - Dataframes ```r @@ -702,7 +648,38 @@ whr[22,"country"] # The same as whr$country[22] ``` -<iframe src="https://rrmaximiliano.shinyapps.io/learnr-app/?showcase=0" width="100%" height="400px" data-external="1"></iframe> +<iframe src="https://luizaandrade.shinyapps.io/learnr/?showcase=0" width="100%" height="400px" data-external="1"></iframe> + +--- + +# R objects - Dataframes + +## Vectors in dataframes + +To R, each of the columns of the object `whr` is a vector. + +### Calling a vector from a dataframe: + +We use the `$` character to extract vectors (variables) by their names in a dataframe + +For example: + + +```r +# Create a vector with the values of the "year" variable +year_vector <- whr$year +``` + +--- + +# R objects - Dataframes + + +```r +year_vector <- whr$year # creates a vector with the values of the "year" variable +``` + +<iframe src="https://luizaandrade.shinyapps.io/learnr/?showcase=0" width="100%" height="400px" data-external="1"></iframe> --- @@ -712,9 +689,9 @@ Lists are more complex objects that can contain many objects of __different classes and dimensions__. -The outputs of many functions, a regression for example, are similar to lists. 
+The outputs of many functions, a regression for example, are similar to lists (more on this in a later session). -It would be beyond the scope of this introduction to go deep into them, but here's a quick example: +Here's a quick example: ### Combine several objects of different types in a list @@ -734,7 +711,7 @@ print(lst) # checking the content of lst ``` -<iframe src="https://rrmaximiliano.shinyapps.io/learnr-app/?showcase=0" width="100%" height="400px" data-external="1"></iframe> +<iframe src="https://luizaandrade.shinyapps.io/learnr/?showcase=0" width="100%" height="400px" data-external="1"></iframe> --- @@ -753,12 +730,12 @@ ```r +lst <- list(v1, df1, 45) lst[[3]] # returns 45 lst[3] # returns a list of one element (45) -lst[c(1,2)] # returns a list with the first two elements of "lst" ``` -<iframe src="https://rrmaximiliano.shinyapps.io/learnr-app/?showcase=0" width="100%" height="400px" data-external="1"></iframe> +<iframe src="https://luizaandrade.shinyapps.io/learnr/?showcase=0" width="100%" height="400px" data-external="1"></iframe> --- class: inverse, center, middle @@ -786,44 +763,18 @@ # Basic types of data - Strings -Now we'll use string data to practice some basic object manipulations in R. - -### Exercise 4: Create a vector of strings <font size="5">( 2 min)</font> -Create a string vector containing the names of commonly used statistical software: - -```r -# Creating string vector -str_vec <- c("R", "Python", "SAS", "Excel", "Stata") -``` - -Now print them to check them out. - ---- - -# Basic types of data - Strings - +### Exercise 4: Concatenate strings <font size="5">( 3 min)</font> -```r -# Creating string vector -str_vec <- c("R", "Python", "SAS", "Excel", "Stata") -``` - -<iframe src="https://rrmaximiliano.shinyapps.io/learnr-app/?showcase=0" width="100%" height="400px" data-external="1"></iframe> - ---- - -# Basic types of data - Strings +1. 
Create the following vector of strings: `str_vec <- c("R", "Python", "SAS", "Excel", "Stata")` -### Exercise 5: Concatenate strings <font size="5">( 3 min)</font> +1. Create a scalar (a vector of one element) containing the phrase "can be an option to" and call it `str_scalar`. Your code will be similar to this: `str_scalar <- "can be an option to"` - 1. Create a scalar (a vector of one element) containing the phrase "can be an option to" and call it `str_scalar`. Your code will be similar to this: `str_scalar <- "can be an option to"` - - 2. Use the function `paste()` with 3 arguments separated by commas: +1. Use the function `paste()` with 3 arguments separated by commas: + The first argument as the 1st element of `str_vec`. + The second argument as the `str_scalar`. + The third argument as the 5th element of `str_vec`. - 3. If you're not sure where to start, type: +1. If you're not sure where to start, type: ```r help(paste) @@ -835,11 +786,12 @@ ```r +str_vec <- c("R", "Python", "SAS", "Excel", "Stata") str_scalar <- "can be an option to" # creating str_scalar paste(str_vec[1], str_scalar, str_vec[5]) # using paste() ``` -<iframe src="https://rrmaximiliano.shinyapps.io/learnr-app/?showcase=0" width="100%" height="400px" data-external="1"></iframe> +<iframe src="https://luizaandrade.shinyapps.io/learnr/?showcase=0" width="100%" height="400px" data-external="1"></iframe> --- @@ -868,20 +820,6 @@ # Advanced types of data -## Factors - -We'll learn more about factors in a later session, since they are important for the kind of analysis we usually do. For now, here are two important things to keep in mind when using them. - -Unlike Stata, in R: - -1. __You use the labels to refer to factors__ - -2. __You cannot choose the underlying values__ - ---- - -# Advanced types of data - ## Booleans Boolean data is the result of logical conditions. It can take two possible values: `TRUE` or `FALSE`. 
@@ -924,7 +862,7 @@ boolean_false <- FALSE ``` -<iframe src="https://rrmaximiliano.shinyapps.io/learnr-app/?showcase=0" width="100%" height="400px" data-external="1"></iframe> +<iframe src="https://luizaandrade.shinyapps.io/learnr/?showcase=0" width="100%" height="400px" data-external="1"></iframe> --- @@ -932,7 +870,7 @@ ## Booleans -### Exercise 6 <font size="5">( 3 min)</font> +### Exercise 5 <font size="5">( 3 min)</font> Create a boolean vector with the condition of annual income below average: @@ -959,7 +897,7 @@ head(inc_below_avg) # See the 6 first elements of the vector ``` -<iframe src="https://rrmaximiliano.shinyapps.io/learnr-app/?showcase=0" width="100%" height="400px" data-external="1"></iframe> +<iframe src="https://luizaandrade.shinyapps.io/learnr/?showcase=0" width="100%" height="400px" data-external="1"></iframe> --- @@ -1011,7 +949,7 @@ boolean2 <- c(FALSE, TRUE, TRUE, TRUE, TRUE) # And this to select every element but the first ``` -<iframe src="https://rrmaximiliano.shinyapps.io/learnr-app/?showcase=0" width="100%" height="400px" data-external="1"></iframe> +<iframe src="https://luizaandrade.shinyapps.io/learnr/?showcase=0" width="100%" height="400px" data-external="1"></iframe> --- @@ -1019,9 +957,9 @@ ## Booleans -Now let's use the boolean vector `inc_below_avg` to add a dummy variable in the `whr` data set for the same condition. +Now let's use the boolean vector `inc_below_avg` to add a dummy variable in the `whr` dataframe for the same condition. -### Exercise 7 <font size="5">( 3 min)</font> +### Exercise 6 <font size="5">( 3 min)</font> * Create a column in `whr` containing zeros and call it `rank_low`. 
You can do this by typing: @@ -1050,7 +988,7 @@ # this ^ turns its values to 1, for the observations with a TRUE value in inc_below_avg ``` -<iframe src="https://rrmaximiliano.shinyapps.io/learnr-app/?showcase=0" width="100%" height="400px" data-external="1"></iframe> +<iframe src="https://luizaandrade.shinyapps.io/learnr/?showcase=0" width="100%" height="400px" data-external="1"></iframe> --- @@ -1141,26 +1079,26 @@ Here are some advantages of R: -* R is a free and open source software! +* R is free and open source software, a huge advantage for open science -* It allows you to have several datasets open simultaneously. +* It allows you to have several dataframes open simultaneously + No need to use `keep`, `preserve`, `restore` -* It can run complex Geographic Information System (GIS) analyses. +* It can run complex Geographic Information System (GIS) analyses -* You can use it for web scrapping. +* You can use it for web scraping and APIs -* You can run machine learning algorithms with it. +* You can easily run machine learning algorithms with it -* You can create complex Markdown documents. This presentation, for example, is entirely done in R. +* You can create complex Markdown documents. This presentation, for example, is entirely done in R -* You can create interactive dashboards and online applications with the Shiny package. +* You can create interactive dashboards and online applications with the Shiny package --- # Appendix - Syntax -R's syntax is a bit heavier than Stata's: +R's syntax is heavier than Stata's: * Parentheses to separate function names from its arguments. * Commas to separate arguments. @@ -1359,7 +1297,7 @@ Help in R works very much like in Stata: the help files usually start with a brief description of the function, explain its syntax and arguments and list a few examples. 
There are two ways to access help files: -## Exercise 8: Use help +## Exercise 7: Use help ```r diff --git a/Presentations/01-intro-to-R.pdf b/Presentations/01-intro-to-R.pdf index 083ff98..c12e085 100644 Binary files a/Presentations/01-intro-to-R.pdf and b/Presentations/01-intro-to-R.pdf differ diff --git a/Presentations/02-intro-to-R-programming.Rmd b/Presentations/02-intro-to-R-programming.Rmd index 40800a1..b6ad390 100644 --- a/Presentations/02-intro-to-R-programming.Rmd +++ b/Presentations/02-intro-to-R-programming.Rmd @@ -1,8 +1,8 @@ --- title: "Session 2: Introduction to R Programming" subtitle: "R for Stata Users" -author: "Luiza Andrade, Rob Marty, Rony Rodriguez-Ramirez, Luis Eduardo San Martin, Leonardo Viotti" -date: "The World Bank | [WB Github](https://github.com/worldbank)
May 2022" +author: "Luiza Andrade, Marc-Andrea Fiorina, Rob Marty, Rony Rodriguez-Ramirez, Luis Eduardo San Martin, Leonardo Viotti" +date: "The World Bank | [WB Github](https://github.com/worldbank)
March 2023" output: xaringan::moon_reader: css: ["libs/remark-css/default.css", @@ -74,16 +74,13 @@ whr <- read_csv(here("DataWork", "DataSets", "Final", "whr_panel.csv")) # Table of contents 1. [Introduction](#introduction) -2. [Initial settings](#initial-settings) -3. [File paths](#file-paths) -4. [Exploring a dataset](#exploring-a-dataset) -5. [Creating a document outline in RStudio](#creating-a-document-outline-in-rstudio) -6. [Using packages](#using-packages) -7. [Functions inception](#functions-inception) -8. [Mapping and iterations](#mapping-and-iterations) -9. [Custom functions](#custom-functions) -10. [Indentation](#indentation) -11. [Appendix](#appendix) +1. [Initial settings](#initial-settings) +1. [File paths](#file-paths) +1. [Using packages](#using-packages) +1. [Functions inception](#functions-inception) +1. [Mapping and iterations](#mapping-and-iterations) +1. [Custom functions](#custom-functions) +1. [Appendix](#appendix) --- @@ -133,7 +130,7 @@ name: initial-settings * Notice two things: - 1. Your environment is *probably* empty (it's OK if it's not) + 1. Your environment is *probably* empty (it's okay if it's not) ```{r echo = FALSE, out.width = "60%"} knitr::include_graphics("img/empty_environment.png") @@ -181,18 +178,18 @@ knitr::include_graphics("img/stataheader.jpg") # Initial settings +Have you ever seen these lines of code before? 
+ +```{r echo = FALSE, out.width = "40%"} +knitr::include_graphics("img/stataheader.jpg") +``` + * We __don't need to set the memory or the maximum number of variables__ in R * The equivalent of `set more off` is the default * The equivalent of `clear all` is not a default setting, but we'll change that in exercise 1 -* In any case, remember that you can see all the objects in your computer's memory at any point in the `Environment` panel - -```{r echo = FALSE, out.width = "65%"} -knitr::include_graphics("img/environment_2vars_2021.png") -``` - --- # Initial settings @@ -263,55 +260,69 @@ setwd("your/path") * Instead, you should use RStudio projects and the `here` library -> __Important:__ We won't get into the specifics of directory organization here, but we'll assume that all the files you use for a specific project (data, scripts, and outputs) reside in the same project directory. We'll call this the __working directory__. - * RStudio projects let you "bind" your project files to a root directory, regardless of the path to it * This is crucial because it allows smooth interoperability between different computers where the exact path to the project root directory differs * Additionally, each RStudio project you work on keeps their own history of commands! +__Important:__ We won't get into the specifics of directory organization here, but we'll assume that all the files you use for a specific project (data, scripts, and outputs) reside in the same project directory. We'll call this the __working directory__. + --- # RStudio projects .exercise[ -### Exercise 2 `r fa("keyboard")` (`r fa("clock")` 1 min) +### Exercise 2 `r fa("keyboard")` (`r fa("clock")` 3 min) + +1. Create a folder named `dime-r-training-mar2023` in your preferred location in your computer + +1. Go to https://osf.io/86g3b/ and download the file in: `R for Stata Users - 2023 March` > `Data` > `DataWork.zip` + +1. Unzip `DataWork.zip` in the folder `dime-r-training-mar2023` 1. 
On RStudio, select `File` > `New Project...` -2. Select `New Directory` > `New Project` +2. Select `Existing Directory` -3. Assign the name: `dime-r-training-project` to the project +3. Browse to the location of `dime-r-training-mar2023` and select `Create Project` ] --- # RStudio projects -```{r echo = FALSE, out.width = "80%"} +```{r echo = FALSE, out.width = "60%"} knitr::include_graphics("img/dime-r-training-project.png") ``` --- +# RStudio projects + +```{r echo = FALSE, out.width = "60%"} +knitr::include_graphics("img/dime-r-training-project-dir.png") +``` + +--- + # The `here` library * `here` locates files relative to your project root * It uses the root project directory to build paths to files easily -* Similar to RStudio projects, it allows for interoperability between different computers where the absolute path to the same file is not the same +* It allows for interoperability between different computers where the absolute path to the same file is not the same --- # Usage of `here` -- Install and load the `here` library: +- Load `here` ```{r, eval=FALSE} -install.packages("here") +install.packages("here") # install first if you don't have it library(here) ``` @@ -337,13 +348,7 @@ df <- read.csv(path) ### Exercise 3 `r fa("keyboard")` (`r fa("clock")` 3 min) -1. Go to the [OSF page of the course](https://osf.io/86g3b/) and download the file in: `R for Stata Users - 2022 May` > `Data` > `DataWork.zip` - -2. Unzip the file in your RStudio project root folder. This is the folder where the file `dime-r-training-project.Rproj` sits - -3. On RStudio, go to `File` > `New File` > `R Script` and save this new empty script in `DataWork` > `Code` as `exercises-session2.R` - -5. Now let's test if that worked. Load the `here` library and read the `csv` file `DataWork/DataSets/Final/whr_panel.csv` using `here()` +1. Load `here` and read the `.csv` file in `DataWork/DataSets/Final/whr_panel.csv` using `here()` + Use the function `read.csv()` to load the file. 
The argument for `read.csv()` is the result of `here()` + Remember to assign the dataframe you're reading to an object. You can call it `whr` as we did yesterday @@ -363,7 +368,7 @@ whr <- read.csv(here("DataWork", "DataSets", "Final", "whr_panel.csv")) # RStudio projects and `here` -If you did the exercise correctly, you should see the `whr` data frame listed in the Environment panel +If you did the exercise correctly, you should see the `whr` dataframe listed in the Environment panel ```{r echo = FALSE, out.width = "80%"} knitr::include_graphics("img/environment_2021.png") @@ -371,151 +376,6 @@ knitr::include_graphics("img/environment_2021.png") --- -class: inverse, center, middle -name: creating-a-document-outline-in-rstudio - -# Creating a document outline in RStudio - -

- ---- - -# Document outline - -* RStudio allows you to __create an interactive index__ for your scripts - -* To add a section to your code, create a commented line with the title of your section and add at least 4 trailing dashes (`----`), pound signs (`####`) or equal signs (`====`) after it - ---- - -# Document outline - -.exercise[ - -### Exercise 4 `r fa("keyboard")` (`r fa("clock")` 1 min) - -1. In your script, add a header before the line where you used `library(here)` with the text: `# Part 1: Loading libraries----` - -2. Before `read.csv(...)`, add the following header: `Part 2: Loading data----` - - + Remember: you create a section header by adding at least 4 trailing dashes (`-`), pound (`#`) or equal (`=`) signs in a comment line - -3. Note that once you create a section header, an arrow appears right next to the row number. Click on the arrows to see what happens. -] - ---- - -# Document outline - -* The outline can be accessed by clicking on the button on the top right corner of the script window. You can use it to jump from one section to another - -* You can also use the keyboard shortcuts `Alt + L` (`Cmd + Option + L` on Mac) and `Alt + Shift + L` to collapse and expand sections - -```{r echo = FALSE, out.width = "65%"} -knitr::include_graphics("img/document-outline.png") -``` - ---- - -class: inverse, center, middle -name: exploring-a-dataset - -# Exploring a dataset - -

- ---- - -# Exploring a dataset - -Some useful functions: - -* **`View()`:** opens the data set - -* **`class()`:** reports object type or type of data stored - -* **`dim()`:** reports the size of each one of an object's dimension - -* **`names()`:** returns the variable names of a data set - -* **`str()`:** general information about the structure of an R object - -* **`summary()`:** summary information about the variables in a data frame - -* **`head()`:** shows the first few observations in the dataset - -* **`tail()`:** shows the last few observations in the dataset - ---- - -# Exploring a dataset - -```{r, eval = F} -# View the data set (notice the uppercase "V") -View(whr) -``` - -This is the same as clicking on the object name in the environment panel. It opens a spreadsheet-style data viewer of a dataframe. - -```{r echo = FALSE, out.width = "80%"} -knitr::include_graphics("img/whr_viewer.png") -``` - ---- - -# Exploring a dataset - -```{r, eval = F} -# Object type and dimensions -class(whr) -dim(whr) -``` - -```{r echo=FALSE} -knitr::include_app("https://luizaandrade.shinyapps.io/learnr/") -``` - ---- - -# Exploring a dataset - -```{r, eval = F} -# Object structure -str(whr) -``` - -```{r echo=FALSE} -knitr::include_app("https://luizaandrade.shinyapps.io/learnr/") -``` - ---- - -# Exploring a dataset - -```{r, eval = F} -# Summarize a dataframe -summary(whr) -``` - -```{r echo=FALSE} -knitr::include_app("https://luizaandrade.shinyapps.io/learnr/") -``` - ---- - -# Exploring a dataset - -```{r, eval = F} -# Printing the first rows of a dataframe -head(whr) -``` - -```{r echo=FALSE} -knitr::include_app("https://luizaandrade.shinyapps.io/learnr/") -``` - ---- - class: inverse, center, middle name: using-packages @@ -527,21 +387,21 @@ name: using-packages # Packages -* Since there is a lot of people developing for R, it can have many different functionalities. 
+* Since there are a lot of people developing for R, it can have many different functionalities -* To make it simpler, these functionalities are bundled into packages. +* To make it simpler, these functionalities are bundled into packages -* A package is just __a unit of shareable code__. +* A package is just __a unit of shareable code__ --- # Packages -* It may contain new functions, but also more complex functionalities, such as a Graphic User Interface (GUI) or settings for parallel processing (similar to Stata MP). +* Packages may contain new functions, but also more complex functionalities, such as a Graphical User Interface (GUI) or settings for parallel processing (similar to Stata MP) -* They can be shared through R's official repository - CRAN (18,000+ packages reviewed and tested). +* They are usually shared through R's official repository - CRAN (19,000+ packages reviewed and tested) -* There are many other online sources such as GitHub, but it's important to be careful, as these probably haven't gone through a review process as rigorous as those in CRAN. +* There are many other online sources such as GitHub, but it's important to be careful, as these probably haven't gone through a review process as rigorous as those in CRAN --- @@ -551,7 +411,7 @@ name: using-packages ```{r, eval = F} # Installing a package -install.packages("tidyverse", +install.packages("dplyr", dependencies = TRUE) # the dependencies argument also installs all other packages # that it may depend upon to run @@ -565,10 +425,10 @@ install.packages("tidyverse", .exercise[ -### Exercise 5 `r fa("keyboard")` (`r fa("clock")` 1 min) +### Exercise 4 `r fa("keyboard")` (`r fa("clock")` 1 min) -1. Load the `tidyverse` meta package in part 1 of your script using `library(tidyverse)` -2. Run your script +1. Load the packages `dplyr` and `purrr` in part 1 of your script using `library(dplyr)` and `library(purrr)` +1. 
Run your script ] @@ -579,17 +439,17 @@ install.packages("tidyverse", What if this happens? ```{r echo = FALSE, out.width = "70%"} -knitr::include_graphics("img/warning_2021.png") +knitr::include_graphics("img/Warning.png") ``` --- # Warnings vs errors -R has two types of error messages, `warnings` and actual `errors`: +R has two types of error messages, warnings and actual errors: - * `Errors` - break your code, usually preventing it from running. - * `Warnings` - usually mean that nothing went wrong yet, but you should be careful. + * **Errors** - break your code, usually preventing it from running + * **Warnings** - your code kept running, but R wants you to be aware of something that might be a problem later RStudio's default is to print warning messages, but not to stop the code at the lines where they occur. You can configure R to stop at warnings if you want. @@ -653,7 +513,7 @@ knitr::include_graphics("img/statalog.png") * Metaprogramming is a __very powerful technique__, as you will soon see -* It's __also a common source of error__, as you can only use one function inside the other if the output of the inner function is the same as the input of the outer function +* It's __also a common source of error__, as you can only use one function inside the other if the output of the inner function can be taken as the input of the outer function * It can also get quite tricky to follow what a line of code with multiple functions inceptions is doing @@ -716,9 +576,13 @@ name: mapping-and-iterations } ``` +--- + +# Map + * R, however, has a set of functions that allows users to loop through an object __in a more efficient way__, without using explicit loops -* In this training we'll introduce `map()`. It is a function part of the tidyverse meta package +* In this training we'll introduce `map()`. 
It is a function from `purrr`, a package that contains tools for functional programming * Also, in case you have not noticed yet: __R is vectorized!__ this means that many operations are applied element-wise by default so you don't have to code loops to apply them to each element of a vector or dataframe @@ -726,15 +590,15 @@ name: mapping-and-iterations # Map -* To use `map()`, you need to either load the package `purrr` eithre by itself or alongside other `tidyverse` packages +* To use `map()`, you need to load the package `purrr` * The basic syntax of `map()` is: .command[ -**`map(X, function, ...)`:** applies `function` to each of the elements of `X`. If `X` is a data frame then `function` is applied column-wise while if it's a vector or a list it is applied item-wise. The output of `map()` is always a list with the results. +**`map(X, function, ...)`:** applies `function` to each of the elements of `X`. If `X` is a dataframe then `function` is applied column-wise while if it's a vector or a list it is applied item-wise. The output of `map()` is always a list with the results. 
- + **X:** a data frame, matrix or vector the function will be applied to + + **X:** a dataframe, matrix or vector the function will be applied to + **function:** the name of the function you want to apply to each of the elements of `X` ] @@ -768,6 +632,10 @@ knitr::include_app("https://luizaandrade.shinyapps.io/learnr/") + You can collect the answers by asking each one of them individually -- __this is looping__ + Otherwise, you can ask them to raise their hands and collect all answers at once -- __this is `map()`__ +--- + +# Map vs looping + * The output of a loop is the regular output of the operation you're repeating, times the number of iterations you did * The output of `map()` will be always a list @@ -783,12 +651,12 @@ knitr::include_app("https://luizaandrade.shinyapps.io/learnr/") .exercise[ -### Exercise 6: Looping over a dataframe (`r fa("clock")` 3 min) +### Exercise 5: Looping over a dataframe (`r fa("clock")` 3 min) -- Create a toy dataframe of 70,000 columns and 400 observations using this code +- Create a toy dataframe of 50,000 columns and 400 observations using this code ```{r, eval = FALSE} -df <- data.frame(replicate(70000, sample(1:100, 400, replace=TRUE))) +df <- data.frame(replicate(50000, sample(1:100, 400, replace=TRUE))) ``` - Create an empty vector named `col_means_loop` where you will store column means with this code: `col_means_loop <- c()` @@ -814,12 +682,12 @@ for (column in df) { The solution is this: ```{r, eval=FALSE} -df <- data.frame(replicate(70000, sample(1:100, 400, replace=TRUE))) +df <- data.frame(replicate(50000, sample(1:100, 400, replace=TRUE))) col_means_loop <- c() -for (col in df){ - col_means_loop <- append(col_means_loop, mean(col)) +for (column in df){ + col_means_loop <- append(col_means_loop, mean(column)) } ``` @@ -829,16 +697,17 @@ for (col in df){ .exercise[ -### Exercise 7: Now use `map()` `r fa("keyboard")` (`r fa("clock")` 1 min) +### Exercise 6: Now use `map()` `r fa("keyboard")` (`r fa("clock")` 1 min) -- 
Use `map()` to produce a list with the means of the columns of `df` -- Store the result in a list named `col_means_map` +1. Use `map()` to produce a list with the means of the columns of `df` +1. Store the result in a list named `col_means_map` -> Hint: - + Remember the syntax of `map()`: `map(X, function_name)` - ] +Hints: + + Remember the syntax of `map()`: `map(X, function_name)` + + The function name inside `map()` shouldn't have parentheses next to it (i.e.: `mean` instead of `mean()`) + --- # Map vs looping @@ -846,7 +715,10 @@ for (col in df){ Compare the syntax of the solutions of both exercises: ```{r, eval=FALSE} -# Looping exercise +# Dataframe creation +df <- data.frame(replicate(50000, sample(1:100, 400, replace=TRUE))) + +# Loop exercise col_means_loop <- c() for (col in df){ @@ -863,7 +735,7 @@ Do you remember which one ran faster? # Map vs looping -Last but not least, remember we said that loops produce side effects? +Also, remember we said that loops produce side effects? ```{r echo = FALSE, out.width = "50%"} knitr::include_graphics("img/loop-side-effects.png") @@ -921,7 +793,7 @@ knitr::include_app("https://luizaandrade.shinyapps.io/learnr/") .exercise[ -### Exercise 8 `r fa("keyboard")` (`r fa("clock")` 2 min) +### Exercise 7 `r fa("keyboard")` (`r fa("clock")` 2 min) Create a function named `zscore` that standardizes the values of a vector. @@ -969,20 +841,19 @@ knitr::include_app("https://luizaandrade.shinyapps.io/learnr/") .exercise[ -### Exercise 9 `r fa("keyboard")` +### Exercise 8 `r fa("keyboard")` 1. Subselect the columns `health_life_expectancy` and `freedom` in `whr` - + Use tidyverse's `select()` for this, as in: `whr %>% select(freedom, happiness_score)` + + Use dplyr's `select()` for this, as in: `whr %>% select(freedom, happiness_score)` 2. Use `map()` combined with the `zscore` function to get the z-score of these two columns and assign the resulting list to an object named `z_scores` 3. 
Use list indexing on `z_scores` to generate two new columns in `whr` with the standardized values of `health_life_expectancy` and `freedom` -> **Hints:** -* Use the pipes (`%>%`) successively -* Remember that we don't use parenthesis next to the function name we're using `map()` with -* Remember that we use double brackets instead of single brackets to index the actual elements of a list + **Hints:** +* Don't use parentheses next to the function name we're using `map()` with +* Use double brackets instead of single brackets or the symbol `$` to index the elements of a list ] @@ -1023,11 +894,11 @@ name: appendix * `.RData` stores the objects in your environment only if you save your workspace, and loads them again in the next RStudio session -* Both files are relative to the working directory where your RStudio session started +* Both files are stored in the working directory where your RStudio session started --- -# Appendix - Using packages +# Appendix - More on packages Once a package is loaded, you can use its features and functions. Here's a list of some useful packages: @@ -1037,21 +908,13 @@ Once a package is loaded, you can use its features and functions. Here's a list * `stargazer` - awesome latex regression and summary statistics tables * `foreign` - reads `.dta` and other formats from inferior statistical software * `zoo` - time series and panel data manipulation useful functions -* `data.table` - some functions to deal with huge data sets +* `data.table` - some functions to deal with huge dataframes * `sp` and `rgeos` - spatial analysis * `multiwayvcov` and `sandwich` - clustered and robust standard errors * `RODBC`, `RMySQL`, `RPostgresSQL`, `RSQLite` - For relational databases and using SQL in R. 
--- -# Appendix - Resources - -* A discussion of folder structure and data managament can be found here: https://dimewiki.worldbank.org/wiki/DataWork_Folder - -* For a broader discussion of data management, go to https://dimewiki.worldbank.org/wiki/Data_Management - ---- - # Appendix - Git Git is a version-control system for tracking changes in code and other text files. It is a great resource to include in your work flow. @@ -1071,41 +934,11 @@ https://r4ds.had.co.nz/workflow-projects.html --- -# Appendix - Commenting - -* To comment a line, write `#` as its first character - -```{r, eval=FALSE} -# This is a comment -print("But this part is not") -``` - -* You can also add `#` halfway through a line to comment whatever comes after it - -```{r, eval=FALSE} -print("This part is not a comment") # And this is a comment -``` - -* In Stata, you can use `/*` and `*/` to comment in the middle of a line's code. That is not possible in R: everything that comes after `#` will always be a comment - -* To comment a selection of lines, press `Ctrl` + `Shift` + `C` - ---- - -# Appendix - Assignment 1 - -.exercise[ - -### Exercise `r fa("keyboard")` - -1. In your script panel, select all the lines of your script +# Appendix - More on folder management -2. Use the keyboard shortcut to comment these lines. - + Shortcut: `Ctrl` + `Shift` + `C` - -3. Use the keyboard shortcut to comment these lines again. What happened? 
+* A discussion of folder structure and data management can be found here: https://dimewiki.worldbank.org/wiki/DataWork_Folder -] +* For a broader discussion of data management, go to https://dimewiki.worldbank.org/wiki/Data_Management --- @@ -1162,7 +995,7 @@ for (col in colnames(whr)) { # Appendix - Apply -* Apart from tidyverse's `map()`, base R also has a set of functions that allows users to apply a function to a number of objects without using explicit loops +* Apart from purrr's `map()`, base R also has a set of functions that allows users to apply a function to a number of objects without using explicit loops * They're called `apply` and there are many of them, with different use cases @@ -1182,10 +1015,10 @@ sapply(X, FUN, ...) * Its main arguments are: - + **X:** a data frame, matrix or vector the function will be applied to + + **X:** a dataframe, matrix or vector the function will be applied to + **FUN:** the function you want to apply -* `sapply()` applies the function (`FUN`) to all the elements of `X`. If `X` is a data frame then the function is applied column-wise, while if it's a vector or a list it is applied item-wise +* `sapply()` applies the function (`FUN`) to all the elements of `X`. If `X` is a dataframe then the function is applied column-wise, while if it's a vector or a list it is applied item-wise * The output of `sapply()` is usually a vector with the results, but it can be a matrix if the results have more than one dimension @@ -1228,7 +1061,7 @@ apply(X, MARGIN, FUN, ...) 
* Arguments: - + **X:** a data frame (or matrix) the function will be applied to + + **X:** a dataframe (or matrix) the function will be applied to + **MARGIN:** 1 to apply the function to all rows or 2 to apply the function to all columns + **FUN:** the function you want to apply @@ -1246,7 +1079,7 @@ apply(matrix, 2, mean) # column means --- -# Appendix - Assignment 2 +# Appendix - Assignment 1 ### Exercise: Get the row max @@ -1273,6 +1106,64 @@ whr %>% --- +# Appendix - Commenting + +* To comment a line, write `#` as its first character + +```{r, eval=FALSE} +# This is a comment +print("But this part is not") +``` + +* You can also add `#` halfway through a line to comment whatever comes after it + +```{r, eval=FALSE} +print("This part is not a comment") # And this is a comment +``` + +* In Stata, you can use `/*` and `*/` to comment in the middle of a line's code. That is not possible in R: everything that comes after `#` will always be a comment + +* To comment a selection of lines, press `Ctrl` + `Shift` + `C` + +--- + +# Appendix - Assignment 2 + +.exercise[ + +### Exercise `r fa("keyboard")` + +1. In your script panel, select all the lines of your script + +2. Use the keyboard shortcut to comment these lines. + + Shortcut: `Ctrl` + `Shift` + `C` + +3. Use the keyboard shortcut to comment these lines again. What happened? + +] + +--- + +# Appendix - Document outline + +* RStudio allows you to __create an interactive index__ for your scripts + +* To add a section to your code, create a commented line with the title of your section and add at least 4 trailing dashes (`----`), pound signs (`####`) or equal signs (`====`) after it + +--- + +# Appendix - Document outline + +* The outline can be accessed by clicking on the button on the top right corner of the script window. 
You can use it to jump from one section to another + +* You can also use the keyboard shortcuts `Alt + L` (`Cmd + Option + L` on Mac) and `Alt + Shift + L` to collapse and expand sections + +```{r echo = FALSE, out.width = "65%"} +knitr::include_graphics("img/document-outline.png") +``` + +--- + # Appendix - Indentation ```{r, eval = F} @@ -1376,6 +1267,28 @@ map(c(1.2, round) ``` +--- + +# Appendix - Exploring a dataframe + +Some useful functions: + +* **`View()`:** opens a visualization of the dataframe + +* **`class()`:** reports object type or type of data stored + +* **`dim()`:** reports the size of each one of an object's dimensions + +* **`names()`:** returns the variable names of a dataframe + +* **`str()`:** general information about the structure of an R object + +* **`summary()`:** summary information about the variables in a dataframe + +* **`head()`:** shows the first few observations in the dataframe + +* **`tail()`:** shows the last few observations in the dataframe + --- exclude: true diff --git a/Presentations/02-intro-to-R-programming.html b/Presentations/02-intro-to-R-programming.html index 65402f9..4e11324 100644 --- a/Presentations/02-intro-to-R-programming.html +++ b/Presentations/02-intro-to-R-programming.html @@ -3,7 +3,7 @@ Session 2: Introduction to R Programming - + @@ -25,10 +25,18 @@