diff --git a/tutorial/tour-of-the-tidyverse.Rmd b/tutorial/tour-of-the-tidyverse.Rmd
index 67ca7d1..9a23f41 100644
--- a/tutorial/tour-of-the-tidyverse.Rmd
+++ b/tutorial/tour-of-the-tidyverse.Rmd
@@ -176,51 +176,65 @@ penguins %>%
## group_by() and summarize()
Summarizing the data using `group_by()` and `summarize()`
+
+We can use `group_by()` to group our data by **species** and **sex**, and `summarize()` to calculate the average **body_mass_g** for each grouping.
```{r group-by-summarize}
+penguins %>%
+  select(species, sex, body_mass_g) %>%
+  group_by(species, sex) %>%
+  summarize(mean = mean(body_mass_g, na.rm = TRUE)) # skip missing masses so one NA doesn't make a group's mean NA
+```
+
+## count() and add_count()
+If we're just interested in _counting_ the observations in each grouping, we can group and summarize with special functions `count()` and `add_count()`.
+
+Counting can be done with `group_by()` and `summarize()`, but it's a little cumbersome.
+
+It involves two extra steps:
+first, using `mutate()` to create an intermediate variable **n_species** that adds up all observations per **species**, and
+second, an `ungroup()`-ing step.
+
+```{r}
penguins %>%
- group_by(species, sex) %>%
+ group_by(species) %>%
+ mutate(n_species = n()) %>%
+ ungroup() %>%
+ group_by(species, sex, n_species) %>%
summarize(n = n())
```
-## count() and add_count()
-Because we're just _counting_ observations in this example, we also have the option to use `count()` which simplifies our code a little.
+In contrast, `count()` and `add_count()` offer a simplified approach.
-> Thank you to Alison Hill for [these suggestions](https://github.com/spcanelon/2020-rladies-chi-tidyverse/issues/2)!
+> Thank you to Alison Hill for [this suggestion](https://github.com/spcanelon/2020-rladies-chi-tidyverse/issues/2)!
```{r count}
-penguins %>%
- count(species, sex)
+penguins %>%
+ count(species, sex) %>%
+ add_count(species, wt = n,
+ name = "n_species")
```
## mutate()
+We can add to our counting example by using `mutate()` to create a new variable **prop**, which represents the proportion (expressed as a percentage) of penguins of each **sex** within each **species**.
-### Option 1
-Creating new variables with `mutate()`
-```{r group-by-summarize-mutate}
-penguins %>%
- group_by(species) %>%
- mutate(n_species = n()) %>%
- ungroup() %>%
- group_by(species, sex, n_species) %>%
- summarize(n = n()) %>%
- mutate(prop = n/n_species*100)
-```
+> Thank you to Alison Hill for [this suggestion](https://github.com/spcanelon/2020-rladies-chi-tidyverse/issues/2)!
-### Option 2
-We can also use `mutate()` along with `add_count()` to add up the counts per species group to use as a denominator ("n_species") when we calculate the proportion by sex.
-```{r count-mutate}
+```{r}
penguins %>%
count(species, sex) %>%
- add_count(species, wt = n, name = "n_species") %>%
- mutate(prop = n/n_species*100)
+ add_count(species, wt = n,
+ name = "n_species") %>%
+ mutate(prop = n/n_species*100)
```
+
## filter()
-Regardless of which approach we take to summarize our data, we can proceed to filtering rows by adding on a filtering step to our pipeline using `filter()`
+Finally, we can filter rows to show only the **Chinstrap** penguin summaries by adding `filter()` to our pipeline.
```{r filter}
penguins %>%
count(species, sex) %>%
- add_count(species, wt = n, name = "n_species") %>%
+ add_count(species, wt = n,
+ name = "n_species") %>%
mutate(prop = n/n_species*100) %>%
filter(species == "Chinstrap")
```
diff --git a/tutorial/tour-of-the-tidyverse.nb.html b/tutorial/tour-of-the-tidyverse.nb.html
index dde20d4..1f87c3b 100644
--- a/tutorial/tour-of-the-tidyverse.nb.html
+++ b/tutorial/tour-of-the-tidyverse.nb.html
@@ -3011,9 +3011,6 @@
First created on Aug 31, 2020 (updated on Sept 22, 2020)
-
-knitr::opts_chunk$set(message = FALSE, warning = FALSE, collapse = TRUE)
-
@@ -3025,8 +3022,8 @@
About {palmerpenguins}
-
# install.packages("remotes")
-# remotes::install_github("allisonhorst/palmerpenguins")
+
# install.packages("remotes")
+# remotes::install_github("allisonhorst/palmerpenguins")
@@ -3035,14 +3032,6 @@
About {palmerpenguins}
Loading packages
-
-
# loading packages
-library(tidyverse)
-library(palmerpenguins)
-
-# viewing data sets in package "palmerpenguins"
-data(package = "palmerpenguins")
-
@@ -3051,13 +3040,6 @@ readr
Let’s get data into R!
-
-# option 1: load using URL ----
-raw_adelie_url <- read_csv("https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.219.3&entityid=002f3893385f710df69eeebe893144ff")
-
-# option 2: load using filepath ----
-raw_adelie_filepath <- read_csv("raw_adelie.csv")
-
Lucky for us, Allison Horst compiled data from all three species together for us in the {palmerpenguins}
package!
@@ -3067,14 +3049,6 @@ readr
-
-# saves package tibble into global environment
-penguins <- palmerpenguins::penguins
-head(penguins)
-
-penguins_raw <- palmerpenguins::penguins_raw
-head(penguins_raw)
-
@@ -3083,12 +3057,6 @@ tibble
A tibble
is much like the data frame
in base R, but optimized for use in the Tidyverse. Let’s take a look at the differences.
-
-# try each of these commands in the console and see if you can spot the differences!
-
-as_tibble(penguins)
-as.data.frame(penguins)
-
@@ -3106,9 +3074,6 @@ What differences do you see?
Try it out here!
-
-
-
@@ -3116,17 +3081,11 @@
Taking a closer look at penguins
Get a full view of the dataset:
-
-
-
Or catch a glimpse
:
-
-
-
@@ -3139,33 +3098,6 @@ ggplot2
Let’s see if body mass varies by penguin sex
-
-penguins %>%
- ggplot()
-
-penguins %>%
- ggplot(aes(x = sex, y = body_mass_g))
-
-penguins %>%
- ggplot(aes(x = sex, y = body_mass_g)) +
- geom_point()
-
-# A scatter plot doesn't really tell us much.
-# Let's try a different geometry
-
-penguins %>%
- ggplot(aes(x = sex, y = body_mass_g)) +
- geom_boxplot()
-
-# That's more informative!
-# Let's see if there are differences by penguin species
-
-penguins %>%
- ggplot(aes(x = sex, y = body_mass_g)) +
- geom_boxplot(aes(fill = species))
-
-# What do you notice?
-
@@ -3186,9 +3118,6 @@ What observations can you make from the plot?
dplyr
-
-
-
@@ -3196,10 +3125,6 @@
select()
Selecting dataset columns with select()
-
-
penguins %>%
- select(species, sex, body_mass_g)
-
@@ -3208,88 +3133,66 @@ arrange()
Reordering the data set with arrange()
-
-penguins %>%
- select(species, sex, body_mass_g) %>%
- arrange(desc(body_mass_g))
-
group_by() and summarize()
Summarizing the data using group_by()
and summarize()
+
We can use group_by()
to group our data by species and sex, and summarize()
to calculate the average body_mass_g for each grouping.
-
-
penguins %>%
- group_by(species, sex) %>%
- summarize(n = n())
+
+
penguins %>%
+ select(species, sex, body_mass_g) %>%
+ group_by(species, sex) %>%
+ summarize(mean = mean(body_mass_g))
count() and add_count()
-
Because we’re just counting observations in this example, we also have the option to use count()
which simplifies our code a little.
-
-Thank you to Alison Hill for these suggestions!
-
+
If we’re just interested in counting the observations in each grouping, we can group and summarize with special functions count()
and add_count()
.
+
Counting can be done with group_by()
and summarize()
, but it’s a little cumbersome.
+
It involves… 1. using mutate()
to create an intermediate variable n_species that adds up all observations per species, and 2. an ungroup()
-ing step
-
-
penguins %>%
- count(species, sex)
+
+
penguins %>%
+ group_by(species) %>%
+ mutate(n_species = n()) %>%
+ ungroup() %>%
+ group_by(species, sex, n_species) %>%
+ summarize(n = n())
-
-
-
mutate()
-
-
Option 1
-
Creating new variables with mutate()
+
In contrast, count()
and add_count()
offer a simplified approach.
+
+Thank you to Alison Hill for this suggestion!
+
-
-
penguins %>%
- group_by(species) %>%
- mutate(n_species = n()) %>%
- ungroup() %>%
- group_by(species, sex, n_species) %>%
- summarize(n = n()) %>%
- mutate(prop = n/n_species*100)
-
-
-
Option 2
-
We can also use mutate()
along with add_count()
to add up the counts per species group to use as a denominator (“n_species”) when we calculate the proportion by sex.
+
+
mutate()
+
We can add to our counting example by using mutate()
to create a new variable prop, which represents the proportion of penguins of each sex, grouped by species
+
+Thank you to Alison Hill for this suggestions!
+
-
-
penguins %>%
- count(species, sex) %>%
- add_count(species, wt = n, name = "n_species") %>%
- mutate(prop = n/n_species*100)
-
-
filter()
-
Regardless of which approach we take to summarize our data, we can proceed to filtering rows by adding on a filtering step to our pipeline using filter()
+
Finally, we can filter rows to only show us Chinstrap penguin summaries by adding filter()
to our pipeline
-
-
penguins %>%
- count(species, sex) %>%
- add_count(species, wt = n, name = "n_species") %>%
- mutate(prop = n/n_species*100) %>%
- filter(species == "Chinstrap")
-
@@ -3301,30 +3204,16 @@
forcats
The factor()
function is perfect for this.
-
-
penguins %>%
- mutate(year_factor = factor(year, levels = unique(year)))
-
The result is a new factor year_factor
with levels 2007
, 2008
and 2009
!
-
-
penguins_new <-
- penguins %>%
- mutate(year_factor = factor(year, levels = unique(year)))
-penguins_new
-
Double check the variable class and factor levels below:
-
-
class(penguins_new$year_factor)
-levels(penguins_new$year_factor)
-
@@ -3334,22 +3223,11 @@ stringr
From what we’ve learned so far, take a guess at what this code chunk will do before running it.
-
-penguins %>%
- select(species, island) %>%
- mutate(ISLAND = str_to_upper(island))
-
How about this one? How is it different from the previous code chunk?
-
-penguins %>%
- select(species, island) %>%
- mutate(ISLAND = str_to_upper(island)) %>%
- mutate(species_island = str_c(species, ISLAND, sep = "_"))
-
@@ -3359,24 +3237,11 @@ tidyr
We can pretend that it wasn’t and that body_mass_g
was recorded separately for male
, female
, and sex NA
penguins. Like untidy_penguins
below:
-
-untidy_penguins <-
- penguins %>%
- pivot_wider(names_from = sex,
- values_from = body_mass_g)
-untidy_penguins
-
Now let’s make it tidy again with the help of the pivot_longer()
function! pivot_wider()
is another very popular tidying function. Have you seen it before? Hint: see the code chunk above!
-
-untidy_penguins %>%
- pivot_longer(cols = male:`NA`,
- names_to = "sex",
- values_to = "body_mass_g")
-
@@ -3388,59 +3253,25 @@ purrr
Let’s turn this plot:
-
-penguins %>%
- ggplot(aes(x = sex, y = body_mass_g)) +
- geom_boxplot(aes(fill = species))
-
Into this one!
-
-penguins %>%
- ggplot(aes(x = sex, y = body_mass_g)) +
- geom_boxplot(aes(fill = species)) +
- scale_fill_manual(values = nord::nord_palettes$frost)
-
Let’s try out the frost
palette.
-
-# we'll need to load the {nord} package
-library(nord)
-
-# you can choose colors using the color hex codes
-nord::nord_palettes$frost
-
-
-# but you might prefer to use `scale_fill_manual()`
-# or more specialized functions like `scale_fill_nord()`
-# included in the {nord} package
-penguins %>%
- ggplot(aes(x = sex, y = body_mass_g)) +
- geom_boxplot(aes(fill = species)) +
- scale_fill_manual(values = nord::nord_palettes$frost)
- #scale_fill_nord(palette = "frost")
-
Ok now for a handy package/function trio!
-
-# we'll have to load the {prismatic} package
-library(prismatic)
-
-prismatic::color(nord::nord_palettes$frost)
-
purrr
’s map()
function can help us iterate the prismatic::color()
function over all palettes in a palette package like nord
!
@@ -3450,7 +3281,7 @@ purrr
-nord::nord_palettes %>% map(prismatic::color)
+nord::nord_palettes %>% map(prismatic::color)
@@ -3466,71 +3297,14 @@ Recreating a {palmerpenguins} plot
-
-# scatterplot sequence ----
-penguins %>%
- ggplot() +
- geom_point(aes(x = flipper_length_mm, y = bill_length_mm)) # add aesthetics
-
-penguins %>%
- ggplot() +
- geom_point(aes(x = flipper_length_mm, y = bill_length_mm,
- color = species)) # add color per species
-
-penguins %>%
- ggplot() +
- geom_point(aes(x = flipper_length_mm, y = bill_length_mm,
- color = species, shape = species)) # add shape per species
-
-penguins %>%
- ggplot() +
- geom_point(aes(x = flipper_length_mm, y = bill_length_mm,
- color = species, shape = species)) # add shape per species
-
-penguins %>%
- ggplot() +
- geom_point(aes(x = flipper_length_mm, y = bill_length_mm,
- color = species, shape = species)) +
- geom_smooth(aes(x = flipper_length_mm, y = bill_length_mm,
- color = species))
-
-penguins %>%
- ggplot(aes(x = flipper_length_mm, y = bill_length_mm)) +
- geom_point(aes(color = species, shape = species)) +
- geom_smooth(aes(color = species), se = FALSE, method = "lm")
-
-
-penguins %>%
- ggplot() +
- geom_point(aes(x = flipper_length_mm, y = body_mass_g,
- color = species, shape = species))
-
-
-penguins %>%
- ggplot() +
- geom_histogram(aes(x = flipper_length_mm))
-
-penguins %>%
- ggplot() +
- geom_histogram(aes(x = flipper_length_mm, color = species))
-
-penguins %>%
- ggplot() +
- geom_histogram(aes(x = flipper_length_mm, fill = species))
-
-penguins %>%
- ggplot() +
- geom_histogram(aes(x = flipper_length_mm, fill = species,
- position = "identity", alpha = 0.5))
-
@@ -3553,19 +3327,19 @@ tidytuesdayR
-# install.packages("tidytuesdayR")
-# remotes::install_github("thebioengineer/tidytuesdayR")
-
-library(tidytuesdayR)
-
-# load the data
-tt_data <- tt_load("2020-07-27") # error message
-tt_data <- tt_load("2020-07-28")
-tt_data <- tt_load(2020, week=31)
-
-# take a peek
-readme(tt_data)
-print(tt_data)
+# install.packages("tidytuesdayR")
+# remotes::install_github("thebioengineer/tidytuesdayR")
+
+library(tidytuesdayR)
+
+# load the data
+tt_data <- tt_load("2020-07-27") # error message
+tt_data <- tt_load("2020-07-28")
+tt_data <- tt_load(2020, week=31)
+
+# take a peek
+readme(tt_data)
+print(tt_data)
@@ -3594,7 +3368,7 @@ Examples

