diff --git a/tutorial/tour-of-the-tidyverse.Rmd b/tutorial/tour-of-the-tidyverse.Rmd index 67ca7d1..9a23f41 100644 --- a/tutorial/tour-of-the-tidyverse.Rmd +++ b/tutorial/tour-of-the-tidyverse.Rmd @@ -176,51 +176,65 @@ penguins %>% ## group_by() and summarize() Summarizing the data using `group_by()` and `summarize()` + +We can use `group_by()` to group our data by **species** and **sex**, and `summarize()` to calculate the average **body_mass_g** for each grouping. ```{r group-by-summarize} +penguins %>% + select(species, sex, body_mass_g) %>% + group_by(species, sex) %>% + summarize(mean = mean(body_mass_g)) +``` + +## count() and add_count() +If we're just interested in _counting_ the observations in each grouping, we can group and summarize with special functions `count()` and `add_count()`. + +Counting can be done with `group_by()` and `summarize()`, but it's a little cumbersome. + +It involves... +1. using `mutate()` to create an intermediate variable **n_species** that adds up all observations per **species**, and +2. an `ungroup()`-ing step + +```{r} penguins %>% - group_by(species, sex) %>% + group_by(species) %>% + mutate(n_species = n()) %>% + ungroup() %>% + group_by(species, sex, n_species) %>% summarize(n = n()) ``` -## count() and add_count() -Because we're just _counting_ observations in this example, we also have the option to use `count()` which simplifies our code a little. +In contrast, `count()` and `add_count()` offer a simplified approach. -> Thank you to Alison Hill for [these suggestions](https://github.com/spcanelon/2020-rladies-chi-tidyverse/issues/2)! +> Thank you to Alison Hill for [this suggestion](https://github.com/spcanelon/2020-rladies-chi-tidyverse/issues/2)! 
```{r count} -penguins %>% - count(species, sex) +penguins %>% + count(species, sex) %>% + add_count(species, wt = n, + name = "n_species") ``` ## mutate() +We can add to our counting example by using `mutate()` to create a new variable **prop**, which represents the proportion of penguins of each **sex**, grouped by **species** -### Option 1 -Creating new variables with `mutate()` -```{r group-by-summarize-mutate} -penguins %>% - group_by(species) %>% - mutate(n_species = n()) %>% - ungroup() %>% - group_by(species, sex, n_species) %>% - summarize(n = n()) %>% - mutate(prop = n/n_species*100) -``` +> Thank you to Alison Hill for [this suggestion](https://github.com/spcanelon/2020-rladies-chi-tidyverse/issues/2)! -### Option 2 -We can also use `mutate()` along with `add_count()` to add up the counts per species group to use as a denominator ("n_species") when we calculate the proportion by sex. -```{r count-mutate} +```{r} penguins %>% count(species, sex) %>% - add_count(species, wt = n, name = "n_species") %>% - mutate(prop = n/n_species*100) + add_count(species, wt = n, + name = "n_species") %>% + mutate(prop = n/n_species*100) ``` + ## filter() -Regardless of which approach we take to summarize our data, we can proceed to filtering rows by adding on a filtering step to our pipeline using `filter()` +Finally, we can filter rows to only show us **Chinstrap** penguin summaries by adding `filter()` to our pipeline ```{r filter} penguins %>% count(species, sex) %>% - add_count(species, wt = n, name = "n_species") %>% + add_count(species, wt = n, + name = "n_species") %>% mutate(prop = n/n_species*100) %>% filter(species == "Chinstrap") ``` diff --git a/tutorial/tour-of-the-tidyverse.nb.html b/tutorial/tour-of-the-tidyverse.nb.html index dde20d4..1f87c3b 100644 --- a/tutorial/tour-of-the-tidyverse.nb.html +++ b/tutorial/tour-of-the-tidyverse.nb.html @@ -3011,9 +3011,6 @@

First created on Aug 31, 2020 (updated on Sept 22, 2020)

- -
knitr::opts_chunk$set(message = FALSE, warning = FALSE, collapse = TRUE)
-
@@ -3025,8 +3022,8 @@

About {palmerpenguins}

-
# install.packages("remotes")
-# remotes::install_github("allisonhorst/palmerpenguins")
+
# install.packages("remotes")
+# remotes::install_github("allisonhorst/palmerpenguins")
@@ -3035,14 +3032,6 @@

About {palmerpenguins}

Loading packages

- -
# loading packages
-library(tidyverse)
-library(palmerpenguins)
-
-# viewing data sets in package "palmerpenguins"
-data(package = "palmerpenguins")
-
@@ -3051,13 +3040,6 @@

readr

Let’s get data into R!

- -
# option 1: load using URL ----
-raw_adelie_url <- read_csv("https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.219.3&entityid=002f3893385f710df69eeebe893144ff")
-
-# option 2: load using filepath ----
-raw_adelie_filepath <- read_csv("raw_adelie.csv")
-

Lucky for us, Allison Horst compiled data from all three species together for us in the {palmerpenguins} package!

@@ -3067,14 +3049,6 @@

readr

- -
# saves package tibble into global environment
-penguins <- palmerpenguins::penguins 
-head(penguins)
-
-penguins_raw <- palmerpenguins::penguins_raw
-head(penguins_raw)
- @@ -3083,12 +3057,6 @@

tibble

A tibble is much like the data frame in base R, but optimized for use in the Tidyverse. Let’s take a look at the differences.

- -
# try each of these commands in the console and see if you can spot the differences!
-
-as_tibble(penguins)
-as.data.frame(penguins)
- @@ -3106,9 +3074,6 @@

What differences do you see?

Try it out here!

- -
vignette("tibble")
-
@@ -3116,17 +3081,11 @@

Taking a closer look at penguins

Get a full view of the dataset:

- -
View(penguins)
-

Or catch a glimpse:

- -
glimpse(penguins)
-
@@ -3139,33 +3098,6 @@

ggplot2

Let’s see if body mass varies by penguin sex

- -
penguins %>%
-  ggplot()
-
-penguins %>%
-  ggplot(aes(x = sex, y = body_mass_g))
-
-penguins %>%
-  ggplot(aes(x = sex, y = body_mass_g)) +
-  geom_point()
-
-# A scatter plot doesn't really tell us much.
-# Let's try a different geometry
-
-penguins %>%
-  ggplot(aes(x = sex, y = body_mass_g)) +
-  geom_boxplot()
-
-# That's more informative!
-# Let's see if there are differences by penguin species
-
-penguins %>%
-  ggplot(aes(x = sex, y = body_mass_g)) +
-  geom_boxplot(aes(fill = species))
-
-# What do you notice?
- @@ -3186,9 +3118,6 @@

What observations can you make from the plot?

dplyr

- -
glimpse(penguins)
-
@@ -3196,10 +3125,6 @@

select()

Selecting dataset columns with select()

- -
penguins %>%
-  select(species, sex, body_mass_g)
-
@@ -3208,88 +3133,66 @@

arrange()

Reordering the data set with arrange()

- -
penguins %>%
-  select(species, sex, body_mass_g) %>%
-  arrange(desc(body_mass_g))
-

group_by() and summarize()

Summarizing the data using group_by() and summarize()

+

We can use group_by() to group our data by species and sex, and summarize() to calculate the average body_mass_g for each grouping.

- -
penguins %>% 
-  group_by(species, sex) %>%
-  summarize(n = n())
+ +
penguins %>%
+  select(species, sex, body_mass_g) %>%
+  group_by(species, sex) %>%         
+  summarize(mean = mean(body_mass_g))

count() and add_count()

-

Because we’re just counting observations in this example, we also have the option to use count() which simplifies our code a little.

-
-

Thank you to Alison Hill for these suggestions!

-
+

If we’re just interested in counting the observations in each grouping, we can group and summarize with special functions count() and add_count().

+

Counting can be done with group_by() and summarize(), but it’s a little cumbersome.

+

It involves… (1) using mutate() to create an intermediate variable n_species that adds up all observations per species, and (2) an ungroup()-ing step.

- -
penguins %>%
-  count(species, sex)
+ +
penguins %>% 
+  group_by(species) %>%
+  mutate(n_species = n()) %>%            
+  ungroup() %>%                          
+  group_by(species, sex, n_species) %>%
+  summarize(n = n())
-
-
-

mutate()

-
-

Option 1

-

Creating new variables with mutate()

+

In contrast, count() and add_count() offer a simplified approach.

+
+

Thank you to Alison Hill for this suggestion!

+
- -
penguins %>% 
-  group_by(species) %>%
-  mutate(n_species = n()) %>%
-  ungroup() %>%
-  group_by(species, sex, n_species) %>%
-  summarize(n = n()) %>%
-  mutate(prop = n/n_species*100)
-
-
-

Option 2

-

We can also use mutate() along with add_count() to add up the counts per species group to use as a denominator (“n_species”) when we calculate the proportion by sex.

+
+

mutate()

+

We can add to our counting example by using mutate() to create a new variable prop, which represents the proportion of penguins of each sex, grouped by species

+
+

Thank you to Alison Hill for this suggestion!

+
- -
penguins %>% 
-  count(species, sex) %>%
-  add_count(species, wt = n, name = "n_species") %>%
-  mutate(prop = n/n_species*100)
-
-

filter()

-

Regardless of which approach we take to summarize our data, we can proceed to filtering rows by adding on a filtering step to our pipeline using filter()

+

Finally, we can filter rows to only show us Chinstrap penguin summaries by adding filter() to our pipeline

- -
penguins %>% 
-  count(species, sex) %>%
-  add_count(species, wt = n, name = "n_species") %>%
-  mutate(prop = n/n_species*100) %>%
-  filter(species == "Chinstrap")
-
@@ -3301,30 +3204,16 @@

forcats

The factor() function is perfect for this.

- -
penguins %>%
-  mutate(year_factor = factor(year, levels = unique(year)))
-

The result is a new factor year_factor with levels 2007, 2008 and 2009!

- -
penguins_new <-
-  penguins %>%
-  mutate(year_factor = factor(year, levels = unique(year)))
-penguins_new
-

Double check the variable class and factor levels below:

- -
class(penguins_new$year_factor)
-levels(penguins_new$year_factor)
-
@@ -3334,22 +3223,11 @@

stringr

From what we’ve learned so far, take a guess at what this code chunk will do before running it.

- -
penguins %>%
-  select(species, island) %>%
-  mutate(ISLAND = str_to_upper(island))
-

How about this one? How is it different from the previous code chunk?

- -
penguins %>%
-  select(species, island) %>%
-  mutate(ISLAND = str_to_upper(island)) %>%
-  mutate(species_island = str_c(species, ISLAND, sep = "_"))
- @@ -3359,24 +3237,11 @@

tidyr

We can pretend that it wasn’t and that body_mass_g was recorded separately for male, female, and sex NA penguins. Like untidy_penguins below:

- -
untidy_penguins <-
-  penguins %>%
-    pivot_wider(names_from = sex,
-                values_from = body_mass_g)
-untidy_penguins
-

Now let’s make it tidy again with the help of the pivot_longer() function! pivot_wider() is another very popular tidying function. Have you seen it before? Hint: see the code chunk above!

- -
untidy_penguins %>%
-  pivot_longer(cols = male:`NA`, 
-               names_to = "sex",
-               values_to = "body_mass_g")
- @@ -3388,59 +3253,25 @@

purrr

Let’s turn this plot:

- -
penguins %>%
-  ggplot(aes(x = sex, y = body_mass_g)) +
-  geom_boxplot(aes(fill = species))
-

Into this one!

- -
penguins %>%
-  ggplot(aes(x = sex, y = body_mass_g)) +
-  geom_boxplot(aes(fill = species)) +
-  scale_fill_manual(values = nord::nord_palettes$frost)
-

Let’s try out the frost palette.

- -
# we'll need to load the {nord} package
-library(nord)
-
-# you can choose colors using the color hex codes
-nord::nord_palettes$frost
- - -
# but you might prefer to use `scale_fill_manual()` 
-# or more specialized functions like `scale_fill_nord()` 
-# included in the {nord} package
-penguins %>%
-  ggplot(aes(x = sex, y = body_mass_g)) +
-  geom_boxplot(aes(fill = species)) +
-  scale_fill_manual(values = nord::nord_palettes$frost)
-  #scale_fill_nord(palette = "frost")
-

Ok now for a handy package/function trio!

- -
# we'll have to load the {prismatic} package
-library(prismatic)
-
-prismatic::color(nord::nord_palettes$frost)
-

purrr’s map() function can help us iterate the prismatic::color() function over all palettes in a palette package like nord!

@@ -3450,7 +3281,7 @@

purrr

-
nord::nord_palettes %>% map(prismatic::color)
+
nord::nord_palettes %>% map(prismatic::color)
@@ -3466,71 +3297,14 @@

Recreating a {palmerpenguins} plot

- -
# scatterplot sequence ----
-penguins %>%
-  ggplot() + 
-  geom_point(aes(x = flipper_length_mm, y = bill_length_mm)) # add aesthetics
-
-penguins %>%
-  ggplot() +
-  geom_point(aes(x = flipper_length_mm, y = bill_length_mm, 
-                 color = species)) # add color per species
-
-penguins %>%
-  ggplot() +
-  geom_point(aes(x = flipper_length_mm, y = bill_length_mm, 
-                 color = species, shape = species)) # add shape per species
-
-penguins %>%
-  ggplot() +
-  geom_point(aes(x = flipper_length_mm, y = bill_length_mm, 
-                 color = species, shape = species)) # add shape per species
-
-penguins %>%
-  ggplot() +
-  geom_point(aes(x = flipper_length_mm, y = bill_length_mm, 
-                 color = species, shape = species)) +
-  geom_smooth(aes(x = flipper_length_mm, y = bill_length_mm, 
-                  color = species))
-
-penguins %>%
-  ggplot(aes(x = flipper_length_mm, y = bill_length_mm)) + 
-  geom_point(aes(color = species, shape = species)) +
-  geom_smooth(aes(color = species), se = FALSE, method = "lm")
- - -
penguins %>%
-  ggplot() +
-  geom_point(aes(x = flipper_length_mm, y = body_mass_g, 
-                 color = species, shape = species))
- - -
penguins %>%
-  ggplot() +
-  geom_histogram(aes(x = flipper_length_mm))
-
-penguins %>%
-  ggplot() +
-  geom_histogram(aes(x = flipper_length_mm, color = species))
-
-penguins %>%
-  ggplot() +
-  geom_histogram(aes(x = flipper_length_mm, fill = species))
-
-penguins %>%
-  ggplot() +
-  geom_histogram(aes(x = flipper_length_mm, fill = species, 
-                     position = "identity", alpha = 0.5))
- @@ -3553,19 +3327,19 @@

tidytuesdayR

-
# install.packages("tidytuesdayR")
-# remotes::install_github("thebioengineer/tidytuesdayR")
-
-library(tidytuesdayR)
-
-# load the data
-tt_data <- tt_load("2020-07-27") # error message
-tt_data <- tt_load("2020-07-28")
-tt_data <- tt_load(2020, week=31)
-
-# take a peek
-readme(tt_data)
-print(tt_data)
+
# install.packages("tidytuesdayR")
+# remotes::install_github("thebioengineer/tidytuesdayR")
+
+library(tidytuesdayR)
+
+# load the data
+tt_data <- tt_load("2020-07-27") # error message
+tt_data <- tt_load("2020-07-28")
+tt_data <- tt_load(2020, week=31)
+
+# take a peek
+readme(tt_data)
+print(tt_data)
@@ -3594,7 +3368,7 @@

Examples

-

+
