-
Notifications
You must be signed in to change notification settings - Fork 0
/
01-song-titles-solution.R
61 lines (50 loc) · 1.78 KB
/
01-song-titles-solution.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
library(tidyverse)
library(tidytext)
library(here)
# Load the two input tables from the project's data/ directory:
# the Billboard song lyrics and the state population figures.
# here() assembles each file path from its components.
song_lyrics <- read_csv(here("data", "billboard_lyrics_1964-2015.csv"))
glimpse(song_lyrics)

pop_df <- read_csv(here("data", "pop2016.csv"))
# Use tidytext to build a data frame with one row per token per song.
# State names may be one word or two, so both unigrams and bigrams are
# needed in order to search for matching state names later on.

## single-word tokens (unnest_tokens() defaults to token = "words")
lyrics_unigrams <- song_lyrics %>%
  unnest_tokens(output = word, input = Lyrics)

## two-word tokens
lyrics_bigrams <- song_lyrics %>%
  unnest_tokens(output = word, input = Lyrics, token = "ngrams", n = 2)

## stack both token sets into a single table and print it
tidy_lyrics <- bind_rows(lyrics_unigrams, lyrics_bigrams)
tidy_lyrics
# Find all the state names occurring in the song lyrics.
# - Keep only the tokens that are state names.
# - Keep a single row per song for each matching state, so a song such as
#   "New York, New York" contributes only one row for that state.

## inner_join() keeps only tokens that match a value of pop_df$state_name;
## distinct() then removes repeated mentions of a state within one song.
## NOTE(review): unnest_tokens() lowercases tokens by default, so this join
## presumably relies on state_name being lowercase in pop2016.csv -- confirm.
tidy_lyrics <- tidy_lyrics %>%
  inner_join(pop_df, by = c("word" = "state_name")) %>%
  distinct(Rank, Song, Artist, Year, word, .keep_all = TRUE)
tidy_lyrics
# Calculate how often each state is mentioned and add a column with that
# frequency adjusted by the state's population.

## aggregate per state: one row per state with its number of rows in
## tidy_lyrics, sorted from most to least mentioned
state_counts <- tidy_lyrics %>%
  count(word, sort = TRUE)
state_counts

## normalize for population (mentions per million of `population`)
## A state with no matching lyrics has no row in state_counts, so the left
## join leaves its n as NA; record that as 0 mentions so its rate is 0
## rather than NA.
pop_df <- pop_df %>%
  left_join(state_counts, by = c("state_name" = "word")) %>%
  mutate(
    n = coalesce(n, 0L),
    rate = n / population * 1e6
  )

## which are the top ten states by rate?
## slice_max() names the ranking column explicitly (top_n() without `wt`
## silently ranks by whichever column happens to be last) and returns the
## rows already sorted from highest to lowest rate. Ties are kept, so more
## than ten rows can be returned.
pop_df %>%
  slice_max(order_by = rate, n = 10)