Skip to content

Latest commit



312 lines (249 loc) · 7.14 KB

File metadata and controls

312 lines (249 loc) · 7.14 KB

Data Analysis 3: Week 7

Alexey Bessudnov 1 March 2019

Plan for today:

  1. Assignment 3: solution.
  2. Assignment 4.
  3. Homework for next week: functions.
  4. Exercises on data visualisation.


# Load the tidyverse: read_tsv(), the pipe, the dplyr verbs and ggplot2 are
# all used in the code below.
library(tidyverse)

# Read the wave 8 data. read_tsv() needs a path to a file, not a directory.
# NOTE(review): the file name was missing from the original path;
# h_indresp.tab is assumed here -- confirm against the data folder.
Data8 <- read_tsv("data/UKDA-6614-tab/tab/ukhls_w8/h_indresp.tab") %>%
  # keep only the person id and the sex, age, region and net income variables
  select(pidp, h_sex_dv, h_age_dv, h_gor_dv, h_fimnnet_dv)

Bivariate distributions.

  1. Create a bar chart showing median income by region.

    # Turn the numeric region codes in h_gor_dv into readable labels.
    # Code -9 is recoded to a missing value.
    Data8 <- Data8 %>%
      mutate(region = recode(h_gor_dv,
                         `-9` = NA_character_,
                         `1` = "North East",
                         `2` = "North West",
                         `3` = "Yorkshire",
                         `4` = "East Midlands",
                         `5` = "West Midlands",
                         `6` = "East of England",
                         `7` = "London",
                         `8` = "South East",
                         `9` = "South West",
                         `10` = "Wales",
                         `11` = "Scotland",
                         `12` = "Northern Ireland"))
    # Median net income for each region, one row per region.
    byRegion <- Data8 %>%
      # drop the rows where the region is missing
      filter(!is.na(region)) %>%
      group_by(region) %>%
      summarise(
        medianIncome = median(h_fimnnet_dv, na.rm = TRUE)
      )
    # Bar chart of median income by region, bars sorted from the lowest to
    # the highest median.
    byRegion %>%
      ggplot(
        aes(x = reorder(region, medianIncome), y = medianIncome)
      ) +
      # stat = "identity": bar heights are the values already in medianIncome
      geom_bar(stat = "identity") +
      xlab("") +
      ylab("Median net monthly personal income") +
      # rotate the region labels so that they do not overlap
      theme(axis.text.x = element_text(angle = 45, hjust = 1))

    # The same bar chart sorted from the highest to the lowest median
    # (reordering by -medianIncome reverses the order).
    byRegion %>%
      ggplot(
        aes(x = reorder(region, -medianIncome), y = medianIncome)
      ) +
      # stat = "identity": bar heights are the values already in medianIncome
      geom_bar(stat = "identity") +
      xlab("") +
      ylab("Median net monthly personal income") +
      # region labels are written vertically
      theme(axis.text.x = element_text(angle = 90, hjust = 1))

  2. Make a dot plot showing the same information as above (without splitting by sex). Sort regions in descending order by median income.

    # Dot plot of median income by region: regions on the y axis (sorted by
    # median income), income on the x axis.
    byRegion %>%
      ggplot(
        aes(y = reorder(region, medianIncome), x = medianIncome)
      ) +
      geom_point(size = 3) +
      xlab("Median net monthly personal income") +
      # NOTE(review): the layer after the trailing "+" was lost; an empty
      # y-axis title (as in the bar charts above) is assumed -- confirm.
      ylab("")

  3. Make a line chart showing median income by age.

    # Median net income for each year of age, one row per age.
    byAge <- Data8 %>%
      group_by(h_age_dv) %>%
      summarise(
        medianIncome = median(h_fimnnet_dv, na.rm = TRUE)
      )
    # print the result (the output is shown below)
    byAge
    ## # A tibble: 87 x 2
    ##    h_age_dv medianIncome
    ##       <dbl>        <dbl>
    ##  1       16          0  
    ##  2       17         16.0
    ##  3       18        229. 
    ##  4       19        436. 
    ##  5       20        534. 
    ##  6       21        667. 
    ##  7       22        922. 
    ##  8       23       1092. 
    ##  9       24       1165. 
    ## 10       25       1253. 
    ## # … with 77 more rows
    # Line chart of median income by age with a smoothed trend line on top.
    # The x axis is restricted to ages 21 to 80.
    ggplot(byAge, aes(x = h_age_dv, y = medianIncome)) +
      geom_line() +
      geom_smooth() +
      xlim(21, 80) +
      ylab("Median income") +
      xlab("Age")

  4. Open the data from the youth questionnaire from wave 8. We will be working with the variable on BMI (h_ypbmi_dv) and visualising the distribution of BMI by sex, age, and ethnic group.

# Read the wave 8 youth questionnaire. read_tsv() needs a path to a file,
# not a directory.
# NOTE(review): the file name was missing from the original path;
# h_youth.tab is assumed here -- confirm against the data folder.
youth8 <- read_tsv("data/UKDA-6614-tab/tab/ukhls_w8/h_youth.tab")

# Summary statistics of the raw BMI variable (missing-value codes included).
# Pipe equivalent: youth8 %>% pull(h_ypbmi_dv) %>% summary()
summary(youth8$h_ypbmi_dv)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -9.000  -9.000  -9.000  -2.477  -9.000  39.000
# Copy BMI into a new column, replacing the code -9 with NA and keeping
# every other value as it is.
youth8 <- mutate(
  youth8,
  bmi = ifelse(h_ypbmi_dv == -9, NA_real_, h_ypbmi_dv)
)

# Summary statistics of the cleaned variable.
summary(youth8$bmi)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   10.40   17.20   19.10   19.81   21.80   39.00    2531
# simple histogram

# Histogram of BMI in 50 bins, with a red vertical line drawn at BMI = 30.
ggplot(youth8, aes(x = bmi)) +
  geom_histogram(bins = 50) +
  geom_vline(colour = "red", xintercept = 30) +
  ylab("Number of observations") +
  xlab("Body mass index")

# BBC style

# The same histogram with the bbc_style() theme applied.
# NOTE(review): bbc_style() is not defined in the visible code; presumably it
# comes from the bbplot package, which must be loaded -- confirm.
ggplot(youth8, aes(x = bmi)) +
  geom_histogram(bins = 50) +
  geom_vline(colour = "red", xintercept = 30) +
  xlab("Body mass index") +
  ylab("Number of observations") +
  bbc_style()

  1. BMI by sex.


# Label sex with a lookup: code 1 becomes "male", code 2 becomes "female",
# and any other value becomes NA (match() returns NA when there is no match).
youth8 <- mutate(
  youth8,
  sex = c("male", "female")[match(h_sex_dv, c(1, 2))]
)
# Cross-tabulate the original codes against the new labels as a check.
count(youth8, h_sex_dv, sex)
## # A tibble: 2 x 3
##   h_sex_dv sex        n
##      <dbl> <chr>  <int>
## 1        1 male    1620
## 2        2 female  1652
# Boxplots of BMI for each sex.
youth8 %>%
  ggplot(aes(x = sex, y = bmi)) +
  geom_boxplot() +
  # this changes the boxplots from vertical to horizontal
  coord_flip()

  1. Density by group.
# Histogram of BMI coloured by sex, with the bars for the two groups placed
# side by side.
ggplot(youth8, aes(x = bmi, fill = sex)) +
  geom_histogram(position = "dodge")

# Histogram of BMI coloured by sex, with the two groups drawn on top of each
# other and made semi-transparent so that both remain visible.
ggplot(youth8, aes(x = bmi, fill = sex)) +
  geom_histogram(alpha = 0.5, position = "identity", bins = 50)

# Density curves of BMI, one line per sex.
# NOTE(review): the layer after the trailing "+" was lost; geom_density() is
# assumed from the "Density by group" heading -- confirm.
youth8 %>%
  ggplot(aes(x = bmi, colour = sex)) +
  geom_density()

# Filled density curves of BMI by sex.
# NOTE(review): the layer after the trailing "+" was lost; a semi-transparent
# geom_density() is assumed so that the overlapping areas stay visible --
# confirm.
youth8 %>%
  ggplot(aes(x = bmi, fill = sex)) +
  geom_density(alpha = 0.5)

# Filled density curves of BMI by sex with hand-picked fill colours.
ggplot(youth8, aes(x = bmi, fill = sex)) +
  # manually setting the colours
  scale_fill_manual(values = c("purple", "yellow")) +
  geom_density()

  1. Barplot with means.
# Bar chart of mean BMI by sex.
youth8 %>%
  group_by(sex) %>%
  summarise(
    meanBMI = mean(bmi, na.rm = TRUE)
  ) %>%
  ggplot(aes(x = sex, y = meanBMI, fill = sex)) +
  # stat = "identity": bar heights are the means computed above
  geom_bar(stat = "identity")

# Dot plot of mean BMI by sex, with the BMI axis running from 0 to 25.
youth8 %>%
  group_by(sex) %>%
  summarise(
    meanBMI = mean(bmi, na.rm = TRUE)
  ) %>%
  ggplot(aes(x = sex, y = meanBMI)) +
  geom_point() +
  ylim(0, 25) +
  # NOTE(review): the layer after the trailing "+" was lost; coord_flip()
  # (groups on the vertical axis) is assumed -- confirm.
  coord_flip()

# Dot plot of mean BMI by region code, with the regions sorted by mean BMI.
youth8 %>%
  group_by(h_gor_dv) %>%
  summarise(
    meanBMI = mean(bmi, na.rm = TRUE)
  ) %>%
  # as.factor(): treat the numeric region code as a category before sorting
  ggplot(aes(x = reorder(as.factor(h_gor_dv), meanBMI), y = meanBMI)) +
  geom_point() +
  # NOTE(review): the layer after the trailing "+" was lost; coord_flip()
  # (regions on the vertical axis) is assumed -- confirm.
  coord_flip()

  1. Faceted chart.
# Histogram of BMI (50 bins, red line at BMI = 30) drawn in a separate panel
# for each value of sex.
ggplot(youth8, aes(x = bmi)) +
  facet_wrap(~ sex) +
  geom_histogram(bins = 50) +
  geom_vline(colour = "red", xintercept = 30) +
  ylab("Number of observations") +
  xlab("Body mass index")