13.us-race-analysis.Rmd

---
title: "Representation analysis of race/ethnicity in the US"
---


```{r setup, include=FALSE}
library(dplyr)
library(ggplot2)
library(broom)
library(lubridate)
library(forcats)

source('utils/r-utils.R')
library(wru)
# Default ggplot2 theme for every figure in this notebook: black-and-white
# base, no minor grid lines, no legend titles.
theme_set(
  theme_bw() +
    theme(
      panel.grid.minor = element_blank(),
      legend.title = element_blank()
    )
)

# Restore the objects stored in the raw-data workspace.
# NOTE(review): presumably this provides `corr_authors` and `keynotes`,
# which are used below but not defined in this file -- confirm.
load("Rdata/raws.Rdata")

# 97.5th percentile of the standard normal (about 1.96); used below as the
# multiplier that turns a standard error into a margin of error.
alpha_threshold <- qnorm(0.975)

# Analysis window, kept as date strings.
start_year <- "1993-01-01"
end_year <- "2019-12-31"
# Difference between the two calendar years (not an inclusive count).
n_years <- year(end_year) - year(start_year)

# Race/ethnicity labels, in the order used for plotting.
race_levels <- c(
  "White", "Asian", "Black", "Hispanic", "Other", "Other categories"
)
```

## Race/ethnicity predictions

```{r}
# Keep only author/affiliation rows whose country is the US. `countries` is a
# comma-separated string, so it is first split into one row per country.
# predict_race() needs a `surname` column; with surname.only = TRUE only the
# surname is used for the prediction.
pubmed_race_pmids <- corr_authors %>%
  tidyr::separate_rows(countries, sep = ',') %>%
  filter(countries == 'US') %>%
  rename(surname = last_name_simple) %>%
  predict_race(surname.only = TRUE, impute.missing = FALSE)

# One row per paper: average each `pred.*` probability over the paper's
# US-affiliated rows, ignoring missing predictions.
# `.groups = 'drop'` has to be an argument of summarise() itself: the scoped
# verb summarise_at() forwards extra arguments to the summary function
# (`mean`), where `.groups` is silently ignored and the result stays grouped.
pubmed_us_race <- pubmed_race_pmids %>%
  group_by(pmid, journal, publication_date, year, adjusted_citations) %>%
  summarise(across(contains('pred.'), ~ mean(.x, na.rm = TRUE)),
            .groups = 'drop')

# Same surname-based prediction for the honorees (keynote speakers/Fellows).
iscb_us_race <- keynotes %>%
  rename(surname = last_name_simple) %>%
  predict_race(surname.only = TRUE, impute.missing = FALSE)

# Distinct journals and conferences, and how many of each there are.
my_jours <- unique(pubmed_us_race$journal)
my_confs <- unique(iscb_us_race$conference)
n_jours <- length(my_jours)
n_confs <- length(my_confs)
```


```{r}
# Honoree rows, reshaped to the same columns as the publication table: the
# conference plays the role of `journal`, the date is the honor year, and
# every honoree gets adjusted_citations = 1 (hence a weight of 1 below).
honoree_probs <- iscb_us_race %>%
  rename(journal = conference) %>%
  select(year, journal, contains("pred")) %>%
  mutate(
    publication_date = year,
    type = "Keynote speakers/Fellows",
    adjusted_citations = 1
  )

# Publication rows, tagged as the comparison group.
author_probs <- pubmed_us_race %>%
  select(
    year, journal, contains("pred"), publication_date, adjusted_citations
  ) %>%
  mutate(type = "Pubmed authors")

# Stack both groups, add a pooled Hispanic + Other + Black probability, and
# go to long format (one row per record and race category).
# NOTE(review): recode_race() comes from utils/r-utils.R; presumably it maps
# the `pred*` column names to readable race labels -- confirm.
# The result is deliberately left grouped by (type, year, Race): the
# summarise() below relies on that grouping.
iscb_pubmed <- bind_rows(honoree_probs, author_probs) %>%
  mutate(pred_sum_others = pred.his + pred.oth + pred.bla) %>%
  tidyr::pivot_longer(
    contains("pred"),
    names_to = "Race",
    values_to = "probabilities"
  ) %>%
  filter(!is.na(probabilities)) %>%
  recode_race() %>%
  group_by(type, year, Race) %>%
  mutate(
    # mean adjusted citation count within the (type, year, Race) group
    pmc_citations_year = mean(adjusted_citations),
    # each record's citation count relative to that group mean
    weight = adjusted_citations / pmc_citations_year,
    weighted_probs = probabilities * weight
  )

# Per (type, year, Race): weighted and raw mean probability, a standard
# error computed as sqrt(var(p) * sum(w^2) / sum(w)^2), and the margin of
# error obtained by scaling it with `alpha_threshold`.
iscb_pubmed_sum <- summarise(
  iscb_pubmed,
  mean_prob = mean(weighted_probs),
  mean_raw = mean(probabilities),
  se_prob = sqrt(var(probabilities) * sum(weight^2) / (sum(weight)^2)),
  me_prob = alpha_threshold * se_prob,
  .groups = "drop"
)
```

```{r}
# For each conference, the mean predicted probability per year and race among
# honorees only (publication rows and the pooled 'Other categories' class are
# excluded). The result is a list with one tibble per entry of `my_confs`,
# in the same order.
iscb_race <- lapply(my_confs, function(conf) {
  iscb_pubmed %>%
    filter(type != 'Pubmed authors',
           journal == conf,
           Race != 'Other categories') %>%
    group_by(year, Race, journal) %>%
    summarise(mean_prob = mean(probabilities, na.rm = TRUE), .groups = 'drop')
})
```

```{r}
# Write the long-format probability table and the per-conference summaries
# to disk.
save(
  list = c("iscb_pubmed", "iscb_race"),
  file = "Rdata/iscb-us-race.Rdata"
)
```

```{r fig.height=7}
# Race categories shown in the trend panel.
stat_races <- c("White", "Asian", "Other categories")

# Trend panel: gam_and_ci() (from utils/r-utils.R) receives the per-year
# summary as its first argument and the record-level data as `df2`, both
# restricted to `stat_races`; the plot is faceted by race in reversed
# level order.
fig_stats <- iscb_pubmed_sum %>%
  filter(Race %in% stat_races) %>%
  gam_and_ci(
    df2 = filter(iscb_pubmed, Race %in% stat_races),
    start_y = start_year,
    end_y = end_year
  ) +
  theme(
    legend.position = c(0.84, 0.78),
    panel.grid.minor = element_blank(),
    legend.margin = margin(-0.5, 0, 0, 0, unit = "cm")
  ) +
  facet_wrap(vars(forcats::fct_rev(Race)))

# Breakdown panel: built from the per-year summary without the pooled
# 'Other categories' class; `type` is turned into a factor with its levels
# reversed before being handed to race_breakdown().
fig_2a <- iscb_pubmed_sum %>%
  mutate(type = fct_rev(as.factor(type))) %>%
  filter(Race != "Other categories") %>%
  race_breakdown(category = "main", race_levels, type)

# Stack the two panels (auto-labelled A and B), display the figure, and
# export it as PNG and SVG.
fig_2 <- cowplot::plot_grid(
  fig_2a,
  fig_stats,
  labels = "AUTO",
  ncol = 1,
  rel_heights = c(1.5, 1)
)
fig_2

ggsave(
  filename = "figs/us_racial_makeup.png", plot = fig_2,
  width = 6.5, height = 5.5
)
ggsave(
  filename = "figs/us_racial_makeup.svg", plot = fig_2,
  width = 6.5, height = 5.5
)
```

## Hypothesis testing

Regression of the probability that a name belongs to a given race on the type of scientists (authors vs. speakers) and year (main effects only; the interaction term is assessed in the next section):

```{r}
# Data used by both regression models for one race category: rows of
# `iscb_pubmed` for `racei` with a non-missing weighted probability, `year`
# centred and scaled, and `type` as a factor whose reference level is
# 'Pubmed authors' (so the `type` coefficient is the honoree effect).
race_lm_data <- function(racei){
  iscb_pubmed %>%
    ungroup() %>%
    filter(Race == racei, !is.na(weighted_probs)) %>%
    mutate(year = c(scale(year)),
           type = relevel(as.factor(type), ref = 'Pubmed authors'))
}

# Main-effects model: weighted probability on year and scientist type.
main_lm <- function(racei){
  lm(weighted_probs ~ year + type, data = race_lm_data(racei))
}

# Same model with the year-by-type interaction added.
inte_lm <- function(racei){
  lm(weighted_probs ~ year * type, data = race_lm_data(racei))
}

# Despite its name, `inte_list` holds the main-effects fits (one per race
# category, in this order); the inline results in the conclusion read from it.
inte_list <- lapply(c('White', 'Asian', 'Other categories'), main_lm)
lapply(inte_list, summary)
```

```{r echo = F}
# Format one statistic of the honoree coefficient for inline reporting.
#   i    : position of the fitted model in `inte_list`
#   colu : column of the broom::tidy() table to report
#          (e.g. 'estimate' or 'p.value')
# Returns the value formatted with "%0.5g".
get_p <- function(i, colu){
  coef_table <- tidy(inte_list[[i]])
  honoree_row <- filter(coef_table, term == 'typeKeynote speakers/Fellows')
  sprintf("%0.5g", pull(honoree_row, colu))
}
```

### Should we include interaction terms?

Interaction terms do not predict `weighted_probs` over and above the main effects of group of scientists and year.
```{r}
# ANOVA comparison of the main-effects model against the model that also
# includes the year-by-type interaction, for one race category.
compare_lm <- function(racei) {
  additive_fit <- main_lm(racei)
  interaction_fit <- inte_lm(racei)
  anova(additive_fit, interaction_fit)
}
lapply(c("White", "Asian", "Other categories"), compare_lm)
```

## Conclusion

Specifically, a name coming from the group of honorees has significantly higher probability of being white ($\beta_\textrm{white} =$ `r get_p(1, 'estimate')`, _p_ = `r get_p(1, 'p.value')`) and lower probability of being Asian ($\beta_\textrm{Asian} =$ `r get_p(2, 'estimate')`, _p_ = `r get_p(2, 'p.value')`).
The two groups of scientists did not have a significant association with names predicted to be in Other categories (_p_ = `r get_p(3, 'p.value')`).