Full_analysis.Rmd

---
title: "Review Analysis - A tale of native american whole genome sequencing..."
author:
- "Developer / Analyst: Israel Aguilar"
- "With Data from: Josue Guzman"
- "Data collection: Aguilar's Virtual Lab Team"
date: "24 March 2022"
output: pdf_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE )

# read libs
library( "pacman" )

p_load( "openxlsx",
        "dplyr",
        "ggplot2",
        "ggrepel",
        "treemapify",
        "cowplot",
        "stringr",
        "ggsci",
        "tidyr",
        "scales",
        "knitr",
        "rnaturalearth",
        "sf",
        "tmap",
        "rgeos",
        "scales" )
```

# Map of America
```{r }
# Run the auxiliary script that builds the map figure; it is expected to
# define `america_map.p`, which is used right below
source( "auxiliary_scripts/map_figure.R" )

# Show the map in the report
america_map.p

```

# Count projects by technology (Array, WES, WGS)
```{r }
# Load the table of reviewed projects (sheet ST2, reading starts at row 2)
allprojects <- read.xlsx(
  xlsxFile = "Tables A tale of native american whole genome sequencing.xlsx",
  sheet = "ST2",
  startRow = 2
)

# Report how many projects are under review
message( nrow( allprojects ), " reviewed projetcs" )

# Text searched in the Technology column for each reported technology
tech_patterns <- c(
  "Array" = "Genotyping",
  "Whole Exome Sequencing (WES)" = "WES",
  "Whole Genome Sequencing (WGS)" = "WGS"
)

# Number of projects whose Technology field matches the given pattern
count_matching_projects <- function( the_pattern ) {
  allprojects %>%
    filter( str_detect( string = Technology, pattern = the_pattern ) ) %>%
    nrow( )
}

# One row per technology with its number of projects
bytech <- data.frame(
  technology = names( tech_patterns ),
  projects = vapply( X = tech_patterns,
                     FUN = count_matching_projects,
                     FUN.VALUE = integer( 1 ),
                     USE.NAMES = FALSE )
)

kable( x = bytech )

```

# Count individuals by technology (Array, WES, WGS)
```{r}
# Total number of individuals studied across the three technologies.
# na.rm = TRUE: a project with an empty cell for one technology must not turn
# the whole total into NA.
paste( sum( allprojects$WES.Samples,
            allprojects$WGS.Samples,
            allprojects$Array.Samples,
            na.rm = TRUE ), "NatAm individuals have been studied" )

# Totals per technology
paste( sum( allprojects$WES.Samples, na.rm = TRUE ), "studied by WES" )
paste( sum( allprojects$WGS.Samples, na.rm = TRUE ), "studied by WGS" )
paste( sum( allprojects$Array.Samples, na.rm = TRUE ), "studied by Array tech" )
```

# Timeline for projects
```{r}
# Keep only what the timeline needs: publication year, technology, time frame
# of the samples (contemporaneous, aDNA or both), the per-technology sample
# counts and the data availability flag
simplified <- allprojects %>%
  select( year = Year.Publication,
          Technology,
          time_of_samples = `DNA.Studied.(Contemporaneous,.aDNA,.Both)`,
          WES.Samples,
          WGS.Samples,
          Array.Samples,
          Data.Availability )

# Projects with at least one sample in `the_column`, relabelled as `the_label`;
# the sample count ends up in a column called `samples`
projects_for_tech <- function( the_column, the_label ) {
  simplified %>%
    filter( .data[[ the_column ]] > 0 ) %>%
    select( year,
            Technology,
            time_of_samples,
            samples = all_of( the_column ),
            Data.Availability ) %>%
    mutate( Technology = the_label )
}

# One data frame per technology
array_projects <- projects_for_tech( "Array.Samples", "Array" )
WES_projects <- projects_for_tech( "WES.Samples", "WES" )
WGS_projects <- projects_for_tech( "WGS.Samples", "WGS" )

# Stack every technology and bin the sample counts for the size legend
alltech <- bind_rows( array_projects, WES_projects, WGS_projects ) %>%
  mutate(
    sample_tag = case_when(
      samples < 10 ~ "< 10",
      samples < 100 ~ "< 100",
      samples < 1000 ~ "< 1000",
      TRUE ~ "more than 1000"
    )
  )
```

```{r}
# Dot timeline: one point per project and technology.
# Points are jittered vertically only (width = 0) with a fixed seed, so the
# figure is reproducible between runs.
# NOTE(review): the size legend title reads "indiviuals" (sic); it is kept
# verbatim here.
point_jitter <- position_jitter( width = 0, height = 0.2, seed = 66 )

timeline <- ggplot( alltech, aes( x = year, y = Technology ) ) +
  geom_point( aes( size = sample_tag, fill = Data.Availability ),
              position = point_jitter,
              shape = 21,
              alpha = 0.5 ) +
  scale_x_continuous( breaks = seq( 2009, 2021, 2 ),
                      limits = c( 2009, 2021 ) ) +
  scale_fill_startrek( ) +
  labs( title = "Timeline for Native American genomic projects",
        x = "year of publication",
        y = "technology used",
        fill = "Data Available?",
        size = "n. of studied indiviuals" ) +
  theme_light( base_size = 10 ) +
  # axis titles are blanked on purpose: the x/y labels above are not drawn
  theme( axis.title = element_blank( ),
         axis.text.y = element_text( face = "bold" ),
         plot.title = element_text( face = "bold" ),
         legend.title = element_text( face = "bold" ),
         panel.grid.major.x = element_line( linetype = "dashed" ),
         panel.grid.minor.x = element_line( linetype = "dashed" ) )

# Export the figure for the manuscript in both formats
for ( timeline_file in c( "timeline.tiff", "timeline.svg" ) ) {
  ggsave( filename = timeline_file,
          plot = timeline,
          width = 7,
          height = 5,
          dpi = 600 )
}

```
# Accumulated number of individuals studied

```{r}
# Accumulated number of samples per technology over the years.
# The running total is added with mutate(): returning several rows per group
# from summarize() is deprecated since dplyr 1.1.0.
sum_by_year <- alltech %>% 
  group_by( year, Technology ) %>% 
  summarize( total_year = sum( samples ), .groups = "drop" ) %>% 
  # cumsum() depends on row order, so make the chronological order explicit
  arrange( Technology, year ) %>% 
  group_by( Technology ) %>% 
  mutate( acc = cumsum( total_year ) ) %>% 
  ungroup( )

# Breaks for the y axis, labelled with a thousands separator
thebreaks <- seq( 0, 9000, 1000 )

# One line per technology; a dot marks the value reached in 2021
line_dot <- ggplot( data = sum_by_year,
                    mapping = aes( x = year,
                                   y = acc,
                                   group = Technology,
                                   color = Technology ) ) +
  geom_line( size = 0.4 ) +
  geom_point( data = sum_by_year %>% 
                filter( year == 2021 ) ) +
  scale_y_continuous( position = "right",
                      breaks = thebreaks,
                      labels = prettyNum( thebreaks, big.mark = "," ) ) +
  scale_x_continuous( limits = c(2009, 2021),
                      breaks = seq( 2009, 2021, 2) ) +
  scale_color_jama( ) +
  labs( title = "The increase in NatAm sequencing",
        x = "Year of publication",
        y = "Acc. studied individuals") +
  theme_classic( ) +
  # the plot title is blanked on purpose; the labs() title is not drawn
  theme( legend.position = "top",
         axis.title = element_text( face = "bold" ),
         legend.title = element_text( face = "bold" ),
         plot.title = element_blank( ) )

# Save plot for manuscript; the plot is passed explicitly instead of relying
# on ggplot2::last_plot(), which silently changes if another plot is created
ggsave( filename = "acc_individuals.tiff",
        plot = line_dot,
        width = 7,
        height = 5,
        dpi = 600 )

ggsave( filename = "acc_individuals.svg",
        plot = line_dot,
        width = 7,
        height = 5,
        dpi = 600 )
```

```{r}
# Stack the dot timeline (A) on top of the accumulated-individuals lines (B),
# aligned vertically on their left and right axes
timedot <- plot_grid( timeline, line_dot,
                      ncol = 1,
                      labels = c( "A", "B" ),
                      align = "v",
                      axis = "lr" )

# Export the panel for the manuscript in both formats
for ( timepanel_file in c( "timepanel.tiff", "timepanel.svg" ) ) {
  ggsave( filename = timepanel_file,
          plot = timedot,
          width = 7,
          height = 10,
          dpi = 600 )
}
```

# Calculate the share of available samples

```{r }
# Load the per-sample table (sheet ST3, reading starts at row 2)
full_natam_samples <- read.xlsx(
  xlsxFile = "Tables A tale of native american whole genome sequencing.xlsx",
  sheet = "ST3",
  startRow = 2
)

# Numeric copy of the publication year, used as the time axis later on
full_natam_samples <- mutate( full_natam_samples,
                              year = as.numeric( Publication.Year ) )

# Number of samples per year, technology and data-availability status
cleared <- full_natam_samples %>%
  group_by( year, Technology, Data.Availability ) %>%
  summarise( bydata_total_year = n( ) )
```

```{r }
# Sorted distinct values observed for each of the three variables
all_years <- sort( unique( cleared$year ) )
all_technologies <- sort( unique( cleared$Technology ) )
all_availability <- sort( unique( cleared$Data.Availability ) )

# Every year x technology x availability combination, whether observed or not
base_df <- crossing( year = all_years,
                     Technology = all_technologies,
                     Data.Availability = all_availability )

# Attach the observed counts; combinations never observed count as 0 samples
annotated <- base_df %>%
  left_join( cleared,
             by = c( "year", "Technology", "Data.Availability" ) ) %>%
  mutate( bydata_total_year = ifelse( is.na( bydata_total_year ),
                                      0,
                                      bydata_total_year ) )

#' Stacked area plot of the accumulated share of samples by data availability
#'
#' Prints the technology name and a table with the 2021 shares as a side
#' effect, then returns the plot.
#'
#' @param the_data Data frame with the columns `year`, `Technology`,
#'   `Data.Availability` and `bydata_total_year`.
#' @param the_tech Value of `Technology` to keep; also used as the y axis title.
#' @return A ggplot object.
plot_areas <- function( the_data, the_tech ){
  
  # Running totals are added with mutate(): returning several rows per group
  # from summarize() is deprecated since dplyr 1.1.0
  percentages <- the_data %>%
    filter( Technology == the_tech ) %>% 
    # cumsum() depends on row order, so make the chronological order explicit
    arrange( year ) %>% 
    # accumulated samples per availability status
    group_by( Data.Availability ) %>%
    mutate( accumulated_bydata = cumsum( bydata_total_year ) ) %>% 
    # share of each availability status within every year
    group_by( year ) %>% 
    mutate( total_accumulated_bydata_peryear = sum( accumulated_bydata ),
            fraction = accumulated_bydata / total_accumulated_bydata_peryear ) %>% 
    ungroup( ) %>% 
    # 0 / 0 happens in years before the first sample: report a share of 0
    mutate( fraction = ifelse( test = is.nan( fraction ),
                               yes = 0,
                               no = fraction ) ) %>% 
    arrange( year, Data.Availability ) %>% 
    select( year,
            Data.Availability,
            Technology,
            bydata_total_year,
            accumulated_bydata,
            total_accumulated_bydata_peryear,
            fraction )
  
  # show table for 2021 share of available data
  print( the_tech )
  print( kable( x = percentages %>%
                  filter( year == 2021 ) %>%
                  select( year, Data.Availability, accumulated_bydata, fraction ) ) )
  
  # Create area plot...
  ggplot( data = percentages,
          mapping = aes( x = year,
                         y = fraction,
                         fill = Data.Availability ) ) + 
    geom_area( alpha = 0.6,
               size = 1,
               colour = "black" ) +
    scale_x_continuous( limits = c(2009, 2021),
                        breaks = seq( 2009, 2021, 1),
                        expand = c( 0, 0 ) ) +
    # the percent labels are shown on a secondary (right) axis; the left axis
    # text and ticks are blanked in theme() below
    scale_y_continuous( labels = percent, 
                        expand = c(0, 0),
                        sec.axis = sec_axis( trans = ~.,
                                             name = "Accumulated data",
                                             labels = percent ) ) +
    # two colours: assumes Data.Availability has exactly two levels
    scale_fill_manual( values = c("#cc1c00", "#84bd00")) +
    labs( title = "Trends in Data Availability",
          x = "Year",
          y = the_tech,
          fill = "Data available?" ) +
    theme_light( ) +
    theme( panel.grid.minor = element_blank( ),
           axis.text.y.left = element_blank( ),
           axis.ticks.y.left = element_blank( ),
           panel.background = element_rect( color = "black",
                                            size = 2, linetype = "solid" ),
           plot.title = element_text( face = "bold" ),
           legend.title = element_text( face = "bold" ) )
  
}

# One stacked-area plot per technology; each call also prints its 2021 table
wgs_stack <- plot_areas( annotated, "WGS" )
wes_stack <- plot_areas( annotated, "WES" )
array_stack <- plot_areas( annotated, "Array" )

# Trim each plot for its place in the panel:
# top (WGS) keeps the title, middle (WES) keeps the legend and the right axis
# title, bottom (Array) keeps the x axis
wgs_panel <- wgs_stack +
  theme( axis.title.x = element_blank( ),
         axis.text.x = element_blank( ),
         axis.title.y.right = element_blank( ),
         legend.position = "none" )

wes_panel <- wes_stack +
  theme( axis.title.x = element_blank( ),
         axis.text.x = element_blank( ),
         plot.title = element_blank( ) )

array_panel <- array_stack +
  theme( axis.title.y.right = element_blank( ),
         plot.title = element_blank( ),
         axis.text.x = element_text( angle = 90, vjust = 0.5, hjust = 0.5 ),
         legend.position = "none" )

# Stack the three plots in a single column
share_panel <- plot_grid( wgs_panel,
                          wes_panel,
                          array_panel,
                          ncol = 1,
                          axis = "lr",
                          align = "v",
                          rel_heights = c( 0.35, 0.3, 0.35 ) )

# Export the panel for the manuscript in both formats
for ( sharepanel_file in c( "sharepanel.tiff", "sharepanel.svg" ) ) {
  ggsave( filename = sharepanel_file,
          plot = share_panel,
          width = 7,
          height = 10,
          dpi = 600 )
}

# Report the total number of samples with available data
available_samples <- annotated %>%
  filter( Data.Availability == "Yes" ) %>%
  pull( bydata_total_year ) %>%
  sum( )

message( available_samples, " samples with available data" )
```

```{r}
# Final figure: timeline panels (A, B) beside the data-availability panel (C).
# The grid is kept in a variable so it can be handed to ggsave() explicitly;
# relying on ggplot2::last_plot() breaks silently as soon as any other plot is
# created between this call and the save.
timepanel_v2 <- plot_grid( timedot,
                           share_panel,
                           labels = c("","C"),
                           nrow = 1 )

# Show the figure in the report
timepanel_v2

# Save
ggsave( filename = "timepanelv2.tiff",
        plot = timepanel_v2,
        width = 14,
        height = 7,
        dpi = 600 )

# Save
ggsave( filename = "timepanelv2.svg",
        plot = timepanel_v2,
        width = 14,
        height = 7,
        dpi = 600 )

```

## The openness in ancient studies
```{r}
# Report how many samples are ancient DNA
n_ancient <- full_natam_samples %>%
  filter( DNA.Type.of.Samples == "AncientDNA" ) %>%
  nrow( )

message( n_ancient, " samples are ancient DNA" )

# Samples per DNA type, with their share and the position of the pie label
pie_data <- full_natam_samples %>%
  group_by( DNA.Type.of.Samples ) %>%
  summarise( samples = n( ) ) %>%
  mutate( fraction = samples / sum( samples ),
          percent = percent( fraction ),
          lab_pos = 1 - ( fraction / 2 ) ) %>%
  ungroup( )

# Show the table in the report
kable( x = pie_data )

# Text of each label: DNA type, then the number of samples on a second line
pie_label <- aes( label = paste( DNA.Type.of.Samples,
                                 "\n",
                                 prettyNum( samples, big.mark = "," ),
                                 "samples" ),
                  x = 2,
                  y = lab_pos )

# Pie chart: a single stacked column at x = 2 drawn in polar coordinates.
# The labels are added before the columns, as in the original layer order.
pieplot <- ggplot( pie_data,
                   aes( x = 2, y = fraction, fill = DNA.Type.of.Samples ) ) +
  geom_label_repel( mapping = pie_label,
                    fill = "white",
                    nudge_x = 1,
                    min.segment.length = 0,
                    segment.curvature = -1e-20 ) +
  geom_col( color = "black" ) +
  xlim( c( 0, 3 ) ) +
  scale_fill_manual( values = c( "deepskyblue3", "cornflowerblue" ) ) +
  coord_polar( theta = "y" ) +
  labs( title = "Ancient DNA in NatAm genomics" ) +
  theme_void( ) +
  theme( legend.position = "none",
         plot.title = element_text( hjust = 0.5, vjust = 0.5, face = "bold" ),
         plot.background = element_rect( fill = "white", color = NA ),
         panel.background = element_rect( fill = "white", color = NA ) )

```

```{r}

# Share of available data per type of DNA.
# Per-type totals are added with mutate(): returning several rows per group
# from summarise() is deprecated since dplyr 1.1.0.
fractions <- full_natam_samples %>% 
  group_by( DNA.Type.of.Samples,
            Data.Availability ) %>% 
  summarise( samples = n( ), .groups = "drop" ) %>% 
  group_by( DNA.Type.of.Samples ) %>% 
  mutate( totals = sum( samples ),
          frac = samples / totals ) %>% 
  ungroup( )

# print table
kable( x = fractions )

# Stacked columns: share of available / unavailable data for each DNA type
sharebar <- ggplot( fractions,
                    aes( x = DNA.Type.of.Samples,
                         y = frac,
                         fill = Data.Availability ) ) +
  geom_col( width = 0.9, alpha = 0.6, color = "black" ) +
  scale_fill_manual( values = c( "#cc1c00", "#84bd00" ) ) +
  scale_y_continuous( labels = percent,
                      breaks = seq( 0, 1, by = 0.2 ),
                      limits = c( 0, 1 ),
                      expand = c( 0, 0 ) ) +
  labs( title = "Data availability",
        x = "",
        y = "Accumulated samples",
        fill = "Is Publicly Available?" ) +
  theme_classic( ) +
  # the legend title is blanked on purpose; the `fill` label is not drawn
  theme( legend.position = "top",
         legend.title = element_blank( ),
         axis.text.x = element_text( face = "bold" ),
         plot.title = element_text( hjust = 0.5, vjust = 0.5, face = "bold" ) )

# Ancient DNA overview: pie (A) beside the availability bars (B)
ancient_panel <- plot_grid( pieplot, sharebar,
                            nrow = 1,
                            labels = c( "A", "B" ),
                            rel_widths = c( 0.35, 0.65 ) )

```

```{r }
#' Treemap of the most studied populations for one type of DNA
#'
#' @param the_data Data frame with one row per sample and the columns
#'   `DNA.Type.of.Samples` and `Homologated.Pop.Code`.
#' @param the_time Value of `DNA.Type.of.Samples` to keep; also used as the
#'   plot subtitle.
#' @param n_cut Populations with fewer samples than this are pooled into a
#'   single "other" tile.
#' @param the_color Colour used for the high end of the fill gradient.
#' @return A ggplot object.
treemaper <- function( the_data, the_time, n_cut, the_color ) {
  
  # sample cutoff for "other" tagging
  sample_cutoff <- n_cut
  
  # number of samples per population, largest first (computed once and reused)
  pop_counts <- the_data %>% 
    filter( DNA.Type.of.Samples == the_time ) %>% 
    group_by( Homologated.Pop.Code ) %>%
    summarise( samples = n( ) ) %>% 
    arrange( -samples )
  
  # number of populations that fall below the cutoff
  other_n <- sum( pop_counts$samples < sample_cutoff )
  
  # pool the small populations and total the samples per tile
  pop_summary <- pop_counts %>% 
    mutate( tag = ifelse( test = samples < sample_cutoff,
                          yes = "other",
                          no = Homologated.Pop.Code ) ) %>% 
    group_by( tag ) %>% 
    summarise( totals = sum( samples ) ) %>% 
    arrange( -totals ) %>% 
    mutate( units = "individuals" ) 
  
  # only the first (largest) tile spells out the unit.
  # Negative indexing is used instead of 2:nrow(), which counts backwards
  # (2, 1) and breaks the column when the summary has a single row.
  pop_summary$units[ -1 ] <- ""
  
  pop_summary <- pop_summary %>% 
    mutate( subtag = paste( prettyNum( totals, big.mark = ","),
                            units ),
            tag = ifelse( test = tag == "other",
                          yes = paste( "other", other_n, "Pops",
                                       "with less than", sample_cutoff, "individuals"),
                          no = tag ) )
  
  # build the treemap
  ggplot( data = pop_summary,                  # data frame holding the labels and sub-labels
          mapping = aes( area = totals ) ) +   # tile area is proportional to the number of samples
    geom_treemap( 
      mapping = aes(fill = totals ),           # fill gradient follows the number of samples
      color = "black",                         # black tile border
      size = 2 ) +                             # border thickness
    geom_treemap_text( mapping = aes( label = tag ),        # main label: population tag
                       place = "centre",                     # centred in the tile
                       reflow = TRUE,                        # wrap the text to fit the tile
                       size = 10,                            # font size
                       fontface = "bold" ) +
    geom_treemap_text( mapping = aes( label = subtag ),     # sub-label: number of samples
                       place = "bottomright",                # bottom-right corner of the tile
                       color = "black",                      # black text
                       padding.x = unit( x = 1,              # 1 mm horizontal padding
                                         units = "mm" ),
                       padding.y = unit( x = 1,              # 1 mm vertical padding
                                         units = "mm" ),
                       size = 8 ) +
    scale_fill_gradient( low = "white",        # gradient from white to the requested colour
                         high = the_color ) +
    labs( title = "    Most Studied NatAm populations",
          subtitle = the_time ) +
    theme( legend.position = "none",                       # the legend is dropped
           plot.title =  element_text( size = 17,          # title font size
                                       face = "bold" ) )
}

```

# How many groups have been studied?

```{r }
# Sorted list of the distinct population codes present in the sample table
studiedgroups <- sort( unique( full_natam_samples$Homologated.Pop.Code ) )

# Report how many there are
message( length( studiedgroups ), " unique studied groups with any tech" )

# Number of distinct populations studied with each technology
full_natam_samples %>%
  distinct( Technology, Homologated.Pop.Code ) %>%
  group_by( Technology ) %>%
  summarise( studied_pops = n( ) ) %>%
  kable( )

# Treemaps of the most studied populations, one per DNA type
# (each uses its own cutoff for the pooled "other" tile)
modern_tree <- treemaper( full_natam_samples,
                          the_time = "ModernDNA",
                          n_cut = 100,
                          the_color = "cornflowerblue" )

ancient_tree <- treemaper( full_natam_samples,
                           the_time = "AncientDNA",
                           n_cut = 5,
                           the_color = "deepskyblue3" )

# Ancient treemap on the left (keeps the title), modern one on the right
modern_untitled <- modern_tree + theme( plot.title = element_blank( ) )

treegrid <- plot_grid( ancient_tree,
                       modern_untitled,
                       nrow = 1,
                       align = "h" )

# Ancient overview on top, treemaps (C) below
thepanel <- plot_grid( ancient_panel,
                       treegrid,
                       ncol = 1,
                       labels = c( "", "C" ),
                       rel_heights = c( 0.4, 0.6 ) )

# Export the panel for the manuscript in both formats
for ( ancientpanel_file in c( "ancientpanel.tiff", "ancientpanel.svg" ) ) {
  ggsave( filename = ancientpanel_file,
          plot = thepanel,
          width = 10,
          height = 10,
          dpi = 600 )
}

# Show the individual plots in the report
pieplot
sharebar

modern_tree
ancient_tree
```

# Placeholder for analysing the missing groups

```{r}
# studiedgroups (built earlier in this document) has every code already studied

# read all census data
allcensus <- read.xlsx(xlsxFile = "Tables A tale of native american whole genome sequencing.xlsx", sheet = "ST4", startRow = 2)

# find the "censed" natam groups that have not been represented,
# and tag them by population size (`tag` is the printed range, `order` its rank)
notrepresented <- allcensus %>% 
  filter( !( Homologated.Pop.Code %in% studiedgroups ) ) %>% 
  mutate( tag = case_when( Population < 100 ~ "< 100",
                           Population < 1000 ~ "100 - 1,000",
                           Population < 10000 ~ "1,000 - 10,000",
                           Population < 100000 ~ "10,000 - 100,000",
                           Population < 1000000 ~ "100,000 - 1,000,000",
                           Population >= 1000000 ~ "> 1,000,000" ) ) %>% 
  mutate( order = case_when( Population < 100 ~ "1",
                             Population < 1000 ~ "2",
                             Population < 10000 ~ "3",
                             Population < 100000 ~ "4",
                             Population < 1000000 ~ "5",
                             Population >= 1000000 ~ "6" ) )

# summarize missing groups by pop size: one row per size range (largest
# first), one column per country
missumarised <- notrepresented %>% 
  filter( !is.na(Population) ) %>% 
  group_by( Home.Country, tag, order ) %>% 
  summarise( groups = n( ) ) %>% 
  pivot_wider( data = .,
               id_cols = c(tag, order),
               names_from = Home.Country,
               values_from = groups ) %>% 
  arrange( desc(order) ) %>% 
  select( -order )

# replace NAs with 0 in the table
missumarised[is.na(missumarised)] <- 0

# from wide to long
long <- pivot_longer( data = missumarised,
                      cols = -tag,
                      names_to = "country",
                      values_to = "groups" )

# one row per country, one column per size range
misstable <- pivot_wider( data = long, id_cols = country, names_from = tag, values_from = groups  )

# countries ordered by their total number of missing groups, largest first.
# The totals are kept in a named vector: setting row names on a tibble is
# deprecated.
country_totals <- rowSums( misstable %>% select( -country ) )
names( country_totals ) <- misstable$country

order <- country_totals %>% sort( decreasing = TRUE ) %>% names( )

# reorder the table
misstable <- misstable %>% 
  arrange( factor( country, levels = order ) )

kable( x = misstable )

# save the table with the census description of the missing groups...
write.csv( x = notrepresented, file = "norepresented_groups.csv",
           quote = TRUE, row.names = FALSE, na = ""  )

# and save the table with the data summarized...
write.csv( x = misstable, file = "summary_norepresented_groups.csv",
           quote = TRUE, row.names = FALSE, na = ""  )

# print total of not rep groups
message( "There is a total of ", nrow( notrepresented ), " NatAm groups not yet genomically represented across different countries" )

# Population-size ranges (smallest first) and the label printed for each one.
# NOTE(review): the "> 1,000,000" range is not listed, so groups of that size
# would be left out of the heatmap; confirm that none exist.
orderx <- c( "< 100", "100 - 1,000", "1,000 - 10,000",
             "10,000 - 100,000", "100,000 - 1,000,000" )
labelx <- c( "less than 100", "100 to 1K", "1K to 10K", "10K to 100K", "100K to 1M")

# Heatmap: countries (rows) by population-size range (columns); each tile
# shows the number of groups not yet represented
missing_heat <- ggplot( data = long %>% filter( groups > 0 ),
                        mapping = aes( x = tag,
                                       y = country,
                                       fill = groups,
                                       label = groups ) ) +
  geom_tile( color = "black" ) +
  geom_text( ) +
  # labels are matched to limits by position, so both must be reversed
  # together; otherwise every column gets the label of the opposite range
  scale_x_discrete( limits = rev( orderx ),
                    labels = rev( labelx ),
                    position = "top") +
  scale_y_discrete( limits = rev( order ) ) +
  scale_fill_gradient( low = "white",
                       high = "tomato" ) +
  labs( title = "NatAm groups missing in genomics",
        x = "Population size",
        fill = "# of groups" ) +
  # theme_void() is a complete theme; the elements needed are re-enabled below
  theme_void( ) +
  theme( axis.title.y = element_blank( ),
         axis.text.y = element_text( face = "bold" ),
         axis.text.x = element_text( angle = 90 ),
         axis.title.x = element_text( face = "bold" ),
         legend.position = "none" )

# vis
missing_heat

# create a panel with map (A) and heatmap (B); kept in a variable so it can be
# passed to ggsave() explicitly instead of relying on ggplot2::last_plot()
map_and_missing <- plot_grid( america_map.p + theme( legend.position = "right", plot.caption = element_blank() ),
                              missing_heat,
                              labels = c('A', 'B') )

# show the panel in the report
map_and_missing

# Save plot for paper
ggsave( filename = "map_and_missing.tiff",
        plot = map_and_missing,
        width = 10,
        height = 5,
        dpi = 600 )

ggsave( filename = "map_and_missing.svg",
        plot = map_and_missing,
        width = 10,
        height = 5,
        dpi = 600 )

```

# The countries studying NatAm genomics
```{r}

# Load the per-study country table (sheet ST6, reading starts at row 2)
allcountries <- read.xlsx(
  xlsxFile = "Tables A tale of native american whole genome sequencing.xlsx",
  sheet = "ST6",
  startRow = 2
)

#' Pie chart of the number of studies per country
#'
#' Prints a table of studies per country as a side effect, then returns the
#' plot.
#'
#' @param thedata Data frame with one row per study.
#' @param thecolumn Name (string) of the column holding the countries; several
#'   countries in one cell are separated by commas.
#' @param lowcolor,highcolor Ends of the fill gradient (fewest / most studies).
#' @param thetitle Title of the plot and caption of the printed table.
#' @return A ggplot object.
piecharter <- function( thedata, thecolumn, lowcolor, highcolor, thetitle ) {
  
  piedata <- thedata %>% 
    # all_of(): selecting with a bare external character vector is deprecated
    # in tidyselect and is ambiguous if a column happens to be named `thecolumn`
    select( all_of( thecolumn ) ) %>% 
    rename( countries = 1 ) %>%
    # one row per country listed in a cell
    separate_rows( countries, sep = "," ) %>% 
    # NOTE(review): this removes every space, including those inside
    # multi-word country names; confirm that is intended
    mutate( countries = str_remove_all( string = countries, pattern = " ") ) %>% 
    group_by( countries ) %>% 
    summarise( studies = n( ) ) %>% 
    arrange( -studies ) %>% 
    # label position: the middle of each slice along the stacked fractions
    mutate( fraction = studies / sum(studies) ,
            percent = percent(fraction),
            lab_pos = (cumsum( fraction ) - (fraction / 2) ) ) %>% 
    ungroup( ) %>% 
    # freeze the current (descending) order as factor levels
    mutate( countries = factor( x = countries,
                                levels = .$countries) )
  
  # print table
  piedata %>%
    select( countries, studies ) %>%
    kable( x = ., caption = thetitle ) %>% 
    print( )
  
  # plot a pie: a single stacked column at x = 2 in polar coordinates
  ggplot( data = piedata,
          mapping = aes( x = 2,
                         y = fraction,
                         fill = studies ) ) +
    geom_label_repel( mapping = aes( label = paste( countries,
                                                    studies),
                                     x = 2,
                                     y = lab_pos ),
                      min.segment.length = 0,
                      segment.curvature = -1e-20,
                      nudge_x = 2,
                      fill = "white",
                      size = 2 ) +
    geom_col( color = "black" ) +
    xlim( c(0, 4) ) +
    scale_fill_gradient( low = lowcolor,
                         high = highcolor ) +
    coord_polar( theta = "y" ) +
    labs( title = thetitle ) +
    theme_void( ) +
    theme( legend.position = "none" )
}

# Pie chart: country of the first author's institution
firstauthorpie <- piecharter(
  thedata = allcountries,
  thecolumn = "First.Author.Research.Institute.Country",
  thetitle = "First Author Institution",
  lowcolor = "deepskyblue",
  highcolor = "deepskyblue3"
)

# Show it in the report
firstauthorpie

# Pie chart: country of the corresponding author's institution
correspondingpie <- piecharter(
  thedata = allcountries,
  thecolumn = "Corresponding.Research.Institute.Country",
  thetitle = "Corresponding Institution",
  lowcolor = "slategray1",
  highcolor = "slategray3"
)

# Show it in the report
correspondingpie

# Studies per participant country (neither first nor corresponding author):
# split the comma-separated cells, strip spaces, drop empty / missing entries,
# then count and sort by number of studies and country name
participants <- allcountries %>%
  separate_rows( Participant.countries.nonfirst.noncorresponding, sep = "," ) %>%
  mutate( participant_country = str_remove_all(
    string = Participant.countries.nonfirst.noncorresponding,
    pattern = " "
  ) ) %>%
  filter( participant_country != "",
          !is.na( participant_country ) ) %>%
  group_by( participant_country ) %>%
  summarise( studies = n( ) ) %>%
  arrange( -studies, participant_country )

# Show the table in the report
kable( x = participants )

# Bar chart of the participant countries, in the order of the table above
barplotparticipants <- ggplot( participants,
                               aes( x = participant_country, y = studies ) ) +
  geom_col( fill = "slategray" ) +
  scale_x_discrete( limits = participants$participant_country ) +
  scale_y_continuous( breaks = seq( from = 0, to = 21, by = 2 ),
                      limits = c( 0, 21 ),
                      expand = c( 0, 0 ) ) +
  ggtitle( label = "Non-first, non-corresponding collaborating Institution" ) +
  theme_linedraw( ) +
  theme( axis.title.x = element_blank( ),
         axis.text.x = element_text( size = 5,
                                     angle = 90,
                                     hjust = 1,
                                     vjust = 0.5 ),
         panel.grid.major.x = element_blank( ) )

# Show it in the report
barplotparticipants

# Both pies side by side
authorspanel <- plot_grid( firstauthorpie, correspondingpie, nrow = 1 )

# Export the panel for the paper in both formats
for ( authorship_file in c( "authorship_panel.tiff", "authorship_panel.svg" ) ) {
  ggsave( filename = authorship_file,
          plot = authorspanel,
          width = 7,
          height = 7,
          dpi = 600 )
}

```