From 3ad3433b221eb418d01d293f26180829338d8d28 Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Mon, 19 Aug 2024 08:48:17 -0400
Subject: [PATCH 001/127] Created functions each plot

---
 scripts/src/QC_images.R | 398 ++++++++++++++++++++++++++++++----------
 1 file changed, 300 insertions(+), 98 deletions(-)

diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R
index 322fd110..29ef4d87 100755
--- a/scripts/src/QC_images.R
+++ b/scripts/src/QC_images.R
@@ -1,3 +1,24 @@
+#' validate_columns_exist
+#'
+#' This function checks that a list of columns is present in a dataframe.
+#' Columns that were not found in the dataframe are printed out.
+#'
+#' @param selected_columns A vector of strings each representing a column name
+#' @param df A dataframe to check against
+#' @return Boolean
+validate_columns_exist= function(selected_columns, df) {
+  # Check that all of selected_columns are in df
+  unmatched_cols= setdiff(selected_columns, colnames(df))
+
+  if(length(unmatched_cols) > 0) {
+    print('The following columns are missing: ')
+    print(unmatched_cols)
+    return(FALSE)
+  } else {
+    return(TRUE)
+  }
+}
+
 #' Calculate index summaries
 #'
 #' Generates some simple summaries for each unique index.
@@ -27,6 +48,232 @@ get_index_summary= function(df, index_col, valid_indices) {
   return(output_summary)
 }
 
+#' Calculate purity metrics
+#'
+#' Creates the QC table with index purity and cell line purity.
+#'
+#' @param raw_counts_uncollapsed Dataframe output from nori.
+#' @param raw_counts Raw counts dataframe output from collate_fastq_reads.
+#' @param filtered_counts Filtered counts dataframe output from filter_raw_reads.
+#' @param counts_col String name of the counts column in all three dataframes.
+#' @param file_path Location to write out the output.
+create_qc_table= function(raw_counts_uncollapsed, raw_counts, filtered_counts,
+                          counts_col= 'n', file_path) {
+  index_purity= sum(raw_counts[[counts_col]]) / sum(raw_counts_uncollapsed[[counts_col]])
+  print(paste0('Index purity: ', round(index_purity, 4)))
+  cell_line_purity= sum(filtered_counts[[counts_col]]) / sum(raw_counts[[counts_col]])
+  print(paste0('Cell line purity: ', round(cell_line_purity, 4)))
+
+  qc_table= data.frame(index_purity= index_purity, cell_line_purity= cell_line_purity)
+
+  print(paste0('Writing QC table out to ', file_path))
+  qc_table %>% write.csv(file_path, row.names= FALSE, quote= FALSE)
+}
+
+#' Total counts barplot
+#'
+#' Creates the total counts barplot with bars colored by the barcode type,
+#' either a cell line barcode or control barcode.
+#'
+#' @param filtered_counts Filtered counts dataframe.
+#' @param id_cols Vector of column names that identify each sample.
+#' @param facet_col String name of the column in filtered_counts to facet the plot.
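+#' @examples
+#' # A minimal sketch of a call (hypothetical column names, not from this repo's tests):
+#' # plot_total_counts(filtered_counts, id_cols= c('pcr_plate', 'pcr_well'),
+#' #                   facet_col= 'pcr_plate')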
+plot_total_counts= function(filtered_counts, id_cols, facet_col= NA) {
+  total_counts= filtered_counts %>%
+    dplyr::mutate(barcode_type= case_when(!is.na(CCLE_name) ~ 'cell line',
+                                          !is.na(Name) ~ 'ctrl barcode')) %>%
+    tidyr::unite(all_of(id_cols), col= 'sample_id', sep= ':', remove= FALSE, na.rm= FALSE) %>%
+    dplyr::group_by(pick(all_of(na.omit(c('sample_id', facet_col, 'barcode_type'))))) %>%
+    dplyr::summarise(total_counts= sum(n)) %>% dplyr::ungroup()
+
+  total_counts_plot= total_counts %>%
+    ggplot(aes(x=sample_id, y=total_counts, fill=barcode_type)) +
+    geom_col(alpha=0.75, position='identity') +
+    geom_hline(yintercept= 10^4, linetype=2) +
+    {if(!is.na(facet_col)) facet_wrap(~.data[[facet_col]], scale= 'free_x')} +
+    labs(x= "Sample constructed using id_cols", y="Total counts", fill= 'Barcode\ntype',
+         title= 'Filtered counts - unstacked') +
+    theme_bw() + theme(axis.text.x = element_text(angle=70, hjust=1))
+
+  return(total_counts_plot)
+}
+
+
+#' Cell line recovery barplot
+#'
+#' Creates a stacked barplot showing, for each sample, how many cell lines were
+#' detected, had low counts, or were not detected.
+#'
+#' @param filtered_counts Filtered counts dataframe.
+#' @param id_cols Vector of column names that identify each sample.
+#' @param facet_col String name of the column in filtered_counts to facet the plot.
+#' @param counts_col String name of the column in filtered_counts that contains the counts.
+#' @param counts_threshold Threshold used to determine low counts.
+#' @param plot_type Either "percent" or "count". Sets the y axis to the percentage or the number of cell lines.
+#' @param include_ctrl_bcs Boolean for whether control barcodes should be included. Defaults to FALSE.
+plot_cl_recovery= function(filtered_counts, id_cols, facet_col= NA, counts_col= 'n', counts_threshold,
+                           plot_type= 'percent', include_ctrl_bcs= FALSE) {
+
+  # Filter out control barcodes if specified.
+  if(include_ctrl_bcs == FALSE) {
+    filtered_counts= filtered_counts %>% dplyr::filter(is.na(Name))
+  }
+
+  # Count number of cell lines/barcodes for a detection group.
+  recovery= filtered_counts %>%
+    dplyr::add_count(pick(all_of(id_cols)), name= 'total_num_cls') %>%
+    dplyr::mutate(detect_type= case_when(.data[[counts_col]] == 0 ~ 'Not detected',
+                                         .data[[counts_col]] <= counts_threshold ~ 'Low counts',
+                                         .data[[counts_col]] > counts_threshold ~ 'Detected')) %>%
+    dplyr::count(pick(all_of(c(id_cols, facet_col, 'detect_type', 'total_num_cls'))), name= 'num_cls_by_type') %>%
+    tidyr::unite(all_of(id_cols), col= 'sample_id', sep= ':', remove= FALSE, na.rm= FALSE) %>%
+    dplyr::mutate(percent= (num_cls_by_type / total_num_cls) * 100)
+
+  # Set the y axis depending on the plot type.
+  if(plot_type == 'count') {
+    y_col= 'num_cls_by_type'
+    y_text= 'Number of cell lines'
+  } else {
+    if(plot_type != 'percent') {
+      print(paste0('Warning: ', plot_type, ' is not a valid plot type. Please use either count or percent.'))
+      print('Defaulting to percent plot.')
+    }
+    y_col= 'percent'
+    y_text= 'Percentage of cell lines recovered (%)'
+  }
+
+  # Create recovery plot.
+  recov_plot= recovery %>%
+    ggplot(aes(x= sample_id, y= .data[[y_col]], fill= reorder(detect_type, dplyr::desc(detect_type)))) +
+    geom_col(alpha=0.75, position='stack') +
+    {if(!is.na(facet_col)) facet_wrap(~.data[[facet_col]], scale= 'free_x')} +
+    labs(x= "Sample constructed using id_cols", y= y_text, fill= '', title= 'Cell line recovery') +
+    theme_bw() + theme(axis.text.x = element_text(angle=70, hjust=1))
+
+  return(recov_plot)
+}
+
+#' Control barcode scatter plot
+#'
+#' Plots control barcode counts against their log2 doses for each sample, along with
+#' the fitted line and the fit metrics from normalization.
+#'
+#' @param normalized_counts Normalized counts dataframe.
+#' @param id_cols Vector of column names that identify each sample.
+#' @param counts_col String name of the column in normalized_counts that contains the counts.
+plot_ctrl_bc_trend= function(normalized_counts, id_cols, counts_col= 'log2_n') {
+  # Detect norm_r2 and norm_mae.
+  # If columns do not exist, then roughly calculate those columns.
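+  # A worked sketch of the fallback fit below (toy values, for illustration only):
+  # log2_normalized_n is treated as the prediction of log2_dose within one profile, so
+  #   obs= c(2, 4, 6); pred= c(2.1, 3.8, 6.3)
+  #   1 - sum((obs - pred)^2) / sum((obs - mean(obs))^2)   # norm_r2, ~0.98
+  #   median(abs(obs - pred))                              # norm_mae, 0.2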
+  if(any(!c('norm_r2', 'norm_mae') %in% colnames(normalized_counts))) {
+    print('WARNING: Columns "norm_r2" and/or "norm_mae" were not detected in normalized_counts.', quote= FALSE)
+    print('Calculating both columns - this method may not be as robust as the normalize module.')
+
+    normalized_counts= normalized_counts %>%
+      dplyr::filter(!is.na(Name), control_barcodes %in% c("Y", "T", T), n != 0) %>%
+      dplyr::group_by(pick(all_of(id_cols))) %>%
+      dplyr::mutate(mean_y= mean(log2_dose),
+                    residual2= (log2_dose - log2_normalized_n)^2,
+                    squares2= (log2_dose - mean_y)^2,
+                    norm_r2= 1 - sum(residual2) / sum(squares2),
+                    norm_mae= median(abs(log2_dose- log2_normalized_n))) %>% ungroup()
+  }
+
+  # Filter for just the control barcodes, create a profile_id for faceting,
+  # and determine the x and y positions for the r2 + mae label.
+  cb_trend= normalized_counts %>% dplyr::filter(!is.na(Name), control_barcodes %in% c("Y", "T", T)) %>%
+    tidyr::unite(all_of(id_cols), col= 'profile_id', sep= ':', remove= TRUE) %>%
+    dplyr::group_by(profile_id) %>% dplyr::mutate(label_x_pos= min(.data[[counts_col]]),
+                                                  label_y_pos= max(log2_dose)) %>% dplyr::ungroup()
+
+  # Create control barcode trend plot
+  trend_scatter_plot= cb_trend %>% ggplot(aes(x= .data[[counts_col]], y= log2_dose)) +
+    geom_point() +
+    geom_abline(aes(slope=1, intercept= cb_intercept), color='blue', alpha= 0.5) +
+    geom_text(aes(x= label_x_pos, y= label_y_pos,
+                  label= paste0('r2= ', round(norm_r2, 4), '\nmae= ', round(norm_mae, 4))),
+              hjust='inward', vjust='inward', alpha= 0.5) +
+    facet_wrap(~profile_id, scales='free_x') +
+    labs(title= 'Linear fit of control barcodes across all samples') + theme_bw()
+
+  return(trend_scatter_plot)
+}
+
+
+#' Heatmap of correlations
+#'
+#' Computes pairwise correlations across samples and plots them as a heatmap.
+#'
+#' @param input_df Dataframe containing the columns named in row_id_cols, col_id_cols, and counts_col.
+#' @param row_id_cols Vector of column names used as the rows of the pivoted matrix.
+#' @param col_id_cols Vector of column names used as the columns of the pivoted matrix.
+#' @param counts_col String name of the column containing the values to correlate.
+#' @param cor_method Correlation method. Defaults to "pearson".
+plot_cor_heatmap= function(input_df, row_id_cols, col_id_cols, counts_col,
+                           cor_method= 'pearson') {
+
+  # Validate that specified columns are in the dataframe.
+  if(!validate_columns_exist(c(row_id_cols, col_id_cols, counts_col), input_df)) {
+    stop('Not all columns were detected in the input dataframe.')
+  }
+
+  # Create row and column names for pivoting to a matrix
+  correlation_mx= input_df %>%
+    tidyr::unite(all_of(row_id_cols), col= 'row_id', sep= ':', remove= TRUE) %>%
+    tidyr::unite(all_of(col_id_cols), col= 'col_id', sep= ':', remove= TRUE)
+
+  # Check that the row and column ids specify one value.
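+  # For example, with row_id_cols= 'DepMap_ID' and col_id_cols= c('pcr_plate', 'pcr_well')
+  # (hypothetical choices), a duplicate here usually means input_df still holds several
+  # rows per cell line and well, e.g. uncollapsed replicates that should be summed first.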
+  validate_ids= correlation_mx %>% dplyr::group_by(row_id, col_id) %>%
+    dplyr::filter(dplyr::n() > 1) %>% dplyr::ungroup()
+  if(nrow(validate_ids) != 0) {
+    print('The provided columns specify more than one value.')
+    print(head(validate_ids))
+    stop('Multiple values detected for a unique combination of "row_id_cols" and "col_id_cols".')
+  }
+
+  # Pivot and calculate correlations
+  correlation_mx= correlation_mx %>% reshape2::acast(row_id~col_id, value.var= counts_col) %>%
+    WGCNA::cor(use= 'pairwise.complete.obs', method= cor_method)
+
+  # Create heatmap
+  cor_heatmap= correlation_mx %>% reshape2::melt() %>%
+    ggplot(aes(x= Var1, y= Var2, fill= value)) +
+    geom_tile() +
+    labs(x= '', y= '', fill= '', title= paste0('Correlations using ', counts_col)) +
+    scale_fill_gradientn(breaks= c(0, 0.5, 1),
+                         colours= c('blue', 'white','red'),
+                         limits=c(0, 1), oob= scales::squish) +
+    theme(axis.text.x = element_text(angle=70, hjust=1))
+
+  return(cor_heatmap)
+}
+
+#' Scatterplots of two replicates
+#'
+#' @param input_df Dataframe containing one row per cell line, replicate group, and replicate.
+#' @param cell_line_cols Vector of column names that identify a unique cell line or barcode.
+#' @param replicate_group_cols Vector of column names that identify a group of replicates.
+#' @param replicate_col String name of the column that distinguishes replicates within a group.
+#' @param values_col String name of the column containing the values to plot.
+#' @param x_axis_rep Value of replicate_col plotted on the x axis. Defaults to "1".
+#' @param y_axis_rep Value of replicate_col plotted on the y axis. Defaults to "2".
+make_replicate_scatterplots= function(input_df, cell_line_cols, replicate_group_cols, replicate_col, values_col,
+                                      x_axis_rep= '1', y_axis_rep= '2') {
+  reps_piv= input_df %>%
+    tidyr::unite(all_of(replicate_group_cols), col= 'replicate_group', sep= ':', remove= TRUE, na.rm= FALSE) %>%
+    dplyr::group_by(pick(all_of(c(cell_line_cols, 'replicate_group')))) %>%
+    dplyr::filter(n!= 0, dplyr::n() >= 2, !is.na(.data[[replicate_col]]), .data[[replicate_col]] != '') %>%
+    dplyr::ungroup()
+
+  # Return a NULL object if no entries pass the filter.
+  if(nrow(reps_piv) == 0) {return(NULL)}
+
+  reps_piv= reps_piv %>%
+    pivot_wider(id_cols= all_of(c(cell_line_cols, 'replicate_group')),
+                names_from= replicate_col, names_prefix= replicate_col, values_from= values_col) %>%
+    dplyr::mutate(type= ifelse(!is.na(CCLE_name), "cell line", "control barcode")) %>% dplyr::ungroup()
+
+  # Create names of the columns to plot on xy axes
+  x_col_name= paste0(replicate_col, x_axis_rep)
+  y_col_name= paste0(replicate_col, y_axis_rep)
+
+  reps_scatter= reps_piv %>% dplyr::filter(!is.na(.data[[x_col_name]]), !is.na(.data[[y_col_name]])) %>%
+    ggplot(aes(x= .data[[x_col_name]], y= .data[[y_col_name]])) +
+    geom_point(aes(color= type), alpha=0.75) +
+    geom_smooth(method='lm', se=F, color='black', linewidth=0.5, linetype=2) +
+    ggpmisc::stat_correlation(mapping = use_label(c("R2", "n")))+
+    facet_wrap(~replicate_group, scales= 'free') +
+    labs(x= paste0(replicate_col, x_axis_rep, ' ', values_col), y= paste0(replicate_col, y_axis_rep, ' ', values_col)) +
+    theme_bw()
+
+  return(reps_scatter)
+}
+
 #' QC_images
 #'
 #' Takes in the metadata, raw counts, annotated counts, and normalized counts to generate some QC images.
@@ -53,7 +300,6 @@
 #' @param reverse_index2 reverse index 2 if newer sequencers are used.
 #' @return - NA, QC images are written out to the specified folder
 #' @export
-
 QC_images = function(raw_counts, annotated_counts, normalized_counts= NA,
                      sample_meta, CB_meta, cell_set_meta,
                      id_cols, sig_cols, count_col_name= 'normalized_n',
                      control_type, count_threshold= 40,
@@ -98,6 +344,13 @@ QC_images = function(raw_counts, annotated_counts, normalized_counts= NA,
   #
   # Sequencing QCs ____________________ ----
+  ## Purity metrics ----
+  # Write out the index purity and cell line purity.
+  print('Generating QC table')
+  create_qc_table(raw_counts_uncollapsed, raw_counts, filtered_counts,
+                  counts_col= 'n', file_path= paste0(out, '/QC_table.csv'))
+
   ## Index count summaries ----
   print("Generating index counts tables")
   # Check that "IndexBarcode1" and "index_1" columns are present.
@@ -121,46 +374,21 @@ QC_images = function(raw_counts, annotated_counts, normalized_counts= NA,
   ## Total counts ----
   print("generating total_counts image")
-  total_counts= annotated_counts %>% dplyr::filter(expected_read) %>%
-    mutate(barcode_type = ifelse(!is.na(CCLE_name), "cell line", "control barcode"),
-           sample_id= paste(pcr_plate, pcr_well, sep='_')) %>%
-    group_by(pick(all_of(c('pcr_plate', 'pcr_well', 'sample_id', id_cols, 'barcode_type')))) %>%
-    dplyr::summarise(total_counts = sum(n))
-
-  tc= total_counts %>% ggplot() +
-    geom_col(aes(x=sample_id, y=total_counts, fill=barcode_type), alpha=0.75, position='identity') +
-    geom_hline(yintercept= 10^4, linetype=2) +
-    facet_wrap(~pcr_plate, scale= 'free_x') +
-    labs(x="PCR location", y="total counts", fill="", title= 'Raw counts - unstacked') + theme_bw() +
-    theme(axis.text.x = element_text(angle=70, hjust=1, size=5))
+
+  tc= plot_total_counts(filtered_counts, id_cols, facet_col= 'pcr_plate')
 
   pdf(file=paste(out, "total_counts.pdf", sep="/"),
       width=sqrt(num_profiles)*2, height=sqrt(num_profiles))
   print(tc)
   dev.off()
-  rm(total_counts, tc)
+  rm(tc)
 
   # Assay QCs _________________________ ----
   ## Cell lines recovered ----
   print("generating cell_lines_present image")
-  recovery= annotated_counts %>% dplyr::filter(expected_read, !is.na(CCLE_name)) %>%
-    dplyr::select(!any_of(c('members'))) %>%
-    dplyr::left_join(cell_set_meta, by="cell_set", relationship= 'many-to-one') %>%
-    dplyr::mutate(members= if_else(is.na(members), cell_set, members), # for custom cell sets
-           expected_num_cl= as.character(members) %>% purrr::map(strsplit, ";") %>%
-             purrr::map(`[[`, 1) %>% purrr::map(length) %>% as.numeric(),
-           count_type= ifelse(n > count_threshold, 'Detected', 'Low'),
-           count_type= ifelse(n==0, 'Missing', count_type)) %>%
-    dplyr::count(pick(all_of(c('pcr_plate', 'pcr_well', id_cols, 'count_type', 'expected_num_cl'))), name= 'count') %>%
-    dplyr::mutate(frac_type= count/expected_num_cl)
-
-  cl_rec= recovery %>% tidyr::unite(all_of(id_cols), col= 'profile_id', sep= ':', remove=FALSE) %>%
-    ggplot() +
-    geom_col(aes(x=profile_id, y=frac_type*100, fill= reorder(count_type, dplyr::desc(count_type)))) +
-    facet_wrap(~pcr_plate, scales= 'free_x') +
-    labs(x="", y="Percentage of expected cell lines", fill= '') +
-    theme_bw() +
-    theme(axis.text.x = element_text(angle=70, hjust=1, size=5))
+
+  cl_rec= plot_cl_recovery(filtered_counts, id_cols= id_cols, facet_col= 'pcr_plate',
+                           counts_threshold= count_threshold, plot_type= 'percent')
 
   pdf(file=paste(out, "cell_lines_present.pdf", sep="/"),
       width=sqrt(num_profiles)*2, height=sqrt(num_profiles))
@@ -291,32 +519,7 @@ QC_images = function(raw_counts, annotated_counts, normalized_counts= NA,
 
   if(contains_cbs & is.data.frame(normalized_counts)) {
     print("generating control_barcode_trend image")
-    # calculate r2 and mae if
columns do not exist
-    if (!'norm_r2' %in% colnames(normalized_counts) | !'norm_mae' %in% colnames(normalized_counts)) {
-      cb_trend= normalized_counts %>%
-        dplyr::filter(control_barcodes %in% c("Y", "T", T),
-                      !(trt_type %in% c("empty", "", "CB_only")) & !is.na(trt_type),
-                      !is.na(Name)) %>%
-        tidyr::unite(all_of(id_cols), col= 'profile_id', sep= ':', remove= TRUE) %>%
-        dplyr::group_by(profile_id) %>%
-        dplyr::mutate(mean_y= mean(log2_dose),
-                      residual2= (log2_dose - log2_normalized_n)^2,
-                      squares2= (log2_dose - mean_y)^2,
-                      norm_r2= 1 - sum(residual2)/sum(squares2),
-                      norm_mae= median(abs(log2_dose- log2_normalized_n))) %>% ungroup()
-    } else {
-      cb_trend= normalized_counts %>% dplyr::filter(!is.na(Name)) %>%
-        tidyr::unite(all_of(id_cols), col= 'profile_id', sep= ':', remove= TRUE)
-    }
-
-    trend_sc= cb_trend %>% dplyr::mutate(profile_id= reorder(profile_id, dplyr::desc(norm_mae))) %>%
-      ggplot(aes(x=log2_n, y=log2_dose)) + geom_point() +
-      geom_abline(aes(slope=1, intercept= cb_intercept) , color='blue') +
-      geom_text(aes(x= min(log2_n), y= dplyr::desc(sort(unique(log2_dose)))[1],
-                    label= paste('r2=', round(norm_r2, 4), '\nmae=', round(norm_mae, 4), sep='')),
-                hjust='inward', vjust='inward') +
-      facet_wrap(~profile_id, scales= 'free_x') +
-      labs(x= 'log2(n)', y= 'log2(dose)') + theme_bw()
+    trend_sc= plot_ctrl_bc_trend(normalized_counts, id_cols, counts_col= 'log2_n')
 
     pdf(file=paste(out, "control_barcode_trend.pdf", sep="/"),
         width=sqrt(num_profiles)*2, height=sqrt(num_profiles)*2)
@@ -327,21 +530,15 @@ QC_images = function(raw_counts, annotated_counts, normalized_counts= NA,
 
   ## Sample correlation -----
   print("generating sample_cor image")
-  correlation_matrix= annotated_counts %>%
-    dplyr::filter(expected_read, !is.na(CCLE_name),
-                  (!trt_type %in% c("empty", "", "CB_only")) & !is.na(trt_type)) %>%
-    tidyr::unite(all_of(id_cols), col= 'profile_id', sep= ':', remove= TRUE) %>%
-    dplyr::mutate(log2_n = log2(n +1)) %>%
-    reshape2::dcast(CCLE_name~profile_id, value.var="log2_n") %>%
-    column_to_rownames("CCLE_name") %>%
-    cor(use="pairwise.complete.obs")
-
-  cp= correlation_matrix %>% reshape2::melt() %>%
-    ggplot() + geom_tile(aes(x=Var1, y=Var2, fill=value)) +
-    labs(x="", y="", fill="correlation") +
-    scale_fill_gradient(low="yellow", high="red") +
-    theme(axis.text.x = element_text(angle=70, hjust=1, size=5),
-          axis.text.y = element_text(size=5))
+
+  cor_df= filtered_counts %>%
+    dplyr::filter(!is.na(DepMap_ID), !is.na(trt_type), !trt_type %in% c("empty", "", "CB_only")) %>%
+    dplyr::mutate(log2_n= log2(n + 1))
+  cp= plot_cor_heatmap(input_df= cor_df,
+                       row_id_cols= c('DepMap_ID'),
+                       col_id_cols= c(sig_cols, id_cols),
+                       counts_col= 'log2_n')
+
   pdf(file=paste(out, "sample_cor.pdf", sep="/"),
       width=sqrt(num_profiles)*2, height=sqrt(num_profiles)*2)
   print(cp)
   dev.off()
   rm(cor_df, cp)
 
   ## Tech rep correlations ----
-  # assumes that tech reps are the last component of profile_id
-  if('tech_rep' %in% colnames(normalized_counts)) {
-    if(max(unique(normalized_counts$tech_rep), na.rm= TRUE) == 2) {
-      print("generating tech rep correlations image")
-
-      static_cols= c('project_code', 'CCLE_name', 'DepMap_ID', 'Name', 'cell_set')
-      tech_reps_piv= normalized_counts %>% dplyr::mutate(bio_rep_id= str_replace(profile_id, ':\\d+$', '')) %>%
-        dplyr::group_by_at(c(static_cols, 'bio_rep_id')) %>% dplyr::filter(n!=0, n()==2) %>% dplyr::ungroup() %>%
-        pivot_wider(id_cols= all_of(c(static_cols,
'bio_rep_id')), - names_from= tech_rep, names_prefix= 'tech_rep', values_from= log2_n) %>% - dplyr::group_by(bio_rep_id) %>% - dplyr::mutate(r2= cor(tech_rep1, tech_rep2, use='p')^2, - type= ifelse(!is.na(CCLE_name), "cell line", "control barcode")) %>% dplyr::ungroup() + if(is.data.frame(normalized_counts)) { + if('tech_rep' %in% colnames(normalized_counts)) { + # Set up replicate groups depending "bio_rep" column + if('bio_rep' %in% colnames(normalized_counts) & !'bio_rep' %in% sig_cols) { + replicate_group_cols= c(sig_cols, 'bio_rep') + } else { + replicate_group_cols= sig_cols + } - tech_reps_plt= tech_reps_piv %>% dplyr::mutate(bio_rep_id= reorder(bio_rep_id, r2)) %>% - ggplot(aes(x= tech_rep1, y= tech_rep2)) + - geom_point(aes(color= type), alpha=0.75) + - geom_smooth(method='lm', se=F, color='black', linewidth=0.5, linetype=2) + - stat_correlation(mapping = use_label(c("R2", "n")))+ - facet_wrap(~bio_rep_id, scales= 'free') + - labs(x="tech rep 1 log2(n)", y="tech rep 2 log2(n)") + theme_bw() + # Handle cases if control barcodes are used. + if('Name' %in% colnames(normalized_counts)) { + unique_cell_line_cols= c(cell_line_cols, 'Name') + } else { + unique_cell_line_cols= cell_line_cols + } - pdf(file=paste(out, "tech_reps_plt.pdf", sep="/"), - width=sqrt(num_profiles), height=sqrt(num_profiles)) - print(tech_reps_plt) - dev.off() + # Create replicate scatter plot + print("generating tech rep correlations image") + tech_reps_plt= make_replicate_scatterplots(input_df= normalized_counts, + cell_line_cols= unique_cell_line_cols, + replicate_group_cols= replicate_group_cols, + replicate_col= 'tech_rep', + values_col= 'log2_n') + + if(!is.null(tech_reps_plt)) { + pdf(file=paste(out, "tech_reps_plt.pdf", sep="/"), + width=sqrt(num_profiles), height=sqrt(num_profiles)) + print(tech_reps_plt) + dev.off() + } else { + print('No technical replicates detected - skipping plot.') + } } } - ## Bio rep correlations ---- if('bio_rep' %in% colnames(normalized_counts)) { num_bio_reps= normalized_counts %>% From 102a06b3b3e57d0f35832fcbb2bba94e3157c6ca Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Mon, 19 Aug 2024 08:49:49 -0400 Subject: [PATCH 002/127] Mapped reads to id cols --- scripts/src/collate_fastq_reads.R | 83 +++++++++++++++++++++++++------ 1 file changed, 68 insertions(+), 15 deletions(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index e6b75298..2ea5fd75 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -59,8 +59,8 @@ validate_unique_samples= function(selected_columns, df) { #' @param detected_flowcells A dataframe with the columns "flowcell_name" and "flowcell_lane". #' @param expected_flowcells A dataframe with the columns "flowcell_name" and "flowcell_lane". 
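#' @examples
#' # A hedged, hypothetical usage sketch - both inputs hold distinct
#' # flowcell_name/flowcell_lane rows:
#' # validate_detected_flowcells(detected_flowcells= data.frame(flowcell_name= 'FC1', flowcell_lane= 1),
#' #                             expected_flowcells= data.frame(flowcell_name= 'FC1', flowcell_lane= 1))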
validate_detected_flowcells= function(detected_flowcells, expected_flowcells) {
-  missing_flowcells= expected_flowcells %>%
-    dplyr::anti_join(detected_flowcells, by= c('flowcell_name', 'flowcell_lane'))
+  missing_flowcells= expected_flowcells %>% dplyr::anti_join(detected_flowcells, by= c('flowcell_name', 'flowcell_lane'))
+
   if(nrow(missing_flowcells) != 0) {
     print('The following flowcells/lanes specified in the sample meta were not detected in the fastq reads.')
     print(missing_flowcells)
@@ -84,19 +84,42 @@ validate_detected_flowcells= function(detected_flowcells, expected_flowcells)
 #' This defaults onto the following columns: "index_1", "index_2"
 #' @returns Returns a dataframe with columns specified by the sequencing_index_cols, "forward_read_cl_barcode", and "n".
 #' @import tidyverse
-collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, sequencing_index_cols= c('index_1', 'index_2')) {
+collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta,
+                              sequencing_index_cols= c('index_1', 'index_2'),
+                              id_cols= c('pcr_plate', 'pcr_well'),
+                              reverse_index2= FALSE,
+                              barcode_col= 'forward_read_cl_barcode') {
   require(tidyverse)
 
+  # Reverse index 2 if specified ----
+  if(reverse_index2) {
+    if('index_2' %in% colnames(sample_meta)) {
+      print("Reverse-complementing index 2 barcode ...")
+      sample_meta$index_2= chartr("ATGC", "TACG", stringi::stri_reverse(sample_meta$index_2))
+    } else {
+      stop('Reverse index 2 is set to TRUE, but index_2 does not exist.')
+    }
+  }
+
   # Validation: Check that flowcell_names and flowcell_lanes exist in the sample meta ----
   if(!validate_columns_exist(c('flowcell_names', 'flowcell_lanes'), sample_meta)) {
-    stop('flowcell_names and/or flowcell_lanes is NOT present in the sample meta.')
+    stop('flowcell_names and/or flowcell_lanes are NOT present in the sample meta.')
   }
 
   # Validation: Check that sequencing_index_cols exist in the sample meta ----
   if(!validate_columns_exist(sequencing_index_cols, sample_meta)) {
+    print('The following sequencing_index_cols are not present in the sample meta.')
+    print(sequencing_index_cols[!sequencing_index_cols %in% colnames(sample_meta)])
     stop('One or more sequencing_index_cols is NOT present in the sample meta.')
   }
 
+  # Validation: Check that id_cols exist in the sample meta ----
+  if(!validate_columns_exist(id_cols, sample_meta)) {
+    print('The following id_cols are not present in the sample meta.')
+    print(id_cols[!id_cols %in% colnames(sample_meta)])
+    stop('One or more id_cols is NOT present in the sample meta.')
+  }
+
   # Validation: Check that sequencing_index_cols in the sample meta are filled out ----
   # Check for rows in sequencing_index_cols that equate to empty - NA, "NA", "", " "
   # Error out if the sequencing_index_cols are not filled out in the sample meta.
     stop('One or more sequencing_index_cols in the sample meta is not filled out.')
   }
 
-  # Validation: Check that sequencing_index_cols uniquely identify rows of sample meta ----
-  if(!validate_unique_samples(sequencing_index_cols, sample_meta)) {
-    print('There may be multiple entries in the sample meta that have the same combination of sequencing index columns.')
-    stop('The specified sequencing index columns do NOT uniquely identify every PCR well.')
-  }
-
   # Determine which flowcell names + lanes are expected ----
   # "flowcell_names" and "flowcell_lanes" are strings that can contain more than one item.
 # Columns can be parsed by splitting on the chars , ; :
@@ -139,13 +156,49 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, sequencing_in
   print(detected_flowcells)
   validate_detected_flowcells(detected_flowcells, expected_flowcells)
 
-  # Create raw counts by summing over appropriate sequencing_index_cols ----
-  # Use an inner join to collect reads with valid flowcell name/lane combinations,
-  # then sum reads across sequencing columns
+  # Validation: Check that sequencing_index_cols uniquely identify rows of sample meta ----
+  if(!validate_unique_samples(sequencing_index_cols, sample_meta)) {
+    print('There may be multiple entries in the sample meta that have the same combination of sequencing index columns.')
+    stop('The specified sequencing index columns do NOT uniquely identify every PCR well.')
+  }
+
+  # Validation: Check that id_cols uniquely identify rows of sample meta ----
+  if(!validate_unique_samples(id_cols, sample_meta)) {
+    print('There may be multiple entries in the sample meta that have the same combination of ID columns.')
+    stop('The specified ID columns do NOT uniquely identify every PCR well.')
+  }
+
+  # Create sequence map ----
+  sequencing_map= sample_meta %>% dplyr::distinct(pick(all_of(c(sequencing_index_cols, id_cols))))
+
+  # Validation: Check that mapping is one to one ----
+  check_mapping= sequencing_map %>% dplyr::group_by(pick(all_of(sequencing_index_cols))) %>%
+    dplyr::filter(dplyr::n() > 1) %>% dplyr::ungroup()
+  if(nrow(check_mapping) > 0) {
+    print('The following sequencing locations map to multiple conditions.')
+    print(check_mapping)
+    stop('The sequencing index columns do not map 1 to 1 to the ID columns.')
+  }
+
+  # Create raw counts file ----
+  # Filter for the expected flowcells and sum up the reads over the ID cols.
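+  # In the pipeline below, semi_join only keeps reads whose flowcell name/lane pair is
+  # expected (without duplicating rows), and the inner_join then attaches the id_cols,
+  # e.g. a read keyed by (index_1, index_2) gains its (pcr_plate, pcr_well) from sequencing_map.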
+ print('Summing up reads ...') raw_counts= uncollapsed_raw_counts %>% - dplyr::inner_join(expected_flowcells, by= c('flowcell_name', 'flowcell_lane'), relationship= 'many-to-one') %>% - dplyr::group_by(pick(all_of(c(sequencing_index_cols, 'forward_read_cl_barcode')))) %>% + dplyr::semi_join(expected_flowcells, by= c('flowcell_name', 'flowcell_lane')) %>% + dplyr::inner_join(sequencing_map, by= intersect(colnames(.), colnames(sequencing_map)), relationship= 'many-to-one') %>% + dplyr::group_by(pick(all_of(c(id_cols, barcode_col)))) %>% dplyr::summarize(n= sum(n)) %>% dplyr::ungroup() + # Calculate index purity ---- + index_purity= sum(raw_counts$n) / sum(uncollapsed_raw_counts$n) + print(paste0('Index purity: ', round(index_purity, 4))) + if(index_purity > 1) { + stop('ERROR: Index purity is greater than 1!') + } + if(index_purity < 0.5) { + print('Warning: Low index purity!') + } + + print('Done!') return(raw_counts) } From df5e36ac45dd6b051370583e72d5d3d2003300b0 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Mon, 19 Aug 2024 09:04:17 -0400 Subject: [PATCH 003/127] Removed sequencing_index_cols Raw counts is now assumed to contain the columns specified in id_cols --- scripts/src/filter_raw_reads.R | 71 +++++++++++++++------------------- 1 file changed, 32 insertions(+), 39 deletions(-) diff --git a/scripts/src/filter_raw_reads.R b/scripts/src/filter_raw_reads.R index ad4b7e22..8e663bc0 100755 --- a/scripts/src/filter_raw_reads.R +++ b/scripts/src/filter_raw_reads.R @@ -26,6 +26,13 @@ validate_columns_exist= function(selected_columns, df) { validate_unique_samples= function(selected_columns, df) { unique_column_values= df %>% dplyr::distinct(pick(all_of(selected_columns))) if(nrow(unique_column_values) != nrow(df)) { + print('The selected columns do not uniquely identify all rows.') + + dups= df %>% dplyr::group_by(pick(all_of(selected_columns))) %>% + dplyr::filter(dplyr::n() > 1) %>% dplyr::ungroup() %>% + dplyr::arrange(pick(all_of(selected_columns))) + print(dups) + return(FALSE) } else { return(TRUE) @@ -93,7 +100,7 @@ validate_cell_set_luas= function(sample_meta, cell_set_meta) { #' @export filter_raw_reads = function(raw_counts, sample_meta, cell_line_meta, cell_set_meta, CB_meta, - sequencing_index_cols= c('index_1', 'index_2'), + id_cols= c('pcr_plate', 'pcr_well'), reverse_index2= FALSE, count_threshold= 40) { require(tidyverse) @@ -101,10 +108,12 @@ filter_raw_reads = function(raw_counts, # Processing metadata and inputs ---- # CB meta is in log10 and should be converted to log2. 
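  # As a quick check of the conversion used below: log2(x) = log10(x) / log10(2),
  # so a log_dose of 1 (i.e. 10 units) becomes a log2_dose of about 3.32.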
- print("Converting CB_meta from log10 to log2 ...") - CB_meta= CB_meta %>% dplyr::mutate(log2_dose= log_dose/log10(2)) %>% dplyr::select(-log_dose) + if('log_dose' %in% colnames(CB_meta)) { + print("Converting CB_meta from log10 to log2 ...") + CB_meta= CB_meta %>% dplyr::mutate(log2_dose= log_dose/log10(2)) %>% dplyr::select(-log_dose) + } - if (reverse_index2) { + if(reverse_index2) { if ('index_2' %in% colnames(sample_meta)) { print("Reverse-complementing index 2 barcode ...") sample_meta$index_2 <- chartr("ATGC", "TACG", stringi::stri_reverse(sample_meta$index_2)) @@ -113,15 +122,15 @@ filter_raw_reads = function(raw_counts, } } - # Validation: Check that sequencing_index_cols exist in the sample meta ---- - if(!validate_columns_exist(sequencing_index_cols, sample_meta)) { - stop('One or more sequencing_index_cols is NOT present in the sample meta.') + # Validation: Check that id_cols exist in the sample meta ---- + if(!validate_columns_exist(id_cols, sample_meta)) { + stop('One or more id_cols is NOT present in the sample meta.') } - # Validation: Check that sequencing_index_cols uniquely identify every rows of sample meta ---- - if(!validate_unique_samples(sequencing_index_cols, sample_meta)) { - print('There may be multiple entries in the sample meta that have the same combination of sequencing index columns.') - stop('The specified sequencing index columns do NOT uniquely identify every PCR well.') + # Validation: Check that id_cols uniquely identify every rows of sample meta ---- + if(!validate_unique_samples(id_cols, sample_meta)) { + print('There may be multiple entries in the sample meta that have the same combination of id_cols.') + stop('The specified ID columns do NOT uniquely identify every PCR well.') } # Validation: Check that cell sets do not contain duplicate LUAs ---- @@ -129,24 +138,6 @@ filter_raw_reads = function(raw_counts, # This currently does NOT result in an error. Error avoided using a distinct later in line 162 validate_cell_set_luas(sample_meta, cell_set_meta) - # Filtering by sequencing columns ---- - # Filter raw counts using the sequencing columns. - # Also create "mapped" column to identify reads that mapped to all known PRISM sequences. - print("Filtering by sequencing columns ...") - unique_sequencing_index_vals= sample_meta %>% dplyr::distinct(pick(all_of(sequencing_index_cols))) - index_filtered= raw_counts %>% dplyr::semi_join(unique_sequencing_index_vals, by= sequencing_index_cols) %>% - dplyr::mutate(mapped= forward_read_cl_barcode %in% c(cell_line_meta$Sequence, CB_meta$Sequence)) - - # Calculate index purity for QC table. - index_purity= sum(index_filtered$n)/ sum(raw_counts$n) - - # Split off unmapped reads ---- - # Unmapped reads are defined as having valid indices but do not map to barcodes in PRISM. - # Also sorted reads in descending order by read count. - print('Splitting off unmapped reads ...') - unmapped_reads= index_filtered %>% dplyr::filter(mapped==F) %>% dplyr::select(-mapped) %>% - dplyr::arrange(dplyr::desc(n)) - # Creating a template of all expected reads in the run ---- # Use all 4 meta data files to create a "template" dataframe where # every row is a cell line that is expected in a PCR well. @@ -174,12 +165,12 @@ filter_raw_reads = function(raw_counts, # Reads that to not match to the template are contaminants and, # reads that are only present in the template are missing/not detected by PCR. 
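  # By construction of the full_join below: rows found in both the counts and the
  # template are expected reads, count-only rows get expected_read= NA (unexpected),
  # and template-only rows get n= NA (treated as missing downstream).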
print("Annotating reads ...") - annotated_counts= index_filtered %>% dplyr::filter(mapped) %>% + annotated_counts= raw_counts %>% dplyr::left_join(cell_line_meta, by= join_by('forward_read_cl_barcode'=='Sequence'), relationship= 'many-to-one') %>% dplyr::left_join(CB_meta, by= join_by('forward_read_cl_barcode'=='Sequence'), relationship= 'many-to-one') %>% - dplyr::left_join(sample_meta, by= sequencing_index_cols, relationship= 'many-to-one') %>% + dplyr::left_join(sample_meta, by= id_cols, relationship= 'many-to-one') %>% dplyr::full_join(template %>% dplyr::mutate(expected_read= T), by= c('forward_read_cl_barcode'='Sequence', intersect(colnames(template), colnames(.))), relationship= 'one-to-one') %>% @@ -197,15 +188,17 @@ filter_raw_reads = function(raw_counts, dplyr::mutate(flag= ifelse(n==0, 'Missing', NA), flag= ifelse(n!=0 & n < count_threshold, 'low counts', flag)) - # Calculate cell line purity for the QC table. - cell_line_purity= sum(filtered_counts$n)/ sum(index_filtered$n) - - # Generating QC table ---- - print('Generating QC table ...') - qc_table= data.frame(index_purity= index_purity, cell_line_purity= cell_line_purity) + # Calculate cell line purity ---- + cell_line_purity= sum(filtered_counts$n)/ sum(raw_counts$n) + print(paste0('Cell line purity: ', round(cell_line_purity, 4))) + if(cell_line_purity > 1) { + stop('ERROR: Cell line purity is greater than 1!') + } + if(cell_line_purity < 0.5) { + print('Warning: Low cell line purity!') + } - return(list(unmapped_reads= unmapped_reads, annotated_counts= annotated_counts, - filtered_counts= filtered_counts, qc_table= qc_table)) + return(list(annotated_counts= annotated_counts, filtered_counts= filtered_counts)) } # checks is a string can be numeric From 2ba40b10c551e569facd63d956147eb8862ff9f6 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Mon, 19 Aug 2024 09:04:36 -0400 Subject: [PATCH 004/127] Updated script to reflect function changes --- scripts/collate_fastq_reads.R | 39 ++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/scripts/collate_fastq_reads.R b/scripts/collate_fastq_reads.R index d6cf1bd5..29f4d994 100644 --- a/scripts/collate_fastq_reads.R +++ b/scripts/collate_fastq_reads.R @@ -13,6 +13,12 @@ parser$add_argument("-c", "--uncollapsed_raw_counts", default="raw_counts_uncoll parser$add_argument("--sample_meta", default="sample_meta.csv", help = "Sample metadata") parser$add_argument("--sequencing_index_cols", default= "index_1,index_2", help = "Sequencing columns in the sample meta") +parser$add_argument("--id_cols", default= "pcr_plate,pcr_well", + help = "Columns that identify a unique PCR well") +parser$add_argument("--reverse_index2", action="store_true", default=FALSE, + help= "Reverse complement of index 2 for NovaSeq and NextSeq") +parser$add_argument("--barcode_col", default= "forward_read_cl_barcode", + help= "Name of the column in uncollapsed_raw_counts that contains the barcode sequences.") parser$add_argument("-o", "--out", default=getwd(), help = "Output path. 
Default is working directory") # get command line options, if help option encountered print help and exit @@ -27,27 +33,40 @@ if (args$out == "") { expected_file_path <- paste(args$out, "raw_counts_uncollapsed.csv", sep='/') if(file.exists(expected_file_path)) { - sample_meta= data.table::fread(args$sample_meta, header= T, sep= ',', data.table= F) + # Read in files and parse vector arguments uncollapsed_raw_counts= data.table::fread(expected_file_path, header= T, sep= ',', data.table= F) + sample_meta= data.table::fread(args$sample_meta, header= T, sep= ',', data.table= F) + sequencing_index_cols= unlist(strsplit(args$sequencing_index_cols, ",")) + id_cols= unlist(strsplit(args$id_cols, ",")) + + # Validation: Check that sequencing_index_cols are from sample meta column names + if(!all(sequencing_index_cols %in% colnames(sample_meta))) { + stop(paste('The following sequencing_index_cols were not found in the sample meta: ', + sequencing_index_cols[!sequencing_index_cols %in% colnames(sample_meta)])) + } - # Validation: Check if sequencing_index_cols is composed of sample meta column names - if (!all(sequencing_index_cols %in% colnames(sample_meta))) { - stop(paste("Colnames not found in sample_meta, check metadata or --sequencing_index_cols argument:", - args$sequencing_index_cols)) + # Validation: Check that id_cols are from sample meta column names + if(!all(id_cols %in% colnames(sample_meta))) { + stop(paste('The following id_cols were not found in the sample meta: ', + id_cols[!id_cols %in% colnames(sample_meta)])) } - print("Collating fastq reads") - raw_counts= collate_fastq_reads(uncollapsed_raw_counts, sample_meta, sequencing_index_cols) + print("Collating fastq reads ...") + raw_counts= collate_fastq_reads(uncollapsed_raw_counts, sample_meta, + sequencing_index_cols, + id_cols, + reverse_index2= args$reverse_index2, + barcode_col= args$barcode_col) # Validation: Basic file size check if(nrow(raw_counts) == 0) { stop('ERROR: Empty file generated. No rows in raw_counts output.') } - rc_out_file = paste(args$out, 'raw_counts.csv', sep='/') - print(paste("Writing to file: ", rc_out_file)) - write.csv(raw_counts, rc_out_file, row.names=F, quote=T) + rc_out_file= paste(args$out, 'raw_counts.csv', sep='/') + print(paste("Writing raw_counts.csv to ", rc_out_file)) + write.csv(raw_counts, rc_out_file, row.names= FALSE, quote= FALSE) } else { print("Uncollapsed raw counts file not detected. Proceeding with generating filtered counts file.") } From 66d58f511a99eb05e8f9eb8c3e51dd0a88b23a39 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Mon, 19 Aug 2024 14:52:20 -0400 Subject: [PATCH 005/127] Wrapped qcs in trycatches --- scripts/src/QC_images.R | 600 +++++++++++++++++++++++----------------- 1 file changed, 353 insertions(+), 247 deletions(-) diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R index 29ef4d87..d5bc5994 100755 --- a/scripts/src/QC_images.R +++ b/scripts/src/QC_images.R @@ -6,9 +6,9 @@ #' @param selected_columns A vector of strings each representing a column name #' @param df A dataframe to check against #' @return Boolean -validate_columns_exist= function(selected_columns, df) { - # Check that all of selected_columns are in df - unmatched_cols= setdiff(selected_columns, colnames(df)) +validate_columns_exist= function(selected_cols, df) { + # Check that all of selected_columns are in df - base::setdiff(A, B) = A[!A %in% B]. 
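+  # For example, base::setdiff(c('index_1', 'index_2'), colnames(df)) returns 'index_2'
+  # when df only carries index_1, so unmatched_cols lists exactly the missing columns.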
+ unmatched_cols= base::setdiff(selected_cols, colnames(df)) if(length(unmatched_cols) > 0) { print('The following columns are missing: ') @@ -55,13 +55,13 @@ get_index_summary= function(df, index_col, valid_indices) { #' @param raw_counts_uncollapsed Dataframe output from nori. #' @param raw_counts Raw counts dataframe outputed from collate_fastq_reads. #' @param filtered_counts Filtered counts dataframe outputed from filter_raw_reads. -#' @param counts_col String name of the counts column in all three dataframes. +#' @param value_col String name of the counts column in all three dataframes. #' @param file_path Location to write out the output. create_qc_table= function(raw_counts_uncollapsed, raw_counts, filtered_counts, - counts_col= 'n', file_path) { - index_purity= sum(raw_counts[[counts_col]]) / sum(raw_counts_uncollapsed[[counts_col]]) + value_col= 'n', file_path) { + index_purity= sum(raw_counts[[value_col]]) / sum(raw_counts_uncollapsed[[value_col]]) print(paste0('Index purity: ', round(index_purity, 4))) - cell_line_purity= sum(filtered_counts[[counts_col]]) / sum(raw_counts[[counts_col]]) + cell_line_purity= sum(filtered_counts[[value_col]]) / sum(raw_counts[[value_col]]) print(paste0('Cell line purity: ', round(cell_line_purity, 4))) qc_table= data.frame(index_purity= index_purity, cell_line_purity= cell_line_purity) @@ -78,7 +78,7 @@ create_qc_table= function(raw_counts_uncollapsed, raw_counts, filtered_counts, #' @param filtered_counts Filtered counts dataframe. #' @param id_cols Vector of columns names that identify each sample. #' @param facet_col String name of the column in filtered_counts to facet the plot. -plot_total_counts= function(filtered_counts, id_cols, facet_col= NA) { +create_total_counts_barplot= function(filtered_counts, id_cols, facet_col= NA) { total_counts= filtered_counts %>% dplyr::mutate(barcode_type= case_when(!is.na(CCLE_name) ~ 'cell line', !is.na(Name) ~ 'ctrl barcode')) %>% @@ -106,12 +106,12 @@ plot_total_counts= function(filtered_counts, id_cols, facet_col= NA) { #' @param filtered_counts Filtered counts dataframe. #' @param id_cols Vector of column names that identify each sample. #' @param facet_col String name of the column in filtered_counts to facet the plot. -#' @param counts_col String name of the column in filtered_counts that contains the counts. +#' @param value_col String name of the column in filtered_counts that contains the counts. #' @param counts_threshold Threshold used to determine low counts. #' @param plot_type description #' @param include_ctrl_bcs description -plot_cl_recovery= function(filtered_counts, id_cols, facet_col= NA, counts_col= 'n', counts_threshold, - plot_type= 'percent', include_ctrl_bcs= FALSE) { +create_recovery_barplot= function(filtered_counts, id_cols, facet_col= NA, value_col= 'n', count_threshold, + plot_type= 'percent', include_ctrl_bcs= FALSE) { # Filter out control barcodes if it is specified. if(include_ctrl_bcs == FALSE) { @@ -121,9 +121,9 @@ plot_cl_recovery= function(filtered_counts, id_cols, facet_col= NA, counts_col= # Count number of cell lines/ barcodes for a detection group. 
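  # In the pipeline below, add_count() first attaches the per-sample barcode total
  # (total_num_cls), count() then collapses to one row per sample and detect_type, so
  # percent= 100 * num_cls_by_type / total_num_cls sums to 100 within each sample.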
recovery= filtered_counts %>% dplyr::add_count(pick(all_of(id_cols)), name= 'total_num_cls') %>% - dplyr::mutate(detect_type= case_when(.data[[counts_col]] == 0 ~ 'Not detected', - .data[[counts_col]] <= counts_threshold ~ 'Low counts', - .data[[counts_col]] > counts_threshold ~ 'Detected')) %>% + dplyr::mutate(detect_type= case_when(.data[[value_col]] == 0 ~ 'Not detected', + .data[[value_col]] <= count_threshold ~ 'Low counts', + .data[[value_col]] > count_threshold ~ 'Detected')) %>% dplyr::count(pick(all_of(c(id_cols, facet_col, 'detect_type', 'total_num_cls'))), name= 'num_cls_by_type') %>% tidyr::unite(all_of(id_cols), col= 'sample_id', sep= ':', remove= FALSE, na.rm= FALSE) %>% dplyr::mutate(percent= (num_cls_by_type / total_num_cls) * 100) @@ -157,7 +157,7 @@ plot_cl_recovery= function(filtered_counts, id_cols, facet_col= NA, counts_col= #' text #' #' @param name description -plot_ctrl_bc_trend= function(normalized_counts, id_cols, counts_col= 'log2_n') { +create_ctrlBC_scatterplots= function(normalized_counts, id_cols, value_col= 'log2_n') { # Detect norm_r2 and norm_mae. # If columns do not exist, then roughly calculate those columns. if(any(!c('norm_r2', 'norm_mae') %in% colnames(normalized_counts))) { @@ -178,11 +178,11 @@ plot_ctrl_bc_trend= function(normalized_counts, id_cols, counts_col= 'log2_n') { # and determine the x and y positions for the r2 + mae label. cb_trend= normalized_counts %>% dplyr::filter(!is.na(Name), control_barcodes %in% c("Y", "T", T)) %>% tidyr::unite(all_of(id_cols), col= 'profile_id', sep= ':', remove= TRUE) %>% - dplyr::group_by(profile_id) %>% dplyr::mutate(label_x_pos= min(.data[[counts_col]]), + dplyr::group_by(profile_id) %>% dplyr::mutate(label_x_pos= min(.data[[value_col]]), label_y_pos= max(log2_dose)) %>% dplyr::ungroup() # Create control barcode trend plot - trend_scatter_plot= cb_trend %>% ggplot(aes(x= .data[[counts_col]], y= log2_dose)) + + trend_scatter_plot= cb_trend %>% ggplot(aes(x= .data[[value_col]], y= log2_dose)) + geom_point() + geom_abline(aes(slope=1, intercept= cb_intercept) , color='blue', alpha= 0.5) + geom_text(aes(x= label_x_pos, y= label_y_pos, @@ -200,11 +200,11 @@ plot_ctrl_bc_trend= function(normalized_counts, id_cols, counts_col= 'log2_n') { #' text #' #' @param input_df description -plot_cor_heatmap= function(input_df, row_id_cols, col_id_cols, counts_col, +create_cor_heatmap= function(input_df, row_id_cols, col_id_cols, value_col, cor_method= 'pearson') { # Validate that specified columns are in the dataframe. 
- if(!validate_columns_exist(c(row_id_cols, col_id_cols, counts_col), input_df)) { + if(!validate_columns_exist(c(row_id_cols, col_id_cols, value_col), input_df)) { stop('Not all columns were detected in the input dataframe.') } @@ -223,14 +223,14 @@ plot_cor_heatmap= function(input_df, row_id_cols, col_id_cols, counts_col, } # Pivot and calculate correlations - correlation_mx= correlation_mx %>% reshape2::acast(row_id~col_id, value.var= counts_col) %>% + correlation_mx= correlation_mx %>% reshape2::acast(row_id~col_id, value.var= value_col) %>% WGCNA::cor(use= 'pairwise.complete.obs', method= cor_method) # Create heatmap cor_heatmap= correlation_mx %>% reshape2::melt() %>% ggplot(aes(x= Var1, y= Var2, fill= value)) + geom_tile() + - labs(x= '', y= '', fill= '', title= paste0('Correlations using ', counts_col)) + + labs(x= '', y= '', fill= '', title= paste0('Correlations using ', value_col)) + scale_fill_gradientn(breaks= c(0, 0.5, 1), colours= c('blue', 'white','red'), limits=c(0, 1), oob= scales::squish) + @@ -242,12 +242,13 @@ plot_cor_heatmap= function(input_df, row_id_cols, col_id_cols, counts_col, #' Scatterplots of two replicates #' #' @param input_df description -make_replicate_scatterplots= function(input_df, cell_line_cols, replicate_group_cols, replicate_col, values_col, +create_replicate_scatterplots= function(input_df, cell_line_cols, replicate_group_cols, replicate_col, value_col, x_axis_rep= '1', y_axis_rep= '2') { reps_piv= input_df %>% tidyr::unite(all_of(replicate_group_cols), col= 'replicate_group', sep= ':', remove= TRUE, na.rm= FALSE) %>% dplyr::group_by(pick(all_of(c(cell_line_cols, 'replicate_group')))) %>% - dplyr::filter(n!= 0, dplyr::n() >= 2, !is.na(.data[[replicate_col]]), .data[[replicate_col]] != '') %>% + dplyr::filter(!is.na(.data[[replicate_col]]), .data[[replicate_col]] != '', .data[[value_col]] != 0, + dplyr::n() >= 2) %>% dplyr::ungroup() # Retun a null object if no entries pass the filter. @@ -255,7 +256,7 @@ make_replicate_scatterplots= function(input_df, cell_line_cols, replicate_group_ reps_piv= reps_piv %>% pivot_wider(id_cols= all_of(c(cell_line_cols, 'replicate_group')), - names_from= replicate_col, names_prefix= replicate_col, values_from= values_col) %>% + names_from= replicate_col, names_prefix= replicate_col, values_from= value_col) %>% dplyr::mutate(type= ifelse(!is.na(CCLE_name), "cell line", "control barcode")) %>% dplyr::ungroup() # Create names of the columns to plot on xy axes @@ -264,11 +265,12 @@ make_replicate_scatterplots= function(input_df, cell_line_cols, replicate_group_ reps_scatter= reps_piv %>% dplyr::filter(!is.na(.data[[x_col_name]]), !is.na(.data[[y_col_name]])) %>% ggplot(aes(x= .data[[x_col_name]], y= .data[[y_col_name]])) + + geom_abline(color='black', linewidth=0.5, linetype=2) + geom_point(aes(color= type), alpha=0.75) + - geom_smooth(method='lm', se=F, color='black', linewidth=0.5, linetype=2) + ggpmisc::stat_correlation(mapping = use_label(c("R2", "n")))+ facet_wrap(~replicate_group, scales= 'free') + - labs(x= paste0(replicate_col, '1 ', values_col), y= paste0(replicate_col, '2 ', values_col)) + + labs(x= paste0(replicate_col, '1 ', value_col), y= paste0(replicate_col, '2 ', value_col), + title= paste0('Scatter plot of ', replicate_col, ' with ', value_col)) + theme_bw() return(reps_scatter) @@ -300,7 +302,8 @@ make_replicate_scatterplots= function(input_df, cell_line_cols, replicate_group_ #' @param reverse_index2 reverse index 2 if newer sequencers are used. 
#' @return - NA, QC images are written out to the specified folder #' @export -QC_images = function(raw_counts, annotated_counts, normalized_counts= NA, +QC_images = function(raw_counts_uncollapsed, raw_counts, + filtered_counts, normalized_counts= NA, l2fc, sample_meta, CB_meta, cell_set_meta, id_cols, sig_cols, count_col_name= 'normalized_n', control_type, count_threshold= 40, @@ -315,6 +318,7 @@ QC_images = function(raw_counts, annotated_counts, normalized_counts= NA, } # Some preprocessing ---- + skipped_qcs= c() # empty vector to collect potential errors num_profiles = annotated_counts %>% dplyr::distinct(pick(all_of(id_cols))) %>% nrow() # Reverse index 2 barcodes @@ -346,208 +350,286 @@ QC_images = function(raw_counts, annotated_counts, normalized_counts= NA, # Sequencing QCs ____________________ ---- ## Purity metrics ---- # call this function - print('Generating QC table') + print('1. Generating QC table ...') create_qc_table(raw_counts_uncollapsed, raw_counts, filtered_counts, - counts_col= 'n', file_path) + value_col= 'n', file_path= paste0(out, '/QC_table.csv')) # ## Index count summaries ---- - print("Generating index counts tables") + print("2. Generating index counts tables ...") # Check that "IndexBarcode1" and "index_1" columns are present. # If so, calculate index summary and write out. - if('index_1' %in% colnames(sample_meta) & 'index_1' %in% colnames(raw_counts)) { + if('index_1' %in% colnames(sample_meta) & 'index_1' %in% colnames(raw_counts_uncollapsed)) { expected_index1= unique(sample_meta$index_1) - index1_counts= get_index_summary(raw_counts, 'index_1', expected_index1) + index1_counts= get_index_summary(raw_counts_uncollapsed, 'index_1', expected_index1) index1_counts %>% write.csv(file= paste(out, 'index1_counts.csv', sep='/'), row.names=F) } else { - print('Column "index_1" not detected. Skipping index 1 summaries ...') + print('Column "index_1" not detected. Skipping index 1 summaries ...', quote= FALSE) } # Do the same for index 2. - if('index_2' %in% colnames(sample_meta) & 'index_2' %in% colnames(raw_counts)) { + if('index_2' %in% colnames(sample_meta) & 'index_2' %in% colnames(raw_counts_uncollapsed)) { expected_index2= unique(sample_meta$index_2) - index2_counts= get_index_summary(raw_counts, 'index_2', expected_index2) + index2_counts= get_index_summary(raw_counts_uncollapsed, 'index_2', expected_index2) index2_counts %>% write.csv(file= paste(out, 'index2_counts.csv', sep='/'), row.names=F) } else { - print('Column "index_2" not detected. Skipping index 2 summaries ...') + print('Column "index_2" not detected. Skipping index 2 summaries ...', quote= FALSE) } ## Total counts ---- - print("generating total_counts image") - - tc= plot_total_counts(filtered_counts, id_cols, facet_col= 'pcr_plate') - - pdf(file=paste(out, "total_counts.pdf", sep="/"), - width=sqrt(num_profiles)*2, height=sqrt(num_profiles)) - print(tc) - dev.off() - rm(tc) + print("3. Generating total_counts image ...") + potential_error= base::tryCatch({ + tc= create_total_counts_barplot(filtered_counts, id_cols, facet_col= 'pcr_plate') + + pdf(file=paste(out, "total_counts.pdf", sep="/"), + width=sqrt(num_profiles)*2, height=sqrt(num_profiles)) + print(tc) + dev.off() + rm(tc) + }, error= function(e) { + print(e) + print('Encountered an error when creating the total counts barplot. 
Skipping this output ...') + return('QC table') + }) + + # Collect returned string if an error occurred + if(!is.null(potential_error)) { + skipped_qcs = c(skipped_qcs, potential_error) + } # Assay QCs _________________________ ---- ## Cell lines recovered ---- - print("generating cell_lines_present image") - - cl_rec= plot_cl_recovery(filtered_counts, id_cols= id_cols, facet_col= 'pcr_plate', - counts_threshold= counts_threshold, plot_type= 'percent') - - pdf(file=paste(out, "cell_lines_present.pdf", sep="/"), - width=sqrt(num_profiles)*2, height=sqrt(num_profiles)) - print(cl_rec) - dev.off() - rm(recovery, cl_rec) + print("4. Generating cell_lines_present image ...") + potential_error= base::tryCatch({ + cl_rec= create_recovery_barplot(filtered_counts, id_cols= id_cols, facet_col= 'pcr_plate', + count_threshold= count_threshold, plot_type= 'percent') + + pdf(file=paste(out, "cell_lines_present.pdf", sep="/"), + width=sqrt(num_profiles)*2, height=sqrt(num_profiles)) + print(cl_rec) + dev.off() + rm(cl_rec) + }, error= function(e) { + print(e) + print('Encountered an error when creating the recovery barplot. Skipping this output ...') + return('CL recovery') + }) + + # Collect returned string if an error occurred + if(!is.null(potential_error)) { + skipped_qcs = c(skipped_qcs, potential_error) + } ## Contaminants ---- - print('generating contaminate cell lines') - contams= annotated_counts %>% dplyr::filter(expected_read==F) %>% - dplyr::mutate(barcode_id= ifelse(is.na(CCLE_name), Name, CCLE_name)) %>% - dplyr::group_by(forward_read_cl_barcode, barcode_id) %>% - dplyr::summarise(num_wells= n(), median_n=median(n), max_n= max(n)) %>% ungroup() %>% - dplyr::arrange(desc(num_wells)) - - contams %>% write.csv(file= paste(out, 'contam_cell_lines.csv', sep='/'), row.names=F) - rm(contams) - - # Contaminates for ursula ---- - print('generating contaminate reads for Ursula') - # Determine which seq cols are present. 
- rc_seq_cols= c('flowcell_names', 'flowcell_lanes', 'index_1', 'index_2') - present_seq_cols= intersect(rc_seq_cols, colnames(raw_counts)) - - # map of seq_cols to PCR locations - pcr_plate_map= sample_meta %>% - dplyr::distinct(pick(any_of(c(present_seq_cols, 'pcr_plate', 'pcr_well', 'cell_set')))) %>% - dplyr::group_by(pcr_plate) %>% dplyr::mutate(num_wells_in_plate= dplyr::n()) %>% dplyr::ungroup() %>% - dplyr::group_by(cell_set) %>% dplyr::mutate(num_wells_in_set= dplyr::n()) %>% dplyr::ungroup() - - # index filter and identify reads as mapped or not - unique_seq_col_vals= sample_meta %>% dplyr::distinct(pick(all_of(present_seq_cols))) - sequencing_filter= raw_counts %>% - dplyr::semi_join(unique_seq_col_vals, by= present_seq_cols) %>% - dplyr::mutate(mapped= ifelse(forward_read_cl_barcode %in% unique(annotated_counts$forward_read_cl_barcode), T, F)) - - # total counts per well - used to calculate fractions - counts_per_well= sequencing_filter %>% dplyr::group_by(pick(all_of(present_seq_cols))) %>% - dplyr::summarise(well_total_n= sum(n)) %>% dplyr::ungroup() - - # mapped contaminates to bind - mapped_contams= annotated_counts %>% dplyr::filter(!expected_read) %>% - dplyr::mutate(barcode_name= ifelse(is.na(CCLE_name), Name, CCLE_name)) %>% - dplyr::select(all_of(c(present_seq_cols, 'forward_read_cl_barcode', 'n', 'barcode_name'))) - - contam_reads= sequencing_filter %>% dplyr::filter(mapped == FALSE) %>% dplyr::select(-mapped) %>% - dplyr::bind_rows(mapped_contams) %>% - dplyr::left_join(counts_per_well, by= present_seq_cols) %>% - dplyr::left_join(pcr_plate_map, by= present_seq_cols) %>% - # filter out barcodes that only appear in one well - dplyr::group_by(forward_read_cl_barcode) %>% dplyr::filter(dplyr::n() >1) %>% dplyr::ungroup() %>% - # number of wells in a pcr plate a barcode is detected in - dplyr::group_by(forward_read_cl_barcode, pcr_plate) %>% - dplyr::mutate(num_wells_detected_plate= n()) %>% dplyr::ungroup() %>% - # number of wells in a cell set a barcode is detected in - dplyr::group_by(forward_read_cl_barcode, cell_set) %>% - dplyr::mutate(num_wells_detected_set= n()) %>% dplyr::ungroup() %>% - # determine if contamination is project, plate, or set - dplyr::group_by(forward_read_cl_barcode) %>% - dplyr::mutate(num_wells_detected= dplyr::n(), - project_code= unique(sample_meta$project_code), - fraction= n/well_total_n, - type1= ifelse(sum(num_wells_detected== nrow(pcr_plate_map))>1, 'project_contam', NA), - type2= ifelse(sum(num_wells_detected== num_wells_detected_plate & - num_wells_detected_plate == num_wells_in_plate)>1, 'plate_contam', NA), - type3= ifelse(sum(num_wells_detected == num_wells_detected_set & - num_wells_detected_set== num_wells_in_set)>1, 'set_contam', NA)) %>% - dplyr::ungroup() %>% - tidyr::unite(scope, all_of(c('type1', 'type2', 'type3')), sep=',', remove = T, na.rm = T) %>% - dplyr::group_by(project_code, forward_read_cl_barcode, barcode_name, scope, num_wells_detected) %>% - dplyr::summarise(min_n= min(n), med_n= median(n), max_n= max(n), - min_fraction= min(fraction), med_fraction= median(fraction), max_fraction=max(fraction)) %>% - dplyr::arrange(desc(max_fraction)) - - # write out - contam_reads %>% write.csv(paste0(out, 'contam_reads.csv'), row.names=F) + print('5. 
Generating contaminant cell lines ...')
+  potential_error= base::tryCatch({
+    contams= annotated_counts %>% dplyr::filter(expected_read==F) %>%
+      dplyr::mutate(barcode_id= ifelse(is.na(CCLE_name), Name, CCLE_name)) %>%
+      dplyr::group_by(forward_read_cl_barcode, barcode_id) %>%
+      dplyr::summarise(num_wells= n(), median_n=median(n), max_n= max(n)) %>% ungroup() %>%
+      dplyr::arrange(desc(num_wells))
+
+    contams %>% write.csv(file= paste(out, 'contam_cell_lines.csv', sep='/'), row.names=F)
+    rm(contams)
+  }, error= function(e) {
+    print(e)
+    print('Encountered an error when creating the contaminants file. Skipping this output ...')
+    return('contam_cell_lines.csv')
+  })
+
+  # Collect returned string if an error occurred
+  if(!is.null(potential_error)) {
+    skipped_qcs = c(skipped_qcs, potential_error)
+  }
+
+  ## Contaminants for Ursula ----
+  print('6. Generating contaminant reads for Ursula ...')
+  potential_error= base::tryCatch({
+    # Determine which seq cols are present.
+    rc_seq_cols= c('flowcell_names', 'flowcell_lanes', 'index_1', 'index_2')
+    present_seq_cols= intersect(rc_seq_cols, colnames(raw_counts))
+
+    # map of seq_cols to PCR locations
+    pcr_plate_map= sample_meta %>%
+      dplyr::distinct(pick(any_of(c(present_seq_cols, 'pcr_plate', 'pcr_well', 'cell_set')))) %>%
+      dplyr::group_by(pcr_plate) %>% dplyr::mutate(num_wells_in_plate= dplyr::n()) %>% dplyr::ungroup() %>%
+      dplyr::group_by(cell_set) %>% dplyr::mutate(num_wells_in_set= dplyr::n()) %>% dplyr::ungroup()
+
+    # index filter and identify reads as mapped or not
+    unique_seq_col_vals= sample_meta %>% dplyr::distinct(pick(all_of(present_seq_cols)))
+    sequencing_filter= raw_counts %>%
+      dplyr::semi_join(unique_seq_col_vals, by= present_seq_cols) %>%
+      dplyr::mutate(mapped= ifelse(forward_read_cl_barcode %in% unique(annotated_counts$forward_read_cl_barcode), T, F))
+
+    # total counts per well - used to calculate fractions
+    counts_per_well= sequencing_filter %>% dplyr::group_by(pick(all_of(present_seq_cols))) %>%
+      dplyr::summarise(well_total_n= sum(n)) %>% dplyr::ungroup()
+
+    # mapped contaminants to bind
+    mapped_contams= annotated_counts %>% dplyr::filter(!expected_read) %>%
+      dplyr::mutate(barcode_name= ifelse(is.na(CCLE_name), Name, CCLE_name)) %>%
+      dplyr::select(all_of(c(present_seq_cols, 'forward_read_cl_barcode', 'n', 'barcode_name')))
+
+    contam_reads= sequencing_filter %>% dplyr::filter(mapped == FALSE) %>% dplyr::select(-mapped) %>%
+      dplyr::bind_rows(mapped_contams) %>%
+      dplyr::left_join(counts_per_well, by= present_seq_cols) %>%
+      dplyr::left_join(pcr_plate_map, by= present_seq_cols) %>%
+      # filter out barcodes that only appear in one well
+      dplyr::group_by(forward_read_cl_barcode) %>% dplyr::filter(dplyr::n() >1) %>% dplyr::ungroup() %>%
+      # number of wells in a pcr plate a barcode is detected in
+      dplyr::group_by(forward_read_cl_barcode, pcr_plate) %>%
+      dplyr::mutate(num_wells_detected_plate= n()) %>% dplyr::ungroup() %>%
+      # number of wells in a cell set a barcode is detected in
+      dplyr::group_by(forward_read_cl_barcode, cell_set) %>%
+      dplyr::mutate(num_wells_detected_set= n()) %>% dplyr::ungroup() %>%
+      # determine if contamination is project, plate, or set
+      dplyr::group_by(forward_read_cl_barcode) %>%
+      dplyr::mutate(num_wells_detected= dplyr::n(),
+                    project_code= unique(sample_meta$project_code),
+                    fraction= n/well_total_n,
+                    type1= ifelse(sum(num_wells_detected== nrow(pcr_plate_map))>1, 'project_contam', NA),
+                    type2= ifelse(sum(num_wells_detected== num_wells_detected_plate &
+                                        num_wells_detected_plate == 
num_wells_in_plate)>1, 'plate_contam', NA), + type3= ifelse(sum(num_wells_detected == num_wells_detected_set & + num_wells_detected_set== num_wells_in_set)>1, 'set_contam', NA)) %>% + dplyr::ungroup() %>% + tidyr::unite(scope, all_of(c('type1', 'type2', 'type3')), sep=',', remove = T, na.rm = T) %>% + dplyr::group_by(project_code, forward_read_cl_barcode, barcode_name, scope, num_wells_detected) %>% + dplyr::summarise(min_n= min(n), med_n= median(n), max_n= max(n), + min_fraction= min(fraction), med_fraction= median(fraction), max_fraction=max(fraction)) %>% + dplyr::arrange(desc(max_fraction)) + + # write out + contam_reads %>% write.csv(paste0(out, 'contam_reads.csv'), row.names=F) + }, error= function(e) { + print(e) + print('Encountered an error when creating the contams for UW file. Skipping this output ...') + return('contam for UW') + }) + + # Collect returned string if an error occurred + if(!is.null(potential_error)) { + skipped_qcs = c(skipped_qcs, potential_error) + } ## Cumulative counts by lines in negcons ---- - print("generating cummulative image") - cdf= annotated_counts %>% dplyr::select(!any_of(c('members'))) %>% - dplyr::filter(expected_read, trt_type == control_type) %>% - dplyr::left_join(num_cls_in_set, by= "cell_set") %>% - dplyr::mutate(expected_num_cl= ifelse(control_barcodes, expected_num_cl + length(unique(CB_meta$Name)), - expected_num_cl)) %>% # add CBs to expected_num_cl if there are CBs - tidyr::unite(all_of(id_cols), col= 'profile_id', sep= ':', remove= FALSE) %>% - dplyr::group_by(pcr_plate, pcr_well, profile_id, expected_num_cl) %>% - dplyr::mutate(total_counts= sum(n), pct_counts= n/total_counts,) %>% dplyr::arrange(-n) %>% - dplyr::mutate(cum_pct= cumsum(pct_counts), rank= row_number(), - rank_pct= rank/expected_num_cl) %>% dplyr::ungroup() - - # additional tables - mark50= cdf %>% dplyr::filter(cum_pct >= 0.5) %>% dplyr::group_by(profile_id) %>% - arrange(cum_pct) %>% dplyr::filter(row_number()==1) %>% ungroup() %>% - dplyr::select(profile_id, rank_pct= rank_pct, num50= rank, num50_loc= rank_pct) - mark95= cdf %>% dplyr::group_by(profile_id) %>% - dplyr::mutate(auc= sum(cum_pct*(1/expected_num_cl))) %>% # calculate AUCs - dplyr::filter(cum_pct >= 0.95) %>% - arrange(cum_pct) %>% dplyr::filter(row_number() ==1) %>% ungroup() %>% - dplyr::select(profile_id, rank_pct= rank_pct, num95= rank, num95_loc= rank_pct, auc) - - cdf_plot= cdf %>% - merge(mark50, by= c('profile_id', 'rank_pct'), all.x=T) %>% - merge(mark95, by= c('profile_id', 'rank_pct'), all.x= T) %>% - ggplot(aes(x= rank_pct, y=cum_pct)) + - { if(contains_cbs) geom_point(. 
%>% dplyr::filter(!is.na(Name),
-                                 mapping=aes(x= rank_pct, y=cum_pct, color=reorder(Name, log2_dose)), size=3) } +
-    geom_line(color='black') +
-    # point for 50% of counts
-    geom_segment(aes(x= -Inf , y= .50, xend = num50_loc, yend = .50), color= 'black', linetype='dashed') +
-    geom_segment(aes(x= num50_loc, y= -Inf, xend = num50_loc, yend = .50), color= 'black', linetype='dashed') +
-    geom_label(aes(x=num50_loc, y= .25, label= num50), hjust= 0, color= 'black') +
-    # point for 95% of counts
-    geom_segment(aes(x= -Inf , y= .95, xend = num95_loc, yend = .95), color= 'black', linetype='dashed') +
-    geom_segment(aes(x= num95_loc, y= -Inf, xend = num95_loc, yend = .95), color= 'black', linetype='dashed') +
-    geom_label(aes(x=num95_loc, y= .75, label= num95), hjust= 0, color= 'black') +
-    # label for AUC
-    geom_label(aes(x=num95_loc, y= .25, label= paste0('AUC ', round(auc,3))), hjust= 'inward', color= 'black') +
-    facet_wrap(~profile_id) +
-    labs(x='% rank of unique reads', y='Cumulative percentage', color= 'CBs') + theme_bw()
-
-  pdf(file=paste(out, "cdf_plot.pdf", sep="/"),
-      width=sqrt(num_profiles)*2, height=sqrt(num_profiles))
-  print(cdf_plot)
-  dev.off()
-  rm(cdf, mark50, mark95, cdf_plot)
+  print("7. Generating cumulative image ...")
+  potential_error= base::tryCatch({
+    cdf= filtered_counts %>% dplyr::filter(trt_type == control_type) %>%
+      dplyr::left_join(num_cls_in_set, by= "cell_set") %>%
+      dplyr::mutate(expected_num_cl= ifelse(control_barcodes, expected_num_cl + length(unique(CB_meta$Name)),
+                                            expected_num_cl)) %>% # add CBs to expected_num_cl if there are CBs
+      tidyr::unite(all_of(id_cols), col= 'profile_id', sep= ':', remove= FALSE) %>%
+      dplyr::group_by(pcr_plate, pcr_well, profile_id, expected_num_cl) %>%
+      dplyr::mutate(total_counts= sum(n), pct_counts= n/total_counts) %>% dplyr::arrange(-n) %>%
+      dplyr::mutate(cum_pct= cumsum(pct_counts), rank= row_number(),
+                    rank_pct= rank/expected_num_cl) %>% dplyr::ungroup()
+
+    # additional tables
+    mark50= cdf %>% dplyr::filter(cum_pct >= 0.5) %>% dplyr::group_by(profile_id) %>%
+      arrange(cum_pct) %>% dplyr::filter(row_number()==1) %>% ungroup() %>%
+      dplyr::select(profile_id, rank_pct= rank_pct, num50= rank, num50_loc= rank_pct)
+    mark95= cdf %>% dplyr::group_by(profile_id) %>%
+      dplyr::mutate(auc= sum(cum_pct*(1/expected_num_cl))) %>% # calculate AUCs
+      dplyr::filter(cum_pct >= 0.95) %>%
+      arrange(cum_pct) %>% dplyr::filter(row_number() ==1) %>% ungroup() %>%
+      dplyr::select(profile_id, rank_pct= rank_pct, num95= rank, num95_loc= rank_pct, auc)
+
+    cdf_plot= cdf %>%
+      merge(mark50, by= c('profile_id', 'rank_pct'), all.x=T) %>%
+      merge(mark95, by= c('profile_id', 'rank_pct'), all.x= T) %>%
+      ggplot(aes(x= rank_pct, y=cum_pct)) +
+      { if(contains_cbs) geom_point(. %>% dplyr::filter(!is.na(Name)),
+                                    mapping=aes(x= rank_pct, y=cum_pct, color=reorder(Name, log2_dose)), size=3) } +
+      geom_line(color='black') +
+      # point for 50% of counts
+      geom_segment(aes(x= -Inf , y= .50, xend = num50_loc, yend = .50), color= 'black', linetype='dashed') +
+      geom_segment(aes(x= num50_loc, y= -Inf, xend = num50_loc, yend = .50), color= 'black', linetype='dashed') +
+      geom_label(aes(x=num50_loc, y= .25, label= num50), hjust= 0, color= 'black') +
+      # point for 95% of counts
+      geom_segment(aes(x= -Inf , y= .95, xend = num95_loc, yend = .95), color= 'black', linetype='dashed') +
+      geom_segment(aes(x= num95_loc, y= -Inf, xend = num95_loc, yend = .95), color= 'black', linetype='dashed') +
+      geom_label(aes(x=num95_loc, y= .75, label= num95), hjust= 0, color= 'black') +
+      # label for AUC
+      geom_label(aes(x=num95_loc, y= .25, label= paste0('AUC ', round(auc,3))), hjust= 'inward', color= 'black') +
+      facet_wrap(~profile_id) +
+      labs(x='% rank of unique reads', y='Cumulative percentage', color= 'CBs') + theme_bw()
+
+    pdf(file=paste(out, "cdf_plot.pdf", sep="/"),
+        width=sqrt(num_profiles)*2, height=sqrt(num_profiles))
+    print(cdf_plot)
+    dev.off()
+    rm(cdf, mark50, mark95, cdf_plot)
+  }, error= function(e) {
+    print(e)
+    print('Encountered an error when creating the cdf plot. Skipping this output ...')
+    return('cdf plot')
+  })
+
+  # Collect returned string if an error occurred
+  if(!is.null(potential_error)) {
+    skipped_qcs = c(skipped_qcs, potential_error)
+  }
 
   ## Control barcode trends ----
   if(contains_cbs & is.data.frame(normalized_counts)) {
-    print("generating control_barcode_trend image")
+    print("8. Generating control_barcode_trend image ...")
+    potential_error= base::tryCatch({
+      trend_sc= create_ctrlBC_scatterplots(normalized_counts, id_cols, value_col= 'log2_n')
+
+      pdf(file=paste(out, "control_barcode_trend.pdf", sep="/"),
+          width=sqrt(num_profiles)*2, height=sqrt(num_profiles)*2)
+      print(trend_sc)
+      dev.off()
+      rm(trend_sc)
+    }, error= function(e) {
+      print(e)
+      print('Encountered an error when creating the CB trends plot. Skipping this output ...')
+      return('cb trend')
+    })
 
-    trend_sc= plot_ctrl_bc_trend(normalized_counts, id_cols, counts_col= 'log2_n')
+    # Collect returned string if an error occurred
+    if(!is.null(potential_error)) {
+      skipped_qcs = c(skipped_qcs, potential_error)
+    }
+  } else {
+    print('8. No control barcodes detected. Skipping control_barcode_trend image.')
+  }
+
+  ## Sample correlation -----
+  print("9. Generating sample_cor image ...")
+  potential_error= base::tryCatch({
+    cor_df= filtered_counts %>%
+      dplyr::filter(!is.na(DepMap_ID), !is.na(trt_type), !trt_type %in% c("empty", "", "CB_only")) %>%
+      dplyr::mutate(log2_n= log2(n + 1))
+    cp= create_cor_heatmap(input_df= cor_df,
+                           row_id_cols= c('DepMap_ID'),
+                           col_id_cols= c(sig_cols, id_cols),
+                           value_col= 'log2_n')
 
-    pdf(file=paste(out, "control_barcode_trend.pdf", sep="/"),
+    pdf(file=paste(out, "sample_cor.pdf", sep="/"),
         width=sqrt(num_profiles)*2, height=sqrt(num_profiles)*2)
-    print(trend_sc)
+    print(cp)
     dev.off()
-    rm(cb_trend, trend_sc)
+    rm(cp)
+  }, error= function(e) {
+    print(e)
+    print('Encountered an error when creating the sample_cor figure. Skipping this output ...')
+    return('sample_cor')
+  })
+
+  # Collect returned string if an error occurred
+  if(!is.null(potential_error)) {
+    skipped_qcs = c(skipped_qcs, potential_error)
   }
 
-  ## Sample correlation -----
-  print("generating sample_cor image")
-
-  cor_df= filtered_counts %>%
-    dplyr::filter(!is.na(DepMap_ID), !is.na(trt_type), !trt_type %in% c("empty", "", "CB_only")) %>%
-    dplyr::mutate(log2_n= log2(n + 1))
-  cp= plot_cor_heatmap(input_df= cor_df,
-                       row_id_cols= c('DepMap_ID'),
-                       col_id_cols= c(sig_cols, id_cols),
-                       counts_col= 'log2_n')
-
-  pdf(file=paste(out, "sample_cor.pdf", sep="/"),
-      width=sqrt(num_profiles)*2, height=sqrt(num_profiles)*2)
-  print(cp)
-  dev.off()
-  rm(correlation_matrix, cp)
-
   ## Tech rep correlations ----
-  if(is.data.frame(normalized_counts)) {
-    if('tech_rep' %in% colnames(normalized_counts)) {
+  if(is.data.frame(normalized_counts) & 'tech_rep' %in% colnames(normalized_counts)) {
+    # Check if there are at least two tech reps
+    unique_tech_reps= na.omit(unique(normalized_counts$tech_rep))
+
+    if(length(unique_tech_reps) >= 2) {
+      print("10. Generating tech rep correlations image ...")
       # Set up replicate groups depending on the "bio_rep" column
       if('bio_rep' %in% colnames(normalized_counts) & !'bio_rep' %in% sig_cols) {
         replicate_group_cols= c(sig_cols, 'bio_rep')
@@ -563,71 +645,95 @@ QC_images = function(raw_counts, annotated_counts, normalized_counts= NA,
       }
 
       # Create replicate scatter plot
-      print("generating tech rep correlations image")
-      tech_reps_plt= make_replicate_scatterplots(input_df= normalized_counts,
-                                                 cell_line_cols= unique_cell_line_cols,
-                                                 replicate_group_cols= replicate_group_cols,
-                                                 replicate_col= 'tech_rep',
-                                                 values_col= 'log2_n')
+      potential_error= base::tryCatch({
+        tech_reps_plt= create_replicate_scatterplots(input_df= normalized_counts,
+                                                     cell_line_cols= unique_cell_line_cols,
+                                                     replicate_group_cols= replicate_group_cols,
+                                                     replicate_col= 'tech_rep',
+                                                     value_col= 'log2_n')
+        if(!is.null(tech_reps_plt)) {
+          pdf(file=paste(out, "tech_reps_plt.pdf", sep="/"),
+              width=sqrt(num_profiles), height=sqrt(num_profiles))
+          print(tech_reps_plt)
+          dev.off()
+        } else {
+          print('No technical replicates detected - skipping plot.')
+        }
+      }, error= function(e) {
+        print(e)
+        print('Encountered an error when creating the tech_reps_plt figure. Skipping this output ...')
+        return('tech_reps_plt')
+      })
 
-      if(!is.null(tech_reps_plt)) {
-        pdf(file=paste(out, "tech_reps_plt.pdf", sep="/"),
-            width=sqrt(num_profiles), height=sqrt(num_profiles))
-        print(tech_reps_plt)
-        dev.off()
-      } else {
-        print('No technical replicates detected - skipping plot.')
+      # Collect returned string if an error occurred
+      if(!is.null(potential_error)) {
+        skipped_qcs = c(skipped_qcs, potential_error)
       }
+
+    } else {
+      print('10. No technical replicates detected. Skipping tech_reps scatter plot.')
     }
+  } else {
+    print('10. No technical replicates detected. 
Skipping tech_reps scatter plot.') } + ## Bio rep correlations ---- - if('bio_rep' %in% colnames(normalized_counts)) { - num_bio_reps= normalized_counts %>% - dplyr::filter((!trt_type %in% c("empty", "", "CB_only")) & !is.na(trt_type)) %>% - dplyr::pull(bio_rep) %>% unique() %>% length() + if('bio_rep' %in% colnames(l2fc)) { + unique_bio_reps= na.omit(unique(l2fc$bio_rep)) - if(num_bio_reps > 1) { - print("generating bio rep correlations image") + if(length(unique_bio_reps) >= 2) { + l2fc_with_log2= l2fc %>% dplyr::mutate(log2_mean_normalized_n= log2(mean_normalized_n)) - if('bio_rep' %in% colnames(normalized_counts)) { - bio_rep_id_cols= c(sig_cols, 'bio_rep') - } else { - bio_rep_id_cols= sig_cols - print('WARNING: bio_rep column not detected. Assuming that there are NO biological replicates.') - print('Technical replicate collapse will be performed across the sig_cols.') - } + # Bio replicate scatter plots + # bio_reps_plt= create_replicate_scatterplots(input_df= l2fc_with_log2s, + # cell_line_cols= cell_line_cols, + # replicate_group_cols= sig_cols, + # replicate_col= 'bio_rep', + # value_col= 'log2_mean_normalized_n') + # if(!is.null(bio_reps_plt)) { + # pdf(file=paste(out, "bio_reps_plt.pdf", sep="/"), + # width=sqrt(num_profiles), height=sqrt(num_profiles)) + # print(bio_reps_plt) + # dev.off() + # } else { + # print('No technical replicates detected - skipping plot.') + # } - # collapse tech reps taken from 'compute_l2fc' - collapsed_tech_rep= normalized_counts %>% - dplyr::filter(!(trt_type %in% c("empty", "", "CB_only")) & !is.na(trt_type), !is.na(CCLE_name)) %>% - dplyr::group_by(pick(all_of(c('CCLE_name', 'trt_type', bio_rep_id_cols)))) %>% - dplyr::summarise(mean_normalized_n = mean(!! rlang::sym(count_col_name)), - num_tech_reps= n()) %>% dplyr::ungroup() - collapsed_tech_rep$sig_id= do.call(paste,c(collapsed_tech_rep[sig_cols], sep=':')) - - bio_corr= collapsed_tech_rep %>% ungroup() %>% - filter(!is.na(CCLE_name), - (!trt_type %in% c("empty", "", "CB_only")) & !is.na(trt_type)) %>% - mutate(plt_id= paste(sig_id, bio_rep, sep=':')) %>% - reshape2::acast(CCLE_name~plt_id, value.var="mean_normalized_n") %>% - cor(use="pairwise.complete.obs") + # Bio replicate heatmap + print("11. Generating bio rep correlations heatmap ...") + potential_error= base::tryCatch({ + bio_corr_hm= create_cor_heatmap(input_df= l2fc_with_log2, + row_id_cols= cell_line_cols, + col_id_cols= c(sig_cols, 'bio_rep'), + value_col= 'l2fc', + cor_method= 'pearson') + pdf(file=paste(out, "bio_corr_hm.pdf", sep="/"), + width=sqrt(num_profiles), height=sqrt(num_profiles)) + print(bio_corr_hm) + dev.off() + }, error= function(e) { + print(e) + print('Encountered an error when creating the bio_corr_hm figure. Skipping this output ...') + return('bio_corr_hm') + }) - bio_corr_hm= bio_corr %>% reshape2::melt() %>% ggplot() + - geom_tile(aes(x=Var1, y=Var2, fill=value)) + - labs(x="", y="", fill="correlation\nnorm_n") + - scale_fill_gradientn(breaks= c(0, 0.5, 1), - colours= c('white', 'yellow', 'red'), - limits=c(0,1), oob=squish) + - theme(axis.text.x = element_text(angle=70, hjust=1, size=5), - axis.text.y = element_text(size=5)) + # Collect returned string if an error occurred + if(!is.null(potential_error)) { + skipped_qcs = c(skipped_qcs, potential_error) + } - pdf(file=paste(out, "bio_corr_hm.pdf", sep="/"), - width=sqrt(num_profiles), height=sqrt(num_profiles)) - print(bio_corr_hm) - dev.off() + } else { + print('11. No biological replicates detected. 
Skipping bio_rep heatmap.')
     }
   }
-  
+
   # End _________________________ ----
   print('QC finishing')
+  if(length(na.omit(skipped_qcs)) != 0) {
+    print(paste0('WARNING: The following ', length(na.omit(skipped_qcs)), ' QCs encountered errors and were skipped - '))
+    print(na.omit(skipped_qcs))
+  } else {
+    print('No errors encountered.')
+  }
+  return(skipped_qcs)
 }

From ecb6dc45c8db6cd49c9da5f52bf3f1ce9e84d014 Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Mon, 19 Aug 2024 14:55:32 -0400
Subject: [PATCH 006/127] Updated validator

validate_columns_exist now prints missing items
---
 scripts/src/collate_fastq_reads.R | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R
index 2ea5fd75..c74e4861 100644
--- a/scripts/src/collate_fastq_reads.R
+++ b/scripts/src/collate_fastq_reads.R
@@ -1,13 +1,18 @@
 #' validate_columns_exist
 #'
 #' This function checks that a list of columns are present in a dataframe.
+#' Columns that were not found in the dataframe are printed out.
 #'
 #' @param selected_cols A vector of strings each representing a column name
 #' @param df A dataframe to check against
 #' @return Boolean
-validate_columns_exist= function(selected_columns, df) {
-  # Check that all of selected_columns are in df
-  if(any(!selected_columns %in% colnames(df))) {
+validate_columns_exist= function(selected_cols, df) {
+  # Check that all of selected_cols are in df - base::setdiff(A, B) = A[!A %in% B].
+  unmatched_cols= base::setdiff(selected_cols, colnames(df))
+
+  if(length(unmatched_cols) > 0) {
+    print('The following columns are missing: ')
+    print(unmatched_cols)
     return(FALSE)
   } else {
     return(TRUE)
@@ -103,7 +108,7 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta,
 
   # Validation: Check that flowcell_names and flowcell_lanes exist in the sample meta ----
   if(!validate_columns_exist(c('flowcell_names', 'flowcell_lanes'), sample_meta)) {
-    stop('Flowcell_names and/or flowcell_lanes is NOT present in the sample meta.')
+    stop('The above column(s) are NOT present in the sample meta.')
   }
 
   # Validation: Check that sequencing_index_cols exist in the sample meta ----
@@ -115,8 +120,6 @@
 
   # Validation: Check that id_cols exist in the sample meta ----
   if(!validate_columns_exist(id_cols, sample_meta)) {
-    print('The following id_cols are not present in the sample meta.')
-    print(id_cols[!id_cols %in% colnames(sample_meta)])
     stop('One or more id_cols is NOT present in the sample meta.')
   }
 
@@ -194,10 +197,9 @@
   print(paste0('Index purity: ', round(index_purity, 4)))
   if(index_purity > 1) {
     stop('ERROR: Index purity is greater than 1!')
-  }
-  if(index_purity < 0.5) {
+  } else if(index_purity < 0.5) {
     print('Warning: Low index purity!')
-  }
+  }
 
   print('Done!')
   return(raw_counts)

From 04e83a22f883af4e03a4014f50720c294a00399e Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Mon, 19 Aug 2024 14:57:05 -0400
Subject: [PATCH 007/127] Remove index2 RC-ing

Removed index2 RC-ing - moved to collate
---
 scripts/src/filter_raw_reads.R | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/scripts/src/filter_raw_reads.R b/scripts/src/filter_raw_reads.R
index 8e663bc0..e29f811a 100755
--- a/scripts/src/filter_raw_reads.R
+++ b/scripts/src/filter_raw_reads.R
@@ -1,15 +1,18 @@
-#suppressPackageStartupMessages(library(sets))
-
 #' 
validate_columns_exist
 #'
 #' This function checks that a list of columns are present in a dataframe.
+#' Columns that were not found in the dataframe are printed out.
 #'
 #' @param selected_cols A vector of strings each representing a column name
 #' @param df A dataframe to check against
 #' @return Boolean
-validate_columns_exist= function(selected_columns, df) {
-  # Check that all of selected_columns are in df
-  if(any(!selected_columns %in% colnames(df))) {
+validate_columns_exist= function(selected_cols, df) {
+  # Check that all of selected_cols are in df - base::setdiff(A, B) = A[!A %in% B].
+  unmatched_cols= base::setdiff(selected_cols, colnames(df))
+
+  if(length(unmatched_cols) > 0) {
+    print('The following columns are missing: ')
+    print(unmatched_cols)
     return(FALSE)
   } else {
     return(TRUE)

From 5313fb9ebf6eb2615a9969a38c847058e4ab7460 Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Mon, 19 Aug 2024 17:30:11 -0400
Subject: [PATCH 008/127] Documented new function

---
 scripts/src/QC_images.R | 183 +++++++++++++++++++++++++++------------
 1 file changed, 124 insertions(+), 59 deletions(-)

diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R
index d5bc5994..273aad88 100755
--- a/scripts/src/QC_images.R
+++ b/scripts/src/QC_images.R
@@ -5,7 +5,7 @@
 #'
 #' @param selected_cols A vector of strings each representing a column name
 #' @param df A dataframe to check against
-#' @return Boolean
+#' @returns Boolean
 validate_columns_exist= function(selected_cols, df) {
   # Check that all of selected_cols are in df - base::setdiff(A, B) = A[!A %in% B].
   unmatched_cols= base::setdiff(selected_cols, colnames(df))
@@ -26,7 +26,7 @@ validate_columns_exist= function(selected_cols, df) {
 #' @param df A dataframe which must contain the column "n" which represents the count of a read.
 #' @param index_col The name of the column containing the index barcodes as a string. This column must be present in "df".
 #' @param valid_indices A vector of all the valid indices for "index_col".
-#' @return A dataframe with the follow columns:
+#' @returns A dataframe with the following columns:
 #'         - index_col: String, The column containing the index barcodes.
 #'         - idx_n: Numeric, Number of reads associated with a specific index barcode.
 #'         - fraction: Numeric, "idx_n" divided by the total number of reads in the run.
@@ -50,22 +50,34 @@ get_index_summary= function(df, index_col, valid_indices) {
 
 #' Calculate purity metrics
 #'
-#' Create the qc table with index purity and cell line purity
+#' Create the qc table with index purity and cell line purity.
 #'
 #' @param raw_counts_uncollapsed Dataframe output from nori. 
#' @param raw_counts Raw counts dataframe output from collate_fastq_reads.
 #' @param filtered_counts Filtered counts dataframe output from filter_raw_reads.
-#' @param value_col String name of the counts column in all three dataframes.
+#' @param value_col String name of the counts column present in all three dataframes.
 #' @param file_path Location to write out the output.
-create_qc_table= function(raw_counts_uncollapsed, raw_counts, filtered_counts,
-                          value_col= 'n', file_path) {
+#' @returns Writes out a QC_table to the file_path.
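+#'
+#' @examples
+#' # Hypothetical usage sketch (added for illustration; not part of the original
+#' # patch). Assumes the three counts dataframes share the counts column "n".
+#' \dontrun{
+#' create_qc_table(raw_counts_uncollapsed, raw_counts, filtered_counts,
+#'                 value_col= 'n', file_path= 'QC_table.csv')
+#' }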
+create_qc_table= function(raw_counts_uncollapsed, raw_counts, filtered_counts, value_col= 'n', file_path) {
+  # Validation: Check that value_col is present in the three files.
+  if(!validate_columns_exist(value_col, raw_counts_uncollapsed)) {
+    stop(paste0('The column ', value_col, " was not detected in uncollapsed raw counts."))
+  }
+  if(!validate_columns_exist(value_col, raw_counts)) {
+    stop(paste0('The column ', value_col, " was not detected in raw counts."))
+  }
+  if(!validate_columns_exist(value_col, filtered_counts)) {
+    stop(paste0('The column ', value_col, " was not detected in filtered counts."))
+  }
+
+  # Calculate purities
   index_purity= sum(raw_counts[[value_col]]) / sum(raw_counts_uncollapsed[[value_col]])
   print(paste0('Index purity: ', round(index_purity, 4)))
   cell_line_purity= sum(filtered_counts[[value_col]]) / sum(raw_counts[[value_col]])
   print(paste0('Cell line purity: ', round(cell_line_purity, 4)))
-
   qc_table= data.frame(index_purity= index_purity, cell_line_purity= cell_line_purity)
 
+  # Write out table
   print(paste0('Writing QC table out to ', file_path))
   qc_table %>% write.csv(file_path, row.names= FALSE, quote= FALSE)
 }
@@ -78,7 +90,15 @@
 #' @param filtered_counts Filtered counts dataframe.
 #' @param id_cols Vector of columns names that identify each sample.
 #' @param facet_col String name of the column in filtered_counts to facet the plot.
+#'                  This can be left as NA if there isn't a column to facet on.
+#' @returns Returns a ggplot object.
 create_total_counts_barplot= function(filtered_counts, id_cols, facet_col= NA) {
+  # Validation: Check that id_cols and facet_col exist in filtered counts.
+  if(!validate_columns_exist(na.omit(c(id_cols, facet_col)), filtered_counts)) {
+    stop('Some input columns were not detected in filtered counts.')
+  }
+
+  # Sum up reads
   total_counts= filtered_counts %>%
@@ -101,17 +121,25 @@
 #' Cell line recovery barplot
 #'
-#' text
+#' Creates barplots of the cell lines recovered. The parameter "plot_type" can be used to plot the percentage or
+#' the total cell line counts on the y axis. The parameter "include_ctrl_bcs" can be used to include the control
+#' barcodes in the cell line count.
 #'
 #' @param filtered_counts Filtered counts dataframe.
 #' @param id_cols Vector of column names that identify each sample.
 #' @param facet_col String name of the column in filtered_counts to facet the plot.
 #' @param value_col String name of the column in filtered_counts that contains the counts.
 #' @param counts_threshold Threshold used to determine low counts. 
-#' @param plot_type description
-#' @param include_ctrl_bcs description
+#' @param plot_type String of either "percent" or "count" to adjust the y axis to be either the percentage or the
+#'                  total number of cell lines.
+#' @param include_ctrl_bcs Boolean. Set to TRUE if control barcodes are to be counted.
+#' @returns Returns a ggplot object.
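+#'
+#' @examples
+#' # Hypothetical usage sketch (added for illustration; not part of the original
+#' # patch). Column names follow the defaults used elsewhere in this script.
+#' \dontrun{
+#' create_recovery_barplot(filtered_counts, id_cols= c('pcr_plate', 'pcr_well'),
+#'                         facet_col= 'pcr_plate', count_threshold= 40,
+#'                         plot_type= 'percent')
+#' }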
 create_recovery_barplot= function(filtered_counts, id_cols, facet_col= NA, value_col= 'n', count_threshold,
                                   plot_type= 'percent', include_ctrl_bcs= FALSE) {
+  # Validation: Check that id_cols, facet_col, and value_col exist in filtered counts.
+  if(!validate_columns_exist(na.omit(c(id_cols, facet_col, value_col)), filtered_counts)) {
+    stop('Some input columns were not detected in filtered counts.')
+  }
 
   # Filter out control barcodes if it is specified.
   if(include_ctrl_bcs == FALSE) {
@@ -154,12 +182,19 @@
 
 #' Control barcode scatter plot
 #'
-#' text
+#' Creates a scatter plot of the control barcodes.
 #'
-#' @param name description
+#' @param normalized_counts Dataframe output from the normalize module.
+#' @param id_cols Vector of column names that identify every PCR well.
+#' @param value_col Name of the column that contains the values.
+#' @returns Returns a ggplot object.
 create_ctrlBC_scatterplots= function(normalized_counts, id_cols, value_col= 'log2_n') {
+  # Validation: Check that id_cols and value_col exist in normalized counts.
+  if(!validate_columns_exist(c(id_cols, value_col), normalized_counts)) {
+    stop('Some input columns were not detected in normalized counts.')
+  }
+
+  # Detect norm_r2 and norm_mae. If columns do not exist, then roughly calculate those columns.
   if(any(!c('norm_r2', 'norm_mae') %in% colnames(normalized_counts))) {
     print('WARNING: Columns "norm_r2" and/or "norm_mae" were not detected in normalized_counts.', quote= FALSE)
     print('Calculating both columns - this method may not be as robust as the normalize module.')
@@ -194,14 +229,26 @@
   return(trend_scatter_plot)
 }
 
-
 #' Heatmap of correlations
 #'
-#' text
+#' Creates a correlation heatmap. A matrix of values is created from the input_df. The row_id_cols
+#' are used to identify each row and the col_id_cols are used to identify each column. The value_col is
+#' used to fill the matrix. Correlations are then computed.
 #'
-#' @param input_df description
+#' @import tidyverse
+#' @import WGCNA
+#' @import reshape2
+#' @param input_df Dataframe.
+#' @param row_id_cols Vector of column names from input_df that identifies the cell lines. For example,
+#'                    this can be "DepMap_ID", "CCLE_name" if only cell lines exist. It can also be
+#'                    "DepMap_ID", "CCLE_name", "Name" if control barcodes are also present.
+#' @param col_id_cols Vector of column names from input_df that identifies the PCR wells or conditions.
+#'                    For example, this can be "pcr_plate", "pcr_well" or a list of conditions like those in sig_cols.
+#' @param value_col String name of the column in input_df to be used as the values.
+#' @param cor_method WGCNA correlation method. This defaults to "pearson".
+#' @returns Returns a ggplot object.
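+#'
+#' @examples
+#' # Hypothetical usage sketch (added for illustration; not part of the original
+#' # patch). Assumes a long dataframe with a log2-transformed counts column.
+#' \dontrun{
+#' create_cor_heatmap(input_df= cor_df,
+#'                    row_id_cols= c('DepMap_ID'),
+#'                    col_id_cols= c('pcr_plate', 'pcr_well'),
+#'                    value_col= 'log2_n')
+#' }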
 create_cor_heatmap= function(input_df, row_id_cols, col_id_cols, value_col,
-                             cor_method= 'pearson') {
+                             cor_method= 'pearson') {
 
   # Validate that specified columns are in the dataframe.
   if(!validate_columns_exist(c(row_id_cols, col_id_cols, value_col), input_df)) {
@@ -239,11 +286,31 @@
   return(cor_heatmap)
 }
 
-#' Scatterplots of two replicates
+#' Scatter plots of two replicates
+#'
+#' From a long table, creates scatter plots of two replicates.
 #'
-#' @param input_df description
+#' @param input_df Dataframe.
+#' @param cell_line_cols List of column names used to identify each cell line or control barcode.
+#' @param replicate_group_cols List of column names that describe a group of similar conditions.
+#' @param replicate_col Name of the column that specifies the replicate. This column should not be
+#'                      in replicate_group_cols!
+#' @param value_col Name of the column in input_df that contains the values.
+#' @param x_axis_rep String of the replicate identifier that should be on the x axis of the plot.
+#' @param y_axis_rep String of the replicate identifier that should be on the y axis of the plot.
+#' @returns Returns a ggplot object or NULL if all entries are filtered out.
 create_replicate_scatterplots= function(input_df, cell_line_cols, replicate_group_cols, replicate_col, value_col,
-                                        x_axis_rep= '1', y_axis_rep= '2') {
+                                        x_axis_rep= '1', y_axis_rep= '2') {
+  # Validation: Check that input columns are present in the dataframe.
+  if(!validate_columns_exist(c(cell_line_cols, replicate_group_cols, replicate_col, value_col), input_df)) {
+    stop('Some input columns were not detected in the input dataframe.')
+  }
+
+  # Validation: Check that replicate_col is not in replicate_group_cols.
+  if(replicate_col %in% replicate_group_cols) {
+    stop(paste0(replicate_col, ' should not be included in replicate_group_cols!'))
+  }
+
   reps_piv= input_df %>%
     tidyr::unite(all_of(replicate_group_cols), col= 'replicate_group', sep= ':', remove= TRUE, na.rm= FALSE) %>%
     dplyr::group_by(pick(all_of(c(cell_line_cols, 'replicate_group')))) %>%
@@ -276,38 +343,36 @@
   return(reps_scatter)
 }
 
-#' QC_images
+#' QC_images
+#'
+#' Takes in various pipeline outputs and generates 11 QC files.
 #'
-#' Takes in the metadata, raw counts, annotated counts, and normalized counts to generate some QC images.
-#'
-#' @param sample_meta - sample metadata
-#' @param annotated_counts - dataframe of annotated readcounts that must include the following columns:
-#'            n: raw readcounts
-#'            profile_id: string unique to each sample as defined by filter_counts method
-#'            Name: name of the control barcode that the read corresponds to, or NA (if read is cell line)
-#'            CCLE_name: name of the cell line that the read corresponds to, or NA (if read is control barcode)
-#'            cell_set: string identifier of cell set expected in a given sample, must match a cell set
-#'                      found in cell_set_meta
-#' @param normalized_counts -
-#' @param CB_meta - control barcode metadata
-#' @param cell_set_meta - a metadata dataframe that contains a mapping from cell set names (e.g. CS5) to
-#'                        lists of LUAs in that cell set separated by semicolons
-#' @param out - the filepath to the folder in which QC images are meant to be saved, NA by default and
-#'              images are saved in the working directory
-#' @param id_cols
-#' @param sig_cols -
-#' @param count_col_names - which counts to plot
-#' @param control_type - how the negative controls are designated in the trt_type column in the sample metadata
-#' @param count_threshold - threshold for low counts
-#' @param reverse_index2 reverse index 2 if newer sequencers are used.
-#' @return - NA, QC images are written out to the specified folder
-#' @export
-QC_images = function(raw_counts_uncollapsed, raw_counts,
-                     filtered_counts, normalized_counts= NA, l2fc,
-                     sample_meta, CB_meta, cell_set_meta,
-                     id_cols, sig_cols, count_col_name= 'normalized_n',
-                     control_type, count_threshold= 40,
-                     reverse_index2= FALSE, out = NA) {
+#' @param raw_counts_uncollapsed Dataframe output from nori. This is used to generate purity metrics and
+#'                               the index summaries.
+#' @param raw_counts Raw counts dataframe from the collate_fastq_reads module. This is used to generate purity metrics.
+#' @param annotated_counts Annotated counts dataframe from the filter_raw_reads module.
+#' @param filtered_counts Filtered counts dataframe from the filter_raw_reads module.
+#' @param normalized_counts Normalized counts dataframe from the normalize module. This is an optional parameter.
+#' @param l2fc L2FC dataframe from the compute_l2fc module. This is used for the bio_reps plot.
+#' @param sample_meta Dataframe of the sample metadata for the sequencing run.
+#' @param CB_meta Dataframe of the control barcode metadata. This is only used for the CDF plot.
+#' @param cell_set_meta Dataframe of the cell set metadata. This is only used for the CDF plot.
+#' @param cell_line_cols Vector of sample meta column names used to describe a cell line or barcode.
+#' @param id_cols Vector of sample meta column names used to identify each PCR well.
+#'                This defaults to "pcr_plate", "pcr_well".
+#' @param sig_cols Vector of sample meta column names used to identify a unique treatment condition.
+#' @param control_type String of how the negative controls are designated in the trt_type column in the sample_meta.
+#' @param count_threshold Threshold for low read counts.
+#' @param reverse_index2 Boolean set to TRUE if the sequencing involved the reverse complement workflow.
+#' @param out Path to the directory to save the QC images.
+#' @returns NA. QC images are written out to the specified folder. 
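+#'
+#' @examples
+#' # Hypothetical usage sketch (added for illustration; not part of the original
+#' # patch), mirroring how filteredCounts_QC.R calls this function.
+#' \dontrun{
+#' QC_images(raw_counts_uncollapsed, raw_counts, annotated_counts, filtered_counts,
+#'           normalized_counts= normalized_counts, l2fc= l2fc,
+#'           sample_meta= sample_meta, CB_meta= CB_meta, cell_set_meta= cell_set_meta,
+#'           cell_line_cols= c('DepMap_ID', 'CCLE_name'),
+#'           id_cols= c('pcr_plate', 'pcr_well'), sig_cols= sig_cols,
+#'           control_type= 'negcon', count_threshold= 40, out= getwd())
+#' }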
+QC_images= function(raw_counts_uncollapsed, raw_counts, + annotated_counts, filtered_counts, normalized_counts= NA, l2fc, + sample_meta, CB_meta, cell_set_meta, + cell_line_cols, + id_cols= c('pcr_plate', 'pcr_well'), sig_cols, + control_type= 'negcon', count_threshold= 40, + reverse_index2= FALSE, out = NA) { require(tidyverse) require(magrittr) require(reshape2) @@ -321,12 +386,6 @@ QC_images = function(raw_counts_uncollapsed, raw_counts, skipped_qcs= c() # empty vector to collect potential errors num_profiles = annotated_counts %>% dplyr::distinct(pick(all_of(id_cols))) %>% nrow() - # Reverse index 2 barcodes - if(reverse_index2) { - print("Reverse-complementing index 2 barcode.") - sample_meta$index_2= chartr("ATGC", "TACG", stringi::stri_reverse(sample_meta$index_2)) - } - # Detect control barcodes cb_check= sample_meta %>% dplyr::filter(control_barcodes %in% c("Y", "T", T), @@ -368,6 +427,12 @@ QC_images = function(raw_counts_uncollapsed, raw_counts, } # Do the same for index 2. + # Reverse index 2 barcodes if needed. + if(reverse_index2) { + print("Reverse-complementing index 2 barcode.") + sample_meta$index_2= chartr("ATGC", "TACG", stringi::stri_reverse(sample_meta$index_2)) + } + if('index_2' %in% colnames(sample_meta) & 'index_2' %in% colnames(raw_counts_uncollapsed)) { expected_index2= unique(sample_meta$index_2) index2_counts= get_index_summary(raw_counts_uncollapsed, 'index_2', expected_index2) @@ -726,7 +791,7 @@ QC_images = function(raw_counts_uncollapsed, raw_counts, print('11. No biological replicates detected. Skipping bio_rep heatmap.') } } - + # End _________________________ ---- print('QC finishing') if(length(na.omit(skipped_qcs)) != 0) { From 2435648fdda0b7698e7a11e259716ebcab46697d Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Mon, 19 Aug 2024 17:31:07 -0400 Subject: [PATCH 009/127] Update module calls --- scripts/collate_fastq_reads.R | 4 ++-- scripts/filter_counts.R | 25 ++++++++----------------- 2 files changed, 10 insertions(+), 19 deletions(-) diff --git a/scripts/collate_fastq_reads.R b/scripts/collate_fastq_reads.R index 29f4d994..ea2ac734 100644 --- a/scripts/collate_fastq_reads.R +++ b/scripts/collate_fastq_reads.R @@ -54,8 +54,8 @@ if(file.exists(expected_file_path)) { print("Collating fastq reads ...") raw_counts= collate_fastq_reads(uncollapsed_raw_counts, sample_meta, - sequencing_index_cols, - id_cols, + sequencing_index_cols= sequencing_index_cols, + id_cols= id_cols, reverse_index2= args$reverse_index2, barcode_col= args$barcode_col) diff --git a/scripts/filter_counts.R b/scripts/filter_counts.R index 11958c7d..61b18f31 100755 --- a/scripts/filter_counts.R +++ b/scripts/filter_counts.R @@ -36,11 +36,9 @@ parser$add_argument("--cell_line_meta", default="cell_line_meta.csv", help = "Ce parser$add_argument("--cell_set_meta", default="cell_set_meta.csv", help = "Cell set metadata") parser$add_argument("--assay_pool_meta", default="assay_pool_meta.txt", help = "Assay pool metadata") parser$add_argument("--CB_meta", default="../metadata/CB_meta.csv", help = "Control Barcode metadata") -parser$add_argument("--sequencing_index_cols", default= "index_1,index_2", +parser$add_argument("--id_cols", default= "pcr_plate,pcr_well", help = "Sequencing columns in the sample meta") parser$add_argument("--count_threshold", default= 40, help = "Low counts threshold") -parser$add_argument("--reverse_index2", action="store_true", default=FALSE, - help = "Reverse complement of index 2 for NovaSeq and NextSeq") parser$add_argument("--rm_data", action="store_true", 
default=FALSE, help = "Remove bad experimental data") parser$add_argument("--pool_id", action="store_true", default=FALSE, help = "Pull pool IDs from CellDB.") parser$add_argument("--control_type", default="negcon", @@ -64,7 +62,7 @@ raw_counts= data.table::fread(args$raw_counts, header= T, sep= ',', data.table= # Convert strings to vectors ---- # Also check that column names are present in the sample meta. -sequencing_index_cols= unlist(strsplit(args$sequencing_index_cols, ",")) +id_cols= unlist(strsplit(args$id_cols, ",")) if (!all(sequencing_index_cols %in% colnames(sample_meta))){ stop(paste("All seq columns not found in sample_meta, check metadata or --sequencing_index_cols argument:", args$sequencing_index_cols)) @@ -93,14 +91,12 @@ cell_line_meta %<>% # Run filter_raw_reads ----- print("creating filtered count file") -filtered_counts = filter_raw_reads(raw_counts, - sample_meta, - cell_line_meta, - cell_set_meta, - CB_meta, - sequencing_index_cols= sequencing_index_cols, - count_threshold= as.numeric(args$count_threshold), - reverse_index2= args$reverse_index2) +filtered_counts = filter_raw_reads(raw_counts= raw_counts, sample_meta= sample_meta, + cell_line_meta= cell_line_meta, + cell_set_meta= cell_set_meta, + CB_meta= CB_meta, + id_cols= id_cols, + count_threshold= as.numeric(args$count_threshold)) # Pulling pool_id when db_flag and pool_id flags are passed if (args$pool_id) { @@ -129,11 +125,6 @@ if(sum(cl_entries$n) == 0) { } # Write out module outputs ---- -qc_table = filtered_counts$qc_table -qc_out_file = paste(args$out, 'QC_table.csv', sep='/') -print(paste("writing QC_table to: ", qc_out_file)) -write.csv(qc_table, qc_out_file, row.names=F, quote=F) - unmapped_reads= filtered_counts$unmapped_reads unmapped_out = paste(args$out, 'unmapped_reads.csv', sep='/') print(paste("writing unmapped reads to: ", unmapped_out)) From 16966e9dd48373dc770a1815349d61cd43e09aec Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Mon, 19 Aug 2024 17:31:23 -0400 Subject: [PATCH 010/127] Update function documentation --- scripts/src/collate_fastq_reads.R | 27 +++++++--- scripts/src/filter_raw_reads.R | 83 +++++++++++++------------------ 2 files changed, 54 insertions(+), 56 deletions(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index c74e4861..26f0063f 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -77,17 +77,28 @@ validate_detected_flowcells= function(detected_flowcells, expected_flowcells) { #' collate_fastq_reads #' #' This function takes in the fastq reads (uncollapsed_raw_counts) and -#' filters for reads coming from flowcells specificed in the sample meta. -#' The function then sums up the reads across specified sequencing index columns. +#' filters for reads coming from flowcells specified in the sample meta. +#' The function then sums up the reads across specified sequencing index columns and +#' maps the sequencing index columns to the ID columns. #' -#' @param uncollapsed_raw_counts Data frame of reads from all the fastq files with the following columns - \cr -#' "flowcell_name", "flowcell_lane", "index_1", "index_2", and "forward_read_cl_barcode", "n" +#' @param uncollapsed_raw_counts Dataframe of reads from all the fastq files with the following columns - +#' "flowcell_name", "flowcell_lane", "index_1", "index_2", "forward_read_cl_barcode", and "n". 
#' @param sample_meta Sample metadata generate for the project which may contain the following columns - -#' "flowcell_names", "flowcell_lanes", "index_1", "index_2". The sample meta must contain +#' "flowcell_names", "flowcell_lanes", "index_1", "index_2". The sample meta MUST contain #' "flowcell_names" and "flowcell_lanes" for filtering. -#' @param sequencing_index_cols Sequencing columns from the sample meta that the counts should be collapsed on. \cr -#' This defaults onto the following columns: "index_1", "index_2" -#' @returns Returns a dataframe with columns specified by the sequencing_index_cols, "forward_read_cl_barcode", and "n". +#' @param sequencing_index_cols Sequencing columns from the sample meta that the counts should be collapsed on. +#' These columns should be a subset of the four sequencing related columns in the +#' sample meta - "flowcell_names", "flowcell_lanes", "index_1", and "index_2". They +#' should also uniquely identify every PCR well. This parameter defaults onto +#' the following columns: "index_1", "index_2". +#' @param id_cols ID columns from the sample meta that uniquely identify every PCR well. These columns should not +#' include any sequencing related columns. This parameter defaults onto "pcr_plate", "pcr_well". This +#' parameter can also be a list of the sample conditions columns as long as they uniquely identify every +#' PCR well. For example "cell_set", "treatment", "dose", "day", "bio_rep", "tech_rep" can also be used. +#' @param reverse_index2 Index 2 should be reversed if the sequencer uses a reverse complement workflow. +#' Defaults to FALSE. +#' @param barcode_col String name of the column in uncollapsed_raw_counts that contains the sequences. +#' @returns Returns a dataframe with columns specified by the id_cols along with barcode_col, and "n". #' @import tidyverse collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, sequencing_index_cols= c('index_1', 'index_2'), diff --git a/scripts/src/filter_raw_reads.R b/scripts/src/filter_raw_reads.R index e29f811a..1cf909b1 100755 --- a/scripts/src/filter_raw_reads.R +++ b/scripts/src/filter_raw_reads.R @@ -64,48 +64,33 @@ validate_cell_set_luas= function(sample_meta, cell_set_meta) { #' filter raw reads #' -#' takes the raw readcount table and filters for expected indices and cell lines -#' using the given metadata. QC metrics are returned as a data.frame +#' Takes the raw readcount table and filters for expected indices and cell lines +#' using the given metadata. #' -#' @param raw_counts - an unfiltered counts table -#' @param sample_meta - the sample metadata for the particular experiment. Must follow the given set of -#' guidelines for metadata. Required columns include: -#' - index_1 -#' - index_2 -#' - cell_set -#' @param cell_line_meta - master metadata of cell lines with the following required columns: -#' - CCLE_name -#' - DepMap_ID -#' - LUA -#' - Sequence -#' @param cell_set_meta - master metdata of cell sets and their contents with the following required columns: -#' - cell_set -#' - members -#' @param CB_meta - master metdata of control barcodes, their sequences, and their doses. -#' The file should contain the columns: -#' - Sequence -#' - Name -#' - log_dose -#' @param sequencing_index_cols Vector of column names from the sample meta that is used to uniquely identify where a -#' sequencing read is coming from. This defaults to "index_1" and "index_2", but -#' it can be expanded to include "flowcell_names" and "flowcell_lanes". 
-#' @param reverse_index2 Reverses index2 for certain sequencers -#' @param count_threshold Threshold to call low counts. -#' @param control_type - how the negative controls are designated in the trt_type column in the sample metadata -#' @return - list with the following elements +#' @param raw_counts Dataframe of reads. The columns of this dataframe should include the id_cols, +#' "forward_read_cl_barcode", and "n". +#' @param sample_meta Dataframe of the metadata for the sequencing run. This file should contain the id_cols, +#' "cell_set", "control_barcodes", etc. +#' @param cell_line_meta Master metadata of cell lines with the following required columns - "CCLE_name", +#' "DepMap_ID", "LUA", and "Sequence". +#' @param cell_set_meta Master metadata of cell sets and their contents with the following required columns - +#' "cell_set" and "members". +#' @param CB_meta Master metadata of control barcodes, their sequences, and their doses. The file should contain +#' the columns - "Sequence", "Name", and "log_dose". +#' @param id_cols Columns present in both raw_counts and sample_meta that uniquely identify each PCR well. +#' This defaults to "pcr_plate", "pcr_well". +#' @param count_threshold Threshold to call low counts. This defaults to 40. +#' @returns List with the following elements: #' #' \itemize{ #' \item unmapped_reads: table of reads with valid index pairs but did not map to any known barcode. -#' The table contains the following columns - index_1, index_2, forward_read_cl_barcode, and n -#' \item annotated_counts: table of reads and the associated well and well conditions +#' The table contains the following columns - id_cols, "forward_read_cl_barcode", and "n". +#' \item annotated_counts: table of reads and the associated well and well conditions. #' \item filtered_counts: table of all expected reads for the project, this is a subset of annotated counts. -#' \item qc_table: QC table of index_purity and cell_line_purity #' } -#' @export filter_raw_reads = function(raw_counts, sample_meta, cell_line_meta, cell_set_meta, CB_meta, - id_cols= c('pcr_plate', 'pcr_well'), - reverse_index2= FALSE, count_threshold= 40) { - + id_cols= c('pcr_plate', 'pcr_well'), + count_threshold= 40) { require(tidyverse) require(magrittr) @@ -116,15 +101,6 @@ filter_raw_reads = function(raw_counts, CB_meta= CB_meta %>% dplyr::mutate(log2_dose= log_dose/log10(2)) %>% dplyr::select(-log_dose) } - # if(reverse_index2) { - # if ('index_2' %in% colnames(sample_meta)) { - # print("Reverse-complementing index 2 barcode ...") - # sample_meta$index_2 <- chartr("ATGC", "TACG", stringi::stri_reverse(sample_meta$index_2)) - # } else { - # stop('ERROR: Reverse index 2 is set to TRUE, but index_2 does not exists.') - # } - # } - # Validation: Check that id_cols exist in the sample meta ---- if(!validate_columns_exist(id_cols, sample_meta)) { stop('One or more id_cols is NOT present in the sample meta.') @@ -138,9 +114,18 @@ filter_raw_reads = function(raw_counts, # Validation: Check that cell sets do not contain duplicate LUAs ---- # This will produce a warning if a LUA appears in a cell set more than once! - # This currently does NOT result in an error. Error avoided using a distinct later in line 162 + # This currently does NOT result in an error. 
Error avoided using a distinct when creating the template
   validate_cell_set_luas(sample_meta, cell_set_meta)
 
+  # Split off unmapped reads ----
+  # Unmapped reads are defined as reads that are identified from valid PCR locations,
+  # but do not map to known barcodes in PRISM.
+  # Also sorts reads in descending order by read count.
+  print('Splitting off unmapped reads ...')
+  raw_counts %<>% dplyr::mutate(mapped= forward_read_cl_barcode %in% c(cell_line_meta$Sequence, CB_meta$Sequence))
+  unmapped_reads= raw_counts %>% dplyr::filter(mapped == FALSE) %>% dplyr::select(-mapped) %>%
+    dplyr::arrange(dplyr::desc(n))
+
   # Creating a template of all expected reads in the run ----
   # Use all 4 meta data files to create a "template" dataframe where
   # every row is a cell line that is expected in a PCR well.
@@ -168,14 +153,14 @@
   # Reads that do not match the template are contaminants, and
   # reads that are only present in the template are missing/not detected by PCR.
   print("Annotating reads ...")
-  annotated_counts= raw_counts %>%
+  annotated_counts= raw_counts %>% dplyr::filter(mapped) %>%
     dplyr::left_join(cell_line_meta, by= join_by('forward_read_cl_barcode'=='Sequence'),
                      relationship= 'many-to-one') %>%
     dplyr::left_join(CB_meta, by= join_by('forward_read_cl_barcode'=='Sequence'),
                      relationship= 'many-to-one') %>%
     dplyr::left_join(sample_meta, by= id_cols, relationship= 'many-to-one') %>%
     dplyr::full_join(template %>% dplyr::mutate(expected_read= T),
-                     by= c('forward_read_cl_barcode'='Sequence', intersect(colnames(template), colnames(.))),
+                     by= c('forward_read_cl_barcode'= 'Sequence', intersect(colnames(template), colnames(.))),
                      relationship= 'one-to-one') %>%
     # drop unneeded columns and fill in any new NAs from the merge
     dplyr::select(!any_of(c('prism_cell_set', 'members', 'mapped'))) %>%
@@ -201,7 +186,9 @@
     print('Warning: Low cell line purity!')
   }
 
-  return(list(annotated_counts= annotated_counts, filtered_counts= filtered_counts))
+  return(list(unmapped_reads= unmapped_reads,
+              annotated_counts= annotated_counts,
+              filtered_counts= filtered_counts))
 }
 
 # checks is a string can be numeric

From 86be77df3965ce8f4b5636fc254525067e52bdfb Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Tue, 20 Aug 2024 09:30:23 -0400
Subject: [PATCH 011/127] Update filter_counts.R

---
 scripts/filter_counts.R | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/filter_counts.R b/scripts/filter_counts.R
index 61b18f31..2d09f2c6 100755
--- a/scripts/filter_counts.R
+++ b/scripts/filter_counts.R
@@ -31,9 +31,9 @@
 parser$add_argument("-q", "--quietly", action="store_false",
 parser$add_argument("--wkdir", default=getwd(), help="Working directory")
 parser$add_argument("-c", "--raw_counts", default="raw_counts.csv", help = "path to file containing raw counts")
 parser$add_argument("-o", "--out", default="", help = "Output path. Default is working directory")
-parser$add_argument("-s", "--sample_meta", default="sample_meta.csv", help = "Sample metadata")
-parser$add_argument("--cell_line_meta", default="cell_line_meta.csv", help = "Cell Line metadata")
-parser$add_argument("--cell_set_meta", default="cell_set_meta.csv", help = "Cell set metadata")
+parser$add_argument("-s", "--sample_meta", default="sample_meta.csv", help= "Sample metadata")
+parser$add_argument("--cell_line_meta", default="cell_line_meta.csv", help= "Cell Line metadata")
+parser$add_argument("--cell_set_meta", default="cell_set_meta.csv", help= "Cell set metadata")
 parser$add_argument("--assay_pool_meta", default="assay_pool_meta.txt", help = "Assay pool metadata")
 parser$add_argument("--CB_meta", default="../metadata/CB_meta.csv", help = "Control Barcode metadata")
 parser$add_argument("--id_cols", default= "pcr_plate,pcr_well",

From 1458ed7c40f1ccaacc34989cf3704700804c8208 Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Tue, 20 Aug 2024 09:31:38 -0400
Subject: [PATCH 012/127] Update parameters

---
 scripts/filteredCounts_QC.R | 72 ++++++++++++++++++++++---------------
 1 file changed, 44 insertions(+), 28 deletions(-)

diff --git a/scripts/filteredCounts_QC.R b/scripts/filteredCounts_QC.R
index 2e9da455..3d83625f 100755
--- a/scripts/filteredCounts_QC.R
+++ b/scripts/filteredCounts_QC.R
@@ -22,25 +22,30 @@
 parser$add_argument("-v", "--verbose", action="store_true", default=TRUE, help="Print extra output")
 parser$add_argument("-q", "--quietly", action="store_false", dest="verbose", help="Print little output")
 parser$add_argument("--wkdir", default=getwd(), help="Working directory")
 parser$add_argument("-s", "--sample_meta", default="sample_meta.csv", help= "Sample metadata")
-parser$add_argument("--raw_counts", default= "raw_counts.csv", help="path to file containing raw counts")
-parser$add_argument("--annotated_counts", default="annotated_counts.csv",
-                    help="path to file containing annotated counts")
+parser$add_argument("-c", "--uncollapsed_raw_counts", default="raw_counts_uncollapsed.csv",
+                    help="path to file containing uncollapsed raw counts file")
+parser$add_argument("--raw_counts", default= "raw_counts.csv", help="path to raw counts file")
+parser$add_argument("--annotated_counts", default= "annotated_counts.csv",
+                    help= "path to file containing annotated counts")
+parser$add_argument("--filtered_counts", default= "filtered_counts.csv", help= "path to filtered_counts file")
 parser$add_argument("--normalized_counts", default="normalized_counts.csv",
                     help="path to file containing normalized counts")
+parser$add_argument("--l2fc", default="l2fc.csv", help= "path to l2fc file")
 parser$add_argument("--CB_meta", default="../metadata/CB_meta.csv", help = "control barcode metadata")
 parser$add_argument("--cell_set_meta", default="../metadata/cell_set_meta.csv", help = "Cell set metadata")
-parser$add_argument("-o","--out", default="", help = "Output path. 
Default is working directory") -parser$add_argument("--id_cols", default="cell_set,treatment,dose,dose_unit,day,bio_rep,tech_rep", - help = "Columns to identify each PCR well") +parser$add_argument("--cell_line_cols", default= 'DepMap_ID,CCLE_name', + help= "Columns that identify cell lines or barcodes") +parser$add_argument("--id_cols", default= 'pcr_plate,pcr_well', + help= "Columns to identify each PCR well") parser$add_argument("--sig_cols", default="cell_set,treatment,dose,dose_unit,day", - help = "columns used to generate signature ids") -parser$add_argument("--count_col_name", default="normalized_n", - help = "column containing counts with which to calculate l2fc") -parser$add_argument("--count_threshold", default=40, - help = "Low counts threshold") -parser$add_argument("--reverse_index2", default=FALSE, help = "Reverse index 2") + help= 'Columns used to identify the treatment conditions') parser$add_argument("--control_type", default = "negcon", - help = "how negative control wells are distinguished in the trt_type column") + help= "how negative control wells are distinguished in the trt_type column") +parser$add_argument("--count_threshold", default=40, help= "Low counts threshold") +parser$add_argument("--reverse_index2", default=FALSE, help = "Reverse index 2") +parser$add_argument("-o","--out", default="", help = "Output path. Default is working directory") + # parser$add_argument("--db_flag", action="store_true", default=FALSE, help = "Use CellDB to locate cell set information") # get command line options, if help option encountered print help and exit @@ -51,21 +56,29 @@ if (args$out == ""){ } # Read in files and pull out parameters ---- -sample_meta= data.table::fread(args$sample_meta, header=TRUE, sep=',', data.table=FALSE) -raw_counts= data.table::fread(args$raw_counts, header=TRUE, sep=',', data.table=FALSE) -annotated_counts= data.table::fread(args$annotated_counts, header=TRUE, sep=',', data.table=FALSE) +# Pipeline outputs +raw_counts_uncollapsed= data.table::fread(args$uncollapsed_raw_counts, header= TRUE, sep= ',') +raw_counts= data.table::fread(args$raw_counts, header= TRUE, sep= ',') +annotated_counts= data.table::fread(args$annotated_counts, header= TRUE, sep= ',') +filtered_counts= data.table::fread(args$filtered_counts, header= TRUE, sep= ',') if(file.exists(args$normalized_counts)) { normalized_counts= data.table::fread(args$normalized_counts, header=TRUE, sep=',', data.table=FALSE) } else { normalized_counts= NA } +l2fc= data.table::fread(args$l2fc, header= TRUE, sep= ',') + +# Metadata files +sample_meta= data.table::fread(args$sample_meta, header= TRUE, sep= ',', data.table= FALSE) CB_meta= data.table::fread(args$CB_meta, header=TRUE, sep=',', data.table=FALSE) -id_cols = unlist(strsplit(args$id_cols, ",")) -sig_cols = unlist(strsplit(args$sig_cols, ",")) -count_col_name = args$count_col_name -count_threshold = as.numeric(args$count_threshold) cell_set_meta = data.table::fread(args$cell_set_meta, header=TRUE, sep=',', data.table=FALSE) + +# Parameters +cell_line_cols = unlist(strsplit(args$cell_line_cols, ",")) +id_cols= unlist(strsplit(args$id_cols, ",")) +sig_cols= unlist(strsplit(args$sig_cols, ",")) control_type = args$control_type +count_threshold= as.numeric(args$count_threshold) # # If flag passed, use cell_set_meta file generated for the project via CellDB # if (args$db_flag) { @@ -78,16 +91,19 @@ control_type = args$control_type # } print("Generating QC images ...") -QC_images(raw_counts= raw_counts, +QC_images(raw_counts_uncollapsed= 
raw_counts_uncollapsed,
+          raw_counts= raw_counts,
           annotated_counts= annotated_counts,
-          normalized_counts= normalized_counts,
-          sample_meta= sample_meta,
+          filtered_counts= filtered_counts,
+          normalized_counts= normalized_counts,
+          l2fc= l2fc,
+          sample_meta= sample_meta,
           CB_meta= CB_meta,
-          cell_set_meta= cell_set_meta,
-          id_cols= id_cols,
+          cell_set_meta= cell_set_meta,
+          cell_line_cols= cell_line_cols,
+          id_cols= id_cols,
           sig_cols= sig_cols,
-          count_col_name= count_col_name,
-          control_type = control_type,
-          count_threshold= count_threshold,
+          control_type= control_type,
+          count_threshold= count_threshold,
           reverse_index2= args$reverse_index2,
           out= args$out)
From 1dba7688f51bc1582a0454bdf993e383e5a830cc Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Tue, 20 Aug 2024 11:04:14 -0400
Subject: [PATCH 013/127] Fixed contams_reads output

---
 scripts/src/QC_images.R | 43 ++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 22 deletions(-)

diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R
index 273aad88..a8ec5bac 100755
--- a/scripts/src/QC_images.R
+++ b/scripts/src/QC_images.R
@@ -510,43 +510,43 @@ QC_images= function(raw_counts_uncollapsed, raw_counts,
   ## Contaminates for ursula ----
   print('6. Generating contaminate reads for Ursula ...')
   potential_error= base::tryCatch({
-    # Determine which seq cols are present.
-    rc_seq_cols= c('flowcell_names', 'flowcell_lanes', 'index_1', 'index_2')
-    present_seq_cols= intersect(rc_seq_cols, colnames(raw_counts))
+    pcr_locations= c('pcr_plate', 'pcr_well')
 
-    # map of seq_cols to PCR locations
-    pcr_plate_map= sample_meta %>%
-      dplyr::distinct(pick(any_of(c(present_seq_cols, 'pcr_plate', 'pcr_well', 'cell_set')))) %>%
+    # Validation: Check that the PCR columns are present in raw_counts.
+    if(!validate_columns_exist(pcr_locations, raw_counts)) {
+      stop('pcr_plate and pcr_well are required raw_counts.csv for this to work.')
+    }
+
+    # count number of wells a cell_set appears in. 
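+    # Two totals are tracked here: the number of wells in each pcr_plate and the
+    # number of wells in each cell_set; both are used further down to judge how
+    # widespread a contaminating barcode is.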
+    pcr_plate_map= sample_meta %>% dplyr::distinct(pick(any_of(c(pcr_locations, 'cell_set')))) %>%
       dplyr::group_by(pcr_plate) %>% dplyr::mutate(num_wells_in_plate= dplyr::n()) %>% dplyr::ungroup() %>%
       dplyr::group_by(cell_set) %>% dplyr::mutate(num_wells_in_set= dplyr::n()) %>% dplyr::ungroup()
-
+
     # index filter and identify reads as mapped or not
-    unique_seq_col_vals= sample_meta %>% dplyr::distinct(pick(all_of(present_seq_cols)))
-    sequencing_filter= raw_counts %>%
-      dplyr::semi_join(unique_seq_col_vals, by= present_seq_cols) %>%
-      dplyr::mutate(mapped= ifelse(forward_read_cl_barcode %in% unique(annotated_counts$forward_read_cl_barcode), T, F))
-
+    sequencing_filter= raw_counts %>%
+      dplyr::mutate(mapped= forward_read_cl_barcode %in% unique(annotated_counts$forward_read_cl_barcode))
+
     # total counts per well - used to calculate fractions
-    counts_per_well= sequencing_filter %>% dplyr::group_by(pick(all_of(present_seq_cols))) %>%
+    counts_per_well= sequencing_filter %>% dplyr::group_by(pick(all_of(pcr_locations))) %>%
       dplyr::summarise(well_total_n= sum(n)) %>% dplyr::ungroup()
-
+
     # mapped contaminants to bind
     mapped_contams= annotated_counts %>% dplyr::filter(!expected_read) %>%
       dplyr::mutate(barcode_name= ifelse(is.na(CCLE_name), Name, CCLE_name)) %>%
-      dplyr::select(all_of(c(present_seq_cols, 'forward_read_cl_barcode', 'n', 'barcode_name')))
+      dplyr::select(all_of(c(pcr_locations, 'forward_read_cl_barcode', 'n', 'barcode_name')))
 
     contam_reads= sequencing_filter %>% dplyr::filter(mapped == FALSE) %>% dplyr::select(-mapped) %>%
       dplyr::bind_rows(mapped_contams) %>%
-      dplyr::left_join(counts_per_well, by= present_seq_cols) %>%
-      dplyr::left_join(pcr_plate_map, by= present_seq_cols) %>%
+      dplyr::left_join(counts_per_well, by= pcr_locations) %>%
+      dplyr::left_join(pcr_plate_map, by= pcr_locations) %>%
       # filter out barcodes that only appear in one well
      dplyr::group_by(forward_read_cl_barcode) %>% dplyr::filter(dplyr::n() >1) %>% dplyr::ungroup() %>%
       # number of wells in a pcr plate a barcode is detected in
       dplyr::group_by(forward_read_cl_barcode, pcr_plate) %>%
-      dplyr::mutate(num_wells_detected_plate= n()) %>% dplyr::ungroup() %>%
+      dplyr::mutate(num_wells_detected_plate= dplyr::n()) %>% dplyr::ungroup() %>%
       # number of wells in a cell set a barcode is detected in
       dplyr::group_by(forward_read_cl_barcode, cell_set) %>%
-      dplyr::mutate(num_wells_detected_set= n()) %>% dplyr::ungroup() %>%
+      dplyr::mutate(num_wells_detected_set= dplyr::n()) %>% dplyr::ungroup() %>%
       # determine if contamination is project, plate, or set
       dplyr::group_by(forward_read_cl_barcode) %>%
       dplyr::mutate(num_wells_detected= dplyr::n(),
@@ -774,8 +774,8 @@ QC_images= function(raw_counts_uncollapsed, raw_counts,
                                    cor_method= 'pearson')
     pdf(file=paste(out, "bio_corr_hm.pdf", sep="/"), width=sqrt(num_profiles), height=sqrt(num_profiles))
-    print(bio_corr_hm)
-    dev.off()
+    #print(bio_corr_hm)
+    #dev.off()
   }, error= function(e) {
     print(e)
     print('Encountered an error when creating the bio_corr_hm figure. 
Skipping this output ...') @@ -800,5 +800,4 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, } else { print('No errors encountered.') } - return(skipped_qcs) } From 6bb451cd985c590a1dacd2fd315950857b774cb0 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Tue, 20 Aug 2024 11:04:37 -0400 Subject: [PATCH 014/127] Updated scripts --- scripts/collate_fastq_reads.R | 1 + scripts/filter_counts.R | 6 ------ scripts/filteredCounts_QC.R | 1 + 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/scripts/collate_fastq_reads.R b/scripts/collate_fastq_reads.R index ea2ac734..198c5ad1 100644 --- a/scripts/collate_fastq_reads.R +++ b/scripts/collate_fastq_reads.R @@ -37,6 +37,7 @@ if(file.exists(expected_file_path)) { uncollapsed_raw_counts= data.table::fread(expected_file_path, header= T, sep= ',', data.table= F) sample_meta= data.table::fread(args$sample_meta, header= T, sep= ',', data.table= F) + # Parse vector inputs sequencing_index_cols= unlist(strsplit(args$sequencing_index_cols, ",")) id_cols= unlist(strsplit(args$id_cols, ",")) diff --git a/scripts/filter_counts.R b/scripts/filter_counts.R index 2d09f2c6..31849b77 100755 --- a/scripts/filter_counts.R +++ b/scripts/filter_counts.R @@ -61,13 +61,7 @@ sample_meta= data.table::fread(args$sample_meta, header= T, sep= ',', data.table raw_counts= data.table::fread(args$raw_counts, header= T, sep= ',', data.table= F) # Convert strings to vectors ---- -# Also check that column names are present in the sample meta. id_cols= unlist(strsplit(args$id_cols, ",")) -if (!all(sequencing_index_cols %in% colnames(sample_meta))){ - stop(paste("All seq columns not found in sample_meta, check metadata or --sequencing_index_cols argument:", - args$sequencing_index_cols)) -} - count_threshold = as.numeric(args$count_threshold) # make sure LUA codes in cell line meta are unique diff --git a/scripts/filteredCounts_QC.R b/scripts/filteredCounts_QC.R index 3d83625f..ce93e655 100755 --- a/scripts/filteredCounts_QC.R +++ b/scripts/filteredCounts_QC.R @@ -13,6 +13,7 @@ suppressPackageStartupMessages(library(ggplot2)) suppressPackageStartupMessages(library(ggpubr)) suppressPackageStartupMessages(library(scales)) # for out of bound handling in plots suppressPackageStartupMessages(library(ggpmisc)) # with ggplot to add fit line and labels +suppressPackageStartupMessages(library(WGCNA)) source("./src/QC_images.R") # source function # Argument parser ---- From c06f35a4c95578e2bb2bfb50836a8418419f530e Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Tue, 20 Aug 2024 13:42:06 -0400 Subject: [PATCH 015/127] Create joining_sample_meta_columns.R --- scripts/joining_sample_meta_columns.R | 58 +++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 scripts/joining_sample_meta_columns.R diff --git a/scripts/joining_sample_meta_columns.R b/scripts/joining_sample_meta_columns.R new file mode 100644 index 00000000..eb39d84f --- /dev/null +++ b/scripts/joining_sample_meta_columns.R @@ -0,0 +1,58 @@ +library(argparse) +library(tidyverse) +source("./src/join_sample_meta.R") + +# Argument parser ---- +parser <- ArgumentParser() +# specify our desired options +parser$add_argument('--sample_meta', default= 'sample_meta.csv', help= 'Sample meta data for the sequencing run.') +parser$add_argument('--l2fc', default= 'l2fc.csv', help= 'L2FC data.') # level 4 +parser$add_argument('--collapsed_l2fc', default= 'collapsed_l2fc.csv', help= 'Collapsed l2fc data.') # level 5 +parser$add_argument('--sig_cols', default= 'cell_set,treatment,dose,dose_unit,day', + help= 
'Columns that uniquely identify a condition.') +parser$add_argument('--out', default= getwd(), help= 'Path to the output directory.') + +args <- parser$parse_args() + +# set output to working directory if none is specified +if (args$out == "") { + args$out = args$wkdir +} + +# Prepare args ---- +sample_meta= data.table::fread(args$sample_meta, header= T, sep= ',') +sig_cols= unlist(strsplit(args$sig_cols, ",")) + +# Add in metadata for l2fc file ---- +if(file.exists(args$l2fc)) { + l2fc= data.table::fread(args$l2fc, header= T, sep= ',') + if('bio_rep' %in% sample_meta & 'bio_rep' %in% l2fc) { + input_cols= c(sig_cols, 'bio_rep') + } else { + input_cols= sig_cols + print('WARNING: No "bio_rep" column detected. Proceeding with just sig_cols.') + } + l2fc_with_sm= join_sample_meta(df= l2fc, sample_meta, key_cols= input_cols) + + # Write out + outpath= paste(args$out, 'l2fc_with_sm.csv', sep='/') + print(paste("Writing l2fc_with_sm.csv to ", outpath)) + write.csv(l2fc_with_sm, outpath, row.names= FALSE, quote= FALSE) +} else { + print('WARNING: l2fc.csv does not exist. Skipping this file.') +} +# + +# Add in metadata for collapsed_l2fc file ---- +if(file.exists(args$collapsed_l2fc)) { + collapsed_l2fc= data.table::fread(args$collapsed_l2fc, header= T, sep= ',') + collapsed_l2fc_with_sm= join_sample_meta(df= collapsed_l2fc, sample_meta, key_cols= sig_cols) + + # Write out + outpath= paste(args$out, 'collapsed_l2fc_with_sm.csv', sep='/') + print(paste("Writing collapsed_l2fc_with_sm.csv to ", outpath)) + write.csv(collapsed_l2fc_with_sm, outpath, row.names= FALSE, quote= FALSE) +} else { + print('WARNING: collapsed_l2fc.csv does not exist. Skipping this file.') +} +# From 0b53666d41be5d1c365396f0d3277d88471a0f6e Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Tue, 20 Aug 2024 13:42:39 -0400 Subject: [PATCH 016/127] Pick columns with 1 unique value per group --- scripts/src/join_sample_meta.R | 35 +++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/scripts/src/join_sample_meta.R b/scripts/src/join_sample_meta.R index 4fd9b43e..13f66b30 100644 --- a/scripts/src/join_sample_meta.R +++ b/scripts/src/join_sample_meta.R @@ -1,13 +1,18 @@ #' validate_columns_exist #' #' This function checks that a list of columns are present in a dataframe. +#' Columns that were not found in the dataframe are printed out. #' #' @param selected_columns A vector of strings each representing a column name #' @param df A dataframe to check against #' @return Boolean -validate_columns_exist= function(selected_columns, df) { - # Check that all of selected_columns are in df - if(any(!selected_columns %in% colnames(df))) { +validate_columns_exist= function(selected_cols, df) { + # Check that all of selected_columns are in df - base::setdiff(A, B) = A[!A %in% B]. 
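+  # The argument order matters: setdiff() is asymmetric, so this returns the
+  # requested columns that are absent from df, not the other way around.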
+ unmatched_cols= base::setdiff(selected_cols, colnames(df)) + + if(length(unmatched_cols) > 0) { + print('The following columns are missing: ') + print(unmatched_cols) return(FALSE) } else { return(TRUE) @@ -25,28 +30,32 @@ validate_columns_exist= function(selected_columns, df) { join_sample_meta= function(df, sample_meta, key_cols) { # Validation: Check that key_cols are present in df ---- if(validate_columns_exist(key_cols, df) == FALSE) { - print(key_cols) stop('Not all key_cols (printed above) are present in the provided data frame.') } # Validation: Check that key_cols are present in the sample meta ---- if(validate_columns_exist(key_cols, sample_meta) == FALSE) { - print(key_cols) stop('Not all key_cols (printed above) are present in the sample meta.') } # Collapse the sample meta using key_cols and join onto the input df ---- + # Collapse unique values into a single row and then filter out columns with the separator. + # Columns with only one unique value in a group are selected. collapsed_metadata= sample_meta %>% dplyr::group_by(pick(all_of(key_cols))) %>% - dplyr::summarise(across(everything(), function(x) paste(sort(unique(x)), collapse= ' | '))) %>% dplyr::ungroup() + dplyr::summarise(across(everything(), function(x) paste(sort(unique(x)), collapse= ':::'))) %>% dplyr::ungroup() %>% + dplyr::select(all_of(key_cols), where(function(x) base::any(!grepl(':::', x)))) - expanded_df= df %>% dplyr::left_join(collapsed_metadata, by= key_cols, relationship='many-to-one') + expanded_df= dplyr::left_join(df, collapsed_metadata, + by= base::intersect(colnames(df), colnames(collapsed_metadata)), + relationship='many-to-one') - # Validation: Check for duplicate columns ---- - duplicate_columns= setdiff(c(colnames(df), colnames(sample_meta)), colnames(expanded_df)) - if(length(duplicate_columns > 0)) { - print("WARNING: The following column(s) appear in the dataframe and the sample meta, but not in key_cols.") - print(duplicate_columns) - print('The columns(s) thus appear twice in the output dataframe.') + # Print out the sample meta columns that were added to the dataframe ---- + added_cols= base::setdiff(colnames(expanded_df), colnames(df)) + if(length(added_cols > 0)) { + print('The following columns from the sample meta were added:') + print(added_cols) + } else { + print('No additional columns from the sample meta were added.') } return(expanded_df) From bbb2e66218f31f1556687a4f8d752b8ac2503ab2 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Thu, 22 Aug 2024 20:27:58 -0400 Subject: [PATCH 017/127] Allow for ctrl_cols to exclude sig_cols --- scripts/src/compute_l2fc.R | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/scripts/src/compute_l2fc.R b/scripts/src/compute_l2fc.R index eeadafb3..f89d9797 100755 --- a/scripts/src/compute_l2fc.R +++ b/scripts/src/compute_l2fc.R @@ -51,11 +51,6 @@ compute_l2fc= function(normalized_counts, stop('Not all cell_line_cols (printed above) are present in normalized_counts.') } - # Validation: Check that ctrl_cols are in sig_cols ---- - if(!all(ctrl_cols %in% sig_cols)) { - stop('Control columns are not a subset of sig columns.') - } - # Collapsing technical replicates ---- # Detect bio_rep column to be used to collapse technical replicates if('bio_rep' %in% colnames(normalized_counts)) { @@ -67,10 +62,11 @@ compute_l2fc= function(normalized_counts, } # collapse tech reps - print('Collapsing technical replicates ...') + print('Collapsing technical replicates on the following columns: ') + print(unique(c(cell_line_cols, 
'trt_type', bio_rep_id_cols, ctrl_cols))) collapsed_tech_rep= normalized_counts %>% dplyr::filter(!(trt_type %in% c("empty", "", "CB_only")) & !is.na(trt_type), !is.na(CCLE_name)) %>% - dplyr::group_by(pick(all_of(c(cell_line_cols, 'trt_type', bio_rep_id_cols)))) %>% + dplyr::group_by(pick(all_of(c(cell_line_cols, 'trt_type', bio_rep_id_cols, ctrl_cols)))) %>% dplyr::summarise(mean_n= mean(n), mean_normalized_n = mean(!!rlang::sym(count_col_name)), num_tech_reps= dplyr::n()) %>% dplyr::ungroup() @@ -81,6 +77,8 @@ compute_l2fc= function(normalized_counts, dplyr::summarise(count= dplyr::n()) %>% dplyr::ungroup()) # Pull out negative controls and collapse any biological replicates ---- + print('Collapsing control conditions on the following columns: ') + print(unique(c(cell_line_cols, ctrl_cols))) controls= collapsed_tech_rep %>% dplyr::filter(trt_type== control_type) %>% dplyr::group_by(pick(all_of(c(cell_line_cols, ctrl_cols)))) %>% dplyr::summarise(control_median_n= median(mean_n), From abdda0c29358c048f8b52c18a5ac4cb2e8630848 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 23 Aug 2024 10:08:31 -0400 Subject: [PATCH 018/127] Add cell set meta --- scripts/joining_sample_meta_columns.R | 10 ++++++---- scripts/src/join_sample_meta.R | 21 ++++++++++++++++----- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/scripts/joining_sample_meta_columns.R b/scripts/joining_sample_meta_columns.R index eb39d84f..50b1e003 100644 --- a/scripts/joining_sample_meta_columns.R +++ b/scripts/joining_sample_meta_columns.R @@ -6,6 +6,7 @@ source("./src/join_sample_meta.R") parser <- ArgumentParser() # specify our desired options parser$add_argument('--sample_meta', default= 'sample_meta.csv', help= 'Sample meta data for the sequencing run.') +parser$add_argument('--cell_set_meta', default= 'cell_set_meta.csv', help= 'Cell set metadata for the sequencing run.') parser$add_argument('--l2fc', default= 'l2fc.csv', help= 'L2FC data.') # level 4 parser$add_argument('--collapsed_l2fc', default= 'collapsed_l2fc.csv', help= 'Collapsed l2fc data.') # level 5 parser$add_argument('--sig_cols', default= 'cell_set,treatment,dose,dose_unit,day', @@ -21,6 +22,7 @@ if (args$out == "") { # Prepare args ---- sample_meta= data.table::fread(args$sample_meta, header= T, sep= ',') +cell_set_meta= data.table::fread(args$cell_set_meta, header= T, sep= ',') sig_cols= unlist(strsplit(args$sig_cols, ",")) # Add in metadata for l2fc file ---- @@ -32,7 +34,7 @@ if(file.exists(args$l2fc)) { input_cols= sig_cols print('WARNING: No "bio_rep" column detected. 
Proceeding with just sig_cols.')
   }
-  l2fc_with_sm= join_sample_meta(df= l2fc, sample_meta, key_cols= input_cols)
+  l2fc_with_sm= join_sample_meta(df= l2fc, sample_meta, cell_set_meta, key_cols= input_cols)
 
   # Write out
   outpath= paste(args$out, 'l2fc_with_sm.csv', sep='/')
@@ -46,11 +48,11 @@ if(file.exists(args$l2fc)) {
 # Add in metadata for collapsed_l2fc file ----
 if(file.exists(args$collapsed_l2fc)) {
   collapsed_l2fc= data.table::fread(args$collapsed_l2fc, header= T, sep= ',')
-  collapsed_l2fc_with_sm= join_sample_meta(df= collapsed_l2fc, sample_meta, key_cols= sig_cols)
+  collapsed_l2fc_with_sm= join_sample_meta(df= collapsed_l2fc, sample_meta, cell_set_meta, key_cols= sig_cols)
 
   # Write out
-  outpath= paste(args$out, 'collapsed_l2fc_with_sm.csv', sep='/')
-  print(paste("Writing collapsed_l2fc_with_sm.csv to ", outpath))
+  outpath= paste(args$out, 'collapsed_l2fc_with_metadata.csv', sep='/')
+  print(paste("Writing collapsed_l2fc_with_metadata.csv to ", outpath))
   write.csv(collapsed_l2fc_with_sm, outpath, row.names= FALSE, quote= FALSE)
 } else {
   print('WARNING: collapsed_l2fc.csv does not exist. Skipping this file.')
 }
diff --git a/scripts/src/join_sample_meta.R b/scripts/src/join_sample_meta.R
index 13f66b30..9a20e931 100644
--- a/scripts/src/join_sample_meta.R
+++ b/scripts/src/join_sample_meta.R
@@ -23,11 +23,12 @@ validate_columns_exist= function(selected_cols, df) {
 #' join_sample_meta
 #'
 #' Joins a given data frame with the sample meta.
 #'
-#' @param df
-#' @param sample_meta Dataframe of the sample meta used in the run
+#' @param df Input dataframe that should contain the columns specified in the "key_cols" parameter and "cell_set".
+#' @param sample_meta Dataframe of the sample meta used in the run.
+#' @param cell_set_meta Dataframe of the cell set metadata used in the run. This should contain the "cell_set" column.
 #' @param key_cols Vector of column names used as identifiers in the sample meta.
-#' @returns Data frame with additional columns from the sample meta
-join_sample_meta= function(df, sample_meta, key_cols) {
+#' @returns Data frame with additional columns from the sample meta.
+join_sample_meta= function(df, sample_meta, cell_set_meta, key_cols) {
   # Validation: Check that key_cols are present in df ----
   if(validate_columns_exist(key_cols, df) == FALSE) {
     stop('Not all key_cols (printed above) are present in the provided data frame.')
@@ -38,10 +39,20 @@ join_sample_meta= function(df, sample_meta, key_cols) {
     stop('Not all key_cols (printed above) are present in the sample meta.')
   }
 
+  # Validation: Check that cell_set exists in df and cell_set meta ----
+  if(validate_columns_exist(c('cell_set'), sample_meta) == FALSE) {
+    stop('The cell_set column is NOT present in the sample meta.')
+  }
+
+  if(validate_columns_exist(c('cell_set'), cell_set_meta) == FALSE) {
+    stop('The cell_set column is NOT present in the cell set meta.')
+  }
+
   # Collapse the sample meta using key_cols and join onto the input df ----
   # Collapse unique values into a single row and then filter out columns with the separator.
   # Columns with only one unique value in a group are selected. 
-  collapsed_metadata= sample_meta %>% dplyr::group_by(pick(all_of(key_cols))) %>%
+  collapsed_metadata= sample_meta %>% dplyr::left_join(cell_set_meta, by= 'cell_set') %>%
+    dplyr::group_by(pick(all_of(key_cols))) %>%
     dplyr::summarise(across(everything(), function(x) paste(sort(unique(x)), collapse= ':::'))) %>% dplyr::ungroup() %>%
     dplyr::select(all_of(key_cols), where(function(x) base::any(!grepl(':::', x))))
 
From f2ed2d7fff18bf6dc35a3a11e2fe4f8bb58887d6 Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Mon, 26 Aug 2024 14:37:20 -0400
Subject: [PATCH 019/127] Updates from podman PR comments

---
 scripts/src/QC_images.R | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R
index a8ec5bac..2b6afc1d 100755
--- a/scripts/src/QC_images.R
+++ b/scripts/src/QC_images.R
@@ -427,8 +427,8 @@ QC_images= function(raw_counts_uncollapsed, raw_counts,
   }
 
   # Do the same for index 2.
-  # Reverse index 2 barcodes if needed.
-  if(reverse_index2) {
+  # Reverse index 2 barcodes if it is indicated and if "index_2" exists
+  if(reverse_index2 & 'index_2' %in% colnames(sample_meta)) {
     print("Reverse-complementing index 2 barcode.")
     sample_meta$index_2= chartr("ATGC", "TACG", stringi::stri_reverse(sample_meta$index_2))
   }
@@ -485,8 +485,8 @@ QC_images= function(raw_counts_uncollapsed, raw_counts,
     skipped_qcs = c(skipped_qcs, potential_error)
   }
 
-  ## Contaminants ----
-  print('5. Generating contaminate cell lines ...')
+  ## Cell line contaminants ----
+  print('5. Generating cell line contaminants ...')
   potential_error= base::tryCatch({
     contams= annotated_counts %>% dplyr::filter(expected_read==F) %>%
       dplyr::mutate(barcode_id= ifelse(is.na(CCLE_name), Name, CCLE_name)) %>%
@@ -507,14 +507,14 @@ QC_images= function(raw_counts_uncollapsed, raw_counts,
     skipped_qcs = c(skipped_qcs, potential_error)
   }
 
-  ## Contaminates for ursula ----
-  print('6. Generating contaminate reads for Ursula ...')
+  ## Contaminant reads ----
+  print('6. Generating contaminant reads ...')
   potential_error= base::tryCatch({
     pcr_locations= c('pcr_plate', 'pcr_well')
 
     # Validation: Check that the PCR columns are present in raw_counts.
     if(!validate_columns_exist(pcr_locations, raw_counts)) {
-      stop('pcr_plate and pcr_well are required raw_counts.csv for this to work.')
+      stop('pcr_plate and pcr_well are required in raw_counts.csv for this to work.')
     }
 
     # count number of wells a cell_set appears in.
@@ -568,8 +568,8 @@ QC_images= function(raw_counts_uncollapsed, raw_counts,
     contam_reads %>% write.csv(paste0(out, 'contam_reads.csv'), row.names=F)
   }, error= function(e) {
     print(e)
-    print('Encountered an error when creating the contams for UW file. Skipping this output ...')
-    return('contam for UW')
+    print('Encountered an error when creating the contam reads file. 
Skipping this output ...')
+    return('contam reads')
  })
 
  # Collect returned string if an error occurred
From 02fb15d90f27c7add90ecfddcf5799d50bbc11a9 Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Mon, 26 Aug 2024 17:34:14 -0400
Subject: [PATCH 020/127] Created function for cdf plot

---
 scripts/src/QC_images.R | 143 +++++++++++++++++++++++++++------------
 1 file changed, 98 insertions(+), 45 deletions(-)

diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R
index 2b6afc1d..67528939 100755
--- a/scripts/src/QC_images.R
+++ b/scripts/src/QC_images.R
@@ -180,6 +180,91 @@ create_recovery_barplot= function(filtered_counts, id_cols, facet_col= NA, value
   return(recov_plot)
 }
 
+#' Cumulative reads plot
+#'
+#' Creates a line plot of the cumulative reads.
+#'
+#' @param input_df Input dataframe. Usually is the filtered_counts dataframe.
+#' @param id_cols Vector of column names that identify every PCR well.
+#' @param counts_col Name of the column that contains the values. Defaults to "n".
+#' @param mark1 Fraction of reads (between 0 and 1) to mark. Draws a line at the specified fraction to indicate
+#'              the number of cell lines needed to reach this fraction of reads. Defaults to 0.5.
+#' @param mark2 Fraction of reads (between 0 and 1) to mark. Draws a line at the specified fraction to indicate
+#'              the number of cell lines needed to reach this fraction of reads. This parameter should be
+#'              greater than the value specified for "mark1". Defaults to 0.95.
+#' @param contains_cbs Boolean. If control barcodes are used, this can be set to TRUE so that points
+#'                    corresponding to the control barcodes will be colored on the plot. Defaults to FALSE.
+#' @param order_aucs Boolean, when there are multiple facets, this can be set to TRUE to sort the facets by
+#'                    the AUC value. The AUCs will be sorted in descending order. Defaults to FALSE.
+#' @returns Returns a ggplot object.
+create_cdf_plot= function(input_df, id_cols, counts_col= 'n', mark1= 0.5, mark2= 0.95,
+                          contains_cbs= FALSE, order_aucs= FALSE) {
+  # Validation: Check that id_cols and counts_col are in the input dataframe.
+  if(!validate_columns_exist(c(id_cols, counts_col), input_df)) {
+    stop('Some input columns were not detected in the cdf input dataframe.')
+  }
+
+  # Determine percentages, ranks and cumulative percentages
+  calc_cummulative= input_df %>% dplyr::group_by(pick(all_of(id_cols))) %>%
+    dplyr::arrange(dplyr::desc(.data[[counts_col]])) %>%
+    dplyr::mutate(expected_num_cls= dplyr::n(),
+                  total_counts= sum(.data[[counts_col]]), pct_counts= .data[[counts_col]]/total_counts,
+                  cum_pct= cumsum(pct_counts),
+                  rank= row_number(), rank_pct= rank/expected_num_cls) %>% dplyr::ungroup()
+
+  # Validation: mark1 should be less than mark2. 
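+  # Both marks are fractions of the total reads, so each must fall within [0, 1].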
+  if(mark1 > mark2 | mark1 < 0 | mark2 > 1) {
+    stop('Mark values must be between 0 and 1, and mark1 must be less than mark2.')
+  }
+
+  # Find the number of cell lines needed to reach mark1 and mark2
+  mark1_values= calc_cummulative %>% dplyr::filter(cum_pct >= mark1) %>%
+    dplyr::group_by(pick(all_of(id_cols))) %>% dplyr::arrange(cum_pct) %>%
+    dplyr::filter(row_number() == 1) %>% dplyr::ungroup() %>%
+    dplyr::select(all_of(id_cols), rank_pct= rank_pct, mark1_rank= rank, mark1_loc= rank_pct)
+  mark2_values= calc_cummulative %>% dplyr::group_by(pick(all_of(id_cols))) %>%
+    dplyr::mutate(auc= sum(cum_pct * (1 / expected_num_cls))) %>% # calculate AUCs
+    dplyr::filter(cum_pct >= mark2) %>% dplyr::arrange(cum_pct) %>%
+    dplyr::filter(row_number() == 1) %>% dplyr::ungroup() %>%
+    dplyr::select(all_of(id_cols), rank_pct= rank_pct, mark2_rank= rank, mark2_loc= rank_pct, auc)
+
+  # Create cdf plot
+  data_for_plot= calc_cummulative %>%
+    dplyr::left_join(mark1_values, by= c(id_cols, 'rank_pct')) %>%
+    dplyr::left_join(mark2_values, by= c(id_cols, 'rank_pct')) %>%
+    tidyr::unite(all_of(id_cols), col= 'facet_name', sep= ':', remove= TRUE, na.rm= FALSE)
+
+  # Reorder by aucs if specified
+  if(order_aucs) {
+    data_for_plot= data_for_plot %>% dplyr::arrange(dplyr::desc(auc)) %>%
+      dplyr::mutate(facet_name= base::factor(facet_name, levels= unique(facet_name)))
+  }
+
+  # Create plot
+  output_plot= data_for_plot %>%
+    ggplot(aes(x= rank_pct, y=cum_pct)) +
+    # Color control barcodes if specified
+    { if(contains_cbs) geom_point(. %>% dplyr::filter(!is.na(Name)),
+                                  mapping= aes(x= rank_pct, y=cum_pct, color=reorder(Name, log2_dose)), size= 2) } +
+    geom_line(color='black') +
+    # point for mark1 of counts
+    geom_segment(aes(x= -Inf , y= mark1, xend= mark1_loc, yend = mark1), color= 'black', linetype= 2) +
+    geom_segment(aes(x= mark1_loc, y= -Inf, xend = mark1_loc, yend = mark1), color= 'black', linetype= 2) +
+    geom_label(aes(x= mark1_loc, y= 0.25, label= mark1_rank), hjust= 0, color= 'black') +
+    # point for mark2 of counts
+    geom_segment(aes(x= -Inf , y= mark2, xend= mark2_loc, yend= mark2), color= 'black', linetype= 2) +
+    geom_segment(aes(x= mark2_loc, y= -Inf, xend= mark2_loc, yend= mark2), color= 'black', linetype= 2) +
+    geom_label(aes(x= mark2_loc, y= 0.75, label= mark2_rank), hjust= 0, color= 'black') +
+    # label for AUC
+    #geom_label(aes(x= mark2_loc, y= 0.1, label= paste0('AUC ', round(auc, 3))), hjust= 'inward', color= 'black') +
+    geom_label(. %>% dplyr::filter(!is.na(auc)), mapping= aes(label= paste0('AUC ', round(auc, 3))),
+               x= 1, y= 0, hjust= 'inward', vjust= 'inward', color= 'black') +
+    facet_wrap(~facet_name) +
+    labs(x='% rank of unique reads', y='Cumulative percentage', color= 'CBs') + theme_bw()
+
+  return(output_plot)
+}
+
 #' Control barcode scatter plot
 #'
 #' Creates a scatter plot of the control barcodes.
@@ -373,10 +458,11 @@ QC_images= function(raw_counts_uncollapsed, raw_counts,
                     id_cols= c('pcr_plate', 'pcr_well'), sig_cols,
                     control_type= 'negcon', count_threshold= 40,
                     reverse_index2= FALSE, out = NA) {
-  require(tidyverse)
-  require(magrittr)
-  require(reshape2)
-  require(scales)
+  library(tidyverse)
+  library(magrittr)
+  library(reshape2)
+  library(scales)
+  library(WGCNA)
 
   if(is.na(out)) {
     out = getwd()
 
   ## Cumulative counts by lines in negcons ----
-  print("7. Generating cummulative image ...")
+  print("7. 
Generating cumulative image ...") potential_error= base::tryCatch({ - cdf= filtered_counts %>% dplyr::filter(trt_type == control_type) %>% - dplyr::left_join(num_cls_in_set, by= "cell_set") %>% - dplyr::mutate(expected_num_cl= ifelse(control_barcodes, expected_num_cl + length(unique(CB_meta$Name)), - expected_num_cl)) %>% # add CBs to expected_num_cl if there are CBs - tidyr::unite(all_of(id_cols), col= 'profile_id', sep= ':', remove= FALSE) %>% - dplyr::group_by(pcr_plate, pcr_well, profile_id, expected_num_cl) %>% - dplyr::mutate(total_counts= sum(n), pct_counts= n/total_counts,) %>% dplyr::arrange(-n) %>% - dplyr::mutate(cum_pct= cumsum(pct_counts), rank= row_number(), - rank_pct= rank/expected_num_cl) %>% dplyr::ungroup() - - # additional tables - mark50= cdf %>% dplyr::filter(cum_pct >= 0.5) %>% dplyr::group_by(profile_id) %>% - arrange(cum_pct) %>% dplyr::filter(row_number()==1) %>% ungroup() %>% - dplyr::select(profile_id, rank_pct= rank_pct, num50= rank, num50_loc= rank_pct) - mark95= cdf %>% dplyr::group_by(profile_id) %>% - dplyr::mutate(auc= sum(cum_pct*(1/expected_num_cl))) %>% # calculate AUCs - dplyr::filter(cum_pct >= 0.95) %>% - arrange(cum_pct) %>% dplyr::filter(row_number() ==1) %>% ungroup() %>% - dplyr::select(profile_id, rank_pct= rank_pct, num95= rank, num95_loc= rank_pct, auc) - - cdf_plot= cdf %>% - merge(mark50, by= c('profile_id', 'rank_pct'), all.x=T) %>% - merge(mark95, by= c('profile_id', 'rank_pct'), all.x= T) %>% - ggplot(aes(x= rank_pct, y=cum_pct)) + - { if(contains_cbs) geom_point(. %>% dplyr::filter(!is.na(Name)), - mapping=aes(x= rank_pct, y=cum_pct, color=reorder(Name, log2_dose)), size=3) } + - geom_line(color='black') + - # point for 50% of counts - geom_segment(aes(x= -Inf , y= .50, xend = num50_loc, yend = .50), color= 'black', linetype='dashed') + - geom_segment(aes(x= num50_loc, y= -Inf, xend = num50_loc, yend = .50), color= 'black', linetype='dashed') + - geom_label(aes(x=num50_loc, y= .25, label= num50), hjust= 0, color= 'black') + - # point for 95% of counts - geom_segment(aes(x= -Inf , y= .95, xend = num95_loc, yend = .95), color= 'black', linetype='dashed') + - geom_segment(aes(x= num95_loc, y= -Inf, xend = num95_loc, yend = .95), color= 'black', linetype='dashed') + - geom_label(aes(x=num95_loc, y= .75, label= num95), hjust= 0, color= 'black') + - # label for AUC - geom_label(aes(x=num95_loc, y= .25, label= paste0('AUC ', round(auc,3))), hjust= 'inward', color= 'black') + - facet_wrap(~profile_id) + - labs(x='% rank of unique reads', y='Cumulative percentage', color= 'CBs') + theme_bw() + cdf_plot= create_cdf_plot(filtered_counts %>% dplyr::filter(trt_type == 'control_type'), + id_cols= id_cols, + counts_col= 'n', + mark1= 0.5, mark2= 0.95, + contains_cbs= contains_cbs, order_aucs= TRUE) + + labs(title= 'Cumulative reads in negative controls.') pdf(file=paste(out, "cdf_plot.pdf", sep="/"), width=sqrt(num_profiles)*2, height=sqrt(num_profiles)) print(cdf_plot) dev.off() - rm(cdf, mark50, mark95, cdf_plot) + rm(cdf_plot) }, error= function(e) { print(e) print('Encountered an error when creating the cdf plot. 
Skipping this output ...') From c1890e546d0ea8469f6a2b5fd6c6a7dc10343f1f Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Tue, 27 Aug 2024 11:18:24 -0400 Subject: [PATCH 021/127] Fixed typo --- scripts/filter_counts.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/filter_counts.R b/scripts/filter_counts.R index 62b54e2c..6fabf610 100755 --- a/scripts/filter_counts.R +++ b/scripts/filter_counts.R @@ -10,7 +10,7 @@ suppressPackageStartupMessages(library(sets)) suppressPackageStartupMessages(library(tidyverse)) # load last - after dplyr source("./src/filter_raw_reads.R") -# Arguement parser ---- +# Argument parser ---- parser <- ArgumentParser() # specify desired options parser$add_argument("-v", "--verbose", action="store_true", default=TRUE, From db92b42b6d03b706241bc12deed757da57d01712 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Tue, 27 Aug 2024 11:18:57 -0400 Subject: [PATCH 022/127] Renamed file --- scripts/src/join_metadata.R | 75 ++++++++++++++++++++++++++++++++++ scripts/src/join_sample_meta.R | 73 --------------------------------- 2 files changed, 75 insertions(+), 73 deletions(-) create mode 100644 scripts/src/join_metadata.R delete mode 100644 scripts/src/join_sample_meta.R diff --git a/scripts/src/join_metadata.R b/scripts/src/join_metadata.R new file mode 100644 index 00000000..3842054c --- /dev/null +++ b/scripts/src/join_metadata.R @@ -0,0 +1,75 @@ +#' validate_columns_exist +#' +#' This function checks that a list of columns are present in a dataframe. +#' Columns that were not found in the dataframe are printed out. +#' +#' @param selected_columns A vector of strings each representing a column name +#' @param df A dataframe to check against +#' @return Boolean +validate_columns_exist= function(selected_cols, df) { + # Check that all of selected_columns are in df - base::setdiff(A, B) = A[!A %in% B]. + unmatched_cols= base::setdiff(selected_cols, colnames(df)) + + if(length(unmatched_cols) > 0) { + print('The following columns are missing: ') + print(unmatched_cols) + return(FALSE) + } else { + return(TRUE) + } +} + +#' Join metadata +#' +#' Joins a given data frame with the sample meta. +#' +#' @param input_df Input dataframe that should contain the columns specified in the "key_cols" parameter and "cell_set". +#' @param metadata Dataframe of the sample meta used in the run. +#' @param key_cols Vector of column names used as identifiers in the sample meta. +#' @returns Data frame with additional columns from the sample meta. +join_metadata= function(input_df, metadata, key_cols) { + # Validation: Check that key_cols are present in df ---- + if(validate_columns_exist(key_cols, input_df) == FALSE) { + stop('Not all key_cols (printed above) are present in the provided dataframe.') + } + + # Validation: Check that key_cols are present in the sample meta ---- + if(validate_columns_exist(key_cols, metadata) == FALSE) { + stop('Not all key_cols (printed above) are present in the provided metadata.') + } + + # Collapse the sample meta using key_cols and join onto the input df ---- + # Collapse unique values into a single row and then filter out columns with the separator. + # Columns with only one unique value in a group are selected. 
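+  # ':::' acts as a sentinel separator: a column whose groups each collapsed to a
+  # single unique value never contains it, while multi-valued columns do and are
+  # dropped by the select() below.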
+  collapsed_metadata= metadata %>% dplyr::group_by(pick(all_of(key_cols))) %>%
+    dplyr::summarise(across(everything(), function(x) paste(sort(unique(x)), collapse= ':::'))) %>% dplyr::ungroup() %>%
+    dplyr::select(all_of(key_cols), where(function(x) base::any(!grepl(':::', x))))
+
+  # Join using the key_cols, drop any columns that were duplicated.
+  output_df= dplyr::left_join(input_df, collapsed_metadata, by= key_cols,
+                              suffix= c('', '.y'), relationship='many-to-one') %>%
+    dplyr::select(-tidyselect::ends_with('.y'))
+
+  # Validation: Check that merge did not explode ----
+  print(paste0(' Input df rows: ', nrow(input_df)))
+  print(paste0('Output df rows: ', nrow(output_df)))
+  if(nrow(input_df) < nrow(output_df)) {
+    stop('Metadata join is producing more rows than expected!')
+  } else if(nrow(input_df) > nrow(output_df)) {
+    stop('Metadata join is dropping some rows!')
+  } else {}
+
+  # Print out the sample meta columns that were added to the dataframe ----
+  added_cols= base::setdiff(colnames(output_df), colnames(input_df))
+  if(length(added_cols > 0)) {
+    print(paste0('The following ', length(added_cols), ' column(s) were added:'))
+    print(added_cols)
+    print(paste0('The following ', length(metadata) - length(added_cols) - length(key_cols),
+                 ' column(s) from the metadata were not added. They may already exist in the dataframe.'))
+    print(base::setdiff(colnames(metadata), c(added_cols, key_cols)))
+  } else {
+    print('No additional columns from the metadata were added.')
+  }
+
+  return(output_df)
+}
diff --git a/scripts/src/join_sample_meta.R b/scripts/src/join_sample_meta.R
deleted file mode 100644
index 9a20e931..00000000
--- a/scripts/src/join_sample_meta.R
+++ /dev/null
@@ -1,73 +0,0 @@
-#' validate_columns_exist
-#'
-#' This function checks that a list of columns are present in a dataframe.
-#' Columns that were not found in the dataframe are printed out.
-#'
-#' @param selected_columns A vector of strings each representing a column name
-#' @param df A dataframe to check against
-#' @return Boolean
-validate_columns_exist= function(selected_cols, df) {
-  # Check that all of selected_columns are in df - base::setdiff(A, B) = A[!A %in% B].
-  unmatched_cols= base::setdiff(selected_cols, colnames(df))
-
-  if(length(unmatched_cols) > 0) {
-    print('The following columns are missing: ')
-    print(unmatched_cols)
-    return(FALSE)
-  } else {
-    return(TRUE)
-  }
-}
-
-#' join_sample_meta
-#'
-#' Joins a given data frame with the sample meta.
-#'
-#' @param df Input dataframe that should contain the columns specified in the "key_cols" parameter and "cell_set".
-#' @param sample_meta Dataframe of the sample meta used in the run.
-#' @param cell_set_meta Dataframe of the cell set metadata used in the run. This should contain the "cell_set" column.
-#' @param key_cols Vector of column names used as identifiers in the sample meta.
-#' @returns Data frame with additional columns from the sample meta. 
-join_sample_meta= function(df, sample_meta, cell_set_meta, key_cols) { - # Validation: Check that key_cols are present in df ---- - if(validate_columns_exist(key_cols, df) == FALSE) { - stop('Not all key_cols (printed above) are present in the provided data frame.') - } - - # Validation: Check that key_cols are present in the sample meta ---- - if(validate_columns_exist(key_cols, sample_meta) == FALSE) { - stop('Not all key_cols (printed above) are present in the sample meta.') - } - - # Validation: Check that cell_set exists in df and cell_set meta ---- - if(validate_columns_exist(c('cell_set'), sample_meta) == FALSE) { - stop('The cell_set column is NOT present in the sample meta.') - } - - if(validate_columns_exist(c('cell_set'), cell_set_meta) == FALSE) { - stop('The cell_set column is NOT present in the cell set meta.') - } - - # Collapse the sample meta using key_cols and join onto the input df ---- - # Collapse unique values into a single row and then filter out columns with the separator. - # Columns with only one unique value in a group are selected. - collapsed_metadata= sample_meta %>% dplyr::left_join(cell_set_meta, by= 'cell_set') %>% - dplyr::group_by(pick(all_of(key_cols))) %>% - dplyr::summarise(across(everything(), function(x) paste(sort(unique(x)), collapse= ':::'))) %>% dplyr::ungroup() %>% - dplyr::select(all_of(key_cols), where(function(x) base::any(!grepl(':::', x)))) - - expanded_df= dplyr::left_join(df, collapsed_metadata, - by= base::intersect(colnames(df), colnames(collapsed_metadata)), - relationship='many-to-one') - - # Print out the sample meta columns that were added to the dataframe ---- - added_cols= base::setdiff(colnames(expanded_df), colnames(df)) - if(length(added_cols > 0)) { - print('The following columns from the sample meta were added:') - print(added_cols) - } else { - print('No additional columns from the sample meta were added.') - } - - return(expanded_df) -} \ No newline at end of file From 94e833cd89d73e23acd7661c5904380495bb9014 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Tue, 27 Aug 2024 11:19:43 -0400 Subject: [PATCH 023/127] Rename file Call function twice to join sample meta and assay pool meta. 
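A rough sketch of the intended call pattern (the file names and key columns below
are just the script's defaults, not values from a real run):

    sample_meta= data.table::fread('sample_meta.csv', header= TRUE, sep= ',')
    l2fc= data.table::fread('l2fc.csv', header= TRUE, sep= ',')
    # Collapse the sample meta over the signature columns and join it onto l2fc.
    l2fc_with_meta_columns= join_metadata(input_df= l2fc,
                                          metadata= sample_meta,
                                          key_cols= c('cell_set', 'treatment', 'dose', 'dose_unit', 'day'))
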
---
 scripts/join_metadata.R               | 93 ++++++++++++++++++++++++++++
 scripts/joining_sample_meta_columns.R | 60 -----------------------
 2 files changed, 93 insertions(+), 60 deletions(-)
 create mode 100644 scripts/join_metadata.R
 delete mode 100644 scripts/joining_sample_meta_columns.R

diff --git a/scripts/join_metadata.R b/scripts/join_metadata.R
new file mode 100644
index 00000000..8d603b7c
--- /dev/null
+++ b/scripts/join_metadata.R
@@ -0,0 +1,93 @@
+library(argparse)
+library(tidyverse)
+source("./src/join_sample_meta.R")
+
+# Argument parser ----
+parser <- ArgumentParser()
+# specify our desired options
+parser$add_argument('--sample_meta', default= 'sample_meta.csv', help= 'Sample meta data for the sequencing run.')
+parser$add_argument("--assay_pool_meta", default="assay_pool_meta.txt", help = "Assay pool metadata")
+parser$add_argument('--l2fc', default= 'l2fc.csv', help= 'L2FC data.') # level 4
+parser$add_argument('--collapsed_l2fc', default= 'collapsed_l2fc.csv', help= 'Collapsed l2fc data.') # level 5
+parser$add_argument('--sig_cols', default= 'cell_set,treatment,dose,dose_unit,day',
+                    help= 'Columns that uniquely identify a condition.')
+parser$add_argument('--out', default= getwd(), help= 'Path to the output directory.')
+
+args <- parser$parse_args()
+
+# set output to working directory if none is specified
+if (args$out == "") {
+  args$out = args$wkdir
+}
+
+# Read in files and prepare some parameters ----
+sample_meta= data.table::fread(args$sample_meta, header= T, sep= ',')
+sig_cols= unlist(strsplit(args$sig_cols, ","))
+
+# For assay pool meta, check if it exists. If so, then filter it for relevant cell_sets/davepool_ids
+# and select and rename some columns.
+assay_pool_meta_exists= FALSE
+if(file.exists(args$assay_pool_meta)) {
+  assay_pool_meta_exists= TRUE # Update boolean
+
+  # Read in assay pool meta and transform the table into something more usable.
+  assay_pool_meta= read.delim(args$assay_pool_meta)
+  unique_cell_sets= unique(sample_meta$cell_set[sample_meta$cell_set != ""])
+  input_assay_pool_meta= assay_pool_meta %>% dplyr::filter(davepool_id %in% unique_cell_sets) %>%
+    dplyr::select(DepMap_ID= depmap_id, CCLE_name= ccle_name, cell_set= davepool_id, pool_id)
+}
+
+# Add sample meta and assay pool meta to l2fc table ----
+if(file.exists(args$l2fc)) {
+  l2fc= data.table::fread(args$l2fc, header= T, sep= ',')
+
+  # Add sample meta columns to l2fc
+  if('bio_rep' %in% colnames(sample_meta) & 'bio_rep' %in% colnames(l2fc)) {
+    input_cols= c(sig_cols, 'bio_rep')
+  } else {
+    input_cols= sig_cols
+    print('WARNING: No "bio_rep" column detected. Proceeding with just sig_cols.')
+  }
+  l2fc_with_meta_columns= join_metadata(input_df= l2fc, metadata= sample_meta, key_cols= input_cols)
+
+  # Add assay pool meta columns to l2fc
+  if(assay_pool_meta_exists) {
+    l2fc_with_meta_columns= join_metadata(input_df= l2fc_with_meta_columns,
+                                          metadata= input_assay_pool_meta,
+                                          key_cols= c('DepMap_ID', 'CCLE_name', 'cell_set'))
+  } else {
+    print('WARNING: Assay pool meta not detected and will not be joined onto l2fc.')
+  }
+
+  # Write out
+  outpath= paste(args$out, 'l2fc_with_meta_columns.csv', sep='/')
+  print(paste("Writing l2fc_with_meta_columns.csv to ", outpath))
+  write.csv(l2fc_with_meta_columns, outpath, row.names= FALSE, quote= FALSE)
+} else {
+  print('WARNING: l2fc.csv does not exist. 
Skipping this file.')
+}
+
+# Add sample meta and assay pool meta to collapsed_l2fc table ----
+if(file.exists(args$collapsed_l2fc)) {
+  collapsed_l2fc= data.table::fread(args$collapsed_l2fc, header= T, sep= ',')
+
+  # Add sample meta columns to collapsed l2fc
+  collapsed_l2fc_with_meta_columns= join_metadata(input_df= collapsed_l2fc, metadata= sample_meta,
+                                                  key_cols= sig_cols)
+
+  # Add assay pool meta columns to collapsed l2fc
+  if(assay_pool_meta_exists) {
+    collapsed_l2fc_with_meta_columns= join_metadata(input_df= collapsed_l2fc_with_meta_columns,
+                                                    metadata= input_assay_pool_meta,
+                                                    key_cols= c('DepMap_ID', 'CCLE_name', 'cell_set'))
+  } else {
+    print('WARNING: Assay pool meta not detected and will not be joined onto collapsed l2fc.')
+  }
+
+  # Write out
+  outpath= paste(args$out, 'collapsed_l2fc_with_meta_columns.csv', sep='/')
+  print(paste("Writing collapsed_l2fc_with_meta_columns.csv to ", outpath))
+  write.csv(collapsed_l2fc_with_meta_columns, outpath, row.names= FALSE, quote= FALSE)
+} else {
+  print('WARNING: collapsed_l2fc.csv does not exist. Skipping this file.')
+}
diff --git a/scripts/joining_sample_meta_columns.R b/scripts/joining_sample_meta_columns.R
deleted file mode 100644
index 50b1e003..00000000
--- a/scripts/joining_sample_meta_columns.R
+++ /dev/null
@@ -1,60 +0,0 @@
-library(argparse)
-library(tidyverse)
-source("./src/join_sample_meta.R")
-
-# Argument parser ----
-parser <- ArgumentParser()
-# specify our desired options
-parser$add_argument('--sample_meta', default= 'sample_meta.csv', help= 'Sample meta data for the sequencing run.')
-parser$add_argument('--cell_set_meta', default= 'cell_set_meta.csv', help= 'Cell set metadata for the sequencing run.')
-parser$add_argument('--l2fc', default= 'l2fc.csv', help= 'L2FC data.') # level 4
-parser$add_argument('--collapsed_l2fc', default= 'collapsed_l2fc.csv', help= 'Collapsed l2fc data.') # level 5
-parser$add_argument('--sig_cols', default= 'cell_set,treatment,dose,dose_unit,day',
-                    help= 'Columns that uniquely identify a condition.')
-parser$add_argument('--out', default= getwd(), help= 'Path to the output directory.')
-
-args <- parser$parse_args()
-
-# set output to working directory if none is specified
-if (args$out == "") {
-  args$out = args$wkdir
-}
-
-# Prepare args ----
-sample_meta= data.table::fread(args$sample_meta, header= T, sep= ',')
-cell_set_meta= data.table::fread(args$cell_set_meta, header= T, sep= ',')
-sig_cols= unlist(strsplit(args$sig_cols, ","))
-
-# Add in metadata for l2fc file ----
-if(file.exists(args$l2fc)) {
-  l2fc= data.table::fread(args$l2fc, header= T, sep= ',')
-  if('bio_rep' %in% sample_meta & 'bio_rep' %in% l2fc) {
-    input_cols= c(sig_cols, 'bio_rep')
-  } else {
-    input_cols= sig_cols
-    print('WARNING: No "bio_rep" column detected. Proceeding with just sig_cols.')
-  }
-  l2fc_with_sm= join_sample_meta(df= l2fc, sample_meta, cell_set_meta, key_cols= input_cols)
-
-  # Write out
-  outpath= paste(args$out, 'l2fc_with_sm.csv', sep='/')
-  print(paste("Writing l2fc_with_sm.csv to ", outpath))
-  write.csv(l2fc_with_sm, outpath, row.names= FALSE, quote= FALSE)
-} else {
-  print('WARNING: l2fc.csv does not exist. 
Skipping this file.') -} -# - -# Add in metadata for collapsed_l2fc file ---- -if(file.exists(args$collapsed_l2fc)) { - collapsed_l2fc= data.table::fread(args$collapsed_l2fc, header= T, sep= ',') - collapsed_l2fc_with_sm= join_sample_meta(df= collapsed_l2fc, sample_meta, cell_set_meta, key_cols= sig_cols) - - # Write out - outpath= paste(args$out, 'collapsed_l2fc_with_metadata.csv', sep='/') - print(paste("Writing collapsed_l2fc_with_metadata.csv to ", outpath)) - write.csv(collapsed_l2fc_with_sm, outpath, row.names= FALSE, quote= FALSE) -} else { - print('WARNING: collapsed_l2fc.csv does not exist. Skipping this file.') -} -# From 8287fe251a9b3ccb071c11d97e04e0218b48d57f Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Tue, 27 Aug 2024 11:23:07 -0400 Subject: [PATCH 024/127] Change source due to renaming --- scripts/join_metadata.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/join_metadata.R b/scripts/join_metadata.R index 8d603b7c..5eff5629 100644 --- a/scripts/join_metadata.R +++ b/scripts/join_metadata.R @@ -1,6 +1,6 @@ library(argparse) library(tidyverse) -source("./src/join_sample_meta.R") +source("./src/join_metadata.R") # Argument parser ---- parser <- ArgumentParser() From c98ce0eb0aa39cb28288e597478ff87da69637bd Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Tue, 27 Aug 2024 14:15:14 -0400 Subject: [PATCH 025/127] Removed filtered counts --- scripts/filteredCounts_QC.R | 7 ++----- scripts/src/QC_images.R | 5 ++++- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/filteredCounts_QC.R b/scripts/filteredCounts_QC.R index 372ac869..0dcf74ab 100755 --- a/scripts/filteredCounts_QC.R +++ b/scripts/filteredCounts_QC.R @@ -22,12 +22,11 @@ parser$add_argument("-v", "--verbose", action="store_true", default=TRUE, help=" parser$add_argument("-q", "--quietly", action="store_false", dest="verbose", help="Print little output") parser$add_argument("--wkdir", default=getwd(), help="Working directory") parser$add_argument("-s", "--sample_meta", default="sample_meta.csv", help= "Sample metadata") -parser$add_argument("-c", "--uncollapsed_raw_counts", default="raw_counts_uncollapsed.csv", +parser$add_argument("-c", "--raw_counts_uncollapsed", default="raw_counts_uncollapsed.csv", help="path to file containing uncollapsed raw counts file") parser$add_argument("--raw_counts", default= "raw_counts.csv", help="path to raw counts file") parser$add_argument("--annotated_counts", default= "annotated_counts.csv", help= "path to file containing annotated counts") -parser$add_argument("--filtered_counts", default= "filtered_counts.csv", help= "path to filtered_counts file") parser$add_argument("--normalized_counts", default="normalized_counts.csv", help="path to file containing normalized counts") parser$add_argument("--l2fc", default="l2fc.csv", help= "path to l2fc file") @@ -56,10 +55,9 @@ if (args$out == ""){ # Read in files and pull out parameters ---- # Pipeline outputs -raw_counts_uncollapsed= data.table::fread(args$uncollapsed_raw_counts, header= TRUE, sep= ',') +raw_counts_uncollapsed= data.table::fread(args$raw_counts_uncollapsed, header= TRUE, sep= ',') raw_counts= data.table::fread(args$raw_counts, header= TRUE, sep= ',') annotated_counts= data.table::fread(args$annotated_counts, header= TRUE, sep= ',') -filtered_counts= data.table::fread(args$filtered_counts, header= TRUE, sep= ',') if(file.exists(args$normalized_counts)) { normalized_counts= data.table::fread(args$normalized_counts, header=TRUE, sep=',', data.table=FALSE) } else { @@ -93,7 +91,6 
@@ print("Generating QC images ...") QC_images(raw_counts_uncollapsed= raw_counts_uncollapsed, raw_counts= raw_counts, annotated_counts= annotated_counts, - filtered_counts= filtered_counts, normalized_counts= normalized_counts, l2fc= l2fc, sample_meta= sample_meta, diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R index 67528939..2867a2a1 100755 --- a/scripts/src/QC_images.R +++ b/scripts/src/QC_images.R @@ -323,6 +323,7 @@ create_ctrlBC_scatterplots= function(normalized_counts, id_cols, value_col= 'log #' @import tidyverse #' @import WGCNA #' @import reshape2 +#' @import scales #' @param input_df Dataframe. #' @param row_id_cols Vector of column names from input_df that identifies the cell lines. For example, #' this can be "DepMap_ID", "CCLE_name" if only cell lines exist. It can also be @@ -452,7 +453,7 @@ create_replicate_scatterplots= function(input_df, cell_line_cols, replicate_grou #' @param out Path to the directory to save the QC images. #' @returns NA. QC images are written out to the specified folder. QC_images= function(raw_counts_uncollapsed, raw_counts, - annotated_counts, filtered_counts, normalized_counts= NA, l2fc, + annotated_counts, normalized_counts= NA, l2fc, sample_meta, CB_meta, cell_set_meta, cell_line_cols, id_cols= c('pcr_plate', 'pcr_well'), sig_cols, @@ -490,6 +491,8 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, num_cls_in_set %<>% dplyr::mutate(expected_num_cl= str_split(members, ';')) %>% tidyr::unnest(cols= expected_num_cl) %>% dplyr::group_by(cell_set) %>% dplyr::summarize(expected_num_cl= length(unique(expected_num_cl))) %>% dplyr::ungroup() + + filtered_counts= annotated_counts %>% dplyr::filter(expected) # # Sequencing QCs ____________________ ---- From a1053329c9e4a60c235c22b327acccec7c326d43 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Tue, 27 Aug 2024 14:24:20 -0400 Subject: [PATCH 026/127] Added parameters id_cols and reverse_index2 --- scripts/collate_fastq_reads.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/collate_fastq_reads.sh b/scripts/collate_fastq_reads.sh index 9a275420..d33eedfa 100644 --- a/scripts/collate_fastq_reads.sh +++ b/scripts/collate_fastq_reads.sh @@ -77,11 +77,14 @@ PROJECT_DIR=$(dirname "$BUILD_DIR") PROJECT_CODE=$(basename "$PROJECT_DIR") echo Project Code: $PROJECT_CODE +echo REVERSE_INDEX2 is: $REVERSE_INDEX2 args=( --sample_meta "$SAMPLE_META" --out "$BUILD_DIR" --sequencing_index_cols="$SEQUENCING_INDEX_COLS" +--id_cols "$ID_COLS" +--reverse_index2 "$REVERSE_INDEX2" ) echo Rscript collate_fastq_reads.R "${args[@]}" From 10743f97fb9a2632edcc785cca72a9a3b774efe7 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Tue, 27 Aug 2024 14:24:43 -0400 Subject: [PATCH 027/127] Removed parameters seq_cols and reverse_index2 --- scripts/filter_counts.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/scripts/filter_counts.sh b/scripts/filter_counts.sh index f43447c8..ba6609f7 100644 --- a/scripts/filter_counts.sh +++ b/scripts/filter_counts.sh @@ -78,7 +78,6 @@ echo RAW_COUNTS is: $RAW_COUNTS echo CELL_LINE_META is: $CELL_LINE_META echo CONTROL_BARCODE_META is: $CONTROL_BARCODE_META echo CELL_SET_META is: $CELL_SET_META -echo REVERSE_INDEX2 is: $REVERSE_INDEX2 args=( -c "$RAW_COUNTS" @@ -88,8 +87,6 @@ args=( --cell_set_meta "$CELL_SET_META" --out "$BUILD_DIR" --count_threshold "$COUNT_THRESHOLD" ---sequencing_index_cols "$SEQUENCING_INDEX_COLS" ---reverse_index2 "$REVERSE_INDEX2" --pool_id "$PULL_POOL_ID" --rm_data "$REMOVE_DATA" --assay_pool_meta "$ASSAY_POOL_META" From 
301e0f6341ad24a4a7a3ddd4389c74e2befb22e3 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Tue, 27 Aug 2024 14:25:04 -0400 Subject: [PATCH 028/127] Added two files as inputs --- scripts/filteredCounts_QC.sh | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/scripts/filteredCounts_QC.sh b/scripts/filteredCounts_QC.sh index 2c5799a8..ff2f13e0 100644 --- a/scripts/filteredCounts_QC.sh +++ b/scripts/filteredCounts_QC.sh @@ -72,6 +72,22 @@ else RAW_COUNTS=$BUILD_DIR/$RAW_COUNTS fi +#Enforces abs paths +if [[ "$RAW_COUNTS_UNCOLLAPSED" = /* ]] +then + RAW_COUNTS_UNCOLLAPSED=$(ls $RAW_COUNTS_UNCOLLAPSED) +else + RAW_COUNTS_UNCOLLAPSED=$BUILD_DIR/$RAW_COUNTS_UNCOLLAPSED +fi + +#Enforces abs paths +if [[ "$L2FC" = /* ]] +then + L2FC=$(ls $L2FC) +else + L2FC=$BUILD_DIR/$L2FC +fi + #Enforces abs paths if [[ "$CONTROL_BARCODE_META" = /* ]] then @@ -88,6 +104,8 @@ echo CELL_SET_META is: $CELL_SET_META echo CONTROL_BARCODE_META is: $CONTROL_BARCODE_META echo COUNT_THRESHOLD is: $COUNT_THRESHOLD echo COUNT_COL_NAME is: $COUNT_COL_NAME +echo RAW_COUNTS_UNCOLLAPSED is: $RAW_COUNTS_UNCOLLAPSED +echo L2FC is: $L2FC echo RAW_COUNTS is: $RAW_COUNTS echo REVERSE_INDEX2 is: $REVERSE_INDEX2 @@ -102,7 +120,9 @@ args=( --count_threshold "$COUNT_THRESHOLD" --count_col_name "$COUNT_COL_NAME" --control_type "$CTL_TYPES" +--raw_counts_uncollapsed "$RAW_COUNTS_UNCOLLAPSED" --raw_counts "$RAW_COUNTS" +--l2fc "$L2FC" --id_cols "$ID_COLS" --reverse_index2 "$REVERSE_INDEX2" ) @@ -118,7 +138,9 @@ echo Rscript filteredCounts_QC.R --sample_meta $SAMPLE_META \ --count_col_name $COUNT_COL_NAME \ --reverse_index2 $REVERSE_INDEX2 \ --control_type $CTL_TYPES \ +--raw_counts_uncollapsed $RAW_COUNTS_UNCOLLAPSED \ --raw_counts $RAW_COUNTS \ +--l2fc $L2FC \ --id_cols $ID_COLS Rscript filteredCounts_QC.R "${args[@]}" \ No newline at end of file From 388f40c7658f91ced3966c21ed2ef6a74d009d4e Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 28 Aug 2024 10:45:18 -0400 Subject: [PATCH 029/127] Changed flag to 'lfc' --- scripts/join_metadata.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/join_metadata.R b/scripts/join_metadata.R index 5eff5629..c0d2e98d 100644 --- a/scripts/join_metadata.R +++ b/scripts/join_metadata.R @@ -7,7 +7,7 @@ parser <- ArgumentParser() # specify our desired options parser$add_argument('--sample_meta', default= 'sample_meta.csv', help= 'Sample meta data for the sequencing run.') parser$add_argument("--assay_pool_meta", default="assay_pool_meta.txt", help = "Assay pool metadata") -parser$add_argument('--l2fc', default= 'l2fc.csv', help= 'L2FC data.') # level 4 +parser$add_argument('--lfc', default= 'l2fc.csv', help= 'L2FC data.') # level 4 parser$add_argument('--collapsed_l2fc', default= 'collapsed_l2fc.csv', help= 'Collapsed l2fc data.') # level 5 parser$add_argument('--sig_cols', default= 'cell_set,treatment,dose,dose_unit,day', help= 'Columns that uniquely identify a condition.') @@ -38,8 +38,8 @@ if(file.exists(args$assay_pool_meta)) { } # Add sample meta and assay pool meta to l2fc table ---- -if(file.exists(args$l2fc)) { - l2fc= data.table::fread(args$l2fc, header= T, sep= ',') +if(file.exists(args$lfc)) { + l2fc= data.table::fread(args$lfc, header= T, sep= ',') # Add sample meta columns to l2fc if('bio_rep' %in% colnames(sample_meta) & 'bio_rep' %in% colnames(l2fc)) { From 482fd0b52ce4a2bd9fa03062320d2db635c59124 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 28 Aug 2024 10:45:55 -0400 Subject: [PATCH 030/127] Look for meta joined files --- 
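[editor's note, not applied by git] Patch 030 points seq_to_mts.py at the
metadata-joined level 4/5 files written by join_metadata.R. A minimal R sketch
of the same "prefer the joined file, fall back to the plain one" lookup;
read_build_table() is a hypothetical helper, not part of this series:

read_build_table= function(build_dir, base_name) {
  # Prefer the *_with_meta_columns.csv variant produced by join_metadata.R.
  joined= file.path(build_dir, paste0(base_name, '_with_meta_columns.csv'))
  plain= file.path(build_dir, paste0(base_name, '.csv'))
  target= if(file.exists(joined)) joined else plain
  print(paste('Reading', target))
  data.table::fread(target, header= TRUE, sep= ',')
}
# Example: level_4= read_build_table(build_dir, 'l2fc')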
scripts/seq_to_mts.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/seq_to_mts.py b/scripts/seq_to_mts.py index 64ecd2e7..ba8c307f 100755 --- a/scripts/seq_to_mts.py +++ b/scripts/seq_to_mts.py @@ -74,14 +74,14 @@ def main(args): os.makedirs(args.out) try: - fstr = os.path.join(args.build_path, 'l2fc.csv') + fstr = os.path.join(args.build_path, 'l2fc_with_meta_columns.csv') fmatch = glob.glob(fstr) assert (len(fmatch) == 1) , "Too many files found" print("Reading in data") sample_meta = read_build_file("sample_meta.csv", args) level_3 = read_build_file("normalized_counts.csv", args) - level_4 = read_build_file("l2fc.csv", args) - level_5 = read_build_file("collapsed_l2fc.csv", args) + level_4 = read_build_file("l2fc_with_meta_columns.csv", args) + level_5 = read_build_file("collapsed_l2fc_with_meta_columns.csv", args) except IndexError as err: logger.error(err) From 9aa2b9f4b6a564bc4c178aa86663d135b0b619f6 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 28 Aug 2024 10:46:01 -0400 Subject: [PATCH 031/127] Create join_metadata.sh --- scripts/join_metadata.sh | 52 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 scripts/join_metadata.sh diff --git a/scripts/join_metadata.sh b/scripts/join_metadata.sh new file mode 100644 index 00000000..f2588eef --- /dev/null +++ b/scripts/join_metadata.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +echo Starting metadata join... + +if [ -z "$BUILD_DIR" ] +then + echo BUILD_DIR not specified + exit -1 +fi + +if [ -z "$L2FC" ] +then + echo LFC parameter empty + exit -1 + +fi + +if [ -z "$COLLAPSED_L2FC" ] +then + echo Collapsed l2fc parameter empty + exit -1 +fi + +#Enforces abs paths +if [[ "$LFC" = /* ]] +then + LFC=$(ls $LFC) +else + LFC=$BUILD_DIR/$LFC +fi + +#Enforces abs paths +if [[ "$COLLAPSED_L2FC" = /* ]] +then + COLLAPSED_L2FC=$(ls $COLLAPSED_L2FC) +else + COLLAPSED_L2FC=$BUILD_DIR/$COLLAPSED_L2FC +fi + +echo Build dir is: $BUILD_DIR +echo LFC is: $LFC +echo COLLAPSED_L2FC is: $COLLAPSED_L2FC + +echo Rscript join_metadata.R -c $LFC \ +--collapsed_l2fc $COLLAPSED_L2FC \ +--out $BUILD_DIR \ +--sig_cols $SIG_COLS + +Rscript join_metadata.R -c $LFC \ +--collapsed_l2fc $COLLAPSED_L2FC \ +--out $BUILD_DIR \ +--sig_cols $SIG_COLS \ No newline at end of file From 0200dec8fa090e8c6256f21f0cd3678d5de65780 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 28 Aug 2024 11:04:04 -0400 Subject: [PATCH 032/127] Changed COLLAPSED_L2FC to COLLAPSED_VALUES --- scripts/join_metadata.sh | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/scripts/join_metadata.sh b/scripts/join_metadata.sh index f2588eef..20c2e95e 100644 --- a/scripts/join_metadata.sh +++ b/scripts/join_metadata.sh @@ -15,12 +15,18 @@ then fi -if [ -z "$COLLAPSED_L2FC" ] +if [ -z "$COLLAPSED_VALUES" ] then echo Collapsed l2fc parameter empty exit -1 fi +if [ -z "$ASSAY_POOL_META" ] +then + echo ASSAY_POOL_META parameter empty + exit -1 +fi + #Enforces abs paths if [[ "$LFC" = /* ]] then @@ -30,23 +36,33 @@ else fi #Enforces abs paths -if [[ "$COLLAPSED_L2FC" = /* ]] +if [[ "$COLLAPSED_VALUES" = /* ]] +then + COLLAPSED_VALUES=$(ls $COLLAPSED_VALUES) +else + COLLAPSED_VALUES=$BUILD_DIR/$COLLAPSED_VALUES +fi + +#Enforces abs paths +if [[ "$ASSAY_POOL_META" = /* ]] then - COLLAPSED_L2FC=$(ls $COLLAPSED_L2FC) + ASSAY_POOL_META=$(ls $ASSAY_POOL_META) else - COLLAPSED_L2FC=$BUILD_DIR/$COLLAPSED_L2FC + ASSAY_POOL_META=$BUILD_DIR/$ASSAY_POOL_META fi echo Build dir is: $BUILD_DIR echo LFC is: 
$LFC
-echo COLLAPSED_L2FC is: $COLLAPSED_L2FC
+echo COLLAPSED_VALUES is: $COLLAPSED_VALUES
 
 echo Rscript join_metadata.R -c $LFC \
---collapsed_l2fc $COLLAPSED_L2FC \
+--collapsed_l2fc $COLLAPSED_VALUES \
+--assay_pool_meta $ASSAY_POOL_META \
 --out $BUILD_DIR \
 --sig_cols $SIG_COLS
 
 Rscript join_metadata.R -c $LFC \
---collapsed_l2fc $COLLAPSED_L2FC \
+--collapsed_l2fc $COLLAPSED_VALUES \
+--assay_pool_meta $ASSAY_POOL_META \
 --out $BUILD_DIR \
 --sig_cols $SIG_COLS
\ No newline at end of file

From bf2537b473a9607a3e9691db7e16b4a0afd39e4d Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Wed, 28 Aug 2024 11:04:29 -0400
Subject: [PATCH 033/127] Add join_metadata to run

---
 scripts/make_config_file.groovy | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/scripts/make_config_file.groovy b/scripts/make_config_file.groovy
index 4b3cff60..779c9305 100644
--- a/scripts/make_config_file.groovy
+++ b/scripts/make_config_file.groovy
@@ -14,6 +14,7 @@ pipeline {
     booleanParam(name: 'CBNORMALIZE', defaultValue: true, description: 'Check this to trigger the CBnormalize job.')
     booleanParam(name: 'COMPUTE_LFC', defaultValue: true, description: 'Check this to trigger the compute_l2fc job.')
     booleanParam(name: 'COLLAPSE', defaultValue: true, description: 'Check this to trigger the collapse job.')
+    booleanParam(name: 'JOIN_METADATA', defaultValue: true, description: 'Check this to trigger the join_metadata job.')
     booleanParam(name: 'REMOVE_DATA', defaultValue: false, description: 'Select if there is experimental data that needs to be removed
end of file From 1c4a7882368d1de9e11816d81ad24e26e48f9f60 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 28 Aug 2024 11:14:22 -0400 Subject: [PATCH 035/127] Changed l2fc to lfc --- scripts/filteredCounts_QC.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/filteredCounts_QC.R b/scripts/filteredCounts_QC.R index 0dcf74ab..5f8d2304 100755 --- a/scripts/filteredCounts_QC.R +++ b/scripts/filteredCounts_QC.R @@ -29,7 +29,7 @@ parser$add_argument("--annotated_counts", default= "annotated_counts.csv", help= "path to file containing annotated counts") parser$add_argument("--normalized_counts", default="normalized_counts.csv", help="path to file containing normalized counts") -parser$add_argument("--l2fc", default="l2fc.csv", help= "path to l2fc file") +parser$add_argument("--lfc", default="l2fc.csv", help= "path to l2fc file") parser$add_argument("-s", "--sample_meta", default="sample_meta.csv", help = "Sample metadata") parser$add_argument("--CB_meta", default="/data/CB_meta.csv", help = "control barcode metadata") parser$add_argument("--cell_set_meta", default="cell_set_meta.csv", help = "Cell set metadata") @@ -63,7 +63,7 @@ if(file.exists(args$normalized_counts)) { } else { normalized_counts= NA } -l2fc= data.table::fread(args$l2fc, header= TRUE, sep= ',') +l2fc= data.table::fread(args$lfc, header= TRUE, sep= ',') # Metadata files sample_meta= data.table::fread(args$sample_meta, header= TRUE, sep= ',', data.table= FALSE) From ff1833b06b8699163c8cf9009e938cf8b12b53e0 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 28 Aug 2024 13:22:42 -0400 Subject: [PATCH 036/127] Updated comments changed library back to require, fixed a bug, and numbered images --- scripts/src/QC_images.R | 74 +++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 43 deletions(-) diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R index 2867a2a1..7be92d14 100755 --- a/scripts/src/QC_images.R +++ b/scripts/src/QC_images.R @@ -459,19 +459,22 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, id_cols= c('pcr_plate', 'pcr_well'), sig_cols, control_type= 'negcon', count_threshold= 40, reverse_index2= FALSE, out = NA) { - library(tidyverse) - library(magrittr) - library(reshape2) - library(scales) - library(WGCNA) - - if(is.na(out)) { - out = getwd() - } + # Required packages ---- + require(tidyverse) + require(magrittr) + require(reshape2) + require(scales) + require(WGCNA) # Some preprocessing ---- - skipped_qcs= c() # empty vector to collect potential errors - num_profiles = annotated_counts %>% dplyr::distinct(pick(all_of(id_cols))) %>% nrow() + # Set out directory if none is specified. + if(is.na(out)) {out = getwd()} + + # Create empty vector to collect potential errors. + skipped_qcs= c() + + # Count number of distinct profile to help scale some plots. 
+ num_profiles= annotated_counts %>% dplyr::distinct(pick(all_of(id_cols))) %>% nrow() # Detect control barcodes cb_check= sample_meta %>% @@ -479,31 +482,16 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, !(trt_type %in% c("empty", "", "CB_only")) & !is.na(trt_type)) contains_cbs= ifelse(nrow(cb_check)!= 0, T, F) - # Count number of cell lines in each cell set - num_cls_in_set= cell_set_meta %>% dplyr::filter(cell_set %in% unique(sample_meta$cell_set)) - # Add cell_cets that are 'missing' or are strings of LUAs - if(nrow(num_cls_in_set) != length(unique(sample_meta$cell_set))) { - sets_not_in_meta= sample_meta %>% dplyr::filter(!cell_set %in% cell_set_meta$cell_set) %>% - dplyr::pull(cell_set) %>% unique() %>% sort() - sets_to_add_df= data_frame(cell_set= sets_not_in_meta, members= sets_not_in_meta) - num_cls_in_set= dplyr::bind_rows(num_cls_in_set, sets_to_add_df) - } - num_cls_in_set %<>% dplyr::mutate(expected_num_cl= str_split(members, ';')) %>% - tidyr::unnest(cols= expected_num_cl) %>% dplyr::group_by(cell_set) %>% - dplyr::summarize(expected_num_cl= length(unique(expected_num_cl))) %>% dplyr::ungroup() - - filtered_counts= annotated_counts %>% dplyr::filter(expected) - # + # Pull filtered counts from annotated counts + filtered_counts= annotated_counts %>% dplyr::filter(expected_read) # Sequencing QCs ____________________ ---- - ## Purity metrics ---- - # call this function + ## 1. Purity metrics ---- print('1. Generating QC table ...') create_qc_table(raw_counts_uncollapsed, raw_counts, filtered_counts, value_col= 'n', file_path= paste0(out, '/QC_table.csv')) - # - ## Index count summaries ---- + ## 2. Index count summaries ---- print("2. Generating index counts tables ...") # Check that "IndexBarcode1" and "index_1" columns are present. # If so, calculate index summary and write out. @@ -530,7 +518,7 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, print('Column "index_2" not detected. Skipping index 2 summaries ...', quote= FALSE) } - ## Total counts ---- + ## 3. Total counts ---- print("3. Generating total_counts image ...") potential_error= base::tryCatch({ tc= create_total_counts_barplot(filtered_counts, id_cols, facet_col= 'pcr_plate') @@ -552,7 +540,7 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, } # Assay QCs _________________________ ---- - ## Cell lines recovered ---- + ## 4. Cell lines recovered ---- print("4. Generating cell_lines_present image ...") potential_error= base::tryCatch({ cl_rec= create_recovery_barplot(filtered_counts, id_cols= id_cols, facet_col= 'pcr_plate', @@ -574,10 +562,10 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, skipped_qcs = c(skipped_qcs, potential_error) } - ## Cell line contaminants ---- + ## 5. Cell line contaminants ---- print('5. Generating cell line contaminants ...') potential_error= base::tryCatch({ - contams= annotated_counts %>% dplyr::filter(expected_read==F) %>% + contams= annotated_counts %>% dplyr::filter(expected_read == F) %>% dplyr::mutate(barcode_id= ifelse(is.na(CCLE_name), Name, CCLE_name)) %>% dplyr::group_by(forward_read_cl_barcode, barcode_id) %>% dplyr::summarise(num_wells= n(), median_n=median(n), max_n= max(n)) %>% ungroup() %>% @@ -596,7 +584,7 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, skipped_qcs = c(skipped_qcs, potential_error) } - ## Contaminant reads ---- + ## 6. Contaminant reads ---- print('6. 
Generating contaminant reads ...') potential_error= base::tryCatch({ pcr_locations= c('pcr_plate', 'pcr_well') @@ -666,10 +654,10 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, skipped_qcs = c(skipped_qcs, potential_error) } - ## Cumulative counts by lines in negcons ---- + ## 7. Cumulative counts by lines in negcons ---- print("7. Generating cumulative image ...") potential_error= base::tryCatch({ - cdf_plot= create_cdf_plot(filtered_counts %>% dplyr::filter(trt_type == 'control_type'), + cdf_plot= create_cdf_plot(filtered_counts %>% dplyr::filter(trt_type == control_type), id_cols= id_cols, counts_col= 'n', mark1= 0.5, mark2= 0.95, @@ -692,7 +680,7 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, skipped_qcs = c(skipped_qcs, potential_error) } - ## Control barcode trends ---- + ## 8. Control barcode trends ---- if(contains_cbs & is.data.frame(normalized_counts)) { print("8. Generating control_barcode_trend image") potential_error= base::tryCatch({ @@ -717,7 +705,7 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, print('8. No control barcodes detected. Skipping control_barcode_trend image.') } - ## Sample correlation ----- + ## 9. Sample correlation ----- print("9. Generating sample_cor image ...") potential_error= base::tryCatch({ cor_df= filtered_counts %>% @@ -744,7 +732,7 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, skipped_qcs = c(skipped_qcs, potential_error) } - ## Tech rep correlations ---- + ## 10. Tech rep correlations ---- if(is.data.frame(normalized_counts) & 'tech_rep' %in% colnames(normalized_counts)) { # Check if there are more at least two tech reps unique_tech_reps= na.omit(unique(normalized_counts$tech_rep)) @@ -798,7 +786,7 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, print('10. No technical replicates detected. Skipping tech_reps scatter plot.') } - ## Bio rep correlations ---- + ## 11. Bio rep correlations ---- if('bio_rep' %in% colnames(l2fc)) { unique_bio_reps= na.omit(unique(l2fc$bio_rep)) @@ -830,8 +818,8 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, cor_method= 'pearson') pdf(file=paste(out, "bio_corr_hm.pdf", sep="/"), width=sqrt(num_profiles), height=sqrt(num_profiles)) - #print(bio_corr_hm) - #dev.off() + print(bio_corr_hm) + dev.off() }, error= function(e) { print(e) print('Encountered an error when creating the bio_corr_hm figure. 
Skipping this output ...') From 6e7afdd0803b6f24aabda59423c54936854740ca Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 28 Aug 2024 18:14:44 -0400 Subject: [PATCH 037/127] Fixed some minor bugs --- scripts/src/QC_images.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R index 7be92d14..09b2b381 100755 --- a/scripts/src/QC_images.R +++ b/scripts/src/QC_images.R @@ -152,7 +152,7 @@ create_recovery_barplot= function(filtered_counts, id_cols, facet_col= NA, value dplyr::mutate(detect_type= case_when(.data[[value_col]] == 0 ~ 'Not detected', .data[[value_col]] <= count_threshold ~ 'Low counts', .data[[value_col]] > count_threshold ~ 'Detected')) %>% - dplyr::count(pick(all_of(c(id_cols, facet_col, 'detect_type', 'total_num_cls'))), name= 'num_cls_by_type') %>% + dplyr::count(pick(all_of(na.omit(c(id_cols, facet_col, 'detect_type', 'total_num_cls')))), name= 'num_cls_by_type') %>% tidyr::unite(all_of(id_cols), col= 'sample_id', sep= ':', remove= FALSE, na.rm= FALSE) %>% dplyr::mutate(percent= (num_cls_by_type / total_num_cls) * 100) @@ -258,7 +258,7 @@ create_cdf_plot= function(input_df, id_cols, counts_col= 'n', mark1= 0.5, mark2= # label for AUC #geom_label(aes(x= mark2_loc, y= 0.1, label= paste0('AUC ', round(auc, 3))), hjust= 'inward', color= 'black') + geom_label(. %>% dplyr::filter(!is.na(auc)), mapping= aes(label= paste0('AUC ', round(auc, 3))), - x= 1, y= 0, hjust= 'inward', vjust= 'inward', color= 'black') + + x= 1, y= 0.25, hjust= 'inward', vjust= 'inward', color= 'black') + facet_wrap(~facet_name) + labs(x='% rank of unique reads', y='Cumulative percentage', color= 'CBs') + theme_bw() From de5fdbee1639b05aa066a255f2a2089107863416 Mon Sep 17 00:00:00 2001 From: jdavis3141 Date: Fri, 30 Aug 2024 09:36:21 -0400 Subject: [PATCH 038/127] updated groovy to include join_meta --- scripts/join_metadata.sh | 15 +++++++++++++-- scripts/make_config_file.groovy | 1 + 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/scripts/join_metadata.sh b/scripts/join_metadata.sh index 20c2e95e..e1ef3a56 100644 --- a/scripts/join_metadata.sh +++ b/scripts/join_metadata.sh @@ -51,18 +51,29 @@ else ASSAY_POOL_META=$BUILD_DIR/$ASSAY_POOL_META fi +#Enforces abs paths +if [[ "$SAMPLE_META" = /* ]] +then + SAMPLE_META=$(ls $SAMPLE_META) +else + SAMPLE_META=$BUILD_DIR/$SAMPLE_META +fi + echo Build dir is: $BUILD_DIR echo LFC is: $LFC echo COLLAPSED_VALUES is: $COLLAPSED_VALUES +echo SAMPLE_META is: $SAMPLE_META echo Rscript join_metadata.R -c $LFC \ --collapsed_l2fc $COLLAPSED_VALUES \ --assay_pool_meta $ASSAY_POOL_META \ --out $BUILD_DIR \ ---sig_cols $SIG_COLS +--sig_cols $SIG_COLS \ +--sample_meta $SAMPLE_META Rscript join_metadata.R -c $LFC \ --collapsed_l2fc $COLLAPSED_VALUES \ --assay_pool_meta $ASSAY_POOL_META \ --out $BUILD_DIR \ ---sig_cols $SIG_COLS \ No newline at end of file +--sig_cols $SIG_COLS \ +--sample_meta $SAMPLE_META \ No newline at end of file diff --git a/scripts/make_config_file.groovy b/scripts/make_config_file.groovy index 779c9305..23eaf395 100644 --- a/scripts/make_config_file.groovy +++ b/scripts/make_config_file.groovy @@ -16,6 +16,7 @@ pipeline { booleanParam(name: 'COLLAPSE', defaultValue: true, description: 'Check this to trigger the collapse job.') booleanParam(name: 'JOIN_METADATA', defaultValue: true, description: 'Check this to trigger the join_metadata job.') booleanParam(name: 'REMOVE_DATA', defaultValue: false, description: 'Select if there is experimental data that needs to be removed 
before normalization. TODO: expand on this.') + booleanParam(name: 'JOIN_METADATA', defaultValue: true, description: 'Check this to trigger the join_metadata job.') string(name: 'BUILD_DIR', defaultValue: '/cmap/obelix/pod/prismSeq/', description: 'Output path to deposit build. Format should be /directory/PROJECT_CODE/BUILD_NAME') string(name: 'BUILD_NAME', defaultValue: '', description: 'Build name') string(name: 'SCREEN', defaultValue: '', description: 'Screen name from COMET, necessary if using COMET for sample metadata.') From 5273913a07a4000e37776842899e1781271a2797 Mon Sep 17 00:00:00 2001 From: jdavis3141 Date: Fri, 30 Aug 2024 09:38:00 -0400 Subject: [PATCH 039/127] rm extra param --- scripts/make_config_file.groovy | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/make_config_file.groovy b/scripts/make_config_file.groovy index 23eaf395..089348d3 100644 --- a/scripts/make_config_file.groovy +++ b/scripts/make_config_file.groovy @@ -14,7 +14,6 @@ pipeline { booleanParam(name: 'CBNORMALIZE', defaultValue: true, description: 'Check this to trigger the CBnormalize job.') booleanParam(name: 'COMPUTE_LFC', defaultValue: true, description: 'Check this to trigger the compute_l2fc job.') booleanParam(name: 'COLLAPSE', defaultValue: true, description: 'Check this to trigger the collapse job.') - booleanParam(name: 'JOIN_METADATA', defaultValue: true, description: 'Check this to trigger the join_metadata job.') booleanParam(name: 'REMOVE_DATA', defaultValue: false, description: 'Select if there is experimental data that needs to be removed before normalization. TODO: expand on this.') booleanParam(name: 'JOIN_METADATA', defaultValue: true, description: 'Check this to trigger the join_metadata job.') string(name: 'BUILD_DIR', defaultValue: '/cmap/obelix/pod/prismSeq/', description: 'Output path to deposit build. 
Format should be /directory/PROJECT_CODE/BUILD_NAME') From be9fa66e06c028c91bdbb67de387d3ff8248063c Mon Sep 17 00:00:00 2001 From: jdavis3141 Date: Fri, 30 Aug 2024 09:47:47 -0400 Subject: [PATCH 040/127] get rid of store_true --- scripts/collate_fastq_reads.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/collate_fastq_reads.R b/scripts/collate_fastq_reads.R index 08449d16..b5a03004 100644 --- a/scripts/collate_fastq_reads.R +++ b/scripts/collate_fastq_reads.R @@ -16,7 +16,7 @@ parser$add_argument("--sequencing_index_cols", default= "index_1,index_2", help = "Sequencing columns in the sample meta") parser$add_argument("--id_cols", default= "pcr_plate,pcr_well", help = "Columns that identify a unique PCR well") -parser$add_argument("--reverse_index2", action="store_true", default=FALSE, +parser$add_argument("--reverse_index2", type="logical", default=FALSE, help= "Reverse complement of index 2 for NovaSeq and NextSeq") parser$add_argument("--barcode_col", default= "forward_read_cl_barcode", help= "Name of the column in uncollapsed_raw_counts that contains the barcode sequences.") From 96c763cc602aee2591191d7296caaf446c5e0ba1 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 30 Aug 2024 13:20:51 -0400 Subject: [PATCH 041/127] Fix bug - added flowcell groups --- scripts/src/collate_fastq_reads.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index 26f0063f..3790f950 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -198,8 +198,8 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # Filter for the expected flowcells and summed up the reads over the ID cols. print('Summing up reads ...') raw_counts= uncollapsed_raw_counts %>% - dplyr::semi_join(expected_flowcells, by= c('flowcell_name', 'flowcell_lane')) %>% - dplyr::inner_join(sequencing_map, by= intersect(colnames(.), colnames(sequencing_map)), relationship= 'many-to-one') %>% + dplyr::inner_join(expected_flowcells, by= c('flowcell_name', 'flowcell_lane')) %>% + dplyr::inner_join(sequencing_map, by= sequencing_index_cols, relationship= 'many-to-one') %>% dplyr::group_by(pick(all_of(c(id_cols, barcode_col)))) %>% dplyr::summarize(n= sum(n)) %>% dplyr::ungroup() From 71d188dd8f363c29d361da6f1185ba7085451a4c Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 30 Aug 2024 13:59:27 -0400 Subject: [PATCH 042/127] Added id_cols as parameter --- scripts/filter_counts.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/filter_counts.sh b/scripts/filter_counts.sh index ba6609f7..a2423385 100644 --- a/scripts/filter_counts.sh +++ b/scripts/filter_counts.sh @@ -78,6 +78,7 @@ echo RAW_COUNTS is: $RAW_COUNTS echo CELL_LINE_META is: $CELL_LINE_META echo CONTROL_BARCODE_META is: $CONTROL_BARCODE_META echo CELL_SET_META is: $CELL_SET_META +echo ID_COLS is: $ID_COLS args=( -c "$RAW_COUNTS" @@ -85,6 +86,7 @@ args=( --cell_line_meta "$CELL_LINE_META" --CB_meta "$CONTROL_BARCODE_META" --cell_set_meta "$CELL_SET_META" +--id_cols "$ID_COLS" --out "$BUILD_DIR" --count_threshold "$COUNT_THRESHOLD" --pool_id "$PULL_POOL_ID" From 5fc37fa37a37acc11b2bf1e0ce95183d4e17743d Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 30 Aug 2024 14:07:57 -0400 Subject: [PATCH 043/127] Fixed "conflicting option strings" --- scripts/filter_counts.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/filter_counts.R b/scripts/filter_counts.R index 6fabf610..d46d7bcc 
100755 --- a/scripts/filter_counts.R +++ b/scripts/filter_counts.R @@ -20,7 +20,7 @@ parser$add_argument("-q", "--quietly", action="store_false", parser$add_argument("--wkdir", default=getwd(), help="Working directory") parser$add_argument("-c", "--raw_counts", default="raw_counts.csv", help = "path to file containing raw counts") parser$add_argument("-o", "--out", default="", help = "Output path. Default is working directory") -parser$add_argument("-s", "--sample_meta", default="sample_meta.csv", help= "Sample metadata") +parser$add_argument("--sample_meta", default="sample_meta.csv", help= "Sample metadata") parser$add_argument("--cell_line_meta", default="cell_line_meta.csv", help= "Cell Line metadata") parser$add_argument("--cell_set_meta", default="cell_set_meta.csv", help= "Cell set metadata") parser$add_argument("--assay_pool_meta", default="assay_pool_meta.txt", help = "Assay pool metadata") From a2354ae1d403a7c14c90275e6d49ca891e8c223b Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 30 Aug 2024 14:13:54 -0400 Subject: [PATCH 044/127] Undo previous change --- scripts/filter_counts.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/filter_counts.R b/scripts/filter_counts.R index d46d7bcc..6fabf610 100755 --- a/scripts/filter_counts.R +++ b/scripts/filter_counts.R @@ -20,7 +20,7 @@ parser$add_argument("-q", "--quietly", action="store_false", parser$add_argument("--wkdir", default=getwd(), help="Working directory") parser$add_argument("-c", "--raw_counts", default="raw_counts.csv", help = "path to file containing raw counts") parser$add_argument("-o", "--out", default="", help = "Output path. Default is working directory") -parser$add_argument("--sample_meta", default="sample_meta.csv", help= "Sample metadata") +parser$add_argument("-s", "--sample_meta", default="sample_meta.csv", help= "Sample metadata") parser$add_argument("--cell_line_meta", default="cell_line_meta.csv", help= "Cell Line metadata") parser$add_argument("--cell_set_meta", default="cell_set_meta.csv", help= "Cell set metadata") parser$add_argument("--assay_pool_meta", default="assay_pool_meta.txt", help = "Assay pool metadata") From 800491ec307bc38c890f11a14a9f5bdd44a0f7fa Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 30 Aug 2024 14:14:02 -0400 Subject: [PATCH 045/127] Remove duplicate --- scripts/filteredCounts_QC.R | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/filteredCounts_QC.R b/scripts/filteredCounts_QC.R index 5f8d2304..12e7bdc3 100755 --- a/scripts/filteredCounts_QC.R +++ b/scripts/filteredCounts_QC.R @@ -30,7 +30,6 @@ parser$add_argument("--annotated_counts", default= "annotated_counts.csv", parser$add_argument("--normalized_counts", default="normalized_counts.csv", help="path to file containing normalized counts") parser$add_argument("--lfc", default="l2fc.csv", help= "path to l2fc file") -parser$add_argument("-s", "--sample_meta", default="sample_meta.csv", help = "Sample metadata") parser$add_argument("--CB_meta", default="/data/CB_meta.csv", help = "control barcode metadata") parser$add_argument("--cell_set_meta", default="cell_set_meta.csv", help = "Cell set metadata") parser$add_argument("--cell_line_cols", default= 'DepMap_ID,CCLE_name', From 223d6957034efee677b973afd10575468c58e1fc Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 30 Aug 2024 14:17:05 -0400 Subject: [PATCH 046/127] Moved QC module towards end --- scripts/make_config_file.groovy | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/make_config_file.groovy 
b/scripts/make_config_file.groovy index 089348d3..5ddce157 100644 --- a/scripts/make_config_file.groovy +++ b/scripts/make_config_file.groovy @@ -10,11 +10,11 @@ pipeline { booleanParam(name: 'CREATE_CELLDB_METADATA', defaultValue: true, description: 'Check this to trigger the create_celldb_metadata job.') booleanParam(name: 'COLLATE_FASTQ_READS', defaultValue: true, description: 'Check this to trigger the collate_fastq_reads job.') booleanParam(name: 'FILTER_COUNTS', defaultValue: true, description: 'Check this to trigger the filter_counts job.') - booleanParam(name: 'FILTER_COUNTS_QC', defaultValue: true, description: 'Check this to trigger the filteredCounts_QC job.') booleanParam(name: 'CBNORMALIZE', defaultValue: true, description: 'Check this to trigger the CBnormalize job.') booleanParam(name: 'COMPUTE_LFC', defaultValue: true, description: 'Check this to trigger the compute_l2fc job.') booleanParam(name: 'COLLAPSE', defaultValue: true, description: 'Check this to trigger the collapse job.') booleanParam(name: 'REMOVE_DATA', defaultValue: false, description: 'Select if there is experimental data that needs to be removed before normalization. TODO: expand on this.') + booleanParam(name: 'FILTER_COUNTS_QC', defaultValue: true, description: 'Check this to trigger the filteredCounts_QC job.') booleanParam(name: 'JOIN_METADATA', defaultValue: true, description: 'Check this to trigger the join_metadata job.') string(name: 'BUILD_DIR', defaultValue: '/cmap/obelix/pod/prismSeq/', description: 'Output path to deposit build. Format should be /directory/PROJECT_CODE/BUILD_NAME') string(name: 'BUILD_NAME', defaultValue: '', description: 'Build name') @@ -189,9 +189,6 @@ pipeline { if (params.FILTER_COUNTS) { scriptsToRun.add('filter_counts.sh') } - if (params.FILTER_COUNTS_QC) { - scriptsToRun.add('filteredCounts_QC.sh') - } if (params.CBNORMALIZE) { scriptsToRun.add('CBnormalize.sh') } @@ -201,6 +198,9 @@ pipeline { if (params.COLLAPSE) { scriptsToRun.add('collapse_replicates.sh') } + if (params.FILTER_COUNTS_QC) { + scriptsToRun.add('filteredCounts_QC.sh') + } if (params.JOIN_METADATA) { scriptsToRun.add('join_metadata.sh') } From d41dd82689e04df25a8931da247f52a3e3562e16 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 30 Aug 2024 14:22:48 -0400 Subject: [PATCH 047/127] Removed count_col_name --- scripts/filteredCounts_QC.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/scripts/filteredCounts_QC.sh b/scripts/filteredCounts_QC.sh index 67debcb8..460d7b38 100644 --- a/scripts/filteredCounts_QC.sh +++ b/scripts/filteredCounts_QC.sh @@ -103,7 +103,6 @@ echo NORMALIZED_COUNTS is: $NORMALIZED_COUNTS echo CELL_SET_META is: $CELL_SET_META echo CONTROL_BARCODE_META is: $CONTROL_BARCODE_META echo COUNT_THRESHOLD is: $COUNT_THRESHOLD -echo COUNT_COL_NAME is: $COUNT_COL_NAME echo RAW_COUNTS_UNCOLLAPSED is: $RAW_COUNTS_UNCOLLAPSED echo LFC is: $LFC echo RAW_COUNTS is: $RAW_COUNTS @@ -118,7 +117,6 @@ args=( --CB_meta "$CONTROL_BARCODE_META" --out "$BUILD_DIR" --count_threshold "$COUNT_THRESHOLD" ---count_col_name "$COUNT_COL_NAME" --control_type "$CTL_TYPES" --raw_counts_uncollapsed "$RAW_COUNTS_UNCOLLAPSED" --raw_counts "$RAW_COUNTS" @@ -135,7 +133,6 @@ echo Rscript filteredCounts_QC.R --sample_meta $SAMPLE_META \ --sig_cols $SIG_COLS \ --out $BUILD_DIR \ --count_threshold $COUNT_THRESHOLD \ ---count_col_name $COUNT_COL_NAME \ --reverse_index2 $REVERSE_INDEX2 \ --control_type $CTL_TYPES \ --raw_counts_uncollapsed $RAW_COUNTS_UNCOLLAPSED \ From 33797c51c8a75cdd2e469b61b7f239618d80b0da Mon Sep 17 
00:00:00 2001 From: YuhJong Liu Date: Fri, 30 Aug 2024 14:43:17 -0400 Subject: [PATCH 048/127] Added raw_counts_uncollapsed as a parameter --- scripts/make_config_file.groovy | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/make_config_file.groovy b/scripts/make_config_file.groovy index 5ddce157..334e0b4d 100644 --- a/scripts/make_config_file.groovy +++ b/scripts/make_config_file.groovy @@ -41,6 +41,7 @@ pipeline { string(name: 'COUNT_THRESHOLD', defaultValue: '40', description: 'Minimum threshold to filter cell line counts by.') string(name: 'PSEUDOCOUNT', defaultValue: '20', description: 'Pseudocount for normalization.') string(name: 'CELL_LINE_META', defaultValue: 'cell_line_meta.csv', description: 'File in BUILD_DIR containing cell line metadata') + string(name: 'RAW_COUNTS_UNCOLLAPSED', defaultValue: 'raw_counts_uncollapsed.csv', description: 'Filename in BUILD_DIR containing nori output') string(name: 'RAW_COUNTS', defaultValue: 'raw_counts.csv', description: 'Filename in BUILD_DIR containing raw counts') string(name: 'FILTERED_COUNTS', defaultValue: 'filtered_counts.csv', description: 'File in BUILD_DIR containing filtered counts') string(name: 'LFC', defaultValue: 'l2fc.csv', description: 'File containing log2 fold change values') @@ -105,7 +106,8 @@ pipeline { 'RUN_NORM', 'CONTROL_COLS', 'COUNT_THRESHOLD', 'COUNT_COL_NAME', 'BUILD_NAME', 'CONVERT_SUSHI', 'PULL_POOL_ID', 'RUN_EPS_QC', 'PSEUDOCOUNT', 'REMOVE_DATA', 'DAYS', 'SEQUENCING_INDEX_COLS', 'RAW_COUNTS', 'CELL_SET_META', 'CELL_LINE_META', 'FILTERED_COUNTS', 'LFC', 'COUNTS', 'ANNOTATED_COUNTS', - 'COLLAPSED_VALUES', 'NORMALIZED_COUNTS', 'API_URL', 'FILTER_COUNTS_QC', 'ASSAY_POOL_META', 'SCREEN' + 'COLLAPSED_VALUES', 'NORMALIZED_COUNTS', 'API_URL', 'FILTER_COUNTS_QC', 'ASSAY_POOL_META', 'SCREEN', + 'RAW_COUNTS_UNCOLLAPSED' ] def config = [:] From 10a950b32d5f0b785e1343449efe7c032df7a529 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 30 Aug 2024 15:25:42 -0400 Subject: [PATCH 049/127] Allow module to run from non-nori outputs --- scripts/collate_fastq_reads.R | 77 ++++++++++++++++------------------- 1 file changed, 36 insertions(+), 41 deletions(-) diff --git a/scripts/collate_fastq_reads.R b/scripts/collate_fastq_reads.R index b5a03004..9da53c38 100644 --- a/scripts/collate_fastq_reads.R +++ b/scripts/collate_fastq_reads.R @@ -9,7 +9,7 @@ parser <- ArgumentParser() # specify desired options parser$add_argument("-v", "--verbose", action="store_true", default=TRUE, help="Print extra output [default]") parser$add_argument("-q", "--quietly", action="store_false", dest="verbose", help="Print little output") -parser$add_argument("-c", "--uncollapsed_raw_counts", default="raw_counts_uncollapsed.csv", +parser$add_argument('--raw_counts_uncollapsed', default="raw_counts_uncollapsed.csv", help="path to file containing uncollapsed raw counts file") parser$add_argument("--sample_meta", default="sample_meta.csv", help = "Sample metadata") parser$add_argument("--sequencing_index_cols", default= "index_1,index_2", @@ -30,45 +30,40 @@ if (args$out == "") { args$out = args$wkdir } -# Run collate_fastq_reads if uncollapsed file exists ---- -expected_file_path <- paste(args$out, "raw_counts_uncollapsed.csv", sep='/') +# Run collate_fastq_reads ---- +# Read in files and parse vector arguments +raw_counts_uncollapsed= data.table::fread(args$raw_counts_uncollapsed, header= T, sep= ',') +sample_meta= data.table::fread(args$sample_meta, header= T, sep= ',') -if(file.exists(expected_file_path)) { - # Read in files and 
parse vector arguments - uncollapsed_raw_counts= data.table::fread(expected_file_path, header= T, sep= ',', data.table= F) - sample_meta= data.table::fread(args$sample_meta, header= T, sep= ',', data.table= F) - - # Parse vector inputs - sequencing_index_cols= unlist(strsplit(args$sequencing_index_cols, ",")) - id_cols= unlist(strsplit(args$id_cols, ",")) - - # Validation: Check that sequencing_index_cols are from sample meta column names - if(!all(sequencing_index_cols %in% colnames(sample_meta))) { - stop(paste('The following sequencing_index_cols were not found in the sample meta: ', - sequencing_index_cols[!sequencing_index_cols %in% colnames(sample_meta)])) - } - - # Validation: Check that id_cols are from sample meta column names - if(!all(id_cols %in% colnames(sample_meta))) { - stop(paste('The following id_cols were not found in the sample meta: ', - id_cols[!id_cols %in% colnames(sample_meta)])) - } - - print("Collating fastq reads ...") - raw_counts= collate_fastq_reads(uncollapsed_raw_counts, sample_meta, - sequencing_index_cols= sequencing_index_cols, - id_cols= id_cols, - reverse_index2= args$reverse_index2, - barcode_col= args$barcode_col) - - # Validation: Basic file size check - if(nrow(raw_counts) == 0) { - stop('ERROR: Empty file generated. No rows in raw_counts output.') - } - - rc_out_file= paste(args$out, 'raw_counts.csv', sep='/') - print(paste("Writing raw_counts.csv to ", rc_out_file)) - write.csv(raw_counts, rc_out_file, row.names= FALSE, quote= FALSE) -} else { - print("Uncollapsed raw counts file not detected. Proceeding with generating filtered counts file.") +# Parse vector inputs +sequencing_index_cols= unlist(strsplit(args$sequencing_index_cols, ",")) +id_cols= unlist(strsplit(args$id_cols, ",")) + +# Validation: Check that sequencing_index_cols are from sample meta column names +if(!all(sequencing_index_cols %in% colnames(sample_meta))) { + stop(paste('The following sequencing_index_cols were not found in the sample meta: ', + sequencing_index_cols[!sequencing_index_cols %in% colnames(sample_meta)])) +} + +# Validation: Check that id_cols are from sample meta column names +if(!all(id_cols %in% colnames(sample_meta))) { + stop(paste('The following id_cols were not found in the sample meta: ', + id_cols[!id_cols %in% colnames(sample_meta)])) } + +print("Collating fastq reads ...") +raw_counts= collate_fastq_reads(uncollapsed_raw_counts= raw_counts_uncollapsed, + sample_meta= sample_meta, + sequencing_index_cols= sequencing_index_cols, + id_cols= id_cols, + reverse_index2= args$reverse_index2, + barcode_col= args$barcode_col) + +# Validation: Basic file size check +if(nrow(raw_counts) == 0) { + stop('ERROR: Empty file generated. 
No rows in raw_counts output.') +} + +rc_out_file= paste(args$out, 'raw_counts.csv', sep='/') +print(paste("Writing raw_counts.csv to ", rc_out_file)) +write.csv(raw_counts, rc_out_file, row.names= FALSE, quote= FALSE) From c3924ed2377faf304035c3823f6af90fc53f5e73 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 30 Aug 2024 15:26:13 -0400 Subject: [PATCH 050/127] Allow module to run from non-nori outputs --- scripts/src/collate_fastq_reads.R | 93 +++++++++++++++++-------------- 1 file changed, 52 insertions(+), 41 deletions(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index 3790f950..eff9ebe7 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -141,45 +141,57 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, stop('One or more sequencing_index_cols in the sample meta is not filled out.') } - # Determine which flowcell names + lanes are expected ---- - # "flowcell_names" and "flowcell_lanes" are strings that can contain more than one item. - # Columns can be parsed by splitting on the chars , ; : - # If there are multiple lane names and lane numbers, this uses the Cartesian product! - # Note: fread and read.csv keeps commas, read_csv DROPS commas - expected_flowcells= sample_meta %>% dplyr::distinct(flowcell_names, flowcell_lanes) %>% - dplyr::mutate(flowcell_name= base::strsplit(flowcell_names, split='[,;:]', fixed=F), - flowcell_lane= base::strsplit(flowcell_lanes, split='[,;:]', fixed=F)) %>% - tidyr::unnest(cols= flowcell_name) %>% tidyr::unnest(cols= flowcell_lane) %>% - dplyr::mutate(flowcell_lane= as.numeric(flowcell_lane)) - - # Print out expected flowcells from the sample meta. - print(paste0('Identified ', nrow(expected_flowcells), ' unique flowcell + lane combos in the sample meta ...')) - print(expected_flowcells) - - # Print warning if there are multiple flowcell names with multiple flowcell lanes. - multi_name_and_lanes= expected_flowcells %>% dplyr::filter(grepl(',:;', flowcell_names) & grepl(',:;', flowcell_names)) - if(nrow(multi_name_and_lanes) > 0) { - print('WARNING: Detected sample(s) sequenced over multiple flowcells and flowcell lanes.') - print('The function assumes that the same lanes were used for both flowcells.') - } - - # Validation: Check that all expected flowcell name + lanes are detected ---- - # Check that all expected flowcell name + lanes are present in uncollapsed raw counts. 
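# [editor's aside, not part of the patch] The expected_flowcells table built
# above splits the multi-valued flowcell_names/flowcell_lanes strings on the
# characters , ; : and crosses the pieces. A minimal sketch of that Cartesian
# expansion on toy data:
library(dplyr); library(tidyr)
meta= data.frame(flowcell_names= 'FC1;FC2', flowcell_lanes= '1,2')
meta %>%
  mutate(flowcell_name= strsplit(flowcell_names, split= '[,;:]'),
         flowcell_lane= strsplit(flowcell_lanes, split= '[,;:]')) %>%
  unnest(cols= flowcell_name) %>% unnest(cols= flowcell_lane)
# yields four rows: FC1/1, FC1/2, FC2/1, FC2/2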
- detected_flowcells= uncollapsed_raw_counts %>% dplyr::distinct(flowcell_name, flowcell_lane) - print(paste0('Identified ', nrow(detected_flowcells), ' unique flowcell + lane combos in the uncollapsed raw counts ...')) - print(detected_flowcells) - validate_detected_flowcells(detected_flowcells, expected_flowcells) - - # Validation: Check that sequencing_index_cols uniquely identify rows of sample meta ---- - if(!validate_unique_samples(sequencing_index_cols, sample_meta)) { - print('There may be multiple entries in the sample meta that have the same combination of sequencing index columns.') - stop('The specified sequencing index columns do NOT uniquely identify every PCR well.') - } - - # Validation: Check that id_cols uniquely identify rows of sample meta ---- - if(!validate_unique_samples(id_cols, sample_meta)) { - print('There may be multiple entries in the sample meta that have the same combination of ID columns.') - stop('The specified ID columns do NOT uniquely identify every PCR well.') + # If "flowcell_name" and "flowcell_lane" are present filter for valid flowcells ---- + # Note: Can this switch be tied to the sequencer type? + if(all_of(c('flowcell_name', 'flowcell_lane') %in% colnames(uncollapsed_raw_counts))) { + # Determine which flowcell names + lanes are expected ---- + # "flowcell_names" and "flowcell_lanes" are strings that can contain more than one item. + # Columns can be parsed by splitting on the chars , ; : + # If there are multiple lane names and lane numbers, this uses the Cartesian product! + # Note: fread and read.csv keeps commas, read_csv DROPS commas + expected_flowcells= sample_meta %>% dplyr::distinct(flowcell_names, flowcell_lanes) %>% + dplyr::mutate(flowcell_name= base::strsplit(flowcell_names, split='[,;:]', fixed=F), + flowcell_lane= base::strsplit(flowcell_lanes, split='[,;:]', fixed=F)) %>% + tidyr::unnest(cols= flowcell_name) %>% tidyr::unnest(cols= flowcell_lane) %>% + dplyr::mutate(flowcell_lane= as.numeric(flowcell_lane)) + + # Print out expected flowcells from the sample meta. + print(paste0('Identified ', nrow(expected_flowcells), ' unique flowcell + lane combos in the sample meta ...')) + print(expected_flowcells) + + # Print warning if there are multiple flowcell names with multiple flowcell lanes. + multi_name_and_lanes= expected_flowcells %>% dplyr::filter(grepl(',:;', flowcell_names) & grepl(',:;', flowcell_names)) + if(nrow(multi_name_and_lanes) > 0) { + print('WARNING: Detected sample(s) sequenced over multiple flowcells and flowcell lanes.') + print('The function assumes that the same lanes were used for both flowcells.') + } + + # Validation: Check that all expected flowcell name + lanes are detected ---- + # Check that all expected flowcell name + lanes are present in uncollapsed raw counts. 
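# [editor's aside, not part of the patch] The inner_join() filter added below
# follows patch 041, which swapped semi_join() for inner_join(). The semantic
# difference: semi_join() only filters the left table, while inner_join() also
# merges columns and duplicates rows when a key matches more than once. A
# minimal sketch:
library(dplyr)
reads= data.frame(flowcell_name= c('FC1', 'FC1', 'FC9'), flowcell_lane= c(1, 2, 1), n= c(10, 20, 30))
expected= data.frame(flowcell_name= 'FC1', flowcell_lane= c(1, 2))
semi_join(reads, expected, by= c('flowcell_name', 'flowcell_lane'))  # 2 rows, columns of reads only
inner_join(reads, expected, by= c('flowcell_name', 'flowcell_lane')) # 2 rows, plus any columns unique to expected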
+ detected_flowcells= uncollapsed_raw_counts %>% dplyr::distinct(flowcell_name, flowcell_lane) + print(paste0('Identified ', nrow(detected_flowcells), ' unique flowcell + lane combos in the uncollapsed raw counts ...')) + print(detected_flowcells) + validate_detected_flowcells(detected_flowcells, expected_flowcells) + + # Validation: Check that sequencing_index_cols uniquely identify rows of sample meta ---- + if(!validate_unique_samples(sequencing_index_cols, sample_meta)) { + print('There may be multiple entries in the sample meta that have the same combination of sequencing index columns.') + stop('The specified sequencing index columns do NOT uniquely identify every PCR well.') + } + + # Validation: Check that id_cols uniquely identify rows of sample meta ---- + if(!validate_unique_samples(id_cols, sample_meta)) { + print('There may be multiple entries in the sample meta that have the same combination of ID columns.') + stop('The specified ID columns do NOT uniquely identify every PCR well.') + } + + # Filter for expected flowcells ---- + uncollapsed_raw_counts= uncollapsed_raw_counts %>% + dplyr::inner_join(expected_flowcells, by= c('flowcell_name', 'flowcell_lane')) + + } else { + print('Flowcell_name and/or flowcell_lane were not detected in raw_counts_uncollapsed.') + print('Proceeding without flowcell filters ...') } # Create sequence map ---- @@ -197,8 +209,7 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # Create raw counts file ---- # Filter for the expected flowcells and summed up the reads over the ID cols. print('Summing up reads ...') - raw_counts= uncollapsed_raw_counts %>% - dplyr::inner_join(expected_flowcells, by= c('flowcell_name', 'flowcell_lane')) %>% + raw_counts= uncollapsed_raw_counts %>% dplyr::inner_join(sequencing_map, by= sequencing_index_cols, relationship= 'many-to-one') %>% dplyr::group_by(pick(all_of(c(id_cols, barcode_col)))) %>% dplyr::summarize(n= sum(n)) %>% dplyr::ungroup() From 18c8e91015265ca960b04c14832f04a1613a257c Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 30 Aug 2024 15:36:04 -0400 Subject: [PATCH 051/127] Added raw_counts_uncollapsed as param --- scripts/collate_fastq_reads.sh | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/scripts/collate_fastq_reads.sh b/scripts/collate_fastq_reads.sh index d33eedfa..7fe46214 100644 --- a/scripts/collate_fastq_reads.sh +++ b/scripts/collate_fastq_reads.sh @@ -63,12 +63,20 @@ then exit -1 fi +#Enforces abs paths +if [[ "$RAW_COUNTS_UNCOLLAPSED" = /* ]] +then + RAW_COUNTS_UNCOLLAPSED=$(ls $RAW_COUNTS_UNCOLLAPSED) +else + RAW_COUNTS_UNCOLLAPSED=$BUILD_DIR/$RAW_COUNTS_UNCOLLAPSED +fi + #Enforces abs paths if [[ "$SAMPLE_META" = /* ]] then - SAMPLE_META=$(ls $SAMPLE_META) + SAMPLE_META=$(ls $SAMPLE_META) else - SAMPLE_META=$BUILD_DIR/$SAMPLE_META + SAMPLE_META=$BUILD_DIR/$SAMPLE_META fi echo Build dir is: $BUILD_DIR @@ -80,6 +88,7 @@ echo Project Code: $PROJECT_CODE echo REVERSE_INDEX2 is: $REVERSE_INDEX2 args=( +--raw_counts_uncollapsed "$RAW_COUNTS_UNCOLLAPSED" --sample_meta "$SAMPLE_META" --out "$BUILD_DIR" --sequencing_index_cols="$SEQUENCING_INDEX_COLS" From 270829266f138af07d1a4de5816d01f480072a5b Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 30 Aug 2024 15:36:24 -0400 Subject: [PATCH 052/127] Added raw_counts_uncollapsed as param --- scripts/launch_job.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/launch_job.sh b/scripts/launch_job.sh index d2bbfd0c..6b8cf25d 100644 --- a/scripts/launch_job.sh +++ 
b/scripts/launch_job.sh @@ -29,6 +29,7 @@ PARAMS=( CONVERT_SUSHI PULL_POOL_ID RUN_EPS_QC PSEUDOCOUNT REMOVE_DATA DAYS SEQUENCING_INDEX_COLS RAW_COUNTS CELL_SET_META CELL_LINE_META FILTERED_COUNTS LFC COUNTS ANNOTATED_COUNTS COLLAPSED_VALUES NORMALIZED_COUNTS ASSAY_POOL_META + RAW_COUNTS_UNCOLLAPSED ) # Load parameters @@ -104,6 +105,7 @@ echo "Running in container:" -e COLLAPSED_VALUES="$COLLAPSED_VALUES" \ -e NORMALIZED_COUNTS="$NORMALIZED_COUNTS" \ -e ASSAY_POOL_META="$ASSAY_POOL_META" \ + -e RAW_COUNTS_UNCOLLAPSED="$RAW_COUNTS_UNCOLLAPSED"\ -v "$WORKSPACE:/workspace" \ -v /local/jenkins/.clue_api_key:/local/jenkins/.clue_api_key \ -v /cmap/data/vdb/prismSeq:/data \ From 6b3e8b24e8417a9952a296c2f5084bc24d706237 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 30 Aug 2024 15:42:11 -0400 Subject: [PATCH 053/127] Fixed boolean return --- scripts/src/collate_fastq_reads.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index eff9ebe7..b476feda 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -143,7 +143,7 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # If "flowcell_name" and "flowcell_lane" are present filter for valid flowcells ---- # Note: Can this switch be tied to the sequencer type? - if(all_of(c('flowcell_name', 'flowcell_lane') %in% colnames(uncollapsed_raw_counts))) { + if(base::all(c('flowcell_name', 'flowcell_lane') %in% colnames(uncollapsed_raw_counts))) { # Determine which flowcell names + lanes are expected ---- # "flowcell_names" and "flowcell_lanes" are strings that can contain more than one item. # Columns can be parsed by splitting on the chars , ; : From bb9ac63012861451d6def90759347c170fe45093 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 30 Aug 2024 16:01:41 -0400 Subject: [PATCH 054/127] Changed type to logical --- scripts/filteredCounts_QC.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/filteredCounts_QC.R b/scripts/filteredCounts_QC.R index 12e7bdc3..bacfca23 100755 --- a/scripts/filteredCounts_QC.R +++ b/scripts/filteredCounts_QC.R @@ -41,7 +41,8 @@ parser$add_argument("--sig_cols", default="cell_set,treatment,dose,dose_unit,day parser$add_argument("--control_type", default = "negcon", help= "how negative control wells are distinguished in the trt_type column") parser$add_argument("--count_threshold", default=40, help= "Low counts threshold") -parser$add_argument("--reverse_index2", default=FALSE, help = "Reverse index 2") +parser$add_argument("--reverse_index2", type="logical", default=FALSE, + help= "Reverse complement of index 2 for NovaSeq and NextSeq") parser$add_argument("-o","--out", default="", help = "Output path. 
Default is working directory") # get command line options, if help option encountered print help and exit From 7d8dda3f3c116d9d19987a51488c131ae73afadd Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 30 Aug 2024 16:14:41 -0400 Subject: [PATCH 055/127] Reference correct parameter --- scripts/join_metadata.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/join_metadata.sh b/scripts/join_metadata.sh index e1ef3a56..d13f1698 100644 --- a/scripts/join_metadata.sh +++ b/scripts/join_metadata.sh @@ -8,7 +8,7 @@ then exit -1 fi -if [ -z "$L2FC" ] +if [ -z "$LFC" ] then echo LFC parameter empty exit -1 @@ -64,14 +64,14 @@ echo LFC is: $LFC echo COLLAPSED_VALUES is: $COLLAPSED_VALUES echo SAMPLE_META is: $SAMPLE_META -echo Rscript join_metadata.R -c $LFC \ +echo Rscript join_metadata.R --lfc $LFC \ --collapsed_l2fc $COLLAPSED_VALUES \ --assay_pool_meta $ASSAY_POOL_META \ --out $BUILD_DIR \ --sig_cols $SIG_COLS \ --sample_meta $SAMPLE_META -Rscript join_metadata.R -c $LFC \ +Rscript join_metadata.R --lfc $LFC \ --collapsed_l2fc $COLLAPSED_VALUES \ --assay_pool_meta $ASSAY_POOL_META \ --out $BUILD_DIR \ From ba3cff8d7f8640b8ae64f573f3126d1a21d58781 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 30 Aug 2024 16:36:54 -0400 Subject: [PATCH 056/127] Added more prints and fixed a typo --- scripts/join_metadata.R | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/join_metadata.R b/scripts/join_metadata.R index c0d2e98d..861f8024 100644 --- a/scripts/join_metadata.R +++ b/scripts/join_metadata.R @@ -39,6 +39,7 @@ if(file.exists(args$assay_pool_meta)) { # Add sample meta and assay pool meta to l2fc table ---- if(file.exists(args$lfc)) { + print('Attempting to add sample_meta to l2fc file.') l2fc= data.table::fread(args$lfc, header= T, sep= ',') # Add sample meta columns to l2fc @@ -51,6 +52,7 @@ if(file.exists(args$lfc)) { l2fc_with_meta_columns= join_metadata(input_df= l2fc, metadata= sample_meta, key_cols= input_cols) # Add assay pool meta columns to l2fc + print('Attempting to add assay_pool_meta to l2fc file.') if(assay_pool_meta_exists) { l2fc_with_meta_columns= join_metadata(input_df= l2fc_with_meta_columns, metadata= input_assay_pool_meta, @@ -69,6 +71,7 @@ if(file.exists(args$lfc)) { # Add sample meta and assay pool meta to collapsed_l2fc table ---- if(file.exists(args$collapsed_l2fc)) { + print('Attempting to add sample_meta to collapsed l2fc.') collapsed_l2fc= data.table::fread(args$collapsed_l2fc, header= T, sep= ',') # Add sample meta columns to collapsed l2fc @@ -77,6 +80,7 @@ if(file.exists(args$collapsed_l2fc)) { # Add assay pool meta columns to collapsed l2fc if(assay_pool_meta_exists) { + print('Attempting to add assay_pool_meta to collapsed l2fc.') collapsed_l2fc_with_meta_columns= join_metadata(input_df= collapsed_l2fc_with_meta_columns, metadata= input_assay_pool_meta, key_cols= c('DepMap_ID', 'CCLE_name', 'cell_set')) @@ -87,7 +91,7 @@ if(file.exists(args$collapsed_l2fc)) { # Write out outpath= paste(args$out, 'collapsed_l2fc_with_meta_columns.csv', sep='/') print(paste("Writing collapsed_l2fc_with_meta_columns.csv to ", outpath)) - write.csv(collapsed_l2fc_with_meta_columns.csv, outpath, row.names= FALSE, quote= FALSE) + write.csv(collapsed_l2fc_with_meta_columns, outpath, row.names= FALSE, quote= FALSE) } else { print('WARNING: collapsed_l2fc.csv does not exist. 
Skipping this file.') } From 352debf7462800877409ab1c7f80dee3f4b62c6b Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 30 Aug 2024 17:54:20 -0400 Subject: [PATCH 057/127] Updated comments and prints --- scripts/collate_fastq_reads.R | 2 +- scripts/filter_counts.R | 20 ++++++++--------- scripts/filteredCounts_QC.R | 37 ++++++++----------------------- scripts/join_metadata.R | 4 ++-- scripts/src/QC_images.R | 26 ++++++++++++++-------- scripts/src/collate_fastq_reads.R | 9 +++++--- scripts/src/filter_raw_reads.R | 3 ++- scripts/src/join_metadata.R | 5 ++++- 8 files changed, 51 insertions(+), 55 deletions(-) diff --git a/scripts/collate_fastq_reads.R b/scripts/collate_fastq_reads.R index 9da53c38..ea51d9c8 100644 --- a/scripts/collate_fastq_reads.R +++ b/scripts/collate_fastq_reads.R @@ -51,7 +51,7 @@ if(!all(id_cols %in% colnames(sample_meta))) { id_cols[!id_cols %in% colnames(sample_meta)])) } -print("Collating fastq reads ...") +print("Calling collate_fastq_reads ...") raw_counts= collate_fastq_reads(uncollapsed_raw_counts= raw_counts_uncollapsed, sample_meta= sample_meta, sequencing_index_cols= sequencing_index_cols, diff --git a/scripts/filter_counts.R b/scripts/filter_counts.R index 6fabf610..034cbdfa 100755 --- a/scripts/filter_counts.R +++ b/scripts/filter_counts.R @@ -73,13 +73,13 @@ cell_line_meta %<>% # distinct() # This needs to be removed for sequencing_index_cols to work! - YL # Run filter_raw_reads ----- -print("creating filtered count file") -filtered_counts = filter_raw_reads(raw_counts= raw_counts, sample_meta= sample_meta, - cell_line_meta= cell_line_meta, - cell_set_meta= cell_set_meta, - CB_meta= CB_meta, - id_cols= id_cols, - count_threshold= as.numeric(args$count_threshold)) +print('Calling filter_raw_reads ...') +filtered_counts= filter_raw_reads(raw_counts= raw_counts, sample_meta= sample_meta, + cell_line_meta= cell_line_meta, + cell_set_meta= cell_set_meta, + CB_meta= CB_meta, + id_cols= id_cols, + count_threshold= as.numeric(args$count_threshold)) # Pulling pool_id when db_flag and pool_id flags are passed if (args$pool_id) { @@ -110,12 +110,12 @@ if(sum(cl_entries$n) == 0) { # Write out module outputs ---- unmapped_reads= filtered_counts$unmapped_reads unmapped_out = paste(args$out, 'unmapped_reads.csv', sep='/') -print(paste("writing unmapped reads to: ", unmapped_out)) +print(paste("Writing unmapped reads to: ", unmapped_out)) write.csv(unmapped_reads, unmapped_out, row.names=F) annotated_counts = filtered_counts$annotated_counts annot_out_file = paste(args$out, 'annotated_counts.csv', sep='/') -print(paste("writing annotated counts to: ", annot_out_file)) +print(paste("Writing annotated counts to: ", annot_out_file)) write.csv(annotated_counts, annot_out_file, row.names=F) filtered_counts = filtered_counts$filtered_counts @@ -139,6 +139,6 @@ if(args$rm_data == TRUE){ } filtrc_out_file = paste(args$out, 'filtered_counts.csv', sep='/') -print(paste("writing filtered counts csv to: ", filtrc_out_file)) +print(paste("Writing filtered counts csv to: ", filtrc_out_file)) write.csv(filtered_counts, filtrc_out_file, row.names=F, quote=F) diff --git a/scripts/filteredCounts_QC.R b/scripts/filteredCounts_QC.R index bacfca23..e98ca966 100755 --- a/scripts/filteredCounts_QC.R +++ b/scripts/filteredCounts_QC.R @@ -1,18 +1,14 @@ options(cli.unicode = FALSE) suppressPackageStartupMessages(library(argparse)) -suppressPackageStartupMessages(library(dplyr)) suppressPackageStartupMessages(library(scam)) suppressPackageStartupMessages(library(magrittr)) 
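# [editor's aside, not part of the patch] The library() consolidation in this
# hunk relies on library(tidyverse) attaching the core packages (ggplot2,
# tibble, tidyr, readr, purrr, dplyr, stringr, forcats) in one call; ggpubr is
# not part of the tidyverse, so its removal presumably means it was unused.
# To list the tidyverse packages:
tidyverse::tidyverse_packages()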
-suppressPackageStartupMessages(library(tidyr)) suppressPackageStartupMessages(library(reshape2)) -suppressPackageStartupMessages(library(tibble)) suppressPackageStartupMessages(library(stringr)) suppressPackageStartupMessages(library(grDevices)) -suppressPackageStartupMessages(library(ggplot2)) -suppressPackageStartupMessages(library(ggpubr)) +suppressPackageStartupMessages(library(tidyverse)) suppressPackageStartupMessages(library(scales)) # for out of bound handling in plots -suppressPackageStartupMessages(library(ggpmisc)) # with ggplot to add fit line and labels -suppressPackageStartupMessages(library(WGCNA)) +suppressPackageStartupMessages(library(ggpmisc)) # with ggplot to add linear fit labels +suppressPackageStartupMessages(library(WGCNA)) # for faster correlations source("/workspace/scripts/src/QC_images.R") # Argument parser ---- @@ -30,8 +26,6 @@ parser$add_argument("--annotated_counts", default= "annotated_counts.csv", parser$add_argument("--normalized_counts", default="normalized_counts.csv", help="path to file containing normalized counts") parser$add_argument("--lfc", default="l2fc.csv", help= "path to l2fc file") -parser$add_argument("--CB_meta", default="/data/CB_meta.csv", help = "control barcode metadata") -parser$add_argument("--cell_set_meta", default="cell_set_meta.csv", help = "Cell set metadata") parser$add_argument("--cell_line_cols", default= 'DepMap_ID,CCLE_name', help= "Columns that identify cell lines or barcodes") parser$add_argument("--id_cols", default= 'pcr_plate,pcr_well', @@ -54,6 +48,8 @@ if (args$out == ""){ } # Read in files and pull out parameters ---- +sample_meta= data.table::fread(args$sample_meta, header= TRUE, sep= ',') + # Pipeline outputs raw_counts_uncollapsed= data.table::fread(args$raw_counts_uncollapsed, header= TRUE, sep= ',') raw_counts= data.table::fread(args$raw_counts, header= TRUE, sep= ',') @@ -65,37 +61,22 @@ if(file.exists(args$normalized_counts)) { } l2fc= data.table::fread(args$lfc, header= TRUE, sep= ',') -# Metadata files -sample_meta= data.table::fread(args$sample_meta, header= TRUE, sep= ',', data.table= FALSE) -CB_meta= data.table::fread(args$CB_meta, header=TRUE, sep=',', data.table=FALSE) -cell_set_meta = data.table::fread(args$cell_set_meta, header=TRUE, sep=',', data.table=FALSE) - # Parameters cell_line_cols = unlist(strsplit(args$cell_line_cols, ",")) id_cols= unlist(strsplit(args$id_cols, ",")) sig_cols= unlist(strsplit(args$sig_cols, ",")) control_type = args$control_type count_threshold= as.numeric(args$count_threshold) +# -# # If flag passed, use cell_set_meta file generated for the project via CellDB -# if (args$db_flag) { -# print("Calling cell_set_meta generated using CellDB") -# cell_set_meta = read.csv("cell_set_meta.csv") -# # Otherwise, use static file -# } else { -# print("Using static cell set metadata file to locate cell information.") -# cell_set_meta = read.csv(args$cell_set_meta) -# } - -print("Generating QC images ...") +# Call QC images function ---- +print("Calling QC images ...") QC_images(raw_counts_uncollapsed= raw_counts_uncollapsed, raw_counts= raw_counts, annotated_counts= annotated_counts, normalized_counts= normalized_counts, l2fc= l2fc, - sample_meta= sample_meta, - CB_meta= CB_meta, - cell_set_meta= cell_set_meta, + sample_meta= sample_meta, cell_line_cols= c('DepMap_ID', 'CCLE_name'), id_cols= id_cols, sig_cols= sig_cols, diff --git a/scripts/join_metadata.R b/scripts/join_metadata.R index 861f8024..877509f1 100644 --- a/scripts/join_metadata.R +++ b/scripts/join_metadata.R @@ -24,8 
+24,8 @@ if (args$out == "") { sample_meta= data.table::fread(args$sample_meta, header= T, sep= ',') sig_cols= unlist(strsplit(args$sig_cols, ",")) -# For assay pool meta, check if it exists. If so, then filter it for relavent cell_sets/davepool_ids -# and select and rename some columns. +# For assay pool meta, check if it exists. If so, then filter it for relevant cell_sets/davepool_ids +# and select/rename some columns. assay_pool_meta_exists= FALSE if(file.exists(args$assay_pool_meta)) { assay_pool_meta_exists= TRUE # Update boolean diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R index 09b2b381..3ad0db66 100755 --- a/scripts/src/QC_images.R +++ b/scripts/src/QC_images.R @@ -23,6 +23,7 @@ validate_columns_exist= function(selected_cols, df) { #' #' Generates some simple summaries for each unique index. #' +#' @import tidyverse #' @param df A dataframe which must contain the column "n" which represents the count of a read. #' @param index_col The name of the column contain the index barcodes as a string. This column must be present in "df". #' @param valid_indices. A vector of all the valid indices for "index_col". @@ -87,6 +88,7 @@ create_qc_table= function(raw_counts_uncollapsed, raw_counts, filtered_counts, v #' Creates the total counts barplot with bars colored by the barcode type, #' either a cell line barcode or control barcode. #' +#' @import tidyverse #' @param filtered_counts Filtered counts dataframe. #' @param id_cols Vector of columns names that identify each sample. #' @param facet_col String name of the column in filtered_counts to facet the plot. @@ -106,6 +108,7 @@ create_total_counts_barplot= function(filtered_counts, id_cols, facet_col= NA) { dplyr::group_by(pick(all_of(na.omit(c('sample_id', facet_col, 'barcode_type'))))) %>% dplyr::summarise(total_counts= sum(n)) %>% dplyr::ungroup() + # Create total counts plot total_counts_plot= total_counts %>% ggplot(aes(x=sample_id, y=total_counts, fill=barcode_type)) + geom_col(alpha=0.75, position='identity') + @@ -125,6 +128,7 @@ create_total_counts_barplot= function(filtered_counts, id_cols, facet_col= NA) { #' the total cell line counts on teh y axis. The parameter "include_ctrl_bcs" can be used to include the control #' barcodes in the cell line count. #' +#' @import tidyverse #' @param filtered_counts Filtered counts dataframe. #' @param id_cols Vector of column names that identify each sample. #' @param facet_col String name of the column in filtered_counts to facet the plot. @@ -184,6 +188,7 @@ create_recovery_barplot= function(filtered_counts, id_cols, facet_col= NA, value #' #' Creates a line plot of the cumulative reads. #' +#' @import tidyverse #' @param input_df Input dataframe. Usually is the filtered_counts dataframe. #' @param id_cols Vector of column names that identify every PCR well. #' @param counts_col Name of the column that contains the values. Defaults to "n". @@ -244,8 +249,8 @@ create_cdf_plot= function(input_df, id_cols, counts_col= 'n', mark1= 0.5, mark2= output_plot= data_for_plot %>% ggplot(aes(x= rank_pct, y=cum_pct)) + # Color control barcodes if specified - { if(contains_cbs) geom_point(. %>% dplyr::filter(!is.na(Name)), - mapping= aes(x= rank_pct, y=cum_pct, color=reorder(Name, log2_dose)), size= 2) } + + {if(contains_cbs) geom_point(. 
%>% dplyr::filter(!is.na(Name)),
+                                     mapping= aes(x= rank_pct, y=cum_pct, color=reorder(Name, log2_dose)), size= 2)} +
     geom_line(color='black') +
     # point for mark1 of counts
     geom_segment(aes(x= -Inf , y= mark1, xend= mark1_loc, yend = mark1), color= 'black', linetype= 2) +
@@ -269,6 +274,7 @@ create_cdf_plot= function(input_df, id_cols, counts_col= 'n', mark1= 0.5, mark2=
 #'
 #' Creates a scatter plot of the control barcodes.
 #'
+#' @import tidyverse
 #' @param normalized_counts Dataframe output from the normalize module.
 #' @param id_cols Vector of column names that identify every PCR well.
 #' @param value_col Name of the column that contains the values.
@@ -376,6 +382,8 @@ create_cor_heatmap= function(input_df, row_id_cols, col_id_cols, value_col,
 #'
 #' From a long table, creates scatter plots of two replicates.
 #'
+#' @import tidyverse
+#' @import ggpmisc
 #' @param input_df Dataframe.
 #' @param cell_line_cols List of column names used to identify each cell line or control barcode.
 #' @param replicate_group_cols List of column names that describe a group of similar conditions.
@@ -441,8 +449,6 @@ create_replicate_scatterplots= function(input_df, cell_line_cols, replicate_grou
 #' @param normalized_counts Normalized counts dataframe from the normalize module. This is an optional parameter.
 #' @param l2fc L2FC dataframe from the compute_l2fc module. This is used for the bio_reps plot.
 #' @param sample_meta Dataframe of the sample metadata for the sequencing run.
-#' @param CB_meta Dataframe of the control barcode metadata. This is only used for the CDF plot.
-#' @param cell_set_meta Dataframe of the cell set metadata. This is only used for the CDF plot.
 #' @param cell_line_cols Vector of sample meta column names used to describe a cell line or barcode.
 #' @param id_cols Vector of sample meta column names used to identify each PCR well.
 #'                This defaults to "pcr_plate", "pcr_well".
@@ -454,7 +460,7 @@ create_replicate_scatterplots= function(input_df, cell_line_cols, replicate_grou
 #' @returns NA. QC images are written out to the specified folder.
 QC_images= function(raw_counts_uncollapsed, raw_counts,
                     annotated_counts, normalized_counts= NA, l2fc,
-                    sample_meta, CB_meta, cell_set_meta,
+                    sample_meta,
                     cell_line_cols, id_cols= c('pcr_plate', 'pcr_well'), sig_cols,
                     control_type= 'negcon', count_threshold= 40,
@@ -463,8 +469,9 @@ QC_images= function(raw_counts_uncollapsed, raw_counts,
   require(tidyverse)
   require(magrittr)
   require(reshape2)
-  require(scales)
   require(WGCNA)
+  require(scales)
+  require(ggpmisc)
   
   # Some preprocessing ----
   # Set out directory if none is specified.
@@ -480,7 +487,7 @@ QC_images= function(raw_counts_uncollapsed, raw_counts,
   cb_check= sample_meta %>% dplyr::filter(control_barcodes %in% c("Y", "T", T),
                                           !(trt_type %in% c("empty", "", "CB_only")) & !is.na(trt_type))
-  contains_cbs= ifelse(nrow(cb_check)!= 0, T, F)
+  contains_cbs= ifelse(nrow(cb_check)!= 0, TRUE, FALSE)
   
   # Pull filtered counts from annotated counts
   filtered_counts= annotated_counts %>% dplyr::filter(expected_read)
@@ -684,7 +691,8 @@ QC_images= function(raw_counts_uncollapsed, raw_counts,
   if(contains_cbs & is.data.frame(normalized_counts)) {
     print("8. 
Generating control_barcode_trend image") potential_error= base::tryCatch({ - trend_sc= create_ctrlBC_scatterplots(normalized_counts, id_cols, value_col= 'log2_n') + trend_sc= create_ctrlBC_scatterplots(normalized_counts %>% dplyr::filter(control_barcodes %in% c("Y", "T", T)), + id_cols, value_col= 'log2_n') pdf(file=paste(out, "control_barcode_trend.pdf", sep="/"), width=sqrt(num_profiles)*2, height=sqrt(num_profiles)*2) @@ -837,7 +845,7 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, } # End _________________________ ---- - print('QC finishing') + print('QCs finishing!') if(length(na.omit(skipped_qcs)) != 0) { print(paste0('WARNING: The following ', length(skipped_qcs), ' QCs encountered errors and were skipped - ')) print(na.omit(skipped_qcs)) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index b476feda..1e038526 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -83,6 +83,7 @@ validate_detected_flowcells= function(detected_flowcells, expected_flowcells) { #' #' @param uncollapsed_raw_counts Dataframe of reads from all the fastq files with the following columns - #' "flowcell_name", "flowcell_lane", "index_1", "index_2", "forward_read_cl_barcode", and "n". +#' The flowcell columns are optional. If they do not exists, flowcell filters will be skipped. #' @param sample_meta Sample metadata generate for the project which may contain the following columns - #' "flowcell_names", "flowcell_lanes", "index_1", "index_2". The sample meta MUST contain #' "flowcell_names" and "flowcell_lanes" for filtering. @@ -141,9 +142,11 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, stop('One or more sequencing_index_cols in the sample meta is not filled out.') } - # If "flowcell_name" and "flowcell_lane" are present filter for valid flowcells ---- + # If "flowcell_name" and "flowcell_lane" are present, filter for valid flowcells ---- # Note: Can this switch be tied to the sequencer type? if(base::all(c('flowcell_name', 'flowcell_lane') %in% colnames(uncollapsed_raw_counts))) { + print('Detecting flowcells. Filtering for valid flowcells ...') + # Determine which flowcell names + lanes are expected ---- # "flowcell_names" and "flowcell_lanes" are strings that can contain more than one item. # Columns can be parsed by splitting on the chars , ; : @@ -191,7 +194,7 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, } else { print('Flowcell_name and/or flowcell_lane were not detected in raw_counts_uncollapsed.') - print('Proceeding without flowcell filters ...') + print('Proceeding without filtering flowcells ...') } # Create sequence map ---- @@ -223,6 +226,6 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, print('Warning: Low index purity!') } else {} - print('Done!') + print('Collate_fastq_reads has completed!') return(raw_counts) } diff --git a/scripts/src/filter_raw_reads.R b/scripts/src/filter_raw_reads.R index 32cbbe72..c8c3a939 100755 --- a/scripts/src/filter_raw_reads.R +++ b/scripts/src/filter_raw_reads.R @@ -119,7 +119,7 @@ filter_raw_reads = function(raw_counts, # Validation: Check that cell sets do not contain duplicate LUAs ---- # This will produce a warning if a LUA appears in a cell set more than once! - # This currently does NOT result in an error. Error avoided using a distinct when creating the template + # This currently does NOT result in an error. Error avoided using a distinct when creating the template. 
validate_cell_set_luas(sample_meta, cell_set_meta) # Split off unmapped reads ---- @@ -191,6 +191,7 @@ filter_raw_reads = function(raw_counts, print('Warning: Low cell line purity!') } + print('Filter_raw_reads has completed!') return(list(unmapped_reads= unmapped_reads, annotated_counts= annotated_counts, filtered_counts= filtered_counts)) diff --git a/scripts/src/join_metadata.R b/scripts/src/join_metadata.R index 3842054c..4f3f7e95 100644 --- a/scripts/src/join_metadata.R +++ b/scripts/src/join_metadata.R @@ -23,11 +23,14 @@ validate_columns_exist= function(selected_cols, df) { #' #' Joins a given data frame with the sample meta. #' +#' @import tidyverse #' @param input_df Input dataframe that should contain the columns specified in the "key_cols" parameter and "cell_set". #' @param metadata Dataframe of the sample meta used in the run. #' @param key_cols Vector of column names used as identifiers in the sample meta. #' @returns Data frame with additional columns from the sample meta. join_metadata= function(input_df, metadata, key_cols) { + require(tidyverse) + # Validation: Check that key_cols are present in df ---- if(validate_columns_exist(key_cols, input_df) == FALSE) { stop('Not all key_cols (printed above) are present in the provided dataframe.') @@ -51,7 +54,7 @@ join_metadata= function(input_df, metadata, key_cols) { dplyr::select(-tidyselect::ends_with('.y')) # Validation: Check that merge did not explode ---- - print(paste0(' Input df rows: ', nrow(input_df))) + print(paste0('Input df rows: ', nrow(input_df))) print(paste0('Output df rows: ', nrow(output_df))) if(nrow(input_df) < nrow(output_df)) { stop('Metadata join is producing more rows than expected!') From 144d097dda0c04dcb8357de91ebaa7638572ae88 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 6 Sep 2024 13:10:43 -0400 Subject: [PATCH 058/127] Update collate_fastq_reads.R --- scripts/src/collate_fastq_reads.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index 1e038526..7298d0d1 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -214,7 +214,7 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, print('Summing up reads ...') raw_counts= uncollapsed_raw_counts %>% dplyr::inner_join(sequencing_map, by= sequencing_index_cols, relationship= 'many-to-one') %>% - dplyr::group_by(pick(all_of(c(id_cols, barcode_col)))) %>% + dplyr::group_by(pick(all_of(c(id_cols, barcode_col)))) %>% dplyr::summarize(n= sum(n)) %>% dplyr::ungroup() # Calculate index purity ---- From 28280ac9d9a50d2011e97295be0ea164a0e50773 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 6 Sep 2024 18:03:14 -0400 Subject: [PATCH 059/127] Fixed typo in conflict resolution --- scripts/filter_counts.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/filter_counts.R b/scripts/filter_counts.R index 2111bc12..fb55a52f 100755 --- a/scripts/filter_counts.R +++ b/scripts/filter_counts.R @@ -24,9 +24,8 @@ parser$add_argument("-s", "--sample_meta", default="sample_meta.csv", help= "Sam parser$add_argument("--cell_line_meta", default="cell_line_meta.csv", help= "Cell Line metadata") parser$add_argument("--cell_set_meta", default="cell_set_meta.csv", help= "Cell set metadata") parser$add_argument("--assay_pool_meta", default="assay_pool_meta.txt", help = "Assay pool metadata") -parser$add_argument("--id_cols", default= "pcr_plate,pcr_well", +parser$add_argument("--id_cols", default= 
"pcr_plate,pcr_well", help = "Columns to identify each PCR well") parser$add_argument("--CB_meta", default="CB_meta.csv", help = "Control Barcode metadata") - help = "Sequencing columns in the sample meta") parser$add_argument("--count_threshold", default= 40, help = "Low counts threshold") parser$add_argument("--rm_data", type="logical", help = "Remove bad experimental data") parser$add_argument("--pool_id", type="logical", help = "Pull pool IDs from CellDB.") From 8e458338d84faca973c1c2a40055e97e9a487617 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 6 Sep 2024 18:18:28 -0400 Subject: [PATCH 060/127] Dropped unused params --- scripts/filteredCounts_QC.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/scripts/filteredCounts_QC.sh b/scripts/filteredCounts_QC.sh index 7cfaa901..c2653172 100644 --- a/scripts/filteredCounts_QC.sh +++ b/scripts/filteredCounts_QC.sh @@ -113,8 +113,6 @@ args=( --annotated_counts "$ANNOTATED_COUNTS" --normalized_counts "$NORMALIZED_COUNTS" --sig_cols "$SIG_COLS" ---cell_set_meta "$CELL_SET_META" ---CB_meta "$CONTROL_BARCODE_META" --out "$BUILD_DIR" --count_threshold "$COUNT_THRESHOLD" --control_type "$CTL_TYPES" @@ -128,8 +126,6 @@ args=( echo Rscript filteredCounts_QC.R --sample_meta $SAMPLE_META \ --annotated_counts $ANNOTATED_COUNTS \ --normalized_counts $NORMALIZED_COUNTS \ ---cell_set_meta $CELL_SET_META \ ---CB_meta $CONTROL_BARCODE_META \ --sig_cols $SIG_COLS \ --out $BUILD_DIR \ --count_threshold $COUNT_THRESHOLD \ From 02db68bc745bbcf756d2d1be595d1fb88fb16a08 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Thu, 19 Sep 2024 15:29:11 -0400 Subject: [PATCH 061/127] Changed merges to data.table also commented out a stop error --- scripts/src/collate_fastq_reads.R | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index 7298d0d1..6727a25c 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -70,7 +70,7 @@ validate_detected_flowcells= function(detected_flowcells, expected_flowcells) { print('The following flowcells/lanes specified in the sample meta were not detected in the fastq reads.') print(missing_flowcells) print('Check that the sample meta is correct or that all fastq files are in the correct directory.') - stop('One or more flowcell specified in the sample meta was not detected.') + #stop('One or more flowcell specified in the sample meta was not detected.') } } @@ -112,7 +112,7 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, if(reverse_index2) { if('index_2' %in% colnames(sample_meta)) { print("Reverse-complementing index 2 barcode ...") - sample_meta$index_2= chartr("ATGC", "TACG", stringi::stri_reverse(sample_meta$index_2)) + sample_meta[, index_2 := chartr("ATGC", "TACG", stringi::stri_reverse(sample_meta$index_2))] } else { stop('Reverse index 2 is set to TRUE, but index_2 does not exists.') } @@ -189,8 +189,9 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, } # Filter for expected flowcells ---- - uncollapsed_raw_counts= uncollapsed_raw_counts %>% - dplyr::inner_join(expected_flowcells, by= c('flowcell_name', 'flowcell_lane')) + uncollapsed_raw_counts= data.table::merge.data.table( + uncollapsed_raw_counts, data.table::setDT(expected_flowcells), + by= c('flowcell_name', 'flowcell_lane'), allow.cartesian= FALSE) } else { print('Flowcell_name and/or flowcell_lane were not detected in raw_counts_uncollapsed.') @@ -212,10 +213,8 @@ collate_fastq_reads= 
function(uncollapsed_raw_counts, sample_meta, # Create raw counts file ---- # Filter for the expected flowcells and summed up the reads over the ID cols. print('Summing up reads ...') - raw_counts= uncollapsed_raw_counts %>% - dplyr::inner_join(sequencing_map, by= sequencing_index_cols, relationship= 'many-to-one') %>% - dplyr::group_by(pick(all_of(c(id_cols, barcode_col)))) %>% - dplyr::summarize(n= sum(n)) %>% dplyr::ungroup() + raw_counts= data.table::merge.data.table(uncollapsed_raw_counts, sequencing_map, by= sequencing_index_cols) + raw_counts= raw_counts[, .(n= sum(n)), by= c(id_cols, barcode_col)] # Calculate index purity ---- index_purity= sum(raw_counts$n) / sum(uncollapsed_raw_counts$n) From f1c1f2374d25d3b5764a4fc2b4e97d37638f8e40 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 20 Sep 2024 11:29:43 -0400 Subject: [PATCH 062/127] Run collate in chunks --- scripts/collate_fastq_reads.R | 45 +++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/scripts/collate_fastq_reads.R b/scripts/collate_fastq_reads.R index ea51d9c8..f0cd6aaa 100644 --- a/scripts/collate_fastq_reads.R +++ b/scripts/collate_fastq_reads.R @@ -30,40 +30,61 @@ if (args$out == "") { args$out = args$wkdir } -# Run collate_fastq_reads ---- +# Read in sample meta and parse argument strings ---- # Read in files and parse vector arguments -raw_counts_uncollapsed= data.table::fread(args$raw_counts_uncollapsed, header= T, sep= ',') sample_meta= data.table::fread(args$sample_meta, header= T, sep= ',') # Parse vector inputs sequencing_index_cols= unlist(strsplit(args$sequencing_index_cols, ",")) id_cols= unlist(strsplit(args$id_cols, ",")) -# Validation: Check that sequencing_index_cols are from sample meta column names +# Validation: Check that sequencing_index_cols are from sample meta column names ---- if(!all(sequencing_index_cols %in% colnames(sample_meta))) { stop(paste('The following sequencing_index_cols were not found in the sample meta: ', sequencing_index_cols[!sequencing_index_cols %in% colnames(sample_meta)])) } -# Validation: Check that id_cols are from sample meta column names +# Validation: Check that id_cols are from sample meta column names ---- if(!all(id_cols %in% colnames(sample_meta))) { stop(paste('The following id_cols were not found in the sample meta: ', id_cols[!id_cols %in% colnames(sample_meta)])) } -print("Calling collate_fastq_reads ...") -raw_counts= collate_fastq_reads(uncollapsed_raw_counts= raw_counts_uncollapsed, - sample_meta= sample_meta, - sequencing_index_cols= sequencing_index_cols, - id_cols= id_cols, - reverse_index2= args$reverse_index2, - barcode_col= args$barcode_col) +# Run collate_fastq_reads on chunks ---- +# Set up loop to process chunks +header_col_names= data.table::fread(args$raw_counts_uncollapsed, header=T, sep= ',', nrow= 0) %>% colnames() +chunk_size= 10^6 # Maximum number of rows in a chunk +chunk_idx= 1 # Counter to keep track of chunks in a loop +current_chunk_size= chunk_size # Variable for loop exit condition +chunk_collector= list() # List to collect processed chunks -# Validation: Basic file size check +# For each chunk, call collate +while(current_chunk_size == chunk_size) { + nori_chunk= data.table::fread(args$raw_counts_uncollapsed, header= F, sep= ',', + col.names= header_col_names, + nrow= chunk_size, skip= chunk_size * (chunk_idx - 1) + 1) + + current_chunk_size= nrow(nori_chunk) # set current chunk size to stop loop + print(paste('Working on chunk', chunk_idx, 'with', current_chunk_size, 'rows.', sep= ' 
'))
+  
+  chunk_collector[[chunk_idx]]= collate_fastq_reads(nori_chunk, sample_meta,
+                                                    sequencing_index_cols= sequencing_index_cols,
+                                                    id_cols= id_cols,
+                                                    reverse_index2= args$reverse_index2,
+                                                    barcode_col= args$barcode_col)
+  
+  chunk_idx= chunk_idx + 1
+}
+
+raw_counts= data.table::rbindlist(chunk_collector)
+raw_counts= raw_counts[, .(n= sum(n)), by= c(id_cols, args$barcode_col)]
+
+# Validation: Basic file size check ----
+if(nrow(raw_counts) == 0) {
   stop('ERROR: Empty file generated. No rows in raw_counts output.')
 }
 
+# Write out file ----
 rc_out_file= paste(args$out, 'raw_counts.csv', sep='/')
 print(paste("Writing raw_counts.csv to ", rc_out_file))
 write.csv(raw_counts, rc_out_file, row.names= FALSE, quote= FALSE)

From c4e20072a7ae5fef3d667cd21e8f999e2793f39a Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Fri, 20 Sep 2024 11:30:11 -0400
Subject: [PATCH 063/127] Removed inplace reverse index
---
 scripts/src/collate_fastq_reads.R | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R
index 6727a25c..d1e84b65 100644
--- a/scripts/src/collate_fastq_reads.R
+++ b/scripts/src/collate_fastq_reads.R
@@ -67,10 +67,9 @@ validate_detected_flowcells= function(detected_flowcells, expected_flowcells) {
   missing_flowcells= expected_flowcells %>%
     dplyr::anti_join(detected_flowcells, by= c('flowcell_name', 'flowcell_lane'))
   if(nrow(missing_flowcells) != 0) {
-    print('The following flowcells/lanes specified in the sample meta were not detected in the fastq reads.')
+    print('WARNING: The following flowcells/lanes specified in the sample meta were not detected in the fastq reads.')
     print(missing_flowcells)
     print('Check that the sample meta is correct or that all fastq files are in the correct directory.')
-    #stop('One or more flowcell specified in the sample meta was not detected.')
   }
 }
@@ -112,7 +111,7 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta,
   if(reverse_index2) {
     if('index_2' %in% colnames(sample_meta)) {
       print("Reverse-complementing index 2 barcode ...")
-      sample_meta[, index_2 := chartr("ATGC", "TACG", stringi::stri_reverse(sample_meta$index_2))]
+      sample_meta$index_2= chartr("ATGC", "TACG", stringi::stri_reverse(sample_meta$index_2))
     } else {
       stop('Reverse index 2 is set to TRUE, but index_2 does not exist.')
     }

From 24f693bbae7829156da87f00f0210ce657d3640a Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Fri, 20 Sep 2024 11:31:53 -0400
Subject: [PATCH 064/127] Changed a mutate to data.table inplace
---
 scripts/src/filter_raw_reads.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/src/filter_raw_reads.R b/scripts/src/filter_raw_reads.R
index c8c3a939..3899d46b 100755
--- a/scripts/src/filter_raw_reads.R
+++ b/scripts/src/filter_raw_reads.R
@@ -127,8 +127,8 @@ filter_raw_reads = function(raw_counts,
   # but do not map to known barcodes in PRISM.
   # Also sorted reads in descending order by read count. 
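# [Editor's note] The skip/nrow arithmetic in the PATCH 062 chunking loop above
# can be sanity-checked on its own; with a 1M-row chunk size and the header on
# line 1, each chunk starts one row past the previous chunk's end:
chunk_size= 10^6
sapply(1:3, function(chunk_idx) chunk_size * (chunk_idx - 1) + 1)
#> [1]       1 1000001 2000001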
print('Splitting off unmapped reads ...') - raw_counts %<>% dplyr::mutate(mapped= forward_read_cl_barcode %in% c(cell_line_meta$Sequence, CB_meta$Sequence)) - unmapped_reads= raw_counts %>% dplyr::filter(mapped == FALSE) %>% dplyr::select(-mapped) %>% + raw_counts[, mapped := forward_read_cl_barcode %in% c(cell_line_meta$Sequence, CB_meta$Sequence)] + unmapped_reads= raw_counts %>% dplyr::filter(mapped == FALSE) %>% dplyr::select(-mapped) %>% dplyr::arrange(dplyr::desc(n)) # Creating a template of all expected reads in the run ---- From bad695100c4e7b5344714d1267884536782aefe8 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 20 Sep 2024 12:52:29 -0400 Subject: [PATCH 065/127] Changed dplyr to data.table --- scripts/src/filter_raw_reads.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/src/filter_raw_reads.R b/scripts/src/filter_raw_reads.R index 3899d46b..015d0813 100755 --- a/scripts/src/filter_raw_reads.R +++ b/scripts/src/filter_raw_reads.R @@ -98,6 +98,7 @@ filter_raw_reads = function(raw_counts, count_threshold= 40) { require(tidyverse) require(magrittr) + browser() # Processing metadata and inputs ---- # CB meta is in log10 and should be converted to log2. @@ -128,8 +129,7 @@ filter_raw_reads = function(raw_counts, # Also sorted reads in descending order by read count. print('Splitting off unmapped reads ...') raw_counts[, mapped := forward_read_cl_barcode %in% c(cell_line_meta$Sequence, CB_meta$Sequence)] - unmapped_reads= raw_counts %>% dplyr::filter(mapped == FALSE) %>% dplyr::select(-mapped) %>% - dplyr::arrange(dplyr::desc(n)) + unmapped_reads= raw_counts[mapped==FALSE,][order(-n)][, mapped:= NULL] # Creating a template of all expected reads in the run ---- # Use all 4 meta data files to create a "template" dataframe where From ef18cf05700eab3d0e6c0b01286378a84ab7c24e Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 20 Sep 2024 12:55:39 -0400 Subject: [PATCH 066/127] Drop browser --- scripts/src/filter_raw_reads.R | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/src/filter_raw_reads.R b/scripts/src/filter_raw_reads.R index 015d0813..cd94c692 100755 --- a/scripts/src/filter_raw_reads.R +++ b/scripts/src/filter_raw_reads.R @@ -98,7 +98,6 @@ filter_raw_reads = function(raw_counts, count_threshold= 40) { require(tidyverse) require(magrittr) - browser() # Processing metadata and inputs ---- # CB meta is in log10 and should be converted to log2. 
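# [Editor's note] The dplyr -> data.table rewrites in PATCHes 064-066 above all
# rely on the same equivalence; a minimal, self-contained sketch with toy data
# (the barcodes and counts below are made up):
library(dplyr)
library(data.table)

toy= data.frame(forward_read_cl_barcode= c('AAA', 'CCC', 'TTT'), n= c(5L, 20L, 1L))
known= c('AAA', 'CCC')

# dplyr: flag, filter, sort descending, then drop the helper column.
dplyr_out= toy %>%
  dplyr::mutate(mapped= forward_read_cl_barcode %in% known) %>%
  dplyr::filter(mapped == FALSE) %>%
  dplyr::arrange(dplyr::desc(n)) %>%
  dplyr::select(-mapped)

# data.table: the same steps as an in-place flag plus chained subsetting.
toy_dt= data.table::as.data.table(toy)
toy_dt[, mapped := forward_read_cl_barcode %in% known]
dt_out= toy_dt[mapped == FALSE,][order(-n)][, mapped := NULL]

stopifnot(identical(dplyr_out$n, dt_out$n))  # both keep only the unmapped TTT row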
From ea388607db34d2f3d6642caeab2105f845ef9c46 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 20 Sep 2024 13:00:52 -0400 Subject: [PATCH 067/127] Added escape for empty chunks --- scripts/src/collate_fastq_reads.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index d1e84b65..dc8d0ee1 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -215,6 +215,12 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, raw_counts= data.table::merge.data.table(uncollapsed_raw_counts, sequencing_map, by= sequencing_index_cols) raw_counts= raw_counts[, .(n= sum(n)), by= c(id_cols, barcode_col)] + # Escape for when a chunk contains invalid sequencing locations + if(nrow(raw_counts) == 0) { + print('WARNING: raw_counts is empty!') + return(raw_counts) + } + # Calculate index purity ---- index_purity= sum(raw_counts$n) / sum(uncollapsed_raw_counts$n) print(paste0('Index purity: ', round(index_purity, 4))) From 51f5c42d37b362a4c3bc057a5af9e4cefc59084b Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Tue, 24 Sep 2024 17:36:49 -0400 Subject: [PATCH 068/127] Wrapped chunking in a function --- scripts/src/collate_fastq_reads.R | 102 ++++++++++++++++++++++-------- 1 file changed, 77 insertions(+), 25 deletions(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index dc8d0ee1..23ce723a 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -73,6 +73,38 @@ validate_detected_flowcells= function(detected_flowcells, expected_flowcells) { } } +#' process_in_chunks +#' +#' This function runs some action over chunks of a large file. At the end, all chunks are +#' appended together. +#' +#' @param large_file_path description +#' @param chunk_size description +#' @param action A function passed to act on each chunk +process_in_chunks= function(large_file_path, chunk_size= 10^6, action, ...) { + + header_col_names= data.table::fread(large_file_path, header= TRUE, sep= ',', nrow= 0) %>% colnames() + chunk_idx= 1 # Counter to keep track of chunks in a loop + current_chunk_size= chunk_size # Variable for loop exit condition + chunk_collector= list() # List to collect processed chunks + + # For each chunk, call collate + while(current_chunk_size == chunk_size) { + current_chunk= data.table::fread(large_file_path, header= FALSE, sep= ',', + col.names= header_col_names, + nrow= chunk_size, skip= chunk_size * (chunk_idx - 1) + 1) + + current_chunk_size= nrow(current_chunk) # set current chunk size to stop loop + print(paste('Working on chunk', chunk_idx, 'with', current_chunk_size, 'rows.', sep= ' ')) + + chunk_collector[[chunk_idx]]= do.call(action, list(current_chunk, ...)) + chunk_idx= chunk_idx + 1 + } + + output_table= data.table::rbindlist(chunk_collector) + return(output_table) +} + #' collate_fastq_reads #' #' This function takes in the fastq reads (uncollapsed_raw_counts) and @@ -100,12 +132,14 @@ validate_detected_flowcells= function(detected_flowcells, expected_flowcells) { #' @param barcode_col String name of the column in uncollapsed_raw_counts that contains the sequences. #' @returns Returns a dataframe with columns specified by the id_cols along with barcode_col, and "n". 
#' @import tidyverse +#' @import data.table collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, sequencing_index_cols= c('index_1', 'index_2'), id_cols= c('pcr_plate', 'pcr_well'), reverse_index2= FALSE, barcode_col= 'forward_read_cl_barcode') { require(tidyverse) + require(data.table) # Reverse index 2 if specified ---- if(reverse_index2) { @@ -117,6 +151,9 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, } } + # Create sequence map ---- + sequencing_map= sample_meta %>% dplyr::distinct(pick(all_of(c(sequencing_index_cols, id_cols)))) + # Validation: Check that flowcell_names and flowcell_lanes exist in the sample meta ---- if(!validate_columns_exist(c('flowcell_names', 'flowcell_lanes'), sample_meta)) { stop('The above column(s) are NOT present in the sample meta.') @@ -125,8 +162,7 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # Validation: Check that sequencing_index_cols exist in the sample meta ---- if(!validate_columns_exist(sequencing_index_cols, sample_meta)) { print('The following sequencing_index_cols are not present in the sample meta.') - print(sequencing_index_cols[!sequencing_index_cols %in% colnames(sample_meta)]) - stop('One or more sequencing_index_cols is NOT present in the sample meta.') + stop('The above sequencing_index_cols are NOT present in the sample meta.') } # Validation: Check that id_cols exist in the sample meta ---- @@ -141,6 +177,15 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, stop('One or more sequencing_index_cols in the sample meta is not filled out.') } + # Validation: Check that mapping is one to one ---- + check_mapping= sequencing_map %>% dplyr::group_by(pick(all_of(sequencing_index_cols))) %>% + dplyr::filter(dplyr::n() > 1) %>% dplyr::ungroup() + if(nrow(check_mapping) > 0) { + print('The following sequencing locations map to multiple conditions.') + print(check_mapping) + stop('The sequencing index columns do not map 1 to 1 to the ID columns.') + } + # If "flowcell_name" and "flowcell_lane" are present, filter for valid flowcells ---- # Note: Can this switch be tied to the sequencer type? if(base::all(c('flowcell_name', 'flowcell_lane') %in% colnames(uncollapsed_raw_counts))) { @@ -197,39 +242,46 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, print('Proceeding without filtering flowcells ...') } - # Create sequence map ---- - sequencing_map= sample_meta %>% dplyr::distinct(pick(all_of(c(sequencing_index_cols, id_cols)))) - - # Validation: Check that mapping is one to one ---- - check_mapping= sequencing_map %>% dplyr::group_by(pick(all_of(sequencing_index_cols))) %>% - dplyr::filter(dplyr::n() > 1) %>% dplyr::ungroup() - if(nrow(check_mapping) > 0) { - print('The following sequening locations map to multiple conditions.') - print(check_mapping) - stop('The sequencing index columns do not map 1 to 1 to the ID columns.') - } - - # Create raw counts file ---- + # Create summed_reads file ---- # Filter for the expected flowcells and summed up the reads over the ID cols. 
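# [Editor's note] The merge-and-sum just below is an inner join followed by a
# grouped aggregation; a self-contained illustration with invented indices and
# wells (not the pipeline's real metadata):
library(data.table)
reads= data.table::data.table(index_1= c('A', 'A', 'B', 'Z'),
                              forward_read_cl_barcode= c('x', 'x', 'y', 'x'),
                              n= c(2L, 3L, 4L, 9L))
seq_map= data.table::data.table(index_1= c('A', 'B'), pcr_well= c('A01', 'A02'))
# The inner join drops index 'Z' (no assigned well), which is the read loss
# that the chunk-level index purity check reports.
merged= data.table::merge.data.table(reads, seq_map, by= 'index_1')
summed= merged[, .(n= sum(n)), by= c('pcr_well', 'forward_read_cl_barcode')]
# summed: A01/x n= 5 and A02/y n= 4; the 9 reads from 'Z' are excluded.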
print('Summing up reads ...')
-  raw_counts= data.table::merge.data.table(uncollapsed_raw_counts, sequencing_map, by= sequencing_index_cols)
-  raw_counts= raw_counts[, .(n= sum(n)), by= c(id_cols, barcode_col)]
+  summed_reads= data.table::merge.data.table(uncollapsed_raw_counts, sequencing_map, by= sequencing_index_cols)
+  summed_reads= summed_reads[, .(n= sum(n)), by= c(id_cols, barcode_col)]
 
   # Escape for when a chunk contains invalid sequencing locations
-  if(nrow(raw_counts) == 0) {
-    print('WARNING: raw_counts is empty!')
-    return(raw_counts)
+  if(nrow(summed_reads) == 0) {
+    print('WARNING: summed_reads is empty!')
+    return(summed_reads)
   }
 
-  # Calculate index purity ----
-  index_purity= sum(raw_counts$n) / sum(uncollapsed_raw_counts$n)
-  print(paste0('Index purity: ', round(index_purity, 4)))
+  # Calculate index purity in a chunk ----
+  index_purity= sum(summed_reads$n) / sum(uncollapsed_raw_counts$n)
+  print(paste0('Index purity in chunk: ', round(index_purity, 4)))
   if(index_purity > 1) {
-    stop('ERROR: Index purity is greater than 1!')
+    stop('ERROR: Chunk index purity is greater than 1!')
   } else if(index_purity < 0.5) {
     print('Warning: Low index purity!')
   } else {}
   
   print('Collate_fastq_reads has completed!')
-  return(raw_counts)
+  return(summed_reads)
+}
+
+#' extract_known_barcodes
+#'
+#' This function splits a table of summed reads into reads that map to known
+#' PRISM barcodes and reads that do not.
+#'
+#' @param summed_reads Dataframe of summed reads containing the column specified by barcode_col and the column "n".
+#' @param known_barcodes A vector of known barcode sequences.
+#' @param barcode_col String name of the column in summed_reads that contains the sequences.
+extract_known_barcodes= function(summed_reads, known_barcodes, barcode_col= 'forward_read_cl_barcode') {
+  # Create boolean column of known or unknown
+  summed_reads[, known := get(barcode_col) %chin% known_barcodes]
+
+  # Filter using that boolean column
+  unknown_reads= summed_reads[known == FALSE,][order(-n)][, known := NULL]
+  summed_reads= summed_reads[known == TRUE,][, known := NULL]
+
+  return(list(unknown_reads= unknown_reads, known_reads= summed_reads))
 }

From 3fd1f90786403b1d848b09e86a138ec5e6dada22 Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Tue, 24 Sep 2024 17:37:33 -0400
Subject: [PATCH 069/127] Call chunking function and extract known reads
---
 scripts/collate_fastq_reads.R | 76 +++++++++++++++++------------------
 1 file changed, 37 insertions(+), 39 deletions(-)

diff --git a/scripts/collate_fastq_reads.R b/scripts/collate_fastq_reads.R
index f0cd6aaa..86a82298 100644
--- a/scripts/collate_fastq_reads.R
+++ b/scripts/collate_fastq_reads.R
@@ -2,6 +2,7 @@ options(cli.unicode = FALSE)
 library(argparse)
 library(magrittr)
 library(tidyverse)
+library(data.table)
 source("./src/collate_fastq_reads.R")
 
 # Argument parser ----
@@ -38,53 +39,50 @@
 sample_meta= data.table::fread(args$sample_meta, header= T, sep= ',')
 
 # Parse vector inputs
 sequencing_index_cols= unlist(strsplit(args$sequencing_index_cols, ","))
 id_cols= unlist(strsplit(args$id_cols, ","))
 
-# Validation: Check that sequencing_index_cols are from sample meta column names ----
-if(!all(sequencing_index_cols %in% colnames(sample_meta))) {
-  stop(paste('The following sequencing_index_cols were not found in the sample meta: ',
-             sequencing_index_cols[!sequencing_index_cols %in% colnames(sample_meta)]))
+# Validation: Check that sequencing_index_cols present in the sample meta ----
+if(!validate_columns_exist(sequencing_index_cols, sample_meta)) {
+  print('The following sequencing_index_cols are not present in the sample meta.')
+  stop('One or more 
sequencing_index_cols is NOT present in the sample meta.')
 }
 
-# Validation: Check that id_cols are from sample meta column names ----
-if(!all(id_cols %in% colnames(sample_meta))) {
-  stop(paste('The following id_cols were not found in the sample meta: ',
-             id_cols[!id_cols %in% colnames(sample_meta)]))
+# Validation: Check that id_cols are present in the sample meta ----
+if(!validate_columns_exist(id_cols, sample_meta)) {
+  stop('One or more id_cols is NOT present in the sample meta.')
 }
 
-# Run collate_fastq_reads on chunks ----
-# Set up loop to process chunks
-header_col_names= data.table::fread(args$raw_counts_uncollapsed, header=T, sep= ',', nrow= 0) %>% colnames()
-chunk_size= 10^6 # Maximum number of rows in a chunk
-chunk_idx= 1 # Counter to keep track of chunks in a loop
-current_chunk_size= chunk_size # Variable for loop exit condition
-chunk_collector= list() # List to collect processed chunks
+# Run collate_fastq_reads on chunks of raw_counts_uncollapsed.csv ----
+summed_reads= process_in_chunks(large_file_path= args$raw_counts_uncollapsed,
+                                chunk_size= 10^6,
+                                action= collate_fastq_reads,
+                                sample_meta= sample_meta,
+                                sequencing_index_cols= sequencing_index_cols,
+                                id_cols= id_cols,
+                                reverse_index2= args$reverse_index2,
+                                barcode_col= args$barcode_col)
 
-# For each chunk, call collate
-while(current_chunk_size == chunk_size) {
-  nori_chunk= data.table::fread(args$raw_counts_uncollapsed, header= F, sep= ',',
-                                col.names= header_col_names,
-                                nrow= chunk_size, skip= chunk_size * (chunk_idx - 1) + 1)
-  
-  current_chunk_size= nrow(nori_chunk) # set current chunk size to stop loop
-  print(paste('Working on chunk', chunk_idx, 'with', current_chunk_size, 'rows.', sep= ' '))
-  
-  chunk_collector[[chunk_idx]]= collate_fastq_reads(nori_chunk, sample_meta,
-                                                    sequencing_index_cols= sequencing_index_cols,
-                                                    id_cols= id_cols,
-                                                    reverse_index2= args$reverse_index2,
-                                                    barcode_col= args$barcode_col)
-  
-  chunk_idx= chunk_idx + 1
-}
+# Sum up the reads across the chunks afterwards!
+summed_reads= summed_reads[, .(n= sum(n)), by= c(id_cols, args$barcode_col)]
+
+# Split reads by either known or unknown ----
+# Reads are separated by whether or not the barcode exists in the PRISM library
+# Read in metadata to get list of all known barcodes
+cell_line_meta= data.table::fread(args$cell_line_meta, header= TRUE, sep= ',')
+CB_meta= data.table::fread(args$CB_meta, header= TRUE, sep= ',')
 
-raw_counts= data.table::rbindlist(chunk_collector)
-raw_counts= raw_counts[, .(n= sum(n)), by= c(id_cols, args$barcode_col)]
+# Call function to separate barcodes
+split_reads= extract_known_barcodes(summed_reads, unique(c(cell_line_meta$Sequence, CB_meta$Sequence)),
+                                    barcode_col= args$barcode_col)
 
 # Validation: Basic file size check ----
-if(nrow(raw_counts) == 0) {
+if(nrow(split_reads$known_reads) == 0) {
   stop('ERROR: Empty file generated. 
No rows in raw_counts output.')
 }
 
-# Write out file ----
-rc_out_file= paste(args$out, 'raw_counts.csv', sep='/')
-print(paste("Writing raw_counts.csv to ", rc_out_file))
-write.csv(raw_counts, rc_out_file, row.names= FALSE, quote= FALSE)
+# Write out files ----
+out_file= paste(args$out, 'unknown_reads.csv', sep='/')
+print(paste("Writing unknown_reads.csv to ", out_file))
+write.csv(split_reads$unknown_reads, out_file, row.names= FALSE, quote= FALSE)
+
+out_file= paste(args$out, 'known_reads.csv', sep='/')
+print(paste("Writing known_reads.csv to ", out_file))
+write.csv(split_reads$known_reads, out_file, row.names= FALSE, quote= FALSE)

From 48aac7435d8a609793116a5b809d7a1cb6fbbffb Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Tue, 24 Sep 2024 17:38:14 -0400
Subject: [PATCH 070/127] Removed unmapped reads part
---
 scripts/src/filter_raw_reads.R | 38 +++++++++++++++++++++-----------
 1 file changed, 26 insertions(+), 12 deletions(-)

diff --git a/scripts/src/filter_raw_reads.R b/scripts/src/filter_raw_reads.R
index cd94c692..4484f7d7 100755
--- a/scripts/src/filter_raw_reads.R
+++ b/scripts/src/filter_raw_reads.R
@@ -96,8 +96,8 @@ filter_raw_reads = function(raw_counts,
                             sample_meta, cell_line_meta, cell_set_meta, CB_meta,
                             id_cols= c('pcr_plate', 'pcr_well'),
                             count_threshold= 40) {
-  require(tidyverse)
   require(magrittr)
+  require(tidyverse)
   
   # Processing metadata and inputs ----
   # CB meta is in log10 and should be converted to log2.
@@ -122,14 +122,6 @@ filter_raw_reads = function(raw_counts,
   # This currently does NOT result in an error. Error avoided using a distinct when creating the template.
   validate_cell_set_luas(sample_meta, cell_set_meta)
   
-  # Split off unmapped reads ----
-  # Unmapped reads are defined as reads that are identified from valid PCR locations,
-  # but do not map to known barcodes in PRISM.
-  # Also sorted reads in descending order by read count.
-  print('Splitting off unmapped reads ...')
-  raw_counts[, mapped := forward_read_cl_barcode %in% c(cell_line_meta$Sequence, CB_meta$Sequence)]
-  unmapped_reads= raw_counts[mapped==FALSE,][order(-n)][, mapped:= NULL]
-  
   # Creating a template of all expected reads in the run ----
   # Use all 4 meta data files to create a "template" dataframe where
   # every row is a cell line that is expected in a PCR well.
@@ -157,7 +149,30 @@ filter_raw_reads = function(raw_counts,
   # Reads that do not match to the template are contaminants and,
   # reads that are only present in the template are missing/not detected by PCR. 
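# [Editor's note] A compact sketch of that annotate-against-template idea: a
# full join keeps contaminants (n > 0, expected_read FALSE) as well as expected
# barcodes that were never sequenced (n = 0). Column names here are illustrative.
library(dplyr)
library(tidyr)
observed= data.frame(barcode= c('AAA', 'GGG'), n= c(10L, 2L))
template= data.frame(barcode= c('AAA', 'CCC'), expected_read= TRUE)
annotated= observed %>%
  dplyr::full_join(template, by= 'barcode') %>%
  dplyr::mutate(n= tidyr::replace_na(n, 0L),
                expected_read= tidyr::replace_na(expected_read, FALSE))
# AAA: expected and detected; GGG: contaminant; CCC: expected but missing.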
print("Annotating reads ...") - annotated_counts= raw_counts %>% dplyr::filter(mapped) %>% + # Data.table version # + # # Left join cell_line_meta using data.table inplace left join + # raw_counts[cell_line_meta, base::setdiff(colnames(cell_line_meta), c('Sequence')) := + # base::mget(base::setdiff(colnames(cell_line_meta), c('Sequence'))), + # on= c('forward_read_cl_barcode' = 'Sequence')] + # # Left join CB_meta using data.table inplace left join + # raw_counts[CB_meta, base::setdiff(colnames(CB_meta), c('Sequence')) := + # base::mget(base::setdiff(colnames(CB_meta), c('Sequence'))), + # on= c('forward_read_cl_barcode' = 'Sequence')] + # # Left join CB_meta using data.table inplace left join + # raw_counts[sample_meta, base::setdiff(colnames(sample_meta), id_cols) := + # base::mget(base::setdiff(colnames(sample_meta), id_cols)), + # on= id_cols] + # data.table::setnames(raw_counts, 'forward_read_cl_barcode', 'Sequence') + # + # annotated_counts= data.table::merge.data.table( + # raw_counts, data.table::setDT(template %>% dplyr::mutate(expected_read= T)), + # by= intersect(colnames(template), colnames(raw_counts)), all.x= TRUE, all.y= TRUE, + # allow.cartesian= FALSE) %>% + # dplyr::select(!any_of(c('prism_cell_set', 'members', 'mapped'))) %>% + # dplyr::mutate(n= replace_na(n, 0), expected_read= replace_na(expected_read, F)) + + # Dplyr version # + annotated_counts= raw_counts %>% dplyr::left_join(cell_line_meta, by= join_by('forward_read_cl_barcode'=='Sequence'), relationship= 'many-to-one') %>% dplyr::left_join(CB_meta, by= join_by('forward_read_cl_barcode'=='Sequence'), @@ -191,8 +206,7 @@ filter_raw_reads = function(raw_counts, } print('Filter_raw_reads has completed!') - return(list(unmapped_reads= unmapped_reads, - annotated_counts= annotated_counts, + return(list(annotated_counts= annotated_counts, filtered_counts= filtered_counts)) } From dac0ca7c05da5884f0a717175aaddb165f261ac6 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Tue, 24 Sep 2024 17:46:30 -0400 Subject: [PATCH 071/127] Added cell line meta and CB meta --- scripts/collate_fastq_reads.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/collate_fastq_reads.R b/scripts/collate_fastq_reads.R index 86a82298..ca00205f 100644 --- a/scripts/collate_fastq_reads.R +++ b/scripts/collate_fastq_reads.R @@ -21,6 +21,8 @@ parser$add_argument("--reverse_index2", type="logical", default=FALSE, help= "Reverse complement of index 2 for NovaSeq and NextSeq") parser$add_argument("--barcode_col", default= "forward_read_cl_barcode", help= "Name of the column in uncollapsed_raw_counts that contains the barcode sequences.") +parser$add_argument("--cell_line_meta", default="cell_line_meta.csv", help= "Cell Line metadata") +parser$add_argument("--CB_meta", default="CB_meta.csv", help = "Control Barcode metadata") parser$add_argument("-o", "--out", default=getwd(), help = "Output path. 
Default is working directory") # get command line options, if help option encountered print help and exit From f8cd88be6f2150818095f840cb7c0d260558f737 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Tue, 24 Sep 2024 17:58:36 -0400 Subject: [PATCH 072/127] Added cell line meta and cb meta as inputs --- scripts/collate_fastq_reads.sh | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/scripts/collate_fastq_reads.sh b/scripts/collate_fastq_reads.sh index 7fe46214..16735e99 100644 --- a/scripts/collate_fastq_reads.sh +++ b/scripts/collate_fastq_reads.sh @@ -79,6 +79,22 @@ else SAMPLE_META=$BUILD_DIR/$SAMPLE_META fi +#Enforces abs paths +if [[ "$CELL_LINE_META" = /* ]] +then + CELL_LINE_META=$(ls $CELL_LINE_META) +else + CELL_LINE_META=$BUILD_DIR/$CELL_LINE_META +fi + +#Enforces abs paths +if [[ "$CONTROL_BARCODE_META" = /* ]] +then + CONTROL_BARCODE_META=$(ls $CONTROL_BARCODE_META) +else + CONTROL_BARCODE_META=$BUILD_DIR/$CONTROL_BARCODE_META +fi + echo Build dir is: $BUILD_DIR PROJECT_DIR=$(dirname "$BUILD_DIR") @@ -86,6 +102,8 @@ PROJECT_CODE=$(basename "$PROJECT_DIR") echo Project Code: $PROJECT_CODE echo REVERSE_INDEX2 is: $REVERSE_INDEX2 +echo CONTROL_BARCODE_META is: $CONTROL_BARCODE_META +echo CELL_LINE_META is: $CELL_LINE_META args=( --raw_counts_uncollapsed "$RAW_COUNTS_UNCOLLAPSED" @@ -94,6 +112,8 @@ args=( --sequencing_index_cols="$SEQUENCING_INDEX_COLS" --id_cols "$ID_COLS" --reverse_index2 "$REVERSE_INDEX2" +--cell_line_meta "$CELL_LINE_META" +--CB_meta "$CONTROL_BARCODE_META" ) echo Rscript collate_fastq_reads.R "${args[@]}" From 29a999c88e6803658c1d821f40e998d4d4719b61 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 25 Sep 2024 14:15:56 -0400 Subject: [PATCH 073/127] Fixed a comment --- scripts/src/collate_fastq_reads.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index 23ce723a..8cbd25b0 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -88,7 +88,7 @@ process_in_chunks= function(large_file_path, chunk_size= 10^6, action, ...) 
{ current_chunk_size= chunk_size # Variable for loop exit condition chunk_collector= list() # List to collect processed chunks - # For each chunk, call collate + # For each chunk, call an action while(current_chunk_size == chunk_size) { current_chunk= data.table::fread(large_file_path, header= FALSE, sep= ',', col.names= header_col_names, From 3a268ce1c092cf0f066fcc486eecf6170f485ec5 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 25 Sep 2024 14:16:11 -0400 Subject: [PATCH 074/127] Added chunking for some QC figures --- scripts/src/QC_images.R | 146 +++++++++++++++++++++------------------- 1 file changed, 78 insertions(+), 68 deletions(-) diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R index 3ad0db66..b0c0c1dd 100755 --- a/scripts/src/QC_images.R +++ b/scripts/src/QC_images.R @@ -39,7 +39,7 @@ get_index_summary= function(df, index_col, valid_indices) { output_summary= df %>% dplyr::group_by(pick(all_of(index_col))) %>% dplyr::summarise(idx_n= sum(n)) %>% dplyr::ungroup() %>% dplyr::mutate(fraction= round(idx_n/sum(idx_n), 5), - expected= ifelse(.[[index_col]] %in% valid_indices, T, F), + expected= ifelse(.[[index_col]] %chin% valid_indices, T, F), contains_n= ifelse(grepl('N', .[[index_col]]), T, F), lv_dist= apply(stringdist::stringdistmatrix(.[[index_col]], valid_indices, method="lv"), 1, min), @@ -59,22 +59,37 @@ get_index_summary= function(df, index_col, valid_indices) { #' @param value_col String name of the counts column present all three dataframes. #' @param file_path Location to write out the output. #' @returns Writes out a QC_table to the file_path. -create_qc_table= function(raw_counts_uncollapsed, raw_counts, filtered_counts, value_col= 'n', file_path) { - # Validation: Check that value_col is present in the three files. 
-  if(!validate_columns_exist(value_col, raw_counts_uncollapsed)) {
+create_qc_table= function(raw_counts_uncollapsed_filepath, unknown_reads, known_reads, filtered_counts,
+                          value_col= 'n', file_path) {
+  # Validations: Check that the path works and that value_col exists in all tables
+  if(!file.exists(raw_counts_uncollapsed_filepath)) {
+    stop('Cannot find the raw counts uncollapsed file.')
+  }
+  rcu_headers= data.table::fread(raw_counts_uncollapsed_filepath, header= TRUE, sep= ',', nrow= 0)
+  if(!validate_columns_exist(value_col, rcu_headers)) {
     stop(paste0('The column ', value_col, " was not detected in uncollapsed raw counts."))
   }
-  if(!validate_columns_exist(value_col, raw_counts)) {
-    stop(paste0('The column ', value_col, " was not detected in raw counts."))
+  if(!validate_columns_exist(value_col, unknown_reads)) {
+    stop(paste0('The column ', value_col, " was not detected in unknown_reads.csv"))
+  }
+  if(!validate_columns_exist(value_col, known_reads)) {
+    stop(paste0('The column ', value_col, " was not detected in known_reads.csv"))
   }
   if(!validate_columns_exist(value_col, filtered_counts)) {
-    stop(paste0('The column ', value_col, " was not detected in filtered counts."))
+    stop(paste0('The column ', value_col, " was not detected in filtered_counts.csv"))
   }
   
-  # Calculate purities
-  index_purity= sum(raw_counts[[value_col]]) / sum(raw_counts_uncollapsed[[value_col]])
+  # Determine total number of reads
+  chunk_sum= process_in_chunks(large_file_path= raw_counts_uncollapsed_filepath,
+                               chunk_size= 10^6,
+                               action= function(x) data.table::as.data.table(sum(x[[value_col]])))
+  total_num_reads= sum(unlist(chunk_sum))
+
+  # Calculate purities
+  index_purity= (sum(unknown_reads[[value_col]]) + sum(known_reads[[value_col]])) / total_num_reads
   print(paste0('Index purity: ', round(index_purity, 4)))
-  cell_line_purity= sum(filtered_counts[[value_col]]) / sum(raw_counts[[value_col]])
+  cell_line_purity= sum(filtered_counts[[value_col]]) / (sum(unknown_reads[[value_col]]) + sum(known_reads[[value_col]]))
   print(paste0('Cell line purity: ', round(cell_line_purity, 4)))
   
   qc_table= data.frame(index_purity= index_purity, cell_line_purity= cell_line_purity)
@@ -83,6 +98,30 @@ create_qc_table= function(raw_counts_uncollapsed, raw_counts, filtered_counts, v
   qc_table %>% write.csv(file_path, row.names= FALSE, quote= FALSE)
 }
 
+process_in_chunks= function(large_file_path, chunk_size= 10^6, action, ...) 
{ + + header_col_names= data.table::fread(large_file_path, header= TRUE, sep= ',', nrow= 0) %>% colnames() + chunk_idx= 1 # Counter to keep track of chunks in a loop + current_chunk_size= chunk_size # Variable for loop exit condition + chunk_collector= list() # List to collect processed chunks + + # For each chunk, call an action + while(current_chunk_size == chunk_size) { + current_chunk= data.table::fread(large_file_path, header= FALSE, sep= ',', + col.names= header_col_names, + nrow= chunk_size, skip= chunk_size * (chunk_idx - 1) + 1) + + current_chunk_size= nrow(current_chunk) # set current chunk size to stop loop + print(paste('Working on chunk', chunk_idx, 'with', current_chunk_size, 'rows.', sep= ' ')) + + chunk_collector[[chunk_idx]]= do.call(action, list(current_chunk, ...)) + chunk_idx= chunk_idx + 1 + } + + output_table= data.table::rbindlist(chunk_collector) + return(output_table) +} + #' Total counts barplot #' #' Creates the total counts barplot with bars colored by the barcode type, @@ -468,6 +507,7 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, # Required packages ---- require(tidyverse) require(magrittr) + require(data.table) require(reshape2) require(WGCNA) require(scales) @@ -495,30 +535,44 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, # Sequencing QCs ____________________ ---- ## 1. Purity metrics ---- print('1. Generating QC table ...') - create_qc_table(raw_counts_uncollapsed, raw_counts, filtered_counts, + create_qc_table(raw_counts_uncollapsed, + unknown_reads= unknown_reads, + known_reads= known_reads, + filtered_counts, value_col= 'n', file_path= paste0(out, '/QC_table.csv')) ## 2. Index count summaries ---- print("2. Generating index counts tables ...") - # Check that "IndexBarcode1" and "index_1" columns are present. - # If so, calculate index summary and write out. - if('index_1' %in% colnames(sample_meta) & 'index_1' %in% colnames(raw_counts_uncollapsed)) { + + # Pull out headers to perform checks + raw_counts_uncollapsed_headers= data.table::fread(raw_counts_file_path, header= TRUE, sep= ',', nrow= 0) + + # Check that "index_1" is present. If so, calculate index summary and write out. + if('index_1' %in% colnames(sample_meta) & 'index_1' %in% colnames(raw_counts_uncollapsed_headers)) { expected_index1= unique(sample_meta$index_1) - index1_counts= get_index_summary(raw_counts_uncollapsed, 'index_1', expected_index1) + # Aggregate by index_1 using chunks + index1_chunks= process_in_chunks(large_file_path= raw_counts_file_path, chunk_size= 10^6, + action= function(x) x[, list(n= sum(n)), by= index_1]) + index1_counts= get_index_summary(index1_chunks, 'index_1', expected_index1) index1_counts %>% write.csv(file= paste(out, 'index1_counts.csv', sep='/'), row.names=F) } else { print('Column "index_1" not detected. Skipping index 1 summaries ...', quote= FALSE) } # Do the same for index 2. 
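# [Editor's note] The index_2 handling below reuses the reverse-complement
# one-liner from collate_fastq_reads; a quick worked example ('ACGTT' is an
# arbitrary barcode, not a real index):
library(stringi)
chartr("ATGC", "TACG", stringi::stri_reverse('ACGTT'))
#> [1] "AACGT"   # reverse of ACGTT is TTGCA; complementing each base gives AACGT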
   # Do the same for index 2.
-  # Reverse index 2 barcodes if it is indicated and if "index_2" exisits
+  # Reverse index 2 barcodes if it is indicated and if "index_2" exists
   if(reverse_index2 & 'index_2' %in% colnames(sample_meta) ) {
     print("Reverse-complementing index 2 barcode.")
     sample_meta$index_2= chartr("ATGC", "TACG", stringi::stri_reverse(sample_meta$index_2))
   }
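The reverse complement above is built from stringi::stri_reverse plus chartr; a quick sanity check on a toy barcode:

chartr("ATGC", "TACG", stringi::stri_reverse("ACCGT"))
# stri_reverse gives "TGCCA"; chartr then swaps A<->T and G<->C, yielding "ACGGT",
# which is the reverse complement of "ACCGT".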
-  if('index_2' %in% colnames(sample_meta) & 'index_2' %in% colnames(raw_counts_uncollapsed)) {
+  if('index_2' %in% colnames(sample_meta) & 'index_2' %in% colnames(raw_counts_uncollapsed_headers)) {
     expected_index2= unique(sample_meta$index_2)
+
+    # Aggregate by index_2 using chunks
+    index2_chunks= process_in_chunks(large_file_path= raw_counts_file_path, chunk_size= 10^6,
+                                     action= function(x) x[, list(n= sum(n)), by= index_2])
+
     index2_counts= get_index_summary(index2_chunks, 'index_2', expected_index2)
     index2_counts %>% write.csv(file= paste(out, 'index2_counts.csv', sep='/'), row.names=F)
   } else {
@@ -594,59 +648,15 @@ QC_images= function(raw_counts_uncollapsed, raw_counts,
   ## 6. Contaminant reads ----
   print('6. Generating contaminant reads ...')
   potential_error= base::tryCatch({
-    pcr_locations= c('pcr_plate', 'pcr_well')
-
-    # Validation: Check that the PCR columns are present in raw_counts.
-    if(!validate_columns_exist(pcr_locations, raw_counts)) {
-      stop('pcr_plate and pcr_well are required in raw_counts.csv for this to work.')
-    }
-
-    # count number of wells a cell_set appears in.
-    pcr_plate_map= sample_meta %>% dplyr::distinct(pick(any_of(c(pcr_locations, 'cell_set')))) %>%
-      dplyr::group_by(pcr_plate) %>% dplyr::mutate(num_wells_in_plate= dplyr::n()) %>% dplyr::ungroup() %>%
-      dplyr::group_by(cell_set) %>% dplyr::mutate(num_wells_in_set= dplyr::n()) %>% dplyr::ungroup()
-
-    # index filter and identify reads as mapped or not
-    sequencing_filter= raw_counts %>%
-      dplyr::mutate(mapped= forward_read_cl_barcode %in% unique(annotated_counts$forward_read_cl_barcode))
-
-    # total counts per well - used to calculate fractions
-    counts_per_well= sequencing_filter %>% dplyr::group_by(pick(all_of(pcr_locations))) %>%
-      dplyr::summarise(well_total_n= sum(n)) %>% dplyr::ungroup()
-
-    # mapped contaminates to bind
-    mapped_contams= annotated_counts %>% dplyr::filter(!expected_read) %>%
-      dplyr::mutate(barcode_name= ifelse(is.na(CCLE_name), Name, CCLE_name)) %>%
-      dplyr::select(all_of(c(pcr_locations, 'forward_read_cl_barcode', 'n', 'barcode_name')))
+    # Simplified approach: summarize unknown reads and unexpected annotated reads by barcode.
+    summed_unknown_reads= unknown_reads[, list(num_reads = sum(n), num_wells= .N),
+                                        by= base::mget('forward_read_cl_barcode')]
+    summed_contams= annotated_counts[expected_read == FALSE, list(num_reads = sum(n), num_wells= .N),
+                                     by= base::mget(c('forward_read_cl_barcode', 'DepMap_ID', 'cb_name'))]
+    summed_contams[, barcode_name:= ifelse(is.na(DepMap_ID), cb_name, DepMap_ID)][,DepMap_ID:= NULL]
 
-    contam_reads= sequencing_filter %>% dplyr::filter(mapped == FALSE) %>% dplyr::select(-mapped) %>%
-      dplyr::bind_rows(mapped_contams) %>%
-      dplyr::left_join(counts_per_well, by= pcr_locations) %>%
-      dplyr::left_join(pcr_plate_map, by= pcr_locations) %>%
-      # filter out barcodes that only appear in one well
-      dplyr::group_by(forward_read_cl_barcode) %>% dplyr::filter(dplyr::n() >1) %>% dplyr::ungroup() %>%
-      # number of wells in a pcr plate a barcode is detected in
-      dplyr::group_by(forward_read_cl_barcode, pcr_plate) %>%
-      dplyr::mutate(num_wells_detected_plate= dplyr::n()) %>% dplyr::ungroup() %>%
-      # number of wells in a cell set a barcode is detected in
-      dplyr::group_by(forward_read_cl_barcode, cell_set) %>%
-      dplyr::mutate(num_wells_detected_set= dplyr::n()) %>% dplyr::ungroup() %>%
-      # determine if contamination is project, plate, or set
-      dplyr::group_by(forward_read_cl_barcode) %>%
-      dplyr::mutate(num_wells_detected= dplyr::n(),
-                    project_code= unique(sample_meta$project_code),
-                    fraction= n/well_total_n,
-                    type1= ifelse(sum(num_wells_detected== nrow(pcr_plate_map))>1, 'project_contam', NA),
-                    type2= ifelse(sum(num_wells_detected== num_wells_detected_plate &
-                                        num_wells_detected_plate == num_wells_in_plate)>1, 'plate_contam', NA),
-                    type3= ifelse(sum(num_wells_detected == num_wells_detected_set &
-                                        num_wells_detected_set== num_wells_in_set)>1, 'set_contam', NA)) %>%
-      dplyr::ungroup() %>%
-      tidyr::unite(scope, all_of(c('type1', 'type2', 'type3')), sep=',', remove = T, na.rm = T) %>%
-      dplyr::group_by(project_code, forward_read_cl_barcode, barcode_name, scope, num_wells_detected) %>%
-      dplyr::summarise(min_n= min(n), med_n= median(n), max_n= max(n),
-                       min_fraction= min(fraction), med_fraction= median(fraction), max_fraction=max(fraction)) %>%
-      dplyr::arrange(desc(max_fraction))
+    contam_reads= data.table::rbindlist(list(summed_contams, summed_unknown_reads), fill= TRUE) %>%
+      dplyr::arrange(dplyr::desc(num_reads))
 
     # write out
     contam_reads %>% write.csv(paste0(out, 'contam_reads.csv'), row.names=F)
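Since summed_contams carries a barcode_name column that summed_unknown_reads lacks, rbindlist(fill= TRUE) pads the missing column with NA. A minimal sketch with hypothetical values:

a= data.table::data.table(forward_read_cl_barcode= 'AACTG', num_reads= 12L, barcode_name= 'ACH-000001')
b= data.table::data.table(forward_read_cl_barcode= 'GGTTA', num_reads= 3L)
data.table::rbindlist(list(a, b), fill= TRUE)
# Row 2 gets barcode_name= NA because the column is absent from b.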
From b914add9d959fb27503cc2b9b4e26128afae9d99 Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Thu, 3 Oct 2024 17:29:40 -0400
Subject: [PATCH 075/127] Sum up reads and split on known or unknown reads

---
 scripts/src/collate_fastq_reads.R | 269 ++++++++++--------------------
 1 file changed, 87 insertions(+), 182 deletions(-)

diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R
index 8cbd25b0..1aba70a9 100644
--- a/scripts/src/collate_fastq_reads.R
+++ b/scripts/src/collate_fastq_reads.R
@@ -1,71 +1,15 @@
-#' validate_columns_exist
-#'
-#' This function checks that a list of columns are present in a dataframe.
-#' Columns that were not found in the dataframe are printed out.
-#'
-#' @param selected_columns A vector of strings each representing a column name
-#' @param df A dataframe to check against
-#' @return Boolean
-validate_columns_exist= function(selected_cols, df) {
-  # Check that all of selected_columns are in df - base::setdiff(A, B) = A[!A %in% B].
-  unmatched_cols= base::setdiff(selected_cols, colnames(df))
-
-  if(length(unmatched_cols) > 0) {
-    print('The following columns are missing: ')
-    print(unmatched_cols)
-    return(FALSE)
-  } else {
-    return(TRUE)
-  }
-}
-
-#' validate_columns_entries
-#'
-#' This function checks that for a list of columns, all entries are filled in.
-#' It checks all column entries against a list of potential empty values.
-#'
-#' @param selected_columns A vector of strings each representing a column name
-#' @param df A dataframe to check against
-#' @param empty_values Optional vector of values that equate to empty. Defaults to NA, "NA", "", and " ".
-#' @return Boolean
-validate_columns_entries= function(selected_columns, df, empty_values= c(NA, 'NA', '', ' ')) {
-  # Check for rows in selected_columns that equate to predefined empty values.
-  missing_rows= df %>% dplyr::filter(if_any(all_of(selected_columns), ~ . %in% empty_values))
-  if(nrow(missing_rows) > 0) {
-    print('The following rows in the sample meta are not filled out for the sequencing index columns.')
-    print(missing_rows) # show the empty rows
-    return(FALSE)
-  } else {
-    return(TRUE)
-  }
-}
-
-#' validate_unique_samples
-#'
-#' This function checks that a list of columns uniquely identifies all entries of a dataframe.
-#'
-#' @param selected_columns A vector of strings each representing a column name
-#' @param df A dataframe to check against
-#' @return Boolean
-validate_unique_samples= function(selected_columns, df) {
-  unique_column_values= df %>% dplyr::distinct(pick(all_of(selected_columns)))
-  if(nrow(unique_column_values) != nrow(df)) {
-    return(FALSE)
-  } else {
-    return(TRUE)
-  }
-}
-
 #' validate_detected_flowcells
 #'
-#' This function checks that all the expected flowcells are present in a table of detected flowcells.
-#' There can be more detected flowcells than there are expected flowcells.
+#' This function checks the table of expected flowcells against the table of detected flowcells.
+#' Expected flowcells that were not detected are printed along with a warning.
 #'
 #' @param detected_flowcells A dataframe with the columns "flowcell_name" and "flowcell_lane".
 #' @param expected_flowcells A dataframe with the columns "flowcell_name" and "flowcell_lane".
 validate_detected_flowcells= function(detected_flowcells, expected_flowcells) {
+  # Use dplyr::anti_join to filter out rows in expected_flowcells that appear in detected_flowcells.
   missing_flowcells= expected_flowcells %>%
     dplyr::anti_join(detected_flowcells, by= c('flowcell_name', 'flowcell_lane'))
 
+  # Print a warning if there are expected flowcells that were not detected!
   if(nrow(missing_flowcells) != 0) {
     print('WARNING: The following flowcells/lanes specified in the sample meta were not detected in the fastq reads.')
     print(missing_flowcells)
   }
 }
 
-#' process_in_chunks
-#'
-#' This function runs some action over chunks of a large file. At the end, all chunks are
-#' appended together.
-#'
-#' @param large_file_path description
-#' @param chunk_size description
-#' @param action A function passed to act on each chunk
-process_in_chunks= function(large_file_path, chunk_size= 10^6, action, ...) {
-
-  header_col_names= data.table::fread(large_file_path, header= TRUE, sep= ',', nrow= 0) %>% colnames()
-  chunk_idx= 1 # Counter to keep track of chunks in a loop
-  current_chunk_size= chunk_size # Variable for loop exit condition
-  chunk_collector= list() # List to collect processed chunks
-
-  # For each chunk, call an action
-  while(current_chunk_size == chunk_size) {
-    current_chunk= data.table::fread(large_file_path, header= FALSE, sep= ',',
-                                     col.names= header_col_names,
-                                     nrow= chunk_size, skip= chunk_size * (chunk_idx - 1) + 1)
-
-    current_chunk_size= nrow(current_chunk) # set current chunk size to stop loop
-    print(paste('Working on chunk', chunk_idx, 'with', current_chunk_size, 'rows.', sep= ' '))
-
-    chunk_collector[[chunk_idx]]= do.call(action, list(current_chunk, ...))
-    chunk_idx= chunk_idx + 1
-  }
-
-  output_table= data.table::rbindlist(chunk_collector)
-  return(output_table)
-}
-
 #' collate_fastq_reads
 #'
 #' This function takes in the fastq reads (uncollapsed_raw_counts) and
@@ -127,6 +39,8 @@ process_in_chunks= function(large_file_path, chunk_size= 10^6, action, ...) {
#' include any sequencing related columns. This parameter defaults to "pcr_plate", "pcr_well". This
#' parameter can also be a list of the sample conditions columns as long as they uniquely identify every
#' PCR well. For example "cell_set", "treatment", "dose", "day", "bio_rep", "tech_rep" can also be used.
+#' @param known_barcodes A vector of known PRISM barcodes. If a read does not match a barcode in this list,
+#' then its sequence is reassigned to "unknown_reads".
 #' @param reverse_index2 Index 2 should be reversed if the sequencer uses a reverse complement workflow.
 #' Defaults to FALSE.
 #' @param barcode_col String name of the column in uncollapsed_raw_counts that contains the sequences.
@@ -136,48 +50,61 @@ process_in_chunks= function(large_file_path, chunk_size= 10^6, action, ...) {
 collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta,
                               sequencing_index_cols= c('index_1', 'index_2'),
                               id_cols= c('pcr_plate', 'pcr_well'),
+                              known_barcodes,
                               reverse_index2= FALSE,
                               barcode_col= 'forward_read_cl_barcode') {
   require(tidyverse)
   require(data.table)
 
-  # Reverse index 2 if specified ----
-  if(reverse_index2) {
-    if('index_2' %in% colnames(sample_meta)) {
-      print("Reverse-complementing index 2 barcode ...")
-      sample_meta$index_2= chartr("ATGC", "TACG", stringi::stri_reverse(sample_meta$index_2))
-    } else {
-      stop('Reverse index 2 is set to TRUE, but index_2 does not exists.')
-    }
+  # Validation: Check that sequencing_index_cols exist in the sample_meta ----
+  # Error out if a sequencing_index_col is not in the sample_meta.
+  if(!validate_columns_exist(sequencing_index_cols, sample_meta)) {
+    stop('The above sequencing_index_cols are NOT present in the sample meta.')
   }
 
-  # Create sequence map ----
-  sequencing_map= sample_meta %>% dplyr::distinct(pick(all_of(c(sequencing_index_cols, id_cols))))
-
-  # Validation: Check that flowcell_names and flowcell_lanes exist in the sample meta ----
-  if(!validate_columns_exist(c('flowcell_names', 'flowcell_lanes'), sample_meta)) {
-    stop('The above column(s) are NOT present in the sample meta.')
+  # Validation: Check that sequencing_index_cols in the sample meta are filled out ----
+  # Check for rows in sequencing_index_cols that equate to empty - NA, "NA", "", " "
+  # Error out if the sequencing_index_cols are not filled out in the sample meta.
+  if(!validate_columns_entries(sequencing_index_cols, sample_meta)) {
+    stop('One or more sequencing_index_cols in the sample meta is not filled out.')
   }
 
-  # Validation: Check that sequencing_index_cols exist in the sample meta ----
-  if(!validate_columns_exist(sequencing_index_cols, sample_meta)) {
-    print('The following sequencing_index_cols are not present in the sample meta.')
-    stop('The above sequencing_index_cols are NOT present in the sample meta.')
+  # Validation: Check that sequencing_index_cols uniquely identify every row of the sample_meta ----
+  # Error out if the sequencing_index_cols do not uniquely identify every row of the sample_meta.
+  if(!validate_unique_samples(sequencing_index_cols, sample_meta)) {
+    print('There may be multiple entries in the sample meta that have the same combination of sequencing index columns.')
+    stop('The specified sequencing index columns do NOT uniquely identify every PCR well.')
   }
 
   # Validation: Check that id_cols exist in the sample meta ----
+  # Error out if an id_col is not detected in the sample_meta.
  if(!validate_columns_exist(id_cols, sample_meta)) {
     stop('One or more id_cols is NOT present in the sample meta.')
   }
 
-  # Validation: Check that sequencing_index_cols in the sample meta are filled out ----
-  # Check for rows in sequencing_index_cols that equate to empty - NA, "NA", "", " "
-  # Error out of the sequencing_index_cols are not filled out in the sample meta.
-  if(!validate_columns_entries(sequencing_index_cols, sample_meta)) {
-    stop('One or more sequencing_index_cols in the sample meta is not filled out.')
+  # Validation: Check that id_cols uniquely identify every row of the sample_meta ----
+  if(!validate_unique_samples(id_cols, sample_meta)) {
+    print('There may be multiple entries in the sample meta that have the same combination of ID columns.')
+    stop('The specified ID columns do NOT uniquely identify every PCR well.')
   }
 
+  # Reverse index 2 if specified ----
+  if(reverse_index2) {
+    if('index_2' %in% colnames(sample_meta)) {
+      print("Reverse-complementing index 2 barcode ...")
+      sample_meta$index_2= chartr("ATGC", "TACG", stringi::stri_reverse(sample_meta$index_2))
+    } else {
+      stop('Reverse index 2 is set to TRUE, but index_2 does not exist.')
+    }
+  }
+
+  # Create sequence map ----
+  # Sequencing map is used to map combinations of the sequencing_index_cols to combinations of the id_cols.
+  sequencing_map= sample_meta %>% dplyr::distinct(pick(all_of(c(sequencing_index_cols, id_cols))))
+
   # Validation: Check that mapping is one to one ----
+  # Make sure that the mapping from sequencing_index_cols to id_cols is 1 to 1.
+  # Code below groups on the sequencing_index_cols and filters for combinations that map to more than one id_col combination.
   check_mapping= sequencing_map %>% dplyr::group_by(pick(all_of(sequencing_index_cols))) %>%
     dplyr::filter(dplyr::n() > 1) %>% dplyr::ungroup()
   if(nrow(check_mapping) > 0) {
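To see what this one-to-one check catches, consider a hypothetical sequencing map in which one index pair points at two different wells:

sequencing_map= tibble::tribble(~index_1, ~index_2, ~pcr_plate, ~pcr_well,
                                'ACTG',   'GGAA',   1,          'A1',
                                'ACTG',   'GGAA',   1,          'A2')
# Grouping by the index columns and keeping groups with n() > 1 retains both rows,
# so nrow(check_mapping) > 0 and the function stops with the error below.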
@@ -186,76 +113,71 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta,
     stop('The sequencing index columns do not map 1 to 1 to the ID columns.')
   }
 
-  # If "flowcell_name" and "flowcell_lane" are present, filter for valid flowcells ----
-  # Note: Can this switch be tied to the sequencer type?
+  # Determine if 'flowcell_name' and 'flowcell_lane' are present in the uncollapsed raw counts file ----
+  # If the columns are present, assume that uncollapsed_raw_counts is from Nori and filter for only valid flowcells.
+  # If not, the file could be from a MiSeq run or something else outside of Nori, so skip this filter step.
   if(base::all(c('flowcell_name', 'flowcell_lane') %in% colnames(uncollapsed_raw_counts))) {
-    print('Detecting flowcells. Filtering for valid flowcells ...')
+    print('Detecting flowcell_name and flowcell_lane. Filtering for valid flowcells.')
+    # Validation: Check that flowcell_names and flowcell_lanes exist in the sample meta ----
+    if(!validate_columns_exist(c('flowcell_names', 'flowcell_lanes'), sample_meta)) {
+      stop('The above column(s) are NOT present in the sample meta.')
+    }
 
     # Determine which flowcell names + lanes are expected ----
     # "flowcell_names" and "flowcell_lanes" are strings that can contain more than one item.
-    # Columns can be parsed by splitting on the chars , ; :
-    # If there are multiple lane names and lane numbers, this uses the Cartesian product!
+    # Columns can be parsed by splitting on the characters , ; :
+    # If there are multiple lane names and lane numbers, this will take the Cartesian product!
     # Note: fread and read.csv keep commas, read_csv DROPS commas
     expected_flowcells= sample_meta %>% dplyr::distinct(flowcell_names, flowcell_lanes) %>%
       dplyr::mutate(flowcell_name= base::strsplit(flowcell_names, split='[,;:]', fixed=F),
                     flowcell_lane= base::strsplit(flowcell_lanes, split='[,;:]', fixed=F)) %>%
      tidyr::unnest(cols= flowcell_name) %>% tidyr::unnest(cols= flowcell_lane) %>%
      dplyr::mutate(flowcell_lane= as.numeric(flowcell_lane))
+    # Note: This code uses base::strsplit and tidyr::unnest from an older version of tidyverse.
+    # If there is any update to the tidyverse version, this can be refactored to use
+    # tidyr::separate_longer_delim
 
-    # Print out expected flowcells from the sample meta.
-    print(paste0('Identified ', nrow(expected_flowcells), ' unique flowcell + lane combos in the sample meta ...'))
-    print(expected_flowcells)
-
-    # Print warning if there are multiple flowcell names with multiple flowcell lanes.
-    multi_name_and_lanes= expected_flowcells %>% dplyr::filter(grepl(',:;', flowcell_names) & grepl(',:;', flowcell_names))
-    if(nrow(multi_name_and_lanes) > 0) {
-      print('WARNING: Detected sample(s) sequenced over multiple flowcells and flowcell lanes.')
-      print('The function assumes that the same lanes were used for both flowcells.')
-    }
-
-    # Validation: Check that all expected flowcell name + lanes are detected ----
-    # Check that all expected flowcell name + lanes are present in uncollapsed raw counts.
+    # Validation: Check if all expected flowcell name + lanes are detected ----
+    # Check that all expected flowcell name + lanes are present in uncollapsed_raw_counts.
+    # Print warning if a flowcell is expected but not detected.
     detected_flowcells= uncollapsed_raw_counts %>% dplyr::distinct(flowcell_name, flowcell_lane)
-    print(paste0('Identified ', nrow(detected_flowcells), ' unique flowcell + lane combos in the uncollapsed raw counts ...'))
+    print(paste0('Identified ', nrow(detected_flowcells), ' unique flowcell + lane combos in the uncollapsed raw counts.'))
    print(detected_flowcells)
    validate_detected_flowcells(detected_flowcells, expected_flowcells)
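A sketch of how one sample meta row expands under the split-and-unnest above (toy values): 'FCA;FCB' crossed with lanes '1,2' yields four flowcell/lane rows.

data.frame(flowcell_names= 'FCA;FCB', flowcell_lanes= '1,2') %>%
  dplyr::mutate(flowcell_name= base::strsplit(flowcell_names, split='[,;:]'),
                flowcell_lane= base::strsplit(flowcell_lanes, split='[,;:]')) %>%
  tidyr::unnest(cols= flowcell_name) %>% tidyr::unnest(cols= flowcell_lane)
# Four rows: FCA/1, FCA/2, FCB/1, FCB/2 - the Cartesian product the comments warn about.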
-    # Validation: Check that sequencing_index_cols uniquely identify rows of sample meta ----
-    if(!validate_unique_samples(sequencing_index_cols, sample_meta)) {
-      print('There may be multiple entries in the sample meta that have the same combination of sequencing index columns.')
-      stop('The specified sequencing index columns do NOT uniquely identify every PCR well.')
-    }
-
-    # Validation: Check that id_cols uniquely identify rows of sample meta ----
-    if(!validate_unique_samples(id_cols, sample_meta)) {
-      print('There may be multiple entries in the sample meta that have the same combination of ID columns.')
-      stop('The specified ID columns do NOT uniquely identify every PCR well.')
-    }
-
-    # Filter for expected flowcells ----
-    uncollapsed_raw_counts= data.table::merge.data.table(
-      uncollapsed_raw_counts, data.table::setDT(expected_flowcells),
-      by= c('flowcell_name', 'flowcell_lane'), allow.cartesian= FALSE)
+    # Filter for expected flowcells and add names/lanes columns ----
+    # Filter using inner join with merge from data.table instead of dplyr join to improve performance.
+    uncollapsed_raw_counts= data.table::merge.data.table(uncollapsed_raw_counts, data.table::setDT(expected_flowcells),
+                                                         by= c('flowcell_name', 'flowcell_lane'), allow.cartesian= FALSE)
   } else {
-    print('Flowcell_name and/or flowcell_lane were not detected in raw_counts_uncollapsed.')
-    print('Proceeding without filtering flowcells ...')
+    print('Flowcell_name and/or flowcell_lane are not detected in raw_counts_uncollapsed.')
+    print('Proceeding without filtering flowcells.')
+  }
+
+  # Validation: Check that sequencing_index_cols exist in uncollapsed_raw_counts ----
+  if(!validate_columns_exist(sequencing_index_cols, uncollapsed_raw_counts)) {
+    stop('Some sequencing_index_cols are NOT present in the uncollapsed_raw_counts.')
   }
 
   # Create summed_reads file ----
-  # Filter for the expected flowcells and summed up the reads over the ID cols.
-  print('Summing up reads ...')
+  print('Summing up reads.')
+  # Performing inner join with data.table instead of dplyr
   summed_reads= data.table::merge.data.table(uncollapsed_raw_counts, sequencing_map, by= sequencing_index_cols)
+  # Code below is checking if a barcode is in the list of known barcodes.
+  # If the barcode is not in the list of known barcodes, then the barcode is replaced with the string "unknown_reads".
+  # Function := performs the mutate inplace without copying the dataframe.
+  # Functions fifelse and %chin% are just faster data.table versions of ifelse and %in%.
+  summed_reads[, c(barcode_col) := data.table::fifelse(get(barcode_col) %chin% unique(known_barcodes),
                                                        get(barcode_col), 'unknown_reads')]
+  # Use data.table to group by id_cols and barcode_col and sum up reads across flowcells.
   summed_reads= summed_reads[, .(n= sum(n)), by= c(id_cols, barcode_col)]
 
-  # Escape for when a chunk contains invalid sequencing locations
-  if(nrow(summed_reads) == 0) {
-    print('WARNING: summed_reads is empty!')
-    return(summed_reads)
-  }
-
-  # Calculate index purity in a chunk----
+  # Calculate index purity ----
+  # This is only accurate if the Nori input file is small enough to fit into a chunk.
   index_purity= sum(summed_reads$n) / sum(uncollapsed_raw_counts$n)
+  # Throw an error if the purity is greater than 1.
+  # Throw a warning if the purity is below 0.5.
   print(paste0('Index purity in chunk: ', round(index_purity, 4)))
   if(index_purity > 1) {
     stop('ERROR: Chunk index purity is greater than 1!')
@@ -263,25 +185,8 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta,
     print('Warning: Low index purity!')
   } else {}
 
-  print('Collate_fastq_reads has completed!')
-  return(summed_reads)
-}
-
-#' extract_known_barcodes
-#'
-#' This function runs some action over chunks of a large file. At the end, all chunks are
-#' appended together.
-#'
-#' @param raw_counts description
-#' @param known_barcodes A vector known barcodes.
-#' @param barcode_col String name of the column in uncollapsed_raw_counts that contains the sequences.
-extract_known_barcodes= function(summed_reads, known_barcodes, barcode_col= 'forward_read_cl_barcode') { - # Create boolean column of known or unknown - summed_reads[, known := get(barcode_col) %chin% known_barcodes] - - # Filter using that boolean column - unknown_reads= summed_reads[known == FALSE,][order(-n)][, known := NULL] - summed_reads= summed_reads[known == TRUE,][, known := NULL] - - return(list(unknown_reads= unknown_reads, known_reads= summed_reads)) + # Return list of two dfs with known or unknown read counts ---- + print('Completing collate_fastq_reads.') + return(list(prism_barcode_counts= summed_reads[summed_reads[[barcode_col]] != 'unknown_reads',], + unknown_barcode_counts= summed_reads[summed_reads[[barcode_col]] == 'unknown_reads',])) } From 6d8f858a0bfe34659e820b6a5f51a92a98a6558b Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Thu, 3 Oct 2024 17:30:04 -0400 Subject: [PATCH 076/127] Updated names of output files --- scripts/collate_fastq_reads.R | 80 ++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 39 deletions(-) diff --git a/scripts/collate_fastq_reads.R b/scripts/collate_fastq_reads.R index ca00205f..850094a5 100644 --- a/scripts/collate_fastq_reads.R +++ b/scripts/collate_fastq_reads.R @@ -4,25 +4,26 @@ library(magrittr) library(tidyverse) library(data.table) source("./src/collate_fastq_reads.R") +source("./src/kitchen_utensils.R") # Argument parser ---- parser <- ArgumentParser() # specify desired options parser$add_argument("-v", "--verbose", action="store_true", default=TRUE, help="Print extra output [default]") parser$add_argument("-q", "--quietly", action="store_false", dest="verbose", help="Print little output") -parser$add_argument('--raw_counts_uncollapsed', default="raw_counts_uncollapsed.csv", - help="path to file containing uncollapsed raw counts file") -parser$add_argument("--sample_meta", default="sample_meta.csv", help = "Sample metadata") -parser$add_argument("--sequencing_index_cols", default= "index_1,index_2", - help = "Sequencing columns in the sample meta") +parser$add_argument('--raw_counts_uncollapsed', default= "raw_counts_uncollapsed.csv", + help= "path to file containing uncollapsed raw counts file") +parser$add_argument("--sample_meta", default="sample_meta.csv", help= "Sample metadata") +parser$add_argument("--cell_line_meta", default="cell_line_meta.csv", help= "Cell line metadata") +parser$add_argument("--CB_meta", default= "CB_meta.csv", help= "Control Barcode metadata") +parser$add_argument('--sequencing_index_cols', default= 'index_1,index_2', + help= 'List of sequencing columns in the sample meta.') parser$add_argument("--id_cols", default= "pcr_plate,pcr_well", help = "Columns that identify a unique PCR well") parser$add_argument("--reverse_index2", type="logical", default=FALSE, help= "Reverse complement of index 2 for NovaSeq and NextSeq") parser$add_argument("--barcode_col", default= "forward_read_cl_barcode", help= "Name of the column in uncollapsed_raw_counts that contains the barcode sequences.") -parser$add_argument("--cell_line_meta", default="cell_line_meta.csv", help= "Cell Line metadata") -parser$add_argument("--CB_meta", default="CB_meta.csv", help = "Control Barcode metadata") parser$add_argument("-o", "--out", default=getwd(), help = "Output path. 
Default is working directory") # get command line options, if help option encountered print help and exit @@ -33,17 +34,17 @@ if (args$out == "") { args$out = args$wkdir } -# Read in sample meta and parse argument strings ---- -# Read in files and parse vector arguments -sample_meta= data.table::fread(args$sample_meta, header= T, sep= ',') +# Read in metadata files as data.table objects ---- +sample_meta= data.table::fread(args$sample_meta, header= TRUE, sep= ',') +cell_line_meta= data.table::fread(args$cell_line_meta, header= TRUE, sep= ',') +CB_meta= data.table::fread(args$CB_meta, header= TRUE, sep= ',') -# Parse vector inputs +# Parse some parameters into vectors ---- sequencing_index_cols= unlist(strsplit(args$sequencing_index_cols, ",")) id_cols= unlist(strsplit(args$id_cols, ",")) # Validation: Check that sequencing_index_cols present in the sample meta ---- if(!validate_columns_exist(sequencing_index_cols, sample_meta)) { - print('The following sequencing_index_cols are not present in the sample meta.') stop('One or more sequencing_index_cols is NOT present in the sample meta.') } @@ -53,38 +54,39 @@ if(!validate_columns_exist(id_cols, sample_meta)) { } # Run collate_fastq_reads on chunks of raw_counts_uncollapsed.csv ---- -summed_reads= process_in_chunks(large_file_path= args$raw_counts_uncollapsed, - chunk_size= 10^6, - action= collate_fastq_reads, - sample_meta= sample_meta, - sequencing_index_cols= sequencing_index_cols, - id_cols= id_cols, - reverse_index2= args$reverse_index2, - barcode_col= args$barcode_col) - -# Sum up the read across the chunks afterwards! -summed_reads= summed_reads[, .(n= sum(n)), by= c(id_cols, args$barcode_col)] +# raw_counts_uncollapsed can be too large to read into memory, +# so collate_fastq_reads is performed on chunks of the large file. +chunked_results= process_in_chunks(large_file_path= args$raw_counts_uncollapsed, + chunk_size= 10^6, + action= collate_fastq_reads, + # Parameters for collate_fastq_reads + sample_meta= sample_meta, + sequencing_index_cols= sequencing_index_cols, + id_cols= id_cols, + known_barcodes= unique(cell_line_meta$Sequence, CB_meta$Sequence), + reverse_index2= args$reverse_index2, + barcode_col= args$barcode_col) -# Split reads by either known or unknown ---- -# Reads are separated by whether or not the barcode exists in the PRISM library -# Read in metadata to get list of all known barcodes -cell_line_meta= data.table::fread(args$cell_line_meta, header= TRUE, sep= ',') -CB_meta= data.table::fread(args$CB_meta, header= TRUE, sep= ',') +# From each chunk, extract prism_barcode_counts and bind the rows together into one dataframe. +prism_barcode_counts= data.table::rbindlist(lapply(chunked_results, function(x) x$prism_barcode_counts)) +# Use data.table to group_by id_cols and barcode_col to sum up reads across all chunks. +prism_barcode_counts= prism_barcode_counts[, .(n= sum(n)), by= c(id_cols, args$barcode_col)] -# Call function to separate barcodes -split_reads= extract_known_barcodes(summed_reads, unique(c(cell_line_meta$Sequence, CB_meta$Sequence)), - barcode_col= args$barcode) +# From each chunk, extract unknown_barcode_counts and bind the rows together into one dataframe. +unknown_barcode_counts= data.table::rbindlist(lapply(chunked_results, function(x) x$unknown_barcode_counts)) +# Use data.table to group_by id_cols and barcode_col to sum up reads across all chunks. 
-# Call function to separate barcodes
-split_reads= extract_known_barcodes(summed_reads, unique(c(cell_line_meta$Sequence, CB_meta$Sequence)),
-                                    barcode_col= args$barcode)
+# From each chunk, extract unknown_barcode_counts and bind the rows together into one dataframe.
+unknown_barcode_counts= data.table::rbindlist(lapply(chunked_results, function(x) x$unknown_barcode_counts))
+# Use data.table to group_by id_cols and barcode_col to sum up reads across all chunks.
+unknown_barcode_counts= unknown_barcode_counts[, .(n= sum(n)), by= c(id_cols, args$barcode_col)]

# Validation: Basic file size check ----
-if(nrow(split_reads$mapped_reads) == 0) {
-  stop('ERROR: Empty file generated. No rows in raw_counts output.')
+if(nrow(prism_barcode_counts) == 0) {
+  stop('ERROR: Empty file generated. No rows in prism_barcode_counts output.')
}

# Write out files ----
-out_file= paste(args$out, 'unknown_reads.csv', sep='/')
-print(paste("Writing unknown_reads.csv to ", out_file))
-write.csv(split_reads$unknown_reads, out_file, row.names= FALSE, quote= FALSE)
+out_file= paste(args$out, 'prism_barcode_counts.csv', sep='/')
+print(paste("Writing prism_barcode_counts.csv to ", out_file))
+write.csv(prism_barcode_counts, out_file, row.names= FALSE, quote= FALSE)

-out_file= paste(args$out, 'known_reads.csv', sep='/')
-print(paste("Writing known_reads.csv to ", out_file))
-write.csv(split_reads$known_reads, out_file, row.names= FALSE, quote= FALSE)
+out_file= paste(args$out, 'unknown_barcode_counts.csv', sep='/')
+print(paste("Writing unknown_barcode_counts.csv to ", out_file))
+write.csv(unknown_barcode_counts, out_file, row.names= FALSE, quote= FALSE)

From c65e5e86747558a1fb94ee988feec778a272c767 Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Fri, 4 Oct 2024 12:36:24 -0400
Subject: [PATCH 077/127] Added more comments

---
 scripts/src/filter_raw_reads.R | 121 ++++++---------------------------
 1 file changed, 22 insertions(+), 99 deletions(-)

diff --git a/scripts/src/filter_raw_reads.R b/scripts/src/filter_raw_reads.R
index 4484f7d7..1a3384f7 100755
--- a/scripts/src/filter_raw_reads.R
+++ b/scripts/src/filter_raw_reads.R
@@ -1,52 +1,5 @@
 options(cli.unicode = FALSE)
 
-#' validate_columns_exist
-#'
-#' This function checks that a list of columns are present in a dataframe.
-#' Columns that were not found in the dataframe are printed out.
-#'
-#' @param selected_columns A vector of strings each representing a column name
-#' @param df A dataframe to check against
-#' @return Boolean
-validate_columns_exist= function(selected_cols, df) {
-  # Check that all of selected_columns are in df - base::setdiff(A, B) = A[!A %in% B].
-  unmatched_cols= base::setdiff(selected_cols, colnames(df))
-
-  if(length(unmatched_cols) > 0) {
-    print('The following columns are missing: ')
-    print(unmatched_cols)
-    return(FALSE)
-  } else {
-    return(TRUE)
-  }
-}
-
-#' validate_unique_samples
-#'
-#' This function checks that a list of columns uniquely identifies all entries of a dataframe.
-#'
-#' @param selected_columns A vector of strings each representing a column name
-#' @param df A dataframe to check against
-#' @return Boolean
-validate_unique_samples= function(selected_columns, df) {
-  message= paste0('The following columns do not uniquely identify every row of the dataframe: ',
-                  paste(selected_columns, collapse=', '))
-  print(message)
-  unique_column_values= df %>% dplyr::distinct(pick(all_of(selected_columns)))
-  if(nrow(unique_column_values) != nrow(df)) {
-    print('The selected columns do not uniquely identify all rows.')
-
-    dups= df %>% dplyr::group_by(pick(all_of(selected_columns))) %>%
-      dplyr::filter(dplyr::n() > 1) %>% dplyr::ungroup() %>%
-      dplyr::arrange(pick(all_of(selected_columns)))
-    print(dups)
-
-    return(FALSE)
-  } else {
-    return(TRUE)
-  }
-}
-
 #' validate_cell_set_luas
 #'
 #' This function checks that every cell set in the sample meta does not contain duplicate members.
@@ -76,33 +29,28 @@ validate_cell_set_luas= function(sample_meta, cell_set_meta) { #' "forward_read_cl_barcode", and "n". #' @param sample_meta Dataframe of the metadata for the sequencing run. This file should contain the id_cols, #' "cell_set", "control_barcodes", etc. -#' @param cell_line_meta Master metadata of cell lines with the following required columns - "CCLE_name", -#' "DepMap_ID", "LUA", and "Sequence". #' @param cell_set_meta Master metadata of cell sets and their contents with the following required columns - #' "cell_set" and "members". +#' @param cell_line_meta Master metadata of cell lines with the following required columns - "CCLE_name", +#' "DepMap_ID", "LUA", and "Sequence". #' @param CB_meta Master metadata of control barcodes, their sequences, and their doses. The file should contain #' the columns - "Sequence", "Name", and "log_dose". #' @param id_cols Columns present in both raw_counts and sample_meta that uniquely identify each PCR well. #' This defaults to "pcr_plate", "pcr_well". -#' @param count_threshold Threshold to call low counts. This defaults to 40. #' @returns List with the following elements: #' #' \itemize{ -#' \item unmapped_reads: table of reads with valid index pairs but did not map to any known barcode. -#' The table contains the following columns - id_cols, "forward_read_cl_barcode", and "n". #' \item annotated_counts: table of reads and the associated well and well conditions. #' \item filtered_counts: table of all expected reads for the project, this is a subset of annotated counts. #' } -filter_raw_reads = function(raw_counts, - sample_meta, cell_line_meta, cell_set_meta, CB_meta, - id_cols= c('pcr_plate', 'pcr_well'), - count_threshold= 40) { +filter_raw_reads = function(prism_barcode_counts, + sample_meta, cell_set_meta, cell_line_meta, CB_meta, + id_cols= c('pcr_plate', 'pcr_well')) { require(magrittr) require(tidyverse) - # Processing metadata and inputs ---- - # CB meta is in log10 and should be converted to log2. + # CB meta is in log10 and should be converted to log2 ---- if('log_dose' %in% colnames(CB_meta)) { - print("Converting CB_meta from log10 to log2 ...") + print("Converting CB_meta from log10 to log2.") CB_meta= CB_meta %>% dplyr::mutate(log2_dose= log_dose/log10(2)) %>% dplyr::select(-log_dose) } @@ -119,13 +67,13 @@ filter_raw_reads = function(raw_counts, # Validation: Check that cell sets do not contain duplicate LUAs ---- # This will produce a warning if a LUA appears in a cell set more than once! - # This currently does NOT result in an error. Error avoided using a distinct when creating the template. + # This currently does NOT result in an error. Error avoided using a distinct when creating the template of expected reads. validate_cell_set_luas(sample_meta, cell_set_meta) # Creating a template of all expected reads in the run ---- - # Use all 4 meta data files to create a "template" dataframe where + # Use all 4 metadata files to create a "template" dataframe where # every row is a cell line that is expected in a PCR well. - print('Creating template of expected reads ...') + print('Creating template of expected reads.') # Join cell_set_meta and cell_line_meta. The cell_set can be a name "P939" or a list of LUAs. template= sample_meta %>% dplyr::left_join(cell_set_meta, by= 'cell_set') %>% dplyr::mutate(members= ifelse(is.na(members), str_split(cell_set, ';'), str_split(members, ';'))) %>% @@ -136,6 +84,8 @@ filter_raw_reads = function(raw_counts, # Check for control barcodes and add them to the template. 
  if(any(unique(sample_meta$control_barcodes) %in% c('Y', 'T', T))) {
+    # Filter for wells with control barcodes and perform a many-to-many join.
+    # This will expand each well entry to the number of control barcodes for that well.
     cb_template= sample_meta %>% dplyr::filter(control_barcodes %in% c('Y', 'T', T)) %>%
       dplyr::mutate(joiner= 'temp') %>%
       dplyr::inner_join(CB_meta %>% dplyr::mutate(joiner= 'temp'), by='joiner',
                         relationship= 'many-to-many') %>%
@@ -144,35 +94,11 @@ filter_raw_reads = function(raw_counts,
   }
 
   # Annotating reads ----
-  # From the set of reads that have the valid sequencing_index_cols combinations and map to the PRISM seq library,
-  # join in metadata to give each read a name and PCR location.
-  # Reads that to not match to the template are contaminants and,
-  # reads that are only present in the template are missing/not detected by PCR.
-  print("Annotating reads ...")
-  # Data.table version #
-  # # Left join cell_line_meta using data.table inplace left join
-  # raw_counts[cell_line_meta, base::setdiff(colnames(cell_line_meta), c('Sequence')) :=
-  #              base::mget(base::setdiff(colnames(cell_line_meta), c('Sequence'))),
-  #            on= c('forward_read_cl_barcode' = 'Sequence')]
-  # # Left join CB_meta using data.table inplace left join
-  # raw_counts[CB_meta, base::setdiff(colnames(CB_meta), c('Sequence')) :=
-  #              base::mget(base::setdiff(colnames(CB_meta), c('Sequence'))),
-  #            on= c('forward_read_cl_barcode' = 'Sequence')]
-  # # Left join CB_meta using data.table inplace left join
-  # raw_counts[sample_meta, base::setdiff(colnames(sample_meta), id_cols) :=
-  #              base::mget(base::setdiff(colnames(sample_meta), id_cols)),
-  #            on= id_cols]
-  # data.table::setnames(raw_counts, 'forward_read_cl_barcode', 'Sequence')
-  #
-  # annotated_counts= data.table::merge.data.table(
-  #   raw_counts, data.table::setDT(template %>% dplyr::mutate(expected_read= T)),
-  #   by= intersect(colnames(template), colnames(raw_counts)), all.x= TRUE, all.y= TRUE,
-  #   allow.cartesian= FALSE) %>%
-  #   dplyr::select(!any_of(c('prism_cell_set', 'members', 'mapped'))) %>%
-  #   dplyr::mutate(n= replace_na(n, 0), expected_read= replace_na(expected_read, F))
-
-  # Dplyr version #
-  annotated_counts= raw_counts %>%
+  # From prism_barcode_counts, left join metadata to annotate all reads.
+  # Perform a full join with the template of expected reads so that there is a row entry for
+  # cell lines not detected in sequencing.
+  print("Annotating reads.")
+  annotated_counts= prism_barcode_counts %>%
     dplyr::left_join(cell_line_meta, by= join_by('forward_read_cl_barcode'=='Sequence'),
                      relationship= 'many-to-one') %>%
     dplyr::left_join(CB_meta, by= join_by('forward_read_cl_barcode'=='Sequence'),
@@ -186,17 +112,14 @@ filter_raw_reads = function(raw_counts,
     dplyr::mutate(n= replace_na(n, 0), expected_read= replace_na(expected_read, F))
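To make the full-join behavior concrete, a toy sketch with hypothetical values: template rows with no matching read survive the join with n= NA, which replace_na then turns into an explicit zero count.

reads=    data.frame(DepMap_ID= 'ACH-000001', n= 25)
template= data.frame(DepMap_ID= c('ACH-000001', 'ACH-000002'), expected_read= TRUE)
dplyr::full_join(reads, template, by= 'DepMap_ID') %>%
  dplyr::mutate(n= tidyr::replace_na(n, 0))
# ACH-000002 was not detected in sequencing, so it is kept with n= 0 and expected_read= TRUE.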
print("Filtering reads ...") filtered_counts= annotated_counts %>% dplyr::filter(expected_read) %>% dplyr::select(!any_of(c('flowcell_names', 'flowcell_lanes', 'index_1', 'index_2', - 'forward_read_cl_barcode', 'LUA', 'expected_read'))) %>% - dplyr::mutate(flag= ifelse(n==0, 'Missing', NA), - flag= ifelse(n!=0 & n < count_threshold, 'low counts', flag)) + 'forward_read_cl_barcode', 'LUA', 'expected_read'))) # Calculate cell line purity ---- - cell_line_purity= sum(filtered_counts$n)/ sum(raw_counts$n) + cell_line_purity= sum(filtered_counts$n)/ sum(prism_barcode_counts$n) print(paste0('Cell line purity: ', round(cell_line_purity, 4))) if(cell_line_purity > 1) { stop('ERROR: Cell line purity is greater than 1!') @@ -205,9 +128,9 @@ filter_raw_reads = function(raw_counts, print('Warning: Low cell line purity!') } + # Return both annotated_counts and filtered_counts ---- print('Filter_raw_reads has completed!') - return(list(annotated_counts= annotated_counts, - filtered_counts= filtered_counts)) + return(list(annotated_counts= annotated_counts, filtered_counts= filtered_counts)) } # checks is a string can be numeric From f1d7c50c55a28d7dea873edc79c3efd6c266b974 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 12:36:42 -0400 Subject: [PATCH 078/127] Create kitchen_utensils.R File of functions used across modules --- scripts/src/kitchen_utensils.R | 92 ++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 scripts/src/kitchen_utensils.R diff --git a/scripts/src/kitchen_utensils.R b/scripts/src/kitchen_utensils.R new file mode 100644 index 00000000..cfe9c0f4 --- /dev/null +++ b/scripts/src/kitchen_utensils.R @@ -0,0 +1,92 @@ +# Kitchen Utensils - +# This file contains functions for the pipeline. +# The functions are sorted alphabetically + +#' process_in_chunks +#' +#' This function runs some action over chunks of a large file. At the end, returns a list of all the chunks +#' +#' @param large_file_path description +#' @param chunk_size description +#' @param action A function passed to act on each chunk +#' @param ... Additional parameters to be passed into the action parameter +process_in_chunks= function(large_file_path, chunk_size= 10^6, action, ...) { + header_col_names= data.table::fread(large_file_path, header= TRUE, sep= ',', nrow= 0) %>% colnames() + chunk_idx= 1 # Counter to keep track of chunks in a loop + current_chunk_size= chunk_size # Variable for loop exit condition + chunk_collector= list() # List to collect processed chunks + + # For each chunk, call an action + while(current_chunk_size == chunk_size) { + current_chunk= data.table::fread(large_file_path, header= FALSE, sep= ',', + col.names= header_col_names, + nrow= chunk_size, skip= chunk_size * (chunk_idx - 1) + 1) + + current_chunk_size= nrow(current_chunk) # set current chunk size to stop loop + print(paste('Working on chunk', chunk_idx, 'with', current_chunk_size, 'rows.', sep= ' ')) + + chunk_collector[[chunk_idx]]= do.call(action, list(current_chunk, ...)) + chunk_idx= chunk_idx + 1 + } + + # Return a list of all the chunks + return(chunk_collector) +} + +#' validate_columns_entries +#' +#' This function checks that for a list of columns, all entries are filled in. +#' It checks all column entries against a list of potential empty values. +#' +#' @param selected_columns A vector of strings each representing a column name +#' @param df A dataframe to check against +#' @param empty_values Optional vector of values that equate to empty. Defaults to NA, "NA", "", and " ". 
+
+#' validate_columns_entries
+#'
+#' This function checks that for a list of columns, all entries are filled in.
+#' It checks all column entries against a list of potential empty values.
+#'
+#' @param selected_columns A vector of strings each representing a column name
+#' @param df A dataframe to check against
+#' @param empty_values Optional vector of values that equate to empty. Defaults to NA, "NA", "", and " ".
+#' @return Boolean
+validate_columns_entries= function(selected_columns, df, empty_values= c(NA, 'NA', '', ' ')) {
+  # Check for rows in selected_columns that equate to predefined empty values.
+  missing_rows= df %>% dplyr::filter(if_any(all_of(selected_columns), ~ . %in% empty_values))
+  if(nrow(missing_rows) > 0) {
+    print('The following rows in the sample meta are not filled out for the sequencing index columns.')
+    print(missing_rows) # show the empty rows
+    return(FALSE)
+  } else {
+    return(TRUE)
+  }
+}
+
+#' validate_columns_exist
+#'
+#' This function checks that a list of columns are present in a dataframe.
+#' Columns that were not found in the dataframe are printed out.
+#'
+#' @param selected_columns A vector of strings each representing a column name
+#' @param df A dataframe to check against
+#' @return Boolean
+validate_columns_exist= function(selected_columns, df) {
+  # Check that all of selected_columns are in df - base::setdiff(A, B) = A[!A %in% B].
+  unmatched_cols= base::setdiff(selected_columns, colnames(df))
+
+  if(length(unmatched_cols) > 0) {
+    print('The following columns are missing: ')
+    print(unmatched_cols)
+    return(FALSE)
+  } else {
+    return(TRUE)
+  }
+}
+
+#' validate_unique_samples
+#'
+#' This function checks that a list of columns uniquely identifies every row of a dataframe.
+#'
+#' @param selected_columns A vector of strings each representing a column name
+#' @param df A dataframe to check against
+#' @return Boolean
+validate_unique_samples= function(selected_columns, df) {
+  unique_column_values= df %>% dplyr::distinct(pick(all_of(selected_columns)))
+  if(nrow(unique_column_values) != nrow(df)) {
+    return(FALSE)
+  } else {
+    return(TRUE)
+  }
+}
\ No newline at end of file

From a1606d4b67fb22b748ee812f51b5a379e193af80 Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Fri, 4 Oct 2024 12:37:16 -0400
Subject: [PATCH 079/127] Comments and style changes

---
 scripts/src/normalize.R | 37 ++++++++++---------------------------
 1 file changed, 10 insertions(+), 27 deletions(-)

diff --git a/scripts/src/normalize.R b/scripts/src/normalize.R
index 511a0cbc..1c41270d 100755
--- a/scripts/src/normalize.R
+++ b/scripts/src/normalize.R
@@ -1,19 +1,3 @@
-#' validate_columns_exist
-#'
-#' This function checks that a list of columns are present in a dataframe.
-#'
-#' @param selected_columns A vector of strings each representing a column name
-#' @param df A dataframe to check against
-#' @return Boolean
-validate_columns_exist= function(selected_columns, df) {
-  # Check that all of selected_columns are in df
-  if(any(!selected_columns %in% colnames(df))) {
-    return(FALSE)
-  } else {
-    return(TRUE)
-  }
-}
-
 #' normalize
 #'
 #' takes a filtered dataframe of raw read counts and normalizes
@@ -40,21 +24,20 @@ normalize <- function(X, id_cols, barcodes, pseudocount) {
   X %<>% dplyr::mutate(log2_n = log2(n + pseudocount))
 
   # Validation: Check that id_cols are present in the dataframe ----
-  if(validate_columns_exist(id_cols, X) == FALSE) {
-    print(id_cols)
+  if(!validate_columns_exist(id_cols, X)) {
     stop('One or more id_cols (printed above) is NOT present in the supplied dataframe.')
   }
 
   # Identify valid profiles and valid control barcodes to determine intercept ----
-  # dropping invalid trt_type, wells without control barcodes, cell line entries or other CBs, cbs with zero reads,
-  # and profiles with fewer than 4 CBs.
+  # Drop wells with invalid trt_type, wells without control barcodes, cell line entries or other CBs,
+  # cbs with zero reads, and profiles with 4 or fewer CBs.
valid_profiles= X %>% dplyr::filter(!trt_type %in% c("empty", "", "CB_only"), !is.na(trt_type), control_barcodes %in% c('Y', 'T', T), Name %in% barcodes, n!= 0) %>% dplyr::group_by(pick(all_of(id_cols))) %>% dplyr::filter(dplyr::n() > 4) %>% dplyr::ungroup() # Validation: Check which wells/profiles were dropped ---- - distinct_all_profiles = X %>% dplyr::distinct(pick(all_of(id_cols))) - distinct_valid_profiles = valid_profiles %>% dplyr::distinct(pick(all_of(id_cols))) + distinct_all_profiles= X %>% dplyr::distinct(pick(all_of(id_cols))) + distinct_valid_profiles= valid_profiles %>% dplyr::distinct(pick(all_of(id_cols))) if(nrow(distinct_all_profiles) != nrow(distinct_valid_profiles)) { # Print error if all profiles were dropped if(nrow(valid_profiles) == 0) { @@ -80,17 +63,17 @@ normalize <- function(X, id_cols, barcodes, pseudocount) { fit_stats= valid_profiles %>% dplyr::inner_join(fit_intercepts, by=id_cols) %>% dplyr::group_by(pick(all_of(id_cols))) %>% dplyr::mutate(log2_normalized_n= log2_n + cb_intercept, - norm_mae= median(abs(log2_dose- log2_normalized_n)), + norm_mae= median(abs(log2_dose - log2_normalized_n)), mean_y= mean(log2_dose), - residual2= (log2_dose- log2_normalized_n)^2, - squares2= (log2_dose- mean_y)^2, - norm_r2= 1- sum(residual2)/sum(squares2)) %>% dplyr::ungroup() %>% + residual2= (log2_dose - log2_normalized_n)^2, + squares2= (log2_dose - mean_y)^2, + norm_r2= 1 - sum(residual2) / sum(squares2)) %>% dplyr::ungroup() %>% dplyr::distinct(pick(all_of(c(id_cols, 'cb_intercept', 'norm_mae', 'norm_r2')))) # Normalize entries ---- normalized= X %>% dplyr::inner_join(fit_stats, by=id_cols) %>% dplyr::mutate(log2_normalized_n= log2_n + cb_intercept, - normalized_n = 2^log2_normalized_n) + normalized_n= 2^log2_normalized_n) return(normalized) } From 9d910a2bc9f84fdf3cb9a5eace09d6f6d49e7cfc Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 12:38:13 -0400 Subject: [PATCH 080/127] Reordered parameters --- scripts/make_config_file.groovy | 64 +++++++++++++++++++++------------ 1 file changed, 42 insertions(+), 22 deletions(-) diff --git a/scripts/make_config_file.groovy b/scripts/make_config_file.groovy index 6c1a60aa..513998b1 100644 --- a/scripts/make_config_file.groovy +++ b/scripts/make_config_file.groovy @@ -13,6 +13,8 @@ pipeline { booleanParam(name: 'CBNORMALIZE', defaultValue: true, description: 'Check this to trigger the CBnormalize job.') booleanParam(name: 'COMPUTE_LFC', defaultValue: true, description: 'Check this to trigger the compute_l2fc job.') booleanParam(name: 'COLLAPSE', defaultValue: true, description: 'Check this to trigger the collapse job.') + booleanParam(name: 'FILTER_COUNTS_QC', defaultValue: true, description: 'Check this to trigger the QC job.') + booleanParam(name: 'JOIN_METADATA', defaultValue: true, description: 'Check this to trigger the join_metadata job.') booleanParam(name: 'REMOVE_DATA', defaultValue: false, description: 'Select if there is experimental data that needs to be removed before normalization. TODO: expand on this.') booleanParam(name: 'RUN_NORM', defaultValue: true, description: 'Run normalization module on data.') booleanParam(name: 'PULL_POOL_ID', defaultValue: false, description: 'Flag indicating whether to pull pool IDs from CellDB - only applicable to cell sets (i.e. 
EXT.PR500.CS01.1.A, EXT.PR500.CS01.1.B, etc).') @@ -22,31 +24,40 @@ pipeline { string(name: 'BUILD_NAME', defaultValue: '', description: 'Build name') string(name: 'SCREEN', defaultValue: '', description: 'Screen name from COMET, necessary if using COMET for sample metadata.') string(name: 'SEQ_TYPE', defaultValue: 'DRAGEN', description: 'Choose DRAGEN, MiSeq, HiSeq, or NovaSeq. MiSeq and HiSeq/NovaSeq return files named differently. This setting sets the INDEX_1, INDEX_2, and BARCODE_SUFFIX parameters in fastq2readcount. Select DRAGEN if fastq files are from the DRAGEN pipeline from GP. Choosing NovaSeq reverses index 2.') - string(name: 'CTL_TYPES', defaultValue: 'negcon', description: 'Type to mark as control in compute_LFC') string(name: 'DAYS', defaultValue: '', description: 'If running the sushi_to_mts module, provide any days/timepoints (separated by commas) that should be dropped from output data. No quotes needed (ie, 2,8).') string(name: 'GIT_BRANCH', defaultValue: 'main', description: 'Pipeline branch to use') booleanParam(name: 'USE_LATEST', defaultValue: true, description: 'Check this to use the most up to date version from the specified branch. If not checked, will use the specified commit.') string(name: 'COMMIT_ID', defaultValue: '', description: 'Specific commit ID to use (leave empty if using the latest commit in the branch or if already specified in the config file.)') - string(name: 'CELL_SET_META', defaultValue: 'cell_set_meta.csv', description: 'Cell set metadata') - string(name: 'ID_COLS', defaultValue: 'cell_set,treatment,dose,dose_unit,day,bio_rep,tech_rep', description: 'Columns to concat to create unique ID for each sample-replicate') - string(name: 'CONTROL_COLS', defaultValue: 'cell_set,day', description: 'Set of columns that define individual controls') - string(name: 'SIG_COLS', defaultValue: 'cell_set,treatment,dose,dose_unit,day', description: 'Signature columns') - string(name: 'SEQUENCING_INDEX_COLS', defaultValue: 'index_1,index_2,flowcell_names', description: 'Sequencing index columns') - string(name: 'CONTROL_BARCODE_META', defaultValue: 'CB_meta.csv', description: 'Metadata for control barcodes.') - string(name: 'COUNT_COL_NAME', defaultValue: 'normalized_n', description: 'Field used to calculate L2FC') - string(name: 'CELL_SET_META', defaultValue: 'cell_set_meta.csv', description: 'Cell Set Metadata. Static cell_line_meta location: /data/vdb/prismSeq/cell_set_meta.csv') + + // Metadata files used by sushi string(name: 'SAMPLE_META', defaultValue: 'sample_meta.csv', description: 'File name of sample metadata within the BUILD_DIR directory.') - string(name: 'COUNT_THRESHOLD', defaultValue: '40', description: 'Minimum threshold to filter cell line counts by.') - string(name: 'PSEUDOCOUNT', defaultValue: '20', description: 'Pseudocount for normalization.') + string(name: 'CELL_SET_META', defaultValue: 'cell_set_meta.csv', description: 'Cell Set Metadata. 
Static cell_set_meta location: /data/vdb/prismSeq/cell_set_meta.csv')
+    string(name: 'CELL_LINE_META', defaultValue: 'cell_line_meta.csv', description: 'File in BUILD_DIR containing cell line metadata')
+    string(name: 'CONTROL_BARCODE_META', defaultValue: 'CB_meta.csv', description: 'Metadata for control barcodes.')
+    string(name: 'ASSAY_POOL_META', defaultValue: 'assay_pool_meta.txt', description: 'File in BUILD_DIR containing assay pool metadata')
+
+    // Files consumed and created by sushi
     string(name: 'RAW_COUNTS_UNCOLLAPSED', defaultValue: 'raw_counts_uncollapsed.csv', description: 'Filename in BUILD_DIR containing nori output')
-    string(name: 'RAW_COUNTS', defaultValue: 'raw_counts.csv', description: 'Filename in BUILD_DIR containing raw counts')
-    string(name: 'FILTERED_COUNTS', defaultValue: 'filtered_counts.csv', description: 'File in BUILD_DIR containing filtered counts')
-    string(name: 'LFC', defaultValue: 'l2fc.csv', description: 'File containing log2 fold change values')
+    string(name: 'PRISM_BARCODE_COUNTS', defaultValue: 'prism_barcode_counts.csv', description: 'Filename in BUILD_DIR containing PRISM barcode counts')
+    string(name: 'UNKNOWN_BARCODE_COUNTS', defaultValue: 'unknown_barcode_counts.csv', description: 'Filename in BUILD_DIR containing unknown barcode counts')
     string(name: 'ANNOTATED_COUNTS', defaultValue: 'annotated_counts.csv', description: 'File in BUILD_DIR containing annotated counts')
+    string(name: 'FILTERED_COUNTS', defaultValue: 'filtered_counts.csv', description: 'File in BUILD_DIR containing filtered counts')
     string(name: 'NORMALIZED_COUNTS', defaultValue: 'normalized_counts.csv', description: 'File in BUILD_DIR containing normalized counts')
-    string(name: 'COLLAPSED_VALUES', defaultValue: 'collapsed_l2fc.csv', description: 'File in BUILD_DIR containing replicate collapsed l2fc values')
-    string(name: 'ASSAY_POOL_META', defaultValue: 'assay_pool_meta.txt', description: 'File in BUILD_DIR containing assay pool metadata')
+    string(name: 'LFC', defaultValue: 'l2fc.csv', description: 'File containing log2 fold change values')
+    string(name: 'COLLAPSED_LFC', defaultValue: 'collapsed_l2fc.csv', description: 'File in BUILD_DIR containing replicate collapsed l2fc values')
+
+    // Column names parameters
+    string(name: 'SEQUENCING_INDEX_COLS', defaultValue: 'flowcell_names,index_1,index_2', description: 'Sequencing index columns used in COLLATE_FASTQ_READS')
+    string(name: 'ID_COLS', defaultValue: 'pcr_plate,pcr_well', description: 'Columns to concat to create unique ID for each sample-replicate')
+    string(name: 'CELL_LINE_COLS', defaultValue: 'DepMap_ID', description: 'Columns in intermediate files that describe a read or cell line')
+    string(name: 'SIG_COLS', defaultValue: 'cell_set,treatment,dose,dose_unit,day', description: 'Signature columns')
+    string(name: 'CONTROL_COLS', defaultValue: 'cell_set,day', description: 'Set of columns that define individual controls in COMPUTE_LFC')
+
+    // Additional parameters
+    string(name: 'PSEUDOCOUNT', defaultValue: '20', description: 'In CBNORMALIZE, the pseudocount value for log transformations.')
+    string(name: 'COUNT_COL_NAME', defaultValue: 'normalized_n', description: 'In COMPUTE_LFC, the name of the numeric column to use for calculations')
+    string(name: 'CTL_TYPES', defaultValue: 'negcon', description: 'In COMPUTE_LFC, the value in trt_type that indicates the negative controls')
+    string(name: 'COUNT_THRESHOLD', defaultValue: '40', description: 'In FILTER_COUNTS_QC, the threshold for calling reads with low counts')
string(name: 'API_URL', defaultValue: 'https://api.clue.io/api/', description: 'API URL') } @@ -100,12 +111,21 @@ pipeline { script { def paramList = [ 'SEQ_TYPE', 'API_URL', 'BUILD_DIR', 'INDEX_1', 'INDEX_2', 'BARCODE_SUFFIX', 'REVERSE_INDEX2', - 'SAMPLE_META', 'CONTROL_BARCODE_META', 'CTL_TYPES', 'ID_COLS', 'SIG_COLS', - 'RUN_NORM', 'CONTROL_COLS', 'COUNT_THRESHOLD', 'COUNT_COL_NAME', 'BUILD_NAME', 'CONVERT_SUSHI', - 'PULL_POOL_ID', 'RUN_EPS_QC', 'PSEUDOCOUNT', 'REMOVE_DATA', 'DAYS', 'SEQUENCING_INDEX_COLS', - 'RAW_COUNTS', 'CELL_SET_META', 'CELL_LINE_META', 'FILTERED_COUNTS', 'LFC', 'COUNTS', 'ANNOTATED_COUNTS', - 'COLLAPSED_VALUES', 'NORMALIZED_COUNTS', 'API_URL', 'FILTER_COUNTS_QC', 'ASSAY_POOL_META', 'SCREEN', - 'RAW_COUNTS_UNCOLLAPSED' + 'RUN_NORM', 'BUILD_NAME', 'CONVERT_SUSHI', 'PULL_POOL_ID', 'RUN_EPS_QC', 'REMOVE_DATA', 'DAYS', + 'COUNTS', 'SCREEN', + + // metadata files + 'SAMPLE_META', 'CELL_SET_META', 'CELL_LINE_META', 'CONTROL_BARCODE_META', 'ASSAY_POOL_META' + + // sushi files + 'RAW_COUNTS_UNCOLLAPSED', 'PRISM_BARCODE_COUNTS', 'UNKNOWN_BARCODE_COUNTS', + 'ANNOTATED_COUNTS', 'FILTERED_COUNTS', 'NORMALIZED_COUNTS', 'LFC', 'COLLAPSED_LFC' + + // column name parameters + 'SEQUENCING_INDEX_COLS', 'ID_COLS', 'CELL_LINE_COLS', 'SIG_COLS', 'CONTROL_COLS', + + // additional parameters + 'PSEUDOCOUNT', 'COUNT_COL_NAME', 'CTL_TYPES', 'COUNT_THRESHOLD', 'API_URL' ] def config = [:] From 7255c533cb8c43fc0d3004c51d05515cccc81ef8 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 12:56:08 -0400 Subject: [PATCH 081/127] Also source kitchen utensils --- scripts/CBnormalize.R | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/CBnormalize.R b/scripts/CBnormalize.R index c0f47c4f..3775d32e 100755 --- a/scripts/CBnormalize.R +++ b/scripts/CBnormalize.R @@ -2,6 +2,7 @@ options(cli.unicode = FALSE) library(argparse) library(magrittr) source("./src/normalize.R") +source("./src/kitchen_utensils.R") # Argument parser ---- parser <- ArgumentParser() From 32f91324ed1db626b236dc7dc0f4d08baab6b6b9 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 12:56:47 -0400 Subject: [PATCH 082/127] Update with new collate outputs --- scripts/filter_counts.R | 93 ++++++++++++++++------------------------- 1 file changed, 37 insertions(+), 56 deletions(-) diff --git a/scripts/filter_counts.R b/scripts/filter_counts.R index fb55a52f..7cd0326f 100755 --- a/scripts/filter_counts.R +++ b/scripts/filter_counts.R @@ -9,28 +9,25 @@ suppressPackageStartupMessages(library(tidyr)) #pivot_wider suppressPackageStartupMessages(library(sets)) suppressPackageStartupMessages(library(tidyverse)) # load last - after dplyr source("./src/filter_raw_reads.R") +source("./src/kitchen_utensils.R") # Argument parser ---- parser <- ArgumentParser() # specify desired options -parser$add_argument("-v", "--verbose", action="store_true", default=TRUE, - help="Print extra output [default]") -parser$add_argument("-q", "--quietly", action="store_false", - dest="verbose", help="Print little output") +parser$add_argument("-v", "--verbose", action="store_true", default=TRUE, help="Print extra output [default]") +parser$add_argument("-q", "--quietly", action="store_false", dest="verbose", help="Print little output") parser$add_argument("--wkdir", default=getwd(), help="Working directory") -parser$add_argument("-c", "--raw_counts", default="raw_counts.csv", help = "path to file containing raw counts") -parser$add_argument("-o", "--out", default="", help = "Output path. 
Default is working directory") -parser$add_argument("-s", "--sample_meta", default="sample_meta.csv", help= "Sample metadata") -parser$add_argument("--cell_line_meta", default="cell_line_meta.csv", help= "Cell Line metadata") -parser$add_argument("--cell_set_meta", default="cell_set_meta.csv", help= "Cell set metadata") +parser$add_argument('--prism_barcode_counts', default= 'prism_barcode_counts.csv', help= 'Path to prism_barcode_counts.csv') +parser$add_argument('--sample_meta', default= 'sample_meta.csv', help= 'Path to sample_meta.csv') +parser$add_argument('--cell_set_meta', default= 'cell_set_meta.csv', help= 'Path to cell_set_meta.csv') +parser$add_argument('--cell_line_meta', default= 'cell_line_meta.csv', help= 'Path to cell_line_meta.csv') parser$add_argument("--assay_pool_meta", default="assay_pool_meta.txt", help = "Assay pool metadata") -parser$add_argument("--id_cols", default= "pcr_plate,pcr_well", help = "Columns to identify each PCR well") -parser$add_argument("--CB_meta", default="CB_meta.csv", help = "Control Barcode metadata") -parser$add_argument("--count_threshold", default= 40, help = "Low counts threshold") +parser$add_argument('--CB_meta', default= 'CB_meta.csv', help= 'Path to CB_meta.csv') +parser$add_argument('--id_cols', default= 'pcr_plate,pcr_well', + help= 'List of sample_meta column names used to identify every PCR well') parser$add_argument("--rm_data", type="logical", help = "Remove bad experimental data") parser$add_argument("--pool_id", type="logical", help = "Pull pool IDs from CellDB.") -parser$add_argument("--control_type", default="negcon", - help = "negative control wells in trt_type column in sample metadata") +parser$add_argument("-o", "--out", default="", help = "Output path. Default is working directory") # get command line options, if help option encountered print help and exit args <- parser$parse_args() @@ -41,17 +38,17 @@ if (args$out == ""){ } #print_args(args) -# Read in files and set up parameters ---- -cell_set_meta= data.table::fread(args$cell_set_meta, header= T, sep= ',', data.table= F) -cell_line_meta= data.table::fread(args$cell_line_meta, header= T, sep= ',', data.table= F) -CB_meta= data.table::fread(args$CB_meta, header= T, sep= ',', data.table= F) -sample_meta= data.table::fread(args$sample_meta, header= T, sep= ',', data.table= F) -raw_counts= data.table::fread(args$raw_counts, header= T, sep= ',', data.table= F) +# Read in all input files ---- +prism_barcode_counts= data.table::fread(args$prism_barcode_counts, header= TRUE, sep= ',') +sample_meta= data.table::fread(args$sample_meta, header= TRUE, sep= ',') +cell_set_meta= data.table::fread(args$cell_set_meta, header= TRUE, sep= ',') +cell_line_meta= data.table::fread(args$cell_line_meta, header= TRUE, sep= ',') +CB_meta= data.table::fread(args$CB_meta, header= TRUE, sep= ',') -# Convert strings to vectors ---- +# Convert input strings into vectors ---- id_cols= unlist(strsplit(args$id_cols, ",")) -count_threshold = as.numeric(args$count_threshold) +# What is this check doing? 
-YL ---- # make sure LUA codes in cell line meta are unique cell_line_meta %<>% dplyr::group_by(LUA) %>% @@ -66,58 +63,38 @@ cell_line_meta %<>% dplyr::filter(!duplicated(cell_line_meta$LUA, fromLast = TRUE)) %>% dplyr::select(-LUA.duplicity) -# Remove flowcell_name and lane columns from sample_meta because -# there is a profile_id duplicate when there are more than 1 seq runs -#sample_meta %<>% select(-flowcell_name, -flowcell_lane) %>% - # distinct() # This needs to be removed for sequencing_index_cols to work! - YL - # Run filter_raw_reads ----- print('Calling filter_raw_reads ...') -filtered_counts= filter_raw_reads(raw_counts= raw_counts, sample_meta= sample_meta, - cell_line_meta= cell_line_meta, - cell_set_meta= cell_set_meta, - CB_meta= CB_meta, - id_cols= id_cols, - count_threshold= as.numeric(args$count_threshold)) - -# Pulling pool_id when db_flag and pool_id flags are passed +module_outputs= filter_raw_reads(prism_barcode_counts= prism_barcode_counts, + sample_meta= sample_meta, + cell_set_meta= cell_set_meta, + cell_line_meta= cell_line_meta, + CB_meta= CB_meta, + id_cols= id_cols) + +# Pulling pool_id when db_flag and pool_id flags are passed ---- if (args$pool_id) { assay_pool_meta = read.delim(args$assay_pool_meta) unique_cell_sets <- unique(sample_meta$cell_set[sample_meta$cell_set != ""]) assay_pool_meta <- assay_pool_meta[assay_pool_meta$davepool_id %in% unique_cell_sets,] %>% select(pool_id, ccle_name, davepool_id, depmap_id) - filtered_counts$filtered_counts = filtered_counts$filtered_counts %>% + module_outputs$filtered_counts = module_outputs$filtered_counts %>% merge(assay_pool_meta, by.x=c("CCLE_name", "cell_set", "DepMap_ID"), by.y=c("ccle_name", "davepool_id", "depmap_id"), all.x=T) - filtered_counts$annotated_counts = filtered_counts$annotated_counts %>% + module_outputs$annotated_counts = module_outputs$annotated_counts %>% merge(assay_pool_meta, by.x=c("CCLE_name", "cell_set", "DepMap_ID"), by.y=c("ccle_name", "davepool_id", "depmap_id"), all.x=T) } # Validation: Basic file size check ---- -if(sum(filtered_counts$filtered_counts$n) == 0) { +if(sum(module_outputs$filtered_counts$n) == 0) { stop('All entries in filtered counts are missing!') } -cl_entries= filtered_counts$filtered_counts %>% dplyr::filter(!is.na(CCLE_name)) -if(sum(cl_entries$n) == 0) { - stop('All cell line counts are zero!') -} - -# Write out module outputs ---- -unmapped_reads= filtered_counts$unmapped_reads -unmapped_out = paste(args$out, 'unmapped_reads.csv', sep='/') -print(paste("Writing unmapped reads to: ", unmapped_out)) -write.csv(unmapped_reads, unmapped_out, row.names=F) - -annotated_counts = filtered_counts$annotated_counts -annot_out_file = paste(args$out, 'annotated_counts.csv', sep='/') -print(paste("Writing annotated counts to: ", annot_out_file)) -write.csv(annotated_counts, annot_out_file, row.names=F) - -filtered_counts = filtered_counts$filtered_counts +# Remove data ---- +filtered_counts= module_outputs$filtered_counts print(paste("rm_data:", args$rm_data)) # Remove data if needed @@ -137,7 +114,11 @@ if(args$rm_data == TRUE){ paste("Number of rows removed: ", rows_removed) } +# Write out files ---- +annot_out_file= paste0(args$out, '/annotated_counts.csv') +print(paste('Writing annotated counts to: ', annot_out_file)) +module_outputs$annotated_counts %>% write.csv(annot_out_file, row.names= FALSE) + filtrc_out_file = paste(args$out, 'filtered_counts.csv', sep='/') print(paste("Writing filtered counts csv to: ", filtrc_out_file)) write.csv(filtered_counts, filtrc_out_file, 
row.names=F, quote=F) - From 3bc331f45fea62535e161e1a61e5c40de54625af Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 12:57:02 -0400 Subject: [PATCH 083/127] Update with new outputs --- scripts/filteredCounts_QC.R | 73 +++++++++++++++++-------------------- 1 file changed, 34 insertions(+), 39 deletions(-) diff --git a/scripts/filteredCounts_QC.R b/scripts/filteredCounts_QC.R index e98ca966..525452f3 100755 --- a/scripts/filteredCounts_QC.R +++ b/scripts/filteredCounts_QC.R @@ -10,34 +10,33 @@ suppressPackageStartupMessages(library(scales)) # for out of bound handling in p suppressPackageStartupMessages(library(ggpmisc)) # with ggplot to add linear fit labels suppressPackageStartupMessages(library(WGCNA)) # for faster correlations source("/workspace/scripts/src/QC_images.R") +source("./src/kitchen_utensils.R") # Argument parser ---- parser <- ArgumentParser() + # specify desired options parser$add_argument("-v", "--verbose", action="store_true", default=TRUE, help="Print extra output [default]") parser$add_argument("-q", "--quietly", action="store_false", dest="verbose", help="Print little output") parser$add_argument("--wkdir", default=getwd(), help="Working directory") -parser$add_argument("-s", "--sample_meta", default="sample_meta.csv", help= "Sample metadata") -parser$add_argument("-c", "--raw_counts_uncollapsed", default="raw_counts_uncollapsed.csv", - help="path to file containing uncollapsed raw counts file") -parser$add_argument("--raw_counts", default= "raw_counts.csv", help="path to raw counts file") -parser$add_argument("--annotated_counts", default= "annotated_counts.csv", - help= "path to file containing annotated counts") -parser$add_argument("--normalized_counts", default="normalized_counts.csv", - help="path to file containing normalized counts") -parser$add_argument("--lfc", default="l2fc.csv", help= "path to l2fc file") -parser$add_argument("--cell_line_cols", default= 'DepMap_ID,CCLE_name', - help= "Columns that identify cell lines or barcodes") -parser$add_argument("--id_cols", default= 'pcr_plate,pcr_well', - help= "Columns to identify each PCR well") -parser$add_argument("--sig_cols", default="cell_set,treatment,dose,dose_unit,day", - help= 'Columns used to identify the treatment conditions') -parser$add_argument("--control_type", default = "negcon", - help= "how negative control wells are distinguished in the trt_type column") -parser$add_argument("--count_threshold", default=40, help= "Low counts threshold") -parser$add_argument("--reverse_index2", type="logical", default=FALSE, - help= "Reverse complement of index 2 for NovaSeq and NextSeq") -parser$add_argument("-o","--out", default="", help = "Output path. 
Default is working directory") +parser$add_argument('--raw_counts_uncollapsed', default= "raw_counts_uncollapsed.csv", + help= 'Path to file containing uncollapsed raw counts') +parser$add_argument('--prism_barcode_counts', default= "prism_barcode_counts.csv", help= 'Path to prism_barcode_counts.csv') +parser$add_argument('--unknown_barcode_counts', default= "unknown_barcode_counts.csv", + help= 'Path to unknown_barcode_counts.csv') +parser$add_argument('--annotated_counts', default= 'annotated_counts.csv', help= 'Path to annotated_counts.csv') +parser$add_argument('--normalized_counts', default= 'normalized_counts.csv', help= 'Path to normalized_counts.csv') +parser$add_argument('--lfc', default= 'l2fc.csv', help= 'Path to l2fc.csv') +parser$add_argument('-s', '--sample_meta', default= 'sample_meta.csv', help= 'Path to sample_meta.csv') +parser$add_argument('--id_cols', default= 'pcr_plate,pcr_well', help= 'Sample meta columns used to identify every PCR well') +parser$add_argument('--cell_line_cols', default= 'DepMap_ID', help= 'Sushi columns used to identify a read') +parser$add_argument('--sig_cols', default= 'cell_set,treatment,dose,dose_unit,day', + help= 'Sample meta columns used to identify unique treatment conditions') +parser$add_argument('--control_type', default= 'negcon', help= 'Value used in trt_type column to denote negative controls') +parser$add_argument('--count_threshold', default= 40, help= 'Low counts theshold used in some plots') +parser$add_argument('--reverse_index2', type= "logical", default= FALSE, + help= 'Switch to reverse complement index_2 for some sequencers') +parser$add_argument('-o', '--out', default= '', help= 'Output path, defaults to working directory') # get command line options, if help option encountered print help and exit args <- parser$parse_args() @@ -47,40 +46,36 @@ if (args$out == ""){ args$out = args$wkdir } -# Read in files and pull out parameters ---- +# Read in input files ---- sample_meta= data.table::fread(args$sample_meta, header= TRUE, sep= ',') - -# Pipeline outputs -raw_counts_uncollapsed= data.table::fread(args$raw_counts_uncollapsed, header= TRUE, sep= ',') -raw_counts= data.table::fread(args$raw_counts, header= TRUE, sep= ',') +prism_barcode_counts= data.table::fread(args$prism_barcode_counts, header= TRUE, sep= ',') +unknown_barcode_counts= data.table::fread(args$unknown_barcode_counts, header= TRUE, sep= ',') annotated_counts= data.table::fread(args$annotated_counts, header= TRUE, sep= ',') if(file.exists(args$normalized_counts)) { - normalized_counts= data.table::fread(args$normalized_counts, header=TRUE, sep=',', data.table=FALSE) + normalized_counts= data.table::fread(args$normalized_counts, header=TRUE, sep=',') } else { normalized_counts= NA } l2fc= data.table::fread(args$lfc, header= TRUE, sep= ',') -# Parameters -cell_line_cols = unlist(strsplit(args$cell_line_cols, ",")) -id_cols= unlist(strsplit(args$id_cols, ",")) -sig_cols= unlist(strsplit(args$sig_cols, ",")) -control_type = args$control_type -count_threshold= as.numeric(args$count_threshold) -# +# Parse some input parameters ---- +id_cols= unlist(strsplit(args$id_cols, ',')) +cell_line_cols= unlist(strsplit(args$cell_line_cols, ',')) +sig_cols= unlist(strsplit(args$sig_cols, ',')) # Call QC images function ---- print("Calling QC images ...") -QC_images(raw_counts_uncollapsed= raw_counts_uncollapsed, - raw_counts= raw_counts, +QC_images(raw_counts_uncollapsed_filepath= raw_counts_uncollapsed, + prism_barcode_counts= prism_barcode_counts, + unknown_barcode_counts= 
unknown_barcode_counts, annotated_counts= annotated_counts, normalized_counts= normalized_counts, l2fc= l2fc, sample_meta= sample_meta, - cell_line_cols= c('DepMap_ID', 'CCLE_name'), id_cols= id_cols, + cell_line_cols= cell_line_cols, sig_cols= sig_cols, - control_type= control_type, - count_threshold= count_threshold, + control_type= args$control_type, + count_threshold= as.numeric(args$count_threshold), reverse_index2= args$reverse_index2, out= args$out) From eed79e18560c020884616238fe6d6c6ebb86c25b Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 12:57:12 -0400 Subject: [PATCH 084/127] Update with new upstream outputs --- scripts/src/QC_images.R | 345 +++++++++++++++++----------------------- 1 file changed, 150 insertions(+), 195 deletions(-) diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R index b0c0c1dd..a88bb019 100755 --- a/scripts/src/QC_images.R +++ b/scripts/src/QC_images.R @@ -1,54 +1,3 @@ -#' validate_columns_exist -#' -#' This function checks that a list of columns are present in a dataframe. -#' Columns that were not found in the dataframe are printed out. -#' -#' @param selected_columns A vector of strings each representing a column name -#' @param df A dataframe to check against -#' @returns Boolean -validate_columns_exist= function(selected_cols, df) { - # Check that all of selected_columns are in df - base::setdiff(A, B) = A[!A %in% B]. - unmatched_cols= base::setdiff(selected_cols, colnames(df)) - - if(length(unmatched_cols) > 0) { - print('The following columns are missing: ') - print(unmatched_cols) - return(FALSE) - } else { - return(TRUE) - } -} - -#' Calculate index summaries -#' -#' Generates some simple summaries for each unique index. -#' -#' @import tidyverse -#' @param df A dataframe which must contain the column "n" which represents the count of a read. -#' @param index_col The name of the column contain the index barcodes as a string. This column must be present in "df". -#' @param valid_indices. A vector of all the valid indices for "index_col". -#' @returns A dataframe with the follow columns: -#' - index_col: String, The column containing the index barcodes. -#' - idx_n: Numeric, Number of reads associated with a specific index barcode. -#' - fraction: Numeric, "idx_n" divided by the total number of reads in the run. -#' - expected: Boolean, True if the index barcode is in "valid_indices" otherwise False. -#' - contains_n: Boolean, True if the index barcode contains "N" in its sequence, otherwise False. -#' - lv_dist: Numeric, Edit distance from a valid index barcode. -#' - ham_dist: Numeric, Hamming distance from a valid index barcode. -get_index_summary= function(df, index_col, valid_indices) { - output_summary= df %>% dplyr::group_by(pick(all_of(index_col))) %>% - dplyr::summarise(idx_n= sum(n)) %>% dplyr::ungroup() %>% - dplyr::mutate(fraction= round(idx_n/sum(idx_n), 5), - expected= ifelse(.[[index_col]] %chin% valid_indices, T, F), - contains_n= ifelse(grepl('N', .[[index_col]]), T, F), - lv_dist= apply(stringdist::stringdistmatrix(.[[index_col]], valid_indices, method="lv"), - 1, min), - ham_dist= apply(stringdist::stringdistmatrix(.[[index_col]], valid_indices, method="hamming"), - 1, min)) %>% - dplyr::arrange(desc(fraction)) - return(output_summary) -} - #' Calculate purity metrics #' #' Create the qc table with index purity and cell line purity. @@ -59,67 +8,87 @@ get_index_summary= function(df, index_col, valid_indices) { #' @param value_col String name of the counts column present all three dataframes. 
#' @param file_path Location to write out the output. #' @returns Writes out a QC_table to the file_path. -create_qc_table= function(raw_counts_uncollapsed_filepath, unknown_reads, known_reads, filtered_counts, - value_col= 'n', file_path) { - # Validations: Check that the path works and that value_col exists in all tables +create_qc_table= function(raw_counts_uncollapsed_filepath, unknown_barcode_counts, + prism_barcode_counts, filtered_counts, + value_col= 'n', output_path) { + # Validation: Check that the file at the path exists if(!file.exists(raw_counts_uncollapsed_filepath)) { stop('Cannot find the raw counts uncollapsed file.') } + + # Pull out only the headers of the large file for validation rcu_headers= data.table::fread(raw_counts_uncollapsed_filepath, header= TRUE, sep= ',', nrow= 0) + + # Validation: Check that value_col exists in raw_counts_uncollapsed if(!validate_columns_exist(value_col, rcu_headers)) { - stop(paste0('The column ', value_col, " was not detected in uncollapsed raw counts.")) + stop(paste0('The column ', value_col, ' was not detected in uncollapsed raw counts.')) } - if(!validate_columns_exist(value_col, unknown_reads)) { - stop(paste0('The column ', value_col, " was not detected in unknown_reads.csv")) + + # Validation: Check that value_col exists in unknown_barcode_counts + if(!validate_columns_exist(value_col, unknown_barcode_counts)) { + stop(paste0('The column ', value_col, ' was not detected in unknown_barcode_counts.csv')) } - if(!validate_columns_exist(value_col, known_reads)) { - stop(paste0('The column ', value_col, " was not detected in known_reads.csv")) + + # Validation: Check that value_col exists in prism_barcode_counts + if(!validate_columns_exist(value_col, prism_barcode_counts)) { + stop(paste0('The column ', value_col, ' was not detected in prism_barcode_counts.csv')) } + + # Validation: Check that value_col exists in filtered_counts if(!validate_columns_exist(value_col, filtered_counts)) { - stop(paste0('The column ', value_col, " was not detected in filtered_counts.csv")) + stop(paste0('The column ', value_col, ' was not detected in filtered_counts.csv')) } - # Calculate purities - # Determine total number of reads + # Determine total number of reads in raw_counts_uncollapsed using chunking chunk_sum= process_in_chunks(large_file_path= raw_counts_uncollapsed_filepath, chunk_size= 10^6, action= function(x) data.table::as.data.table(sum(x[[value_col]]))) - total_num_reads= sum(unlist(test_sum)) + total_num_reads= sum(unlist(chunk_sum)) + + # Determine number of reads that mapped to valid PCR locations + # These reads have the correct index barcodes + total_valid_pcr_reads= sum(unknown_barcode_counts[[value_col]]) + sum(prism_barcode_counts[[value_col]]) # Calculate purities - index_purity= (sum(unknown_reads[[value_col]]) + sum(known_reads[[value_col]])) / total_num_reads + # Index purity is the fraction of reads that mapped to valid PCR locations out of the total number of reads. + index_purity= total_valid_pcr_reads / total_num_reads print(paste0('Index purity: ', round(index_purity, 4))) - cell_line_purity= sum(filtered_counts[[value_col]]) / (sum(unknown_reads[[value_col]]) + sum(known_reads[[value_col]])) + # Cell line purity is the fraction of reads that are identified as cell lines or control barcodes out of valid PCR reads.
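+  # Illustrative numbers (hypothetical, not from a real run): with 1,000,000 total
+  # reads, 900,000 reads at valid PCR locations, and 855,000 of those matching known
+  # cell line or control barcodes, index_purity would be 0.9 and cell_line_purity 0.95.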
+ cell_line_purity= sum(filtered_counts[[value_col]]) / total_valid_pcr_reads print(paste0('Cell line purity: ', round(cell_line_purity, 4))) - qc_table= data.frame(index_purity= index_purity, cell_line_purity= cell_line_purity) - # Write out table - print(paste0('Writing QC table out to ', file_path)) - qc_table %>% write.csv(file_path, row.names= FALSE, quote= FALSE) + # Write out QC table + qc_table= data.frame(index_purity= index_purity, cell_line_purity= cell_line_purity) + print(paste0('Writing QC table out to ', output_path)) + qc_table %>% write.csv(output_path, row.names= FALSE, quote= FALSE) } -process_in_chunks= function(large_file_path, chunk_size= 10^6, action, ...) { - - header_col_names= data.table::fread(large_file_path, header= TRUE, sep= ',', nrow= 0) %>% colnames() - chunk_idx= 1 # Counter to keep track of chunks in a loop - current_chunk_size= chunk_size # Variable for loop exit condition - chunk_collector= list() # List to collect processed chunks - - # For each chunk, call an action - while(current_chunk_size == chunk_size) { - current_chunk= data.table::fread(large_file_path, header= FALSE, sep= ',', - col.names= header_col_names, - nrow= chunk_size, skip= chunk_size * (chunk_idx - 1) + 1) - - current_chunk_size= nrow(current_chunk) # set current chunk size to stop loop - print(paste('Working on chunk', chunk_idx, 'with', current_chunk_size, 'rows.', sep= ' ')) - - chunk_collector[[chunk_idx]]= do.call(action, list(current_chunk, ...)) - chunk_idx= chunk_idx + 1 - } - - output_table= data.table::rbindlist(chunk_collector) - return(output_table) } +#' Calculate index summaries +#' +#' Generates some simple summaries for each unique index. +#' +#' @import tidyverse +#' @param df A dataframe which must contain the column "n" which represents the count of a read. +#' @param index_col The name of the column containing the index barcodes as a string. This column must be present in "df". +#' @param valid_indices A vector of all the valid indices for "index_col". +#' @returns A dataframe with the following columns: +#' - index_col: String, The column containing the index barcodes. +#' - idx_n: Numeric, Number of reads associated with a specific index barcode. +#' - fraction: Numeric, "idx_n" divided by the total number of reads in the run. +#' - expected: Boolean, True if the index barcode is in "valid_indices" otherwise False. +#' - contains_n: Boolean, True if the index barcode contains "N" in its sequence, otherwise False. +#' - lv_dist: Numeric, Edit distance from a valid index barcode. +#' - ham_dist: Numeric, Hamming distance from a valid index barcode.
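+#' @examples
+#' # Hypothetical sketch (toy values, not pipeline data):
+#' # toy_df= data.table::data.table(index_1= c('ACTGACTG', 'ACTGACTN'), n= c(950, 50))
+#' # get_index_summary(toy_df, 'index_1', valid_indices= c('ACTGACTG'))
+#' # The second barcode would be reported with expected= FALSE, contains_n= TRUE, and
+#' # lv_dist= ham_dist= 1, since it is one substitution away from a valid index.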
+get_index_summary= function(df, index_col, valid_indices) { + output_summary= df %>% dplyr::group_by(pick(all_of(index_col))) %>% + dplyr::summarise(idx_n= sum(n)) %>% dplyr::ungroup() %>% + dplyr::mutate(fraction= round(idx_n/sum(idx_n), 5), + expected= ifelse(.[[index_col]] %chin% valid_indices, T, F), + contains_n= ifelse(grepl('N', .[[index_col]]), T, F), + lv_dist= apply(stringdist::stringdistmatrix(.[[index_col]], valid_indices, method="lv"), 1, min), + ham_dist= apply(stringdist::stringdistmatrix(.[[index_col]], valid_indices, method="hamming"), 1, min)) %>% + dplyr::arrange(desc(fraction)) + return(output_summary) } #' Total counts barplot @@ -142,18 +111,18 @@ create_total_counts_barplot= function(filtered_counts, id_cols, facet_col= NA) { # Sum up reads total_counts= filtered_counts %>% dplyr::mutate(barcode_type= case_when(!is.na(CCLE_name) ~ 'cell line', - !is.na(Name) ~ 'ctrl barcode')) %>% + !is.na(cb_name) ~ 'ctrl barcode')) %>% tidyr::unite(all_of(id_cols), col= 'sample_id', sep= ':', remove= FALSE, na.rm= FALSE) %>% dplyr::group_by(pick(all_of(na.omit(c('sample_id', facet_col, 'barcode_type'))))) %>% dplyr::summarise(total_counts= sum(n)) %>% dplyr::ungroup() # Create total counts plot total_counts_plot= total_counts %>% - ggplot(aes(x=sample_id, y=total_counts, fill=barcode_type)) + - geom_col(alpha=0.75, position='identity') + - geom_hline(yintercept= 10^4, linetype=2) + + ggplot(aes(x= sample_id, y= total_counts, fill= barcode_type)) + + geom_col(alpha= 0.75, position= 'identity') + + geom_hline(yintercept= 10^4, linetype= 2) + {if(!is.na(facet_col)) facet_wrap(~.data[[facet_col]], scale= 'free_x')} + - labs(x= "Sample constructed using id_cols", y="Total counts", fill= 'Barcode\ntype', + labs(x= 'Sample constructed using id_cols', y= 'Total counts', fill= 'Barcode\ntype', title= 'Filtered counts - unstacked') + theme_bw() + theme(axis.text.x = element_text(angle=70, hjust=1)) @@ -181,7 +150,7 @@ create_recovery_barplot= function(filtered_counts, id_cols, facet_col= NA, value plot_type= 'percent', include_ctrl_bcs= FALSE) { # Validation: Check that id_cols, facet_col, or value_col exist in filtered counts. if(!validate_columns_exist(na.omit(c(id_cols, facet_col, value_col)), filtered_counts)) { - stop('Some input columns were not detected in filtered counts.') + stop('In create_recovery_barplot, some required input columns were not detected.') } # Filter out control barcodes if it is specified. @@ -215,9 +184,9 @@ create_recovery_barplot= function(filtered_counts, id_cols, facet_col= NA, value # Create recovery plot. recov_plot= recovery %>% ggplot(aes(x= sample_id, y= .data[[y_col]], fill= reorder(detect_type, dplyr::desc(detect_type)))) + - geom_col(alpha=0.75, position='stack') + + geom_col(position= 'stack', alpha= 0.75) + {if(!is.na(facet_col)) facet_wrap(~.data[[facet_col]], scale= 'free_x')} + - labs(x= "Sample constructed using id_cols", y= y_text, fill= '', title= 'Cell line recovery') + + labs(x= 'Sample constructed using id_cols', y= y_text, fill= '', title= 'Cell line recovery') + theme_bw() + theme(axis.text.x = element_text(angle=70, hjust=1)) return(recov_plot) @@ -245,7 +214,12 @@ create_cdf_plot= function(input_df, id_cols, counts_col= 'n', mark1= 0.5, mark2= contains_cbs= FALSE, order_aucs= FALSE) { # Validation: Check that id_cols and counts_col are in the input dataframe. 
if(!validate_columns_exist(c(id_cols, counts_col), input_df)) { - stop('Some input columns were not detected in the cdf input dataframe.') + stop('In create_cdf_plot, some required input columns were not detected.') + } + + # Validation: mark1 should be less than mark2. + if(mark1 > mark2 | mark1 < 0 | mark1 > 1) { + stop('Mark values must be between 0 and 1. mark1 should be less than mark2') } # Determine percentages, ranks and cumulative percentages @@ -254,12 +228,7 @@ create_cdf_plot= function(input_df, id_cols, counts_col= 'n', mark1= 0.5, mark2= dplyr::mutate(expected_num_cls= dplyr::n(), total_counts= sum(.data[[counts_col]]), pct_counts= .data[[counts_col]]/total_counts, cum_pct= cumsum(pct_counts), - rank= row_number(), rank_pct= rank/expected_num_cls) %>% dplyr::ungroup() - - # Validation: mark1 should be less than mark2. - if(mark1 > mark2 | mark1 < 0 | mark1 > 1) { - stop('Mark values must be between 0 and 1 Mark1 should be less than mark2') - } + rank= row_number(), rank_pct= rank / expected_num_cls) %>% dplyr::ungroup() # Find the number of cell lines needed to reach mark1 and mark2 mark1_values= calc_cummulative %>% dplyr::filter(cum_pct >= mark1) %>% @@ -286,7 +255,7 @@ create_cdf_plot= function(input_df, id_cols, counts_col= 'n', mark1= 0.5, mark2= # Create plot output_plot= data_for_plot %>% - ggplot(aes(x= rank_pct, y=cum_pct)) + + ggplot(aes(x= rank_pct, y= cum_pct)) + # Color control barcodes if specified {if(contains_cbs) geom_point(. %>% dplyr::filter(!is.na(Name)), mapping= aes(x= rank_pct, y=cum_pct, color=reorder(Name, log2_dose)), size= 2)} + @@ -304,7 +273,7 @@ create_cdf_plot= function(input_df, id_cols, counts_col= 'n', mark1= 0.5, mark2= geom_label(. %>% dplyr::filter(!is.na(auc)), mapping= aes(label= paste0('AUC ', round(auc, 3))), x= 1, y= 0.25, hjust= 'inward', vjust= 'inward', color= 'black') + facet_wrap(~facet_name) + - labs(x='% rank of unique reads', y='Cumulative percentage', color= 'CBs') + theme_bw() + labs(x= '% rank of unique reads', y= 'Cumulative percentage', color= 'CBs') + theme_bw() return(output_plot) } @@ -408,11 +377,11 @@ create_cor_heatmap= function(input_df, row_id_cols, col_id_cols, value_col, cor_heatmap= correlation_mx %>% reshape2::melt() %>% ggplot(aes(x= Var1, y= Var2, fill= value)) + geom_tile() + - labs(x= '', y= '', fill= '', title= paste0('Correlations using ', value_col)) + scale_fill_gradientn(breaks= c(0, 0.5, 1), colours= c('blue', 'white','red'), limits=c(0, 1), oob= scales::squish) + - theme(axis.text.x = element_text(angle=70, hjust=1)) + labs(x= '', y= '', fill= '', title= paste0('Correlations using ', value_col)) + + theme_bw() + theme(axis.text.x = element_text(angle=70, hjust=1)) return(cor_heatmap) } @@ -497,13 +466,16 @@ create_replicate_scatterplots= function(input_df, cell_line_cols, replicate_grou #' @param reverse_index2 Boolean set to TRUE if the sequencing involved the reverse complement workflow. #' @param out Path to the directory to save the QC images. #' @returns NA. QC images are written out to the specified folder. 
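#' @examples
#' # Hypothetical call (a sketch only; assumes the counts tables are already read in):
#' # QC_images(raw_counts_uncollapsed_filepath= 'raw_counts_uncollapsed.csv',
#' #           prism_barcode_counts= prism_barcode_counts,
#' #           unknown_barcode_counts= unknown_barcode_counts,
#' #           annotated_counts= annotated_counts, l2fc= l2fc, sample_meta= sample_meta,
#' #           sig_cols= c('cell_set', 'treatment', 'dose', 'dose_unit', 'day'), out= getwd())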
-QC_images= function(raw_counts_uncollapsed, raw_counts, +QC_images= function(raw_counts_uncollapsed_filepath, + prism_barcode_counts, unknown_barcode_counts, annotated_counts, normalized_counts= NA, l2fc, sample_meta, - cell_line_cols, - id_cols= c('pcr_plate', 'pcr_well'), sig_cols, + id_cols= c('pcr_plate', 'pcr_well'), + cell_line_cols= c('DepMap_ID'), + sig_cols, control_type= 'negcon', count_threshold= 40, reverse_index2= FALSE, out = NA) { + # Required packages ---- require(tidyverse) require(magrittr) @@ -515,46 +487,51 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, # Some preprocessing ---- # Set out directory if none is specified. - if(is.na(out)) {out = getwd()} + if(is.na(out)) {out= getwd()} - # Create empty vector to collect potential errors. + # Create empty vector to collect potential errors when running QCs skipped_qcs= c() # Count number of distinct profile to help scale some plots. num_profiles= annotated_counts %>% dplyr::distinct(pick(all_of(id_cols))) %>% nrow() - # Detect control barcodes + # Detect if there are wells with control barcodes. cb_check= sample_meta %>% dplyr::filter(control_barcodes %in% c("Y", "T", T), !(trt_type %in% c("empty", "", "CB_only")) & !is.na(trt_type)) contains_cbs= ifelse(nrow(cb_check)!= 0, TRUE, FALSE) - # Pull filtered counts from annotated counts + # Create filtered_counts df from annotated_counts filtered_counts= annotated_counts %>% dplyr::filter(expected_read) # Sequencing QCs ____________________ ---- ## 1. Purity metrics ---- print('1. Generating QC table ...') - create_qc_table(raw_counts_uncollapsed, - unknown_reads= unknown_reads, - known_reads= known_reads, - filtered_counts, + create_qc_table(raw_counts_uncollapsed_path= raw_counts_uncollapsed_path, + unknown_barcode_counts= unknown_barcode_counts, + prism_barcode_counts= prism_barcode_counts, + filtered_counts= filtered_counts, value_col= 'n', file_path= paste0(out, '/QC_table.csv')) ## 2. Index count summaries ---- - print("2. Generating index counts tables ...") + print('2. Generating index counts tables ...') # Pull out headers to perform checks - raw_counts_uncollapsed_headers= data.table::fread(raw_counts_file_path, header= TRUE, sep= ',', nrow= 0) + raw_counts_uncollapsed_headers= data.table::fread(raw_counts_uncollapsed_path, header= TRUE, sep= ',', nrow= 0) # Check that "index_1" is present. If so, calculate index summary and write out. 
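  # Note: the fread above uses nrow= 0, which returns only the column names, so this
  # check stays cheap even when the uncollapsed raw counts file cannot fit in memory.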
- if('index_1' %in% colnames(sample_meta) & 'index_1' %in% colnames(raw_counts_uncollapsed_headers)) { + if('index_1' %in% colnames(sample_meta) & 'index_1' %in% colnames(raw_counts_uncollapsed_headers)) { + # Aggregate over index_1 using chunks + # Action is set to a data.table summarize with summing + index1_chunks= process_in_chunks(large_file_path= raw_counts_uncollapsed_path, chunk_size= 10^6, + action= function(x) x[, list(n= sum(n)), by= index_1]) + + # Create vector of unique index_1 values expected_index1= unique(sample_meta$index_1) - # Aggregate by index_1 using chunks - index1_chunks= process_in_chunks(large_file_path= raw_counts_file_path, chunk_size= 10^6, - action= function(x) x[, list(n= sum(n)), by= index_1]) - index1_counts= get_index_summary(index1_chunks, 'index_1', expected_index1) - index1_counts %>% write.csv(file= paste(out, 'index1_counts.csv', sep='/'), row.names=F) + + # Call get_index_summary over index1_chunks as a full table, then write out table + index1_counts= get_index_summary(data.table::rbindlist(index1_chunks), 'index_1', expected_index1) + index1_counts %>% write.csv(file= paste(out, 'index1_counts.csv', sep= '/'), row.names= FALSE, quote= FALSE) } else { print('Column "index_1" not detected. Skipping index 1 summaries ...', quote= FALSE) } @@ -567,20 +544,23 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, } if('index_2' %in% colnames(sample_meta) & 'index_2' %in% colnames(raw_counts_uncollapsed_headers)) { - expected_index2= unique(sample_meta$index_2) - - # Aggregate by index_2 using chunks - index2_chunks= process_in_chunks(large_file_path= raw_counts_file_path, chunk_size= 10^6, + # Aggregate over index_2 using chunks + # Action is set to a data.table summarize with summing + index2_chunks= process_in_chunks(large_file_path= raw_counts_uncollapsed_path, chunk_size= 10^6, action= function(x) x[, list(n= sum(n)), by= index_2]) - index2_counts= get_index_summary(raw_counts_uncollapsed, 'index_2', expected_index2) - index2_counts %>% write.csv(file= paste(out, 'index2_counts.csv', sep='/'), row.names=F) + # Create vector of unique index_2 values + expected_index2= unique(sample_meta$index_2) + + # Call get_index_summary over index2_chunks as a full table, then write out table + index2_counts= get_index_summary(data.table::rbindlist(raw_counts_uncollapsed), 'index_2', expected_index2) + index2_counts %>% write.csv(file= paste(out, 'index2_counts.csv', sep= '/'), row.names= FALSE, quote= FALSE) } else { print('Column "index_2" not detected. Skipping index 2 summaries ...', quote= FALSE) } ## 3. Total counts ---- - print("3. Generating total_counts image ...") + print('3. Generating total_counts image ...') potential_error= base::tryCatch({ tc= create_total_counts_barplot(filtered_counts, id_cols, facet_col= 'pcr_plate') @@ -602,13 +582,13 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, # Assay QCs _________________________ ---- ## 4. Cell lines recovered ---- - print("4. Generating cell_lines_present image ...") + print('4.
Generating cell_lines_present image ...') potential_error= base::tryCatch({ cl_rec= create_recovery_barplot(filtered_counts, id_cols= id_cols, facet_col= 'pcr_plate', count_threshold= count_threshold, plot_type= 'percent') - pdf(file=paste(out, "cell_lines_present.pdf", sep="/"), - width=sqrt(num_profiles)*2, height=sqrt(num_profiles)) + pdf(file= paste(out, "cell_lines_present.pdf", sep="/"), + width= sqrt(num_profiles)*2, height= sqrt(num_profiles)) print(cl_rec) dev.off() rm(cl_rec) @@ -626,13 +606,13 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, ## 5. Cell line contaminants ---- print('5. Generating cell line contaminants ...') potential_error= base::tryCatch({ - contams= annotated_counts %>% dplyr::filter(expected_read == F) %>% + contams= annotated_counts %>% dplyr::filter(expected_read == FALSE) %>% dplyr::mutate(barcode_id= ifelse(is.na(CCLE_name), Name, CCLE_name)) %>% dplyr::group_by(forward_read_cl_barcode, barcode_id) %>% - dplyr::summarise(num_wells= n(), median_n=median(n), max_n= max(n)) %>% ungroup() %>% + dplyr::summarise(num_wells= n(), median_n= median(n), max_n= max(n)) %>% ungroup() %>% dplyr::arrange(desc(num_wells)) - contams %>% write.csv(file= paste(out, 'contam_cell_lines.csv', sep='/'), row.names=F) + contams %>% write.csv(file= paste(out, 'contam_cell_lines.csv', sep='/'), row.names= FALSE, quote= FALSE) rm(contams) }, error= function(e) { print(e) @@ -645,34 +625,8 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, skipped_qcs = c(skipped_qcs, potential_error) } - ## 6. Contaminant reads ---- - print('6. Generating contaminant reads ...') - potential_error= base::tryCatch({ - # watered down version - summed_unknown_reads= unknown_reads[, list(num_reads = sum(n), num_wells= .N), - by= base::mget('forward_read_cl_barcode')] - summed_contams= annotated_counts[expected_read == FALSE, list(num_reads = sum(n), num_wells= .N), - by= base::mget(c('forward_read_cl_barcode', 'DepMap_ID', 'cb_name'))] - summed_contams[, barcode_name:= ifelse(is.na(DepMap_ID), cb_name, DepMap_ID)][,DepMap_ID:= NULL] - - contam_reads= data.table::rbindlist(list(summed_contams, summed_unknown_reads), fill= TRUE) %>% - dplyr::arrange(dplyr::desc(num_reads)) - - # write out - contam_reads %>% write.csv(paste0(out, 'contam_reads.csv'), row.names=F) - }, error= function(e) { - print(e) - print('Encountered an error when creating the contams reads file. Skipping this output ...') - return('contam reads') - }) - - # Collect returned string if an error occurred - if(!is.null(potential_error)) { - skipped_qcs = c(skipped_qcs, potential_error) - } - - ## 7. Cumulative counts by lines in negcons ---- - print("7. Generating cumulative image ...") + ## 6. Cumulative counts by lines in negcons ---- + print('6. 
Generating cumulative image ...') potential_error= base::tryCatch({ cdf_plot= create_cdf_plot(filtered_counts %>% dplyr::filter(trt_type == control_type), id_cols= id_cols, @@ -681,8 +635,8 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, contains_cbs= contains_cbs, order_aucs= TRUE) + labs(title= 'Cumulative reads in negative controls.') - pdf(file=paste(out, "cdf_plot.pdf", sep="/"), - width=sqrt(num_profiles)*2, height=sqrt(num_profiles)) + pdf(file=paste(out, 'cdf_plot.pdf', sep= '/'), + width= sqrt(num_profiles) * 2, height= sqrt(num_profiles)) print(cdf_plot) dev.off() rm(cdf_plot) @@ -694,18 +648,18 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, # Collect returned string if an error occurred if(!is.null(potential_error)) { - skipped_qcs = c(skipped_qcs, potential_error) + skipped_qcs= c(skipped_qcs, potential_error) } - ## 8. Control barcode trends ---- + ## 7. Control barcode trends ---- if(contains_cbs & is.data.frame(normalized_counts)) { - print("8. Generating control_barcode_trend image") + print('7. Generating control_barcode_trend image') potential_error= base::tryCatch({ trend_sc= create_ctrlBC_scatterplots(normalized_counts %>% dplyr::filter(control_barcodes %in% c("Y", "T", T)), id_cols, value_col= 'log2_n') pdf(file=paste(out, "control_barcode_trend.pdf", sep="/"), - width=sqrt(num_profiles)*2, height=sqrt(num_profiles)*2) + width= sqrt(num_profiles) * 2, height= sqrt(num_profiles) * 2) print(trend_sc) dev.off() rm(cb_trend, trend_sc) @@ -717,25 +671,25 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, # Collect returned string if an error occurred if(!is.null(potential_error)) { - skipped_qcs = c(skipped_qcs, potential_error) + skipped_qcs= c(skipped_qcs, potential_error) } } else { - print('8. No control barcodes detected. Skipping control_barcode_trend image.') + print('7. No control barcodes detected. Skipping control_barcode_trend image.') } - ## 9. Sample correlation ----- - print("9. Generating sample_cor image ...") + ## 8. Sample correlation ----- + print('8. Generating sample_cor image ...') potential_error= base::tryCatch({ cor_df= filtered_counts %>% - dplyr::filter(!is.na(DepMap_ID), !is.na(trt_type), !trt_type %in% c("empty", "", "CB_only")) %>% + dplyr::filter(!is.na(DepMap_ID), !is.na(trt_type), !trt_type %in% c('empty', '', 'CB_only')) %>% dplyr::mutate(log2_n= log2(n + 1)) cp= create_cor_heatmap(input_df= cor_df, row_id_cols= c('DepMap_ID'), col_id_cols= c(sig_cols, id_cols), value_col= 'log2_n') - pdf(file=paste(out, "sample_cor.pdf", sep="/"), - width=sqrt(num_profiles)*2, height=sqrt(num_profiles)*2) + pdf(file= paste(out, 'sample_cor.pdf', sep= '/'), + width= sqrt(num_profiles) * 2, height= sqrt(num_profiles) * 2) print(cp) dev.off() rm(correlation_matrix, cp) @@ -747,16 +701,16 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, # Collect returned string if an error occurred if(!is.null(potential_error)) { - skipped_qcs = c(skipped_qcs, potential_error) + skipped_qcs= c(skipped_qcs, potential_error) } - ## 10. Tech rep correlations ---- + ## 9. Tech rep correlations ---- if(is.data.frame(normalized_counts) & 'tech_rep' %in% colnames(normalized_counts)) { # Check if there are more at least two tech reps unique_tech_reps= na.omit(unique(normalized_counts$tech_rep)) if(length(unique_tech_reps) >= 2) { - print("10. Generating tech rep correlations image ...") + print('9. 
Generating tech rep correlations image ...') # Set up replicate groups depending "bio_rep" column if('bio_rep' %in% colnames(normalized_counts) & !'bio_rep' %in% sig_cols) { replicate_group_cols= c(sig_cols, 'bio_rep') @@ -794,17 +748,17 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, # Collect returned string if an error occurred if(!is.null(potential_error)) { - skipped_qcs = c(skipped_qcs, potential_error) + skipped_qcs= c(skipped_qcs, potential_error) } } else { - print('10. No technical replicates detected. Skipping tech_reps scatter plot.') + print('9. No technical replicates detected. Skipping tech_reps scatter plot.') } } else { - print('10. No technical replicates detected. Skipping tech_reps scatter plot.') + print('9. No technical replicates detected. Skipping tech_reps scatter plot.') } - ## 11. Bio rep correlations ---- + ## 10. Bio rep correlations ---- if('bio_rep' %in% colnames(l2fc)) { unique_bio_reps= na.omit(unique(l2fc$bio_rep)) @@ -812,6 +766,7 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, l2fc_with_log2= l2fc %>% dplyr::mutate(log2_mean_normalized_n= log2(mean_normalized_n)) # Bio replicate scatter plots + # This is just another visualization that isn't being used. # bio_reps_plt= create_replicate_scatterplots(input_df= l2fc_with_log2s, # cell_line_cols= cell_line_cols, # replicate_group_cols= sig_cols, @@ -827,15 +782,15 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, # } # Bio replicate heatmap - print("11. Generating bio rep correlations heatmap ...") + print('10. Generating bio rep correlations heatmap ...') potential_error= base::tryCatch({ bio_corr_hm= create_cor_heatmap(input_df= l2fc_with_log2, row_id_cols= cell_line_cols, col_id_cols= c(sig_cols, 'bio_rep'), value_col= 'l2fc', cor_method= 'pearson') - pdf(file=paste(out, "bio_corr_hm.pdf", sep="/"), - width=sqrt(num_profiles), height=sqrt(num_profiles)) + pdf(file= paste(out, 'bio_corr_hm.pdf', sep= '/'), + width= sqrt(num_profiles), height= sqrt(num_profiles)) print(bio_corr_hm) dev.off() }, error= function(e) { @@ -846,11 +801,11 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, # Collect returned string if an error occurred if(!is.null(potential_error)) { - skipped_qcs = c(skipped_qcs, potential_error) + skipped_qcs= c(skipped_qcs, potential_error) } } else { - print('11. No biological replicates detected. Skipping bio_rep heatmap.') + print('10. No biological replicates detected. 
Skipping bio_rep heatmap.') } } From a2cd665567a0fb854992f1df8d9c7f25de56e38d Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 13:51:51 -0400 Subject: [PATCH 085/127] Fixed a bug --- scripts/collate_fastq_reads.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/collate_fastq_reads.R b/scripts/collate_fastq_reads.R index 850094a5..89421e57 100644 --- a/scripts/collate_fastq_reads.R +++ b/scripts/collate_fastq_reads.R @@ -63,7 +63,7 @@ chunked_results= process_in_chunks(large_file_path= args$raw_counts_uncollapsed, sample_meta= sample_meta, sequencing_index_cols= sequencing_index_cols, id_cols= id_cols, - known_barcodes= unique(cell_line_meta$Sequence, CB_meta$Sequence), + known_barcodes= unique(c(cell_line_meta$Sequence, CB_meta$Sequence)), reverse_index2= args$reverse_index2, barcode_col= args$barcode_col) From ce39711d653249fed97e7b449dc12c4017a33348 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 13:52:00 -0400 Subject: [PATCH 086/127] Corrected naming --- scripts/src/QC_images.R | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R index a88bb019..5b16b2c7 100755 --- a/scripts/src/QC_images.R +++ b/scripts/src/QC_images.R @@ -8,16 +8,16 @@ #' @param value_col String name of the counts column present all three dataframes. #' @param file_path Location to write out the output. #' @returns Writes out a QC_table to the file_path. -create_qc_table= function(raw_counts_uncollapsed_filepath, unknown_barcode_counts, +create_qc_table= function(raw_counts_uncollapsed_path, unknown_barcode_counts, prism_barcode_counts, filtered_counts, value_col= 'n', output_path) { # Validation: Check that the file at the path exists - if(!file.exists(raw_counts_uncollapsed_filepath)) { + if(!file.exists(raw_counts_uncollapsed_path)) { stop('Cannot find the raw counts uncollapsed file.') } # Pull out only the headers of the large file for validation - rcu_headers= data.table::fread(raw_counts_uncollapsed_filepath, header= TRUE, sep= ',', nrow= 0) + rcu_headers= data.table::fread(raw_counts_uncollapsed_path, header= TRUE, sep= ',', nrow= 0) # Validation: Check that value_col exists in raw_counts_uncollapsed if(!validate_columns_exist(value_col, rcu_headers)) { @@ -40,7 +40,7 @@ create_qc_table= function(raw_counts_uncollapsed_filepath, unknown_barcode_count } # Determine total number of reads in raw_counts_uncollapsed using chunking - chunk_sum= process_in_chunks(large_file_path= raw_counts_uncollapsed_filepath, + chunk_sum= process_in_chunks(large_file_path= raw_counts_uncollapsed_path, chunk_size= 10^6, action= function(x) data.table::as.data.table(sum(x[[value_col]]))) total_num_reads= sum(unlist(chunk_sum)) @@ -111,7 +111,7 @@ create_total_counts_barplot= function(filtered_counts, id_cols, facet_col= NA) { # Sum up reads total_counts= filtered_counts %>% dplyr::mutate(barcode_type= case_when(!is.na(CCLE_name) ~ 'cell line', - !is.na(cb_name) ~ 'ctrl barcode')) %>% + !is.na(Name) ~ 'ctrl barcode')) %>% tidyr::unite(all_of(id_cols), col= 'sample_id', sep= ':', remove= FALSE, na.rm= FALSE) %>% dplyr::group_by(pick(all_of(na.omit(c('sample_id', facet_col, 'barcode_type'))))) %>% dplyr::summarise(total_counts= sum(n)) %>% dplyr::ungroup() @@ -391,7 +391,7 @@ create_cor_heatmap= function(input_df, row_id_cols, col_id_cols, value_col, #' From a long table, creates scatter plots to two replicates. 
#' #' @import tidyverse -#' @import ggmisc +#' @import ggpmisc #' @param input_df Dataframe. #' @param cell_line_cols List of column names used to identify each cell line or control barcode. #' @param replicate_group_cols List of column names that describe a group of similar conditions. @@ -466,7 +466,7 @@ create_replicate_scatterplots= function(input_df, cell_line_cols, replicate_grou #' @param reverse_index2 Boolean set to TRUE if the sequencing involved the reverse complement workflow. #' @param out Path to the directory to save the QC images. #' @returns NA. QC images are written out to the specified folder. -QC_images= function(raw_counts_uncollapsed_filepath, +QC_images= function(raw_counts_uncollapsed_path, prism_barcode_counts, unknown_barcode_counts, annotated_counts, normalized_counts= NA, l2fc, sample_meta, @@ -483,7 +483,7 @@ QC_images= function(raw_counts_uncollapsed_filepath, require(reshape2) require(WGCNA) require(scales) - require(ggmisc) + require(ggpmisc) # Some preprocessing ---- # Set out directory if none is specified. @@ -511,7 +511,7 @@ QC_images= function(raw_counts_uncollapsed_filepath, unknown_barcode_counts= unknown_barcode_counts, prism_barcode_counts= prism_barcode_counts, filtered_counts= filtered_counts, - value_col= 'n', file_path= paste0(out, '/QC_table.csv')) + value_col= 'n', output_path= paste0(out, '/QC_table.csv')) ## 2. Index count summaries ---- print('2. Generating index counts tables ...') @@ -553,7 +553,7 @@ QC_images= function(raw_counts_uncollapsed_filepath, expected_index2= unique(sample_meta$index_2) # Call get_index_summary over index2_chunks as a full table, then write out table - index2_counts= get_index_summary(data.table::rbindlist(raw_counts_uncollapsed), 'index_2', expected_index2) + index2_counts= get_index_summary(data.table::rbindlist(index2_chunks), 'index_2', expected_index2) index2_counts %>% write.csv(file= paste(out, 'index2_counts.csv', sep= '/'), row.names= FALSE, quote= FALSE) } else { print('Column "index_2" not detected. Skipping index 2 summaries ...', quote= FALSE) @@ -572,7 +572,7 @@ QC_images= function(raw_counts_uncollapsed_filepath, }, error= function(e) { print(e) print('Encountered an error when creating the total counts barplot. 
Skipping this output ...') - return('QC table') + return('Total counts image') }) # Collect returned string if an error occurred if(!is.null(potential_error)) { skipped_qcs = c(skipped_qcs, potential_error) } From d230b495aa578bb86dd08b3fef7b9b32a9bbbbab Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 13:52:13 -0400 Subject: [PATCH 087/127] Corrected name --- scripts/filteredCounts_QC.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/filteredCounts_QC.R b/scripts/filteredCounts_QC.R index 525452f3..1287745b 100755 --- a/scripts/filteredCounts_QC.R +++ b/scripts/filteredCounts_QC.R @@ -65,7 +65,7 @@ sig_cols= unlist(strsplit(args$sig_cols, ',')) # Call QC images function ---- print("Calling QC images ...") -QC_images(raw_counts_uncollapsed_filepath= raw_counts_uncollapsed, +QC_images(raw_counts_uncollapsed_path= args$raw_counts_uncollapsed, prism_barcode_counts= prism_barcode_counts, unknown_barcode_counts= unknown_barcode_counts, annotated_counts= annotated_counts, From d3b413aaff8f5600eebd26a7003e73092dc0c834 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 13:52:26 -0400 Subject: [PATCH 088/127] Changed flag name --- scripts/join_metadata.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/join_metadata.R b/scripts/join_metadata.R index 877509f1..226858ba 100644 --- a/scripts/join_metadata.R +++ b/scripts/join_metadata.R @@ -8,7 +8,7 @@ parser <- ArgumentParser() parser$add_argument('--sample_meta', default= 'sample_meta.csv', help= 'Sample meta data for the sequencing run.') parser$add_argument("--assay_pool_meta", default="assay_pool_meta.txt", help = "Assay pool metadata") parser$add_argument('--lfc', default= 'l2fc.csv', help= 'L2FC data.') # level 4 -parser$add_argument('--collapsed_l2fc', default= 'collapsed_l2fc.csv', help= 'Collapsed l2fc data.') # level 5 +parser$add_argument('--collapsed_lfc', default= 'collapsed_l2fc.csv', help= 'Collapsed l2fc data.') # level 5 parser$add_argument('--sig_cols', default= 'cell_set,treatment,dose,dose_unit,day', help= 'Columns that uniquely identify a condition.') parser$add_argument('--out', default= getwd(), help= 'Path to the output directory.') From de5c54e1386e1fb92ff543b9bb6edc51ef05b7a6 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 14:15:31 -0400 Subject: [PATCH 089/127] Update and reorder parameters --- scripts/collapse_replicates.sh | 6 ++-- scripts/collate_fastq_reads.sh | 7 +++-- scripts/compute_l2fc.sh | 6 ++-- scripts/filter_counts.sh | 11 ++++--- scripts/filteredCounts_QC.sh | 31 +++++++++----------- scripts/join_metadata.sh | 12 ++++---- scripts/launch_job.sh | 51 +++++++++++++++++++-------------- scripts/make_config_file.groovy | 3 +- 8 files changed, 68 insertions(+), 59 deletions(-) diff --git a/scripts/collapse_replicates.sh b/scripts/collapse_replicates.sh index 0329b863..b2db2a75 100644 --- a/scripts/collapse_replicates.sh +++ b/scripts/collapse_replicates.sh @@ -29,9 +29,11 @@ echo LFC is: $LFC echo Rscript collapse_replicates.R -c $LFC \ --out $BUILD_DIR \ ---sig_cols $SIG_COLS +--sig_cols $SIG_COLS \ +--cell_line_cols $CELL_LINE_COLS Rscript collapse_replicates.R -c $LFC \ --out $BUILD_DIR \ ---sig_cols $SIG_COLS +--sig_cols $SIG_COLS \ +--cell_line_cols $CELL_LINE_COLS diff --git a/scripts/collate_fastq_reads.sh b/scripts/collate_fastq_reads.sh index 16735e99..6acea4ea 100644 --- a/scripts/collate_fastq_reads.sh +++ b/scripts/collate_fastq_reads.sh @@ -108,12 +108,13 @@ echo CELL_LINE_META is: $CELL_LINE_META args=( --raw_counts_uncollapsed "$RAW_COUNTS_UNCOLLAPSED" --sample_meta "$SAMPLE_META"
---out "$BUILD_DIR"
+--cell_line_meta "$CELL_LINE_META"
+--CB_meta "$CONTROL_BARCODE_META"
 --sequencing_index_cols="$SEQUENCING_INDEX_COLS"
 --id_cols "$ID_COLS"
 --reverse_index2 "$REVERSE_INDEX2"
---cell_line_meta "$CELL_LINE_META"
---CB_meta "$CONTROL_BARCODE_META"
+--barcode_col "$BARCODE_COL"
+--out "$BUILD_DIR"
 )
 
 echo Rscript collate_fastq_reads.R "${args[@]}"
diff --git a/scripts/compute_l2fc.sh b/scripts/compute_l2fc.sh
index 270a9651..6bbf9f27 100644
--- a/scripts/compute_l2fc.sh
+++ b/scripts/compute_l2fc.sh
@@ -32,7 +32,8 @@ echo Rscript compute_l2fc.R -c $NORMALIZED_COUNTS \
 --sig_cols $SIG_COLS \
 --ctrl_cols $CONTROL_COLS \
 --count_threshold $COUNT_THRESHOLD \
---normalized_counts $NORMALIZED_COUNTS
+--normalized_counts $NORMALIZED_COUNTS \
+--cell_line_cols $CELL_LINE_COLS
 
 Rscript compute_l2fc.R -c $NORMALIZED_COUNTS \
 --out $BUILD_DIR \
@@ -41,4 +42,5 @@ Rscript compute_l2fc.R -c $NORMALIZED_COUNTS \
 --sig_cols $SIG_COLS \
 --ctrl_cols $CONTROL_COLS \
 --count_threshold $COUNT_THRESHOLD \
---normalized_counts $NORMALIZED_COUNTS
+--normalized_counts $NORMALIZED_COUNTS \
+--cell_line_cols $CELL_LINE_COLS
diff --git a/scripts/filter_counts.sh b/scripts/filter_counts.sh
index 0130677f..76f72f55 100644
--- a/scripts/filter_counts.sh
+++ b/scripts/filter_counts.sh
@@ -29,11 +29,11 @@ else
 fi
 
 #Enforces abs paths
-if [[ "$RAW_COUNTS" = /* ]]
+if [[ "$PRISM_BARCODE_COUNTS" = /* ]]
 then
-    RAW_COUNTS=$(ls $RAW_COUNTS)
+    PRISM_BARCODE_COUNTS=$(ls $PRISM_BARCODE_COUNTS)
 else
-    RAW_COUNTS=$BUILD_DIR/$RAW_COUNTS
+    PRISM_BARCODE_COUNTS=$BUILD_DIR/$PRISM_BARCODE_COUNTS
 fi
 
 echo $CELL_LINE_META
@@ -74,21 +74,20 @@ fi
 
 echo Build dir is: $BUILD_DIR
 echo SAMPLE_META is: $SAMPLE_META
-echo RAW_COUNTS is: $RAW_COUNTS
+echo PRISM_BARCODE_COUNTS is: $PRISM_BARCODE_COUNTS
 echo CELL_LINE_META is: $CELL_LINE_META
 echo CONTROL_BARCODE_META is: $CONTROL_BARCODE_META
 echo CELL_SET_META is: $CELL_SET_META
 echo ID_COLS is: $ID_COLS
 
 args=(
--c "$RAW_COUNTS"
+--prism_barcode_counts "$PRISM_BARCODE_COUNTS"
 --sample_meta "$SAMPLE_META"
 --cell_line_meta "$CELL_LINE_META"
 --CB_meta "$CONTROL_BARCODE_META"
 --cell_set_meta "$CELL_SET_META"
 --id_cols "$ID_COLS"
 --out "$BUILD_DIR"
---count_threshold "$COUNT_THRESHOLD"
 --pool_id "$PULL_POOL_ID"
 --rm_data "$REMOVE_DATA"
 --assay_pool_meta "$ASSAY_POOL_META"
diff --git a/scripts/filteredCounts_QC.sh b/scripts/filteredCounts_QC.sh
index c2653172..bc826a64 100644
--- a/scripts/filteredCounts_QC.sh
+++ b/scripts/filteredCounts_QC.sh
@@ -65,11 +65,19 @@ else
 fi
 
 #Enforces abs paths
-if [[ "$RAW_COUNTS" = /* ]]
+if [[ "$PRISM_BARCODE_COUNTS" = /* ]]
 then
-    RAW_COUNTS=$(ls $RAW_COUNTS)
+    PRISM_BARCODE_COUNTS=$(ls $PRISM_BARCODE_COUNTS)
 else
-    RAW_COUNTS=$BUILD_DIR/$RAW_COUNTS
+    PRISM_BARCODE_COUNTS=$BUILD_DIR/$PRISM_BARCODE_COUNTS
+fi
+
+#Enforces abs paths
+if [[ "$UNKNOWN_BARCODE_COUNTS" = /* ]]
+then
+    UNKNOWN_BARCODE_COUNTS=$(ls $UNKNOWN_BARCODE_COUNTS)
+else
+    UNKNOWN_BARCODE_COUNTS=$BUILD_DIR/$UNKNOWN_BARCODE_COUNTS
 fi
 
 #Enforces abs paths
@@ -105,7 +113,6 @@ echo CONTROL_BARCODE_META is: $CONTROL_BARCODE_META
 echo COUNT_THRESHOLD is: $COUNT_THRESHOLD
 echo RAW_COUNTS_UNCOLLAPSED is: $RAW_COUNTS_UNCOLLAPSED
 echo LFC is: $LFC
-echo RAW_COUNTS is: $RAW_COUNTS
 echo REVERSE_INDEX2 is: $REVERSE_INDEX2
 
 args=(
@@ -117,23 +124,13 @@ args=(
 --count_threshold "$COUNT_THRESHOLD"
 --control_type "$CTL_TYPES"
 --raw_counts_uncollapsed "$RAW_COUNTS_UNCOLLAPSED"
---raw_counts "$RAW_COUNTS"
+--prism_barcode_counts "$PRISM_BARCODE_COUNTS"
+--unknown_barcode_counts "$UNKNOWN_BARCODE_COUNTS"
 --lfc "$LFC"
 --id_cols "$ID_COLS"
 --reverse_index2 "$REVERSE_INDEX2"
 )
 
-echo Rscript filteredCounts_QC.R --sample_meta $SAMPLE_META \
---annotated_counts $ANNOTATED_COUNTS \
---normalized_counts $NORMALIZED_COUNTS \
---sig_cols $SIG_COLS \
---out $BUILD_DIR \
---count_threshold $COUNT_THRESHOLD \
---reverse_index2 $REVERSE_INDEX2 \
---control_type $CTL_TYPES \
---raw_counts_uncollapsed $RAW_COUNTS_UNCOLLAPSED \
---raw_counts $RAW_COUNTS \
---lfc $LFC \
---id_cols $ID_COLS
+echo Rscript filteredCounts_QC.R "${args[@]}"
 
 Rscript filteredCounts_QC.R "${args[@]}"
\ No newline at end of file
diff --git a/scripts/join_metadata.sh b/scripts/join_metadata.sh
index d13f1698..dd736dd8 100644
--- a/scripts/join_metadata.sh
+++ b/scripts/join_metadata.sh
@@ -36,11 +36,11 @@ else
 fi
 
 #Enforces abs paths
-if [[ "$COLLAPSED_VALUES" = /* ]]
+if [[ "$COLLAPSED_LFC" = /* ]]
 then
-    COLLAPSED_VALUES=$(ls $COLLAPSED_VALUES)
+    COLLAPSED_LFC=$(ls $COLLAPSED_LFC)
 else
-    COLLAPSED_VALUES=$BUILD_DIR/$COLLAPSED_VALUES
+    COLLAPSED_LFC=$BUILD_DIR/$COLLAPSED_LFC
 fi
 
 #Enforces abs paths
@@ -61,18 +61,18 @@ fi
 
 echo Build dir is: $BUILD_DIR
 echo LFC is: $LFC
-echo COLLAPSED_VALUES is: $COLLAPSED_VALUES
+echo COLLAPSED_LFC is: $COLLAPSED_LFC
 echo SAMPLE_META is: $SAMPLE_META
 
 echo Rscript join_metadata.R --lfc $LFC \
---collapsed_l2fc $COLLAPSED_VALUES \
+--collapsed_lfc $COLLAPSED_LFC \
 --assay_pool_meta $ASSAY_POOL_META \
 --out $BUILD_DIR \
 --sig_cols $SIG_COLS \
 --sample_meta $SAMPLE_META
 
 Rscript join_metadata.R --lfc $LFC \
---collapsed_l2fc $COLLAPSED_VALUES \
+--collapsed_lfc $COLLAPSED_LFC \
 --assay_pool_meta $ASSAY_POOL_META \
 --out $BUILD_DIR \
 --sig_cols $SIG_COLS \
 
diff --git a/scripts/launch_job.sh b/scripts/launch_job.sh
index e2378644..1ed0d9d9 100644
--- a/scripts/launch_job.sh
+++ b/scripts/launch_job.sh
@@ -23,13 +23,17 @@ fi
 
 # List of parameters
 PARAMS=(
-    SEQ_TYPE API_URL BUILD_DIR INDEX_1 INDEX_2 BARCODE_SUFFIX REVERSE_INDEX2
-    SAMPLE_META CONTROL_BARCODE_META CTL_TYPES ID_COLS SIG_COLS
-    RUN_NORM CONTROL_COLS COUNT_THRESHOLD COUNT_COL_NAME BUILD_NAME
-    CONVERT_SUSHI PULL_POOL_ID RUN_EPS_QC PSEUDOCOUNT REMOVE_DATA DAYS
-    SEQUENCING_INDEX_COLS RAW_COUNTS CELL_SET_META CELL_LINE_META FILTERED_COUNTS
-    LFC COUNTS ANNOTATED_COUNTS COLLAPSED_VALUES NORMALIZED_COUNTS ASSAY_POOL_META
-    RAW_COUNTS_UNCOLLAPSED
+    SEQ_TYPE BUILD_DIR INDEX_1 INDEX_2 BARCODE_SUFFIX REVERSE_INDEX2 RUN_NORM BUILD_NAME
+    CONVERT_SUSHI PULL_POOL_ID RUN_EPS_QC REMOVE_DATA DAYS COUNTS
+    # metadata files
+    SAMPLE_META CELL_SET_META CELL_LINE_META CONTROL_BARCODE_META ASSAY_POOL_META
+    # sushi files
+    RAW_COUNTS_UNCOLLAPSED PRISM_BARCODE_COUNTS UNKNOWN_BARCODE_COUNTS ANNOTATED_COUNTS
+    FILTERED_COUNTS NORMALIZED_COUNTS LFC COLLAPSED_LFC
+    # column name parameters
+    SEQUENCING_INDEX_COLS ID_COLS CELL_LINE_COLS SIG_COLS CONTROL_COLS
+    # additional parameters
+    BARCODE_COL PSEUDOCOUNT COUNT_COL_NAME CTL_TYPES COUNT_THRESHOLD API_URL
 )
 
 # Load parameters
@@ -79,33 +83,36 @@ echo "Running in container:"
     -e BARCODE_SUFFIX="$BARCODE_SUFFIX" \
     -e REVERSE_INDEX2="$REVERSE_INDEX2" \
     -e SAMPLE_META="$SAMPLE_META" \
+    -e CELL_SET_META="$CELL_SET_META" \
+    -e CELL_LINE_META="$CELL_LINE_META" \
     -e CONTROL_BARCODE_META="$CONTROL_BARCODE_META" \
-    -e CTL_TYPES="$CTL_TYPES" \
+    -e ASSAY_POOL_META="$ASSAY_POOL_META" \
+    -e RAW_COUNTS_UNCOLLAPSED="$RAW_COUNTS_UNCOLLAPSED"\
+    -e PRISM_BARCODE_COUNTS="$PRISM_BARCODE_COUNTS"\
+    -e UNKNOWN_BARCODE_COUNTS="$UNKNOWN_BARCODE_COUNTS"\
+    -e ANNOTATED_COUNTS="$ANNOTATED_COUNTS" \
+    -e FILTERED_COUNTS="$FILTERED_COUNTS" \
+    -e
NORMALIZED_COUNTS="$NORMALIZED_COUNTS" \
+    -e LFC="$LFC" \
+    -e COLLAPSED_LFC="$COLLAPSED_LFC" \
+    -e SEQUENCING_INDEX_COLS="$SEQUENCING_INDEX_COLS" \
     -e ID_COLS="$ID_COLS" \
+    -e CELL_LINE_COLS="$CELL_LINE_COLS" \
     -e SIG_COLS="$SIG_COLS" \
-    -e RUN_NORM="$RUN_NORM" \
     -e CONTROL_COLS="$CONTROL_COLS" \
-    -e COUNT_THRESHOLD="$COUNT_THRESHOLD" \
+    -e BARCODE_COL="$BARCODE_COL" \
+    -e PSEUDOCOUNT="$PSEUDOCOUNT" \
     -e COUNT_COL_NAME="$COUNT_COL_NAME" \
+    -e CTL_TYPES="$CTL_TYPES" \
+    -e COUNT_THRESHOLD="$COUNT_THRESHOLD" \
+    -e RUN_NORM="$RUN_NORM" \
     -e BUILD_NAME="$BUILD_NAME" \
     -e CONVERT_SUSHI="$CONVERT_SUSHI" \
     -e PULL_POOL_ID="$PULL_POOL_ID" \
     -e RUN_EPS_QC="$RUN_EPS_QC" \
-    -e PSEUDOCOUNT="$PSEUDOCOUNT" \
     -e REMOVE_DATA="$REMOVE_DATA" \
     -e DAYS="$DAYS" \
-    -e SEQUENCING_INDEX_COLS="$SEQUENCING_INDEX_COLS" \
-    -e RAW_COUNTS="$RAW_COUNTS" \
-    -e CELL_SET_META="$CELL_SET_META" \
-    -e CELL_LINE_META="$CELL_LINE_META" \
-    -e FILTERED_COUNTS="$FILTERED_COUNTS" \
-    -e LFC="$LFC" \
     -e COUNTS="$COUNTS" \
-    -e ANNOTATED_COUNTS="$ANNOTATED_COUNTS" \
-    -e COLLAPSED_VALUES="$COLLAPSED_VALUES" \
-    -e NORMALIZED_COUNTS="$NORMALIZED_COUNTS" \
-    -e ASSAY_POOL_META="$ASSAY_POOL_META" \
-    -e RAW_COUNTS_UNCOLLAPSED="$RAW_COUNTS_UNCOLLAPSED"\
     -v "$WORKSPACE:/workspace" \
     -v /cmap/tools/analysis2clue/credentials:/root/.aws/credentials:ro \
     -v /local/jenkins/.clue_api_key:/local/jenkins/.clue_api_key:ro \
diff --git a/scripts/make_config_file.groovy b/scripts/make_config_file.groovy
index 513998b1..5b7ea040 100644
--- a/scripts/make_config_file.groovy
+++ b/scripts/make_config_file.groovy
@@ -54,6 +54,7 @@ pipeline {
     string(name: 'CONTROL_COLS', defaultValue: 'cell_set,day', description: 'Set of columns that define individual controls in COMPUTE_LFC')
 
     // Additional parameters
+    string(name: 'BARCODE_COL', defaultValue: 'forward_read_cl_barcode', description: 'In COLLATE_FASTQ_READS, the name of the column containing the read')
     string(name: 'PSEUDOCOUNT', defaultValue: '20', description: 'In CBNORMALIZE, the pseudocount value for log transformations.')
     string(name: 'COUNT_COL_NAME', defaultValue: 'normalized_n', description: 'In COMPUTE_LFC, the name of the numeric column to use for calculations')
     string(name: 'CTL_TYPES', defaultValue: 'negcon', description: 'In COMPUTE_LFC, the value in trt_type that indicates the negative controls')
@@ -125,7 +126,7 @@ pipeline {
         'SEQUENCING_INDEX_COLS', 'ID_COLS', 'CELL_LINE_COLS', 'SIG_COLS', 'CONTROL_COLS',
 
         // additional parameters
-        'PSEUDOCOUNT', 'COUNT_COL_NAME', 'CTL_TYPES', 'COUNT_THRESHOLD', 'API_URL'
+        'BARCODE_COL', 'PSEUDOCOUNT', 'COUNT_COL_NAME', 'CTL_TYPES', 'COUNT_THRESHOLD', 'API_URL'
       ]
 
       def config = [:]

From cb31403a0cac50afa7fcebf9a0e44549d1311808 Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Fri, 4 Oct 2024 14:20:25 -0400
Subject: [PATCH 090/127] Added commas

---
 scripts/make_config_file.groovy | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/make_config_file.groovy b/scripts/make_config_file.groovy
index 5b7ea040..eb039213 100644
--- a/scripts/make_config_file.groovy
+++ b/scripts/make_config_file.groovy
@@ -116,11 +116,11 @@ pipeline {
         'COUNTS', 'SCREEN',
 
         // metadata files
-        'SAMPLE_META', 'CELL_SET_META', 'CELL_LINE_META', 'CONTROL_BARCODE_META', 'ASSAY_POOL_META'
+        'SAMPLE_META', 'CELL_SET_META', 'CELL_LINE_META', 'CONTROL_BARCODE_META', 'ASSAY_POOL_META',
 
         // sushi files
         'RAW_COUNTS_UNCOLLAPSED', 'PRISM_BARCODE_COUNTS', 'UNKNOWN_BARCODE_COUNTS',
-        'ANNOTATED_COUNTS', 'FILTERED_COUNTS', 'NORMALIZED_COUNTS', 'LFC',
'COLLAPSED_LFC' + 'ANNOTATED_COUNTS', 'FILTERED_COUNTS', 'NORMALIZED_COUNTS', 'LFC', 'COLLAPSED_LFC', // column name parameters 'SEQUENCING_INDEX_COLS', 'ID_COLS', 'CELL_LINE_COLS', 'SIG_COLS', 'CONTROL_COLS', From b22ac03e54558a92620054da9e3e547800ff26ad Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 14:24:24 -0400 Subject: [PATCH 091/127] Removed raw_counts --- scripts/filter_counts.sh | 6 ------ 1 file changed, 6 deletions(-) diff --git a/scripts/filter_counts.sh b/scripts/filter_counts.sh index 76f72f55..fb8a2d9a 100644 --- a/scripts/filter_counts.sh +++ b/scripts/filter_counts.sh @@ -8,12 +8,6 @@ then exit -1 fi -if [ -z "$RAW_COUNTS" ] -then - echo RAW_COUNTS parameter empty - exit -1 -fi - if [ -z "$SAMPLE_META" ] then echo SAMPLE_META parameter empty From b171ff820cf56d6a2086db22719518d98e1a9fcd Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 14:32:15 -0400 Subject: [PATCH 092/127] Update name --- scripts/join_metadata.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/join_metadata.sh b/scripts/join_metadata.sh index dd736dd8..b4159ec1 100644 --- a/scripts/join_metadata.sh +++ b/scripts/join_metadata.sh @@ -15,7 +15,7 @@ then fi -if [ -z "$COLLAPSED_VALUES" ] +if [ -z "$COLLAPSED_LFC" ] then echo Collapsed l2fc parameter empty exit -1 From 05780aadeac274870d5c7640ff7efc5210c1dc1c Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 14:36:32 -0400 Subject: [PATCH 093/127] Dropped CCLE_name as a key --- scripts/join_metadata.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/join_metadata.R b/scripts/join_metadata.R index 226858ba..63433c8a 100644 --- a/scripts/join_metadata.R +++ b/scripts/join_metadata.R @@ -56,7 +56,7 @@ if(file.exists(args$lfc)) { if(assay_pool_meta_exists) { l2fc_with_meta_columns= join_metadata(input_df= l2fc_with_meta_columns, metadata= input_assay_pool_meta, - key_cols= c('DepMap_ID', 'CCLE_name', 'cell_set')) + key_cols= c('DepMap_ID', 'cell_set')) } else { print('WARNING: Assay pool meta not detected and will not be joined onto l2fc.') } @@ -83,7 +83,7 @@ if(file.exists(args$collapsed_l2fc)) { print('Attempting to add assay_pool_meta to collapsed l2fc.') collapsed_l2fc_with_meta_columns= join_metadata(input_df= collapsed_l2fc_with_meta_columns, metadata= input_assay_pool_meta, - key_cols= c('DepMap_ID', 'CCLE_name', 'cell_set')) + key_cols= c('DepMap_ID', 'cell_set')) } else { print('WARNING: Assay pool meta not detected and will not be joined onto collapsed l2fc.') } From c6dc30c406b6534c68fa9befa4e709dafdad3939 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 14:43:24 -0400 Subject: [PATCH 094/127] Fixed renaming --- scripts/join_metadata.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/join_metadata.R b/scripts/join_metadata.R index 63433c8a..63910922 100644 --- a/scripts/join_metadata.R +++ b/scripts/join_metadata.R @@ -64,13 +64,13 @@ if(file.exists(args$lfc)) { # Write out outpath= paste(args$out, 'l2fc_with_meta_columns.csv', sep='/') print(paste("Writing l2fc_with_meta_columns.csv to ", outpath)) - write.csv(l2fc_with_meta_columns, outpath, row.names= FALSE, quote= FALSE) + l2fc_with_meta_columns %>% write.csv(outpath, row.names= FALSE, quote= FALSE) } else { print('WARNING: l2fc.csv does not exist. 
Skipping this file.') } # Add sample meta and assay pool meta to collapsed_l2fc table ---- -if(file.exists(args$collapsed_l2fc)) { +if(file.exists(args$collapsed_lfc)) { print('Attempting to add sample_meta to collapsed l2fc.') collapsed_l2fc= data.table::fread(args$collapsed_l2fc, header= T, sep= ',') @@ -91,7 +91,7 @@ if(file.exists(args$collapsed_l2fc)) { # Write out outpath= paste(args$out, 'collapsed_l2fc_with_meta_columns.csv', sep='/') print(paste("Writing collapsed_l2fc_with_meta_columns.csv to ", outpath)) - write.csv(collapsed_l2fc_with_meta_columns, outpath, row.names= FALSE, quote= FALSE) + collapsed_l2fc_with_meta_columns %>% write.csv(outpath, row.names= FALSE, quote= FALSE) } else { print('WARNING: collapsed_l2fc.csv does not exist. Skipping this file.') } From e924827a02a0aa56e5726b285bd14b0e687b6447 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 14:45:20 -0400 Subject: [PATCH 095/127] Caught last bug --- scripts/join_metadata.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/join_metadata.R b/scripts/join_metadata.R index 63910922..79d442f6 100644 --- a/scripts/join_metadata.R +++ b/scripts/join_metadata.R @@ -72,7 +72,7 @@ if(file.exists(args$lfc)) { # Add sample meta and assay pool meta to collapsed_l2fc table ---- if(file.exists(args$collapsed_lfc)) { print('Attempting to add sample_meta to collapsed l2fc.') - collapsed_l2fc= data.table::fread(args$collapsed_l2fc, header= T, sep= ',') + collapsed_l2fc= data.table::fread(args$collapsed_lfc, header= T, sep= ',') # Add sample meta columns to collapsed l2fc collapsed_l2fc_with_meta_columns= join_metadata(input_df= collapsed_l2fc, metadata= sample_meta, From ec143ea824aa3e64765183910f037991a65ab028 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 15:06:42 -0400 Subject: [PATCH 096/127] Added kitchen utensils --- scripts/collapse_replicates.R | 1 + scripts/compute_l2fc.R | 1 + 2 files changed, 2 insertions(+) diff --git a/scripts/collapse_replicates.R b/scripts/collapse_replicates.R index 771b08f9..d48d54b1 100755 --- a/scripts/collapse_replicates.R +++ b/scripts/collapse_replicates.R @@ -5,6 +5,7 @@ library(tidyverse) suppressPackageStartupMessages(library(argparse)) suppressPackageStartupMessages(library(magrittr)) source("./src/collapse_bio_reps.R") +source("./src/kitchen_utensils.R") # Argument parser ---- parser <- ArgumentParser() diff --git a/scripts/compute_l2fc.R b/scripts/compute_l2fc.R index 38b51ddd..7ef42653 100755 --- a/scripts/compute_l2fc.R +++ b/scripts/compute_l2fc.R @@ -5,6 +5,7 @@ library(tidyverse) suppressPackageStartupMessages(library(argparse)) suppressPackageStartupMessages(library(dplyr)) source("./src/compute_l2fc.R") +source("./src/kitchen_utensils.R") # Argument parser ---- parser <- ArgumentParser() From 10f7c9e24d5e09d688a6e33f05a876d5f08abd75 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 15:07:00 -0400 Subject: [PATCH 097/127] Removed counts flag filter --- scripts/src/collapse_bio_reps.R | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/scripts/src/collapse_bio_reps.R b/scripts/src/collapse_bio_reps.R index 223d3d68..5c09eecb 100755 --- a/scripts/src/collapse_bio_reps.R +++ b/scripts/src/collapse_bio_reps.R @@ -1,19 +1,3 @@ -#' validate_columns_exist -#' -#' This function checks that a list of columns are present in a dataframe. 
-#' -#' @param selected_columns A vector of strings each representing a column name -#' @param df A dataframe to check against -#' @return Boolean -validate_columns_exist= function(selected_columns, df) { - # Check that all of selected_columns are in df - if(any(!selected_columns %in% colnames(df))) { - return(FALSE) - } else { - return(TRUE) - } -} - #' validate_num_bio_reps #' #' This function checks that all the expected flowcells are present in a table of detected flowcells. @@ -57,8 +41,7 @@ collapse_bio_reps= function(l2fc, sig_cols, cell_line_cols= c('project_code', 'D } # Median collapsing bio replicates ---- - collapsed_counts= l2fc %>% dplyr::filter(is.na(counts_flag)) %>% - tidyr::unite(col= 'sig_id', all_of(sig_cols), sep= ':', na.rm= FALSE, remove= FALSE) %>% + collapsed_counts= l2fc %>% tidyr::unite(col= 'sig_id', all_of(sig_cols), sep= ':', na.rm= FALSE, remove= FALSE) %>% dplyr::group_by(pick(all_of(c(cell_line_cols, 'sig_id', sig_cols)))) %>% dplyr::summarise(trt_median_n= median(mean_n), trt_median_normalized_n= median(mean_normalized_n), trt_mad_sqrtN= mad(log2(mean_normalized_n)) / sqrt(dplyr::n()), From a2259cf502a01a6c06d7e2e692662b6fc47cd922 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 15:09:14 -0400 Subject: [PATCH 098/127] Remove some columns Dropped num_tech_reps, a num_tech_rep check, control_MAD_QC, and counts_flag --- scripts/src/compute_l2fc.R | 31 ++++++------------------------- 1 file changed, 6 insertions(+), 25 deletions(-) diff --git a/scripts/src/compute_l2fc.R b/scripts/src/compute_l2fc.R index f89d9797..3007253f 100755 --- a/scripts/src/compute_l2fc.R +++ b/scripts/src/compute_l2fc.R @@ -1,19 +1,3 @@ -#' validate_columns_exist -#' -#' This function checks that a list of columns are present in a dataframe. 
-#' -#' @param selected_columns A vector of strings each representing a column name -#' @param df A dataframe to check against -#' @return Boolean -validate_columns_exist= function(selected_columns, df) { - # Check that all of selected_columns are in df - if(any(!selected_columns %in% colnames(df))) { - return(FALSE) - } else { - return(TRUE) - } -} - #' compute_l2fc #' #' takes normalized counts and computes log-fold change values as compared to the designated control condition @@ -68,13 +52,12 @@ compute_l2fc= function(normalized_counts, dplyr::filter(!(trt_type %in% c("empty", "", "CB_only")) & !is.na(trt_type), !is.na(CCLE_name)) %>% dplyr::group_by(pick(all_of(c(cell_line_cols, 'trt_type', bio_rep_id_cols, ctrl_cols)))) %>% dplyr::summarise(mean_n= mean(n), - mean_normalized_n = mean(!!rlang::sym(count_col_name)), - num_tech_reps= dplyr::n()) %>% dplyr::ungroup() + mean_normalized_n = mean(!!rlang::sym(count_col_name))) %>% dplyr::ungroup() # Print out the occurrence of each count of tech_reps - print('Number of technical replicate collapsed across all cell lines and biological replicates:') - print(collapsed_tech_rep %>% dplyr::group_by(num_tech_reps) %>% - dplyr::summarise(count= dplyr::n()) %>% dplyr::ungroup()) + # print('Number of technical replicate collapsed across all cell lines and biological replicates:') + # print(collapsed_tech_rep %>% dplyr::group_by(num_tech_reps) %>% + # dplyr::summarise(count= dplyr::n()) %>% dplyr::ungroup()) # Pull out negative controls and collapse any biological replicates ---- print('Collapsing control conditions on the following columns: ') @@ -84,8 +67,7 @@ compute_l2fc= function(normalized_counts, dplyr::summarise(control_median_n= median(mean_n), control_median_normalized_n = median(mean_normalized_n), control_mad_sqrtN = mad(log2(mean_normalized_n))/sqrt(dplyr::n()), - num_ctrl_bio_reps = dplyr::n()) %>% dplyr::ungroup() %>% - dplyr::mutate(control_MAD_QC = (control_mad_sqrtN <= 0.5/log10(2))) #%>% # New: adjusted cut off to log2 + num_ctrl_bio_reps = dplyr::n()) %>% dplyr::ungroup() # Validation: Check that negative controls were extracted ---- if(nrow(controls)==0) { @@ -95,8 +77,7 @@ compute_l2fc= function(normalized_counts, # Join neg_cons and compute l2fc ---- l2fc= collapsed_tech_rep %>% dplyr::filter(!trt_type %in% c(control_type, 'day_0')) %>% dplyr::inner_join(controls, by= c(cell_line_cols, ctrl_cols), relationship='many-to-one') %>% - dplyr::mutate(l2fc= log2(mean_normalized_n/control_median_normalized_n), - counts_flag= ifelse(control_median_n < count_threshold, paste0('negcon<', count_threshold), NA)) + dplyr::mutate(l2fc= log2(mean_normalized_n/control_median_normalized_n)) return(l2fc) } From cbbe3a7bc0ca1d1f3c50fe38cecdca7e65cef139 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 17:12:53 -0400 Subject: [PATCH 099/127] Updated QC job name Also added a text description test --- scripts/make_config_file.groovy | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/scripts/make_config_file.groovy b/scripts/make_config_file.groovy index eb039213..f122fc61 100644 --- a/scripts/make_config_file.groovy +++ b/scripts/make_config_file.groovy @@ -13,7 +13,7 @@ pipeline { booleanParam(name: 'CBNORMALIZE', defaultValue: true, description: 'Check this to trigger the CBnormalize job.') booleanParam(name: 'COMPUTE_LFC', defaultValue: true, description: 'Check this to trigger the compute_l2fc job.') booleanParam(name: 'COLLAPSE', defaultValue: true, description: 'Check this to trigger the collapse 
job.') - booleanParam(name: 'FILTER_COUNTS_QC', defaultValue: true, description: 'Check this to trigger the QC job.') + booleanParam(name: 'QC_IMAGES', defaultValue: true, description: 'Check this to trigger the QC job.') booleanParam(name: 'JOIN_METADATA', defaultValue: true, description: 'Check this to trigger the join_metadata job.') booleanParam(name: 'REMOVE_DATA', defaultValue: false, description: 'Select if there is experimental data that needs to be removed before normalization. TODO: expand on this.') booleanParam(name: 'RUN_NORM', defaultValue: true, description: 'Run normalization module on data.') @@ -30,7 +30,7 @@ pipeline { string(name: 'COMMIT_ID', defaultValue: '', description: 'Specific commit ID to use (leave empty if using the latest commit in the branch or if already specified in the config file.)') // Metadata files used by sushi - string(name: 'SAMPLE_META', defaultValue: 'sample_meta.csv', description: 'File name of sample metadata within the BUILD_DIR directory.') + string(name: 'SAMPLE_META', defaultValue: 'sample_meta.csv', description: 'File name of sample metadata within the BUILD_DIR directory.\n can text be formated here?') string(name: 'CELL_SET_META', defaultValue: 'cell_set_meta.csv', description: 'Cell Set Metadata. Static cell_line_meta location: /data/vdb/prismSeq/cell_set_meta.csv') string(name: 'CELL_LINE_META', defaultValue: 'cell_line_meta.csv', description: 'File in BUILD_DIR containing cell line metadata') string(name: 'CONTROL_BARCODE_META', defaultValue: 'CB_meta.csv', description: 'Metadata for control barcodes.') @@ -40,11 +40,11 @@ pipeline { string(name: 'RAW_COUNTS_UNCOLLAPSED', defaultValue: 'raw_counts_uncollapsed.csv', description: 'Filename in BUILD_DIR containing nori output') string(name: 'PRISM_BARCODE_COUNTS', defaultValue: 'prism_barcode_counts.csv', description: 'Filename in BUILD_DIR containing PRISM barcode counts') string(name: 'UNKNOWN_BARCODE_COUNTS', defaultValue: 'unknown_barcode_counts.csv', description: 'Filename in BUILD_DIR containing unknown barcode counts') - string(name: 'ANNOTATED_COUNTS', defaultValue: 'annotated_counts.csv', description: 'File in BUILD_DIR containing annotated counts') - string(name: 'FILTERED_COUNTS', defaultValue: 'filtered_counts.csv', description: 'File in BUILD_DIR containing filtered counts') - string(name: 'NORMALIZED_COUNTS', defaultValue: 'normalized_counts.csv', description: 'File in BUILD_DIR containing normalized counts') - string(name: 'LFC', defaultValue: 'l2fc.csv', description: 'File containing log2 fold change values') - string(name: 'COLLAPSED_LFC', defaultValue: 'collapsed_l2fc.csv', description: 'File in BUILD_DIR containing replicate collapsed l2fc values') + string(name: 'ANNOTATED_COUNTS', defaultValue: 'annotated_counts.csv', description: 'Filename in BUILD_DIR containing annotated counts') + string(name: 'FILTERED_COUNTS', defaultValue: 'filtered_counts.csv', description: 'Filename in BUILD_DIR containing filtered counts') + string(name: 'NORMALIZED_COUNTS', defaultValue: 'normalized_counts.csv', description: 'Filename in BUILD_DIR containing normalized counts') + string(name: 'LFC', defaultValue: 'l2fc.csv', description: 'Filename containing log2 fold change values') + string(name: 'COLLAPSED_LFC', defaultValue: 'collapsed_l2fc.csv', description: 'Filename in BUILD_DIR containing replicate collapsed l2fc values') // Column names parameters string(name: 'SEQUENCING_INDEX_COLS', defaultValue: 'flowcell_names,index_1,index_2', description: 'Sequencing index columns used in 
COLLATE_FASTQ_READS') @@ -58,7 +58,7 @@ pipeline { string(name: 'PSEUDOCOUNT', defaultValue: '20', description: 'In CBNORMALIZE, the pesudocount value for log transformations.') string(name: 'COUNT_COL_NAME', defaultValue: 'normalized_n', description: 'In COMPUTE_LFC, the name of the numeric column to use for calculations') string(name: 'CTL_TYPES', defaultValue: 'negcon', description: 'In COMPUTE_LFC, the value in trt_type that indicates the negative controls') - string(name: 'COUNT_THRESHOLD', defaultValue: '40', description: 'In FILTER_COUNTS_QC, the threshold for calling reads with low counts') + string(name: 'COUNT_THRESHOLD', defaultValue: '40', description: 'In QC_IMAGES, the threshold for calling reads with low counts') string(name: 'API_URL', defaultValue: 'https://api.clue.io/api/', description: 'API URL') } @@ -219,7 +219,7 @@ pipeline { if (params.COLLAPSE) { scriptsToRun.add('collapse_replicates.sh') } - if (params.FILTER_COUNTS_QC) { + if (params.QC_IMAGES) { scriptsToRun.add('filteredCounts_QC.sh') } if (params.JOIN_METADATA) { From 9291889f7eba96cc1dd95de7a24967806f0e19a9 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Tue, 8 Oct 2024 17:06:49 -0400 Subject: [PATCH 100/127] Moved most common params up --- scripts/make_config_file.groovy | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/scripts/make_config_file.groovy b/scripts/make_config_file.groovy index f122fc61..529d7861 100644 --- a/scripts/make_config_file.groovy +++ b/scripts/make_config_file.groovy @@ -29,8 +29,16 @@ pipeline { booleanParam(name: 'USE_LATEST', defaultValue: true, description: 'Check this to use the most up to date version from the specified branch. If not checked, will use the specified commit.') string(name: 'COMMIT_ID', defaultValue: '', description: 'Specific commit ID to use (leave empty if using the latest commit in the branch or if already specified in the config file.)') + // Most common parameters + // Column names parameters + string(name: 'SEQUENCING_INDEX_COLS', defaultValue: 'flowcell_names,index_1,index_2', description: 'Sequencing index columns used in COLLATE_FASTQ_READS') + string(name: 'ID_COLS', defaultValue: 'pcr_plate,pcr_well', description: 'Columns to concat to create unique ID for each sample-replicate') + string(name: 'CELL_LINE_COLS', defaultValue: 'DepMap_ID', description: 'Columns in intermediate files that describe a read or cell line') + string(name: 'SIG_COLS', defaultValue: 'cell_set,treatment,dose,dose_unit,day', description: 'Signature columns') + string(name: 'CONTROL_COLS', defaultValue: 'cell_set,day', description: 'Set of columns that define individual controls in COMPUTE_LFC') + // Metadata files used by sushi - string(name: 'SAMPLE_META', defaultValue: 'sample_meta.csv', description: 'File name of sample metadata within the BUILD_DIR directory.\n can text be formated here?') + string(name: 'SAMPLE_META', defaultValue: 'sample_meta.csv', description: 'File name of sample metadata within the BUILD_DIR directory.') string(name: 'CELL_SET_META', defaultValue: 'cell_set_meta.csv', description: 'Cell Set Metadata. 
Static cell_line_meta location: /data/vdb/prismSeq/cell_set_meta.csv') string(name: 'CELL_LINE_META', defaultValue: 'cell_line_meta.csv', description: 'File in BUILD_DIR containing cell line metadata') string(name: 'CONTROL_BARCODE_META', defaultValue: 'CB_meta.csv', description: 'Metadata for control barcodes.') @@ -46,19 +54,12 @@ pipeline { string(name: 'LFC', defaultValue: 'l2fc.csv', description: 'Filename containing log2 fold change values') string(name: 'COLLAPSED_LFC', defaultValue: 'collapsed_l2fc.csv', description: 'Filename in BUILD_DIR containing replicate collapsed l2fc values') - // Column names parameters - string(name: 'SEQUENCING_INDEX_COLS', defaultValue: 'flowcell_names,index_1,index_2', description: 'Sequencing index columns used in COLLATE_FASTQ_READS') - string(name: 'ID_COLS', defaultValue: 'pcr_plate,pcr_well', description: 'Columns to concat to create unique ID for each sample-replicate') - string(name: 'CELL_LINE_COLS', defaultValue: 'DepMap_ID', description: 'Columns in intermediate files that describe a read or cell line') - string(name: 'SIG_COLS', defaultValue: 'cell_set,treatment,dose,dose_unit,day', description: 'Signature columns') - string(name: 'CONTROL_COLS', defaultValue: 'cell_set,day', description: 'Set of columns that define individual controls in COMPUTE_LFC') - // Additional parameters string(name: 'BARCODE_COL', defaultValue: 'forward_read_cl_barcode', description: 'In COLLATE_FASTQ_READS, the name of the column containing the read') string(name: 'PSEUDOCOUNT', defaultValue: '20', description: 'In CBNORMALIZE, the pesudocount value for log transformations.') string(name: 'COUNT_COL_NAME', defaultValue: 'normalized_n', description: 'In COMPUTE_LFC, the name of the numeric column to use for calculations') string(name: 'CTL_TYPES', defaultValue: 'negcon', description: 'In COMPUTE_LFC, the value in trt_type that indicates the negative controls') - string(name: 'COUNT_THRESHOLD', defaultValue: '40', description: 'In QC_IMAGES, the threshold for calling reads with low counts') + string(name: 'COUNT_THRESHOLD', defaultValue: '40', description: 'Drops cell lines below this threshold in the negative controls') string(name: 'API_URL', defaultValue: 'https://api.clue.io/api/', description: 'API URL') } From aeeb837060fe3b98765ddec2530a81856e2c81fd Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 9 Oct 2024 09:39:16 -0400 Subject: [PATCH 101/127] Updated unknown barcode identification Added a low_abundance_threshold filter to rename reads --- scripts/src/collate_fastq_reads.R | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index 1aba70a9..aeb3aff6 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -52,7 +52,8 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, id_cols= c('pcr_plate', 'pcr_well'), known_barcodes, reverse_index2= FALSE, - barcode_col= 'forward_read_cl_barcode') { + barcode_col= 'forward_read_cl_barcode', + low_abundance_threshold= 20) { require(tidyverse) require(data.table) @@ -165,11 +166,14 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # Performing inner join with data.table instead of dplyr summed_reads= data.table::merge.data.table(uncollapsed_raw_counts, sequencing_map, by= sequencing_index_cols) # Code below is checking if a barcode is in the list of known barcodes. 
- # If the barcode is not in the list of known barcodes, then the barcode is replaced with the string "unknown_reads". - # Function := performs the mutate inplace without copying the dataframe. + # If the barcode is not in the list of known barcodes and its counts is below the low_abundance_threshold, + # then the barcode is replaced with the string "unknown_low_abundance_barcode". + # Function := performs the mutate in place without copying the dataframe. # Functions fifelse and %chin% are just faster data.table versions of ifelse and %in%. - summed_reads[, c(barcode_col) := data.table::fifelse(get(barcode_col) %chin% unique(known_barcodes), - get(barcode_col), 'unknown_reads')] + summed_reads[, c(barcode_col) := data.table::fifelse( + !get(barcode_col) %chin% unique(known_barcodes) & n < low_abundance_threshold, + 'unknown_low_abundance_barcode', get(barcode_col))] + # Use data.table to group by id_cols and barcode_col and sum up reads across flowcells. summed_reads= summed_reads[, .(n= sum(n)), by= c(id_cols, barcode_col)] @@ -187,6 +191,6 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # Return list of two dfs with known or unknown read counts ---- print('Completing collate_fastq_reads.') - return(list(prism_barcode_counts= summed_reads[summed_reads[[barcode_col]] != 'unknown_reads',], - unknown_barcode_counts= summed_reads[summed_reads[[barcode_col]] == 'unknown_reads',])) + return(list(prism_barcode_counts= summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,], + unknown_barcode_counts= summed_reads[!summed_reads[[barcode_col]] %chin% known_barcodes,])) } From 4e8f6773291ee7c087220e09774fc82703bedcbd Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 9 Oct 2024 09:50:39 -0400 Subject: [PATCH 102/127] Added back unknown reads summary --- scripts/src/QC_images.R | 73 ++++++++++++++++++++++++++++++----------- 1 file changed, 54 insertions(+), 19 deletions(-) diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R index 5b16b2c7..a45b911f 100755 --- a/scripts/src/QC_images.R +++ b/scripts/src/QC_images.R @@ -470,6 +470,7 @@ QC_images= function(raw_counts_uncollapsed_path, prism_barcode_counts, unknown_barcode_counts, annotated_counts, normalized_counts= NA, l2fc, sample_meta, + barcode_col= 'forward_read_cl_barcode', id_cols= c('pcr_plate', 'pcr_well'), cell_line_cols= c('DepMap_ID'), sig_cols, @@ -490,7 +491,7 @@ QC_images= function(raw_counts_uncollapsed_path, if(is.na(out)) {out= getwd()} # Create empty vector to collect potential errors when running QCs - skipped_qcs= c() + skipped_qcs= c() # Count number of distinct profile to help scale some plots. num_profiles= annotated_counts %>% dplyr::distinct(pick(all_of(id_cols))) %>% nrow() @@ -580,9 +581,43 @@ QC_images= function(raw_counts_uncollapsed_path, skipped_qcs = c(skipped_qcs, potential_error) } + ## 4. Unknown barcodes ---- + print('4. 
Generating table of unknown barcode reads ...')
+  potential_error= base::tryCatch({
+    unknown_totals= unknown_barcode_counts[, .(well_total= sum(n)), by= id_cols]
+    prism_totals= prism_barcode_counts[, .(well_total= sum(n)), by= id_cols]
+    well_totals= data.table::rbindlist(list(unknown_totals, prism_totals))[, .(well_total= sum(well_total)), by= id_cols]
+
+    unknown_barcodes= unknown_barcode_counts %>%
+      dplyr::filter(.data[[barcode_col]] != 'unknown_low_abundance_barcode') %>%
+      dplyr::left_join(well_totals, by= id_cols) %>%
+      dplyr::mutate(read_percent= n / well_total) %>%
+      dplyr::group_by(pick(all_of(barcode_col))) %>%
+      dplyr::summarise(total= sum(n),
+                       median_read= median(n),
+                       median_percent= median(read_percent),
+                       max_read= max(n),
+                       max_percent= max(read_percent),
+                       num_wells= dplyr::n()) %>% dplyr::ungroup() %>%
+      dplyr::arrange(dplyr::desc(median_percent))
+
+    unknown_barcodes %>% write.csv(file= paste(out, 'unknown_barcodes_summary.csv', sep= '/'),
+                                   row.names= FALSE, quote= FALSE)
+  }, error= function(e) {
+    print(e)
+    print('Encountered an error when creating the unknown barcode summary table. Skipping this output ...')
+    return('Unknown barcodes table')
+  })
+
+  # Collect returned string if an error occurred
+  if(!is.null(potential_error)) {
+    skipped_qcs = c(skipped_qcs, potential_error)
+  }
+  #
+
   # Assay QCs _________________________ ----
-  ## 4. Cell lines recovered ----
-  print('4. Generating cell_lines_present image ...')
+  ## 5. Cell lines recovered ----
+  print('5. Generating cell_lines_present image ...')
   potential_error= base::tryCatch({
     cl_rec= create_recovery_barplot(filtered_counts, id_cols= id_cols, facet_col= 'pcr_plate',
                                     count_threshold= count_threshold, plot_type= 'percent')
@@ -603,8 +638,8 @@
     skipped_qcs = c(skipped_qcs, potential_error)
   }
 
-  ## 5. Cell line contaminants ----
-  print('5. Generating cell line contaminants ...')
+  ## 6. Cell line contaminants ----
+  print('6. Generating cell line contaminants ...')
   potential_error= base::tryCatch({
     contams= annotated_counts %>% dplyr::filter(expected_read == FALSE) %>%
       dplyr::mutate(barcode_id= ifelse(is.na(CCLE_name), Name, CCLE_name)) %>%
@@ -625,8 +660,8 @@
     skipped_qcs = c(skipped_qcs, potential_error)
   }
 
-  ## 6. Cumulative counts by lines in negcons ----
-  print('6. Generating cumulative image ...')
+  ## 7. Cumulative counts by lines in negcons ----
+  print('7. Generating cumulative image ...')
   potential_error= base::tryCatch({
     cdf_plot= create_cdf_plot(filtered_counts %>% dplyr::filter(trt_type == control_type),
                               id_cols= id_cols,
@@ -651,9 +686,9 @@
     skipped_qcs= c(skipped_qcs, potential_error)
   }
 
-  ## 7. Control barcode trends ----
+  ## 8. Control barcode trends ----
   if(contains_cbs & is.data.frame(normalized_counts)) {
-    print('7. Generating control_barcode_trend image')
+    print('8. Generating control_barcode_trend image')
     potential_error= base::tryCatch({
       trend_sc= create_ctrlBC_scatterplots(normalized_counts %>% dplyr::filter(control_barcodes %in% c("Y", "T", T)),
                                            id_cols, value_col= 'log2_n')
@@ -674,11 +709,11 @@
       skipped_qcs= c(skipped_qcs, potential_error)
     }
   } else {
-    print('7. No control barcodes detected. Skipping control_barcode_trend image.')
+    print('8. No control barcodes detected. Skipping control_barcode_trend image.')
   }
 
-  ## 8. Sample correlation -----
-  print('8. Generating sample_cor image ...')
+  ## 9.
Sample correlation ----- + print('9. Generating sample_cor image ...') potential_error= base::tryCatch({ cor_df= filtered_counts %>% dplyr::filter(!is.na(DepMap_ID), !is.na(trt_type), !trt_type %in% c('empty', '', 'CB_only')) %>% @@ -704,13 +739,13 @@ QC_images= function(raw_counts_uncollapsed_path, skipped_qcs= c(skipped_qcs, potential_error) } - ## 9. Tech rep correlations ---- + ## 10. Tech rep correlations ---- if(is.data.frame(normalized_counts) & 'tech_rep' %in% colnames(normalized_counts)) { # Check if there are more at least two tech reps unique_tech_reps= na.omit(unique(normalized_counts$tech_rep)) if(length(unique_tech_reps) >= 2) { - print('9. Generating tech rep correlations image ...') + print('10. Generating tech rep correlations image ...') # Set up replicate groups depending "bio_rep" column if('bio_rep' %in% colnames(normalized_counts) & !'bio_rep' %in% sig_cols) { replicate_group_cols= c(sig_cols, 'bio_rep') @@ -752,13 +787,13 @@ QC_images= function(raw_counts_uncollapsed_path, } } else { - print('9. No technical replicates detected. Skipping tech_reps scatter plot.') + print('10. No technical replicates detected. Skipping tech_reps scatter plot.') } } else { - print('9. No technical replicates detected. Skipping tech_reps scatter plot.') + print('10. No technical replicates detected. Skipping tech_reps scatter plot.') } - ## 10. Bio rep correlations ---- + ## 11. Bio rep correlations ---- if('bio_rep' %in% colnames(l2fc)) { unique_bio_reps= na.omit(unique(l2fc$bio_rep)) @@ -782,7 +817,7 @@ QC_images= function(raw_counts_uncollapsed_path, # } # Bio replicate heatmap - print('10. Generating bio rep correlations heatmap ...') + print('11. Generating bio rep correlations heatmap ...') potential_error= base::tryCatch({ bio_corr_hm= create_cor_heatmap(input_df= l2fc_with_log2, row_id_cols= cell_line_cols, @@ -805,7 +840,7 @@ QC_images= function(raw_counts_uncollapsed_path, } } else { - print('10. No biological replicates detected. Skipping bio_rep heatmap.') + print('11. No biological replicates detected. Skipping bio_rep heatmap.') } } From 8ef52039b58c0632054dc9764fdf3c983ca33831 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 9 Oct 2024 11:05:46 -0400 Subject: [PATCH 103/127] Updated scripts for new params --- scripts/collate_fastq_reads.R | 5 ++++- scripts/collate_fastq_reads.sh | 1 + scripts/filter_counts.R | 1 - scripts/filteredCounts_QC.R | 3 +++ scripts/filteredCounts_QC.sh | 1 + scripts/launch_job.sh | 2 +- scripts/make_config_file.groovy | 3 ++- 7 files changed, 12 insertions(+), 4 deletions(-) diff --git a/scripts/collate_fastq_reads.R b/scripts/collate_fastq_reads.R index 89421e57..5dce7d7e 100644 --- a/scripts/collate_fastq_reads.R +++ b/scripts/collate_fastq_reads.R @@ -24,6 +24,8 @@ parser$add_argument("--reverse_index2", type="logical", default=FALSE, help= "Reverse complement of index 2 for NovaSeq and NextSeq") parser$add_argument("--barcode_col", default= "forward_read_cl_barcode", help= "Name of the column in uncollapsed_raw_counts that contains the barcode sequences.") +parser$add_argument('--low_abundance_threshold', default= 20, + help= 'For unknown barcodes, counts below this threshold will be marked as an unknown barcode.') parser$add_argument("-o", "--out", default=getwd(), help = "Output path. 
Default is working directory") # get command line options, if help option encountered print help and exit @@ -65,7 +67,8 @@ chunked_results= process_in_chunks(large_file_path= args$raw_counts_uncollapsed, id_cols= id_cols, known_barcodes= unique(c(cell_line_meta$Sequence, CB_meta$Sequence)), reverse_index2= args$reverse_index2, - barcode_col= args$barcode_col) + barcode_col= args$barcode_col, + low_abundance_threshold= as.numeric(args$low_abundance_threshold)) # From each chunk, extract prism_barcode_counts and bind the rows together into one dataframe. prism_barcode_counts= data.table::rbindlist(lapply(chunked_results, function(x) x$prism_barcode_counts)) diff --git a/scripts/collate_fastq_reads.sh b/scripts/collate_fastq_reads.sh index 6acea4ea..358444e0 100644 --- a/scripts/collate_fastq_reads.sh +++ b/scripts/collate_fastq_reads.sh @@ -114,6 +114,7 @@ args=( --id_cols "$ID_COLS" --reverse_index2 "$REVERSE_INDEX2" --barcode_col "$BARCODE_COL" +--low_abundance_threshold "$LOW_ABUNDANCE_THRESHOLD" --out "$BUILD_DIR" ) diff --git a/scripts/filter_counts.R b/scripts/filter_counts.R index 7cd0326f..15ed449f 100755 --- a/scripts/filter_counts.R +++ b/scripts/filter_counts.R @@ -48,7 +48,6 @@ CB_meta= data.table::fread(args$CB_meta, header= TRUE, sep= ',') # Convert input strings into vectors ---- id_cols= unlist(strsplit(args$id_cols, ",")) -# What is this check doing? -YL ---- # make sure LUA codes in cell line meta are unique cell_line_meta %<>% dplyr::group_by(LUA) %>% diff --git a/scripts/filteredCounts_QC.R b/scripts/filteredCounts_QC.R index 1287745b..52a9efa0 100755 --- a/scripts/filteredCounts_QC.R +++ b/scripts/filteredCounts_QC.R @@ -28,6 +28,8 @@ parser$add_argument('--annotated_counts', default= 'annotated_counts.csv', help= parser$add_argument('--normalized_counts', default= 'normalized_counts.csv', help= 'Path to normalized_counts.csv') parser$add_argument('--lfc', default= 'l2fc.csv', help= 'Path to l2fc.csv') parser$add_argument('-s', '--sample_meta', default= 'sample_meta.csv', help= 'Path to sample_meta.csv') +parser$add_argument("--barcode_col", default= "forward_read_cl_barcode", + help= "Name of the column in uncollapsed_raw_counts that contains the barcode sequences.") parser$add_argument('--id_cols', default= 'pcr_plate,pcr_well', help= 'Sample meta columns used to identify every PCR well') parser$add_argument('--cell_line_cols', default= 'DepMap_ID', help= 'Sushi columns used to identify a read') parser$add_argument('--sig_cols', default= 'cell_set,treatment,dose,dose_unit,day', @@ -73,6 +75,7 @@ QC_images(raw_counts_uncollapsed_path= args$raw_counts_uncollapsed, l2fc= l2fc, sample_meta= sample_meta, id_cols= id_cols, + barcode_col= args$barcode_col, cell_line_cols= cell_line_cols, sig_cols= sig_cols, control_type= args$control_type, diff --git a/scripts/filteredCounts_QC.sh b/scripts/filteredCounts_QC.sh index bc826a64..6a013270 100644 --- a/scripts/filteredCounts_QC.sh +++ b/scripts/filteredCounts_QC.sh @@ -128,6 +128,7 @@ args=( --unknown_barcode_counts "$UNKNOWN_BARCODE_COUNTS" --lfc "$LFC" --id_cols "$ID_COLS" +--barcode_col "$BARCODE_COL" --reverse_index2 "$REVERSE_INDEX2" ) diff --git a/scripts/launch_job.sh b/scripts/launch_job.sh index 1ed0d9d9..48ba689f 100644 --- a/scripts/launch_job.sh +++ b/scripts/launch_job.sh @@ -33,7 +33,7 @@ PARAMS=( # column name paramters SEQUENCING_INDEX_COLS ID_COLS CELL_LINE_COLS SIG_COLS CONTROL_COLS # additional parameters - BARCODE_COL PSEUDOCOUNT COUNT_COL_NAME CTL_TYPES COUNT_THRESHOLD API_URL + BARCODE_COL 
LOW_ABUNDANCE_THRESHOLD PSEUDOCOUNT COUNT_COL_NAME CTL_TYPES COUNT_THRESHOLD API_URL ) # Load parameters diff --git a/scripts/make_config_file.groovy b/scripts/make_config_file.groovy index 529d7861..59b6dfa7 100644 --- a/scripts/make_config_file.groovy +++ b/scripts/make_config_file.groovy @@ -56,6 +56,7 @@ pipeline { // Additional parameters string(name: 'BARCODE_COL', defaultValue: 'forward_read_cl_barcode', description: 'In COLLATE_FASTQ_READS, the name of the column containing the read') + string(name: 'LOW_ABUNDANCE_THRESHOLD', defaultValue: '20', description: 'In COLLATE_FASTQ_READS, threshold for unknown barcodes') string(name: 'PSEUDOCOUNT', defaultValue: '20', description: 'In CBNORMALIZE, the pesudocount value for log transformations.') string(name: 'COUNT_COL_NAME', defaultValue: 'normalized_n', description: 'In COMPUTE_LFC, the name of the numeric column to use for calculations') string(name: 'CTL_TYPES', defaultValue: 'negcon', description: 'In COMPUTE_LFC, the value in trt_type that indicates the negative controls') @@ -127,7 +128,7 @@ pipeline { 'SEQUENCING_INDEX_COLS', 'ID_COLS', 'CELL_LINE_COLS', 'SIG_COLS', 'CONTROL_COLS', // additional parameters - 'BARCODE_COL', 'PSEUDOCOUNT', 'COUNT_COL_NAME', 'CTL_TYPES', 'COUNT_THRESHOLD', 'API_URL' + 'BARCODE_COL', 'LOW_ABUNDANCE_THRESHOLD', 'PSEUDOCOUNT', 'COUNT_COL_NAME', 'CTL_TYPES', 'COUNT_THRESHOLD', 'API_URL' ] def config = [:] From ae8f4e54139b894df2c601cf0262d8e74a1b781b Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 9 Oct 2024 11:53:49 -0400 Subject: [PATCH 104/127] for troubleshooting --- scripts/collate_fastq_reads.R | 7 ++++++- scripts/src/collate_fastq_reads.R | 3 +++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/scripts/collate_fastq_reads.R b/scripts/collate_fastq_reads.R index 5dce7d7e..01ecc129 100644 --- a/scripts/collate_fastq_reads.R +++ b/scripts/collate_fastq_reads.R @@ -83,7 +83,12 @@ unknown_barcode_counts= unknown_barcode_counts[, .(n= sum(n)), by= c(id_cols, ar # Validation: Basic file size check ---- if(nrow(prism_barcode_counts) == 0) { stop('ERROR: Empty file generated. 
No rows in prism_barcode_counts output.') -} +} + +# Trouble shooting ---- +nrow(prism_barcode_counts) +nrow(unknown_barcode_counts) +# # Write out files ---- out_file= paste(args$out, 'prism_barcode_counts.csv', sep='/') diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index aeb3aff6..c589ec1a 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -189,6 +189,9 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, print('Warning: Low index purity!') } else {} + print(nrow(summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,])) + print(nrow(summed_reads[!summed_reads[[barcode_col]] %chin% known_barcodes,])) + # Return list of two dfs with known or unknown read counts ---- print('Completing collate_fastq_reads.') return(list(prism_barcode_counts= summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,], From a4ac5b2e18b8261bd2819ef45c7b31d332e33799 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 9 Oct 2024 11:59:22 -0400 Subject: [PATCH 105/127] Jenkins troubleshooting --- scripts/collate_fastq_reads.R | 5 ----- scripts/src/collate_fastq_reads.R | 7 ++++--- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/scripts/collate_fastq_reads.R b/scripts/collate_fastq_reads.R index 01ecc129..782f3cbd 100644 --- a/scripts/collate_fastq_reads.R +++ b/scripts/collate_fastq_reads.R @@ -85,11 +85,6 @@ if(nrow(prism_barcode_counts) == 0) { stop('ERROR: Empty file generated. No rows in prism_barcode_counts output.') } -# Trouble shooting ---- -nrow(prism_barcode_counts) -nrow(unknown_barcode_counts) -# - # Write out files ---- out_file= paste(args$out, 'prism_barcode_counts.csv', sep='/') print(paste("Writing prism_barcode_counts.csv to ", out_file)) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index c589ec1a..36ea45ee 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -171,7 +171,7 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # Function := performs the mutate in place without copying the dataframe. # Functions fifelse and %chin% are just faster data.table versions of ifelse and %in%. summed_reads[, c(barcode_col) := data.table::fifelse( - !get(barcode_col) %chin% unique(known_barcodes) & n < low_abundance_threshold, + !(get(barcode_col) %chin% unique(known_barcodes)) & (n < low_abundance_threshold), 'unknown_low_abundance_barcode', get(barcode_col))] # Use data.table to group by id_cols and barcode_col and sum up reads across flowcells. 
@@ -189,11 +189,12 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, print('Warning: Low index purity!') } else {} + # troubleshooting print(nrow(summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,])) - print(nrow(summed_reads[!summed_reads[[barcode_col]] %chin% known_barcodes,])) + print(nrow(summed_reads[!(summed_reads[[barcode_col]] %chin% known_barcodes),])) # Return list of two dfs with known or unknown read counts ---- print('Completing collate_fastq_reads.') return(list(prism_barcode_counts= summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,], - unknown_barcode_counts= summed_reads[!summed_reads[[barcode_col]] %chin% known_barcodes,])) + unknown_barcode_counts= summed_reads[!(summed_reads[[barcode_col]] %chin% known_barcodes),])) } From f062978b992741d2b4e3b83bb7a47d479877e56a Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 9 Oct 2024 12:07:30 -0400 Subject: [PATCH 106/127] jenkins troubleshoot --- scripts/src/collate_fastq_reads.R | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index 36ea45ee..b2be11bd 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -171,7 +171,7 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # Function := performs the mutate in place without copying the dataframe. # Functions fifelse and %chin% are just faster data.table versions of ifelse and %in%. summed_reads[, c(barcode_col) := data.table::fifelse( - !(get(barcode_col) %chin% unique(known_barcodes)) & (n < low_abundance_threshold), + !get(barcode_col) %chin% unique(known_barcodes) & n < low_abundance_threshold, 'unknown_low_abundance_barcode', get(barcode_col))] # Use data.table to group by id_cols and barcode_col and sum up reads across flowcells. 
@@ -189,12 +189,13 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, print('Warning: Low index purity!') } else {} - # troubleshooting + # troubleshooting ---- + head(summed_reads) print(nrow(summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,])) print(nrow(summed_reads[!(summed_reads[[barcode_col]] %chin% known_barcodes),])) # Return list of two dfs with known or unknown read counts ---- print('Completing collate_fastq_reads.') return(list(prism_barcode_counts= summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,], - unknown_barcode_counts= summed_reads[!(summed_reads[[barcode_col]] %chin% known_barcodes),])) + unknown_barcode_counts= summed_reads[!summed_reads[[barcode_col]] %chin% known_barcodes,])) } From 5923da0f3bf091b887f34b5995abf71a2c177073 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 9 Oct 2024 12:08:12 -0400 Subject: [PATCH 107/127] more jenkins troubleshoot --- scripts/src/collate_fastq_reads.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index b2be11bd..61b5b95d 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -190,7 +190,7 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, } else {} # troubleshooting ---- - head(summed_reads) + print(head(summed_reads)) print(nrow(summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,])) print(nrow(summed_reads[!(summed_reads[[barcode_col]] %chin% known_barcodes),])) From 3480402dbbe2774aab0dc8f6ea6ec1bacfe6cfad Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 9 Oct 2024 12:12:20 -0400 Subject: [PATCH 108/127] jenkin troubleshoot --- scripts/src/collate_fastq_reads.R | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index 61b5b95d..6fc5df95 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -170,9 +170,16 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # then the barcode is replaced with the string "unknown_low_abundance_barcode". # Function := performs the mutate in place without copying the dataframe. # Functions fifelse and %chin% are just faster data.table versions of ifelse and %in%. - summed_reads[, c(barcode_col) := data.table::fifelse( - !get(barcode_col) %chin% unique(known_barcodes) & n < low_abundance_threshold, - 'unknown_low_abundance_barcode', get(barcode_col))] + + # works? + summed_reads[, c(barcode_col) := data.table::fifelse(get(barcode_col) %chin% unique(known_barcodes) | + n >= low_abundance_threshold, + get(barcode_col), 'unknown_low_abundance_barcode')] + + # not working + # summed_reads[, c(barcode_col) := data.table::fifelse( + # !get(barcode_col) %chin% unique(known_barcodes) & n < low_abundance_threshold, + # 'unknown_low_abundance_barcode', get(barcode_col))] # Use data.table to group by id_cols and barcode_col and sum up reads across flowcells. 
summed_reads= summed_reads[, .(n= sum(n)), by= c(id_cols, barcode_col)] From f8972d7db84da04381a279f7169f093006cb57a7 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 9 Oct 2024 12:13:23 -0400 Subject: [PATCH 109/127] jenkins troubleshoot --- scripts/src/collate_fastq_reads.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index 6fc5df95..d3b53e72 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -172,8 +172,7 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # Functions fifelse and %chin% are just faster data.table versions of ifelse and %in%. # works? - summed_reads[, c(barcode_col) := data.table::fifelse(get(barcode_col) %chin% unique(known_barcodes) | - n >= low_abundance_threshold, + summed_reads[, c(barcode_col) := data.table::fifelse(get(barcode_col) %chin% unique(known_barcodes), get(barcode_col), 'unknown_low_abundance_barcode')] # not working From bd20c4f9103f387a71cc878555b5717ce49a5324 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 9 Oct 2024 13:28:00 -0400 Subject: [PATCH 110/127] jenkins troubleshoot --- scripts/src/collate_fastq_reads.R | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index d3b53e72..f59e2ed8 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -172,12 +172,15 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # Functions fifelse and %chin% are just faster data.table versions of ifelse and %in%. # works? - summed_reads[, c(barcode_col) := data.table::fifelse(get(barcode_col) %chin% unique(known_barcodes), + # summed_reads[, c(barcode_col) := data.table::fifelse(get(barcode_col) %chin% unique(known_barcodes), + # get(barcode_col), 'unknown_low_abundance_barcode')] + summed_reads[, c(barcode_col) := data.table::fifelse(get(barcode_col) %chin% unique(known_barcodes) | + n > low_abundance_threshold, get(barcode_col), 'unknown_low_abundance_barcode')] # not working # summed_reads[, c(barcode_col) := data.table::fifelse( - # !get(barcode_col) %chin% unique(known_barcodes) & n < low_abundance_threshold, + # !get(barcode_col) %chin% unique(known_barcodes) & n < low_abundance_threshold, # 'unknown_low_abundance_barcode', get(barcode_col))] # Use data.table to group by id_cols and barcode_col and sum up reads across flowcells. From 297ab1b3ceaa3fc054c4c8c16d9ca9630dd687de Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 9 Oct 2024 13:51:00 -0400 Subject: [PATCH 111/127] Jenkins test --- scripts/src/collate_fastq_reads.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index f59e2ed8..aaa7bc52 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -174,8 +174,8 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # works? 
# summed_reads[, c(barcode_col) := data.table::fifelse(get(barcode_col) %chin% unique(known_barcodes), # get(barcode_col), 'unknown_low_abundance_barcode')] - summed_reads[, c(barcode_col) := data.table::fifelse(get(barcode_col) %chin% unique(known_barcodes) | - n > low_abundance_threshold, + summed_reads[, c(barcode_col) := ifelse(get(barcode_col) %chin% unique(known_barcodes) | + n >= low_abundance_threshold, get(barcode_col), 'unknown_low_abundance_barcode')] # not working From 3e1651c69b1b21a4ad905b56602eb244242d300b Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 9 Oct 2024 14:20:08 -0400 Subject: [PATCH 112/127] Potential jenkins fix --- scripts/src/collate_fastq_reads.R | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index aaa7bc52..0cebb234 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -172,16 +172,20 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # Functions fifelse and %chin% are just faster data.table versions of ifelse and %in%. # works? - # summed_reads[, c(barcode_col) := data.table::fifelse(get(barcode_col) %chin% unique(known_barcodes), - # get(barcode_col), 'unknown_low_abundance_barcode')] - summed_reads[, c(barcode_col) := ifelse(get(barcode_col) %chin% unique(known_barcodes) | - n >= low_abundance_threshold, - get(barcode_col), 'unknown_low_abundance_barcode')] - - # not working - # summed_reads[, c(barcode_col) := data.table::fifelse( - # !get(barcode_col) %chin% unique(known_barcodes) & n < low_abundance_threshold, - # 'unknown_low_abundance_barcode', get(barcode_col))] + # summed_reads[, c(barcode_col) := ifelse(get(barcode_col) %chin% unique(known_barcodes), + # get(barcode_col), 'unknown_low_abundance_barcode')] + + # two columns? + summed_reads[, temp := ifelse(get(barcode_col) %chin% unique(known_barcodes) | n >= low_abundance_threshold, + TRUE, FALSE)] + summed_reads[, c(barcode_col) := data.table::fifelse(temp, get(barcode_col), 'unknown_low_abundance_barcode')] + summed_reads[, temp := NULL] + + + # doesnt work + # summed_reads[, c(barcode_col) := ifelse(get(barcode_col) %chin% unique(known_barcodes) | + # n >= low_abundance_threshold, + # get(barcode_col), 'unknown_low_abundance_barcode')] # Use data.table to group by id_cols and barcode_col and sum up reads across flowcells. summed_reads= summed_reads[, .(n= sum(n)), by= c(id_cols, barcode_col)] From c7b7488f238fce2af52f89360648658d6c50b7a2 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 9 Oct 2024 14:26:27 -0400 Subject: [PATCH 113/127] Jenkins test --- scripts/src/collate_fastq_reads.R | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index 0cebb234..2edbe350 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -178,13 +178,14 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # two columns? 
summed_reads[, temp := ifelse(get(barcode_col) %chin% unique(known_barcodes) | n >= low_abundance_threshold,
                                 TRUE, FALSE)]
-  summed_reads[, c(barcode_col) := data.table::fifelse(temp, get(barcode_col), 'unknown_low_abundance_barcode')]
-  summed_reads[, temp := NULL]
-
+  #summed_reads[, c(barcode_col) := data.table::fifelse(temp, get(barcode_col), 'unknown_low_abundance_barcode')]
+  #summed_reads[, temp := NULL]
+  print(head(summed_reads))
 
-  # doesn't work
+  # This code is more efficient, but doesn't work in Jenkins; it only works locally.
+  # The problem appears after adding a second condition in the ifelse - not sure why this is happening.
   # summed_reads[, c(barcode_col) := ifelse(get(barcode_col) %chin% unique(known_barcodes) |
-  #                                  n >= low_abundance_threshold,
+  #                                  n >= low_abundance_threshold, 
   #                                  get(barcode_col), 'unknown_low_abundance_barcode')]
 
 # Use data.table to group by id_cols and barcode_col and sum up reads across flowcells.

From ee2b1e3d810f0baab7f0b7cfe0077973976cc58c Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Wed, 9 Oct 2024 14:34:58 -0400
Subject: [PATCH 114/127] Jenkins test

---
 scripts/src/collate_fastq_reads.R | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R
index 2edbe350..07d3eabf 100644
--- a/scripts/src/collate_fastq_reads.R
+++ b/scripts/src/collate_fastq_reads.R
@@ -176,7 +176,7 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta,
   #                                  get(barcode_col), 'unknown_low_abundance_barcode')]
 
   # two columns?
-  summed_reads[, temp := ifelse(get(barcode_col) %chin% unique(known_barcodes) | n >= low_abundance_threshold,
+  summed_reads[, temp := ifelse(get(barcode_col) %in% unique(known_barcodes) | n >= low_abundance_threshold,
                                 TRUE, FALSE)]
   #summed_reads[, c(barcode_col) := data.table::fifelse(temp, get(barcode_col), 'unknown_low_abundance_barcode')]
   #summed_reads[, temp := NULL]
@@ -203,11 +203,6 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta,
     print('Warning: Low index purity!')
   } else {}
 
-  # troubleshooting ----
-  print(head(summed_reads))
-  print(nrow(summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,]))
-  print(nrow(summed_reads[!(summed_reads[[barcode_col]] %chin% known_barcodes),]))
-
   # Return list of two dfs with known or unknown read counts ----
   print('Completing collate_fastq_reads.')
   return(list(prism_barcode_counts= summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,],

From 454a2356f7ad842ed8822ea1bce14991c1b70812 Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Wed, 9 Oct 2024 14:54:02 -0400
Subject: [PATCH 115/127] test jenkins

---
 scripts/src/collate_fastq_reads.R | 37 ++++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 13 deletions(-)

diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R
index 07d3eabf..456acdb3 100644
--- a/scripts/src/collate_fastq_reads.R
+++ b/scripts/src/collate_fastq_reads.R
@@ -171,25 +171,34 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta,
 
   # Function := performs the mutate in place without copying the dataframe.
   # Functions fifelse and %chin% are just faster data.table versions of ifelse and %in%.
+  # summed_reads[, c(barcode_col) := data.table::fifelse(get(barcode_col) %chin% unique(known_barcodes) |
+  #                                  n >= low_abundance_threshold,
+  #                                  get(barcode_col), 'unknown_low_abundance_barcode')]
+
+  # The code above is the initial implementation. It works locally, but not on Jenkins.
+ # The problem appears to occur when adding a second condition in the ifelse - not sure why this is happening. + # %chin% to %in% - error persists + # data.table::fifelse to base::ifelse - error persists + # Jenkins and local are working with the same version of data.table. + # works? # summed_reads[, c(barcode_col) := ifelse(get(barcode_col) %chin% unique(known_barcodes), # get(barcode_col), 'unknown_low_abundance_barcode')] - # two columns? - summed_reads[, temp := ifelse(get(barcode_col) %in% unique(known_barcodes) | n >= low_abundance_threshold, - TRUE, FALSE)] + # wasted + summed_reads= summed_reads[, .(n= sum(n)), by= c(id_cols, barcode_col)] + prism_barcode_counts= summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,] + + unknown_barcode_counts= summed_reads[!summed_reads[[barcode_col]] %chin% known_barcodes,] + unknown_barcode_counts[, c(barcode_col) := data.table::fifelse(n >= low_abundance_threshold, + get(barcode_col), 'unknown_low_abundance_barcode')] + unknown_barcode_counts= unknown_barcode_counts[, .(n= sum(n)), by= c(id_cols, barcode_col)] + #summed_reads[, c(barcode_col) := data.table::fifelse(temp, get(barcode_col), 'unknown_low_abundance_barcode')] #summed_reads[, temp := NULL] - print(head(summed_reads)) - - # This code is more efficient, but doesn't work in Jenkins only works locally - # The problem appears after adding a second condition in the ifelse - not sure why this is happening. - # summed_reads[, c(barcode_col) := ifelse(get(barcode_col) %chin% unique(known_barcodes) | - # n >= low_abundance_threshold, - # get(barcode_col), 'unknown_low_abundance_barcode')] # Use data.table to group by id_cols and barcode_col and sum up reads across flowcells. - summed_reads= summed_reads[, .(n= sum(n)), by= c(id_cols, barcode_col)] + #summed_reads= summed_reads[, .(n= sum(n)), by= c(id_cols, barcode_col)] # Calculate index purity ---- # This is only accurate if the Nori input file is small enough to fit into a chunk. @@ -205,6 +214,8 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # Return list of two dfs with known or unknown read counts ---- print('Completing collate_fastq_reads.') - return(list(prism_barcode_counts= summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,], - unknown_barcode_counts= summed_reads[!summed_reads[[barcode_col]] %chin% known_barcodes,])) + return(list(prism_barcode_counts= prism_barcode_counts, + unknown_barcode_counts= unknown_barcode_counts)) + #return(list(prism_barcode_counts= summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,], + # unknown_barcode_counts= summed_reads[!summed_reads[[barcode_col]] %chin% known_barcodes,])) } From b72e9e05cdc599b2db81da47c6fd7b27c2937b55 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 9 Oct 2024 14:59:13 -0400 Subject: [PATCH 116/127] jenkins test --- scripts/src/collate_fastq_reads.R | 33 +++++++++++++++---------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index 456acdb3..16ec2b98 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -171,9 +171,9 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # Function := performs the mutate in place without copying the dataframe. # Functions fifelse and %chin% are just faster data.table versions of ifelse and %in%. 
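+  # A hedged illustration of the `:=` + get() idiom used throughout this block
+  # (toy data; `dt` and `col` are made-up names for the example, not pipeline objects):
+  # dt= data.table::data.table(barcode= c('AAA', 'TTT'))
+  # col= 'barcode'
+  # dt[, c(col) := paste0(get(col), '_x')]   # updates the column named by `col` in place, no copy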
- # summed_reads[, c(barcode_col) := data.table::fifelse(get(barcode_col) %chin% unique(known_barcodes) | - # n >= low_abundance_threshold, - # get(barcode_col), 'unknown_low_abundance_barcode')] + summed_reads[, c(barcode_col) := data.table::fifelse(get(barcode_col) %chin% unique(known_barcodes) | + n >= 20, + get(barcode_col), 'unknown_low_abundance_barcode')] # The code above is the initial implementation. It works locally, but not on Jenkins. # The problem appears to occur when adding a second condition in the ifelse - not sure why this is happening. @@ -186,19 +186,18 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # get(barcode_col), 'unknown_low_abundance_barcode')] # wasted - summed_reads= summed_reads[, .(n= sum(n)), by= c(id_cols, barcode_col)] - prism_barcode_counts= summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,] - - unknown_barcode_counts= summed_reads[!summed_reads[[barcode_col]] %chin% known_barcodes,] - unknown_barcode_counts[, c(barcode_col) := data.table::fifelse(n >= low_abundance_threshold, - get(barcode_col), 'unknown_low_abundance_barcode')] - unknown_barcode_counts= unknown_barcode_counts[, .(n= sum(n)), by= c(id_cols, barcode_col)] + # print(low_abundance_threshold) + # summed_reads= summed_reads[, .(n= sum(n)), by= c(id_cols, barcode_col)] + # prism_barcode_counts= summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,] + # + # unknown_barcode_counts= summed_reads[!summed_reads[[barcode_col]] %chin% known_barcodes,] + # unknown_barcode_counts[, c(barcode_col) := data.table::fifelse(n >= low_abundance_threshold, + # get(barcode_col), 'unknown_low_abundance_barcode')] + # unknown_barcode_counts= unknown_barcode_counts[, .(n= sum(n)), by= c(id_cols, barcode_col)] - #summed_reads[, c(barcode_col) := data.table::fifelse(temp, get(barcode_col), 'unknown_low_abundance_barcode')] - #summed_reads[, temp := NULL] # Use data.table to group by id_cols and barcode_col and sum up reads across flowcells. - #summed_reads= summed_reads[, .(n= sum(n)), by= c(id_cols, barcode_col)] + summed_reads= summed_reads[, .(n= sum(n)), by= c(id_cols, barcode_col)] # Calculate index purity ---- # This is only accurate if the Nori input file is small enough to fit into a chunk. 
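+  # Hedged, toy illustration of the caveat above: if one in-memory chunk holds 1e6
+  # reads and 9.6e5 of them carry expected indices, the ratio computed here is
+  # 9.6e5 / 1e6 = 0.96 for that chunk alone; a file split across several chunks
+  # needs the ratio recomputed after the per-chunk results are recombined.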
@@ -214,8 +213,8 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # Return list of two dfs with known or unknown read counts ---- print('Completing collate_fastq_reads.') - return(list(prism_barcode_counts= prism_barcode_counts, - unknown_barcode_counts= unknown_barcode_counts)) - #return(list(prism_barcode_counts= summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,], - # unknown_barcode_counts= summed_reads[!summed_reads[[barcode_col]] %chin% known_barcodes,])) + # return(list(prism_barcode_counts= prism_barcode_counts, + # unknown_barcode_counts= unknown_barcode_counts)) + return(list(prism_barcode_counts= summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,], + unknown_barcode_counts= summed_reads[!summed_reads[[barcode_col]] %chin% known_barcodes,])) } From fb3ced7ff4983c6c40363c57c20b23e7a1c1d7da Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 9 Oct 2024 15:05:38 -0400 Subject: [PATCH 117/127] Found and fixed a bug --- scripts/launch_job.sh | 1 + scripts/src/collate_fastq_reads.R | 26 +------------------------- 2 files changed, 2 insertions(+), 25 deletions(-) diff --git a/scripts/launch_job.sh b/scripts/launch_job.sh index 48ba689f..a843d8a3 100644 --- a/scripts/launch_job.sh +++ b/scripts/launch_job.sh @@ -105,6 +105,7 @@ echo "Running in container:" -e COUNT_COL_NAME="$COUNT_COL_NAME" \ -e CTL_TYPES="$CTL_TYPES" \ -e COUNT_THRESHOLD="$COUNT_THRESHOLD" \ + -e LOW_ABUNDANCE_THRESHOLD="$LOW_ABUNDANCE_THRESHOLD" -e RUN_NORM="$RUN_NORM" \ -e BUILD_NAME="$BUILD_NAME" \ -e CONVERT_SUSHI="$CONVERT_SUSHI" \ diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index 16ec2b98..f3001fae 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -170,32 +170,10 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # then the barcode is replaced with the string "unknown_low_abundance_barcode". # Function := performs the mutate in place without copying the dataframe. # Functions fifelse and %chin% are just faster data.table versions of ifelse and %in%. - summed_reads[, c(barcode_col) := data.table::fifelse(get(barcode_col) %chin% unique(known_barcodes) | - n >= 20, + n >= low_abundance_threshold, get(barcode_col), 'unknown_low_abundance_barcode')] - - # The code above is the initial implementation. It works locally, but not on Jenkins. - # The problem appears to occur when adding a second condition in the ifelse - not sure why this is happening. - # %chin% to %in% - error persists - # data.table::fifelse to base::ifelse - error persists - # Jenkins and local are working with the same version of data.table. - - # works? - # summed_reads[, c(barcode_col) := ifelse(get(barcode_col) %chin% unique(known_barcodes), - # get(barcode_col), 'unknown_low_abundance_barcode')] - - # wasted - # print(low_abundance_threshold) - # summed_reads= summed_reads[, .(n= sum(n)), by= c(id_cols, barcode_col)] - # prism_barcode_counts= summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,] - # - # unknown_barcode_counts= summed_reads[!summed_reads[[barcode_col]] %chin% known_barcodes,] - # unknown_barcode_counts[, c(barcode_col) := data.table::fifelse(n >= low_abundance_threshold, - # get(barcode_col), 'unknown_low_abundance_barcode')] - # unknown_barcode_counts= unknown_barcode_counts[, .(n= sum(n)), by= c(id_cols, barcode_col)] - # Use data.table to group by id_cols and barcode_col and sum up reads across flowcells. 
summed_reads= summed_reads[, .(n= sum(n)), by= c(id_cols, barcode_col)] @@ -213,8 +191,6 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # Return list of two dfs with known or unknown read counts ---- print('Completing collate_fastq_reads.') - # return(list(prism_barcode_counts= prism_barcode_counts, - # unknown_barcode_counts= unknown_barcode_counts)) return(list(prism_barcode_counts= summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,], unknown_barcode_counts= summed_reads[!summed_reads[[barcode_col]] %chin% known_barcodes,])) } From 4352d59d204e6b063731eb8834b1ab73ef216bbb Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 9 Oct 2024 15:08:17 -0400 Subject: [PATCH 118/127] Added missing slash --- scripts/launch_job.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/launch_job.sh b/scripts/launch_job.sh index a843d8a3..9196b5c5 100644 --- a/scripts/launch_job.sh +++ b/scripts/launch_job.sh @@ -105,7 +105,7 @@ echo "Running in container:" -e COUNT_COL_NAME="$COUNT_COL_NAME" \ -e CTL_TYPES="$CTL_TYPES" \ -e COUNT_THRESHOLD="$COUNT_THRESHOLD" \ - -e LOW_ABUNDANCE_THRESHOLD="$LOW_ABUNDANCE_THRESHOLD" + -e LOW_ABUNDANCE_THRESHOLD="$LOW_ABUNDANCE_THRESHOLD" \ -e RUN_NORM="$RUN_NORM" \ -e BUILD_NAME="$BUILD_NAME" \ -e CONVERT_SUSHI="$CONVERT_SUSHI" \ From 12138256f5e1ae05efdf8fdacf1abf7901780a6a Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 9 Oct 2024 15:52:54 -0400 Subject: [PATCH 119/127] Remove run_norm parameter --- scripts/launch_job.sh | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/scripts/launch_job.sh b/scripts/launch_job.sh index 9196b5c5..ca354fd6 100644 --- a/scripts/launch_job.sh +++ b/scripts/launch_job.sh @@ -23,17 +23,19 @@ fi # List of parameters PARAMS=( - SEQ_TYPE BUILD_DIR INDEX_1 INDEX_2 BARCODE_SUFFIX REVERSE_INDEX2 RUN_NORM BUILD_NAME - CONVERT_SUSHI PULL_POOL_ID RUN_EPS_QC REMOVE_DATA DAYS COUNTS + SEQ_TYPE BUILD_DIR INDEX_1 INDEX_2 BARCODE_SUFFIX REVERSE_INDEX2 BUILD_NAME + CONVERT_SUSHI PULL_POOL_ID RUN_EPS_QC REMOVE_DATA DAYS COUNTS API_URL # metadata files SAMPLE_META CELL_SET_META CELL_LINE_META CONTROL_BARCODE_META ASSAY_POOL_META # susi files RAW_COUNTS_UNCOLLAPSED PRISM_BARCODE_COUNTS UNKNOWN_BARCODE_COUNTS ANNOTATED_COUNTS FILTERED_COUNTS NORMALIZED_COUNTS LFC COLLAPSED_LFC - # column name paramters - SEQUENCING_INDEX_COLS ID_COLS CELL_LINE_COLS SIG_COLS CONTROL_COLS - # additional parameters - BARCODE_COL LOW_ABUNDANCE_THRESHOLD PSEUDOCOUNT COUNT_COL_NAME CTL_TYPES COUNT_THRESHOLD API_URL + # collate_fastq_reads parameters + SEQUENCING_INDEX_COLS ID_COLS LOW_ABUNDANCE_THRESHOLD BARCODE_COL + # normalize parameters + PSEUDOCOUNT + # compute_l2fc parameters + CELL_LINE_COLS SIG_COLS CONTROL_COLS COUNT_COL_NAME CTL_TYPES COUNT_THRESHOLD ) # Load parameters @@ -106,7 +108,6 @@ echo "Running in container:" -e CTL_TYPES="$CTL_TYPES" \ -e COUNT_THRESHOLD="$COUNT_THRESHOLD" \ -e LOW_ABUNDANCE_THRESHOLD="$LOW_ABUNDANCE_THRESHOLD" \ - -e RUN_NORM="$RUN_NORM" \ -e BUILD_NAME="$BUILD_NAME" \ -e CONVERT_SUSHI="$CONVERT_SUSHI" \ -e PULL_POOL_ID="$PULL_POOL_ID" \ From 2a997c4e3fe10e82d82c5bf58369235675470930 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Thu, 10 Oct 2024 16:07:06 -0400 Subject: [PATCH 120/127] Updated error message --- scripts/src/QC_images.R | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R index a45b911f..8bff0b0f 100755 --- a/scripts/src/QC_images.R +++ 
b/scripts/src/QC_images.R
@@ -2,10 +2,11 @@
 #'
 #' Create the qc table with index purity and cell line purity.
 #'
-#' @param raw_counts_uncollapsed Dataframe output from nori.
-#' @param raw_counts Raw counts dataframe outputed from collate_fastq_reads.
-#' @param filtered_counts Filtered counts dataframe outputed from filter_raw_reads.
-#' @param value_col String name of the counts column present all three dataframes.
+#' @param raw_counts_uncollapsed_path Path to the nori raw_counts_uncollapsed file.
+#' @param unknown_barcode_counts Dataframe of unknown barcodes.
+#' @param prism_barcode_counts Dataframe of prism barcodes extracted from the nori output.
+#' @param filtered_counts Filtered counts dataframe created from filter_raw_reads.
+#' @param value_col String name of the counts column present in all four input dataframes.
 #' @param file_path Location to write out the output.
 #' @returns Writes out a QC_table to the file_path.
 create_qc_table= function(raw_counts_uncollapsed_path, unknown_barcode_counts,
@@ -129,7 +130,6 @@ create_total_counts_barplot= function(filtered_counts, id_cols, facet_col= NA) {
   return(total_counts_plot)
 }
 
-
 #' Cell line recover barplot
 #'
 #' Creates barplots of the cell lines recovered. The parameter "plot_type" can be used to plot the percentage or
@@ -141,7 +141,7 @@ create_total_counts_barplot= function(filtered_counts, id_cols, facet_col= NA) {
 #' @param id_cols Vector of column names that identify each sample.
 #' @param facet_col String name of the column in filtered_counts to facet the plot.
 #' @param value_col String name of the column in filtered_counts that contains the counts.
-#' @param counts_threshold Threshold used to determine low counts.
+#' @param count_threshold Threshold used to determine low counts.
 #' @param plot_type String of either "percent" or "count" to adjust the y axis to be either the percentage or the
 #'   total number of cell lines.
 #' @param include_ctrl_bcs Boolean. Set to TRUE if control barcodes are to be counted.
@@ -449,14 +449,14 @@ create_replicate_scatterplots= function(input_df, cell_line_cols, replicate_grou
 #'
 #' Takes in various pipeline outputs and generates 11 QC files.
 #'
-#' @param raw_counts_uncollapsed Dataframe output from nori. This is used to generate purity metrics and
-#'   the index summaries.
-#' @param raw_counts Raw counts dataframe from the collate_fastq_reads modules. This is used to generate puritu metrics.
+#' @param raw_counts_uncollapsed_path Path to the raw_counts_uncollapsed file.
+#' @param prism_barcode_counts Dataframe of prism barcodes identified in the run.
+#' @param unknown_barcode_counts Dataframe of unknown barcodes.
 #' @param annotated_counts Annotated counts dataframe from the filter_raw_reads module.
-#' @param filtered_counts Filtered counts dataframe from the filter_raw_reads module.
 #' @param normalized_counts Normalized counts dataframe from the normalize module. This is an optional parameter.
 #' @param l2fc L2FC dataframe from the compute_l2fc module. This is used for the bio_reps plot.
 #' @param sample_meta Dataframe of the sample metadata for the sequencing run.
+#' @param barcode_col String name of the column containing the barcode sequences.
 #' @param cell_line_cols Vector of sample meta column names used to describe a cell line or barcode.
 #' @param id_cols Vector of sample meta column names used to identify each PCR well.
 #'   This defaults to "pcr_plate", "pcr_well".
@@ -605,8 +605,8 @@ QC_images= function(raw_counts_uncollapsed_path,
                   row.names= FALSE, quote= FALSE)
   }, error= function(e) {
     print(e)
-    print('Encountered an error when creating the total counts barplot. Skipping this output ...')
-    return('Totalc ounts image')
+    print('Encountered an error when creating the summary of unknown barcode reads. Skipping this output ...')
+    return('unknown barcode reads')
   })
 
   # Collect returned string if an error occurred

From 42e7a93c0b4753e0b07175550002bf0d3b2c4c35 Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Thu, 10 Oct 2024 16:08:37 -0400
Subject: [PATCH 121/127] Fixed flag

---
 scripts/src/collapse_bio_reps.R | 3 ++-
 scripts/src/compute_l2fc.R | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/scripts/src/collapse_bio_reps.R b/scripts/src/collapse_bio_reps.R
index 5c09eecb..c93537e6 100755
--- a/scripts/src/collapse_bio_reps.R
+++ b/scripts/src/collapse_bio_reps.R
@@ -41,7 +41,8 @@ collapse_bio_reps= function(l2fc, sig_cols, cell_line_cols= c('project_code', 'D
   }
 
   # Median collapsing bio replicates ----
-  collapsed_counts= l2fc %>% tidyr::unite(col= 'sig_id', all_of(sig_cols), sep= ':', na.rm= FALSE, remove= FALSE) %>%
+  collapsed_counts= l2fc %>% dplyr::filter(is.na(counts_flag)) %>%
+    tidyr::unite(col= 'sig_id', all_of(sig_cols), sep= ':', na.rm= FALSE, remove= FALSE) %>%
     dplyr::group_by(pick(all_of(c(cell_line_cols, 'sig_id', sig_cols)))) %>%
     dplyr::summarise(trt_median_n= median(mean_n), trt_median_normalized_n= median(mean_normalized_n),
                      trt_mad_sqrtN= mad(log2(mean_normalized_n)) / sqrt(dplyr::n()),
diff --git a/scripts/src/compute_l2fc.R b/scripts/src/compute_l2fc.R
index 3007253f..5db25f2d 100755
--- a/scripts/src/compute_l2fc.R
+++ b/scripts/src/compute_l2fc.R
@@ -77,7 +77,8 @@ compute_l2fc= function(normalized_counts,
   # Join neg_cons and compute l2fc ----
   l2fc= collapsed_tech_rep %>% dplyr::filter(!trt_type %in% c(control_type, 'day_0')) %>%
     dplyr::inner_join(controls, by= c(cell_line_cols, ctrl_cols), relationship='many-to-one') %>%
-    dplyr::mutate(l2fc= log2(mean_normalized_n/control_median_normalized_n))
+    dplyr::mutate(l2fc= log2(mean_normalized_n/control_median_normalized_n),
+                  counts_flag= ifelse(control_median_n < count_threshold, paste0('negcon<', count_threshold), NA))
 
   return(l2fc)
 }

From 19bb90f07d07178660a5b95e798f3512d4412dbb Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Thu, 10 Oct 2024 16:08:52 -0400
Subject: [PATCH 122/127] Updated comments

---
 scripts/collate_fastq_reads.R | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/scripts/collate_fastq_reads.R b/scripts/collate_fastq_reads.R
index 782f3cbd..2ccfbaf5 100644
--- a/scripts/collate_fastq_reads.R
+++ b/scripts/collate_fastq_reads.R
@@ -56,8 +56,8 @@ if(!validate_columns_exist(id_cols, sample_meta)) {
 }
 
 # Run collate_fastq_reads on chunks of raw_counts_uncollapsed.csv ----
-# raw_counts_uncollapsed can be too large to read into memory,
-# so collate_fastq_reads is performed on chunks of the large file.
+# raw_counts_uncollapsed could be too large to read into memory,
+# so collate_fastq_reads is performed on chunks of the raw_counts_uncollapsed file.
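+# A hedged, toy sketch of how the per-chunk results recombine below (`sample_id`
+# and `bc` are illustrative column names, not the pipeline's real ones):
+# chunk1= data.table::data.table(sample_id= 'A', bc= 'ACGT', n= 10)
+# chunk2= data.table::data.table(sample_id= 'A', bc= 'ACGT', n= 5)
+# combined= data.table::rbindlist(list(chunk1, chunk2))
+# combined[, .(n= sum(n)), by= c('sample_id', 'bc')]   # one row: 'A', 'ACGT', n= 15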
chunked_results= process_in_chunks(large_file_path= args$raw_counts_uncollapsed,
                                   chunk_size= 10^6,
                                   action= collate_fastq_reads,
@@ -70,14 +70,13 @@ chunked_results= process_in_chunks(large_file_path= args$raw_counts_uncollapsed,
                                    barcode_col= args$barcode_col,
                                    low_abundance_threshold= as.numeric(args$low_abundance_threshold))
 
-# From each chunk, extract prism_barcode_counts and bind the rows together into one dataframe.
+# From each chunk, extract prism_barcode_counts or unknown_barcode_counts and bind those rows together.
+# Then use data.table to aggregate and sum up reads across the chunks.
+# data.table functions are faster and less memory intensive.
 prism_barcode_counts= data.table::rbindlist(lapply(chunked_results, function(x) x$prism_barcode_counts))
-# Use data.table to group_by id_cols and barcode_col to sum up reads across all chunks.
 prism_barcode_counts= prism_barcode_counts[, .(n= sum(n)), by= c(id_cols, args$barcode_col)]
 
-# From each chunk, extract unknown_barcode_counts and bind the rows together into one dataframe.
 unknown_barcode_counts= data.table::rbindlist(lapply(chunked_results, function(x) x$unknown_barcode_counts))
-# Use data.table to group_by id_cols and barcode_col to sum up reads across all chunks.
 unknown_barcode_counts= unknown_barcode_counts[, .(n= sum(n)), by= c(id_cols, args$barcode_col)]
 
 # Validation: Basic file size check ----

From ad2b61f3115766d14243fc2344a4dd00c06a5e4f Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Thu, 10 Oct 2024 16:09:11 -0400
Subject: [PATCH 123/127] Reordered params and added comments

---
 scripts/make_config_file.groovy | 56 ++++++++++++++++++---------------
 1 file changed, 30 insertions(+), 26 deletions(-)

diff --git a/scripts/make_config_file.groovy b/scripts/make_config_file.groovy
index 59b6dfa7..49ac31a6 100644
--- a/scripts/make_config_file.groovy
+++ b/scripts/make_config_file.groovy
@@ -8,16 +8,15 @@ pipeline {
   parameters {
     booleanParam(name: 'TRIGGER_BUILD', defaultValue: true, description: 'Check this to trigger the build. If unchecked, the build will not be triggered and only the config.json will be generated.')
     booleanParam(name: 'CREATE_CELLDB_METADATA', defaultValue: true, description: 'Check this to trigger the create_celldb_metadata job.')
+    booleanParam(name: 'PULL_POOL_ID', defaultValue: false, description: 'Flag indicating whether to pull pool IDs from CellDB - only applicable to cell sets (i.e. EXT.PR500.CS01.1.A, EXT.PR500.CS01.1.B, etc).')
     booleanParam(name: 'COLLATE_FASTQ_READS', defaultValue: true, description: 'Check this to trigger the collate_fastq_reads job.')
     booleanParam(name: 'FILTER_COUNTS', defaultValue: true, description: 'Check this to trigger the filter_counts job.')
+    booleanParam(name: 'REMOVE_DATA', defaultValue: false, description: 'Select if there is experimental data that needs to be removed before normalization. 
TODO: expand on this.')
     booleanParam(name: 'CBNORMALIZE', defaultValue: true, description: 'Check this to trigger the CBnormalize job.')
     booleanParam(name: 'COMPUTE_LFC', defaultValue: true, description: 'Check this to trigger the compute_l2fc job.')
     booleanParam(name: 'COLLAPSE', defaultValue: true, description: 'Check this to trigger the collapse job.')
     booleanParam(name: 'QC_IMAGES', defaultValue: true, description: 'Check this to trigger the QC job.')
-    booleanParam(name: 'JOIN_METADATA', defaultValue: true, description: 'Check this to trigger the join_metadata job.')
-    booleanParam(name: 'REMOVE_DATA', defaultValue: false, description: 'Select if there is experimental data that needs to be removed before normalization. TODO: expand on this.')
-    booleanParam(name: 'RUN_NORM', defaultValue: true, description: 'Run normalization module on data.')
-    booleanParam(name: 'PULL_POOL_ID', defaultValue: false, description: 'Flag indicating whether to pull pool IDs from CellDB - only applicable to cell sets (i.e. EXT.PR500.CS01.1.A, EXT.PR500.CS01.1.B, etc).')
+    booleanParam(name: 'JOIN_METADATA', defaultValue: true, description: 'Check this to trigger the join_metadata job. This should be checked if you are planning on running CONVERT_SUSHI.')
     booleanParam(name: 'CONVERT_SUSHI', defaultValue: false, description: 'Convert output column headers to format for MTS pipeline and upload to s3.')
     booleanParam(name: 'RUN_EPS_QC', defaultValue: false, description: 'Run EPS QC')
     string(name: 'BUILD_DIR', defaultValue: '/cmap/obelix/pod/prismSeq/', description: 'Output path to deposit build. Format should be /directory/PROJECT_CODE/BUILD_NAME')
@@ -25,27 +24,36 @@ pipeline {
     string(name: 'SCREEN', defaultValue: '', description: 'Screen name from COMET, necessary if using COMET for sample metadata.')
     string(name: 'SEQ_TYPE', defaultValue: 'DRAGEN', description: 'Choose DRAGEN, MiSeq, HiSeq, or NovaSeq. MiSeq and HiSeq/NovaSeq return files named differently. This setting sets the INDEX_1, INDEX_2, and BARCODE_SUFFIX parameters in fastq2readcount. Select DRAGEN if fastq files are from the DRAGEN pipeline from GP. Choosing NovaSeq reverses index 2.')
     string(name: 'DAYS', defaultValue: '', description: 'If running the sushi_to_mts module, provide any days/timepoints (separated by commas) that should be dropped from output data. No quotes needed (ie, 2,8).')
+
+    // pipeline version
     string(name: 'GIT_BRANCH', defaultValue: 'main', description: 'Pipeline branch to use')
     booleanParam(name: 'USE_LATEST', defaultValue: true, description: 'Check this to use the most up to date version from the specified branch. 
If not checked, will use the specified commit.')
     string(name: 'COMMIT_ID', defaultValue: '', description: 'Specific commit ID to use (leave empty if using the latest commit in the branch or if already specified in the config file.)')
 
     // Most common parameters
-    // Column names parameters
-    string(name: 'SEQUENCING_INDEX_COLS', defaultValue: 'flowcell_names,index_1,index_2', description: 'Sequencing index columns used in COLLATE_FASTQ_READS')
-    string(name: 'ID_COLS', defaultValue: 'pcr_plate,pcr_well', description: 'Columns to concat to create unique ID for each sample-replicate')
+    string(name: 'SEQUENCING_INDEX_COLS', defaultValue: 'flowcell_names,index_1,index_2', description: 'Used in COLLATE_FASTQ_READS, this is a comma-separated list of sequencing columns in the sample meta that are needed to identify every PCR well in the run.')
+    string(name: 'ID_COLS', defaultValue: 'pcr_plate,pcr_well', description: 'Used in COLLATE_FASTQ_READS, columns to concat to create a unique ID for each sample-replicate')
     string(name: 'CELL_LINE_COLS', defaultValue: 'DepMap_ID', description: 'Columns in intermediate files that describe a read or cell line')
-    string(name: 'SIG_COLS', defaultValue: 'cell_set,treatment,dose,dose_unit,day', description: 'Signature columns')
-    string(name: 'CONTROL_COLS', defaultValue: 'cell_set,day', description: 'Set of columns that define individual controls in COMPUTE_LFC')
+    string(name: 'SIG_COLS', defaultValue: 'cell_set,treatment,dose,dose_unit,day', description: 'Signature columns; these describe unique treatment conditions and generally should not include replicate information.')
+    string(name: 'CONTROL_COLS', defaultValue: 'cell_set,day', description: 'Set of columns that define individual negative control conditions.')
 
-    // Metadata files used by sushi
+    // Sushi input files
+    string(name: 'RAW_COUNTS_UNCOLLAPSED', defaultValue: 'raw_counts_uncollapsed.csv', description: 'Filename in BUILD_DIR containing nori output')
     string(name: 'SAMPLE_META', defaultValue: 'sample_meta.csv', description: 'File name of sample metadata within the BUILD_DIR directory.')
     string(name: 'CELL_SET_META', defaultValue: 'cell_set_meta.csv', description: 'Cell Set Metadata. 
Static cell_line_meta location: /data/vdb/prismSeq/cell_set_meta.csv')
     string(name: 'CELL_LINE_META', defaultValue: 'cell_line_meta.csv', description: 'File in BUILD_DIR containing cell line metadata')
     string(name: 'CONTROL_BARCODE_META', defaultValue: 'CB_meta.csv', description: 'Metadata for control barcodes.')
     string(name: 'ASSAY_POOL_META', defaultValue: 'assay_pool_meta.txt', description: 'File in BUILD_DIR containing assay pool metadata')
 
-    // Files consumed and created by sushi
-    string(name: 'RAW_COUNTS_UNCOLLAPSED', defaultValue: 'raw_counts_uncollapsed.csv', description: 'Filename in BUILD_DIR containing nori output')
+    // Additional parameters ordered by when they first appear
+    string(name: 'BARCODE_COL', defaultValue: 'forward_read_cl_barcode', description: 'Used in COLLATE_FASTQ_READS, the name of the column containing the read')
+    string(name: 'LOW_ABUNDANCE_THRESHOLD', defaultValue: '20', description: 'Used in COLLATE_FASTQ_READS, threshold for unknown barcodes')
+    string(name: 'PSEUDOCOUNT', defaultValue: '20', description: 'Used in CBNORMALIZE, the pseudocount value for log transformations.')
+    string(name: 'COUNT_COL_NAME', defaultValue: 'normalized_n', description: 'Used in COMPUTE_LFC, the name of the numeric column to use for calculations')
+    string(name: 'CTL_TYPES', defaultValue: 'negcon', description: 'Used in COMPUTE_LFC, the value in trt_type that indicates the negative controls')
+    string(name: 'COUNT_THRESHOLD', defaultValue: '40', description: 'Used in COMPUTE_LFC, the count threshold for the collapsed negative controls. Cell lines in the negative controls below this threshold will be dropped from log2 fold change calculations.')
+
+    // Files created by sushi
     string(name: 'PRISM_BARCODE_COUNTS', defaultValue: 'prism_barcode_counts.csv', description: 'Filename in BUILD_DIR containing PRISM barcode counts')
     string(name: 'UNKNOWN_BARCODE_COUNTS', defaultValue: 'unknown_barcode_counts.csv', description: 'Filename in BUILD_DIR containing unknown barcode counts')
     string(name: 'ANNOTATED_COUNTS', defaultValue: 'annotated_counts.csv', description: 'Filename in BUILD_DIR containing annotated counts')
@@ -53,14 +61,7 @@ pipeline {
     string(name: 'NORMALIZED_COUNTS', defaultValue: 'normalized_counts.csv', description: 'Filename in BUILD_DIR containing normalized counts')
     string(name: 'LFC', defaultValue: 'l2fc.csv', description: 'Filename containing log2 fold change values')
     string(name: 'COLLAPSED_LFC', defaultValue: 'collapsed_l2fc.csv', description: 'Filename in BUILD_DIR containing replicate collapsed l2fc values')
-
-    // Additional parameters
-    string(name: 'BARCODE_COL', defaultValue: 'forward_read_cl_barcode', description: 'In COLLATE_FASTQ_READS, the name of the column containing the read')
-    string(name: 'LOW_ABUNDANCE_THRESHOLD', defaultValue: '20', description: 'In COLLATE_FASTQ_READS, threshold for unknown barcodes')
-    string(name: 'PSEUDOCOUNT', defaultValue: '20', description: 'In CBNORMALIZE, the pesudocount value for log transformations.')
-    string(name: 'COUNT_COL_NAME', defaultValue: 'normalized_n', description: 'In COMPUTE_LFC, the name of the numeric column to use for calculations')
-    string(name: 'CTL_TYPES', defaultValue: 'negcon', description: 'In COMPUTE_LFC, the value in trt_type that indicates the negative controls')
-    string(name: 'COUNT_THRESHOLD', defaultValue: '40', description: 'Drops cell lines below this threshold in the negative controls')
+
     // Other
     string(name: 'API_URL', defaultValue: 'https://api.clue.io/api/', description: 'API URL') 
}
 
@@ -113,8 +114,8 @@ pipeline {
       steps {
         script {
           def paramList = [
-            'SEQ_TYPE', 'API_URL', 'BUILD_DIR', 'INDEX_1', 'INDEX_2', 'BARCODE_SUFFIX', 'REVERSE_INDEX2',
-            'RUN_NORM', 'BUILD_NAME', 'CONVERT_SUSHI', 'PULL_POOL_ID', 'RUN_EPS_QC', 'REMOVE_DATA', 'DAYS',
+            'SEQ_TYPE', 'API_URL', 'BUILD_DIR', 'INDEX_1', 'INDEX_2', 'BARCODE_SUFFIX',
+            'BUILD_NAME', 'CONVERT_SUSHI', 'PULL_POOL_ID', 'RUN_EPS_QC', 'REMOVE_DATA', 'DAYS',
             'COUNTS', 'SCREEN',
 
             // metadata files
            'SAMPLE_META', 'CELL_SET_META', 'CELL_LINE_META', 'CONTROL_BARCODE_META', 'ASSAY_POOL_META',
 
            // sushi files
            'RAW_COUNTS_UNCOLLAPSED', 'PRISM_BARCODE_COUNTS', 'UNKNOWN_BARCODE_COUNTS', 'ANNOTATED_COUNTS',
            'FILTERED_COUNTS', 'NORMALIZED_COUNTS', 'LFC', 'COLLAPSED_LFC',
 
-            // column name parameters
-            'SEQUENCING_INDEX_COLS', 'ID_COLS', 'CELL_LINE_COLS', 'SIG_COLS', 'CONTROL_COLS',
+            // collate_fastq_reads parameters
+            'SEQUENCING_INDEX_COLS', 'ID_COLS', 'BARCODE_COL', 'LOW_ABUNDANCE_THRESHOLD', 'REVERSE_INDEX2',
+
+            // normalize parameters
+            'PSEUDOCOUNT',
 
-            // additional parameters
-            'BARCODE_COL', 'LOW_ABUNDANCE_THRESHOLD', 'PSEUDOCOUNT', 'COUNT_COL_NAME', 'CTL_TYPES', 'COUNT_THRESHOLD', 'API_URL'
+            // compute_l2fc parameters
+            'SIG_COLS', 'CONTROL_COLS', 'CELL_LINE_COLS', 'COUNT_COL_NAME', 'CTL_TYPES', 'COUNT_THRESHOLD',
           ]
 
           def config = [:]

From b3294fac0c020fbd53f561fa528a96db18aa22fb Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Thu, 10 Oct 2024 17:07:24 -0400
Subject: [PATCH 124/127] Added more comments

---
 scripts/src/kitchen_utensils.R | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/scripts/src/kitchen_utensils.R b/scripts/src/kitchen_utensils.R
index cfe9c0f4..d11eddd2 100644
--- a/scripts/src/kitchen_utensils.R
+++ b/scripts/src/kitchen_utensils.R
@@ -6,11 +6,14 @@
 #'
 #' This function runs some action over chunks of a large file. At the end, returns a list of all the chunks
 #'
-#' @param large_file_path description
-#' @param chunk_size description
-#' @param action A function passed to act on each chunk
+#' @param large_file_path Path to a large csv file. This file may be too large to read into R.
+#' @param chunk_size The number of rows in a chunk.
+#' @param action A function to perform over a chunk.
 #' @param ... Additional parameters to be passed into the action parameter
 process_in_chunks= function(large_file_path, chunk_size= 10^6, action, ...) {
+  # Read in the column names. These names will be passed on to each chunk.
+  # When reading a file in chunks, the column names in the first line are not always passed.
+  # Use data.table to read in just the headers with nrow= 0.
   header_col_names= data.table::fread(large_file_path, header= TRUE, sep= ',', nrow= 0) %>% colnames()
   chunk_idx= 1 # Counter to keep track of chunks in a loop
   current_chunk_size= chunk_size # Variable for loop exit condition
 
   # For each chunk, call an action
   while(current_chunk_size == chunk_size) {
+    # Read in a chunk of the large file and set the column names.
+    # nrow - the number of rows to read in
+    # skip - the number of rows to skip before starting to read in.
     current_chunk= data.table::fread(large_file_path, header= FALSE, sep= ',', col.names= header_col_names,
                                      nrow= chunk_size, skip= chunk_size * (chunk_idx - 1) + 1)
 
@@ -25,6 +31,7 @@ process_in_chunks= function(large_file_path, chunk_size= 10^6, action, ...) 
{ current_chunk_size= nrow(current_chunk) # set current chunk size to stop loop print(paste('Working on chunk', chunk_idx, 'with', current_chunk_size, 'rows.', sep= ' ')) + # Call the action over the chunk chunk_collector[[chunk_idx]]= do.call(action, list(current_chunk, ...)) chunk_idx= chunk_idx + 1 } From 84d7dc1e99eaef22f5500c88ccffcb8a63f02aba Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Thu, 10 Oct 2024 17:27:29 -0400 Subject: [PATCH 125/127] Dropped run_norm parameter --- scripts/CBnormalize.sh | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/scripts/CBnormalize.sh b/scripts/CBnormalize.sh index e23c1979..4af2fdd7 100644 --- a/scripts/CBnormalize.sh +++ b/scripts/CBnormalize.sh @@ -52,27 +52,18 @@ echo SAMPLE_META is: $SAMPLE_META echo $RUN_NORM -if [[ "$RUN_NORM" == "true" ]] -then - echo "Running normalization module" - - echo Rscript CBnormalize.R -c $FILTERED_COUNTS \ - --CB_meta $CONTROL_BARCODE_META \ - --pseudocount $PSEUDOCOUNT \ - --id_cols $ID_COLS \ - --out $BUILD_DIR +echo "Running normalization module" - Rscript CBnormalize.R -c $FILTERED_COUNTS \ - --CB_meta $CONTROL_BARCODE_META \ - --pseudocount $PSEUDOCOUNT \ - --id_cols $ID_COLS \ - --out $BUILD_DIR +echo Rscript CBnormalize.R -c $FILTERED_COUNTS \ +--CB_meta $CONTROL_BARCODE_META \ +--pseudocount $PSEUDOCOUNT \ +--id_cols $ID_COLS \ +--out $BUILD_DIR - COUNTS="normalized_counts.csv" +Rscript CBnormalize.R -c $FILTERED_COUNTS \ +--CB_meta $CONTROL_BARCODE_META \ +--pseudocount $PSEUDOCOUNT \ +--id_cols $ID_COLS \ +--out $BUILD_DIR -else - echo "Not running normalization module" - COUNTS=$FILTERED_COUNTS - COUNT_COL_NAME="n" - echo $COUNTS -fi +COUNTS="normalized_counts.csv" From 3469baee5aa65d52d558e277f0ead632ff40af7b Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Thu, 10 Oct 2024 17:28:48 -0400 Subject: [PATCH 126/127] Added back CCLE_name Added back CCLE_name in order to get the adapter script to work --- scripts/join_metadata.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/join_metadata.R b/scripts/join_metadata.R index 79d442f6..78aaa44d 100644 --- a/scripts/join_metadata.R +++ b/scripts/join_metadata.R @@ -56,7 +56,7 @@ if(file.exists(args$lfc)) { if(assay_pool_meta_exists) { l2fc_with_meta_columns= join_metadata(input_df= l2fc_with_meta_columns, metadata= input_assay_pool_meta, - key_cols= c('DepMap_ID', 'cell_set')) + key_cols= c('DepMap_ID', 'CCLE_name', 'cell_set')) } else { print('WARNING: Assay pool meta not detected and will not be joined onto l2fc.') } @@ -83,7 +83,7 @@ if(file.exists(args$collapsed_lfc)) { print('Attempting to add assay_pool_meta to collapsed l2fc.') collapsed_l2fc_with_meta_columns= join_metadata(input_df= collapsed_l2fc_with_meta_columns, metadata= input_assay_pool_meta, - key_cols= c('DepMap_ID', 'cell_set')) + key_cols= c('DepMap_ID', 'CCLE_name', 'cell_set')) } else { print('WARNING: Assay pool meta not detected and will not be joined onto collapsed l2fc.') } From 0f1bb43ba352086e107557df4c58c97491257993 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 11 Oct 2024 08:54:51 -0400 Subject: [PATCH 127/127] Drop wells without a cell set --- scripts/src/filter_raw_reads.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/src/filter_raw_reads.R b/scripts/src/filter_raw_reads.R index 1a3384f7..83d036c6 100755 --- a/scripts/src/filter_raw_reads.R +++ b/scripts/src/filter_raw_reads.R @@ -75,7 +75,8 @@ filter_raw_reads = function(prism_barcode_counts, # every row is a 
cell line that is expected in a PCR well.
   print('Creating template of expected reads.')
   # Join cell_set_meta and cell_line_meta. The cell_set can be a name "P939" or a list of LUAs.
-  template= sample_meta %>% dplyr::left_join(cell_set_meta, by= 'cell_set') %>%
+  template= sample_meta %>% dplyr::filter(!is.na(cell_set)) %>% # drop wells without a cell set to prevent NA rows
+    dplyr::left_join(cell_set_meta, by= 'cell_set') %>%
     dplyr::mutate(members= ifelse(is.na(members), str_split(cell_set, ';'), str_split(members, ';'))) %>%
     tidyr::unnest(cols= members) %>%
     dplyr::left_join(cell_line_meta, by= dplyr::join_by('members'=='LUA'), relationship= 'many-to-one') %>%
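
A hedged, self-contained sketch (toy tibbles; the values are made up, not from a real build) of
the NA-row behavior that [PATCH 127/127] guards against: dplyr::left_join() still emits a row
when the key column is NA, so a sample-meta well with no cell_set would survive the join with
NA metadata and flow into the template. Filtering those wells out first, as the patch does,
keeps the template clean.

    library(dplyr)

    sample_meta= tibble::tibble(pcr_well= c('A1', 'A2'), cell_set= c('CS1', NA))
    cell_set_meta= tibble::tibble(cell_set= 'CS1', members= 'LUA1;LUA2')

    # Without the filter, the A2 row joins through with members= NA and
    # produces NA rows downstream of the unnest.
    sample_meta %>% dplyr::left_join(cell_set_meta, by= 'cell_set')

    # With the filter, wells that have no cell set are dropped up front.
    sample_meta %>% dplyr::filter(!is.na(cell_set)) %>%
      dplyr::left_join(cell_set_meta, by= 'cell_set')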