From 3ad3433b221eb418d01d293f26180829338d8d28 Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Mon, 19 Aug 2024 08:48:17 -0400
Subject: [PATCH 001/127] Created functions each plot

---
 scripts/src/QC_images.R | 398 ++++++++++++++++++++++++++++++----------
 1 file changed, 300 insertions(+), 98 deletions(-)

diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R
index 322fd110..29ef4d87 100755
--- a/scripts/src/QC_images.R
+++ b/scripts/src/QC_images.R
@@ -1,3 +1,24 @@
+#' validate_columns_exist
+#'
+#' This function checks that a list of columns is present in a dataframe.
+#' Columns that were not found in the dataframe are printed out.
+#'
+#' @param selected_columns A vector of strings each representing a column name
+#' @param df A dataframe to check against
+#' @return Boolean
+validate_columns_exist= function(selected_columns, df) {
+  # Check that all of selected_columns are in df
+  unmatched_cols= setdiff(selected_columns, colnames(df))
+
+  if(length(unmatched_cols) > 0) {
+    print('The following columns are missing: ')
+    print(unmatched_cols)
+    return(FALSE)
+  } else {
+    return(TRUE)
+  }
+}
+
 #' Calculate index summaries
 #'
 #' Generates some simple summaries for each unique index.
@@ -27,6 +48,232 @@ get_index_summary= function(df, index_col, valid_indices) {
   return(output_summary)
 }
 
+#' Calculate purity metrics
+#'
+#' Creates the QC table with index purity and cell line purity.
+#'
+#' @param raw_counts_uncollapsed Dataframe output from nori.
+#' @param raw_counts Raw counts dataframe output from collate_fastq_reads.
+#' @param filtered_counts Filtered counts dataframe output from filter_raw_reads.
+#' @param counts_col String name of the counts column in all three dataframes.
+#' @param file_path Location to write out the output.
+create_qc_table= function(raw_counts_uncollapsed, raw_counts, filtered_counts,
+                          counts_col= 'n', file_path) {
+  index_purity= sum(raw_counts[[counts_col]]) / sum(raw_counts_uncollapsed[[counts_col]])
+  print(paste0('Index purity: ', round(index_purity, 4)))
+  cell_line_purity= sum(filtered_counts[[counts_col]]) / sum(raw_counts[[counts_col]])
+  print(paste0('Cell line purity: ', round(cell_line_purity, 4)))
+
+  qc_table= data.frame(index_purity= index_purity, cell_line_purity= cell_line_purity)
+
+  print(paste0('Writing QC table out to ', file_path))
+  qc_table %>% write.csv(file_path, row.names= FALSE, quote= FALSE)
+}
+
+#' Total counts barplot
+#'
+#' Creates the total counts barplot with bars colored by the barcode type,
+#' either a cell line barcode or control barcode.
+#'
+#' @param filtered_counts Filtered counts dataframe.
+#' @param id_cols Vector of column names that identify each sample.
+#' @param facet_col String name of the column in filtered_counts to facet the plot.
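+#' @examples
+#' # A minimal sketch of a call (hypothetical column names, not from this repo's tests):
+#' # plot_total_counts(filtered_counts, id_cols= c('pcr_plate', 'pcr_well'),
+#' #                   facet_col= 'pcr_plate')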
+plot_total_counts= function(filtered_counts, id_cols, facet_col= NA) {
+  total_counts= filtered_counts %>%
+    dplyr::mutate(barcode_type= case_when(!is.na(CCLE_name) ~ 'cell line',
+                                          !is.na(Name) ~ 'ctrl barcode')) %>%
+    tidyr::unite(all_of(id_cols), col= 'sample_id', sep= ':', remove= FALSE, na.rm= FALSE) %>%
+    dplyr::group_by(pick(all_of(na.omit(c('sample_id', facet_col, 'barcode_type'))))) %>%
+    dplyr::summarise(total_counts= sum(n)) %>% dplyr::ungroup()
+
+  total_counts_plot= total_counts %>%
+    ggplot(aes(x=sample_id, y=total_counts, fill=barcode_type)) +
+    geom_col(alpha=0.75, position='identity') +
+    geom_hline(yintercept= 10^4, linetype=2) +
+    {if(!is.na(facet_col)) facet_wrap(~.data[[facet_col]], scale= 'free_x')} +
+    labs(x= "Sample constructed using id_cols", y="Total counts", fill= 'Barcode\ntype',
+         title= 'Filtered counts - unstacked') +
+    theme_bw() + theme(axis.text.x = element_text(angle=70, hjust=1))
+
+  return(total_counts_plot)
+}
+
+
+#' Cell line recovery barplot
+#'
+#' Creates a stacked barplot showing, for each sample, how many cell lines were
+#' detected, had low counts, or were not detected.
+#'
+#' @param filtered_counts Filtered counts dataframe.
+#' @param id_cols Vector of column names that identify each sample.
+#' @param facet_col String name of the column in filtered_counts to facet the plot.
+#' @param counts_col String name of the column in filtered_counts that contains the counts.
+#' @param counts_threshold Threshold used to determine low counts.
+#' @param plot_type Either "percent" or "count". Sets the y axis to the percentage or the number of cell lines.
+#' @param include_ctrl_bcs Boolean for whether control barcodes should be included. Defaults to FALSE.
+plot_cl_recovery= function(filtered_counts, id_cols, facet_col= NA, counts_col= 'n', counts_threshold,
+                           plot_type= 'percent', include_ctrl_bcs= FALSE) {
+
+  # Filter out control barcodes if specified.
+  if(include_ctrl_bcs == FALSE) {
+    filtered_counts= filtered_counts %>% dplyr::filter(is.na(Name))
+  }
+
+  # Count number of cell lines/barcodes for a detection group.
+  recovery= filtered_counts %>%
+    dplyr::add_count(pick(all_of(id_cols)), name= 'total_num_cls') %>%
+    dplyr::mutate(detect_type= case_when(.data[[counts_col]] == 0 ~ 'Not detected',
+                                         .data[[counts_col]] <= counts_threshold ~ 'Low counts',
+                                         .data[[counts_col]] > counts_threshold ~ 'Detected')) %>%
+    dplyr::count(pick(all_of(c(id_cols, facet_col, 'detect_type', 'total_num_cls'))), name= 'num_cls_by_type') %>%
+    tidyr::unite(all_of(id_cols), col= 'sample_id', sep= ':', remove= FALSE, na.rm= FALSE) %>%
+    dplyr::mutate(percent= (num_cls_by_type / total_num_cls) * 100)
+
+  # Set the y axis depending on the plot type.
+  if(plot_type == 'count') {
+    y_col= 'num_cls_by_type'
+    y_text= 'Number of cell lines'
+  } else {
+    if(plot_type != 'percent') {
+      print(paste0('Warning: ', plot_type, ' is not a valid plot type. Please use either count or percent.'))
+      print('Defaulting to percent plot.')
+    }
+    y_col= 'percent'
+    y_text= 'Percentage of cell lines recovered (%)'
+  }
+
+  # Create recovery plot.
+  recov_plot= recovery %>%
+    ggplot(aes(x= sample_id, y= .data[[y_col]], fill= reorder(detect_type, dplyr::desc(detect_type)))) +
+    geom_col(alpha=0.75, position='stack') +
+    {if(!is.na(facet_col)) facet_wrap(~.data[[facet_col]], scale= 'free_x')} +
+    labs(x= "Sample constructed using id_cols", y= y_text, fill= '', title= 'Cell line recovery') +
+    theme_bw() + theme(axis.text.x = element_text(angle=70, hjust=1))
+
+  return(recov_plot)
+}
+
+#' Control barcode scatter plot
+#'
+#' Plots control barcode counts against their log2 doses for each sample, along with
+#' the fitted line and the fit metrics from normalization.
+#'
+#' @param normalized_counts Normalized counts dataframe.
+#' @param id_cols Vector of column names that identify each sample.
+#' @param counts_col String name of the column in normalized_counts that contains the counts.
+plot_ctrl_bc_trend= function(normalized_counts, id_cols, counts_col= 'log2_n') {
+  # Detect norm_r2 and norm_mae.
+  # If columns do not exist, then roughly calculate those columns.
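+  # A worked sketch of the fallback fit below (toy values, for illustration only):
+  # log2_normalized_n is treated as the prediction of log2_dose within one profile, so
+  #   obs= c(2, 4, 6); pred= c(2.1, 3.8, 6.3)
+  #   1 - sum((obs - pred)^2) / sum((obs - mean(obs))^2)   # norm_r2, ~0.98
+  #   median(abs(obs - pred))                              # norm_mae, 0.2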
+  if(any(!c('norm_r2', 'norm_mae') %in% colnames(normalized_counts))) {
+    print('WARNING: Columns "norm_r2" and/or "norm_mae" were not detected in normalized_counts.', quote= FALSE)
+    print('Calculating both columns - this method may not be as robust as the normalize module.')
+
+    normalized_counts= normalized_counts %>%
+      dplyr::filter(!is.na(Name), control_barcodes %in% c("Y", "T", T), n != 0) %>%
+      dplyr::group_by(pick(all_of(id_cols))) %>%
+      dplyr::mutate(mean_y= mean(log2_dose),
+                    residual2= (log2_dose - log2_normalized_n)^2,
+                    squares2= (log2_dose - mean_y)^2,
+                    norm_r2= 1 - sum(residual2) / sum(squares2),
+                    norm_mae= median(abs(log2_dose- log2_normalized_n))) %>% ungroup()
+  }
+
+  # Filter for just the control barcodes, create a profile_id for faceting,
+  # and determine the x and y positions for the r2 + mae label.
+  cb_trend= normalized_counts %>% dplyr::filter(!is.na(Name), control_barcodes %in% c("Y", "T", T)) %>%
+    tidyr::unite(all_of(id_cols), col= 'profile_id', sep= ':', remove= TRUE) %>%
+    dplyr::group_by(profile_id) %>% dplyr::mutate(label_x_pos= min(.data[[counts_col]]),
+                                                  label_y_pos= max(log2_dose)) %>% dplyr::ungroup()
+
+  # Create control barcode trend plot
+  trend_scatter_plot= cb_trend %>% ggplot(aes(x= .data[[counts_col]], y= log2_dose)) +
+    geom_point() +
+    geom_abline(aes(slope=1, intercept= cb_intercept), color='blue', alpha= 0.5) +
+    geom_text(aes(x= label_x_pos, y= label_y_pos,
+                  label= paste0('r2= ', round(norm_r2, 4), '\nmae= ', round(norm_mae, 4))),
+              hjust='inward', vjust='inward', alpha= 0.5) +
+    facet_wrap(~profile_id, scales='free_x') +
+    labs(title= 'Linear fit of control barcodes across all samples') + theme_bw()
+
+  return(trend_scatter_plot)
+}
+
+
+#' Heatmap of correlations
+#'
+#' Computes pairwise correlations across samples and plots them as a heatmap.
+#'
+#' @param input_df Dataframe containing the columns named in row_id_cols, col_id_cols, and counts_col.
+#' @param row_id_cols Vector of column names used as the rows of the pivoted matrix.
+#' @param col_id_cols Vector of column names used as the columns of the pivoted matrix.
+#' @param counts_col String name of the column containing the values to correlate.
+#' @param cor_method Correlation method. Defaults to "pearson".
+plot_cor_heatmap= function(input_df, row_id_cols, col_id_cols, counts_col,
+                           cor_method= 'pearson') {
+
+  # Validate that specified columns are in the dataframe.
+  if(!validate_columns_exist(c(row_id_cols, col_id_cols, counts_col), input_df)) {
+    stop('Not all columns were detected in the input dataframe.')
+  }
+
+  # Create row and column names for pivoting to a matrix
+  correlation_mx= input_df %>%
+    tidyr::unite(all_of(row_id_cols), col= 'row_id', sep= ':', remove= TRUE) %>%
+    tidyr::unite(all_of(col_id_cols), col= 'col_id', sep= ':', remove= TRUE)
+
+  # Check that the row and column ids specify one value.
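+  # For example, with row_id_cols= 'DepMap_ID' and col_id_cols= c('pcr_plate', 'pcr_well')
+  # (hypothetical choices), a duplicate here usually means input_df still holds several
+  # rows per cell line and well, e.g. uncollapsed replicates that should be summed first.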
+  validate_ids= correlation_mx %>% dplyr::group_by(row_id, col_id) %>%
+    dplyr::filter(dplyr::n() > 1) %>% dplyr::ungroup()
+  if(nrow(validate_ids) != 0) {
+    print('The provided columns specify more than one value.')
+    print(head(validate_ids))
+    stop('Multiple values detected for a unique combination of "row_id_cols" and "col_id_cols".')
+  }
+
+  # Pivot and calculate correlations
+  correlation_mx= correlation_mx %>% reshape2::acast(row_id~col_id, value.var= counts_col) %>%
+    WGCNA::cor(use= 'pairwise.complete.obs', method= cor_method)
+
+  # Create heatmap
+  cor_heatmap= correlation_mx %>% reshape2::melt() %>%
+    ggplot(aes(x= Var1, y= Var2, fill= value)) +
+    geom_tile() +
+    labs(x= '', y= '', fill= '', title= paste0('Correlations using ', counts_col)) +
+    scale_fill_gradientn(breaks= c(0, 0.5, 1),
+                         colours= c('blue', 'white','red'),
+                         limits=c(0, 1), oob= scales::squish) +
+    theme(axis.text.x = element_text(angle=70, hjust=1))
+
+  return(cor_heatmap)
+}
+
+#' Scatterplots of two replicates
+#'
+#' @param input_df Dataframe containing one row per cell line, replicate group, and replicate.
+#' @param cell_line_cols Vector of column names that identify a unique cell line or barcode.
+#' @param replicate_group_cols Vector of column names that identify a group of replicates.
+#' @param replicate_col String name of the column that distinguishes replicates within a group.
+#' @param values_col String name of the column containing the values to plot.
+#' @param x_axis_rep Value of replicate_col plotted on the x axis. Defaults to "1".
+#' @param y_axis_rep Value of replicate_col plotted on the y axis. Defaults to "2".
+make_replicate_scatterplots= function(input_df, cell_line_cols, replicate_group_cols, replicate_col, values_col,
+                                      x_axis_rep= '1', y_axis_rep= '2') {
+  reps_piv= input_df %>%
+    tidyr::unite(all_of(replicate_group_cols), col= 'replicate_group', sep= ':', remove= TRUE, na.rm= FALSE) %>%
+    dplyr::group_by(pick(all_of(c(cell_line_cols, 'replicate_group')))) %>%
+    dplyr::filter(n!= 0, dplyr::n() >= 2, !is.na(.data[[replicate_col]]), .data[[replicate_col]] != '') %>%
+    dplyr::ungroup()
+
+  # Return a NULL object if no entries pass the filter.
+  if(nrow(reps_piv) == 0) {return(NULL)}
+
+  reps_piv= reps_piv %>%
+    pivot_wider(id_cols= all_of(c(cell_line_cols, 'replicate_group')),
+                names_from= replicate_col, names_prefix= replicate_col, values_from= values_col) %>%
+    dplyr::mutate(type= ifelse(!is.na(CCLE_name), "cell line", "control barcode")) %>% dplyr::ungroup()
+
+  # Create names of the columns to plot on xy axes
+  x_col_name= paste0(replicate_col, x_axis_rep)
+  y_col_name= paste0(replicate_col, y_axis_rep)
+
+  reps_scatter= reps_piv %>% dplyr::filter(!is.na(.data[[x_col_name]]), !is.na(.data[[y_col_name]])) %>%
+    ggplot(aes(x= .data[[x_col_name]], y= .data[[y_col_name]])) +
+    geom_point(aes(color= type), alpha=0.75) +
+    geom_smooth(method='lm', se=F, color='black', linewidth=0.5, linetype=2) +
+    ggpmisc::stat_correlation(mapping = use_label(c("R2", "n")))+
+    facet_wrap(~replicate_group, scales= 'free') +
+    labs(x= paste0(replicate_col, x_axis_rep, ' ', values_col), y= paste0(replicate_col, y_axis_rep, ' ', values_col)) +
+    theme_bw()
+
+  return(reps_scatter)
+}
+
 #' QC_images
 #'
 #' Takes in the metadata, raw counts, annotated counts, and normalized counts to generate some QC images.
@@ -53,7 +300,6 @@
 #' @param reverse_index2 reverse index 2 if newer sequencers are used.
 #' @return - NA, QC images are written out to the specified folder
 #' @export
-
 QC_images = function(raw_counts, annotated_counts, normalized_counts= NA,
                      sample_meta, CB_meta, cell_set_meta,
                      id_cols, sig_cols, count_col_name= 'normalized_n',
                      control_type, count_threshold= 40,
@@ -98,6 +344,13 @@ QC_images = function(raw_counts, annotated_counts, normalized_counts= NA,
   #
   # Sequencing QCs ____________________ ----
+  ## Purity metrics ----
+  # Write out the index purity and cell line purity.
+  print('Generating QC table')
+  create_qc_table(raw_counts_uncollapsed, raw_counts, filtered_counts,
+                  counts_col= 'n', file_path= paste0(out, '/QC_table.csv'))
+
   ## Index count summaries ----
   print("Generating index counts tables")
   # Check that "IndexBarcode1" and "index_1" columns are present.
@@ -121,46 +374,21 @@ QC_images = function(raw_counts, annotated_counts, normalized_counts= NA,
   ## Total counts ----
   print("generating total_counts image")
-  total_counts= annotated_counts %>% dplyr::filter(expected_read) %>%
-    mutate(barcode_type = ifelse(!is.na(CCLE_name), "cell line", "control barcode"),
-           sample_id= paste(pcr_plate, pcr_well, sep='_')) %>%
-    group_by(pick(all_of(c('pcr_plate', 'pcr_well', 'sample_id', id_cols, 'barcode_type')))) %>%
-    dplyr::summarise(total_counts = sum(n))
-
-  tc= total_counts %>% ggplot() +
-    geom_col(aes(x=sample_id, y=total_counts, fill=barcode_type), alpha=0.75, position='identity') +
-    geom_hline(yintercept= 10^4, linetype=2) +
-    facet_wrap(~pcr_plate, scale= 'free_x') +
-    labs(x="PCR location", y="total counts", fill="", title= 'Raw counts - unstacked') + theme_bw() +
-    theme(axis.text.x = element_text(angle=70, hjust=1, size=5))
+
+  tc= plot_total_counts(filtered_counts, id_cols, facet_col= 'pcr_plate')
 
   pdf(file=paste(out, "total_counts.pdf", sep="/"),
       width=sqrt(num_profiles)*2, height=sqrt(num_profiles))
   print(tc)
   dev.off()
-  rm(total_counts, tc)
+  rm(tc)
 
   # Assay QCs _________________________ ----
   ## Cell lines recovered ----
   print("generating cell_lines_present image")
-  recovery= annotated_counts %>% dplyr::filter(expected_read, !is.na(CCLE_name)) %>%
-    dplyr::select(!any_of(c('members'))) %>%
-    dplyr::left_join(cell_set_meta, by="cell_set", relationship= 'many-to-one') %>%
-    dplyr::mutate(members= if_else(is.na(members), cell_set, members), # for custom cell sets
-           expected_num_cl= as.character(members) %>% purrr::map(strsplit, ";") %>%
-             purrr::map(`[[`, 1) %>% purrr::map(length) %>% as.numeric(),
-           count_type= ifelse(n > count_threshold, 'Detected', 'Low'),
-           count_type= ifelse(n==0, 'Missing', count_type)) %>%
-    dplyr::count(pick(all_of(c('pcr_plate', 'pcr_well', id_cols, 'count_type', 'expected_num_cl'))), name= 'count') %>%
-    dplyr::mutate(frac_type= count/expected_num_cl)
-
-  cl_rec= recovery %>% tidyr::unite(all_of(id_cols), col= 'profile_id', sep= ':', remove=FALSE) %>%
-    ggplot() +
-    geom_col(aes(x=profile_id, y=frac_type*100, fill= reorder(count_type, dplyr::desc(count_type)))) +
-    facet_wrap(~pcr_plate, scales= 'free_x') +
-    labs(x="", y="Percentage of expected cell lines", fill= '') +
-    theme_bw() +
-    theme(axis.text.x = element_text(angle=70, hjust=1, size=5))
+
+  cl_rec= plot_cl_recovery(filtered_counts, id_cols= id_cols, facet_col= 'pcr_plate',
+                           counts_threshold= count_threshold, plot_type= 'percent')
 
   pdf(file=paste(out, "cell_lines_present.pdf", sep="/"),
       width=sqrt(num_profiles)*2, height=sqrt(num_profiles))
@@ -291,32 +519,7 @@ QC_images = function(raw_counts, annotated_counts, normalized_counts= NA,
 
   if(contains_cbs & is.data.frame(normalized_counts)) {
     print("generating control_barcode_trend image")
-    # calculate r2 and mae if
columns do not exist
-    if (!'norm_r2' %in% colnames(normalized_counts) | !'norm_mae' %in% colnames(normalized_counts)) {
-      cb_trend= normalized_counts %>%
-        dplyr::filter(control_barcodes %in% c("Y", "T", T),
-                      !(trt_type %in% c("empty", "", "CB_only")) & !is.na(trt_type),
-                      !is.na(Name)) %>%
-        tidyr::unite(all_of(id_cols), col= 'profile_id', sep= ':', remove= TRUE) %>%
-        dplyr::group_by(profile_id) %>%
-        dplyr::mutate(mean_y= mean(log2_dose),
-                      residual2= (log2_dose - log2_normalized_n)^2,
-                      squares2= (log2_dose - mean_y)^2,
-                      norm_r2= 1 - sum(residual2)/sum(squares2),
-                      norm_mae= median(abs(log2_dose- log2_normalized_n))) %>% ungroup()
-    } else {
-      cb_trend= normalized_counts %>% dplyr::filter(!is.na(Name)) %>%
-        tidyr::unite(all_of(id_cols), col= 'profile_id', sep= ':', remove= TRUE)
-    }
-
-    trend_sc= cb_trend %>% dplyr::mutate(profile_id= reorder(profile_id, dplyr::desc(norm_mae))) %>%
-      ggplot(aes(x=log2_n, y=log2_dose)) + geom_point() +
-      geom_abline(aes(slope=1, intercept= cb_intercept) , color='blue') +
-      geom_text(aes(x= min(log2_n), y= dplyr::desc(sort(unique(log2_dose)))[1],
-                    label= paste('r2=', round(norm_r2, 4), '\nmae=', round(norm_mae, 4), sep='')),
-                hjust='inward', vjust='inward') +
-      facet_wrap(~profile_id, scales= 'free_x') +
-      labs(x= 'log2(n)', y= 'log2(dose)') + theme_bw()
+    trend_sc= plot_ctrl_bc_trend(normalized_counts, id_cols, counts_col= 'log2_n')
 
     pdf(file=paste(out, "control_barcode_trend.pdf", sep="/"),
         width=sqrt(num_profiles)*2, height=sqrt(num_profiles)*2)
@@ -327,21 +530,15 @@ QC_images = function(raw_counts, annotated_counts, normalized_counts= NA,
 
   ## Sample correlation -----
   print("generating sample_cor image")
-  correlation_matrix= annotated_counts %>%
-    dplyr::filter(expected_read, !is.na(CCLE_name),
-                  (!trt_type %in% c("empty", "", "CB_only")) & !is.na(trt_type)) %>%
-    tidyr::unite(all_of(id_cols), col= 'profile_id', sep= ':', remove= TRUE) %>%
-    dplyr::mutate(log2_n = log2(n +1)) %>%
-    reshape2::dcast(CCLE_name~profile_id, value.var="log2_n") %>%
-    column_to_rownames("CCLE_name") %>%
-    cor(use="pairwise.complete.obs")
-
-  cp= correlation_matrix %>% reshape2::melt() %>%
-    ggplot() + geom_tile(aes(x=Var1, y=Var2, fill=value)) +
-    labs(x="", y="", fill="correlation") +
-    scale_fill_gradient(low="yellow", high="red") +
-    theme(axis.text.x = element_text(angle=70, hjust=1, size=5),
-          axis.text.y = element_text(size=5))
+
+  cor_df= filtered_counts %>%
+    dplyr::filter(!is.na(DepMap_ID), !is.na(trt_type), !trt_type %in% c("empty", "", "CB_only")) %>%
+    dplyr::mutate(log2_n= log2(n + 1))
+  cp= plot_cor_heatmap(input_df= cor_df,
+                       row_id_cols= c('DepMap_ID'),
+                       col_id_cols= c(sig_cols, id_cols),
+                       counts_col= 'log2_n')
+
   pdf(file=paste(out, "sample_cor.pdf", sep="/"),
       width=sqrt(num_profiles)*2, height=sqrt(num_profiles)*2)
   print(cp)
   dev.off()
   rm(cor_df, cp)
 
   ## Tech rep correlations ----
-  # assumes that tech reps are the last component of profile_id
-  if('tech_rep' %in% colnames(normalized_counts)) {
-    if(max(unique(normalized_counts$tech_rep), na.rm= TRUE) == 2) {
-      print("generating tech rep correlations image")
-
-      static_cols= c('project_code', 'CCLE_name', 'DepMap_ID', 'Name', 'cell_set')
-      tech_reps_piv= normalized_counts %>% dplyr::mutate(bio_rep_id= str_replace(profile_id, ':\\d+$', '')) %>%
-        dplyr::group_by_at(c(static_cols, 'bio_rep_id')) %>% dplyr::filter(n!=0, n()==2) %>% dplyr::ungroup() %>%
-        pivot_wider(id_cols= all_of(c(static_cols,
'bio_rep_id')), - names_from= tech_rep, names_prefix= 'tech_rep', values_from= log2_n) %>% - dplyr::group_by(bio_rep_id) %>% - dplyr::mutate(r2= cor(tech_rep1, tech_rep2, use='p')^2, - type= ifelse(!is.na(CCLE_name), "cell line", "control barcode")) %>% dplyr::ungroup() + if(is.data.frame(normalized_counts)) { + if('tech_rep' %in% colnames(normalized_counts)) { + # Set up replicate groups depending "bio_rep" column + if('bio_rep' %in% colnames(normalized_counts) & !'bio_rep' %in% sig_cols) { + replicate_group_cols= c(sig_cols, 'bio_rep') + } else { + replicate_group_cols= sig_cols + } - tech_reps_plt= tech_reps_piv %>% dplyr::mutate(bio_rep_id= reorder(bio_rep_id, r2)) %>% - ggplot(aes(x= tech_rep1, y= tech_rep2)) + - geom_point(aes(color= type), alpha=0.75) + - geom_smooth(method='lm', se=F, color='black', linewidth=0.5, linetype=2) + - stat_correlation(mapping = use_label(c("R2", "n")))+ - facet_wrap(~bio_rep_id, scales= 'free') + - labs(x="tech rep 1 log2(n)", y="tech rep 2 log2(n)") + theme_bw() + # Handle cases if control barcodes are used. + if('Name' %in% colnames(normalized_counts)) { + unique_cell_line_cols= c(cell_line_cols, 'Name') + } else { + unique_cell_line_cols= cell_line_cols + } - pdf(file=paste(out, "tech_reps_plt.pdf", sep="/"), - width=sqrt(num_profiles), height=sqrt(num_profiles)) - print(tech_reps_plt) - dev.off() + # Create replicate scatter plot + print("generating tech rep correlations image") + tech_reps_plt= make_replicate_scatterplots(input_df= normalized_counts, + cell_line_cols= unique_cell_line_cols, + replicate_group_cols= replicate_group_cols, + replicate_col= 'tech_rep', + values_col= 'log2_n') + + if(!is.null(tech_reps_plt)) { + pdf(file=paste(out, "tech_reps_plt.pdf", sep="/"), + width=sqrt(num_profiles), height=sqrt(num_profiles)) + print(tech_reps_plt) + dev.off() + } else { + print('No technical replicates detected - skipping plot.') + } } } - ## Bio rep correlations ---- if('bio_rep' %in% colnames(normalized_counts)) { num_bio_reps= normalized_counts %>% From 102a06b3b3e57d0f35832fcbb2bba94e3157c6ca Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Mon, 19 Aug 2024 08:49:49 -0400 Subject: [PATCH 002/127] Mapped reads to id cols --- scripts/src/collate_fastq_reads.R | 83 +++++++++++++++++++++++++------ 1 file changed, 68 insertions(+), 15 deletions(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index e6b75298..2ea5fd75 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -59,8 +59,8 @@ validate_unique_samples= function(selected_columns, df) { #' @param detected_flowcells A dataframe with the columns "flowcell_name" and "flowcell_lane". #' @param expected_flowcells A dataframe with the columns "flowcell_name" and "flowcell_lane". 
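#' @examples
#' # A hedged, hypothetical usage sketch - both inputs hold distinct
#' # flowcell_name/flowcell_lane rows:
#' # validate_detected_flowcells(detected_flowcells= data.frame(flowcell_name= 'FC1', flowcell_lane= 1),
#' #                             expected_flowcells= data.frame(flowcell_name= 'FC1', flowcell_lane= 1))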
validate_detected_flowcells= function(detected_flowcells, expected_flowcells) {
-  missing_flowcells= expected_flowcells %>%
-    dplyr::anti_join(detected_flowcells, by= c('flowcell_name', 'flowcell_lane'))
+  missing_flowcells= expected_flowcells %>% dplyr::anti_join(detected_flowcells, by= c('flowcell_name', 'flowcell_lane'))
+
   if(nrow(missing_flowcells) != 0) {
     print('The following flowcells/lanes specified in the sample meta were not detected in the fastq reads.')
     print(missing_flowcells)
@@ -84,19 +84,42 @@ validate_detected_flowcells= function(detected_flowcells, expected_flowcells)
 #' This defaults onto the following columns: "index_1", "index_2"
 #' @returns Returns a dataframe with columns specified by the sequencing_index_cols, "forward_read_cl_barcode", and "n".
 #' @import tidyverse
-collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, sequencing_index_cols= c('index_1', 'index_2')) {
+collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta,
+                              sequencing_index_cols= c('index_1', 'index_2'),
+                              id_cols= c('pcr_plate', 'pcr_well'),
+                              reverse_index2= FALSE,
+                              barcode_col= 'forward_read_cl_barcode') {
   require(tidyverse)
 
+  # Reverse index 2 if specified ----
+  if(reverse_index2) {
+    if('index_2' %in% colnames(sample_meta)) {
+      print("Reverse-complementing index 2 barcode ...")
+      sample_meta$index_2= chartr("ATGC", "TACG", stringi::stri_reverse(sample_meta$index_2))
+    } else {
+      stop('Reverse index 2 is set to TRUE, but index_2 does not exist.')
+    }
+  }
+
   # Validation: Check that flowcell_names and flowcell_lanes exist in the sample meta ----
   if(!validate_columns_exist(c('flowcell_names', 'flowcell_lanes'), sample_meta)) {
-    stop('flowcell_names and/or flowcell_lanes is NOT present in the sample meta.')
+    stop('flowcell_names and/or flowcell_lanes are NOT present in the sample meta.')
   }
 
   # Validation: Check that sequencing_index_cols exist in the sample meta ----
   if(!validate_columns_exist(sequencing_index_cols, sample_meta)) {
+    print('The following sequencing_index_cols are not present in the sample meta.')
+    print(sequencing_index_cols[!sequencing_index_cols %in% colnames(sample_meta)])
     stop('One or more sequencing_index_cols is NOT present in the sample meta.')
   }
 
+  # Validation: Check that id_cols exist in the sample meta ----
+  if(!validate_columns_exist(id_cols, sample_meta)) {
+    print('The following id_cols are not present in the sample meta.')
+    print(id_cols[!id_cols %in% colnames(sample_meta)])
+    stop('One or more id_cols is NOT present in the sample meta.')
+  }
+
   # Validation: Check that sequencing_index_cols in the sample meta are filled out ----
   # Check for rows in sequencing_index_cols that equate to empty - NA, "NA", "", " "
   # Error out if the sequencing_index_cols are not filled out in the sample meta.
     stop('One or more sequencing_index_cols in the sample meta is not filled out.')
   }
 
-  # Validation: Check that sequencing_index_cols uniquely identify rows of sample meta ----
-  if(!validate_unique_samples(sequencing_index_cols, sample_meta)) {
-    print('There may be multiple entries in the sample meta that have the same combination of sequencing index columns.')
-    stop('The specified sequencing index columns do NOT uniquely identify every PCR well.')
-  }
-
   # Determine which flowcell names + lanes are expected ----
   # "flowcell_names" and "flowcell_lanes" are strings that can contain more than one item.
 # Columns can be parsed by splitting on the chars , ; :
@@ -139,13 +156,49 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, sequencing_in
   print(detected_flowcells)
   validate_detected_flowcells(detected_flowcells, expected_flowcells)
 
-  # Create raw counts by summing over appropriate sequencing_index_cols ----
-  # Use an inner join to collect reads with valid flowcell name/lane combinations,
-  # then sum reads across sequencing columns
+  # Validation: Check that sequencing_index_cols uniquely identify rows of sample meta ----
+  if(!validate_unique_samples(sequencing_index_cols, sample_meta)) {
+    print('There may be multiple entries in the sample meta that have the same combination of sequencing index columns.')
+    stop('The specified sequencing index columns do NOT uniquely identify every PCR well.')
+  }
+
+  # Validation: Check that id_cols uniquely identify rows of sample meta ----
+  if(!validate_unique_samples(id_cols, sample_meta)) {
+    print('There may be multiple entries in the sample meta that have the same combination of ID columns.')
+    stop('The specified ID columns do NOT uniquely identify every PCR well.')
+  }
+
+  # Create sequence map ----
+  sequencing_map= sample_meta %>% dplyr::distinct(pick(all_of(c(sequencing_index_cols, id_cols))))
+
+  # Validation: Check that mapping is one to one ----
+  check_mapping= sequencing_map %>% dplyr::group_by(pick(all_of(sequencing_index_cols))) %>%
+    dplyr::filter(dplyr::n() > 1) %>% dplyr::ungroup()
+  if(nrow(check_mapping) > 0) {
+    print('The following sequencing locations map to multiple conditions.')
+    print(check_mapping)
+    stop('The sequencing index columns do not map 1 to 1 to the ID columns.')
+  }
+
+  # Create raw counts file ----
+  # Filter for the expected flowcells and sum up the reads over the ID cols.
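+  # In the pipeline below, semi_join only keeps reads whose flowcell name/lane pair is
+  # expected (without duplicating rows), and the inner_join then attaches the id_cols,
+  # e.g. a read keyed by (index_1, index_2) gains its (pcr_plate, pcr_well) from sequencing_map.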
+ print('Summing up reads ...') raw_counts= uncollapsed_raw_counts %>% - dplyr::inner_join(expected_flowcells, by= c('flowcell_name', 'flowcell_lane'), relationship= 'many-to-one') %>% - dplyr::group_by(pick(all_of(c(sequencing_index_cols, 'forward_read_cl_barcode')))) %>% + dplyr::semi_join(expected_flowcells, by= c('flowcell_name', 'flowcell_lane')) %>% + dplyr::inner_join(sequencing_map, by= intersect(colnames(.), colnames(sequencing_map)), relationship= 'many-to-one') %>% + dplyr::group_by(pick(all_of(c(id_cols, barcode_col)))) %>% dplyr::summarize(n= sum(n)) %>% dplyr::ungroup() + # Calculate index purity ---- + index_purity= sum(raw_counts$n) / sum(uncollapsed_raw_counts$n) + print(paste0('Index purity: ', round(index_purity, 4))) + if(index_purity > 1) { + stop('ERROR: Index purity is greater than 1!') + } + if(index_purity < 0.5) { + print('Warning: Low index purity!') + } + + print('Done!') return(raw_counts) } From df5e36ac45dd6b051370583e72d5d3d2003300b0 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Mon, 19 Aug 2024 09:04:17 -0400 Subject: [PATCH 003/127] Removed sequencing_index_cols Raw counts is now assumed to contain the columns specified in id_cols --- scripts/src/filter_raw_reads.R | 71 +++++++++++++++------------------- 1 file changed, 32 insertions(+), 39 deletions(-) diff --git a/scripts/src/filter_raw_reads.R b/scripts/src/filter_raw_reads.R index ad4b7e22..8e663bc0 100755 --- a/scripts/src/filter_raw_reads.R +++ b/scripts/src/filter_raw_reads.R @@ -26,6 +26,13 @@ validate_columns_exist= function(selected_columns, df) { validate_unique_samples= function(selected_columns, df) { unique_column_values= df %>% dplyr::distinct(pick(all_of(selected_columns))) if(nrow(unique_column_values) != nrow(df)) { + print('The selected columns do not uniquely identify all rows.') + + dups= df %>% dplyr::group_by(pick(all_of(selected_columns))) %>% + dplyr::filter(dplyr::n() > 1) %>% dplyr::ungroup() %>% + dplyr::arrange(pick(all_of(selected_columns))) + print(dups) + return(FALSE) } else { return(TRUE) @@ -93,7 +100,7 @@ validate_cell_set_luas= function(sample_meta, cell_set_meta) { #' @export filter_raw_reads = function(raw_counts, sample_meta, cell_line_meta, cell_set_meta, CB_meta, - sequencing_index_cols= c('index_1', 'index_2'), + id_cols= c('pcr_plate', 'pcr_well'), reverse_index2= FALSE, count_threshold= 40) { require(tidyverse) @@ -101,10 +108,12 @@ filter_raw_reads = function(raw_counts, # Processing metadata and inputs ---- # CB meta is in log10 and should be converted to log2. 
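  # As a quick check of the conversion used below: log2(x) = log10(x) / log10(2),
  # so a log_dose of 1 (i.e. 10 units) becomes a log2_dose of about 3.32.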
- print("Converting CB_meta from log10 to log2 ...") - CB_meta= CB_meta %>% dplyr::mutate(log2_dose= log_dose/log10(2)) %>% dplyr::select(-log_dose) + if('log_dose' %in% colnames(CB_meta)) { + print("Converting CB_meta from log10 to log2 ...") + CB_meta= CB_meta %>% dplyr::mutate(log2_dose= log_dose/log10(2)) %>% dplyr::select(-log_dose) + } - if (reverse_index2) { + if(reverse_index2) { if ('index_2' %in% colnames(sample_meta)) { print("Reverse-complementing index 2 barcode ...") sample_meta$index_2 <- chartr("ATGC", "TACG", stringi::stri_reverse(sample_meta$index_2)) @@ -113,15 +122,15 @@ filter_raw_reads = function(raw_counts, } } - # Validation: Check that sequencing_index_cols exist in the sample meta ---- - if(!validate_columns_exist(sequencing_index_cols, sample_meta)) { - stop('One or more sequencing_index_cols is NOT present in the sample meta.') + # Validation: Check that id_cols exist in the sample meta ---- + if(!validate_columns_exist(id_cols, sample_meta)) { + stop('One or more id_cols is NOT present in the sample meta.') } - # Validation: Check that sequencing_index_cols uniquely identify every rows of sample meta ---- - if(!validate_unique_samples(sequencing_index_cols, sample_meta)) { - print('There may be multiple entries in the sample meta that have the same combination of sequencing index columns.') - stop('The specified sequencing index columns do NOT uniquely identify every PCR well.') + # Validation: Check that id_cols uniquely identify every rows of sample meta ---- + if(!validate_unique_samples(id_cols, sample_meta)) { + print('There may be multiple entries in the sample meta that have the same combination of id_cols.') + stop('The specified ID columns do NOT uniquely identify every PCR well.') } # Validation: Check that cell sets do not contain duplicate LUAs ---- @@ -129,24 +138,6 @@ filter_raw_reads = function(raw_counts, # This currently does NOT result in an error. Error avoided using a distinct later in line 162 validate_cell_set_luas(sample_meta, cell_set_meta) - # Filtering by sequencing columns ---- - # Filter raw counts using the sequencing columns. - # Also create "mapped" column to identify reads that mapped to all known PRISM sequences. - print("Filtering by sequencing columns ...") - unique_sequencing_index_vals= sample_meta %>% dplyr::distinct(pick(all_of(sequencing_index_cols))) - index_filtered= raw_counts %>% dplyr::semi_join(unique_sequencing_index_vals, by= sequencing_index_cols) %>% - dplyr::mutate(mapped= forward_read_cl_barcode %in% c(cell_line_meta$Sequence, CB_meta$Sequence)) - - # Calculate index purity for QC table. - index_purity= sum(index_filtered$n)/ sum(raw_counts$n) - - # Split off unmapped reads ---- - # Unmapped reads are defined as having valid indices but do not map to barcodes in PRISM. - # Also sorted reads in descending order by read count. - print('Splitting off unmapped reads ...') - unmapped_reads= index_filtered %>% dplyr::filter(mapped==F) %>% dplyr::select(-mapped) %>% - dplyr::arrange(dplyr::desc(n)) - # Creating a template of all expected reads in the run ---- # Use all 4 meta data files to create a "template" dataframe where # every row is a cell line that is expected in a PCR well. @@ -174,12 +165,12 @@ filter_raw_reads = function(raw_counts, # Reads that to not match to the template are contaminants and, # reads that are only present in the template are missing/not detected by PCR. 
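  # By construction of the full_join below: rows found in both the counts and the
  # template are expected reads, count-only rows get expected_read= NA (unexpected),
  # and template-only rows get n= NA (treated as missing downstream).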
print("Annotating reads ...") - annotated_counts= index_filtered %>% dplyr::filter(mapped) %>% + annotated_counts= raw_counts %>% dplyr::left_join(cell_line_meta, by= join_by('forward_read_cl_barcode'=='Sequence'), relationship= 'many-to-one') %>% dplyr::left_join(CB_meta, by= join_by('forward_read_cl_barcode'=='Sequence'), relationship= 'many-to-one') %>% - dplyr::left_join(sample_meta, by= sequencing_index_cols, relationship= 'many-to-one') %>% + dplyr::left_join(sample_meta, by= id_cols, relationship= 'many-to-one') %>% dplyr::full_join(template %>% dplyr::mutate(expected_read= T), by= c('forward_read_cl_barcode'='Sequence', intersect(colnames(template), colnames(.))), relationship= 'one-to-one') %>% @@ -197,15 +188,17 @@ filter_raw_reads = function(raw_counts, dplyr::mutate(flag= ifelse(n==0, 'Missing', NA), flag= ifelse(n!=0 & n < count_threshold, 'low counts', flag)) - # Calculate cell line purity for the QC table. - cell_line_purity= sum(filtered_counts$n)/ sum(index_filtered$n) - - # Generating QC table ---- - print('Generating QC table ...') - qc_table= data.frame(index_purity= index_purity, cell_line_purity= cell_line_purity) + # Calculate cell line purity ---- + cell_line_purity= sum(filtered_counts$n)/ sum(raw_counts$n) + print(paste0('Cell line purity: ', round(cell_line_purity, 4))) + if(cell_line_purity > 1) { + stop('ERROR: Cell line purity is greater than 1!') + } + if(cell_line_purity < 0.5) { + print('Warning: Low cell line purity!') + } - return(list(unmapped_reads= unmapped_reads, annotated_counts= annotated_counts, - filtered_counts= filtered_counts, qc_table= qc_table)) + return(list(annotated_counts= annotated_counts, filtered_counts= filtered_counts)) } # checks is a string can be numeric From 2ba40b10c551e569facd63d956147eb8862ff9f6 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Mon, 19 Aug 2024 09:04:36 -0400 Subject: [PATCH 004/127] Updated script to reflect function changes --- scripts/collate_fastq_reads.R | 39 ++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/scripts/collate_fastq_reads.R b/scripts/collate_fastq_reads.R index d6cf1bd5..29f4d994 100644 --- a/scripts/collate_fastq_reads.R +++ b/scripts/collate_fastq_reads.R @@ -13,6 +13,12 @@ parser$add_argument("-c", "--uncollapsed_raw_counts", default="raw_counts_uncoll parser$add_argument("--sample_meta", default="sample_meta.csv", help = "Sample metadata") parser$add_argument("--sequencing_index_cols", default= "index_1,index_2", help = "Sequencing columns in the sample meta") +parser$add_argument("--id_cols", default= "pcr_plate,pcr_well", + help = "Columns that identify a unique PCR well") +parser$add_argument("--reverse_index2", action="store_true", default=FALSE, + help= "Reverse complement of index 2 for NovaSeq and NextSeq") +parser$add_argument("--barcode_col", default= "forward_read_cl_barcode", + help= "Name of the column in uncollapsed_raw_counts that contains the barcode sequences.") parser$add_argument("-o", "--out", default=getwd(), help = "Output path. 
Default is working directory") # get command line options, if help option encountered print help and exit @@ -27,27 +33,40 @@ if (args$out == "") { expected_file_path <- paste(args$out, "raw_counts_uncollapsed.csv", sep='/') if(file.exists(expected_file_path)) { - sample_meta= data.table::fread(args$sample_meta, header= T, sep= ',', data.table= F) + # Read in files and parse vector arguments uncollapsed_raw_counts= data.table::fread(expected_file_path, header= T, sep= ',', data.table= F) + sample_meta= data.table::fread(args$sample_meta, header= T, sep= ',', data.table= F) + sequencing_index_cols= unlist(strsplit(args$sequencing_index_cols, ",")) + id_cols= unlist(strsplit(args$id_cols, ",")) + + # Validation: Check that sequencing_index_cols are from sample meta column names + if(!all(sequencing_index_cols %in% colnames(sample_meta))) { + stop(paste('The following sequencing_index_cols were not found in the sample meta: ', + sequencing_index_cols[!sequencing_index_cols %in% colnames(sample_meta)])) + } - # Validation: Check if sequencing_index_cols is composed of sample meta column names - if (!all(sequencing_index_cols %in% colnames(sample_meta))) { - stop(paste("Colnames not found in sample_meta, check metadata or --sequencing_index_cols argument:", - args$sequencing_index_cols)) + # Validation: Check that id_cols are from sample meta column names + if(!all(id_cols %in% colnames(sample_meta))) { + stop(paste('The following id_cols were not found in the sample meta: ', + id_cols[!id_cols %in% colnames(sample_meta)])) } - print("Collating fastq reads") - raw_counts= collate_fastq_reads(uncollapsed_raw_counts, sample_meta, sequencing_index_cols) + print("Collating fastq reads ...") + raw_counts= collate_fastq_reads(uncollapsed_raw_counts, sample_meta, + sequencing_index_cols, + id_cols, + reverse_index2= args$reverse_index2, + barcode_col= args$barcode_col) # Validation: Basic file size check if(nrow(raw_counts) == 0) { stop('ERROR: Empty file generated. No rows in raw_counts output.') } - rc_out_file = paste(args$out, 'raw_counts.csv', sep='/') - print(paste("Writing to file: ", rc_out_file)) - write.csv(raw_counts, rc_out_file, row.names=F, quote=T) + rc_out_file= paste(args$out, 'raw_counts.csv', sep='/') + print(paste("Writing raw_counts.csv to ", rc_out_file)) + write.csv(raw_counts, rc_out_file, row.names= FALSE, quote= FALSE) } else { print("Uncollapsed raw counts file not detected. Proceeding with generating filtered counts file.") } From 66d58f511a99eb05e8f9eb8c3e51dd0a88b23a39 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Mon, 19 Aug 2024 14:52:20 -0400 Subject: [PATCH 005/127] Wrapped qcs in trycatches --- scripts/src/QC_images.R | 600 +++++++++++++++++++++++----------------- 1 file changed, 353 insertions(+), 247 deletions(-) diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R index 29ef4d87..d5bc5994 100755 --- a/scripts/src/QC_images.R +++ b/scripts/src/QC_images.R @@ -6,9 +6,9 @@ #' @param selected_columns A vector of strings each representing a column name #' @param df A dataframe to check against #' @return Boolean -validate_columns_exist= function(selected_columns, df) { - # Check that all of selected_columns are in df - unmatched_cols= setdiff(selected_columns, colnames(df)) +validate_columns_exist= function(selected_cols, df) { + # Check that all of selected_columns are in df - base::setdiff(A, B) = A[!A %in% B]. 
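+  # For example, base::setdiff(c('index_1', 'index_2'), colnames(df)) returns 'index_2'
+  # when df only carries index_1, so unmatched_cols lists exactly the missing columns.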
+ unmatched_cols= base::setdiff(selected_cols, colnames(df)) if(length(unmatched_cols) > 0) { print('The following columns are missing: ') @@ -55,13 +55,13 @@ get_index_summary= function(df, index_col, valid_indices) { #' @param raw_counts_uncollapsed Dataframe output from nori. #' @param raw_counts Raw counts dataframe outputed from collate_fastq_reads. #' @param filtered_counts Filtered counts dataframe outputed from filter_raw_reads. -#' @param counts_col String name of the counts column in all three dataframes. +#' @param value_col String name of the counts column in all three dataframes. #' @param file_path Location to write out the output. create_qc_table= function(raw_counts_uncollapsed, raw_counts, filtered_counts, - counts_col= 'n', file_path) { - index_purity= sum(raw_counts[[counts_col]]) / sum(raw_counts_uncollapsed[[counts_col]]) + value_col= 'n', file_path) { + index_purity= sum(raw_counts[[value_col]]) / sum(raw_counts_uncollapsed[[value_col]]) print(paste0('Index purity: ', round(index_purity, 4))) - cell_line_purity= sum(filtered_counts[[counts_col]]) / sum(raw_counts[[counts_col]]) + cell_line_purity= sum(filtered_counts[[value_col]]) / sum(raw_counts[[value_col]]) print(paste0('Cell line purity: ', round(cell_line_purity, 4))) qc_table= data.frame(index_purity= index_purity, cell_line_purity= cell_line_purity) @@ -78,7 +78,7 @@ create_qc_table= function(raw_counts_uncollapsed, raw_counts, filtered_counts, #' @param filtered_counts Filtered counts dataframe. #' @param id_cols Vector of columns names that identify each sample. #' @param facet_col String name of the column in filtered_counts to facet the plot. -plot_total_counts= function(filtered_counts, id_cols, facet_col= NA) { +create_total_counts_barplot= function(filtered_counts, id_cols, facet_col= NA) { total_counts= filtered_counts %>% dplyr::mutate(barcode_type= case_when(!is.na(CCLE_name) ~ 'cell line', !is.na(Name) ~ 'ctrl barcode')) %>% @@ -106,12 +106,12 @@ plot_total_counts= function(filtered_counts, id_cols, facet_col= NA) { #' @param filtered_counts Filtered counts dataframe. #' @param id_cols Vector of column names that identify each sample. #' @param facet_col String name of the column in filtered_counts to facet the plot. -#' @param counts_col String name of the column in filtered_counts that contains the counts. +#' @param value_col String name of the column in filtered_counts that contains the counts. #' @param counts_threshold Threshold used to determine low counts. #' @param plot_type description #' @param include_ctrl_bcs description -plot_cl_recovery= function(filtered_counts, id_cols, facet_col= NA, counts_col= 'n', counts_threshold, - plot_type= 'percent', include_ctrl_bcs= FALSE) { +create_recovery_barplot= function(filtered_counts, id_cols, facet_col= NA, value_col= 'n', count_threshold, + plot_type= 'percent', include_ctrl_bcs= FALSE) { # Filter out control barcodes if it is specified. if(include_ctrl_bcs == FALSE) { @@ -121,9 +121,9 @@ plot_cl_recovery= function(filtered_counts, id_cols, facet_col= NA, counts_col= # Count number of cell lines/ barcodes for a detection group. 
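  # In the pipeline below, add_count() first attaches the per-sample barcode total
  # (total_num_cls), count() then collapses to one row per sample and detect_type, so
  # percent= 100 * num_cls_by_type / total_num_cls sums to 100 within each sample.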
recovery= filtered_counts %>% dplyr::add_count(pick(all_of(id_cols)), name= 'total_num_cls') %>% - dplyr::mutate(detect_type= case_when(.data[[counts_col]] == 0 ~ 'Not detected', - .data[[counts_col]] <= counts_threshold ~ 'Low counts', - .data[[counts_col]] > counts_threshold ~ 'Detected')) %>% + dplyr::mutate(detect_type= case_when(.data[[value_col]] == 0 ~ 'Not detected', + .data[[value_col]] <= count_threshold ~ 'Low counts', + .data[[value_col]] > count_threshold ~ 'Detected')) %>% dplyr::count(pick(all_of(c(id_cols, facet_col, 'detect_type', 'total_num_cls'))), name= 'num_cls_by_type') %>% tidyr::unite(all_of(id_cols), col= 'sample_id', sep= ':', remove= FALSE, na.rm= FALSE) %>% dplyr::mutate(percent= (num_cls_by_type / total_num_cls) * 100) @@ -157,7 +157,7 @@ plot_cl_recovery= function(filtered_counts, id_cols, facet_col= NA, counts_col= #' text #' #' @param name description -plot_ctrl_bc_trend= function(normalized_counts, id_cols, counts_col= 'log2_n') { +create_ctrlBC_scatterplots= function(normalized_counts, id_cols, value_col= 'log2_n') { # Detect norm_r2 and norm_mae. # If columns do not exist, then roughly calculate those columns. if(any(!c('norm_r2', 'norm_mae') %in% colnames(normalized_counts))) { @@ -178,11 +178,11 @@ plot_ctrl_bc_trend= function(normalized_counts, id_cols, counts_col= 'log2_n') { # and determine the x and y positions for the r2 + mae label. cb_trend= normalized_counts %>% dplyr::filter(!is.na(Name), control_barcodes %in% c("Y", "T", T)) %>% tidyr::unite(all_of(id_cols), col= 'profile_id', sep= ':', remove= TRUE) %>% - dplyr::group_by(profile_id) %>% dplyr::mutate(label_x_pos= min(.data[[counts_col]]), + dplyr::group_by(profile_id) %>% dplyr::mutate(label_x_pos= min(.data[[value_col]]), label_y_pos= max(log2_dose)) %>% dplyr::ungroup() # Create control barcode trend plot - trend_scatter_plot= cb_trend %>% ggplot(aes(x= .data[[counts_col]], y= log2_dose)) + + trend_scatter_plot= cb_trend %>% ggplot(aes(x= .data[[value_col]], y= log2_dose)) + geom_point() + geom_abline(aes(slope=1, intercept= cb_intercept) , color='blue', alpha= 0.5) + geom_text(aes(x= label_x_pos, y= label_y_pos, @@ -200,11 +200,11 @@ plot_ctrl_bc_trend= function(normalized_counts, id_cols, counts_col= 'log2_n') { #' text #' #' @param input_df description -plot_cor_heatmap= function(input_df, row_id_cols, col_id_cols, counts_col, +create_cor_heatmap= function(input_df, row_id_cols, col_id_cols, value_col, cor_method= 'pearson') { # Validate that specified columns are in the dataframe. 
- if(!validate_columns_exist(c(row_id_cols, col_id_cols, counts_col), input_df)) { + if(!validate_columns_exist(c(row_id_cols, col_id_cols, value_col), input_df)) { stop('Not all columns were detected in the input dataframe.') } @@ -223,14 +223,14 @@ plot_cor_heatmap= function(input_df, row_id_cols, col_id_cols, counts_col, } # Pivot and calculate correlations - correlation_mx= correlation_mx %>% reshape2::acast(row_id~col_id, value.var= counts_col) %>% + correlation_mx= correlation_mx %>% reshape2::acast(row_id~col_id, value.var= value_col) %>% WGCNA::cor(use= 'pairwise.complete.obs', method= cor_method) # Create heatmap cor_heatmap= correlation_mx %>% reshape2::melt() %>% ggplot(aes(x= Var1, y= Var2, fill= value)) + geom_tile() + - labs(x= '', y= '', fill= '', title= paste0('Correlations using ', counts_col)) + + labs(x= '', y= '', fill= '', title= paste0('Correlations using ', value_col)) + scale_fill_gradientn(breaks= c(0, 0.5, 1), colours= c('blue', 'white','red'), limits=c(0, 1), oob= scales::squish) + @@ -242,12 +242,13 @@ plot_cor_heatmap= function(input_df, row_id_cols, col_id_cols, counts_col, #' Scatterplots of two replicates #' #' @param input_df description -make_replicate_scatterplots= function(input_df, cell_line_cols, replicate_group_cols, replicate_col, values_col, +create_replicate_scatterplots= function(input_df, cell_line_cols, replicate_group_cols, replicate_col, value_col, x_axis_rep= '1', y_axis_rep= '2') { reps_piv= input_df %>% tidyr::unite(all_of(replicate_group_cols), col= 'replicate_group', sep= ':', remove= TRUE, na.rm= FALSE) %>% dplyr::group_by(pick(all_of(c(cell_line_cols, 'replicate_group')))) %>% - dplyr::filter(n!= 0, dplyr::n() >= 2, !is.na(.data[[replicate_col]]), .data[[replicate_col]] != '') %>% + dplyr::filter(!is.na(.data[[replicate_col]]), .data[[replicate_col]] != '', .data[[value_col]] != 0, + dplyr::n() >= 2) %>% dplyr::ungroup() # Retun a null object if no entries pass the filter. @@ -255,7 +256,7 @@ make_replicate_scatterplots= function(input_df, cell_line_cols, replicate_group_ reps_piv= reps_piv %>% pivot_wider(id_cols= all_of(c(cell_line_cols, 'replicate_group')), - names_from= replicate_col, names_prefix= replicate_col, values_from= values_col) %>% + names_from= replicate_col, names_prefix= replicate_col, values_from= value_col) %>% dplyr::mutate(type= ifelse(!is.na(CCLE_name), "cell line", "control barcode")) %>% dplyr::ungroup() # Create names of the columns to plot on xy axes @@ -264,11 +265,12 @@ make_replicate_scatterplots= function(input_df, cell_line_cols, replicate_group_ reps_scatter= reps_piv %>% dplyr::filter(!is.na(.data[[x_col_name]]), !is.na(.data[[y_col_name]])) %>% ggplot(aes(x= .data[[x_col_name]], y= .data[[y_col_name]])) + + geom_abline(color='black', linewidth=0.5, linetype=2) + geom_point(aes(color= type), alpha=0.75) + - geom_smooth(method='lm', se=F, color='black', linewidth=0.5, linetype=2) + ggpmisc::stat_correlation(mapping = use_label(c("R2", "n")))+ facet_wrap(~replicate_group, scales= 'free') + - labs(x= paste0(replicate_col, '1 ', values_col), y= paste0(replicate_col, '2 ', values_col)) + + labs(x= paste0(replicate_col, '1 ', value_col), y= paste0(replicate_col, '2 ', value_col), + title= paste0('Scatter plot of ', replicate_col, ' with ', value_col)) + theme_bw() return(reps_scatter) @@ -300,7 +302,8 @@ make_replicate_scatterplots= function(input_df, cell_line_cols, replicate_group_ #' @param reverse_index2 reverse index 2 if newer sequencers are used. 
#' @return - NA, QC images are written out to the specified folder #' @export -QC_images = function(raw_counts, annotated_counts, normalized_counts= NA, +QC_images = function(raw_counts_uncollapsed, raw_counts, + filtered_counts, normalized_counts= NA, l2fc, sample_meta, CB_meta, cell_set_meta, id_cols, sig_cols, count_col_name= 'normalized_n', control_type, count_threshold= 40, @@ -315,6 +318,7 @@ QC_images = function(raw_counts, annotated_counts, normalized_counts= NA, } # Some preprocessing ---- + skipped_qcs= c() # empty vector to collect potential errors num_profiles = annotated_counts %>% dplyr::distinct(pick(all_of(id_cols))) %>% nrow() # Reverse index 2 barcodes @@ -346,208 +350,286 @@ QC_images = function(raw_counts, annotated_counts, normalized_counts= NA, # Sequencing QCs ____________________ ---- ## Purity metrics ---- # call this function - print('Generating QC table') + print('1. Generating QC table ...') create_qc_table(raw_counts_uncollapsed, raw_counts, filtered_counts, - counts_col= 'n', file_path) + value_col= 'n', file_path= paste0(out, '/QC_table.csv')) # ## Index count summaries ---- - print("Generating index counts tables") + print("2. Generating index counts tables ...") # Check that "IndexBarcode1" and "index_1" columns are present. # If so, calculate index summary and write out. - if('index_1' %in% colnames(sample_meta) & 'index_1' %in% colnames(raw_counts)) { + if('index_1' %in% colnames(sample_meta) & 'index_1' %in% colnames(raw_counts_uncollapsed)) { expected_index1= unique(sample_meta$index_1) - index1_counts= get_index_summary(raw_counts, 'index_1', expected_index1) + index1_counts= get_index_summary(raw_counts_uncollapsed, 'index_1', expected_index1) index1_counts %>% write.csv(file= paste(out, 'index1_counts.csv', sep='/'), row.names=F) } else { - print('Column "index_1" not detected. Skipping index 1 summaries ...') + print('Column "index_1" not detected. Skipping index 1 summaries ...', quote= FALSE) } # Do the same for index 2. - if('index_2' %in% colnames(sample_meta) & 'index_2' %in% colnames(raw_counts)) { + if('index_2' %in% colnames(sample_meta) & 'index_2' %in% colnames(raw_counts_uncollapsed)) { expected_index2= unique(sample_meta$index_2) - index2_counts= get_index_summary(raw_counts, 'index_2', expected_index2) + index2_counts= get_index_summary(raw_counts_uncollapsed, 'index_2', expected_index2) index2_counts %>% write.csv(file= paste(out, 'index2_counts.csv', sep='/'), row.names=F) } else { - print('Column "index_2" not detected. Skipping index 2 summaries ...') + print('Column "index_2" not detected. Skipping index 2 summaries ...', quote= FALSE) } ## Total counts ---- - print("generating total_counts image") - - tc= plot_total_counts(filtered_counts, id_cols, facet_col= 'pcr_plate') - - pdf(file=paste(out, "total_counts.pdf", sep="/"), - width=sqrt(num_profiles)*2, height=sqrt(num_profiles)) - print(tc) - dev.off() - rm(tc) + print("3. Generating total_counts image ...") + potential_error= base::tryCatch({ + tc= create_total_counts_barplot(filtered_counts, id_cols, facet_col= 'pcr_plate') + + pdf(file=paste(out, "total_counts.pdf", sep="/"), + width=sqrt(num_profiles)*2, height=sqrt(num_profiles)) + print(tc) + dev.off() + rm(tc) + }, error= function(e) { + print(e) + print('Encountered an error when creating the total counts barplot. 
Skipping this output ...') + return('QC table') + }) + + # Collect returned string if an error occurred + if(!is.null(potential_error)) { + skipped_qcs = c(skipped_qcs, potential_error) + } # Assay QCs _________________________ ---- ## Cell lines recovered ---- - print("generating cell_lines_present image") - - cl_rec= plot_cl_recovery(filtered_counts, id_cols= id_cols, facet_col= 'pcr_plate', - counts_threshold= counts_threshold, plot_type= 'percent') - - pdf(file=paste(out, "cell_lines_present.pdf", sep="/"), - width=sqrt(num_profiles)*2, height=sqrt(num_profiles)) - print(cl_rec) - dev.off() - rm(recovery, cl_rec) + print("4. Generating cell_lines_present image ...") + potential_error= base::tryCatch({ + cl_rec= create_recovery_barplot(filtered_counts, id_cols= id_cols, facet_col= 'pcr_plate', + count_threshold= count_threshold, plot_type= 'percent') + + pdf(file=paste(out, "cell_lines_present.pdf", sep="/"), + width=sqrt(num_profiles)*2, height=sqrt(num_profiles)) + print(cl_rec) + dev.off() + rm(cl_rec) + }, error= function(e) { + print(e) + print('Encountered an error when creating the recovery barplot. Skipping this output ...') + return('CL recovery') + }) + + # Collect returned string if an error occurred + if(!is.null(potential_error)) { + skipped_qcs = c(skipped_qcs, potential_error) + } ## Contaminants ---- - print('generating contaminate cell lines') - contams= annotated_counts %>% dplyr::filter(expected_read==F) %>% - dplyr::mutate(barcode_id= ifelse(is.na(CCLE_name), Name, CCLE_name)) %>% - dplyr::group_by(forward_read_cl_barcode, barcode_id) %>% - dplyr::summarise(num_wells= n(), median_n=median(n), max_n= max(n)) %>% ungroup() %>% - dplyr::arrange(desc(num_wells)) - - contams %>% write.csv(file= paste(out, 'contam_cell_lines.csv', sep='/'), row.names=F) - rm(contams) - - # Contaminates for ursula ---- - print('generating contaminate reads for Ursula') - # Determine which seq cols are present. 
- rc_seq_cols= c('flowcell_names', 'flowcell_lanes', 'index_1', 'index_2') - present_seq_cols= intersect(rc_seq_cols, colnames(raw_counts)) - - # map of seq_cols to PCR locations - pcr_plate_map= sample_meta %>% - dplyr::distinct(pick(any_of(c(present_seq_cols, 'pcr_plate', 'pcr_well', 'cell_set')))) %>% - dplyr::group_by(pcr_plate) %>% dplyr::mutate(num_wells_in_plate= dplyr::n()) %>% dplyr::ungroup() %>% - dplyr::group_by(cell_set) %>% dplyr::mutate(num_wells_in_set= dplyr::n()) %>% dplyr::ungroup() - - # index filter and identify reads as mapped or not - unique_seq_col_vals= sample_meta %>% dplyr::distinct(pick(all_of(present_seq_cols))) - sequencing_filter= raw_counts %>% - dplyr::semi_join(unique_seq_col_vals, by= present_seq_cols) %>% - dplyr::mutate(mapped= ifelse(forward_read_cl_barcode %in% unique(annotated_counts$forward_read_cl_barcode), T, F)) - - # total counts per well - used to calculate fractions - counts_per_well= sequencing_filter %>% dplyr::group_by(pick(all_of(present_seq_cols))) %>% - dplyr::summarise(well_total_n= sum(n)) %>% dplyr::ungroup() - - # mapped contaminates to bind - mapped_contams= annotated_counts %>% dplyr::filter(!expected_read) %>% - dplyr::mutate(barcode_name= ifelse(is.na(CCLE_name), Name, CCLE_name)) %>% - dplyr::select(all_of(c(present_seq_cols, 'forward_read_cl_barcode', 'n', 'barcode_name'))) - - contam_reads= sequencing_filter %>% dplyr::filter(mapped == FALSE) %>% dplyr::select(-mapped) %>% - dplyr::bind_rows(mapped_contams) %>% - dplyr::left_join(counts_per_well, by= present_seq_cols) %>% - dplyr::left_join(pcr_plate_map, by= present_seq_cols) %>% - # filter out barcodes that only appear in one well - dplyr::group_by(forward_read_cl_barcode) %>% dplyr::filter(dplyr::n() >1) %>% dplyr::ungroup() %>% - # number of wells in a pcr plate a barcode is detected in - dplyr::group_by(forward_read_cl_barcode, pcr_plate) %>% - dplyr::mutate(num_wells_detected_plate= n()) %>% dplyr::ungroup() %>% - # number of wells in a cell set a barcode is detected in - dplyr::group_by(forward_read_cl_barcode, cell_set) %>% - dplyr::mutate(num_wells_detected_set= n()) %>% dplyr::ungroup() %>% - # determine if contamination is project, plate, or set - dplyr::group_by(forward_read_cl_barcode) %>% - dplyr::mutate(num_wells_detected= dplyr::n(), - project_code= unique(sample_meta$project_code), - fraction= n/well_total_n, - type1= ifelse(sum(num_wells_detected== nrow(pcr_plate_map))>1, 'project_contam', NA), - type2= ifelse(sum(num_wells_detected== num_wells_detected_plate & - num_wells_detected_plate == num_wells_in_plate)>1, 'plate_contam', NA), - type3= ifelse(sum(num_wells_detected == num_wells_detected_set & - num_wells_detected_set== num_wells_in_set)>1, 'set_contam', NA)) %>% - dplyr::ungroup() %>% - tidyr::unite(scope, all_of(c('type1', 'type2', 'type3')), sep=',', remove = T, na.rm = T) %>% - dplyr::group_by(project_code, forward_read_cl_barcode, barcode_name, scope, num_wells_detected) %>% - dplyr::summarise(min_n= min(n), med_n= median(n), max_n= max(n), - min_fraction= min(fraction), med_fraction= median(fraction), max_fraction=max(fraction)) %>% - dplyr::arrange(desc(max_fraction)) - - # write out - contam_reads %>% write.csv(paste0(out, 'contam_reads.csv'), row.names=F) + print('5. 
Generating contaminant cell lines ...')
+  potential_error= base::tryCatch({
+    contams= annotated_counts %>% dplyr::filter(expected_read==F) %>%
+      dplyr::mutate(barcode_id= ifelse(is.na(CCLE_name), Name, CCLE_name)) %>%
+      dplyr::group_by(forward_read_cl_barcode, barcode_id) %>%
+      dplyr::summarise(num_wells= n(), median_n=median(n), max_n= max(n)) %>% ungroup() %>%
+      dplyr::arrange(desc(num_wells))
+
+    contams %>% write.csv(file= paste(out, 'contam_cell_lines.csv', sep='/'), row.names=F)
+    rm(contams)
+  }, error= function(e) {
+    print(e)
+    print('Encountered an error when creating the contaminants file. Skipping this output ...')
+    return('contam_cell_lines.csv')
+  })
+
+  # Collect returned string if an error occurred
+  if(!is.null(potential_error)) {
+    skipped_qcs = c(skipped_qcs, potential_error)
+  }
+
+  ## Contaminants for Ursula ----
+  print('6. Generating contaminant reads for Ursula ...')
+  potential_error= base::tryCatch({
+    # Determine which seq cols are present.
+    rc_seq_cols= c('flowcell_names', 'flowcell_lanes', 'index_1', 'index_2')
+    present_seq_cols= intersect(rc_seq_cols, colnames(raw_counts))
+
+    # map of seq_cols to PCR locations
+    pcr_plate_map= sample_meta %>%
+      dplyr::distinct(pick(any_of(c(present_seq_cols, 'pcr_plate', 'pcr_well', 'cell_set')))) %>%
+      dplyr::group_by(pcr_plate) %>% dplyr::mutate(num_wells_in_plate= dplyr::n()) %>% dplyr::ungroup() %>%
+      dplyr::group_by(cell_set) %>% dplyr::mutate(num_wells_in_set= dplyr::n()) %>% dplyr::ungroup()
+
+    # index filter and identify reads as mapped or not
+    unique_seq_col_vals= sample_meta %>% dplyr::distinct(pick(all_of(present_seq_cols)))
+    sequencing_filter= raw_counts %>%
+      dplyr::semi_join(unique_seq_col_vals, by= present_seq_cols) %>%
+      dplyr::mutate(mapped= ifelse(forward_read_cl_barcode %in% unique(annotated_counts$forward_read_cl_barcode), T, F))
+
+    # total counts per well - used to calculate fractions
+    counts_per_well= sequencing_filter %>% dplyr::group_by(pick(all_of(present_seq_cols))) %>%
+      dplyr::summarise(well_total_n= sum(n)) %>% dplyr::ungroup()
+
+    # mapped contaminants to bind
+    mapped_contams= annotated_counts %>% dplyr::filter(!expected_read) %>%
+      dplyr::mutate(barcode_name= ifelse(is.na(CCLE_name), Name, CCLE_name)) %>%
+      dplyr::select(all_of(c(present_seq_cols, 'forward_read_cl_barcode', 'n', 'barcode_name')))
+
+    contam_reads= sequencing_filter %>% dplyr::filter(mapped == FALSE) %>% dplyr::select(-mapped) %>%
+      dplyr::bind_rows(mapped_contams) %>%
+      dplyr::left_join(counts_per_well, by= present_seq_cols) %>%
+      dplyr::left_join(pcr_plate_map, by= present_seq_cols) %>%
+      # filter out barcodes that only appear in one well
+      dplyr::group_by(forward_read_cl_barcode) %>% dplyr::filter(dplyr::n() >1) %>% dplyr::ungroup() %>%
+      # number of wells in a pcr plate a barcode is detected in
+      dplyr::group_by(forward_read_cl_barcode, pcr_plate) %>%
+      dplyr::mutate(num_wells_detected_plate= n()) %>% dplyr::ungroup() %>%
+      # number of wells in a cell set a barcode is detected in
+      dplyr::group_by(forward_read_cl_barcode, cell_set) %>%
+      dplyr::mutate(num_wells_detected_set= n()) %>% dplyr::ungroup() %>%
+      # determine if contamination is project, plate, or set
+      dplyr::group_by(forward_read_cl_barcode) %>%
+      dplyr::mutate(num_wells_detected= dplyr::n(),
+                    project_code= unique(sample_meta$project_code),
+                    fraction= n/well_total_n,
+                    type1= ifelse(sum(num_wells_detected== nrow(pcr_plate_map))>1, 'project_contam', NA),
+                    type2= ifelse(sum(num_wells_detected== num_wells_detected_plate &
+                                        num_wells_detected_plate == 
num_wells_in_plate)>1, 'plate_contam', NA), + type3= ifelse(sum(num_wells_detected == num_wells_detected_set & + num_wells_detected_set== num_wells_in_set)>1, 'set_contam', NA)) %>% + dplyr::ungroup() %>% + tidyr::unite(scope, all_of(c('type1', 'type2', 'type3')), sep=',', remove = T, na.rm = T) %>% + dplyr::group_by(project_code, forward_read_cl_barcode, barcode_name, scope, num_wells_detected) %>% + dplyr::summarise(min_n= min(n), med_n= median(n), max_n= max(n), + min_fraction= min(fraction), med_fraction= median(fraction), max_fraction=max(fraction)) %>% + dplyr::arrange(desc(max_fraction)) + + # write out + contam_reads %>% write.csv(paste0(out, 'contam_reads.csv'), row.names=F) + }, error= function(e) { + print(e) + print('Encountered an error when creating the contams for UW file. Skipping this output ...') + return('contam for UW') + }) + + # Collect returned string if an error occurred + if(!is.null(potential_error)) { + skipped_qcs = c(skipped_qcs, potential_error) + } ## Cumulative counts by lines in negcons ---- - print("generating cummulative image") - cdf= annotated_counts %>% dplyr::select(!any_of(c('members'))) %>% - dplyr::filter(expected_read, trt_type == control_type) %>% - dplyr::left_join(num_cls_in_set, by= "cell_set") %>% - dplyr::mutate(expected_num_cl= ifelse(control_barcodes, expected_num_cl + length(unique(CB_meta$Name)), - expected_num_cl)) %>% # add CBs to expected_num_cl if there are CBs - tidyr::unite(all_of(id_cols), col= 'profile_id', sep= ':', remove= FALSE) %>% - dplyr::group_by(pcr_plate, pcr_well, profile_id, expected_num_cl) %>% - dplyr::mutate(total_counts= sum(n), pct_counts= n/total_counts,) %>% dplyr::arrange(-n) %>% - dplyr::mutate(cum_pct= cumsum(pct_counts), rank= row_number(), - rank_pct= rank/expected_num_cl) %>% dplyr::ungroup() - - # additional tables - mark50= cdf %>% dplyr::filter(cum_pct >= 0.5) %>% dplyr::group_by(profile_id) %>% - arrange(cum_pct) %>% dplyr::filter(row_number()==1) %>% ungroup() %>% - dplyr::select(profile_id, rank_pct= rank_pct, num50= rank, num50_loc= rank_pct) - mark95= cdf %>% dplyr::group_by(profile_id) %>% - dplyr::mutate(auc= sum(cum_pct*(1/expected_num_cl))) %>% # calculate AUCs - dplyr::filter(cum_pct >= 0.95) %>% - arrange(cum_pct) %>% dplyr::filter(row_number() ==1) %>% ungroup() %>% - dplyr::select(profile_id, rank_pct= rank_pct, num95= rank, num95_loc= rank_pct, auc) - - cdf_plot= cdf %>% - merge(mark50, by= c('profile_id', 'rank_pct'), all.x=T) %>% - merge(mark95, by= c('profile_id', 'rank_pct'), all.x= T) %>% - ggplot(aes(x= rank_pct, y=cum_pct)) + - { if(contains_cbs) geom_point(. 
%>% dplyr::filter(!is.na(Name),
-                                 mapping=aes(x= rank_pct, y=cum_pct, color=reorder(Name, log2_dose)), size=3) } +
-    geom_line(color='black') +
-    # point for 50% of counts
-    geom_segment(aes(x= -Inf , y= .50, xend = num50_loc, yend = .50), color= 'black', linetype='dashed') +
-    geom_segment(aes(x= num50_loc, y= -Inf, xend = num50_loc, yend = .50), color= 'black', linetype='dashed') +
-    geom_label(aes(x=num50_loc, y= .25, label= num50), hjust= 0, color= 'black') +
-    # point for 95% of counts
-    geom_segment(aes(x= -Inf , y= .95, xend = num95_loc, yend = .95), color= 'black', linetype='dashed') +
-    geom_segment(aes(x= num95_loc, y= -Inf, xend = num95_loc, yend = .95), color= 'black', linetype='dashed') +
-    geom_label(aes(x=num95_loc, y= .75, label= num95), hjust= 0, color= 'black') +
-    # label for AUC
-    geom_label(aes(x=num95_loc, y= .25, label= paste0('AUC ', round(auc,3))), hjust= 'inward', color= 'black') +
-    facet_wrap(~profile_id) +
-    labs(x='% rank of unique reads', y='Cumulative percentage', color= 'CBs') + theme_bw()
-
-  pdf(file=paste(out, "cdf_plot.pdf", sep="/"),
-      width=sqrt(num_profiles)*2, height=sqrt(num_profiles))
-  print(cdf_plot)
-  dev.off()
-  rm(cdf, mark50, mark95, cdf_plot)
+  print("7. Generating cumulative image ...")
+  potential_error= base::tryCatch({
+    cdf= filtered_counts %>% dplyr::filter(trt_type == control_type) %>%
+      dplyr::left_join(num_cls_in_set, by= "cell_set") %>%
+      dplyr::mutate(expected_num_cl= ifelse(control_barcodes, expected_num_cl + length(unique(CB_meta$Name)),
+                                            expected_num_cl)) %>% # add CBs to expected_num_cl if there are CBs
+      tidyr::unite(all_of(id_cols), col= 'profile_id', sep= ':', remove= FALSE) %>%
+      dplyr::group_by(pcr_plate, pcr_well, profile_id, expected_num_cl) %>%
+      dplyr::mutate(total_counts= sum(n), pct_counts= n/total_counts) %>% dplyr::arrange(-n) %>%
+      dplyr::mutate(cum_pct= cumsum(pct_counts), rank= row_number(),
+                    rank_pct= rank/expected_num_cl) %>% dplyr::ungroup()
+
+    # additional tables
+    mark50= cdf %>% dplyr::filter(cum_pct >= 0.5) %>% dplyr::group_by(profile_id) %>%
+      arrange(cum_pct) %>% dplyr::filter(row_number()==1) %>% ungroup() %>%
+      dplyr::select(profile_id, rank_pct= rank_pct, num50= rank, num50_loc= rank_pct)
+    mark95= cdf %>% dplyr::group_by(profile_id) %>%
+      dplyr::mutate(auc= sum(cum_pct*(1/expected_num_cl))) %>% # calculate AUCs
+      dplyr::filter(cum_pct >= 0.95) %>%
+      arrange(cum_pct) %>% dplyr::filter(row_number() ==1) %>% ungroup() %>%
+      dplyr::select(profile_id, rank_pct= rank_pct, num95= rank, num95_loc= rank_pct, auc)
+
+    cdf_plot= cdf %>%
+      merge(mark50, by= c('profile_id', 'rank_pct'), all.x=T) %>%
+      merge(mark95, by= c('profile_id', 'rank_pct'), all.x= T) %>%
+      ggplot(aes(x= rank_pct, y=cum_pct)) +
+      { if(contains_cbs) geom_point(. %>% dplyr::filter(!is.na(Name)),
+                                    mapping=aes(x= rank_pct, y=cum_pct, color=reorder(Name, log2_dose)), size=3) } +
+      geom_line(color='black') +
+      # point for 50% of counts
+      geom_segment(aes(x= -Inf , y= .50, xend = num50_loc, yend = .50), color= 'black', linetype='dashed') +
+      geom_segment(aes(x= num50_loc, y= -Inf, xend = num50_loc, yend = .50), color= 'black', linetype='dashed') +
+      geom_label(aes(x=num50_loc, y= .25, label= num50), hjust= 0, color= 'black') +
+      # point for 95% of counts
+      geom_segment(aes(x= -Inf , y= .95, xend = num95_loc, yend = .95), color= 'black', linetype='dashed') +
+      geom_segment(aes(x= num95_loc, y= -Inf, xend = num95_loc, yend = .95), color= 'black', linetype='dashed') +
+      geom_label(aes(x=num95_loc, y= .75, label= num95), hjust= 0, color= 'black') +
+      # label for AUC
+      geom_label(aes(x=num95_loc, y= .25, label= paste0('AUC ', round(auc,3))), hjust= 'inward', color= 'black') +
+      facet_wrap(~profile_id) +
+      labs(x='% rank of unique reads', y='Cumulative percentage', color= 'CBs') + theme_bw()
+
+    pdf(file=paste(out, "cdf_plot.pdf", sep="/"),
+        width=sqrt(num_profiles)*2, height=sqrt(num_profiles))
+    print(cdf_plot)
+    dev.off()
+    rm(cdf, mark50, mark95, cdf_plot)
+  }, error= function(e) {
+    print(e)
+    print('Encountered an error when creating the cdf plot. Skipping this output ...')
+    return('cdf plot')
+  })
+
+  # Collect returned string if an error occurred
+  if(!is.null(potential_error)) {
+    skipped_qcs = c(skipped_qcs, potential_error)
+  }
 
   ## Control barcode trends ----
   if(contains_cbs & is.data.frame(normalized_counts)) {
-    print("generating control_barcode_trend image")
+    print("8. Generating control_barcode_trend image ...")
+    potential_error= base::tryCatch({
+      trend_sc= create_ctrlBC_scatterplots(normalized_counts, id_cols, value_col= 'log2_n')
+
+      pdf(file=paste(out, "control_barcode_trend.pdf", sep="/"),
+          width=sqrt(num_profiles)*2, height=sqrt(num_profiles)*2)
+      print(trend_sc)
+      dev.off()
+      rm(trend_sc)
+    }, error= function(e) {
+      print(e)
+      print('Encountered an error when creating the CB trends plot. Skipping this output ...')
+      return('cb trend')
+    })
 
-    trend_sc= plot_ctrl_bc_trend(normalized_counts, id_cols, counts_col= 'log2_n')
+    # Collect returned string if an error occurred
+    if(!is.null(potential_error)) {
+      skipped_qcs = c(skipped_qcs, potential_error)
+    }
+  } else {
+    print('8. No control barcodes detected. Skipping control_barcode_trend image.')
+  }
+
+  ## Sample correlation -----
+  print("9. Generating sample_cor image ...")
+  potential_error= base::tryCatch({
+    cor_df= filtered_counts %>%
+      dplyr::filter(!is.na(DepMap_ID), !is.na(trt_type), !trt_type %in% c("empty", "", "CB_only")) %>%
+      dplyr::mutate(log2_n= log2(n + 1))
+    cp= create_cor_heatmap(input_df= cor_df,
+                           row_id_cols= c('DepMap_ID'),
+                           col_id_cols= c(sig_cols, id_cols),
+                           value_col= 'log2_n')
 
-    pdf(file=paste(out, "control_barcode_trend.pdf", sep="/"),
+    pdf(file=paste(out, "sample_cor.pdf", sep="/"),
         width=sqrt(num_profiles)*2, height=sqrt(num_profiles)*2)
-    print(trend_sc)
+    print(cp)
     dev.off()
-    rm(cb_trend, trend_sc)
+    rm(cp)
+  }, error= function(e) {
+    print(e)
+    print('Encountered an error when creating the sample_cor figure. Skipping this output ...')
+    return('sample_cor')
+  })
+
+  # Collect returned string if an error occurred
+  if(!is.null(potential_error)) {
+    skipped_qcs = c(skipped_qcs, potential_error)
   }
 
-  ## Sample correlation -----
-  print("generating sample_cor image")
-
-  cor_df= filtered_counts %>%
-    dplyr::filter(!is.na(DepMap_ID), !is.na(trt_type), !trt_type %in% c("empty", "", "CB_only")) %>%
-    dplyr::mutate(log2_n= log2(n + 1))
-  cp= plot_cor_heatmap(input_df= cor_df,
-                       row_id_cols= c('DepMap_ID'),
-                       col_id_cols= c(sig_cols, id_cols),
-                       counts_col= 'log2_n')
-
-  pdf(file=paste(out, "sample_cor.pdf", sep="/"),
-      width=sqrt(num_profiles)*2, height=sqrt(num_profiles)*2)
-  print(cp)
-  dev.off()
-  rm(correlation_matrix, cp)
-
   ## Tech rep correlations ----
-  if(is.data.frame(normalized_counts)) {
-    if('tech_rep' %in% colnames(normalized_counts)) {
+  if(is.data.frame(normalized_counts) & 'tech_rep' %in% colnames(normalized_counts)) {
+    # Check if there are at least two tech reps
+    unique_tech_reps= na.omit(unique(normalized_counts$tech_rep))
+
+    if(length(unique_tech_reps) >= 2) {
+      print("10. Generating tech rep correlations image ...")
       # Set up replicate groups depending on the "bio_rep" column
       if('bio_rep' %in% colnames(normalized_counts) & !'bio_rep' %in% sig_cols) {
         replicate_group_cols= c(sig_cols, 'bio_rep')
@@ -563,71 +645,95 @@ QC_images = function(raw_counts, annotated_counts, normalized_counts= NA,
       }
 
       # Create replicate scatter plot
-      print("generating tech rep correlations image")
-      tech_reps_plt= make_replicate_scatterplots(input_df= normalized_counts,
-                                                 cell_line_cols= unique_cell_line_cols,
-                                                 replicate_group_cols= replicate_group_cols,
-                                                 replicate_col= 'tech_rep',
-                                                 values_col= 'log2_n')
+      potential_error= base::tryCatch({
+        tech_reps_plt= create_replicate_scatterplots(input_df= normalized_counts,
+                                                     cell_line_cols= unique_cell_line_cols,
+                                                     replicate_group_cols= replicate_group_cols,
+                                                     replicate_col= 'tech_rep',
+                                                     value_col= 'log2_n')
+        if(!is.null(tech_reps_plt)) {
+          pdf(file=paste(out, "tech_reps_plt.pdf", sep="/"),
+              width=sqrt(num_profiles), height=sqrt(num_profiles))
+          print(tech_reps_plt)
+          dev.off()
+        } else {
+          print('No technical replicates detected - skipping plot.')
+        }
+      }, error= function(e) {
+        print(e)
+        print('Encountered an error when creating the tech_reps_plt figure. Skipping this output ...')
+        return('tech_reps_plt')
+      })
 
-      if(!is.null(tech_reps_plt)) {
-        pdf(file=paste(out, "tech_reps_plt.pdf", sep="/"),
-            width=sqrt(num_profiles), height=sqrt(num_profiles))
-        print(tech_reps_plt)
-        dev.off()
-      } else {
-        print('No technical replicates detected - skipping plot.')
+      # Collect returned string if an error occurred
+      if(!is.null(potential_error)) {
+        skipped_qcs = c(skipped_qcs, potential_error)
       }
+
+    } else {
+      print('10. No technical replicates detected. Skipping tech_reps scatter plot.')
     }
+  } else {
+    print('10. No technical replicates detected. 
Skipping tech_reps scatter plot.') } + ## Bio rep correlations ---- - if('bio_rep' %in% colnames(normalized_counts)) { - num_bio_reps= normalized_counts %>% - dplyr::filter((!trt_type %in% c("empty", "", "CB_only")) & !is.na(trt_type)) %>% - dplyr::pull(bio_rep) %>% unique() %>% length() + if('bio_rep' %in% colnames(l2fc)) { + unique_bio_reps= na.omit(unique(l2fc$bio_rep)) - if(num_bio_reps > 1) { - print("generating bio rep correlations image") + if(length(unique_bio_reps) >= 2) { + l2fc_with_log2= l2fc %>% dplyr::mutate(log2_mean_normalized_n= log2(mean_normalized_n)) - if('bio_rep' %in% colnames(normalized_counts)) { - bio_rep_id_cols= c(sig_cols, 'bio_rep') - } else { - bio_rep_id_cols= sig_cols - print('WARNING: bio_rep column not detected. Assuming that there are NO biological replicates.') - print('Technical replicate collapse will be performed across the sig_cols.') - } + # Bio replicate scatter plots + # bio_reps_plt= create_replicate_scatterplots(input_df= l2fc_with_log2s, + # cell_line_cols= cell_line_cols, + # replicate_group_cols= sig_cols, + # replicate_col= 'bio_rep', + # value_col= 'log2_mean_normalized_n') + # if(!is.null(bio_reps_plt)) { + # pdf(file=paste(out, "bio_reps_plt.pdf", sep="/"), + # width=sqrt(num_profiles), height=sqrt(num_profiles)) + # print(bio_reps_plt) + # dev.off() + # } else { + # print('No technical replicates detected - skipping plot.') + # } - # collapse tech reps taken from 'compute_l2fc' - collapsed_tech_rep= normalized_counts %>% - dplyr::filter(!(trt_type %in% c("empty", "", "CB_only")) & !is.na(trt_type), !is.na(CCLE_name)) %>% - dplyr::group_by(pick(all_of(c('CCLE_name', 'trt_type', bio_rep_id_cols)))) %>% - dplyr::summarise(mean_normalized_n = mean(!! rlang::sym(count_col_name)), - num_tech_reps= n()) %>% dplyr::ungroup() - collapsed_tech_rep$sig_id= do.call(paste,c(collapsed_tech_rep[sig_cols], sep=':')) - - bio_corr= collapsed_tech_rep %>% ungroup() %>% - filter(!is.na(CCLE_name), - (!trt_type %in% c("empty", "", "CB_only")) & !is.na(trt_type)) %>% - mutate(plt_id= paste(sig_id, bio_rep, sep=':')) %>% - reshape2::acast(CCLE_name~plt_id, value.var="mean_normalized_n") %>% - cor(use="pairwise.complete.obs") + # Bio replicate heatmap + print("11. Generating bio rep correlations heatmap ...") + potential_error= base::tryCatch({ + bio_corr_hm= create_cor_heatmap(input_df= l2fc_with_log2, + row_id_cols= cell_line_cols, + col_id_cols= c(sig_cols, 'bio_rep'), + value_col= 'l2fc', + cor_method= 'pearson') + pdf(file=paste(out, "bio_corr_hm.pdf", sep="/"), + width=sqrt(num_profiles), height=sqrt(num_profiles)) + print(bio_corr_hm) + dev.off() + }, error= function(e) { + print(e) + print('Encountered an error when creating the bio_corr_hm figure. Skipping this output ...') + return('bio_corr_hm') + }) - bio_corr_hm= bio_corr %>% reshape2::melt() %>% ggplot() + - geom_tile(aes(x=Var1, y=Var2, fill=value)) + - labs(x="", y="", fill="correlation\nnorm_n") + - scale_fill_gradientn(breaks= c(0, 0.5, 1), - colours= c('white', 'yellow', 'red'), - limits=c(0,1), oob=squish) + - theme(axis.text.x = element_text(angle=70, hjust=1, size=5), - axis.text.y = element_text(size=5)) + # Collect returned string if an error occurred + if(!is.null(potential_error)) { + skipped_qcs = c(skipped_qcs, potential_error) + } - pdf(file=paste(out, "bio_corr_hm.pdf", sep="/"), - width=sqrt(num_profiles), height=sqrt(num_profiles)) - print(bio_corr_hm) - dev.off() + } else { + print('11. No biological replicates detected. 
Skipping bio_rep heatmap.')
     }
   }
-  
+
   # End _________________________ ----
   print('QC finishing')
+  if(length(na.omit(skipped_qcs)) != 0) {
+    print(paste0('WARNING: The following ', length(na.omit(skipped_qcs)), ' QCs encountered errors and were skipped - '))
+    print(na.omit(skipped_qcs))
+  } else {
+    print('No errors encountered.')
+  }
+  return(skipped_qcs)
 }

From ecb6dc45c8db6cd49c9da5f52bf3f1ce9e84d014 Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Mon, 19 Aug 2024 14:55:32 -0400
Subject: [PATCH 006/127] Updated validator

validate_columns_exist now prints missing items
---
 scripts/src/collate_fastq_reads.R | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R
index 2ea5fd75..c74e4861 100644
--- a/scripts/src/collate_fastq_reads.R
+++ b/scripts/src/collate_fastq_reads.R
@@ -1,13 +1,18 @@
 #' validate_columns_exist
 #'
 #' This function checks that a list of columns are present in a dataframe.
+#' Columns that were not found in the dataframe are printed out.
 #'
 #' @param selected_cols A vector of strings each representing a column name
 #' @param df A dataframe to check against
 #' @return Boolean
-validate_columns_exist= function(selected_columns, df) {
-  # Check that all of selected_columns are in df
-  if(any(!selected_columns %in% colnames(df))) {
+validate_columns_exist= function(selected_cols, df) {
+  # Check that all of selected_cols are in df - base::setdiff(A, B) = A[!A %in% B].
+  unmatched_cols= base::setdiff(selected_cols, colnames(df))
+
+  if(length(unmatched_cols) > 0) {
+    print('The following columns are missing: ')
+    print(unmatched_cols)
     return(FALSE)
   } else {
     return(TRUE)
@@ -103,7 +108,7 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta,
 
   # Validation: Check that flowcell_names and flowcell_lanes exist in the sample meta ----
   if(!validate_columns_exist(c('flowcell_names', 'flowcell_lanes'), sample_meta)) {
-    stop('Flowcell_names and/or flowcell_lanes is NOT present in the sample meta.')
+    stop('The above column(s) are NOT present in the sample meta.')
   }
 
   # Validation: Check that sequencing_index_cols exist in the sample meta ----
@@ -115,8 +120,6 @@
 
   # Validation: Check that id_cols exist in the sample meta ----
   if(!validate_columns_exist(id_cols, sample_meta)) {
-    print('The following id_cols are not present in the sample meta.')
-    print(id_cols[!id_cols %in% colnames(sample_meta)])
     stop('One or more id_cols is NOT present in the sample meta.')
   }
 
@@ -194,10 +197,9 @@
   print(paste0('Index purity: ', round(index_purity, 4)))
   if(index_purity > 1) {
     stop('ERROR: Index purity is greater than 1!')
-  }
-  if(index_purity < 0.5) {
+  } else if(index_purity < 0.5) {
     print('Warning: Low index purity!')
-  }
+  }
 
   print('Done!')
   return(raw_counts)

From 04e83a22f883af4e03a4014f50720c294a00399e Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Mon, 19 Aug 2024 14:57:05 -0400
Subject: [PATCH 007/127] Remove index2 RC-ing

Removed index2 RC-ing - moved to collate
---
 scripts/src/filter_raw_reads.R | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/scripts/src/filter_raw_reads.R b/scripts/src/filter_raw_reads.R
index 8e663bc0..e29f811a 100755
--- a/scripts/src/filter_raw_reads.R
+++ b/scripts/src/filter_raw_reads.R
@@ -1,15 +1,18 @@
-#suppressPackageStartupMessages(library(sets))
-
 #' 
validate_columns_exist
 #'
 #' This function checks that a list of columns are present in a dataframe.
+#' Columns that were not found in the dataframe are printed out.
 #'
 #' @param selected_cols A vector of strings each representing a column name
 #' @param df A dataframe to check against
 #' @return Boolean
-validate_columns_exist= function(selected_columns, df) {
-  # Check that all of selected_columns are in df
-  if(any(!selected_columns %in% colnames(df))) {
+validate_columns_exist= function(selected_cols, df) {
+  # Check that all of selected_cols are in df - base::setdiff(A, B) = A[!A %in% B].
+  unmatched_cols= base::setdiff(selected_cols, colnames(df))
+
+  if(length(unmatched_cols) > 0) {
+    print('The following columns are missing: ')
+    print(unmatched_cols)
     return(FALSE)
   } else {
     return(TRUE)

From 5313fb9ebf6eb2615a9969a38c847058e4ab7460 Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Mon, 19 Aug 2024 17:30:11 -0400
Subject: [PATCH 008/127] Documented new function

---
 scripts/src/QC_images.R | 183 +++++++++++++++++++++++++++------------
 1 file changed, 124 insertions(+), 59 deletions(-)

diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R
index d5bc5994..273aad88 100755
--- a/scripts/src/QC_images.R
+++ b/scripts/src/QC_images.R
@@ -5,7 +5,7 @@
 #'
 #' @param selected_cols A vector of strings each representing a column name
 #' @param df A dataframe to check against
-#' @return Boolean
+#' @returns Boolean
 validate_columns_exist= function(selected_cols, df) {
   # Check that all of selected_cols are in df - base::setdiff(A, B) = A[!A %in% B].
   unmatched_cols= base::setdiff(selected_cols, colnames(df))
@@ -26,7 +26,7 @@ validate_columns_exist= function(selected_cols, df) {
 #' @param df A dataframe which must contain the column "n" which represents the count of a read.
 #' @param index_col The name of the column containing the index barcodes as a string. This column must be present in "df".
 #' @param valid_indices A vector of all the valid indices for "index_col".
-#' @return A dataframe with the follow columns:
+#' @returns A dataframe with the following columns:
 #'         - index_col: String, The column containing the index barcodes.
 #'         - idx_n: Numeric, Number of reads associated with a specific index barcode.
 #'         - fraction: Numeric, "idx_n" divided by the total number of reads in the run.
@@ -50,22 +50,34 @@ get_index_summary= function(df, index_col, valid_indices) {
 
 #' Calculate purity metrics
 #'
-#' Create the qc table with index purity and cell line purity
+#' Create the qc table with index purity and cell line purity.
 #'
 #' @param raw_counts_uncollapsed Dataframe output from nori. 
#' @param raw_counts Raw counts dataframe output from collate_fastq_reads.
 #' @param filtered_counts Filtered counts dataframe output from filter_raw_reads.
-#' @param value_col String name of the counts column in all three dataframes.
+#' @param value_col String name of the counts column present in all three dataframes.
 #' @param file_path Location to write out the output.
-create_qc_table= function(raw_counts_uncollapsed, raw_counts, filtered_counts,
-                          value_col= 'n', file_path) {
+#' @returns Writes out a QC_table to the file_path.
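+#'
+#' @examples
+#' # Hypothetical usage sketch (added for illustration; not part of the original
+#' # patch). Assumes the three counts dataframes share the counts column "n".
+#' \dontrun{
+#' create_qc_table(raw_counts_uncollapsed, raw_counts, filtered_counts,
+#'                 value_col= 'n', file_path= 'QC_table.csv')
+#' }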
+create_qc_table= function(raw_counts_uncollapsed, raw_counts, filtered_counts, value_col= 'n', file_path) {
+  # Validation: Check that value_col is present in the three files.
+  if(!validate_columns_exist(value_col, raw_counts_uncollapsed)) {
+    stop(paste0('The column ', value_col, " was not detected in uncollapsed raw counts."))
+  }
+  if(!validate_columns_exist(value_col, raw_counts)) {
+    stop(paste0('The column ', value_col, " was not detected in raw counts."))
+  }
+  if(!validate_columns_exist(value_col, filtered_counts)) {
+    stop(paste0('The column ', value_col, " was not detected in filtered counts."))
+  }
+
+  # Calculate purities
   index_purity= sum(raw_counts[[value_col]]) / sum(raw_counts_uncollapsed[[value_col]])
   print(paste0('Index purity: ', round(index_purity, 4)))
   cell_line_purity= sum(filtered_counts[[value_col]]) / sum(raw_counts[[value_col]])
   print(paste0('Cell line purity: ', round(cell_line_purity, 4)))
-
   qc_table= data.frame(index_purity= index_purity, cell_line_purity= cell_line_purity)
 
+  # Write out table
   print(paste0('Writing QC table out to ', file_path))
   qc_table %>% write.csv(file_path, row.names= FALSE, quote= FALSE)
 }
@@ -78,7 +90,15 @@
 #' @param filtered_counts Filtered counts dataframe.
 #' @param id_cols Vector of columns names that identify each sample.
 #' @param facet_col String name of the column in filtered_counts to facet the plot.
+#'                  This can be left as NA if there isn't a column to facet on.
+#' @returns Returns a ggplot object.
 create_total_counts_barplot= function(filtered_counts, id_cols, facet_col= NA) {
+  # Validation: Check that id_cols and facet_col exist in filtered counts.
+  if(!validate_columns_exist(na.omit(c(id_cols, facet_col)), filtered_counts)) {
+    stop('Some input columns were not detected in filtered counts.')
+  }
+
+  # Sum up reads
   total_counts= filtered_counts %>%
@@ -101,17 +121,25 @@
 #' Cell line recovery barplot
 #'
-#' text
+#' Creates barplots of the cell lines recovered. The parameter "plot_type" can be used to plot the percentage or
+#' the total cell line counts on the y axis. The parameter "include_ctrl_bcs" can be used to include the control
+#' barcodes in the cell line count.
 #'
 #' @param filtered_counts Filtered counts dataframe.
 #' @param id_cols Vector of column names that identify each sample.
 #' @param facet_col String name of the column in filtered_counts to facet the plot.
 #' @param value_col String name of the column in filtered_counts that contains the counts.
 #' @param counts_threshold Threshold used to determine low counts. 
-#' @param plot_type description
-#' @param include_ctrl_bcs description
+#' @param plot_type String of either "percent" or "count" to adjust the y axis to be either the percentage or the
+#'                  total number of cell lines.
+#' @param include_ctrl_bcs Boolean. Set to TRUE if control barcodes are to be counted.
+#' @returns Returns a ggplot object.
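+#'
+#' @examples
+#' # Hypothetical usage sketch (added for illustration; not part of the original
+#' # patch). Column names follow the defaults used elsewhere in this script.
+#' \dontrun{
+#' create_recovery_barplot(filtered_counts, id_cols= c('pcr_plate', 'pcr_well'),
+#'                         facet_col= 'pcr_plate', count_threshold= 40,
+#'                         plot_type= 'percent')
+#' }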
 create_recovery_barplot= function(filtered_counts, id_cols, facet_col= NA, value_col= 'n', count_threshold,
                                   plot_type= 'percent', include_ctrl_bcs= FALSE) {
+  # Validation: Check that id_cols, facet_col, and value_col exist in filtered counts.
+  if(!validate_columns_exist(na.omit(c(id_cols, facet_col, value_col)), filtered_counts)) {
+    stop('Some input columns were not detected in filtered counts.')
+  }
 
   # Filter out control barcodes if it is specified.
   if(include_ctrl_bcs == FALSE) {
@@ -154,12 +182,19 @@
 
 #' Control barcode scatter plot
 #'
-#' text
+#' Creates a scatter plot of the control barcodes.
 #'
-#' @param name description
+#' @param normalized_counts Dataframe output from the normalize module.
+#' @param id_cols Vector of column names that identify every PCR well.
+#' @param value_col Name of the column that contains the values.
+#' @returns Returns a ggplot object.
 create_ctrlBC_scatterplots= function(normalized_counts, id_cols, value_col= 'log2_n') {
+  # Validation: Check that id_cols and value_col exist in normalized counts.
+  if(!validate_columns_exist(c(id_cols, value_col), normalized_counts)) {
+    stop('Some input columns were not detected in normalized counts.')
+  }
+
+  # Detect norm_r2 and norm_mae. If columns do not exist, then roughly calculate those columns.
   if(any(!c('norm_r2', 'norm_mae') %in% colnames(normalized_counts))) {
     print('WARNING: Columns "norm_r2" and/or "norm_mae" were not detected in normalized_counts.', quote= FALSE)
     print('Calculating both columns - this method may not be as robust as the normalize module.')
@@ -194,14 +229,26 @@
   return(trend_scatter_plot)
 }
 
-
 #' Heatmap of correlations
 #'
-#' text
+#' Creates a correlation heatmap. A matrix of values is created from the input_df. The row_id_cols
+#' are used to identify each row and the col_id_cols are used to identify each column. The value_col is
+#' used to fill the matrix. Correlations are then computed.
 #'
-#' @param input_df description
+#' @import tidyverse
+#' @import WGCNA
+#' @import reshape2
+#' @param input_df Dataframe.
+#' @param row_id_cols Vector of column names from input_df that identifies the cell lines. For example,
+#'                    this can be "DepMap_ID", "CCLE_name" if only cell lines exist. It can also be
+#'                    "DepMap_ID", "CCLE_name", "Name" if control barcodes are also present.
+#' @param col_id_cols Vector of column names from input_df that identifies the PCR wells or conditions.
+#'                    For example, this can be "pcr_plate", "pcr_well" or a list of conditions like those in sig_cols.
+#' @param value_col String name of the column in input_df to be used as the values.
+#' @param cor_method WGCNA correlation method. This defaults to "pearson".
+#' @returns Returns a ggplot object.
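+#'
+#' @examples
+#' # Hypothetical usage sketch (added for illustration; not part of the original
+#' # patch). Assumes a long dataframe with a log2-transformed counts column.
+#' \dontrun{
+#' create_cor_heatmap(input_df= cor_df,
+#'                    row_id_cols= c('DepMap_ID'),
+#'                    col_id_cols= c('pcr_plate', 'pcr_well'),
+#'                    value_col= 'log2_n')
+#' }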
 create_cor_heatmap= function(input_df, row_id_cols, col_id_cols, value_col,
-                             cor_method= 'pearson') {
+                             cor_method= 'pearson') {
 
   # Validate that specified columns are in the dataframe.
   if(!validate_columns_exist(c(row_id_cols, col_id_cols, value_col), input_df)) {
@@ -239,11 +286,31 @@
   return(cor_heatmap)
 }
 
-#' Scatterplots of two replicates
+#' Scatter plots of two replicates
+#'
+#' From a long table, creates scatter plots of two replicates.
 #'
-#' @param input_df description
+#' @param input_df Dataframe.
+#' @param cell_line_cols List of column names used to identify each cell line or control barcode.
+#' @param replicate_group_cols List of column names that describe a group of similar conditions.
+#' @param replicate_col Name of the column that specifies the replicate. This column should not be
+#'                      in replicate_group_cols!
+#' @param value_col Name of the column in input_df that contains the values.
+#' @param x_axis_rep String of the replicate identifier that should be on the x axis of the plot.
+#' @param y_axis_rep String of the replicate identifier that should be on the y axis of the plot.
+#' @returns Returns a ggplot object or NULL if all entries are filtered out.
 create_replicate_scatterplots= function(input_df, cell_line_cols, replicate_group_cols, replicate_col, value_col,
-                                        x_axis_rep= '1', y_axis_rep= '2') {
+                                        x_axis_rep= '1', y_axis_rep= '2') {
+  # Validation: Check that input columns are present in the dataframe.
+  if(!validate_columns_exist(c(cell_line_cols, replicate_group_cols, replicate_col, value_col), input_df)) {
+    stop('Some input columns were not detected in the input dataframe.')
+  }
+
+  # Validation: Check that replicate_col is not in replicate_group_cols.
+  if(replicate_col %in% replicate_group_cols) {
+    stop(paste0(replicate_col, ' should not be included in replicate_group_cols!'))
+  }
+
   reps_piv= input_df %>%
     tidyr::unite(all_of(replicate_group_cols), col= 'replicate_group', sep= ':', remove= TRUE, na.rm= FALSE) %>%
     dplyr::group_by(pick(all_of(c(cell_line_cols, 'replicate_group')))) %>%
@@ -276,38 +343,36 @@
   return(reps_scatter)
 }
 
-#' QC_images
+#' QC_images
+#'
+#' Takes in various pipeline outputs and generates 11 QC files.
 #'
-#' Takes in the metadata, raw counts, annotated counts, and normalized counts to generate some QC images.
-#'
-#' @param sample_meta - sample metadata
-#' @param annotated_counts - dataframe of annotated readcounts that must include the following columns:
-#'            n: raw readcounts
-#'            profile_id: string unique to each sample as defined by filter_counts method
-#'            Name: name of the control barcode that the read corresponds to, or NA (if read is cell line)
-#'            CCLE_name: name of the cell line that the read corresponds to, or NA (if read is control barcode)
-#'            cell_set: string identifier of cell set expected in a given sample, must match a cell set
-#'                      found in cell_set_meta
-#' @param normalized_counts -
-#' @param CB_meta - control barcode metadata
-#' @param cell_set_meta - a metadata dataframe that contains a mapping from cell set names (e.g. CS5) to
-#'                        lists of LUAs in that cell set separated by semicolons
-#' @param out - the filepath to the folder in which QC images are meant to be saved, NA by default and
-#'              images are saved in the working directory
-#' @param id_cols
-#' @param sig_cols -
-#' @param count_col_names - which counts to plot
-#' @param control_type - how the negative controls are designated in the trt_type column in the sample metadata
-#' @param count_threshold - threshold for low counts
-#' @param reverse_index2 reverse index 2 if newer sequencers are used.
-#' @return - NA, QC images are written out to the specified folder
-#' @export
-QC_images = function(raw_counts_uncollapsed, raw_counts,
-                     filtered_counts, normalized_counts= NA, l2fc,
-                     sample_meta, CB_meta, cell_set_meta,
-                     id_cols, sig_cols, count_col_name= 'normalized_n',
-                     control_type, count_threshold= 40,
-                     reverse_index2= FALSE, out = NA) {
+#' @param raw_counts_uncollapsed Dataframe output from nori. This is used to generate purity metrics and
+#'                               the index summaries.
+#' @param raw_counts Raw counts dataframe from the collate_fastq_reads module. This is used to generate purity metrics.
+#' @param annotated_counts Annotated counts dataframe from the filter_raw_reads module.
+#' @param filtered_counts Filtered counts dataframe from the filter_raw_reads module.
+#' @param normalized_counts Normalized counts dataframe from the normalize module. This is an optional parameter.
+#' @param l2fc L2FC dataframe from the compute_l2fc module. This is used for the bio_reps plot.
+#' @param sample_meta Dataframe of the sample metadata for the sequencing run.
+#' @param CB_meta Dataframe of the control barcode metadata. This is only used for the CDF plot.
+#' @param cell_set_meta Dataframe of the cell set metadata. This is only used for the CDF plot.
+#' @param cell_line_cols Vector of sample meta column names used to describe a cell line or barcode.
+#' @param id_cols Vector of sample meta column names used to identify each PCR well.
+#'                This defaults to "pcr_plate", "pcr_well".
+#' @param sig_cols Vector of sample meta column names used to identify a unique treatment condition.
+#' @param control_type String of how the negative controls are designated in the trt_type column in the sample_meta.
+#' @param count_threshold Threshold for low read counts.
+#' @param reverse_index2 Boolean set to TRUE if the sequencing involved the reverse complement workflow.
+#' @param out Path to the directory to save the QC images.
+#' @returns NA. QC images are written out to the specified folder. 
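+#'
+#' @examples
+#' # Hypothetical usage sketch (added for illustration; not part of the original
+#' # patch), mirroring how filteredCounts_QC.R calls this function.
+#' \dontrun{
+#' QC_images(raw_counts_uncollapsed, raw_counts, annotated_counts, filtered_counts,
+#'           normalized_counts= normalized_counts, l2fc= l2fc,
+#'           sample_meta= sample_meta, CB_meta= CB_meta, cell_set_meta= cell_set_meta,
+#'           cell_line_cols= c('DepMap_ID', 'CCLE_name'),
+#'           id_cols= c('pcr_plate', 'pcr_well'), sig_cols= sig_cols,
+#'           control_type= 'negcon', count_threshold= 40, out= getwd())
+#' }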
+QC_images= function(raw_counts_uncollapsed, raw_counts, + annotated_counts, filtered_counts, normalized_counts= NA, l2fc, + sample_meta, CB_meta, cell_set_meta, + cell_line_cols, + id_cols= c('pcr_plate', 'pcr_well'), sig_cols, + control_type= 'negcon', count_threshold= 40, + reverse_index2= FALSE, out = NA) { require(tidyverse) require(magrittr) require(reshape2) @@ -321,12 +386,6 @@ QC_images = function(raw_counts_uncollapsed, raw_counts, skipped_qcs= c() # empty vector to collect potential errors num_profiles = annotated_counts %>% dplyr::distinct(pick(all_of(id_cols))) %>% nrow() - # Reverse index 2 barcodes - if(reverse_index2) { - print("Reverse-complementing index 2 barcode.") - sample_meta$index_2= chartr("ATGC", "TACG", stringi::stri_reverse(sample_meta$index_2)) - } - # Detect control barcodes cb_check= sample_meta %>% dplyr::filter(control_barcodes %in% c("Y", "T", T), @@ -368,6 +427,12 @@ QC_images = function(raw_counts_uncollapsed, raw_counts, } # Do the same for index 2. + # Reverse index 2 barcodes if needed. + if(reverse_index2) { + print("Reverse-complementing index 2 barcode.") + sample_meta$index_2= chartr("ATGC", "TACG", stringi::stri_reverse(sample_meta$index_2)) + } + if('index_2' %in% colnames(sample_meta) & 'index_2' %in% colnames(raw_counts_uncollapsed)) { expected_index2= unique(sample_meta$index_2) index2_counts= get_index_summary(raw_counts_uncollapsed, 'index_2', expected_index2) @@ -726,7 +791,7 @@ QC_images = function(raw_counts_uncollapsed, raw_counts, print('11. No biological replicates detected. Skipping bio_rep heatmap.') } } - + # End _________________________ ---- print('QC finishing') if(length(na.omit(skipped_qcs)) != 0) { From 2435648fdda0b7698e7a11e259716ebcab46697d Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Mon, 19 Aug 2024 17:31:07 -0400 Subject: [PATCH 009/127] Update module calls --- scripts/collate_fastq_reads.R | 4 ++-- scripts/filter_counts.R | 25 ++++++++----------------- 2 files changed, 10 insertions(+), 19 deletions(-) diff --git a/scripts/collate_fastq_reads.R b/scripts/collate_fastq_reads.R index 29f4d994..ea2ac734 100644 --- a/scripts/collate_fastq_reads.R +++ b/scripts/collate_fastq_reads.R @@ -54,8 +54,8 @@ if(file.exists(expected_file_path)) { print("Collating fastq reads ...") raw_counts= collate_fastq_reads(uncollapsed_raw_counts, sample_meta, - sequencing_index_cols, - id_cols, + sequencing_index_cols= sequencing_index_cols, + id_cols= id_cols, reverse_index2= args$reverse_index2, barcode_col= args$barcode_col) diff --git a/scripts/filter_counts.R b/scripts/filter_counts.R index 11958c7d..61b18f31 100755 --- a/scripts/filter_counts.R +++ b/scripts/filter_counts.R @@ -36,11 +36,9 @@ parser$add_argument("--cell_line_meta", default="cell_line_meta.csv", help = "Ce parser$add_argument("--cell_set_meta", default="cell_set_meta.csv", help = "Cell set metadata") parser$add_argument("--assay_pool_meta", default="assay_pool_meta.txt", help = "Assay pool metadata") parser$add_argument("--CB_meta", default="../metadata/CB_meta.csv", help = "Control Barcode metadata") -parser$add_argument("--sequencing_index_cols", default= "index_1,index_2", +parser$add_argument("--id_cols", default= "pcr_plate,pcr_well", help = "Sequencing columns in the sample meta") parser$add_argument("--count_threshold", default= 40, help = "Low counts threshold") -parser$add_argument("--reverse_index2", action="store_true", default=FALSE, - help = "Reverse complement of index 2 for NovaSeq and NextSeq") parser$add_argument("--rm_data", action="store_true", 
default=FALSE, help = "Remove bad experimental data") parser$add_argument("--pool_id", action="store_true", default=FALSE, help = "Pull pool IDs from CellDB.") parser$add_argument("--control_type", default="negcon", @@ -64,7 +62,7 @@ raw_counts= data.table::fread(args$raw_counts, header= T, sep= ',', data.table= # Convert strings to vectors ---- # Also check that column names are present in the sample meta. -sequencing_index_cols= unlist(strsplit(args$sequencing_index_cols, ",")) +id_cols= unlist(strsplit(args$id_cols, ",")) if (!all(sequencing_index_cols %in% colnames(sample_meta))){ stop(paste("All seq columns not found in sample_meta, check metadata or --sequencing_index_cols argument:", args$sequencing_index_cols)) @@ -93,14 +91,12 @@ cell_line_meta %<>% # Run filter_raw_reads ----- print("creating filtered count file") -filtered_counts = filter_raw_reads(raw_counts, - sample_meta, - cell_line_meta, - cell_set_meta, - CB_meta, - sequencing_index_cols= sequencing_index_cols, - count_threshold= as.numeric(args$count_threshold), - reverse_index2= args$reverse_index2) +filtered_counts = filter_raw_reads(raw_counts= raw_counts, sample_meta= sample_meta, + cell_line_meta= cell_line_meta, + cell_set_meta= cell_set_meta, + CB_meta= CB_meta, + id_cols= id_cols, + count_threshold= as.numeric(args$count_threshold)) # Pulling pool_id when db_flag and pool_id flags are passed if (args$pool_id) { @@ -129,11 +125,6 @@ if(sum(cl_entries$n) == 0) { } # Write out module outputs ---- -qc_table = filtered_counts$qc_table -qc_out_file = paste(args$out, 'QC_table.csv', sep='/') -print(paste("writing QC_table to: ", qc_out_file)) -write.csv(qc_table, qc_out_file, row.names=F, quote=F) - unmapped_reads= filtered_counts$unmapped_reads unmapped_out = paste(args$out, 'unmapped_reads.csv', sep='/') print(paste("writing unmapped reads to: ", unmapped_out)) From 16966e9dd48373dc770a1815349d61cd43e09aec Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Mon, 19 Aug 2024 17:31:23 -0400 Subject: [PATCH 010/127] Update function documentation --- scripts/src/collate_fastq_reads.R | 27 +++++++--- scripts/src/filter_raw_reads.R | 83 +++++++++++++------------------ 2 files changed, 54 insertions(+), 56 deletions(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index c74e4861..26f0063f 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -77,17 +77,28 @@ validate_detected_flowcells= function(detected_flowcells, expected_flowcells) { #' collate_fastq_reads #' #' This function takes in the fastq reads (uncollapsed_raw_counts) and -#' filters for reads coming from flowcells specificed in the sample meta. -#' The function then sums up the reads across specified sequencing index columns. +#' filters for reads coming from flowcells specified in the sample meta. +#' The function then sums up the reads across specified sequencing index columns and +#' maps the sequencing index columns to the ID columns. #' -#' @param uncollapsed_raw_counts Data frame of reads from all the fastq files with the following columns - \cr -#' "flowcell_name", "flowcell_lane", "index_1", "index_2", and "forward_read_cl_barcode", "n" +#' @param uncollapsed_raw_counts Dataframe of reads from all the fastq files with the following columns - +#' "flowcell_name", "flowcell_lane", "index_1", "index_2", "forward_read_cl_barcode", and "n". 
#' @param sample_meta Sample metadata generate for the project which may contain the following columns - -#' "flowcell_names", "flowcell_lanes", "index_1", "index_2". The sample meta must contain +#' "flowcell_names", "flowcell_lanes", "index_1", "index_2". The sample meta MUST contain #' "flowcell_names" and "flowcell_lanes" for filtering. -#' @param sequencing_index_cols Sequencing columns from the sample meta that the counts should be collapsed on. \cr -#' This defaults onto the following columns: "index_1", "index_2" -#' @returns Returns a dataframe with columns specified by the sequencing_index_cols, "forward_read_cl_barcode", and "n". +#' @param sequencing_index_cols Sequencing columns from the sample meta that the counts should be collapsed on. +#' These columns should be a subset of the four sequencing related columns in the +#' sample meta - "flowcell_names", "flowcell_lanes", "index_1", and "index_2". They +#' should also uniquely identify every PCR well. This parameter defaults onto +#' the following columns: "index_1", "index_2". +#' @param id_cols ID columns from the sample meta that uniquely identify every PCR well. These columns should not +#' include any sequencing related columns. This parameter defaults onto "pcr_plate", "pcr_well". This +#' parameter can also be a list of the sample conditions columns as long as they uniquely identify every +#' PCR well. For example "cell_set", "treatment", "dose", "day", "bio_rep", "tech_rep" can also be used. +#' @param reverse_index2 Index 2 should be reversed if the sequencer uses a reverse complement workflow. +#' Defaults to FALSE. +#' @param barcode_col String name of the column in uncollapsed_raw_counts that contains the sequences. +#' @returns Returns a dataframe with columns specified by the id_cols along with barcode_col, and "n". #' @import tidyverse collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, sequencing_index_cols= c('index_1', 'index_2'), diff --git a/scripts/src/filter_raw_reads.R b/scripts/src/filter_raw_reads.R index e29f811a..1cf909b1 100755 --- a/scripts/src/filter_raw_reads.R +++ b/scripts/src/filter_raw_reads.R @@ -64,48 +64,33 @@ validate_cell_set_luas= function(sample_meta, cell_set_meta) { #' filter raw reads #' -#' takes the raw readcount table and filters for expected indices and cell lines -#' using the given metadata. QC metrics are returned as a data.frame +#' Takes the raw readcount table and filters for expected indices and cell lines +#' using the given metadata. #' -#' @param raw_counts - an unfiltered counts table -#' @param sample_meta - the sample metadata for the particular experiment. Must follow the given set of -#' guidelines for metadata. Required columns include: -#' - index_1 -#' - index_2 -#' - cell_set -#' @param cell_line_meta - master metadata of cell lines with the following required columns: -#' - CCLE_name -#' - DepMap_ID -#' - LUA -#' - Sequence -#' @param cell_set_meta - master metdata of cell sets and their contents with the following required columns: -#' - cell_set -#' - members -#' @param CB_meta - master metdata of control barcodes, their sequences, and their doses. -#' The file should contain the columns: -#' - Sequence -#' - Name -#' - log_dose -#' @param sequencing_index_cols Vector of column names from the sample meta that is used to uniquely identify where a -#' sequencing read is coming from. This defaults to "index_1" and "index_2", but -#' it can be expanded to include "flowcell_names" and "flowcell_lanes". 
-#' @param reverse_index2 Reverses index2 for certain sequencers -#' @param count_threshold Threshold to call low counts. -#' @param control_type - how the negative controls are designated in the trt_type column in the sample metadata -#' @return - list with the following elements +#' @param raw_counts Dataframe of reads. The columns of this dataframe should include the id_cols, +#' "forward_read_cl_barcode", and "n". +#' @param sample_meta Dataframe of the metadata for the sequencing run. This file should contain the id_cols, +#' "cell_set", "control_barcodes", etc. +#' @param cell_line_meta Master metadata of cell lines with the following required columns - "CCLE_name", +#' "DepMap_ID", "LUA", and "Sequence". +#' @param cell_set_meta Master metadata of cell sets and their contents with the following required columns - +#' "cell_set" and "members". +#' @param CB_meta Master metadata of control barcodes, their sequences, and their doses. The file should contain +#' the columns - "Sequence", "Name", and "log_dose". +#' @param id_cols Columns present in both raw_counts and sample_meta that uniquely identify each PCR well. +#' This defaults to "pcr_plate", "pcr_well". +#' @param count_threshold Threshold to call low counts. This defaults to 40. +#' @returns List with the following elements: #' #' \itemize{ #' \item unmapped_reads: table of reads with valid index pairs but did not map to any known barcode. -#' The table contains the following columns - index_1, index_2, forward_read_cl_barcode, and n -#' \item annotated_counts: table of reads and the associated well and well conditions +#' The table contains the following columns - id_cols, "forward_read_cl_barcode", and "n". +#' \item annotated_counts: table of reads and the associated well and well conditions. #' \item filtered_counts: table of all expected reads for the project, this is a subset of annotated counts. -#' \item qc_table: QC table of index_purity and cell_line_purity #' } -#' @export filter_raw_reads = function(raw_counts, sample_meta, cell_line_meta, cell_set_meta, CB_meta, - id_cols= c('pcr_plate', 'pcr_well'), - reverse_index2= FALSE, count_threshold= 40) { - + id_cols= c('pcr_plate', 'pcr_well'), + count_threshold= 40) { require(tidyverse) require(magrittr) @@ -116,15 +101,6 @@ filter_raw_reads = function(raw_counts, CB_meta= CB_meta %>% dplyr::mutate(log2_dose= log_dose/log10(2)) %>% dplyr::select(-log_dose) } - # if(reverse_index2) { - # if ('index_2' %in% colnames(sample_meta)) { - # print("Reverse-complementing index 2 barcode ...") - # sample_meta$index_2 <- chartr("ATGC", "TACG", stringi::stri_reverse(sample_meta$index_2)) - # } else { - # stop('ERROR: Reverse index 2 is set to TRUE, but index_2 does not exists.') - # } - # } - # Validation: Check that id_cols exist in the sample meta ---- if(!validate_columns_exist(id_cols, sample_meta)) { stop('One or more id_cols is NOT present in the sample meta.') @@ -138,9 +114,18 @@ filter_raw_reads = function(raw_counts, # Validation: Check that cell sets do not contain duplicate LUAs ---- # This will produce a warning if a LUA appears in a cell set more than once! - # This currently does NOT result in an error. Error avoided using a distinct later in line 162 + # This currently does NOT result in an error. 
Error avoided using a distinct when creating the template
   validate_cell_set_luas(sample_meta, cell_set_meta)
 
+  # Split off unmapped reads ----
+  # Unmapped reads are defined as reads that are identified from valid PCR locations,
+  # but do not map to known barcodes in PRISM.
+  # Also sorts reads in descending order by read count.
+  print('Splitting off unmapped reads ...')
+  raw_counts %<>% dplyr::mutate(mapped= forward_read_cl_barcode %in% c(cell_line_meta$Sequence, CB_meta$Sequence))
+  unmapped_reads= raw_counts %>% dplyr::filter(mapped == FALSE) %>% dplyr::select(-mapped) %>%
+    dplyr::arrange(dplyr::desc(n))
+
   # Creating a template of all expected reads in the run ----
   # Use all 4 meta data files to create a "template" dataframe where
   # every row is a cell line that is expected in a PCR well.
@@ -168,14 +153,14 @@
   # Reads that do not match the template are contaminants, and
   # reads that are only present in the template are missing/not detected by PCR.
   print("Annotating reads ...")
-  annotated_counts= raw_counts %>%
+  annotated_counts= raw_counts %>% dplyr::filter(mapped) %>%
     dplyr::left_join(cell_line_meta, by= join_by('forward_read_cl_barcode'=='Sequence'),
                      relationship= 'many-to-one') %>%
     dplyr::left_join(CB_meta, by= join_by('forward_read_cl_barcode'=='Sequence'),
                      relationship= 'many-to-one') %>%
     dplyr::left_join(sample_meta, by= id_cols, relationship= 'many-to-one') %>%
     dplyr::full_join(template %>% dplyr::mutate(expected_read= T),
-                     by= c('forward_read_cl_barcode'='Sequence', intersect(colnames(template), colnames(.))),
+                     by= c('forward_read_cl_barcode'= 'Sequence', intersect(colnames(template), colnames(.))),
                      relationship= 'one-to-one') %>%
     # drop unneeded columns and fill in any new NAs from the merge
     dplyr::select(!any_of(c('prism_cell_set', 'members', 'mapped'))) %>%
@@ -201,7 +186,9 @@
     print('Warning: Low cell line purity!')
   }
 
-  return(list(annotated_counts= annotated_counts, filtered_counts= filtered_counts))
+  return(list(unmapped_reads= unmapped_reads,
+              annotated_counts= annotated_counts,
+              filtered_counts= filtered_counts))
 }
 
 # checks is a string can be numeric

From 86be77df3965ce8f4b5636fc254525067e52bdfb Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Tue, 20 Aug 2024 09:30:23 -0400
Subject: [PATCH 011/127] Update filter_counts.R

---
 scripts/filter_counts.R | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/filter_counts.R b/scripts/filter_counts.R
index 61b18f31..2d09f2c6 100755
--- a/scripts/filter_counts.R
+++ b/scripts/filter_counts.R
@@ -31,9 +31,9 @@
 parser$add_argument("-q", "--quietly", action="store_false",
 parser$add_argument("--wkdir", default=getwd(), help="Working directory")
 parser$add_argument("-c", "--raw_counts", default="raw_counts.csv", help = "path to file containing raw counts")
 parser$add_argument("-o", "--out", default="", help = "Output path. Default is working directory")
-parser$add_argument("-s", "--sample_meta", default="sample_meta.csv", help = "Sample metadata")
-parser$add_argument("--cell_line_meta", default="cell_line_meta.csv", help = "Cell Line metadata")
-parser$add_argument("--cell_set_meta", default="cell_set_meta.csv", help = "Cell set metadata")
+parser$add_argument("-s", "--sample_meta", default="sample_meta.csv", help= "Sample metadata")
+parser$add_argument("--cell_line_meta", default="cell_line_meta.csv", help= "Cell Line metadata")
+parser$add_argument("--cell_set_meta", default="cell_set_meta.csv", help= "Cell set metadata")
 parser$add_argument("--assay_pool_meta", default="assay_pool_meta.txt", help = "Assay pool metadata")
 parser$add_argument("--CB_meta", default="../metadata/CB_meta.csv", help = "Control Barcode metadata")
 parser$add_argument("--id_cols", default= "pcr_plate,pcr_well",

From 1458ed7c40f1ccaacc34989cf3704700804c8208 Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Tue, 20 Aug 2024 09:31:38 -0400
Subject: [PATCH 012/127] Update parameters

---
 scripts/filteredCounts_QC.R | 72 ++++++++++++++++++++++---------------
 1 file changed, 44 insertions(+), 28 deletions(-)

diff --git a/scripts/filteredCounts_QC.R b/scripts/filteredCounts_QC.R
index 2e9da455..3d83625f 100755
--- a/scripts/filteredCounts_QC.R
+++ b/scripts/filteredCounts_QC.R
@@ -22,25 +22,30 @@
 parser$add_argument("-v", "--verbose", action="store_true", default=TRUE, help="Print extra output")
 parser$add_argument("-q", "--quietly", action="store_false", dest="verbose", help="Print little output")
 parser$add_argument("--wkdir", default=getwd(), help="Working directory")
 parser$add_argument("-s", "--sample_meta", default="sample_meta.csv", help= "Sample metadata")
-parser$add_argument("--raw_counts", default= "raw_counts.csv", help="path to file containing raw counts")
-parser$add_argument("--annotated_counts", default="annotated_counts.csv",
-                    help="path to file containing annotated counts")
+parser$add_argument("-c", "--uncollapsed_raw_counts", default="raw_counts_uncollapsed.csv",
+                    help="path to file containing uncollapsed raw counts file")
+parser$add_argument("--raw_counts", default= "raw_counts.csv", help="path to raw counts file")
+parser$add_argument("--annotated_counts", default= "annotated_counts.csv",
+                    help= "path to file containing annotated counts")
+parser$add_argument("--filtered_counts", default= "filtered_counts.csv", help= "path to filtered_counts file")
 parser$add_argument("--normalized_counts", default="normalized_counts.csv",
                     help="path to file containing normalized counts")
+parser$add_argument("--l2fc", default="l2fc.csv", help= "path to l2fc file")
 parser$add_argument("--CB_meta", default="../metadata/CB_meta.csv", help = "control barcode metadata")
 parser$add_argument("--cell_set_meta", default="../metadata/cell_set_meta.csv", help = "Cell set metadata")
-parser$add_argument("-o","--out", default="", help = "Output path. 
Default is working directory") -parser$add_argument("--id_cols", default="cell_set,treatment,dose,dose_unit,day,bio_rep,tech_rep", - help = "Columns to identify each PCR well") +parser$add_argument("--cell_line_cols", default= 'DepMap_ID,CCLE_name', + help= "Columns that identify cell lines or barcodes") +parser$add_argument("--id_cols", default= 'pcr_plate,pcr_well', + help= "Columns to identify each PCR well") parser$add_argument("--sig_cols", default="cell_set,treatment,dose,dose_unit,day", - help = "columns used to generate signature ids") -parser$add_argument("--count_col_name", default="normalized_n", - help = "column containing counts with which to calculate l2fc") -parser$add_argument("--count_threshold", default=40, - help = "Low counts threshold") -parser$add_argument("--reverse_index2", default=FALSE, help = "Reverse index 2") + help= 'Columns used to identify the treatment conditions') parser$add_argument("--control_type", default = "negcon", - help = "how negative control wells are distinguished in the trt_type column") + help= "how negative control wells are distinguished in the trt_type column") +parser$add_argument("--count_threshold", default=40, help= "Low counts threshold") +parser$add_argument("--reverse_index2", default=FALSE, help = "Reverse index 2") +parser$add_argument("-o","--out", default="", help = "Output path. Default is working directory") + # parser$add_argument("--db_flag", action="store_true", default=FALSE, help = "Use CellDB to locate cell set information") # get command line options, if help option encountered print help and exit @@ -51,21 +56,29 @@ if (args$out == ""){ } # Read in files and pull out parameters ---- -sample_meta= data.table::fread(args$sample_meta, header=TRUE, sep=',', data.table=FALSE) -raw_counts= data.table::fread(args$raw_counts, header=TRUE, sep=',', data.table=FALSE) -annotated_counts= data.table::fread(args$annotated_counts, header=TRUE, sep=',', data.table=FALSE) +# Pipeline outputs +raw_counts_uncollapsed= data.table::fread(args$uncollapsed_raw_counts, header= TRUE, sep= ',') +raw_counts= data.table::fread(args$raw_counts, header= TRUE, sep= ',') +annotated_counts= data.table::fread(args$annotated_counts, header= TRUE, sep= ',') +filtered_counts= data.table::fread(args$filtered_counts, header= TRUE, sep= ',') if(file.exists(args$normalized_counts)) { normalized_counts= data.table::fread(args$normalized_counts, header=TRUE, sep=',', data.table=FALSE) } else { normalized_counts= NA } +l2fc= data.table::fread(args$l2fc, header= TRUE, sep= ',') + +# Metadata files +sample_meta= data.table::fread(args$sample_meta, header= TRUE, sep= ',', data.table= FALSE) CB_meta= data.table::fread(args$CB_meta, header=TRUE, sep=',', data.table=FALSE) -id_cols = unlist(strsplit(args$id_cols, ",")) -sig_cols = unlist(strsplit(args$sig_cols, ",")) -count_col_name = args$count_col_name -count_threshold = as.numeric(args$count_threshold) cell_set_meta = data.table::fread(args$cell_set_meta, header=TRUE, sep=',', data.table=FALSE) + +# Parameters +cell_line_cols = unlist(strsplit(args$cell_line_cols, ",")) +id_cols= unlist(strsplit(args$id_cols, ",")) +sig_cols= unlist(strsplit(args$sig_cols, ",")) control_type = args$control_type +count_threshold= as.numeric(args$count_threshold) # # If flag passed, use cell_set_meta file generated for the project via CellDB # if (args$db_flag) { @@ -78,16 +91,19 @@ control_type = args$control_type # } print("Generating QC images ...") -QC_images(raw_counts= raw_counts, +QC_images(raw_counts_uncollapsed= 
raw_counts_uncollapsed,
+          raw_counts= raw_counts,
           annotated_counts= annotated_counts,
-          normalized_counts= normalized_counts,
-          sample_meta= sample_meta,
+          filtered_counts= filtered_counts,
+          normalized_counts= normalized_counts,
+          l2fc= l2fc,
+          sample_meta= sample_meta,
           CB_meta= CB_meta,
-          cell_set_meta= cell_set_meta,
-          id_cols= id_cols,
+          cell_set_meta= cell_set_meta,
+          cell_line_cols= cell_line_cols,
+          id_cols= id_cols,
           sig_cols= sig_cols,
-          count_col_name= count_col_name,
-          control_type = control_type,
-          count_threshold= count_threshold,
+          control_type= control_type,
+          count_threshold= count_threshold,
           reverse_index2= args$reverse_index2,
           out= args$out)
From 1dba7688f51bc1582a0454bdf993e383e5a830cc Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Tue, 20 Aug 2024 11:04:14 -0400
Subject: [PATCH 013/127] Fixed contams_reads output

---
 scripts/src/QC_images.R | 43 ++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 22 deletions(-)

diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R
index 273aad88..a8ec5bac 100755
--- a/scripts/src/QC_images.R
+++ b/scripts/src/QC_images.R
@@ -510,43 +510,43 @@ QC_images= function(raw_counts_uncollapsed, raw_counts,
   ## Contaminates for ursula ----
   print('6. Generating contaminate reads for Ursula ...')
   potential_error= base::tryCatch({
-    # Determine which seq cols are present.
-    rc_seq_cols= c('flowcell_names', 'flowcell_lanes', 'index_1', 'index_2')
-    present_seq_cols= intersect(rc_seq_cols, colnames(raw_counts))
+    pcr_locations= c('pcr_plate', 'pcr_well')
 
-    # map of seq_cols to PCR locations
-    pcr_plate_map= sample_meta %>%
-      dplyr::distinct(pick(any_of(c(present_seq_cols, 'pcr_plate', 'pcr_well', 'cell_set')))) %>%
+    # Validation: Check that the PCR columns are present in raw_counts.
+    if(!validate_columns_exist(pcr_locations, raw_counts)) {
+      stop('pcr_plate and pcr_well are required raw_counts.csv for this to work.')
+    }
+
+    # count number of wells a cell_set appears in. 
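+    # Two totals are tracked here: the number of wells in each pcr_plate and the
+    # number of wells in each cell_set; both are used further down to judge how
+    # widespread a contaminating barcode is.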
+    pcr_plate_map= sample_meta %>% dplyr::distinct(pick(any_of(c(pcr_locations, 'cell_set')))) %>%
       dplyr::group_by(pcr_plate) %>% dplyr::mutate(num_wells_in_plate= dplyr::n()) %>% dplyr::ungroup() %>%
       dplyr::group_by(cell_set) %>% dplyr::mutate(num_wells_in_set= dplyr::n()) %>% dplyr::ungroup()
-
+
     # index filter and identify reads as mapped or not
-    unique_seq_col_vals= sample_meta %>% dplyr::distinct(pick(all_of(present_seq_cols)))
-    sequencing_filter= raw_counts %>%
-      dplyr::semi_join(unique_seq_col_vals, by= present_seq_cols) %>%
-      dplyr::mutate(mapped= ifelse(forward_read_cl_barcode %in% unique(annotated_counts$forward_read_cl_barcode), T, F))
-
+    sequencing_filter= raw_counts %>%
+      dplyr::mutate(mapped= forward_read_cl_barcode %in% unique(annotated_counts$forward_read_cl_barcode))
+
     # total counts per well - used to calculate fractions
-    counts_per_well= sequencing_filter %>% dplyr::group_by(pick(all_of(present_seq_cols))) %>%
+    counts_per_well= sequencing_filter %>% dplyr::group_by(pick(all_of(pcr_locations))) %>%
       dplyr::summarise(well_total_n= sum(n)) %>% dplyr::ungroup()
-
+
     # mapped contaminants to bind
     mapped_contams= annotated_counts %>% dplyr::filter(!expected_read) %>%
       dplyr::mutate(barcode_name= ifelse(is.na(CCLE_name), Name, CCLE_name)) %>%
-      dplyr::select(all_of(c(present_seq_cols, 'forward_read_cl_barcode', 'n', 'barcode_name')))
+      dplyr::select(all_of(c(pcr_locations, 'forward_read_cl_barcode', 'n', 'barcode_name')))
 
     contam_reads= sequencing_filter %>% dplyr::filter(mapped == FALSE) %>% dplyr::select(-mapped) %>%
       dplyr::bind_rows(mapped_contams) %>%
-      dplyr::left_join(counts_per_well, by= present_seq_cols) %>%
-      dplyr::left_join(pcr_plate_map, by= present_seq_cols) %>%
+      dplyr::left_join(counts_per_well, by= pcr_locations) %>%
+      dplyr::left_join(pcr_plate_map, by= pcr_locations) %>%
       # filter out barcodes that only appear in one well
      dplyr::group_by(forward_read_cl_barcode) %>% dplyr::filter(dplyr::n() >1) %>% dplyr::ungroup() %>%
       # number of wells in a pcr plate a barcode is detected in
       dplyr::group_by(forward_read_cl_barcode, pcr_plate) %>%
-      dplyr::mutate(num_wells_detected_plate= n()) %>% dplyr::ungroup() %>%
+      dplyr::mutate(num_wells_detected_plate= dplyr::n()) %>% dplyr::ungroup() %>%
       # number of wells in a cell set a barcode is detected in
       dplyr::group_by(forward_read_cl_barcode, cell_set) %>%
-      dplyr::mutate(num_wells_detected_set= n()) %>% dplyr::ungroup() %>%
+      dplyr::mutate(num_wells_detected_set= dplyr::n()) %>% dplyr::ungroup() %>%
       # determine if contamination is project, plate, or set
       dplyr::group_by(forward_read_cl_barcode) %>%
       dplyr::mutate(num_wells_detected= dplyr::n(),
@@ -774,8 +774,8 @@ QC_images= function(raw_counts_uncollapsed, raw_counts,
                                    cor_method= 'pearson')
     pdf(file=paste(out, "bio_corr_hm.pdf", sep="/"), width=sqrt(num_profiles), height=sqrt(num_profiles))
-    print(bio_corr_hm)
-    dev.off()
+    #print(bio_corr_hm)
+    #dev.off()
   }, error= function(e) {
     print(e)
     print('Encountered an error when creating the bio_corr_hm figure. 
Skipping this output ...') @@ -800,5 +800,4 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, } else { print('No errors encountered.') } - return(skipped_qcs) } From 6bb451cd985c590a1dacd2fd315950857b774cb0 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Tue, 20 Aug 2024 11:04:37 -0400 Subject: [PATCH 014/127] Updated scripts --- scripts/collate_fastq_reads.R | 1 + scripts/filter_counts.R | 6 ------ scripts/filteredCounts_QC.R | 1 + 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/scripts/collate_fastq_reads.R b/scripts/collate_fastq_reads.R index ea2ac734..198c5ad1 100644 --- a/scripts/collate_fastq_reads.R +++ b/scripts/collate_fastq_reads.R @@ -37,6 +37,7 @@ if(file.exists(expected_file_path)) { uncollapsed_raw_counts= data.table::fread(expected_file_path, header= T, sep= ',', data.table= F) sample_meta= data.table::fread(args$sample_meta, header= T, sep= ',', data.table= F) + # Parse vector inputs sequencing_index_cols= unlist(strsplit(args$sequencing_index_cols, ",")) id_cols= unlist(strsplit(args$id_cols, ",")) diff --git a/scripts/filter_counts.R b/scripts/filter_counts.R index 2d09f2c6..31849b77 100755 --- a/scripts/filter_counts.R +++ b/scripts/filter_counts.R @@ -61,13 +61,7 @@ sample_meta= data.table::fread(args$sample_meta, header= T, sep= ',', data.table raw_counts= data.table::fread(args$raw_counts, header= T, sep= ',', data.table= F) # Convert strings to vectors ---- -# Also check that column names are present in the sample meta. id_cols= unlist(strsplit(args$id_cols, ",")) -if (!all(sequencing_index_cols %in% colnames(sample_meta))){ - stop(paste("All seq columns not found in sample_meta, check metadata or --sequencing_index_cols argument:", - args$sequencing_index_cols)) -} - count_threshold = as.numeric(args$count_threshold) # make sure LUA codes in cell line meta are unique diff --git a/scripts/filteredCounts_QC.R b/scripts/filteredCounts_QC.R index 3d83625f..ce93e655 100755 --- a/scripts/filteredCounts_QC.R +++ b/scripts/filteredCounts_QC.R @@ -13,6 +13,7 @@ suppressPackageStartupMessages(library(ggplot2)) suppressPackageStartupMessages(library(ggpubr)) suppressPackageStartupMessages(library(scales)) # for out of bound handling in plots suppressPackageStartupMessages(library(ggpmisc)) # with ggplot to add fit line and labels +suppressPackageStartupMessages(library(WGCNA)) source("./src/QC_images.R") # source function # Argument parser ---- From c06f35a4c95578e2bb2bfb50836a8418419f530e Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Tue, 20 Aug 2024 13:42:06 -0400 Subject: [PATCH 015/127] Create joining_sample_meta_columns.R --- scripts/joining_sample_meta_columns.R | 58 +++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 scripts/joining_sample_meta_columns.R diff --git a/scripts/joining_sample_meta_columns.R b/scripts/joining_sample_meta_columns.R new file mode 100644 index 00000000..eb39d84f --- /dev/null +++ b/scripts/joining_sample_meta_columns.R @@ -0,0 +1,58 @@ +library(argparse) +library(tidyverse) +source("./src/join_sample_meta.R") + +# Argument parser ---- +parser <- ArgumentParser() +# specify our desired options +parser$add_argument('--sample_meta', default= 'sample_meta.csv', help= 'Sample meta data for the sequencing run.') +parser$add_argument('--l2fc', default= 'l2fc.csv', help= 'L2FC data.') # level 4 +parser$add_argument('--collapsed_l2fc', default= 'collapsed_l2fc.csv', help= 'Collapsed l2fc data.') # level 5 +parser$add_argument('--sig_cols', default= 'cell_set,treatment,dose,dose_unit,day', + help= 
'Columns that uniquely identify a condition.') +parser$add_argument('--out', default= getwd(), help= 'Path to the output directory.') + +args <- parser$parse_args() + +# set output to working directory if none is specified +if (args$out == "") { + args$out = args$wkdir +} + +# Prepare args ---- +sample_meta= data.table::fread(args$sample_meta, header= T, sep= ',') +sig_cols= unlist(strsplit(args$sig_cols, ",")) + +# Add in metadata for l2fc file ---- +if(file.exists(args$l2fc)) { + l2fc= data.table::fread(args$l2fc, header= T, sep= ',') + if('bio_rep' %in% sample_meta & 'bio_rep' %in% l2fc) { + input_cols= c(sig_cols, 'bio_rep') + } else { + input_cols= sig_cols + print('WARNING: No "bio_rep" column detected. Proceeding with just sig_cols.') + } + l2fc_with_sm= join_sample_meta(df= l2fc, sample_meta, key_cols= input_cols) + + # Write out + outpath= paste(args$out, 'l2fc_with_sm.csv', sep='/') + print(paste("Writing l2fc_with_sm.csv to ", outpath)) + write.csv(l2fc_with_sm, outpath, row.names= FALSE, quote= FALSE) +} else { + print('WARNING: l2fc.csv does not exist. Skipping this file.') +} +# + +# Add in metadata for collapsed_l2fc file ---- +if(file.exists(args$collapsed_l2fc)) { + collapsed_l2fc= data.table::fread(args$collapsed_l2fc, header= T, sep= ',') + collapsed_l2fc_with_sm= join_sample_meta(df= collapsed_l2fc, sample_meta, key_cols= sig_cols) + + # Write out + outpath= paste(args$out, 'collapsed_l2fc_with_sm.csv', sep='/') + print(paste("Writing collapsed_l2fc_with_sm.csv to ", outpath)) + write.csv(collapsed_l2fc_with_sm, outpath, row.names= FALSE, quote= FALSE) +} else { + print('WARNING: collapsed_l2fc.csv does not exist. Skipping this file.') +} +# From 0b53666d41be5d1c365396f0d3277d88471a0f6e Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Tue, 20 Aug 2024 13:42:39 -0400 Subject: [PATCH 016/127] Pick columns with 1 unique value per group --- scripts/src/join_sample_meta.R | 35 +++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/scripts/src/join_sample_meta.R b/scripts/src/join_sample_meta.R index 4fd9b43e..13f66b30 100644 --- a/scripts/src/join_sample_meta.R +++ b/scripts/src/join_sample_meta.R @@ -1,13 +1,18 @@ #' validate_columns_exist #' #' This function checks that a list of columns are present in a dataframe. +#' Columns that were not found in the dataframe are printed out. #' #' @param selected_columns A vector of strings each representing a column name #' @param df A dataframe to check against #' @return Boolean -validate_columns_exist= function(selected_columns, df) { - # Check that all of selected_columns are in df - if(any(!selected_columns %in% colnames(df))) { +validate_columns_exist= function(selected_cols, df) { + # Check that all of selected_columns are in df - base::setdiff(A, B) = A[!A %in% B]. 
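+  # The argument order matters: setdiff() is asymmetric, so this returns the
+  # requested columns that are absent from df, not the other way around.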
+ unmatched_cols= base::setdiff(selected_cols, colnames(df)) + + if(length(unmatched_cols) > 0) { + print('The following columns are missing: ') + print(unmatched_cols) return(FALSE) } else { return(TRUE) @@ -25,28 +30,32 @@ validate_columns_exist= function(selected_columns, df) { join_sample_meta= function(df, sample_meta, key_cols) { # Validation: Check that key_cols are present in df ---- if(validate_columns_exist(key_cols, df) == FALSE) { - print(key_cols) stop('Not all key_cols (printed above) are present in the provided data frame.') } # Validation: Check that key_cols are present in the sample meta ---- if(validate_columns_exist(key_cols, sample_meta) == FALSE) { - print(key_cols) stop('Not all key_cols (printed above) are present in the sample meta.') } # Collapse the sample meta using key_cols and join onto the input df ---- + # Collapse unique values into a single row and then filter out columns with the separator. + # Columns with only one unique value in a group are selected. collapsed_metadata= sample_meta %>% dplyr::group_by(pick(all_of(key_cols))) %>% - dplyr::summarise(across(everything(), function(x) paste(sort(unique(x)), collapse= ' | '))) %>% dplyr::ungroup() + dplyr::summarise(across(everything(), function(x) paste(sort(unique(x)), collapse= ':::'))) %>% dplyr::ungroup() %>% + dplyr::select(all_of(key_cols), where(function(x) base::any(!grepl(':::', x)))) - expanded_df= df %>% dplyr::left_join(collapsed_metadata, by= key_cols, relationship='many-to-one') + expanded_df= dplyr::left_join(df, collapsed_metadata, + by= base::intersect(colnames(df), colnames(collapsed_metadata)), + relationship='many-to-one') - # Validation: Check for duplicate columns ---- - duplicate_columns= setdiff(c(colnames(df), colnames(sample_meta)), colnames(expanded_df)) - if(length(duplicate_columns > 0)) { - print("WARNING: The following column(s) appear in the dataframe and the sample meta, but not in key_cols.") - print(duplicate_columns) - print('The columns(s) thus appear twice in the output dataframe.') + # Print out the sample meta columns that were added to the dataframe ---- + added_cols= base::setdiff(colnames(expanded_df), colnames(df)) + if(length(added_cols > 0)) { + print('The following columns from the sample meta were added:') + print(added_cols) + } else { + print('No additional columns from the sample meta were added.') } return(expanded_df) From bbb2e66218f31f1556687a4f8d752b8ac2503ab2 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Thu, 22 Aug 2024 20:27:58 -0400 Subject: [PATCH 017/127] Allow for ctrl_cols to exclude sig_cols --- scripts/src/compute_l2fc.R | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/scripts/src/compute_l2fc.R b/scripts/src/compute_l2fc.R index eeadafb3..f89d9797 100755 --- a/scripts/src/compute_l2fc.R +++ b/scripts/src/compute_l2fc.R @@ -51,11 +51,6 @@ compute_l2fc= function(normalized_counts, stop('Not all cell_line_cols (printed above) are present in normalized_counts.') } - # Validation: Check that ctrl_cols are in sig_cols ---- - if(!all(ctrl_cols %in% sig_cols)) { - stop('Control columns are not a subset of sig columns.') - } - # Collapsing technical replicates ---- # Detect bio_rep column to be used to collapse technical replicates if('bio_rep' %in% colnames(normalized_counts)) { @@ -67,10 +62,11 @@ compute_l2fc= function(normalized_counts, } # collapse tech reps - print('Collapsing technical replicates ...') + print('Collapsing technical replicates on the following columns: ') + print(unique(c(cell_line_cols, 
'trt_type', bio_rep_id_cols, ctrl_cols))) collapsed_tech_rep= normalized_counts %>% dplyr::filter(!(trt_type %in% c("empty", "", "CB_only")) & !is.na(trt_type), !is.na(CCLE_name)) %>% - dplyr::group_by(pick(all_of(c(cell_line_cols, 'trt_type', bio_rep_id_cols)))) %>% + dplyr::group_by(pick(all_of(c(cell_line_cols, 'trt_type', bio_rep_id_cols, ctrl_cols)))) %>% dplyr::summarise(mean_n= mean(n), mean_normalized_n = mean(!!rlang::sym(count_col_name)), num_tech_reps= dplyr::n()) %>% dplyr::ungroup() @@ -81,6 +77,8 @@ compute_l2fc= function(normalized_counts, dplyr::summarise(count= dplyr::n()) %>% dplyr::ungroup()) # Pull out negative controls and collapse any biological replicates ---- + print('Collapsing control conditions on the following columns: ') + print(unique(c(cell_line_cols, ctrl_cols))) controls= collapsed_tech_rep %>% dplyr::filter(trt_type== control_type) %>% dplyr::group_by(pick(all_of(c(cell_line_cols, ctrl_cols)))) %>% dplyr::summarise(control_median_n= median(mean_n), From abdda0c29358c048f8b52c18a5ac4cb2e8630848 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 23 Aug 2024 10:08:31 -0400 Subject: [PATCH 018/127] Add cell set meta --- scripts/joining_sample_meta_columns.R | 10 ++++++---- scripts/src/join_sample_meta.R | 21 ++++++++++++++++----- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/scripts/joining_sample_meta_columns.R b/scripts/joining_sample_meta_columns.R index eb39d84f..50b1e003 100644 --- a/scripts/joining_sample_meta_columns.R +++ b/scripts/joining_sample_meta_columns.R @@ -6,6 +6,7 @@ source("./src/join_sample_meta.R") parser <- ArgumentParser() # specify our desired options parser$add_argument('--sample_meta', default= 'sample_meta.csv', help= 'Sample meta data for the sequencing run.') +parser$add_argument('--cell_set_meta', default= 'cell_set_meta.csv', help= 'Cell set metadata for the sequencing run.') parser$add_argument('--l2fc', default= 'l2fc.csv', help= 'L2FC data.') # level 4 parser$add_argument('--collapsed_l2fc', default= 'collapsed_l2fc.csv', help= 'Collapsed l2fc data.') # level 5 parser$add_argument('--sig_cols', default= 'cell_set,treatment,dose,dose_unit,day', @@ -21,6 +22,7 @@ if (args$out == "") { # Prepare args ---- sample_meta= data.table::fread(args$sample_meta, header= T, sep= ',') +cell_set_meta= data.table::fread(args$cell_set_meta, header= T, sep= ',') sig_cols= unlist(strsplit(args$sig_cols, ",")) # Add in metadata for l2fc file ---- @@ -32,7 +34,7 @@ if(file.exists(args$l2fc)) { input_cols= sig_cols print('WARNING: No "bio_rep" column detected. 
Proceeding with just sig_cols.')
   }
-  l2fc_with_sm= join_sample_meta(df= l2fc, sample_meta, key_cols= input_cols)
+  l2fc_with_sm= join_sample_meta(df= l2fc, sample_meta, cell_set_meta, key_cols= input_cols)
 
   # Write out
   outpath= paste(args$out, 'l2fc_with_sm.csv', sep='/')
@@ -46,11 +48,11 @@ if(file.exists(args$l2fc)) {
 # Add in metadata for collapsed_l2fc file ----
 if(file.exists(args$collapsed_l2fc)) {
   collapsed_l2fc= data.table::fread(args$collapsed_l2fc, header= T, sep= ',')
-  collapsed_l2fc_with_sm= join_sample_meta(df= collapsed_l2fc, sample_meta, key_cols= sig_cols)
+  collapsed_l2fc_with_sm= join_sample_meta(df= collapsed_l2fc, sample_meta, cell_set_meta, key_cols= sig_cols)
 
   # Write out
-  outpath= paste(args$out, 'collapsed_l2fc_with_sm.csv', sep='/')
-  print(paste("Writing collapsed_l2fc_with_sm.csv to ", outpath))
+  outpath= paste(args$out, 'collapsed_l2fc_with_metadata.csv', sep='/')
+  print(paste("Writing collapsed_l2fc_with_metadata.csv to ", outpath))
   write.csv(collapsed_l2fc_with_sm, outpath, row.names= FALSE, quote= FALSE)
 } else {
   print('WARNING: collapsed_l2fc.csv does not exist. Skipping this file.')
 }
diff --git a/scripts/src/join_sample_meta.R b/scripts/src/join_sample_meta.R
index 13f66b30..9a20e931 100644
--- a/scripts/src/join_sample_meta.R
+++ b/scripts/src/join_sample_meta.R
@@ -23,11 +23,12 @@ validate_columns_exist= function(selected_cols, df) {
 #' join_sample_meta
 #'
 #' Joins a given data frame with the sample meta.
 #'
-#' @param df
-#' @param sample_meta Dataframe of the sample meta used in the run
+#' @param df Input dataframe that should contain the columns specified in the "key_cols" parameter and "cell_set".
+#' @param sample_meta Dataframe of the sample meta used in the run.
+#' @param cell_set_meta Dataframe of the cell set metadata used in the run. This should contain the "cell_set" column.
 #' @param key_cols Vector of column names used as identifiers in the sample meta.
-#' @returns Data frame with additional columns from the sample meta
-join_sample_meta= function(df, sample_meta, key_cols) {
+#' @returns Data frame with additional columns from the sample meta.
+join_sample_meta= function(df, sample_meta, cell_set_meta, key_cols) {
   # Validation: Check that key_cols are present in df ----
   if(validate_columns_exist(key_cols, df) == FALSE) {
     stop('Not all key_cols (printed above) are present in the provided data frame.')
@@ -38,10 +39,20 @@ join_sample_meta= function(df, sample_meta, key_cols) {
     stop('Not all key_cols (printed above) are present in the sample meta.')
   }
 
+  # Validation: Check that cell_set exists in df and cell_set meta ----
+  if(validate_columns_exist(c('cell_set'), sample_meta) == FALSE) {
+    stop('The cell_set column is NOT present in the sample meta.')
+  }
+
+  if(validate_columns_exist(c('cell_set'), cell_set_meta) == FALSE) {
+    stop('The cell_set column is NOT present in the cell set meta.')
+  }
+
   # Collapse the sample meta using key_cols and join onto the input df ----
   # Collapse unique values into a single row and then filter out columns with the separator.
   # Columns with only one unique value in a group are selected. 
-  collapsed_metadata= sample_meta %>% dplyr::group_by(pick(all_of(key_cols))) %>%
+  collapsed_metadata= sample_meta %>% dplyr::left_join(cell_set_meta, by= 'cell_set') %>%
+    dplyr::group_by(pick(all_of(key_cols))) %>%
     dplyr::summarise(across(everything(), function(x) paste(sort(unique(x)), collapse= ':::'))) %>% dplyr::ungroup() %>%
     dplyr::select(all_of(key_cols), where(function(x) base::any(!grepl(':::', x))))
 
From f2ed2d7fff18bf6dc35a3a11e2fe4f8bb58887d6 Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Mon, 26 Aug 2024 14:37:20 -0400
Subject: [PATCH 019/127] Updates from podman PR comments

---
 scripts/src/QC_images.R | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R
index a8ec5bac..2b6afc1d 100755
--- a/scripts/src/QC_images.R
+++ b/scripts/src/QC_images.R
@@ -427,8 +427,8 @@ QC_images= function(raw_counts_uncollapsed, raw_counts,
   }
 
   # Do the same for index 2.
-  # Reverse index 2 barcodes if needed.
-  if(reverse_index2) {
+  # Reverse index 2 barcodes if it is indicated and if "index_2" exists
+  if(reverse_index2 & 'index_2' %in% colnames(sample_meta)) {
     print("Reverse-complementing index 2 barcode.")
     sample_meta$index_2= chartr("ATGC", "TACG", stringi::stri_reverse(sample_meta$index_2))
   }
@@ -485,8 +485,8 @@ QC_images= function(raw_counts_uncollapsed, raw_counts,
     skipped_qcs = c(skipped_qcs, potential_error)
   }
 
-  ## Contaminants ----
-  print('5. Generating contaminate cell lines ...')
+  ## Cell line contaminants ----
+  print('5. Generating cell line contaminants ...')
   potential_error= base::tryCatch({
     contams= annotated_counts %>% dplyr::filter(expected_read==F) %>%
       dplyr::mutate(barcode_id= ifelse(is.na(CCLE_name), Name, CCLE_name)) %>%
@@ -507,14 +507,14 @@ QC_images= function(raw_counts_uncollapsed, raw_counts,
     skipped_qcs = c(skipped_qcs, potential_error)
   }
 
-  ## Contaminates for ursula ----
-  print('6. Generating contaminate reads for Ursula ...')
+  ## Contaminant reads ----
+  print('6. Generating contaminant reads ...')
   potential_error= base::tryCatch({
     pcr_locations= c('pcr_plate', 'pcr_well')
 
     # Validation: Check that the PCR columns are present in raw_counts.
     if(!validate_columns_exist(pcr_locations, raw_counts)) {
-      stop('pcr_plate and pcr_well are required raw_counts.csv for this to work.')
+      stop('pcr_plate and pcr_well are required in raw_counts.csv for this to work.')
     }
 
     # count number of wells a cell_set appears in.
@@ -568,8 +568,8 @@ QC_images= function(raw_counts_uncollapsed, raw_counts,
     contam_reads %>% write.csv(paste0(out, 'contam_reads.csv'), row.names=F)
   }, error= function(e) {
     print(e)
-    print('Encountered an error when creating the contams for UW file. Skipping this output ...')
-    return('contam for UW')
+    print('Encountered an error when creating the contam reads file. 
Skipping this output ...')
+    return('contam reads')
  })
 
  # Collect returned string if an error occurred
From 02fb15d90f27c7add90ecfddcf5799d50bbc11a9 Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Mon, 26 Aug 2024 17:34:14 -0400
Subject: [PATCH 020/127] Created function for cdf plot

---
 scripts/src/QC_images.R | 143 +++++++++++++++++++++++++++------------
 1 file changed, 98 insertions(+), 45 deletions(-)

diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R
index 2b6afc1d..67528939 100755
--- a/scripts/src/QC_images.R
+++ b/scripts/src/QC_images.R
@@ -180,6 +180,91 @@ create_recovery_barplot= function(filtered_counts, id_cols, facet_col= NA, value
   return(recov_plot)
 }
 
+#' Cumulative reads plot
+#'
+#' Creates a line plot of the cumulative reads.
+#'
+#' @param input_df Input dataframe. Usually is the filtered_counts dataframe.
+#' @param id_cols Vector of column names that identify every PCR well.
+#' @param counts_col Name of the column that contains the values. Defaults to "n".
+#' @param mark1 Fraction of reads (between 0 and 1) to mark. Draws a line at the specified fraction to indicate
+#'              the number of cell lines needed to reach this fraction of reads. Defaults to 0.5.
+#' @param mark2 Fraction of reads (between 0 and 1) to mark. Draws a line at the specified fraction to indicate
+#'              the number of cell lines needed to reach this fraction of reads. This parameter should be
+#'              greater than the value specified for "mark1". Defaults to 0.95.
+#' @param contains_cbs Boolean. If control barcodes are used, this can be set to TRUE so that points
+#'                    corresponding to the control barcodes will be colored on the plot. Defaults to FALSE.
+#' @param order_aucs Boolean, when there are multiple facets, this can be set to TRUE to sort the facets by
+#'                    the AUC value. The AUCs will be sorted in descending order. Defaults to FALSE.
+#' @returns Returns a ggplot object.
+create_cdf_plot= function(input_df, id_cols, counts_col= 'n', mark1= 0.5, mark2= 0.95,
+                          contains_cbs= FALSE, order_aucs= FALSE) {
+  # Validation: Check that id_cols and counts_col are in the input dataframe.
+  if(!validate_columns_exist(c(id_cols, counts_col), input_df)) {
+    stop('Some input columns were not detected in the cdf input dataframe.')
+  }
+
+  # Determine percentages, ranks and cumulative percentages
+  calc_cummulative= input_df %>% dplyr::group_by(pick(all_of(id_cols))) %>%
+    dplyr::arrange(dplyr::desc(.data[[counts_col]])) %>%
+    dplyr::mutate(expected_num_cls= dplyr::n(),
+                  total_counts= sum(.data[[counts_col]]), pct_counts= .data[[counts_col]]/total_counts,
+                  cum_pct= cumsum(pct_counts),
+                  rank= row_number(), rank_pct= rank/expected_num_cls) %>% dplyr::ungroup()
+
+  # Validation: mark1 should be less than mark2. 
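+  # Both marks are fractions of the total reads, so each must fall within [0, 1].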
+  if(mark1 > mark2 | mark1 < 0 | mark2 > 1) {
+    stop('Mark values must be between 0 and 1, and mark1 must be less than mark2.')
+  }
+
+  # Find the number of cell lines needed to reach mark1 and mark2
+  mark1_values= calc_cummulative %>% dplyr::filter(cum_pct >= mark1) %>%
+    dplyr::group_by(pick(all_of(id_cols))) %>% dplyr::arrange(cum_pct) %>%
+    dplyr::filter(row_number() == 1) %>% dplyr::ungroup() %>%
+    dplyr::select(all_of(id_cols), rank_pct= rank_pct, mark1_rank= rank, mark1_loc= rank_pct)
+  mark2_values= calc_cummulative %>% dplyr::group_by(pick(all_of(id_cols))) %>%
+    dplyr::mutate(auc= sum(cum_pct * (1 / expected_num_cls))) %>% # calculate AUCs
+    dplyr::filter(cum_pct >= mark2) %>% dplyr::arrange(cum_pct) %>%
+    dplyr::filter(row_number() == 1) %>% dplyr::ungroup() %>%
+    dplyr::select(all_of(id_cols), rank_pct= rank_pct, mark2_rank= rank, mark2_loc= rank_pct, auc)
+
+  # Create cdf plot
+  data_for_plot= calc_cummulative %>%
+    dplyr::left_join(mark1_values, by= c(id_cols, 'rank_pct')) %>%
+    dplyr::left_join(mark2_values, by= c(id_cols, 'rank_pct')) %>%
+    tidyr::unite(all_of(id_cols), col= 'facet_name', sep= ':', remove= TRUE, na.rm= FALSE)
+
+  # Reorder by aucs if specified
+  if(order_aucs) {
+    data_for_plot= data_for_plot %>% dplyr::arrange(dplyr::desc(auc)) %>%
+      dplyr::mutate(facet_name= base::factor(facet_name, levels= unique(facet_name)))
+  }
+
+  # Create plot
+  output_plot= data_for_plot %>%
+    ggplot(aes(x= rank_pct, y=cum_pct)) +
+    # Color control barcodes if specified
+    { if(contains_cbs) geom_point(. %>% dplyr::filter(!is.na(Name)),
+                                  mapping= aes(x= rank_pct, y=cum_pct, color=reorder(Name, log2_dose)), size= 2) } +
+    geom_line(color='black') +
+    # point for mark1 of counts
+    geom_segment(aes(x= -Inf , y= mark1, xend= mark1_loc, yend = mark1), color= 'black', linetype= 2) +
+    geom_segment(aes(x= mark1_loc, y= -Inf, xend = mark1_loc, yend = mark1), color= 'black', linetype= 2) +
+    geom_label(aes(x= mark1_loc, y= 0.25, label= mark1_rank), hjust= 0, color= 'black') +
+    # point for mark2 of counts
+    geom_segment(aes(x= -Inf , y= mark2, xend= mark2_loc, yend= mark2), color= 'black', linetype= 2) +
+    geom_segment(aes(x= mark2_loc, y= -Inf, xend= mark2_loc, yend= mark2), color= 'black', linetype= 2) +
+    geom_label(aes(x= mark2_loc, y= 0.75, label= mark2_rank), hjust= 0, color= 'black') +
+    # label for AUC
+    #geom_label(aes(x= mark2_loc, y= 0.1, label= paste0('AUC ', round(auc, 3))), hjust= 'inward', color= 'black') +
+    geom_label(. %>% dplyr::filter(!is.na(auc)), mapping= aes(label= paste0('AUC ', round(auc, 3))),
+               x= 1, y= 0, hjust= 'inward', vjust= 'inward', color= 'black') +
+    facet_wrap(~facet_name) +
+    labs(x='% rank of unique reads', y='Cumulative percentage', color= 'CBs') + theme_bw()
+
+  return(output_plot)
+}
+
 #' Control barcode scatter plot
 #'
 #' Creates a scatter plot of the control barcodes.
@@ -373,10 +458,11 @@ QC_images= function(raw_counts_uncollapsed, raw_counts,
                     id_cols= c('pcr_plate', 'pcr_well'), sig_cols,
                     control_type= 'negcon', count_threshold= 40,
                     reverse_index2= FALSE, out = NA) {
-  require(tidyverse)
-  require(magrittr)
-  require(reshape2)
-  require(scales)
+  library(tidyverse)
+  library(magrittr)
+  library(reshape2)
+  library(scales)
+  library(WGCNA)
 
   if(is.na(out)) {
     out = getwd()
 
   ## Cumulative counts by lines in negcons ----
-  print("7. Generating cummulative image ...")
+  print("7. 
Generating cumulative image ...") potential_error= base::tryCatch({ - cdf= filtered_counts %>% dplyr::filter(trt_type == control_type) %>% - dplyr::left_join(num_cls_in_set, by= "cell_set") %>% - dplyr::mutate(expected_num_cl= ifelse(control_barcodes, expected_num_cl + length(unique(CB_meta$Name)), - expected_num_cl)) %>% # add CBs to expected_num_cl if there are CBs - tidyr::unite(all_of(id_cols), col= 'profile_id', sep= ':', remove= FALSE) %>% - dplyr::group_by(pcr_plate, pcr_well, profile_id, expected_num_cl) %>% - dplyr::mutate(total_counts= sum(n), pct_counts= n/total_counts,) %>% dplyr::arrange(-n) %>% - dplyr::mutate(cum_pct= cumsum(pct_counts), rank= row_number(), - rank_pct= rank/expected_num_cl) %>% dplyr::ungroup() - - # additional tables - mark50= cdf %>% dplyr::filter(cum_pct >= 0.5) %>% dplyr::group_by(profile_id) %>% - arrange(cum_pct) %>% dplyr::filter(row_number()==1) %>% ungroup() %>% - dplyr::select(profile_id, rank_pct= rank_pct, num50= rank, num50_loc= rank_pct) - mark95= cdf %>% dplyr::group_by(profile_id) %>% - dplyr::mutate(auc= sum(cum_pct*(1/expected_num_cl))) %>% # calculate AUCs - dplyr::filter(cum_pct >= 0.95) %>% - arrange(cum_pct) %>% dplyr::filter(row_number() ==1) %>% ungroup() %>% - dplyr::select(profile_id, rank_pct= rank_pct, num95= rank, num95_loc= rank_pct, auc) - - cdf_plot= cdf %>% - merge(mark50, by= c('profile_id', 'rank_pct'), all.x=T) %>% - merge(mark95, by= c('profile_id', 'rank_pct'), all.x= T) %>% - ggplot(aes(x= rank_pct, y=cum_pct)) + - { if(contains_cbs) geom_point(. %>% dplyr::filter(!is.na(Name)), - mapping=aes(x= rank_pct, y=cum_pct, color=reorder(Name, log2_dose)), size=3) } + - geom_line(color='black') + - # point for 50% of counts - geom_segment(aes(x= -Inf , y= .50, xend = num50_loc, yend = .50), color= 'black', linetype='dashed') + - geom_segment(aes(x= num50_loc, y= -Inf, xend = num50_loc, yend = .50), color= 'black', linetype='dashed') + - geom_label(aes(x=num50_loc, y= .25, label= num50), hjust= 0, color= 'black') + - # point for 95% of counts - geom_segment(aes(x= -Inf , y= .95, xend = num95_loc, yend = .95), color= 'black', linetype='dashed') + - geom_segment(aes(x= num95_loc, y= -Inf, xend = num95_loc, yend = .95), color= 'black', linetype='dashed') + - geom_label(aes(x=num95_loc, y= .75, label= num95), hjust= 0, color= 'black') + - # label for AUC - geom_label(aes(x=num95_loc, y= .25, label= paste0('AUC ', round(auc,3))), hjust= 'inward', color= 'black') + - facet_wrap(~profile_id) + - labs(x='% rank of unique reads', y='Cumulative percentage', color= 'CBs') + theme_bw() + cdf_plot= create_cdf_plot(filtered_counts %>% dplyr::filter(trt_type == 'control_type'), + id_cols= id_cols, + counts_col= 'n', + mark1= 0.5, mark2= 0.95, + contains_cbs= contains_cbs, order_aucs= TRUE) + + labs(title= 'Cumulative reads in negative controls.') pdf(file=paste(out, "cdf_plot.pdf", sep="/"), width=sqrt(num_profiles)*2, height=sqrt(num_profiles)) print(cdf_plot) dev.off() - rm(cdf, mark50, mark95, cdf_plot) + rm(cdf_plot) }, error= function(e) { print(e) print('Encountered an error when creating the cdf plot. 
Skipping this output ...') From c1890e546d0ea8469f6a2b5fd6c6a7dc10343f1f Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Tue, 27 Aug 2024 11:18:24 -0400 Subject: [PATCH 021/127] Fixed typo --- scripts/filter_counts.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/filter_counts.R b/scripts/filter_counts.R index 62b54e2c..6fabf610 100755 --- a/scripts/filter_counts.R +++ b/scripts/filter_counts.R @@ -10,7 +10,7 @@ suppressPackageStartupMessages(library(sets)) suppressPackageStartupMessages(library(tidyverse)) # load last - after dplyr source("./src/filter_raw_reads.R") -# Arguement parser ---- +# Argument parser ---- parser <- ArgumentParser() # specify desired options parser$add_argument("-v", "--verbose", action="store_true", default=TRUE, From db92b42b6d03b706241bc12deed757da57d01712 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Tue, 27 Aug 2024 11:18:57 -0400 Subject: [PATCH 022/127] Renamed file --- scripts/src/join_metadata.R | 75 ++++++++++++++++++++++++++++++++++ scripts/src/join_sample_meta.R | 73 --------------------------------- 2 files changed, 75 insertions(+), 73 deletions(-) create mode 100644 scripts/src/join_metadata.R delete mode 100644 scripts/src/join_sample_meta.R diff --git a/scripts/src/join_metadata.R b/scripts/src/join_metadata.R new file mode 100644 index 00000000..3842054c --- /dev/null +++ b/scripts/src/join_metadata.R @@ -0,0 +1,75 @@ +#' validate_columns_exist +#' +#' This function checks that a list of columns are present in a dataframe. +#' Columns that were not found in the dataframe are printed out. +#' +#' @param selected_columns A vector of strings each representing a column name +#' @param df A dataframe to check against +#' @return Boolean +validate_columns_exist= function(selected_cols, df) { + # Check that all of selected_columns are in df - base::setdiff(A, B) = A[!A %in% B]. + unmatched_cols= base::setdiff(selected_cols, colnames(df)) + + if(length(unmatched_cols) > 0) { + print('The following columns are missing: ') + print(unmatched_cols) + return(FALSE) + } else { + return(TRUE) + } +} + +#' Join metadata +#' +#' Joins a given data frame with the sample meta. +#' +#' @param input_df Input dataframe that should contain the columns specified in the "key_cols" parameter and "cell_set". +#' @param metadata Dataframe of the sample meta used in the run. +#' @param key_cols Vector of column names used as identifiers in the sample meta. +#' @returns Data frame with additional columns from the sample meta. +join_metadata= function(input_df, metadata, key_cols) { + # Validation: Check that key_cols are present in df ---- + if(validate_columns_exist(key_cols, input_df) == FALSE) { + stop('Not all key_cols (printed above) are present in the provided dataframe.') + } + + # Validation: Check that key_cols are present in the sample meta ---- + if(validate_columns_exist(key_cols, metadata) == FALSE) { + stop('Not all key_cols (printed above) are present in the provided metadata.') + } + + # Collapse the sample meta using key_cols and join onto the input df ---- + # Collapse unique values into a single row and then filter out columns with the separator. + # Columns with only one unique value in a group are selected. 
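+  # ':::' acts as a sentinel separator: a column whose groups each collapsed to a
+  # single unique value never contains it, while multi-valued columns do and are
+  # dropped by the select() below.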
+  collapsed_metadata= metadata %>% dplyr::group_by(pick(all_of(key_cols))) %>%
+    dplyr::summarise(across(everything(), function(x) paste(sort(unique(x)), collapse= ':::'))) %>% dplyr::ungroup() %>%
+    dplyr::select(all_of(key_cols), where(function(x) base::any(!grepl(':::', x))))
+
+  # Join using the key_cols, drop any columns that were duplicated.
+  output_df= dplyr::left_join(input_df, collapsed_metadata, by= key_cols,
+                              suffix= c('', '.y'), relationship='many-to-one') %>%
+    dplyr::select(-tidyselect::ends_with('.y'))
+
+  # Validation: Check that merge did not explode ----
+  print(paste0(' Input df rows: ', nrow(input_df)))
+  print(paste0('Output df rows: ', nrow(output_df)))
+  if(nrow(input_df) < nrow(output_df)) {
+    stop('Metadata join is producing more rows than expected!')
+  } else if(nrow(input_df) > nrow(output_df)) {
+    stop('Metadata join is dropping some rows!')
+  } else {}
+
+  # Print out the sample meta columns that were added to the dataframe ----
+  added_cols= base::setdiff(colnames(output_df), colnames(input_df))
+  if(length(added_cols > 0)) {
+    print(paste0('The following ', length(added_cols), ' column(s) were added:'))
+    print(added_cols)
+    print(paste0('The following ', length(metadata) - length(added_cols) - length(key_cols),
+                 ' column(s) from the metadata were not added. They may already exist in the dataframe.'))
+    print(base::setdiff(colnames(metadata), c(added_cols, key_cols)))
+  } else {
+    print('No additional columns from the metadata were added.')
+  }
+
+  return(output_df)
+}
diff --git a/scripts/src/join_sample_meta.R b/scripts/src/join_sample_meta.R
deleted file mode 100644
index 9a20e931..00000000
--- a/scripts/src/join_sample_meta.R
+++ /dev/null
@@ -1,73 +0,0 @@
-#' validate_columns_exist
-#'
-#' This function checks that a list of columns are present in a dataframe.
-#' Columns that were not found in the dataframe are printed out.
-#'
-#' @param selected_columns A vector of strings each representing a column name
-#' @param df A dataframe to check against
-#' @return Boolean
-validate_columns_exist= function(selected_cols, df) {
-  # Check that all of selected_columns are in df - base::setdiff(A, B) = A[!A %in% B].
-  unmatched_cols= base::setdiff(selected_cols, colnames(df))
-
-  if(length(unmatched_cols) > 0) {
-    print('The following columns are missing: ')
-    print(unmatched_cols)
-    return(FALSE)
-  } else {
-    return(TRUE)
-  }
-}
-
-#' join_sample_meta
-#'
-#' Joins a given data frame with the sample meta.
-#'
-#' @param df Input dataframe that should contain the columns specified in the "key_cols" parameter and "cell_set".
-#' @param sample_meta Dataframe of the sample meta used in the run.
-#' @param cell_set_meta Dataframe of the cell set metadata used in the run. This should contain the "cell_set" column.
-#' @param key_cols Vector of column names used as identifiers in the sample meta.
-#' @returns Data frame with additional columns from the sample meta. 
-join_sample_meta= function(df, sample_meta, cell_set_meta, key_cols) { - # Validation: Check that key_cols are present in df ---- - if(validate_columns_exist(key_cols, df) == FALSE) { - stop('Not all key_cols (printed above) are present in the provided data frame.') - } - - # Validation: Check that key_cols are present in the sample meta ---- - if(validate_columns_exist(key_cols, sample_meta) == FALSE) { - stop('Not all key_cols (printed above) are present in the sample meta.') - } - - # Validation: Check that cell_set exists in df and cell_set meta ---- - if(validate_columns_exist(c('cell_set'), sample_meta) == FALSE) { - stop('The cell_set column is NOT present in the sample meta.') - } - - if(validate_columns_exist(c('cell_set'), cell_set_meta) == FALSE) { - stop('The cell_set column is NOT present in the cell set meta.') - } - - # Collapse the sample meta using key_cols and join onto the input df ---- - # Collapse unique values into a single row and then filter out columns with the separator. - # Columns with only one unique value in a group are selected. - collapsed_metadata= sample_meta %>% dplyr::left_join(cell_set_meta, by= 'cell_set') %>% - dplyr::group_by(pick(all_of(key_cols))) %>% - dplyr::summarise(across(everything(), function(x) paste(sort(unique(x)), collapse= ':::'))) %>% dplyr::ungroup() %>% - dplyr::select(all_of(key_cols), where(function(x) base::any(!grepl(':::', x)))) - - expanded_df= dplyr::left_join(df, collapsed_metadata, - by= base::intersect(colnames(df), colnames(collapsed_metadata)), - relationship='many-to-one') - - # Print out the sample meta columns that were added to the dataframe ---- - added_cols= base::setdiff(colnames(expanded_df), colnames(df)) - if(length(added_cols > 0)) { - print('The following columns from the sample meta were added:') - print(added_cols) - } else { - print('No additional columns from the sample meta were added.') - } - - return(expanded_df) -} \ No newline at end of file From 94e833cd89d73e23acd7661c5904380495bb9014 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Tue, 27 Aug 2024 11:19:43 -0400 Subject: [PATCH 023/127] Rename file Call function twice to join sample meta and assay pool meta. 
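A rough sketch of the intended call pattern (the file names and key columns below
are just the script's defaults, not values from a real run):

    sample_meta= data.table::fread('sample_meta.csv', header= TRUE, sep= ',')
    l2fc= data.table::fread('l2fc.csv', header= TRUE, sep= ',')
    # Collapse the sample meta over the signature columns and join it onto l2fc.
    l2fc_with_meta_columns= join_metadata(input_df= l2fc,
                                          metadata= sample_meta,
                                          key_cols= c('cell_set', 'treatment', 'dose', 'dose_unit', 'day'))
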
---
 scripts/join_metadata.R               | 93 ++++++++++++++++++++++++++++
 scripts/joining_sample_meta_columns.R | 60 -----------------------
 2 files changed, 93 insertions(+), 60 deletions(-)
 create mode 100644 scripts/join_metadata.R
 delete mode 100644 scripts/joining_sample_meta_columns.R

diff --git a/scripts/join_metadata.R b/scripts/join_metadata.R
new file mode 100644
index 00000000..8d603b7c
--- /dev/null
+++ b/scripts/join_metadata.R
@@ -0,0 +1,93 @@
+library(argparse)
+library(tidyverse)
+source("./src/join_sample_meta.R")
+
+# Argument parser ----
+parser <- ArgumentParser()
+# specify our desired options
+parser$add_argument('--sample_meta', default= 'sample_meta.csv', help= 'Sample meta data for the sequencing run.')
+parser$add_argument("--assay_pool_meta", default="assay_pool_meta.txt", help = "Assay pool metadata")
+parser$add_argument('--l2fc', default= 'l2fc.csv', help= 'L2FC data.') # level 4
+parser$add_argument('--collapsed_l2fc', default= 'collapsed_l2fc.csv', help= 'Collapsed l2fc data.') # level 5
+parser$add_argument('--sig_cols', default= 'cell_set,treatment,dose,dose_unit,day',
+                    help= 'Columns that uniquely identify a condition.')
+parser$add_argument('--out', default= getwd(), help= 'Path to the output directory.')
+
+args <- parser$parse_args()
+
+# set output to working directory if none is specified
+if (args$out == "") {
+  args$out = args$wkdir
+}
+
+# Read in files and prepare some parameters ----
+sample_meta= data.table::fread(args$sample_meta, header= T, sep= ',')
+sig_cols= unlist(strsplit(args$sig_cols, ","))
+
+# For assay pool meta, check if it exists. If so, then filter it for relevant cell_sets/davepool_ids
+# and select and rename some columns.
+assay_pool_meta_exists= FALSE
+if(file.exists(args$assay_pool_meta)) {
+  assay_pool_meta_exists= TRUE # Update boolean
+
+  # Read in assay pool meta and transform the table into something more usable.
+  assay_pool_meta= read.delim(args$assay_pool_meta)
+  unique_cell_sets= unique(sample_meta$cell_set[sample_meta$cell_set != ""])
+  input_assay_pool_meta= assay_pool_meta %>% dplyr::filter(davepool_id %in% unique_cell_sets) %>%
+    dplyr::select(DepMap_ID= depmap_id, CCLE_name= ccle_name, cell_set= davepool_id, pool_id)
+}
+
+# Add sample meta and assay pool meta to l2fc table ----
+if(file.exists(args$l2fc)) {
+  l2fc= data.table::fread(args$l2fc, header= T, sep= ',')
+
+  # Add sample meta columns to l2fc
+  if('bio_rep' %in% colnames(sample_meta) & 'bio_rep' %in% colnames(l2fc)) {
+    input_cols= c(sig_cols, 'bio_rep')
+  } else {
+    input_cols= sig_cols
+    print('WARNING: No "bio_rep" column detected. Proceeding with just sig_cols.')
+  }
+  l2fc_with_meta_columns= join_metadata(input_df= l2fc, metadata= sample_meta, key_cols= input_cols)
+
+  # Add assay pool meta columns to l2fc
+  if(assay_pool_meta_exists) {
+    l2fc_with_meta_columns= join_metadata(input_df= l2fc_with_meta_columns,
+                                          metadata= input_assay_pool_meta,
+                                          key_cols= c('DepMap_ID', 'CCLE_name', 'cell_set'))
+  } else {
+    print('WARNING: Assay pool meta not detected and will not be joined onto l2fc.')
+  }
+
+  # Write out
+  outpath= paste(args$out, 'l2fc_with_meta_columns.csv', sep='/')
+  print(paste("Writing l2fc_with_meta_columns.csv to ", outpath))
+  write.csv(l2fc_with_meta_columns, outpath, row.names= FALSE, quote= FALSE)
+} else {
+  print('WARNING: l2fc.csv does not exist. 
Skipping this file.')
+}
+
+# Add sample meta and assay pool meta to collapsed_l2fc table ----
+if(file.exists(args$collapsed_l2fc)) {
+  collapsed_l2fc= data.table::fread(args$collapsed_l2fc, header= T, sep= ',')
+
+  # Add sample meta columns to collapsed l2fc
+  collapsed_l2fc_with_meta_columns= join_metadata(input_df= collapsed_l2fc, metadata= sample_meta,
+                                                  key_cols= sig_cols)
+
+  # Add assay pool meta columns to collapsed l2fc
+  if(assay_pool_meta_exists) {
+    collapsed_l2fc_with_meta_columns= join_metadata(input_df= collapsed_l2fc_with_meta_columns,
+                                                    metadata= input_assay_pool_meta,
+                                                    key_cols= c('DepMap_ID', 'CCLE_name', 'cell_set'))
+  } else {
+    print('WARNING: Assay pool meta not detected and will not be joined onto collapsed l2fc.')
+  }
+
+  # Write out
+  outpath= paste(args$out, 'collapsed_l2fc_with_meta_columns.csv', sep='/')
+  print(paste("Writing collapsed_l2fc_with_meta_columns.csv to ", outpath))
+  write.csv(collapsed_l2fc_with_meta_columns, outpath, row.names= FALSE, quote= FALSE)
+} else {
+  print('WARNING: collapsed_l2fc.csv does not exist. Skipping this file.')
+}
diff --git a/scripts/joining_sample_meta_columns.R b/scripts/joining_sample_meta_columns.R
deleted file mode 100644
index 50b1e003..00000000
--- a/scripts/joining_sample_meta_columns.R
+++ /dev/null
@@ -1,60 +0,0 @@
-library(argparse)
-library(tidyverse)
-source("./src/join_sample_meta.R")
-
-# Argument parser ----
-parser <- ArgumentParser()
-# specify our desired options
-parser$add_argument('--sample_meta', default= 'sample_meta.csv', help= 'Sample meta data for the sequencing run.')
-parser$add_argument('--cell_set_meta', default= 'cell_set_meta.csv', help= 'Cell set metadata for the sequencing run.')
-parser$add_argument('--l2fc', default= 'l2fc.csv', help= 'L2FC data.') # level 4
-parser$add_argument('--collapsed_l2fc', default= 'collapsed_l2fc.csv', help= 'Collapsed l2fc data.') # level 5
-parser$add_argument('--sig_cols', default= 'cell_set,treatment,dose,dose_unit,day',
-                    help= 'Columns that uniquely identify a condition.')
-parser$add_argument('--out', default= getwd(), help= 'Path to the output directory.')
-
-args <- parser$parse_args()
-
-# set output to working directory if none is specified
-if (args$out == "") {
-  args$out = args$wkdir
-}
-
-# Prepare args ----
-sample_meta= data.table::fread(args$sample_meta, header= T, sep= ',')
-cell_set_meta= data.table::fread(args$cell_set_meta, header= T, sep= ',')
-sig_cols= unlist(strsplit(args$sig_cols, ","))
-
-# Add in metadata for l2fc file ----
-if(file.exists(args$l2fc)) {
-  l2fc= data.table::fread(args$l2fc, header= T, sep= ',')
-  if('bio_rep' %in% sample_meta & 'bio_rep' %in% l2fc) {
-    input_cols= c(sig_cols, 'bio_rep')
-  } else {
-    input_cols= sig_cols
-    print('WARNING: No "bio_rep" column detected. Proceeding with just sig_cols.')
-  }
-  l2fc_with_sm= join_sample_meta(df= l2fc, sample_meta, cell_set_meta, key_cols= input_cols)
-
-  # Write out
-  outpath= paste(args$out, 'l2fc_with_sm.csv', sep='/')
-  print(paste("Writing l2fc_with_sm.csv to ", outpath))
-  write.csv(l2fc_with_sm, outpath, row.names= FALSE, quote= FALSE)
-} else {
-  print('WARNING: l2fc.csv does not exist. 
Skipping this file.') -} -# - -# Add in metadata for collapsed_l2fc file ---- -if(file.exists(args$collapsed_l2fc)) { - collapsed_l2fc= data.table::fread(args$collapsed_l2fc, header= T, sep= ',') - collapsed_l2fc_with_sm= join_sample_meta(df= collapsed_l2fc, sample_meta, cell_set_meta, key_cols= sig_cols) - - # Write out - outpath= paste(args$out, 'collapsed_l2fc_with_metadata.csv', sep='/') - print(paste("Writing collapsed_l2fc_with_metadata.csv to ", outpath)) - write.csv(collapsed_l2fc_with_sm, outpath, row.names= FALSE, quote= FALSE) -} else { - print('WARNING: collapsed_l2fc.csv does not exist. Skipping this file.') -} -# From 8287fe251a9b3ccb071c11d97e04e0218b48d57f Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Tue, 27 Aug 2024 11:23:07 -0400 Subject: [PATCH 024/127] Change source due to renaming --- scripts/join_metadata.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/join_metadata.R b/scripts/join_metadata.R index 8d603b7c..5eff5629 100644 --- a/scripts/join_metadata.R +++ b/scripts/join_metadata.R @@ -1,6 +1,6 @@ library(argparse) library(tidyverse) -source("./src/join_sample_meta.R") +source("./src/join_metadata.R") # Argument parser ---- parser <- ArgumentParser() From c98ce0eb0aa39cb28288e597478ff87da69637bd Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Tue, 27 Aug 2024 14:15:14 -0400 Subject: [PATCH 025/127] Removed filtered counts --- scripts/filteredCounts_QC.R | 7 ++----- scripts/src/QC_images.R | 5 ++++- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/filteredCounts_QC.R b/scripts/filteredCounts_QC.R index 372ac869..0dcf74ab 100755 --- a/scripts/filteredCounts_QC.R +++ b/scripts/filteredCounts_QC.R @@ -22,12 +22,11 @@ parser$add_argument("-v", "--verbose", action="store_true", default=TRUE, help=" parser$add_argument("-q", "--quietly", action="store_false", dest="verbose", help="Print little output") parser$add_argument("--wkdir", default=getwd(), help="Working directory") parser$add_argument("-s", "--sample_meta", default="sample_meta.csv", help= "Sample metadata") -parser$add_argument("-c", "--uncollapsed_raw_counts", default="raw_counts_uncollapsed.csv", +parser$add_argument("-c", "--raw_counts_uncollapsed", default="raw_counts_uncollapsed.csv", help="path to file containing uncollapsed raw counts file") parser$add_argument("--raw_counts", default= "raw_counts.csv", help="path to raw counts file") parser$add_argument("--annotated_counts", default= "annotated_counts.csv", help= "path to file containing annotated counts") -parser$add_argument("--filtered_counts", default= "filtered_counts.csv", help= "path to filtered_counts file") parser$add_argument("--normalized_counts", default="normalized_counts.csv", help="path to file containing normalized counts") parser$add_argument("--l2fc", default="l2fc.csv", help= "path to l2fc file") @@ -56,10 +55,9 @@ if (args$out == ""){ # Read in files and pull out parameters ---- # Pipeline outputs -raw_counts_uncollapsed= data.table::fread(args$uncollapsed_raw_counts, header= TRUE, sep= ',') +raw_counts_uncollapsed= data.table::fread(args$raw_counts_uncollapsed, header= TRUE, sep= ',') raw_counts= data.table::fread(args$raw_counts, header= TRUE, sep= ',') annotated_counts= data.table::fread(args$annotated_counts, header= TRUE, sep= ',') -filtered_counts= data.table::fread(args$filtered_counts, header= TRUE, sep= ',') if(file.exists(args$normalized_counts)) { normalized_counts= data.table::fread(args$normalized_counts, header=TRUE, sep=',', data.table=FALSE) } else { @@ -93,7 +91,6 
@@ print("Generating QC images ...") QC_images(raw_counts_uncollapsed= raw_counts_uncollapsed, raw_counts= raw_counts, annotated_counts= annotated_counts, - filtered_counts= filtered_counts, normalized_counts= normalized_counts, l2fc= l2fc, sample_meta= sample_meta, diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R index 67528939..2867a2a1 100755 --- a/scripts/src/QC_images.R +++ b/scripts/src/QC_images.R @@ -323,6 +323,7 @@ create_ctrlBC_scatterplots= function(normalized_counts, id_cols, value_col= 'log #' @import tidyverse #' @import WGCNA #' @import reshape2 +#' @import scales #' @param input_df Dataframe. #' @param row_id_cols Vector of column names from input_df that identifies the cell lines. For example, #' this can be "DepMap_ID", "CCLE_name" if only cell lines exist. It can also be @@ -452,7 +453,7 @@ create_replicate_scatterplots= function(input_df, cell_line_cols, replicate_grou #' @param out Path to the directory to save the QC images. #' @returns NA. QC images are written out to the specified folder. QC_images= function(raw_counts_uncollapsed, raw_counts, - annotated_counts, filtered_counts, normalized_counts= NA, l2fc, + annotated_counts, normalized_counts= NA, l2fc, sample_meta, CB_meta, cell_set_meta, cell_line_cols, id_cols= c('pcr_plate', 'pcr_well'), sig_cols, @@ -490,6 +491,8 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, num_cls_in_set %<>% dplyr::mutate(expected_num_cl= str_split(members, ';')) %>% tidyr::unnest(cols= expected_num_cl) %>% dplyr::group_by(cell_set) %>% dplyr::summarize(expected_num_cl= length(unique(expected_num_cl))) %>% dplyr::ungroup() + + filtered_counts= annotated_counts %>% dplyr::filter(expected) # # Sequencing QCs ____________________ ---- From a1053329c9e4a60c235c22b327acccec7c326d43 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Tue, 27 Aug 2024 14:24:20 -0400 Subject: [PATCH 026/127] Added parameters id_cols and reverse_index2 --- scripts/collate_fastq_reads.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/collate_fastq_reads.sh b/scripts/collate_fastq_reads.sh index 9a275420..d33eedfa 100644 --- a/scripts/collate_fastq_reads.sh +++ b/scripts/collate_fastq_reads.sh @@ -77,11 +77,14 @@ PROJECT_DIR=$(dirname "$BUILD_DIR") PROJECT_CODE=$(basename "$PROJECT_DIR") echo Project Code: $PROJECT_CODE +echo REVERSE_INDEX2 is: $REVERSE_INDEX2 args=( --sample_meta "$SAMPLE_META" --out "$BUILD_DIR" --sequencing_index_cols="$SEQUENCING_INDEX_COLS" +--id_cols "$ID_COLS" +--reverse_index2 "$REVERSE_INDEX2" ) echo Rscript collate_fastq_reads.R "${args[@]}" From 10743f97fb9a2632edcc785cca72a9a3b774efe7 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Tue, 27 Aug 2024 14:24:43 -0400 Subject: [PATCH 027/127] Removed parameters seq_cols and reverse_index2 --- scripts/filter_counts.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/scripts/filter_counts.sh b/scripts/filter_counts.sh index f43447c8..ba6609f7 100644 --- a/scripts/filter_counts.sh +++ b/scripts/filter_counts.sh @@ -78,7 +78,6 @@ echo RAW_COUNTS is: $RAW_COUNTS echo CELL_LINE_META is: $CELL_LINE_META echo CONTROL_BARCODE_META is: $CONTROL_BARCODE_META echo CELL_SET_META is: $CELL_SET_META -echo REVERSE_INDEX2 is: $REVERSE_INDEX2 args=( -c "$RAW_COUNTS" @@ -88,8 +87,6 @@ args=( --cell_set_meta "$CELL_SET_META" --out "$BUILD_DIR" --count_threshold "$COUNT_THRESHOLD" ---sequencing_index_cols "$SEQUENCING_INDEX_COLS" ---reverse_index2 "$REVERSE_INDEX2" --pool_id "$PULL_POOL_ID" --rm_data "$REMOVE_DATA" --assay_pool_meta "$ASSAY_POOL_META" From 
301e0f6341ad24a4a7a3ddd4389c74e2befb22e3 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Tue, 27 Aug 2024 14:25:04 -0400 Subject: [PATCH 028/127] Added two files as inputs --- scripts/filteredCounts_QC.sh | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/scripts/filteredCounts_QC.sh b/scripts/filteredCounts_QC.sh index 2c5799a8..ff2f13e0 100644 --- a/scripts/filteredCounts_QC.sh +++ b/scripts/filteredCounts_QC.sh @@ -72,6 +72,22 @@ else RAW_COUNTS=$BUILD_DIR/$RAW_COUNTS fi +#Enforces abs paths +if [[ "$RAW_COUNTS_UNCOLLAPSED" = /* ]] +then + RAW_COUNTS_UNCOLLAPSED=$(ls $RAW_COUNTS_UNCOLLAPSED) +else + RAW_COUNTS_UNCOLLAPSED=$BUILD_DIR/$RAW_COUNTS_UNCOLLAPSED +fi + +#Enforces abs paths +if [[ "$L2FC" = /* ]] +then + L2FC=$(ls $L2FC) +else + L2FC=$BUILD_DIR/$L2FC +fi + #Enforces abs paths if [[ "$CONTROL_BARCODE_META" = /* ]] then @@ -88,6 +104,8 @@ echo CELL_SET_META is: $CELL_SET_META echo CONTROL_BARCODE_META is: $CONTROL_BARCODE_META echo COUNT_THRESHOLD is: $COUNT_THRESHOLD echo COUNT_COL_NAME is: $COUNT_COL_NAME +echo RAW_COUNTS_UNCOLLAPSED is: $RAW_COUNTS_UNCOLLAPSED +echo L2FC is: $L2FC echo RAW_COUNTS is: $RAW_COUNTS echo REVERSE_INDEX2 is: $REVERSE_INDEX2 @@ -102,7 +120,9 @@ args=( --count_threshold "$COUNT_THRESHOLD" --count_col_name "$COUNT_COL_NAME" --control_type "$CTL_TYPES" +--raw_counts_uncollapsed "$RAW_COUNTS_UNCOLLAPSED" --raw_counts "$RAW_COUNTS" +--l2fc "$L2FC" --id_cols "$ID_COLS" --reverse_index2 "$REVERSE_INDEX2" ) @@ -118,7 +138,9 @@ echo Rscript filteredCounts_QC.R --sample_meta $SAMPLE_META \ --count_col_name $COUNT_COL_NAME \ --reverse_index2 $REVERSE_INDEX2 \ --control_type $CTL_TYPES \ +--raw_counts_uncollapsed $RAW_COUNTS_UNCOLLAPSED \ --raw_counts $RAW_COUNTS \ +--l2fc $L2FC \ --id_cols $ID_COLS Rscript filteredCounts_QC.R "${args[@]}" \ No newline at end of file From 388f40c7658f91ced3966c21ed2ef6a74d009d4e Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 28 Aug 2024 10:45:18 -0400 Subject: [PATCH 029/127] Changed flag to 'lfc' --- scripts/join_metadata.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/join_metadata.R b/scripts/join_metadata.R index 5eff5629..c0d2e98d 100644 --- a/scripts/join_metadata.R +++ b/scripts/join_metadata.R @@ -7,7 +7,7 @@ parser <- ArgumentParser() # specify our desired options parser$add_argument('--sample_meta', default= 'sample_meta.csv', help= 'Sample meta data for the sequencing run.') parser$add_argument("--assay_pool_meta", default="assay_pool_meta.txt", help = "Assay pool metadata") -parser$add_argument('--l2fc', default= 'l2fc.csv', help= 'L2FC data.') # level 4 +parser$add_argument('--lfc', default= 'l2fc.csv', help= 'L2FC data.') # level 4 parser$add_argument('--collapsed_l2fc', default= 'collapsed_l2fc.csv', help= 'Collapsed l2fc data.') # level 5 parser$add_argument('--sig_cols', default= 'cell_set,treatment,dose,dose_unit,day', help= 'Columns that uniquely identify a condition.') @@ -38,8 +38,8 @@ if(file.exists(args$assay_pool_meta)) { } # Add sample meta and assay pool meta to l2fc table ---- -if(file.exists(args$l2fc)) { - l2fc= data.table::fread(args$l2fc, header= T, sep= ',') +if(file.exists(args$lfc)) { + l2fc= data.table::fread(args$lfc, header= T, sep= ',') # Add sample meta columns to l2fc if('bio_rep' %in% colnames(sample_meta) & 'bio_rep' %in% colnames(l2fc)) { From 482fd0b52ce4a2bd9fa03062320d2db635c59124 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 28 Aug 2024 10:45:55 -0400 Subject: [PATCH 030/127] Look for meta joined files --- 
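[editor's note, not applied by git] Patch 030 points seq_to_mts.py at the
metadata-joined level 4/5 files written by join_metadata.R. A minimal R sketch
of the same "prefer the joined file, fall back to the plain one" lookup;
read_build_table() is a hypothetical helper, not part of this series:

read_build_table= function(build_dir, base_name) {
  # Prefer the *_with_meta_columns.csv variant produced by join_metadata.R.
  joined= file.path(build_dir, paste0(base_name, '_with_meta_columns.csv'))
  plain= file.path(build_dir, paste0(base_name, '.csv'))
  target= if(file.exists(joined)) joined else plain
  print(paste('Reading', target))
  data.table::fread(target, header= TRUE, sep= ',')
}
# Example: level_4= read_build_table(build_dir, 'l2fc')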
scripts/seq_to_mts.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/seq_to_mts.py b/scripts/seq_to_mts.py index 64ecd2e7..ba8c307f 100755 --- a/scripts/seq_to_mts.py +++ b/scripts/seq_to_mts.py @@ -74,14 +74,14 @@ def main(args): os.makedirs(args.out) try: - fstr = os.path.join(args.build_path, 'l2fc.csv') + fstr = os.path.join(args.build_path, 'l2fc_with_meta_columns.csv') fmatch = glob.glob(fstr) assert (len(fmatch) == 1) , "Too many files found" print("Reading in data") sample_meta = read_build_file("sample_meta.csv", args) level_3 = read_build_file("normalized_counts.csv", args) - level_4 = read_build_file("l2fc.csv", args) - level_5 = read_build_file("collapsed_l2fc.csv", args) + level_4 = read_build_file("l2fc_with_meta_columns.csv", args) + level_5 = read_build_file("collapsed_l2fc_with_meta_columns.csv", args) except IndexError as err: logger.error(err) From 9aa2b9f4b6a564bc4c178aa86663d135b0b619f6 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 28 Aug 2024 10:46:01 -0400 Subject: [PATCH 031/127] Create join_metadata.sh --- scripts/join_metadata.sh | 52 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 scripts/join_metadata.sh diff --git a/scripts/join_metadata.sh b/scripts/join_metadata.sh new file mode 100644 index 00000000..f2588eef --- /dev/null +++ b/scripts/join_metadata.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +echo Starting metadata join... + +if [ -z "$BUILD_DIR" ] +then + echo BUILD_DIR not specified + exit -1 +fi + +if [ -z "$L2FC" ] +then + echo LFC parameter empty + exit -1 + +fi + +if [ -z "$COLLAPSED_L2FC" ] +then + echo Collapsed l2fc parameter empty + exit -1 +fi + +#Enforces abs paths +if [[ "$LFC" = /* ]] +then + LFC=$(ls $LFC) +else + LFC=$BUILD_DIR/$LFC +fi + +#Enforces abs paths +if [[ "$COLLAPSED_L2FC" = /* ]] +then + COLLAPSED_L2FC=$(ls $COLLAPSED_L2FC) +else + COLLAPSED_L2FC=$BUILD_DIR/$COLLAPSED_L2FC +fi + +echo Build dir is: $BUILD_DIR +echo LFC is: $LFC +echo COLLAPSED_L2FC is: $COLLAPSED_L2FC + +echo Rscript join_metadata.R -c $LFC \ +--collapsed_l2fc $COLLAPSED_L2FC \ +--out $BUILD_DIR \ +--sig_cols $SIG_COLS + +Rscript join_metadata.R -c $LFC \ +--collapsed_l2fc $COLLAPSED_L2FC \ +--out $BUILD_DIR \ +--sig_cols $SIG_COLS \ No newline at end of file From 0200dec8fa090e8c6256f21f0cd3678d5de65780 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 28 Aug 2024 11:04:04 -0400 Subject: [PATCH 032/127] Changed COLLAPSED_L2FC to COLLAPSED_VALUES --- scripts/join_metadata.sh | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/scripts/join_metadata.sh b/scripts/join_metadata.sh index f2588eef..20c2e95e 100644 --- a/scripts/join_metadata.sh +++ b/scripts/join_metadata.sh @@ -15,12 +15,18 @@ then fi -if [ -z "$COLLAPSED_L2FC" ] +if [ -z "$COLLAPSED_VALUES" ] then echo Collapsed l2fc parameter empty exit -1 fi +if [ -z "$ASSAY_POOL_META" ] +then + echo ASSAY_POOL_META parameter empty + exit -1 +fi + #Enforces abs paths if [[ "$LFC" = /* ]] then @@ -30,23 +36,33 @@ else fi #Enforces abs paths -if [[ "$COLLAPSED_L2FC" = /* ]] +if [[ "$COLLAPSED_VALUES" = /* ]] +then + COLLAPSED_VALUES=$(ls $COLLAPSED_VALUES) +else + COLLAPSED_VALUES=$BUILD_DIR/$COLLAPSED_VALUES +fi + +#Enforces abs paths +if [[ "$ASSAY_POOL_META" = /* ]] then - COLLAPSED_L2FC=$(ls $COLLAPSED_L2FC) + ASSAY_POOL_META=$(ls $ASSAY_POOL_META) else - COLLAPSED_L2FC=$BUILD_DIR/$COLLAPSED_L2FC + ASSAY_POOL_META=$BUILD_DIR/$ASSAY_POOL_META fi echo Build dir is: $BUILD_DIR echo LFC is: 
$LFC
-echo COLLAPSED_L2FC is: $COLLAPSED_L2FC
+echo COLLAPSED_VALUES is: $COLLAPSED_VALUES
 
 echo Rscript join_metadata.R -c $LFC \
---collapsed_l2fc $COLLAPSED_L2FC \
+--collapsed_l2fc $COLLAPSED_VALUES \
+--assay_pool_meta $ASSAY_POOL_META \
 --out $BUILD_DIR \
 --sig_cols $SIG_COLS
 
 Rscript join_metadata.R -c $LFC \
---collapsed_l2fc $COLLAPSED_L2FC \
+--collapsed_l2fc $COLLAPSED_VALUES \
+--assay_pool_meta $ASSAY_POOL_META \
 --out $BUILD_DIR \
 --sig_cols $SIG_COLS
\ No newline at end of file

From bf2537b473a9607a3e9691db7e16b4a0afd39e4d Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Wed, 28 Aug 2024 11:04:29 -0400
Subject: [PATCH 033/127] Add join_metadata to run

---
 scripts/make_config_file.groovy | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/scripts/make_config_file.groovy b/scripts/make_config_file.groovy
index 4b3cff60..779c9305 100644
--- a/scripts/make_config_file.groovy
+++ b/scripts/make_config_file.groovy
@@ -14,6 +14,7 @@ pipeline {
     booleanParam(name: 'CBNORMALIZE', defaultValue: true, description: 'Check this to trigger the CBnormalize job.')
     booleanParam(name: 'COMPUTE_LFC', defaultValue: true, description: 'Check this to trigger the compute_l2fc job.')
     booleanParam(name: 'COLLAPSE', defaultValue: true, description: 'Check this to trigger the collapse job.')
+    booleanParam(name: 'JOIN_METADATA', defaultValue: true, description: 'Check this to trigger the join_metadata job.')
     booleanParam(name: 'REMOVE_DATA', defaultValue: false, description: 'Select if there is experimental data that needs to be removed
end of file From 1c4a7882368d1de9e11816d81ad24e26e48f9f60 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 28 Aug 2024 11:14:22 -0400 Subject: [PATCH 035/127] Changed l2fc to lfc --- scripts/filteredCounts_QC.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/filteredCounts_QC.R b/scripts/filteredCounts_QC.R index 0dcf74ab..5f8d2304 100755 --- a/scripts/filteredCounts_QC.R +++ b/scripts/filteredCounts_QC.R @@ -29,7 +29,7 @@ parser$add_argument("--annotated_counts", default= "annotated_counts.csv", help= "path to file containing annotated counts") parser$add_argument("--normalized_counts", default="normalized_counts.csv", help="path to file containing normalized counts") -parser$add_argument("--l2fc", default="l2fc.csv", help= "path to l2fc file") +parser$add_argument("--lfc", default="l2fc.csv", help= "path to l2fc file") parser$add_argument("-s", "--sample_meta", default="sample_meta.csv", help = "Sample metadata") parser$add_argument("--CB_meta", default="/data/CB_meta.csv", help = "control barcode metadata") parser$add_argument("--cell_set_meta", default="cell_set_meta.csv", help = "Cell set metadata") @@ -63,7 +63,7 @@ if(file.exists(args$normalized_counts)) { } else { normalized_counts= NA } -l2fc= data.table::fread(args$l2fc, header= TRUE, sep= ',') +l2fc= data.table::fread(args$lfc, header= TRUE, sep= ',') # Metadata files sample_meta= data.table::fread(args$sample_meta, header= TRUE, sep= ',', data.table= FALSE) From ff1833b06b8699163c8cf9009e938cf8b12b53e0 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 28 Aug 2024 13:22:42 -0400 Subject: [PATCH 036/127] Updated comments changed library back to require, fixed a bug, and numbered images --- scripts/src/QC_images.R | 74 +++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 43 deletions(-) diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R index 2867a2a1..7be92d14 100755 --- a/scripts/src/QC_images.R +++ b/scripts/src/QC_images.R @@ -459,19 +459,22 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, id_cols= c('pcr_plate', 'pcr_well'), sig_cols, control_type= 'negcon', count_threshold= 40, reverse_index2= FALSE, out = NA) { - library(tidyverse) - library(magrittr) - library(reshape2) - library(scales) - library(WGCNA) - - if(is.na(out)) { - out = getwd() - } + # Required packages ---- + require(tidyverse) + require(magrittr) + require(reshape2) + require(scales) + require(WGCNA) # Some preprocessing ---- - skipped_qcs= c() # empty vector to collect potential errors - num_profiles = annotated_counts %>% dplyr::distinct(pick(all_of(id_cols))) %>% nrow() + # Set out directory if none is specified. + if(is.na(out)) {out = getwd()} + + # Create empty vector to collect potential errors. + skipped_qcs= c() + + # Count number of distinct profile to help scale some plots. 
+ num_profiles= annotated_counts %>% dplyr::distinct(pick(all_of(id_cols))) %>% nrow() # Detect control barcodes cb_check= sample_meta %>% @@ -479,31 +482,16 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, !(trt_type %in% c("empty", "", "CB_only")) & !is.na(trt_type)) contains_cbs= ifelse(nrow(cb_check)!= 0, T, F) - # Count number of cell lines in each cell set - num_cls_in_set= cell_set_meta %>% dplyr::filter(cell_set %in% unique(sample_meta$cell_set)) - # Add cell_cets that are 'missing' or are strings of LUAs - if(nrow(num_cls_in_set) != length(unique(sample_meta$cell_set))) { - sets_not_in_meta= sample_meta %>% dplyr::filter(!cell_set %in% cell_set_meta$cell_set) %>% - dplyr::pull(cell_set) %>% unique() %>% sort() - sets_to_add_df= data_frame(cell_set= sets_not_in_meta, members= sets_not_in_meta) - num_cls_in_set= dplyr::bind_rows(num_cls_in_set, sets_to_add_df) - } - num_cls_in_set %<>% dplyr::mutate(expected_num_cl= str_split(members, ';')) %>% - tidyr::unnest(cols= expected_num_cl) %>% dplyr::group_by(cell_set) %>% - dplyr::summarize(expected_num_cl= length(unique(expected_num_cl))) %>% dplyr::ungroup() - - filtered_counts= annotated_counts %>% dplyr::filter(expected) - # + # Pull filtered counts from annotated counts + filtered_counts= annotated_counts %>% dplyr::filter(expected_read) # Sequencing QCs ____________________ ---- - ## Purity metrics ---- - # call this function + ## 1. Purity metrics ---- print('1. Generating QC table ...') create_qc_table(raw_counts_uncollapsed, raw_counts, filtered_counts, value_col= 'n', file_path= paste0(out, '/QC_table.csv')) - # - ## Index count summaries ---- + ## 2. Index count summaries ---- print("2. Generating index counts tables ...") # Check that "IndexBarcode1" and "index_1" columns are present. # If so, calculate index summary and write out. @@ -530,7 +518,7 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, print('Column "index_2" not detected. Skipping index 2 summaries ...', quote= FALSE) } - ## Total counts ---- + ## 3. Total counts ---- print("3. Generating total_counts image ...") potential_error= base::tryCatch({ tc= create_total_counts_barplot(filtered_counts, id_cols, facet_col= 'pcr_plate') @@ -552,7 +540,7 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, } # Assay QCs _________________________ ---- - ## Cell lines recovered ---- + ## 4. Cell lines recovered ---- print("4. Generating cell_lines_present image ...") potential_error= base::tryCatch({ cl_rec= create_recovery_barplot(filtered_counts, id_cols= id_cols, facet_col= 'pcr_plate', @@ -574,10 +562,10 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, skipped_qcs = c(skipped_qcs, potential_error) } - ## Cell line contaminants ---- + ## 5. Cell line contaminants ---- print('5. Generating cell line contaminants ...') potential_error= base::tryCatch({ - contams= annotated_counts %>% dplyr::filter(expected_read==F) %>% + contams= annotated_counts %>% dplyr::filter(expected_read == F) %>% dplyr::mutate(barcode_id= ifelse(is.na(CCLE_name), Name, CCLE_name)) %>% dplyr::group_by(forward_read_cl_barcode, barcode_id) %>% dplyr::summarise(num_wells= n(), median_n=median(n), max_n= max(n)) %>% ungroup() %>% @@ -596,7 +584,7 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, skipped_qcs = c(skipped_qcs, potential_error) } - ## Contaminant reads ---- + ## 6. Contaminant reads ---- print('6. 
Generating contaminant reads ...') potential_error= base::tryCatch({ pcr_locations= c('pcr_plate', 'pcr_well') @@ -666,10 +654,10 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, skipped_qcs = c(skipped_qcs, potential_error) } - ## Cumulative counts by lines in negcons ---- + ## 7. Cumulative counts by lines in negcons ---- print("7. Generating cumulative image ...") potential_error= base::tryCatch({ - cdf_plot= create_cdf_plot(filtered_counts %>% dplyr::filter(trt_type == 'control_type'), + cdf_plot= create_cdf_plot(filtered_counts %>% dplyr::filter(trt_type == control_type), id_cols= id_cols, counts_col= 'n', mark1= 0.5, mark2= 0.95, @@ -692,7 +680,7 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, skipped_qcs = c(skipped_qcs, potential_error) } - ## Control barcode trends ---- + ## 8. Control barcode trends ---- if(contains_cbs & is.data.frame(normalized_counts)) { print("8. Generating control_barcode_trend image") potential_error= base::tryCatch({ @@ -717,7 +705,7 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, print('8. No control barcodes detected. Skipping control_barcode_trend image.') } - ## Sample correlation ----- + ## 9. Sample correlation ----- print("9. Generating sample_cor image ...") potential_error= base::tryCatch({ cor_df= filtered_counts %>% @@ -744,7 +732,7 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, skipped_qcs = c(skipped_qcs, potential_error) } - ## Tech rep correlations ---- + ## 10. Tech rep correlations ---- if(is.data.frame(normalized_counts) & 'tech_rep' %in% colnames(normalized_counts)) { # Check if there are more at least two tech reps unique_tech_reps= na.omit(unique(normalized_counts$tech_rep)) @@ -798,7 +786,7 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, print('10. No technical replicates detected. Skipping tech_reps scatter plot.') } - ## Bio rep correlations ---- + ## 11. Bio rep correlations ---- if('bio_rep' %in% colnames(l2fc)) { unique_bio_reps= na.omit(unique(l2fc$bio_rep)) @@ -830,8 +818,8 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, cor_method= 'pearson') pdf(file=paste(out, "bio_corr_hm.pdf", sep="/"), width=sqrt(num_profiles), height=sqrt(num_profiles)) - #print(bio_corr_hm) - #dev.off() + print(bio_corr_hm) + dev.off() }, error= function(e) { print(e) print('Encountered an error when creating the bio_corr_hm figure. 
Skipping this output ...') From 6e7afdd0803b6f24aabda59423c54936854740ca Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 28 Aug 2024 18:14:44 -0400 Subject: [PATCH 037/127] Fixed some minor bugs --- scripts/src/QC_images.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R index 7be92d14..09b2b381 100755 --- a/scripts/src/QC_images.R +++ b/scripts/src/QC_images.R @@ -152,7 +152,7 @@ create_recovery_barplot= function(filtered_counts, id_cols, facet_col= NA, value dplyr::mutate(detect_type= case_when(.data[[value_col]] == 0 ~ 'Not detected', .data[[value_col]] <= count_threshold ~ 'Low counts', .data[[value_col]] > count_threshold ~ 'Detected')) %>% - dplyr::count(pick(all_of(c(id_cols, facet_col, 'detect_type', 'total_num_cls'))), name= 'num_cls_by_type') %>% + dplyr::count(pick(all_of(na.omit(c(id_cols, facet_col, 'detect_type', 'total_num_cls')))), name= 'num_cls_by_type') %>% tidyr::unite(all_of(id_cols), col= 'sample_id', sep= ':', remove= FALSE, na.rm= FALSE) %>% dplyr::mutate(percent= (num_cls_by_type / total_num_cls) * 100) @@ -258,7 +258,7 @@ create_cdf_plot= function(input_df, id_cols, counts_col= 'n', mark1= 0.5, mark2= # label for AUC #geom_label(aes(x= mark2_loc, y= 0.1, label= paste0('AUC ', round(auc, 3))), hjust= 'inward', color= 'black') + geom_label(. %>% dplyr::filter(!is.na(auc)), mapping= aes(label= paste0('AUC ', round(auc, 3))), - x= 1, y= 0, hjust= 'inward', vjust= 'inward', color= 'black') + + x= 1, y= 0.25, hjust= 'inward', vjust= 'inward', color= 'black') + facet_wrap(~facet_name) + labs(x='% rank of unique reads', y='Cumulative percentage', color= 'CBs') + theme_bw() From de5fdbee1639b05aa066a255f2a2089107863416 Mon Sep 17 00:00:00 2001 From: jdavis3141 Date: Fri, 30 Aug 2024 09:36:21 -0400 Subject: [PATCH 038/127] updated groovy to include join_meta --- scripts/join_metadata.sh | 15 +++++++++++++-- scripts/make_config_file.groovy | 1 + 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/scripts/join_metadata.sh b/scripts/join_metadata.sh index 20c2e95e..e1ef3a56 100644 --- a/scripts/join_metadata.sh +++ b/scripts/join_metadata.sh @@ -51,18 +51,29 @@ else ASSAY_POOL_META=$BUILD_DIR/$ASSAY_POOL_META fi +#Enforces abs paths +if [[ "$SAMPLE_META" = /* ]] +then + SAMPLE_META=$(ls $SAMPLE_META) +else + SAMPLE_META=$BUILD_DIR/$SAMPLE_META +fi + echo Build dir is: $BUILD_DIR echo LFC is: $LFC echo COLLAPSED_VALUES is: $COLLAPSED_VALUES +echo SAMPLE_META is: $SAMPLE_META echo Rscript join_metadata.R -c $LFC \ --collapsed_l2fc $COLLAPSED_VALUES \ --assay_pool_meta $ASSAY_POOL_META \ --out $BUILD_DIR \ ---sig_cols $SIG_COLS +--sig_cols $SIG_COLS \ +--sample_meta $SAMPLE_META Rscript join_metadata.R -c $LFC \ --collapsed_l2fc $COLLAPSED_VALUES \ --assay_pool_meta $ASSAY_POOL_META \ --out $BUILD_DIR \ ---sig_cols $SIG_COLS \ No newline at end of file +--sig_cols $SIG_COLS \ +--sample_meta $SAMPLE_META \ No newline at end of file diff --git a/scripts/make_config_file.groovy b/scripts/make_config_file.groovy index 779c9305..23eaf395 100644 --- a/scripts/make_config_file.groovy +++ b/scripts/make_config_file.groovy @@ -16,6 +16,7 @@ pipeline { booleanParam(name: 'COLLAPSE', defaultValue: true, description: 'Check this to trigger the collapse job.') booleanParam(name: 'JOIN_METADATA', defaultValue: true, description: 'Check this to trigger the join_metadata job.') booleanParam(name: 'REMOVE_DATA', defaultValue: false, description: 'Select if there is experimental data that needs to be removed 
before normalization. TODO: expand on this.') + booleanParam(name: 'JOIN_METADATA', defaultValue: true, description: 'Check this to trigger the join_metadata job.') string(name: 'BUILD_DIR', defaultValue: '/cmap/obelix/pod/prismSeq/', description: 'Output path to deposit build. Format should be /directory/PROJECT_CODE/BUILD_NAME') string(name: 'BUILD_NAME', defaultValue: '', description: 'Build name') string(name: 'SCREEN', defaultValue: '', description: 'Screen name from COMET, necessary if using COMET for sample metadata.') From 5273913a07a4000e37776842899e1781271a2797 Mon Sep 17 00:00:00 2001 From: jdavis3141 Date: Fri, 30 Aug 2024 09:38:00 -0400 Subject: [PATCH 039/127] rm extra param --- scripts/make_config_file.groovy | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/make_config_file.groovy b/scripts/make_config_file.groovy index 23eaf395..089348d3 100644 --- a/scripts/make_config_file.groovy +++ b/scripts/make_config_file.groovy @@ -14,7 +14,6 @@ pipeline { booleanParam(name: 'CBNORMALIZE', defaultValue: true, description: 'Check this to trigger the CBnormalize job.') booleanParam(name: 'COMPUTE_LFC', defaultValue: true, description: 'Check this to trigger the compute_l2fc job.') booleanParam(name: 'COLLAPSE', defaultValue: true, description: 'Check this to trigger the collapse job.') - booleanParam(name: 'JOIN_METADATA', defaultValue: true, description: 'Check this to trigger the join_metadata job.') booleanParam(name: 'REMOVE_DATA', defaultValue: false, description: 'Select if there is experimental data that needs to be removed before normalization. TODO: expand on this.') booleanParam(name: 'JOIN_METADATA', defaultValue: true, description: 'Check this to trigger the join_metadata job.') string(name: 'BUILD_DIR', defaultValue: '/cmap/obelix/pod/prismSeq/', description: 'Output path to deposit build. 
Format should be /directory/PROJECT_CODE/BUILD_NAME') From be9fa66e06c028c91bdbb67de387d3ff8248063c Mon Sep 17 00:00:00 2001 From: jdavis3141 Date: Fri, 30 Aug 2024 09:47:47 -0400 Subject: [PATCH 040/127] get rid of store_true --- scripts/collate_fastq_reads.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/collate_fastq_reads.R b/scripts/collate_fastq_reads.R index 08449d16..b5a03004 100644 --- a/scripts/collate_fastq_reads.R +++ b/scripts/collate_fastq_reads.R @@ -16,7 +16,7 @@ parser$add_argument("--sequencing_index_cols", default= "index_1,index_2", help = "Sequencing columns in the sample meta") parser$add_argument("--id_cols", default= "pcr_plate,pcr_well", help = "Columns that identify a unique PCR well") -parser$add_argument("--reverse_index2", action="store_true", default=FALSE, +parser$add_argument("--reverse_index2", type="logical", default=FALSE, help= "Reverse complement of index 2 for NovaSeq and NextSeq") parser$add_argument("--barcode_col", default= "forward_read_cl_barcode", help= "Name of the column in uncollapsed_raw_counts that contains the barcode sequences.") From 96c763cc602aee2591191d7296caaf446c5e0ba1 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 30 Aug 2024 13:20:51 -0400 Subject: [PATCH 041/127] Fix bug - added flowcell groups --- scripts/src/collate_fastq_reads.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index 26f0063f..3790f950 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -198,8 +198,8 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # Filter for the expected flowcells and summed up the reads over the ID cols. print('Summing up reads ...') raw_counts= uncollapsed_raw_counts %>% - dplyr::semi_join(expected_flowcells, by= c('flowcell_name', 'flowcell_lane')) %>% - dplyr::inner_join(sequencing_map, by= intersect(colnames(.), colnames(sequencing_map)), relationship= 'many-to-one') %>% + dplyr::inner_join(expected_flowcells, by= c('flowcell_name', 'flowcell_lane')) %>% + dplyr::inner_join(sequencing_map, by= sequencing_index_cols, relationship= 'many-to-one') %>% dplyr::group_by(pick(all_of(c(id_cols, barcode_col)))) %>% dplyr::summarize(n= sum(n)) %>% dplyr::ungroup() From 71d188dd8f363c29d361da6f1185ba7085451a4c Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 30 Aug 2024 13:59:27 -0400 Subject: [PATCH 042/127] Added id_cols as parameter --- scripts/filter_counts.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/filter_counts.sh b/scripts/filter_counts.sh index ba6609f7..a2423385 100644 --- a/scripts/filter_counts.sh +++ b/scripts/filter_counts.sh @@ -78,6 +78,7 @@ echo RAW_COUNTS is: $RAW_COUNTS echo CELL_LINE_META is: $CELL_LINE_META echo CONTROL_BARCODE_META is: $CONTROL_BARCODE_META echo CELL_SET_META is: $CELL_SET_META +echo ID_COLS is: $ID_COLS args=( -c "$RAW_COUNTS" @@ -85,6 +86,7 @@ args=( --cell_line_meta "$CELL_LINE_META" --CB_meta "$CONTROL_BARCODE_META" --cell_set_meta "$CELL_SET_META" +--id_cols "$ID_COLS" --out "$BUILD_DIR" --count_threshold "$COUNT_THRESHOLD" --pool_id "$PULL_POOL_ID" From 5fc37fa37a37acc11b2bf1e0ce95183d4e17743d Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 30 Aug 2024 14:07:57 -0400 Subject: [PATCH 043/127] Fixed "conflicting option strings" --- scripts/filter_counts.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/filter_counts.R b/scripts/filter_counts.R index 6fabf610..d46d7bcc 
100755 --- a/scripts/filter_counts.R +++ b/scripts/filter_counts.R @@ -20,7 +20,7 @@ parser$add_argument("-q", "--quietly", action="store_false", parser$add_argument("--wkdir", default=getwd(), help="Working directory") parser$add_argument("-c", "--raw_counts", default="raw_counts.csv", help = "path to file containing raw counts") parser$add_argument("-o", "--out", default="", help = "Output path. Default is working directory") -parser$add_argument("-s", "--sample_meta", default="sample_meta.csv", help= "Sample metadata") +parser$add_argument("--sample_meta", default="sample_meta.csv", help= "Sample metadata") parser$add_argument("--cell_line_meta", default="cell_line_meta.csv", help= "Cell Line metadata") parser$add_argument("--cell_set_meta", default="cell_set_meta.csv", help= "Cell set metadata") parser$add_argument("--assay_pool_meta", default="assay_pool_meta.txt", help = "Assay pool metadata") From a2354ae1d403a7c14c90275e6d49ca891e8c223b Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 30 Aug 2024 14:13:54 -0400 Subject: [PATCH 044/127] Undo previous change --- scripts/filter_counts.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/filter_counts.R b/scripts/filter_counts.R index d46d7bcc..6fabf610 100755 --- a/scripts/filter_counts.R +++ b/scripts/filter_counts.R @@ -20,7 +20,7 @@ parser$add_argument("-q", "--quietly", action="store_false", parser$add_argument("--wkdir", default=getwd(), help="Working directory") parser$add_argument("-c", "--raw_counts", default="raw_counts.csv", help = "path to file containing raw counts") parser$add_argument("-o", "--out", default="", help = "Output path. Default is working directory") -parser$add_argument("--sample_meta", default="sample_meta.csv", help= "Sample metadata") +parser$add_argument("-s", "--sample_meta", default="sample_meta.csv", help= "Sample metadata") parser$add_argument("--cell_line_meta", default="cell_line_meta.csv", help= "Cell Line metadata") parser$add_argument("--cell_set_meta", default="cell_set_meta.csv", help= "Cell set metadata") parser$add_argument("--assay_pool_meta", default="assay_pool_meta.txt", help = "Assay pool metadata") From 800491ec307bc38c890f11a14a9f5bdd44a0f7fa Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 30 Aug 2024 14:14:02 -0400 Subject: [PATCH 045/127] Remove duplicate --- scripts/filteredCounts_QC.R | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/filteredCounts_QC.R b/scripts/filteredCounts_QC.R index 5f8d2304..12e7bdc3 100755 --- a/scripts/filteredCounts_QC.R +++ b/scripts/filteredCounts_QC.R @@ -30,7 +30,6 @@ parser$add_argument("--annotated_counts", default= "annotated_counts.csv", parser$add_argument("--normalized_counts", default="normalized_counts.csv", help="path to file containing normalized counts") parser$add_argument("--lfc", default="l2fc.csv", help= "path to l2fc file") -parser$add_argument("-s", "--sample_meta", default="sample_meta.csv", help = "Sample metadata") parser$add_argument("--CB_meta", default="/data/CB_meta.csv", help = "control barcode metadata") parser$add_argument("--cell_set_meta", default="cell_set_meta.csv", help = "Cell set metadata") parser$add_argument("--cell_line_cols", default= 'DepMap_ID,CCLE_name', From 223d6957034efee677b973afd10575468c58e1fc Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 30 Aug 2024 14:17:05 -0400 Subject: [PATCH 046/127] Moved QC module towards end --- scripts/make_config_file.groovy | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/make_config_file.groovy 
b/scripts/make_config_file.groovy index 089348d3..5ddce157 100644 --- a/scripts/make_config_file.groovy +++ b/scripts/make_config_file.groovy @@ -10,11 +10,11 @@ pipeline { booleanParam(name: 'CREATE_CELLDB_METADATA', defaultValue: true, description: 'Check this to trigger the create_celldb_metadata job.') booleanParam(name: 'COLLATE_FASTQ_READS', defaultValue: true, description: 'Check this to trigger the collate_fastq_reads job.') booleanParam(name: 'FILTER_COUNTS', defaultValue: true, description: 'Check this to trigger the filter_counts job.') - booleanParam(name: 'FILTER_COUNTS_QC', defaultValue: true, description: 'Check this to trigger the filteredCounts_QC job.') booleanParam(name: 'CBNORMALIZE', defaultValue: true, description: 'Check this to trigger the CBnormalize job.') booleanParam(name: 'COMPUTE_LFC', defaultValue: true, description: 'Check this to trigger the compute_l2fc job.') booleanParam(name: 'COLLAPSE', defaultValue: true, description: 'Check this to trigger the collapse job.') booleanParam(name: 'REMOVE_DATA', defaultValue: false, description: 'Select if there is experimental data that needs to be removed before normalization. TODO: expand on this.') + booleanParam(name: 'FILTER_COUNTS_QC', defaultValue: true, description: 'Check this to trigger the filteredCounts_QC job.') booleanParam(name: 'JOIN_METADATA', defaultValue: true, description: 'Check this to trigger the join_metadata job.') string(name: 'BUILD_DIR', defaultValue: '/cmap/obelix/pod/prismSeq/', description: 'Output path to deposit build. Format should be /directory/PROJECT_CODE/BUILD_NAME') string(name: 'BUILD_NAME', defaultValue: '', description: 'Build name') @@ -189,9 +189,6 @@ pipeline { if (params.FILTER_COUNTS) { scriptsToRun.add('filter_counts.sh') } - if (params.FILTER_COUNTS_QC) { - scriptsToRun.add('filteredCounts_QC.sh') - } if (params.CBNORMALIZE) { scriptsToRun.add('CBnormalize.sh') } @@ -201,6 +198,9 @@ pipeline { if (params.COLLAPSE) { scriptsToRun.add('collapse_replicates.sh') } + if (params.FILTER_COUNTS_QC) { + scriptsToRun.add('filteredCounts_QC.sh') + } if (params.JOIN_METADATA) { scriptsToRun.add('join_metadata.sh') } From d41dd82689e04df25a8931da247f52a3e3562e16 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 30 Aug 2024 14:22:48 -0400 Subject: [PATCH 047/127] Removed count_col_name --- scripts/filteredCounts_QC.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/scripts/filteredCounts_QC.sh b/scripts/filteredCounts_QC.sh index 67debcb8..460d7b38 100644 --- a/scripts/filteredCounts_QC.sh +++ b/scripts/filteredCounts_QC.sh @@ -103,7 +103,6 @@ echo NORMALIZED_COUNTS is: $NORMALIZED_COUNTS echo CELL_SET_META is: $CELL_SET_META echo CONTROL_BARCODE_META is: $CONTROL_BARCODE_META echo COUNT_THRESHOLD is: $COUNT_THRESHOLD -echo COUNT_COL_NAME is: $COUNT_COL_NAME echo RAW_COUNTS_UNCOLLAPSED is: $RAW_COUNTS_UNCOLLAPSED echo LFC is: $LFC echo RAW_COUNTS is: $RAW_COUNTS @@ -118,7 +117,6 @@ args=( --CB_meta "$CONTROL_BARCODE_META" --out "$BUILD_DIR" --count_threshold "$COUNT_THRESHOLD" ---count_col_name "$COUNT_COL_NAME" --control_type "$CTL_TYPES" --raw_counts_uncollapsed "$RAW_COUNTS_UNCOLLAPSED" --raw_counts "$RAW_COUNTS" @@ -135,7 +133,6 @@ echo Rscript filteredCounts_QC.R --sample_meta $SAMPLE_META \ --sig_cols $SIG_COLS \ --out $BUILD_DIR \ --count_threshold $COUNT_THRESHOLD \ ---count_col_name $COUNT_COL_NAME \ --reverse_index2 $REVERSE_INDEX2 \ --control_type $CTL_TYPES \ --raw_counts_uncollapsed $RAW_COUNTS_UNCOLLAPSED \ From 33797c51c8a75cdd2e469b61b7f239618d80b0da Mon Sep 17 
00:00:00 2001 From: YuhJong Liu Date: Fri, 30 Aug 2024 14:43:17 -0400 Subject: [PATCH 048/127] Added raw_counts_uncollapsed as a parameter --- scripts/make_config_file.groovy | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/make_config_file.groovy b/scripts/make_config_file.groovy index 5ddce157..334e0b4d 100644 --- a/scripts/make_config_file.groovy +++ b/scripts/make_config_file.groovy @@ -41,6 +41,7 @@ pipeline { string(name: 'COUNT_THRESHOLD', defaultValue: '40', description: 'Minimum threshold to filter cell line counts by.') string(name: 'PSEUDOCOUNT', defaultValue: '20', description: 'Pseudocount for normalization.') string(name: 'CELL_LINE_META', defaultValue: 'cell_line_meta.csv', description: 'File in BUILD_DIR containing cell line metadata') + string(name: 'RAW_COUNTS_UNCOLLAPSED', defaultValue: 'raw_counts_uncollapsed.csv', description: 'Filename in BUILD_DIR containing nori output') string(name: 'RAW_COUNTS', defaultValue: 'raw_counts.csv', description: 'Filename in BUILD_DIR containing raw counts') string(name: 'FILTERED_COUNTS', defaultValue: 'filtered_counts.csv', description: 'File in BUILD_DIR containing filtered counts') string(name: 'LFC', defaultValue: 'l2fc.csv', description: 'File containing log2 fold change values') @@ -105,7 +106,8 @@ pipeline { 'RUN_NORM', 'CONTROL_COLS', 'COUNT_THRESHOLD', 'COUNT_COL_NAME', 'BUILD_NAME', 'CONVERT_SUSHI', 'PULL_POOL_ID', 'RUN_EPS_QC', 'PSEUDOCOUNT', 'REMOVE_DATA', 'DAYS', 'SEQUENCING_INDEX_COLS', 'RAW_COUNTS', 'CELL_SET_META', 'CELL_LINE_META', 'FILTERED_COUNTS', 'LFC', 'COUNTS', 'ANNOTATED_COUNTS', - 'COLLAPSED_VALUES', 'NORMALIZED_COUNTS', 'API_URL', 'FILTER_COUNTS_QC', 'ASSAY_POOL_META', 'SCREEN' + 'COLLAPSED_VALUES', 'NORMALIZED_COUNTS', 'API_URL', 'FILTER_COUNTS_QC', 'ASSAY_POOL_META', 'SCREEN', + 'RAW_COUNTS_UNCOLLAPSED' ] def config = [:] From 10a950b32d5f0b785e1343449efe7c032df7a529 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 30 Aug 2024 15:25:42 -0400 Subject: [PATCH 049/127] Allow module to run from non-nori outputs --- scripts/collate_fastq_reads.R | 77 ++++++++++++++++------------------- 1 file changed, 36 insertions(+), 41 deletions(-) diff --git a/scripts/collate_fastq_reads.R b/scripts/collate_fastq_reads.R index b5a03004..9da53c38 100644 --- a/scripts/collate_fastq_reads.R +++ b/scripts/collate_fastq_reads.R @@ -9,7 +9,7 @@ parser <- ArgumentParser() # specify desired options parser$add_argument("-v", "--verbose", action="store_true", default=TRUE, help="Print extra output [default]") parser$add_argument("-q", "--quietly", action="store_false", dest="verbose", help="Print little output") -parser$add_argument("-c", "--uncollapsed_raw_counts", default="raw_counts_uncollapsed.csv", +parser$add_argument('--raw_counts_uncollapsed', default="raw_counts_uncollapsed.csv", help="path to file containing uncollapsed raw counts file") parser$add_argument("--sample_meta", default="sample_meta.csv", help = "Sample metadata") parser$add_argument("--sequencing_index_cols", default= "index_1,index_2", @@ -30,45 +30,40 @@ if (args$out == "") { args$out = args$wkdir } -# Run collate_fastq_reads if uncollapsed file exists ---- -expected_file_path <- paste(args$out, "raw_counts_uncollapsed.csv", sep='/') +# Run collate_fastq_reads ---- +# Read in files and parse vector arguments +raw_counts_uncollapsed= data.table::fread(args$raw_counts_uncollapsed, header= T, sep= ',') +sample_meta= data.table::fread(args$sample_meta, header= T, sep= ',') -if(file.exists(expected_file_path)) { - # Read in files and 
parse vector arguments - uncollapsed_raw_counts= data.table::fread(expected_file_path, header= T, sep= ',', data.table= F) - sample_meta= data.table::fread(args$sample_meta, header= T, sep= ',', data.table= F) - - # Parse vector inputs - sequencing_index_cols= unlist(strsplit(args$sequencing_index_cols, ",")) - id_cols= unlist(strsplit(args$id_cols, ",")) - - # Validation: Check that sequencing_index_cols are from sample meta column names - if(!all(sequencing_index_cols %in% colnames(sample_meta))) { - stop(paste('The following sequencing_index_cols were not found in the sample meta: ', - sequencing_index_cols[!sequencing_index_cols %in% colnames(sample_meta)])) - } - - # Validation: Check that id_cols are from sample meta column names - if(!all(id_cols %in% colnames(sample_meta))) { - stop(paste('The following id_cols were not found in the sample meta: ', - id_cols[!id_cols %in% colnames(sample_meta)])) - } - - print("Collating fastq reads ...") - raw_counts= collate_fastq_reads(uncollapsed_raw_counts, sample_meta, - sequencing_index_cols= sequencing_index_cols, - id_cols= id_cols, - reverse_index2= args$reverse_index2, - barcode_col= args$barcode_col) - - # Validation: Basic file size check - if(nrow(raw_counts) == 0) { - stop('ERROR: Empty file generated. No rows in raw_counts output.') - } - - rc_out_file= paste(args$out, 'raw_counts.csv', sep='/') - print(paste("Writing raw_counts.csv to ", rc_out_file)) - write.csv(raw_counts, rc_out_file, row.names= FALSE, quote= FALSE) -} else { - print("Uncollapsed raw counts file not detected. Proceeding with generating filtered counts file.") +# Parse vector inputs +sequencing_index_cols= unlist(strsplit(args$sequencing_index_cols, ",")) +id_cols= unlist(strsplit(args$id_cols, ",")) + +# Validation: Check that sequencing_index_cols are from sample meta column names +if(!all(sequencing_index_cols %in% colnames(sample_meta))) { + stop(paste('The following sequencing_index_cols were not found in the sample meta: ', + sequencing_index_cols[!sequencing_index_cols %in% colnames(sample_meta)])) +} + +# Validation: Check that id_cols are from sample meta column names +if(!all(id_cols %in% colnames(sample_meta))) { + stop(paste('The following id_cols were not found in the sample meta: ', + id_cols[!id_cols %in% colnames(sample_meta)])) } + +print("Collating fastq reads ...") +raw_counts= collate_fastq_reads(uncollapsed_raw_counts= raw_counts_uncollapsed, + sample_meta= sample_meta, + sequencing_index_cols= sequencing_index_cols, + id_cols= id_cols, + reverse_index2= args$reverse_index2, + barcode_col= args$barcode_col) + +# Validation: Basic file size check +if(nrow(raw_counts) == 0) { + stop('ERROR: Empty file generated. 
No rows in raw_counts output.') +} + +rc_out_file= paste(args$out, 'raw_counts.csv', sep='/') +print(paste("Writing raw_counts.csv to ", rc_out_file)) +write.csv(raw_counts, rc_out_file, row.names= FALSE, quote= FALSE) From c3924ed2377faf304035c3823f6af90fc53f5e73 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 30 Aug 2024 15:26:13 -0400 Subject: [PATCH 050/127] Allow module to run from non-nori outputs --- scripts/src/collate_fastq_reads.R | 93 +++++++++++++++++-------------- 1 file changed, 52 insertions(+), 41 deletions(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index 3790f950..eff9ebe7 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -141,45 +141,57 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, stop('One or more sequencing_index_cols in the sample meta is not filled out.') } - # Determine which flowcell names + lanes are expected ---- - # "flowcell_names" and "flowcell_lanes" are strings that can contain more than one item. - # Columns can be parsed by splitting on the chars , ; : - # If there are multiple lane names and lane numbers, this uses the Cartesian product! - # Note: fread and read.csv keeps commas, read_csv DROPS commas - expected_flowcells= sample_meta %>% dplyr::distinct(flowcell_names, flowcell_lanes) %>% - dplyr::mutate(flowcell_name= base::strsplit(flowcell_names, split='[,;:]', fixed=F), - flowcell_lane= base::strsplit(flowcell_lanes, split='[,;:]', fixed=F)) %>% - tidyr::unnest(cols= flowcell_name) %>% tidyr::unnest(cols= flowcell_lane) %>% - dplyr::mutate(flowcell_lane= as.numeric(flowcell_lane)) - - # Print out expected flowcells from the sample meta. - print(paste0('Identified ', nrow(expected_flowcells), ' unique flowcell + lane combos in the sample meta ...')) - print(expected_flowcells) - - # Print warning if there are multiple flowcell names with multiple flowcell lanes. - multi_name_and_lanes= expected_flowcells %>% dplyr::filter(grepl(',:;', flowcell_names) & grepl(',:;', flowcell_names)) - if(nrow(multi_name_and_lanes) > 0) { - print('WARNING: Detected sample(s) sequenced over multiple flowcells and flowcell lanes.') - print('The function assumes that the same lanes were used for both flowcells.') - } - - # Validation: Check that all expected flowcell name + lanes are detected ---- - # Check that all expected flowcell name + lanes are present in uncollapsed raw counts. 
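# [editor's aside, not part of the patch] The expected_flowcells table built
# above splits the multi-valued flowcell_names/flowcell_lanes strings on the
# characters , ; : and crosses the pieces. A minimal sketch of that Cartesian
# expansion on toy data:
library(dplyr); library(tidyr)
meta= data.frame(flowcell_names= 'FC1;FC2', flowcell_lanes= '1,2')
meta %>%
  mutate(flowcell_name= strsplit(flowcell_names, split= '[,;:]'),
         flowcell_lane= strsplit(flowcell_lanes, split= '[,;:]')) %>%
  unnest(cols= flowcell_name) %>% unnest(cols= flowcell_lane)
# yields four rows: FC1/1, FC1/2, FC2/1, FC2/2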
- detected_flowcells= uncollapsed_raw_counts %>% dplyr::distinct(flowcell_name, flowcell_lane) - print(paste0('Identified ', nrow(detected_flowcells), ' unique flowcell + lane combos in the uncollapsed raw counts ...')) - print(detected_flowcells) - validate_detected_flowcells(detected_flowcells, expected_flowcells) - - # Validation: Check that sequencing_index_cols uniquely identify rows of sample meta ---- - if(!validate_unique_samples(sequencing_index_cols, sample_meta)) { - print('There may be multiple entries in the sample meta that have the same combination of sequencing index columns.') - stop('The specified sequencing index columns do NOT uniquely identify every PCR well.') - } - - # Validation: Check that id_cols uniquely identify rows of sample meta ---- - if(!validate_unique_samples(id_cols, sample_meta)) { - print('There may be multiple entries in the sample meta that have the same combination of ID columns.') - stop('The specified ID columns do NOT uniquely identify every PCR well.') + # If "flowcell_name" and "flowcell_lane" are present filter for valid flowcells ---- + # Note: Can this switch be tied to the sequencer type? + if(all_of(c('flowcell_name', 'flowcell_lane') %in% colnames(uncollapsed_raw_counts))) { + # Determine which flowcell names + lanes are expected ---- + # "flowcell_names" and "flowcell_lanes" are strings that can contain more than one item. + # Columns can be parsed by splitting on the chars , ; : + # If there are multiple lane names and lane numbers, this uses the Cartesian product! + # Note: fread and read.csv keeps commas, read_csv DROPS commas + expected_flowcells= sample_meta %>% dplyr::distinct(flowcell_names, flowcell_lanes) %>% + dplyr::mutate(flowcell_name= base::strsplit(flowcell_names, split='[,;:]', fixed=F), + flowcell_lane= base::strsplit(flowcell_lanes, split='[,;:]', fixed=F)) %>% + tidyr::unnest(cols= flowcell_name) %>% tidyr::unnest(cols= flowcell_lane) %>% + dplyr::mutate(flowcell_lane= as.numeric(flowcell_lane)) + + # Print out expected flowcells from the sample meta. + print(paste0('Identified ', nrow(expected_flowcells), ' unique flowcell + lane combos in the sample meta ...')) + print(expected_flowcells) + + # Print warning if there are multiple flowcell names with multiple flowcell lanes. + multi_name_and_lanes= expected_flowcells %>% dplyr::filter(grepl(',:;', flowcell_names) & grepl(',:;', flowcell_names)) + if(nrow(multi_name_and_lanes) > 0) { + print('WARNING: Detected sample(s) sequenced over multiple flowcells and flowcell lanes.') + print('The function assumes that the same lanes were used for both flowcells.') + } + + # Validation: Check that all expected flowcell name + lanes are detected ---- + # Check that all expected flowcell name + lanes are present in uncollapsed raw counts. 
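# [editor's aside, not part of the patch] The inner_join() filter added below
# follows patch 041, which swapped semi_join() for inner_join(). The semantic
# difference: semi_join() only filters the left table, while inner_join() also
# merges columns and duplicates rows when a key matches more than once. A
# minimal sketch:
library(dplyr)
reads= data.frame(flowcell_name= c('FC1', 'FC1', 'FC9'), flowcell_lane= c(1, 2, 1), n= c(10, 20, 30))
expected= data.frame(flowcell_name= 'FC1', flowcell_lane= c(1, 2))
semi_join(reads, expected, by= c('flowcell_name', 'flowcell_lane'))  # 2 rows, columns of reads only
inner_join(reads, expected, by= c('flowcell_name', 'flowcell_lane')) # 2 rows, plus any columns unique to expected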
+ detected_flowcells= uncollapsed_raw_counts %>% dplyr::distinct(flowcell_name, flowcell_lane) + print(paste0('Identified ', nrow(detected_flowcells), ' unique flowcell + lane combos in the uncollapsed raw counts ...')) + print(detected_flowcells) + validate_detected_flowcells(detected_flowcells, expected_flowcells) + + # Validation: Check that sequencing_index_cols uniquely identify rows of sample meta ---- + if(!validate_unique_samples(sequencing_index_cols, sample_meta)) { + print('There may be multiple entries in the sample meta that have the same combination of sequencing index columns.') + stop('The specified sequencing index columns do NOT uniquely identify every PCR well.') + } + + # Validation: Check that id_cols uniquely identify rows of sample meta ---- + if(!validate_unique_samples(id_cols, sample_meta)) { + print('There may be multiple entries in the sample meta that have the same combination of ID columns.') + stop('The specified ID columns do NOT uniquely identify every PCR well.') + } + + # Filter for expected flowcells ---- + uncollapsed_raw_counts= uncollapsed_raw_counts %>% + dplyr::inner_join(expected_flowcells, by= c('flowcell_name', 'flowcell_lane')) + + } else { + print('Flowcell_name and/or flowcell_lane were not detected in raw_counts_uncollapsed.') + print('Proceeding without flowcell filters ...') } # Create sequence map ---- @@ -197,8 +209,7 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # Create raw counts file ---- # Filter for the expected flowcells and summed up the reads over the ID cols. print('Summing up reads ...') - raw_counts= uncollapsed_raw_counts %>% - dplyr::inner_join(expected_flowcells, by= c('flowcell_name', 'flowcell_lane')) %>% + raw_counts= uncollapsed_raw_counts %>% dplyr::inner_join(sequencing_map, by= sequencing_index_cols, relationship= 'many-to-one') %>% dplyr::group_by(pick(all_of(c(id_cols, barcode_col)))) %>% dplyr::summarize(n= sum(n)) %>% dplyr::ungroup() From 18c8e91015265ca960b04c14832f04a1613a257c Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 30 Aug 2024 15:36:04 -0400 Subject: [PATCH 051/127] Added raw_counts_uncollapsed as param --- scripts/collate_fastq_reads.sh | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/scripts/collate_fastq_reads.sh b/scripts/collate_fastq_reads.sh index d33eedfa..7fe46214 100644 --- a/scripts/collate_fastq_reads.sh +++ b/scripts/collate_fastq_reads.sh @@ -63,12 +63,20 @@ then exit -1 fi +#Enforces abs paths +if [[ "$RAW_COUNTS_UNCOLLAPSED" = /* ]] +then + RAW_COUNTS_UNCOLLAPSED=$(ls $RAW_COUNTS_UNCOLLAPSED) +else + RAW_COUNTS_UNCOLLAPSED=$BUILD_DIR/$RAW_COUNTS_UNCOLLAPSED +fi + #Enforces abs paths if [[ "$SAMPLE_META" = /* ]] then - SAMPLE_META=$(ls $SAMPLE_META) + SAMPLE_META=$(ls $SAMPLE_META) else - SAMPLE_META=$BUILD_DIR/$SAMPLE_META + SAMPLE_META=$BUILD_DIR/$SAMPLE_META fi echo Build dir is: $BUILD_DIR @@ -80,6 +88,7 @@ echo Project Code: $PROJECT_CODE echo REVERSE_INDEX2 is: $REVERSE_INDEX2 args=( +--raw_counts_uncollapsed "$RAW_COUNTS_UNCOLLAPSED" --sample_meta "$SAMPLE_META" --out "$BUILD_DIR" --sequencing_index_cols="$SEQUENCING_INDEX_COLS" From 270829266f138af07d1a4de5816d01f480072a5b Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 30 Aug 2024 15:36:24 -0400 Subject: [PATCH 052/127] Added raw_counts_uncollapsed as param --- scripts/launch_job.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/launch_job.sh b/scripts/launch_job.sh index d2bbfd0c..6b8cf25d 100644 --- a/scripts/launch_job.sh +++ 
b/scripts/launch_job.sh @@ -29,6 +29,7 @@ PARAMS=( CONVERT_SUSHI PULL_POOL_ID RUN_EPS_QC PSEUDOCOUNT REMOVE_DATA DAYS SEQUENCING_INDEX_COLS RAW_COUNTS CELL_SET_META CELL_LINE_META FILTERED_COUNTS LFC COUNTS ANNOTATED_COUNTS COLLAPSED_VALUES NORMALIZED_COUNTS ASSAY_POOL_META + RAW_COUNTS_UNCOLLAPSED ) # Load parameters @@ -104,6 +105,7 @@ echo "Running in container:" -e COLLAPSED_VALUES="$COLLAPSED_VALUES" \ -e NORMALIZED_COUNTS="$NORMALIZED_COUNTS" \ -e ASSAY_POOL_META="$ASSAY_POOL_META" \ + -e RAW_COUNTS_UNCOLLAPSED="$RAW_COUNTS_UNCOLLAPSED"\ -v "$WORKSPACE:/workspace" \ -v /local/jenkins/.clue_api_key:/local/jenkins/.clue_api_key \ -v /cmap/data/vdb/prismSeq:/data \ From 6b3e8b24e8417a9952a296c2f5084bc24d706237 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 30 Aug 2024 15:42:11 -0400 Subject: [PATCH 053/127] Fixed boolean return --- scripts/src/collate_fastq_reads.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index eff9ebe7..b476feda 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -143,7 +143,7 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # If "flowcell_name" and "flowcell_lane" are present filter for valid flowcells ---- # Note: Can this switch be tied to the sequencer type? - if(all_of(c('flowcell_name', 'flowcell_lane') %in% colnames(uncollapsed_raw_counts))) { + if(base::all(c('flowcell_name', 'flowcell_lane') %in% colnames(uncollapsed_raw_counts))) { # Determine which flowcell names + lanes are expected ---- # "flowcell_names" and "flowcell_lanes" are strings that can contain more than one item. # Columns can be parsed by splitting on the chars , ; : From bb9ac63012861451d6def90759347c170fe45093 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 30 Aug 2024 16:01:41 -0400 Subject: [PATCH 054/127] Changed type to logical --- scripts/filteredCounts_QC.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/filteredCounts_QC.R b/scripts/filteredCounts_QC.R index 12e7bdc3..bacfca23 100755 --- a/scripts/filteredCounts_QC.R +++ b/scripts/filteredCounts_QC.R @@ -41,7 +41,8 @@ parser$add_argument("--sig_cols", default="cell_set,treatment,dose,dose_unit,day parser$add_argument("--control_type", default = "negcon", help= "how negative control wells are distinguished in the trt_type column") parser$add_argument("--count_threshold", default=40, help= "Low counts threshold") -parser$add_argument("--reverse_index2", default=FALSE, help = "Reverse index 2") +parser$add_argument("--reverse_index2", type="logical", default=FALSE, + help= "Reverse complement of index 2 for NovaSeq and NextSeq") parser$add_argument("-o","--out", default="", help = "Output path. 
Default is working directory") # get command line options, if help option encountered print help and exit From 7d8dda3f3c116d9d19987a51488c131ae73afadd Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 30 Aug 2024 16:14:41 -0400 Subject: [PATCH 055/127] Reference correct parameter --- scripts/join_metadata.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/join_metadata.sh b/scripts/join_metadata.sh index e1ef3a56..d13f1698 100644 --- a/scripts/join_metadata.sh +++ b/scripts/join_metadata.sh @@ -8,7 +8,7 @@ then exit -1 fi -if [ -z "$L2FC" ] +if [ -z "$LFC" ] then echo LFC parameter empty exit -1 @@ -64,14 +64,14 @@ echo LFC is: $LFC echo COLLAPSED_VALUES is: $COLLAPSED_VALUES echo SAMPLE_META is: $SAMPLE_META -echo Rscript join_metadata.R -c $LFC \ +echo Rscript join_metadata.R --lfc $LFC \ --collapsed_l2fc $COLLAPSED_VALUES \ --assay_pool_meta $ASSAY_POOL_META \ --out $BUILD_DIR \ --sig_cols $SIG_COLS \ --sample_meta $SAMPLE_META -Rscript join_metadata.R -c $LFC \ +Rscript join_metadata.R --lfc $LFC \ --collapsed_l2fc $COLLAPSED_VALUES \ --assay_pool_meta $ASSAY_POOL_META \ --out $BUILD_DIR \ From ba3cff8d7f8640b8ae64f573f3126d1a21d58781 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 30 Aug 2024 16:36:54 -0400 Subject: [PATCH 056/127] Added more prints and fixed a typo --- scripts/join_metadata.R | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/join_metadata.R b/scripts/join_metadata.R index c0d2e98d..861f8024 100644 --- a/scripts/join_metadata.R +++ b/scripts/join_metadata.R @@ -39,6 +39,7 @@ if(file.exists(args$assay_pool_meta)) { # Add sample meta and assay pool meta to l2fc table ---- if(file.exists(args$lfc)) { + print('Attempting to add sample_meta to l2fc file.') l2fc= data.table::fread(args$lfc, header= T, sep= ',') # Add sample meta columns to l2fc @@ -51,6 +52,7 @@ if(file.exists(args$lfc)) { l2fc_with_meta_columns= join_metadata(input_df= l2fc, metadata= sample_meta, key_cols= input_cols) # Add assay pool meta columns to l2fc + print('Attempting to add assay_pool_meta to l2fc file.') if(assay_pool_meta_exists) { l2fc_with_meta_columns= join_metadata(input_df= l2fc_with_meta_columns, metadata= input_assay_pool_meta, @@ -69,6 +71,7 @@ if(file.exists(args$lfc)) { # Add sample meta and assay pool meta to collapsed_l2fc table ---- if(file.exists(args$collapsed_l2fc)) { + print('Attempting to add sample_meta to collapsed l2fc.') collapsed_l2fc= data.table::fread(args$collapsed_l2fc, header= T, sep= ',') # Add sample meta columns to collapsed l2fc @@ -77,6 +80,7 @@ if(file.exists(args$collapsed_l2fc)) { # Add assay pool meta columns to collapsed l2fc if(assay_pool_meta_exists) { + print('Attempting to add assay_pool_meta to collapsed l2fc.') collapsed_l2fc_with_meta_columns= join_metadata(input_df= collapsed_l2fc_with_meta_columns, metadata= input_assay_pool_meta, key_cols= c('DepMap_ID', 'CCLE_name', 'cell_set')) @@ -87,7 +91,7 @@ if(file.exists(args$collapsed_l2fc)) { # Write out outpath= paste(args$out, 'collapsed_l2fc_with_meta_columns.csv', sep='/') print(paste("Writing collapsed_l2fc_with_meta_columns.csv to ", outpath)) - write.csv(collapsed_l2fc_with_meta_columns.csv, outpath, row.names= FALSE, quote= FALSE) + write.csv(collapsed_l2fc_with_meta_columns, outpath, row.names= FALSE, quote= FALSE) } else { print('WARNING: collapsed_l2fc.csv does not exist. 
Skipping this file.') } From 352debf7462800877409ab1c7f80dee3f4b62c6b Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 30 Aug 2024 17:54:20 -0400 Subject: [PATCH 057/127] Updated comments and prints --- scripts/collate_fastq_reads.R | 2 +- scripts/filter_counts.R | 20 ++++++++--------- scripts/filteredCounts_QC.R | 37 ++++++++----------------------- scripts/join_metadata.R | 4 ++-- scripts/src/QC_images.R | 26 ++++++++++++++-------- scripts/src/collate_fastq_reads.R | 9 +++++--- scripts/src/filter_raw_reads.R | 3 ++- scripts/src/join_metadata.R | 5 ++++- 8 files changed, 51 insertions(+), 55 deletions(-) diff --git a/scripts/collate_fastq_reads.R b/scripts/collate_fastq_reads.R index 9da53c38..ea51d9c8 100644 --- a/scripts/collate_fastq_reads.R +++ b/scripts/collate_fastq_reads.R @@ -51,7 +51,7 @@ if(!all(id_cols %in% colnames(sample_meta))) { id_cols[!id_cols %in% colnames(sample_meta)])) } -print("Collating fastq reads ...") +print("Calling collate_fastq_reads ...") raw_counts= collate_fastq_reads(uncollapsed_raw_counts= raw_counts_uncollapsed, sample_meta= sample_meta, sequencing_index_cols= sequencing_index_cols, diff --git a/scripts/filter_counts.R b/scripts/filter_counts.R index 6fabf610..034cbdfa 100755 --- a/scripts/filter_counts.R +++ b/scripts/filter_counts.R @@ -73,13 +73,13 @@ cell_line_meta %<>% # distinct() # This needs to be removed for sequencing_index_cols to work! - YL # Run filter_raw_reads ----- -print("creating filtered count file") -filtered_counts = filter_raw_reads(raw_counts= raw_counts, sample_meta= sample_meta, - cell_line_meta= cell_line_meta, - cell_set_meta= cell_set_meta, - CB_meta= CB_meta, - id_cols= id_cols, - count_threshold= as.numeric(args$count_threshold)) +print('Calling filter_raw_reads ...') +filtered_counts= filter_raw_reads(raw_counts= raw_counts, sample_meta= sample_meta, + cell_line_meta= cell_line_meta, + cell_set_meta= cell_set_meta, + CB_meta= CB_meta, + id_cols= id_cols, + count_threshold= as.numeric(args$count_threshold)) # Pulling pool_id when db_flag and pool_id flags are passed if (args$pool_id) { @@ -110,12 +110,12 @@ if(sum(cl_entries$n) == 0) { # Write out module outputs ---- unmapped_reads= filtered_counts$unmapped_reads unmapped_out = paste(args$out, 'unmapped_reads.csv', sep='/') -print(paste("writing unmapped reads to: ", unmapped_out)) +print(paste("Writing unmapped reads to: ", unmapped_out)) write.csv(unmapped_reads, unmapped_out, row.names=F) annotated_counts = filtered_counts$annotated_counts annot_out_file = paste(args$out, 'annotated_counts.csv', sep='/') -print(paste("writing annotated counts to: ", annot_out_file)) +print(paste("Writing annotated counts to: ", annot_out_file)) write.csv(annotated_counts, annot_out_file, row.names=F) filtered_counts = filtered_counts$filtered_counts @@ -139,6 +139,6 @@ if(args$rm_data == TRUE){ } filtrc_out_file = paste(args$out, 'filtered_counts.csv', sep='/') -print(paste("writing filtered counts csv to: ", filtrc_out_file)) +print(paste("Writing filtered counts csv to: ", filtrc_out_file)) write.csv(filtered_counts, filtrc_out_file, row.names=F, quote=F) diff --git a/scripts/filteredCounts_QC.R b/scripts/filteredCounts_QC.R index bacfca23..e98ca966 100755 --- a/scripts/filteredCounts_QC.R +++ b/scripts/filteredCounts_QC.R @@ -1,18 +1,14 @@ options(cli.unicode = FALSE) suppressPackageStartupMessages(library(argparse)) -suppressPackageStartupMessages(library(dplyr)) suppressPackageStartupMessages(library(scam)) suppressPackageStartupMessages(library(magrittr)) 
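# [editor's aside, not part of the patch] The library() consolidation in this
# hunk relies on library(tidyverse) attaching the core packages (ggplot2,
# tibble, tidyr, readr, purrr, dplyr, stringr, forcats) in one call; ggpubr is
# not part of the tidyverse, so its removal presumably means it was unused.
# To list the tidyverse packages:
tidyverse::tidyverse_packages()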
-suppressPackageStartupMessages(library(tidyr)) suppressPackageStartupMessages(library(reshape2)) -suppressPackageStartupMessages(library(tibble)) suppressPackageStartupMessages(library(stringr)) suppressPackageStartupMessages(library(grDevices)) -suppressPackageStartupMessages(library(ggplot2)) -suppressPackageStartupMessages(library(ggpubr)) +suppressPackageStartupMessages(library(tidyverse)) suppressPackageStartupMessages(library(scales)) # for out of bound handling in plots -suppressPackageStartupMessages(library(ggpmisc)) # with ggplot to add fit line and labels -suppressPackageStartupMessages(library(WGCNA)) +suppressPackageStartupMessages(library(ggpmisc)) # with ggplot to add linear fit labels +suppressPackageStartupMessages(library(WGCNA)) # for faster correlations source("/workspace/scripts/src/QC_images.R") # Argument parser ---- @@ -30,8 +26,6 @@ parser$add_argument("--annotated_counts", default= "annotated_counts.csv", parser$add_argument("--normalized_counts", default="normalized_counts.csv", help="path to file containing normalized counts") parser$add_argument("--lfc", default="l2fc.csv", help= "path to l2fc file") -parser$add_argument("--CB_meta", default="/data/CB_meta.csv", help = "control barcode metadata") -parser$add_argument("--cell_set_meta", default="cell_set_meta.csv", help = "Cell set metadata") parser$add_argument("--cell_line_cols", default= 'DepMap_ID,CCLE_name', help= "Columns that identify cell lines or barcodes") parser$add_argument("--id_cols", default= 'pcr_plate,pcr_well', @@ -54,6 +48,8 @@ if (args$out == ""){ } # Read in files and pull out parameters ---- +sample_meta= data.table::fread(args$sample_meta, header= TRUE, sep= ',') + # Pipeline outputs raw_counts_uncollapsed= data.table::fread(args$raw_counts_uncollapsed, header= TRUE, sep= ',') raw_counts= data.table::fread(args$raw_counts, header= TRUE, sep= ',') @@ -65,37 +61,22 @@ if(file.exists(args$normalized_counts)) { } l2fc= data.table::fread(args$lfc, header= TRUE, sep= ',') -# Metadata files -sample_meta= data.table::fread(args$sample_meta, header= TRUE, sep= ',', data.table= FALSE) -CB_meta= data.table::fread(args$CB_meta, header=TRUE, sep=',', data.table=FALSE) -cell_set_meta = data.table::fread(args$cell_set_meta, header=TRUE, sep=',', data.table=FALSE) - # Parameters cell_line_cols = unlist(strsplit(args$cell_line_cols, ",")) id_cols= unlist(strsplit(args$id_cols, ",")) sig_cols= unlist(strsplit(args$sig_cols, ",")) control_type = args$control_type count_threshold= as.numeric(args$count_threshold) +# -# # If flag passed, use cell_set_meta file generated for the project via CellDB -# if (args$db_flag) { -# print("Calling cell_set_meta generated using CellDB") -# cell_set_meta = read.csv("cell_set_meta.csv") -# # Otherwise, use static file -# } else { -# print("Using static cell set metadata file to locate cell information.") -# cell_set_meta = read.csv(args$cell_set_meta) -# } - -print("Generating QC images ...") +# Call QC images function ---- +print("Calling QC images ...") QC_images(raw_counts_uncollapsed= raw_counts_uncollapsed, raw_counts= raw_counts, annotated_counts= annotated_counts, normalized_counts= normalized_counts, l2fc= l2fc, - sample_meta= sample_meta, - CB_meta= CB_meta, - cell_set_meta= cell_set_meta, + sample_meta= sample_meta, cell_line_cols= c('DepMap_ID', 'CCLE_name'), id_cols= id_cols, sig_cols= sig_cols, diff --git a/scripts/join_metadata.R b/scripts/join_metadata.R index 861f8024..877509f1 100644 --- a/scripts/join_metadata.R +++ b/scripts/join_metadata.R @@ -24,8 
+24,8 @@ if (args$out == "") { sample_meta= data.table::fread(args$sample_meta, header= T, sep= ',') sig_cols= unlist(strsplit(args$sig_cols, ",")) -# For assay pool meta, check if it exists. If so, then filter it for relavent cell_sets/davepool_ids -# and select and rename some columns. +# For assay pool meta, check if it exists. If so, then filter it for relevant cell_sets/davepool_ids +# and select/rename some columns. assay_pool_meta_exists= FALSE if(file.exists(args$assay_pool_meta)) { assay_pool_meta_exists= TRUE # Update boolean diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R index 09b2b381..3ad0db66 100755 --- a/scripts/src/QC_images.R +++ b/scripts/src/QC_images.R @@ -23,6 +23,7 @@ validate_columns_exist= function(selected_cols, df) { #' #' Generates some simple summaries for each unique index. #' +#' @import tidyverse #' @param df A dataframe which must contain the column "n" which represents the count of a read. #' @param index_col The name of the column contain the index barcodes as a string. This column must be present in "df". #' @param valid_indices. A vector of all the valid indices for "index_col". @@ -87,6 +88,7 @@ create_qc_table= function(raw_counts_uncollapsed, raw_counts, filtered_counts, v #' Creates the total counts barplot with bars colored by the barcode type, #' either a cell line barcode or control barcode. #' +#' @import tidyverse #' @param filtered_counts Filtered counts dataframe. #' @param id_cols Vector of columns names that identify each sample. #' @param facet_col String name of the column in filtered_counts to facet the plot. @@ -106,6 +108,7 @@ create_total_counts_barplot= function(filtered_counts, id_cols, facet_col= NA) { dplyr::group_by(pick(all_of(na.omit(c('sample_id', facet_col, 'barcode_type'))))) %>% dplyr::summarise(total_counts= sum(n)) %>% dplyr::ungroup() + # Create total counts plot total_counts_plot= total_counts %>% ggplot(aes(x=sample_id, y=total_counts, fill=barcode_type)) + geom_col(alpha=0.75, position='identity') + @@ -125,6 +128,7 @@ create_total_counts_barplot= function(filtered_counts, id_cols, facet_col= NA) { #' the total cell line counts on teh y axis. The parameter "include_ctrl_bcs" can be used to include the control #' barcodes in the cell line count. #' +#' @import tidyverse #' @param filtered_counts Filtered counts dataframe. #' @param id_cols Vector of column names that identify each sample. #' @param facet_col String name of the column in filtered_counts to facet the plot. @@ -184,6 +188,7 @@ create_recovery_barplot= function(filtered_counts, id_cols, facet_col= NA, value #' #' Creates a line plot of the cumulative reads. #' +#' @import tidyverse #' @param input_df Input dataframe. Usually is the filtered_counts dataframe. #' @param id_cols Vector of column names that identify every PCR well. #' @param counts_col Name of the column that contains the values. Defaults to "n". @@ -244,8 +249,8 @@ create_cdf_plot= function(input_df, id_cols, counts_col= 'n', mark1= 0.5, mark2= output_plot= data_for_plot %>% ggplot(aes(x= rank_pct, y=cum_pct)) + # Color control barcodes if specified - { if(contains_cbs) geom_point(. %>% dplyr::filter(!is.na(Name)), - mapping= aes(x= rank_pct, y=cum_pct, color=reorder(Name, log2_dose)), size= 2) } + + {if(contains_cbs) geom_point(. 
%>% dplyr::filter(!is.na(Name)),
+                                     mapping= aes(x= rank_pct, y=cum_pct, color=reorder(Name, log2_dose)), size= 2)} +
     geom_line(color='black') +
     # point for mark1 of counts
     geom_segment(aes(x= -Inf , y= mark1, xend= mark1_loc, yend = mark1), color= 'black', linetype= 2) +
@@ -269,6 +274,7 @@ create_cdf_plot= function(input_df, id_cols, counts_col= 'n', mark1= 0.5, mark2=
 #'
 #' Creates a scatter plot of the control barcodes.
 #'
+#' @import tidyverse
 #' @param normalized_counts Dataframe output from the normalize module.
 #' @param id_cols Vector of column names that identify every PCR well.
 #' @param value_col Name of the column that contains the values.
@@ -376,6 +382,8 @@ create_cor_heatmap= function(input_df, row_id_cols, col_id_cols, value_col,
 #'
 #' From a long table, creates scatter plots of two replicates.
 #'
+#' @import tidyverse
+#' @import ggpmisc
 #' @param input_df Dataframe.
 #' @param cell_line_cols List of column names used to identify each cell line or control barcode.
 #' @param replicate_group_cols List of column names that describe a group of similar conditions.
@@ -441,8 +449,6 @@ create_replicate_scatterplots= function(input_df, cell_line_cols, replicate_grou
 #' @param normalized_counts Normalized counts dataframe from the normalize module. This is an optional parameter.
 #' @param l2fc L2FC dataframe from the compute_l2fc module. This is used for the bio_reps plot.
 #' @param sample_meta Dataframe of the sample metadata for the sequencing run.
-#' @param CB_meta Dataframe of the control barcode metadata. This is only used for the CDF plot.
-#' @param cell_set_meta Dataframe of the cell set metadata. This is only used for the CDF plot.
 #' @param cell_line_cols Vector of sample meta column names used to describe a cell line or barcode.
 #' @param id_cols Vector of sample meta column names used to identify each PCR well.
 #'                This defaults to "pcr_plate", "pcr_well".
@@ -454,7 +460,7 @@ create_replicate_scatterplots= function(input_df, cell_line_cols, replicate_grou
 #' @returns NA. QC images are written out to the specified folder.
 QC_images= function(raw_counts_uncollapsed, raw_counts,
                     annotated_counts, normalized_counts= NA, l2fc,
-                    sample_meta, CB_meta, cell_set_meta,
+                    sample_meta,
                     cell_line_cols, id_cols= c('pcr_plate', 'pcr_well'), sig_cols,
                     control_type= 'negcon', count_threshold= 40,
@@ -463,8 +469,9 @@ QC_images= function(raw_counts_uncollapsed, raw_counts,
   require(tidyverse)
   require(magrittr)
   require(reshape2)
-  require(scales)
   require(WGCNA)
+  require(scales)
+  require(ggpmisc)
   
   # Some preprocessing ----
   # Set out directory if none is specified.
@@ -480,7 +487,7 @@ QC_images= function(raw_counts_uncollapsed, raw_counts,
   cb_check= sample_meta %>% dplyr::filter(control_barcodes %in% c("Y", "T", T),
                                           !(trt_type %in% c("empty", "", "CB_only")) & !is.na(trt_type))
-  contains_cbs= ifelse(nrow(cb_check)!= 0, T, F)
+  contains_cbs= ifelse(nrow(cb_check)!= 0, TRUE, FALSE)
   
   # Pull filtered counts from annotated counts
   filtered_counts= annotated_counts %>% dplyr::filter(expected_read)
@@ -684,7 +691,8 @@ QC_images= function(raw_counts_uncollapsed, raw_counts,
   if(contains_cbs & is.data.frame(normalized_counts)) {
     print("8. 
Generating control_barcode_trend image") potential_error= base::tryCatch({ - trend_sc= create_ctrlBC_scatterplots(normalized_counts, id_cols, value_col= 'log2_n') + trend_sc= create_ctrlBC_scatterplots(normalized_counts %>% dplyr::filter(control_barcodes %in% c("Y", "T", T)), + id_cols, value_col= 'log2_n') pdf(file=paste(out, "control_barcode_trend.pdf", sep="/"), width=sqrt(num_profiles)*2, height=sqrt(num_profiles)*2) @@ -837,7 +845,7 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, } # End _________________________ ---- - print('QC finishing') + print('QCs finishing!') if(length(na.omit(skipped_qcs)) != 0) { print(paste0('WARNING: The following ', length(skipped_qcs), ' QCs encountered errors and were skipped - ')) print(na.omit(skipped_qcs)) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index b476feda..1e038526 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -83,6 +83,7 @@ validate_detected_flowcells= function(detected_flowcells, expected_flowcells) { #' #' @param uncollapsed_raw_counts Dataframe of reads from all the fastq files with the following columns - #' "flowcell_name", "flowcell_lane", "index_1", "index_2", "forward_read_cl_barcode", and "n". +#' The flowcell columns are optional. If they do not exists, flowcell filters will be skipped. #' @param sample_meta Sample metadata generate for the project which may contain the following columns - #' "flowcell_names", "flowcell_lanes", "index_1", "index_2". The sample meta MUST contain #' "flowcell_names" and "flowcell_lanes" for filtering. @@ -141,9 +142,11 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, stop('One or more sequencing_index_cols in the sample meta is not filled out.') } - # If "flowcell_name" and "flowcell_lane" are present filter for valid flowcells ---- + # If "flowcell_name" and "flowcell_lane" are present, filter for valid flowcells ---- # Note: Can this switch be tied to the sequencer type? if(base::all(c('flowcell_name', 'flowcell_lane') %in% colnames(uncollapsed_raw_counts))) { + print('Detecting flowcells. Filtering for valid flowcells ...') + # Determine which flowcell names + lanes are expected ---- # "flowcell_names" and "flowcell_lanes" are strings that can contain more than one item. # Columns can be parsed by splitting on the chars , ; : @@ -191,7 +194,7 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, } else { print('Flowcell_name and/or flowcell_lane were not detected in raw_counts_uncollapsed.') - print('Proceeding without flowcell filters ...') + print('Proceeding without filtering flowcells ...') } # Create sequence map ---- @@ -223,6 +226,6 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, print('Warning: Low index purity!') } else {} - print('Done!') + print('Collate_fastq_reads has completed!') return(raw_counts) } diff --git a/scripts/src/filter_raw_reads.R b/scripts/src/filter_raw_reads.R index 32cbbe72..c8c3a939 100755 --- a/scripts/src/filter_raw_reads.R +++ b/scripts/src/filter_raw_reads.R @@ -119,7 +119,7 @@ filter_raw_reads = function(raw_counts, # Validation: Check that cell sets do not contain duplicate LUAs ---- # This will produce a warning if a LUA appears in a cell set more than once! - # This currently does NOT result in an error. Error avoided using a distinct when creating the template + # This currently does NOT result in an error. Error avoided using a distinct when creating the template. 
validate_cell_set_luas(sample_meta, cell_set_meta) # Split off unmapped reads ---- @@ -191,6 +191,7 @@ filter_raw_reads = function(raw_counts, print('Warning: Low cell line purity!') } + print('Filter_raw_reads has completed!') return(list(unmapped_reads= unmapped_reads, annotated_counts= annotated_counts, filtered_counts= filtered_counts)) diff --git a/scripts/src/join_metadata.R b/scripts/src/join_metadata.R index 3842054c..4f3f7e95 100644 --- a/scripts/src/join_metadata.R +++ b/scripts/src/join_metadata.R @@ -23,11 +23,14 @@ validate_columns_exist= function(selected_cols, df) { #' #' Joins a given data frame with the sample meta. #' +#' @import tidyverse #' @param input_df Input dataframe that should contain the columns specified in the "key_cols" parameter and "cell_set". #' @param metadata Dataframe of the sample meta used in the run. #' @param key_cols Vector of column names used as identifiers in the sample meta. #' @returns Data frame with additional columns from the sample meta. join_metadata= function(input_df, metadata, key_cols) { + require(tidyverse) + # Validation: Check that key_cols are present in df ---- if(validate_columns_exist(key_cols, input_df) == FALSE) { stop('Not all key_cols (printed above) are present in the provided dataframe.') @@ -51,7 +54,7 @@ join_metadata= function(input_df, metadata, key_cols) { dplyr::select(-tidyselect::ends_with('.y')) # Validation: Check that merge did not explode ---- - print(paste0(' Input df rows: ', nrow(input_df))) + print(paste0('Input df rows: ', nrow(input_df))) print(paste0('Output df rows: ', nrow(output_df))) if(nrow(input_df) < nrow(output_df)) { stop('Metadata join is producing more rows than expected!') From 144d097dda0c04dcb8357de91ebaa7638572ae88 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 6 Sep 2024 13:10:43 -0400 Subject: [PATCH 058/127] Update collate_fastq_reads.R --- scripts/src/collate_fastq_reads.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index 1e038526..7298d0d1 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -214,7 +214,7 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, print('Summing up reads ...') raw_counts= uncollapsed_raw_counts %>% dplyr::inner_join(sequencing_map, by= sequencing_index_cols, relationship= 'many-to-one') %>% - dplyr::group_by(pick(all_of(c(id_cols, barcode_col)))) %>% + dplyr::group_by(pick(all_of(c(id_cols, barcode_col)))) %>% dplyr::summarize(n= sum(n)) %>% dplyr::ungroup() # Calculate index purity ---- From 28280ac9d9a50d2011e97295be0ea164a0e50773 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 6 Sep 2024 18:03:14 -0400 Subject: [PATCH 059/127] Fixed typo in conflict resolution --- scripts/filter_counts.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/filter_counts.R b/scripts/filter_counts.R index 2111bc12..fb55a52f 100755 --- a/scripts/filter_counts.R +++ b/scripts/filter_counts.R @@ -24,9 +24,8 @@ parser$add_argument("-s", "--sample_meta", default="sample_meta.csv", help= "Sam parser$add_argument("--cell_line_meta", default="cell_line_meta.csv", help= "Cell Line metadata") parser$add_argument("--cell_set_meta", default="cell_set_meta.csv", help= "Cell set metadata") parser$add_argument("--assay_pool_meta", default="assay_pool_meta.txt", help = "Assay pool metadata") -parser$add_argument("--id_cols", default= "pcr_plate,pcr_well", +parser$add_argument("--id_cols", default= 
"pcr_plate,pcr_well", help = "Columns to identify each PCR well") parser$add_argument("--CB_meta", default="CB_meta.csv", help = "Control Barcode metadata") - help = "Sequencing columns in the sample meta") parser$add_argument("--count_threshold", default= 40, help = "Low counts threshold") parser$add_argument("--rm_data", type="logical", help = "Remove bad experimental data") parser$add_argument("--pool_id", type="logical", help = "Pull pool IDs from CellDB.") From 8e458338d84faca973c1c2a40055e97e9a487617 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 6 Sep 2024 18:18:28 -0400 Subject: [PATCH 060/127] Dropped unused params --- scripts/filteredCounts_QC.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/scripts/filteredCounts_QC.sh b/scripts/filteredCounts_QC.sh index 7cfaa901..c2653172 100644 --- a/scripts/filteredCounts_QC.sh +++ b/scripts/filteredCounts_QC.sh @@ -113,8 +113,6 @@ args=( --annotated_counts "$ANNOTATED_COUNTS" --normalized_counts "$NORMALIZED_COUNTS" --sig_cols "$SIG_COLS" ---cell_set_meta "$CELL_SET_META" ---CB_meta "$CONTROL_BARCODE_META" --out "$BUILD_DIR" --count_threshold "$COUNT_THRESHOLD" --control_type "$CTL_TYPES" @@ -128,8 +126,6 @@ args=( echo Rscript filteredCounts_QC.R --sample_meta $SAMPLE_META \ --annotated_counts $ANNOTATED_COUNTS \ --normalized_counts $NORMALIZED_COUNTS \ ---cell_set_meta $CELL_SET_META \ ---CB_meta $CONTROL_BARCODE_META \ --sig_cols $SIG_COLS \ --out $BUILD_DIR \ --count_threshold $COUNT_THRESHOLD \ From 02db68bc745bbcf756d2d1be595d1fb88fb16a08 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Thu, 19 Sep 2024 15:29:11 -0400 Subject: [PATCH 061/127] Changed merges to data.table also commented out a stop error --- scripts/src/collate_fastq_reads.R | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index 7298d0d1..6727a25c 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -70,7 +70,7 @@ validate_detected_flowcells= function(detected_flowcells, expected_flowcells) { print('The following flowcells/lanes specified in the sample meta were not detected in the fastq reads.') print(missing_flowcells) print('Check that the sample meta is correct or that all fastq files are in the correct directory.') - stop('One or more flowcell specified in the sample meta was not detected.') + #stop('One or more flowcell specified in the sample meta was not detected.') } } @@ -112,7 +112,7 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, if(reverse_index2) { if('index_2' %in% colnames(sample_meta)) { print("Reverse-complementing index 2 barcode ...") - sample_meta$index_2= chartr("ATGC", "TACG", stringi::stri_reverse(sample_meta$index_2)) + sample_meta[, index_2 := chartr("ATGC", "TACG", stringi::stri_reverse(sample_meta$index_2))] } else { stop('Reverse index 2 is set to TRUE, but index_2 does not exists.') } @@ -189,8 +189,9 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, } # Filter for expected flowcells ---- - uncollapsed_raw_counts= uncollapsed_raw_counts %>% - dplyr::inner_join(expected_flowcells, by= c('flowcell_name', 'flowcell_lane')) + uncollapsed_raw_counts= data.table::merge.data.table( + uncollapsed_raw_counts, data.table::setDT(expected_flowcells), + by= c('flowcell_name', 'flowcell_lane'), allow.cartesian= FALSE) } else { print('Flowcell_name and/or flowcell_lane were not detected in raw_counts_uncollapsed.') @@ -212,10 +213,8 @@ collate_fastq_reads= 
function(uncollapsed_raw_counts, sample_meta, # Create raw counts file ---- # Filter for the expected flowcells and summed up the reads over the ID cols. print('Summing up reads ...') - raw_counts= uncollapsed_raw_counts %>% - dplyr::inner_join(sequencing_map, by= sequencing_index_cols, relationship= 'many-to-one') %>% - dplyr::group_by(pick(all_of(c(id_cols, barcode_col)))) %>% - dplyr::summarize(n= sum(n)) %>% dplyr::ungroup() + raw_counts= data.table::merge.data.table(uncollapsed_raw_counts, sequencing_map, by= sequencing_index_cols) + raw_counts= raw_counts[, .(n= sum(n)), by= c(id_cols, barcode_col)] # Calculate index purity ---- index_purity= sum(raw_counts$n) / sum(uncollapsed_raw_counts$n) From f1c1f2374d25d3b5764a4fc2b4e97d37638f8e40 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 20 Sep 2024 11:29:43 -0400 Subject: [PATCH 062/127] Run collate in chunks --- scripts/collate_fastq_reads.R | 45 +++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/scripts/collate_fastq_reads.R b/scripts/collate_fastq_reads.R index ea51d9c8..f0cd6aaa 100644 --- a/scripts/collate_fastq_reads.R +++ b/scripts/collate_fastq_reads.R @@ -30,40 +30,61 @@ if (args$out == "") { args$out = args$wkdir } -# Run collate_fastq_reads ---- +# Read in sample meta and parse argument strings ---- # Read in files and parse vector arguments -raw_counts_uncollapsed= data.table::fread(args$raw_counts_uncollapsed, header= T, sep= ',') sample_meta= data.table::fread(args$sample_meta, header= T, sep= ',') # Parse vector inputs sequencing_index_cols= unlist(strsplit(args$sequencing_index_cols, ",")) id_cols= unlist(strsplit(args$id_cols, ",")) -# Validation: Check that sequencing_index_cols are from sample meta column names +# Validation: Check that sequencing_index_cols are from sample meta column names ---- if(!all(sequencing_index_cols %in% colnames(sample_meta))) { stop(paste('The following sequencing_index_cols were not found in the sample meta: ', sequencing_index_cols[!sequencing_index_cols %in% colnames(sample_meta)])) } -# Validation: Check that id_cols are from sample meta column names +# Validation: Check that id_cols are from sample meta column names ---- if(!all(id_cols %in% colnames(sample_meta))) { stop(paste('The following id_cols were not found in the sample meta: ', id_cols[!id_cols %in% colnames(sample_meta)])) } -print("Calling collate_fastq_reads ...") -raw_counts= collate_fastq_reads(uncollapsed_raw_counts= raw_counts_uncollapsed, - sample_meta= sample_meta, - sequencing_index_cols= sequencing_index_cols, - id_cols= id_cols, - reverse_index2= args$reverse_index2, - barcode_col= args$barcode_col) +# Run collate_fastq_reads on chunks ---- +# Set up loop to process chunks +header_col_names= data.table::fread(args$raw_counts_uncollapsed, header=T, sep= ',', nrow= 0) %>% colnames() +chunk_size= 10^6 # Maximum number of rows in a chunk +chunk_idx= 1 # Counter to keep track of chunks in a loop +current_chunk_size= chunk_size # Variable for loop exit condition +chunk_collector= list() # List to collect processed chunks -# Validation: Basic file size check +# For each chunk, call collate +while(current_chunk_size == chunk_size) { + nori_chunk= data.table::fread(args$raw_counts_uncollapsed, header= F, sep= ',', + col.names= header_col_names, + nrow= chunk_size, skip= chunk_size * (chunk_idx - 1) + 1) + + current_chunk_size= nrow(nori_chunk) # set current chunk size to stop loop + print(paste('Working on chunk', chunk_idx, 'with', current_chunk_size, 'rows.', sep= ' 
'))
+  
+  chunk_collector[[chunk_idx]]= collate_fastq_reads(nori_chunk, sample_meta,
+                                                    sequencing_index_cols= sequencing_index_cols,
+                                                    id_cols= id_cols,
+                                                    reverse_index2= args$reverse_index2,
+                                                    barcode_col= args$barcode_col)
+  
+  chunk_idx= chunk_idx + 1
+}
+
+raw_counts= data.table::rbindlist(chunk_collector)
+raw_counts= raw_counts[, .(n= sum(n)), by= c(id_cols, args$barcode_col)]
+
+# Validation: Basic file size check ----
+if(nrow(raw_counts) == 0) {
   stop('ERROR: Empty file generated. No rows in raw_counts output.')
 }
 
+# Write out file ----
 rc_out_file= paste(args$out, 'raw_counts.csv', sep='/')
 print(paste("Writing raw_counts.csv to ", rc_out_file))
 write.csv(raw_counts, rc_out_file, row.names= FALSE, quote= FALSE)

From c4e20072a7ae5fef3d667cd21e8f999e2793f39a Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Fri, 20 Sep 2024 11:30:11 -0400
Subject: [PATCH 063/127] Removed inplace reverse index
---
 scripts/src/collate_fastq_reads.R | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R
index 6727a25c..d1e84b65 100644
--- a/scripts/src/collate_fastq_reads.R
+++ b/scripts/src/collate_fastq_reads.R
@@ -67,10 +67,9 @@ validate_detected_flowcells= function(detected_flowcells, expected_flowcells) {
   missing_flowcells= expected_flowcells %>%
     dplyr::anti_join(detected_flowcells, by= c('flowcell_name', 'flowcell_lane'))
   if(nrow(missing_flowcells) != 0) {
-    print('The following flowcells/lanes specified in the sample meta were not detected in the fastq reads.')
+    print('WARNING: The following flowcells/lanes specified in the sample meta were not detected in the fastq reads.')
     print(missing_flowcells)
     print('Check that the sample meta is correct or that all fastq files are in the correct directory.')
-    #stop('One or more flowcell specified in the sample meta was not detected.')
   }
 }
@@ -112,7 +111,7 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta,
   if(reverse_index2) {
     if('index_2' %in% colnames(sample_meta)) {
       print("Reverse-complementing index 2 barcode ...")
-      sample_meta[, index_2 := chartr("ATGC", "TACG", stringi::stri_reverse(sample_meta$index_2))]
+      sample_meta$index_2= chartr("ATGC", "TACG", stringi::stri_reverse(sample_meta$index_2))
     } else {
       stop('Reverse index 2 is set to TRUE, but index_2 does not exist.')
     }

From 24f693bbae7829156da87f00f0210ce657d3640a Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Fri, 20 Sep 2024 11:31:53 -0400
Subject: [PATCH 064/127] Changed a mutate to data.table inplace
---
 scripts/src/filter_raw_reads.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/src/filter_raw_reads.R b/scripts/src/filter_raw_reads.R
index c8c3a939..3899d46b 100755
--- a/scripts/src/filter_raw_reads.R
+++ b/scripts/src/filter_raw_reads.R
@@ -127,8 +127,8 @@ filter_raw_reads = function(raw_counts,
   # but do not map to known barcodes in PRISM.
   # Also sorted reads in descending order by read count. 
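# [Editor's note] The skip/nrow arithmetic in the PATCH 062 chunking loop above
# can be sanity-checked on its own; with a 1M-row chunk size and the header on
# line 1, each chunk starts one row past the previous chunk's end:
chunk_size= 10^6
sapply(1:3, function(chunk_idx) chunk_size * (chunk_idx - 1) + 1)
#> [1]       1 1000001 2000001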
print('Splitting off unmapped reads ...') - raw_counts %<>% dplyr::mutate(mapped= forward_read_cl_barcode %in% c(cell_line_meta$Sequence, CB_meta$Sequence)) - unmapped_reads= raw_counts %>% dplyr::filter(mapped == FALSE) %>% dplyr::select(-mapped) %>% + raw_counts[, mapped := forward_read_cl_barcode %in% c(cell_line_meta$Sequence, CB_meta$Sequence)] + unmapped_reads= raw_counts %>% dplyr::filter(mapped == FALSE) %>% dplyr::select(-mapped) %>% dplyr::arrange(dplyr::desc(n)) # Creating a template of all expected reads in the run ---- From bad695100c4e7b5344714d1267884536782aefe8 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 20 Sep 2024 12:52:29 -0400 Subject: [PATCH 065/127] Changed dplyr to data.table --- scripts/src/filter_raw_reads.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/src/filter_raw_reads.R b/scripts/src/filter_raw_reads.R index 3899d46b..015d0813 100755 --- a/scripts/src/filter_raw_reads.R +++ b/scripts/src/filter_raw_reads.R @@ -98,6 +98,7 @@ filter_raw_reads = function(raw_counts, count_threshold= 40) { require(tidyverse) require(magrittr) + browser() # Processing metadata and inputs ---- # CB meta is in log10 and should be converted to log2. @@ -128,8 +129,7 @@ filter_raw_reads = function(raw_counts, # Also sorted reads in descending order by read count. print('Splitting off unmapped reads ...') raw_counts[, mapped := forward_read_cl_barcode %in% c(cell_line_meta$Sequence, CB_meta$Sequence)] - unmapped_reads= raw_counts %>% dplyr::filter(mapped == FALSE) %>% dplyr::select(-mapped) %>% - dplyr::arrange(dplyr::desc(n)) + unmapped_reads= raw_counts[mapped==FALSE,][order(-n)][, mapped:= NULL] # Creating a template of all expected reads in the run ---- # Use all 4 meta data files to create a "template" dataframe where From ef18cf05700eab3d0e6c0b01286378a84ab7c24e Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 20 Sep 2024 12:55:39 -0400 Subject: [PATCH 066/127] Drop browser --- scripts/src/filter_raw_reads.R | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/src/filter_raw_reads.R b/scripts/src/filter_raw_reads.R index 015d0813..cd94c692 100755 --- a/scripts/src/filter_raw_reads.R +++ b/scripts/src/filter_raw_reads.R @@ -98,7 +98,6 @@ filter_raw_reads = function(raw_counts, count_threshold= 40) { require(tidyverse) require(magrittr) - browser() # Processing metadata and inputs ---- # CB meta is in log10 and should be converted to log2. 
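# [Editor's note] The dplyr -> data.table rewrites in PATCHes 064-066 above all
# rely on the same equivalence; a minimal, self-contained sketch with toy data
# (the barcodes and counts below are made up):
library(dplyr)
library(data.table)

toy= data.frame(forward_read_cl_barcode= c('AAA', 'CCC', 'TTT'), n= c(5L, 20L, 1L))
known= c('AAA', 'CCC')

# dplyr: flag, filter, sort descending, then drop the helper column.
dplyr_out= toy %>%
  dplyr::mutate(mapped= forward_read_cl_barcode %in% known) %>%
  dplyr::filter(mapped == FALSE) %>%
  dplyr::arrange(dplyr::desc(n)) %>%
  dplyr::select(-mapped)

# data.table: the same steps as an in-place flag plus chained subsetting.
toy_dt= data.table::as.data.table(toy)
toy_dt[, mapped := forward_read_cl_barcode %in% known]
dt_out= toy_dt[mapped == FALSE,][order(-n)][, mapped := NULL]

stopifnot(identical(dplyr_out$n, dt_out$n))  # both keep only the unmapped TTT row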
From ea388607db34d2f3d6642caeab2105f845ef9c46 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 20 Sep 2024 13:00:52 -0400 Subject: [PATCH 067/127] Added escape for empty chunks --- scripts/src/collate_fastq_reads.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index d1e84b65..dc8d0ee1 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -215,6 +215,12 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, raw_counts= data.table::merge.data.table(uncollapsed_raw_counts, sequencing_map, by= sequencing_index_cols) raw_counts= raw_counts[, .(n= sum(n)), by= c(id_cols, barcode_col)] + # Escape for when a chunk contains invalid sequencing locations + if(nrow(raw_counts) == 0) { + print('WARNING: raw_counts is empty!') + return(raw_counts) + } + # Calculate index purity ---- index_purity= sum(raw_counts$n) / sum(uncollapsed_raw_counts$n) print(paste0('Index purity: ', round(index_purity, 4))) From 51f5c42d37b362a4c3bc057a5af9e4cefc59084b Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Tue, 24 Sep 2024 17:36:49 -0400 Subject: [PATCH 068/127] Wrapped chunking in a function --- scripts/src/collate_fastq_reads.R | 102 ++++++++++++++++++++++-------- 1 file changed, 77 insertions(+), 25 deletions(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index dc8d0ee1..23ce723a 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -73,6 +73,38 @@ validate_detected_flowcells= function(detected_flowcells, expected_flowcells) { } } +#' process_in_chunks +#' +#' This function runs some action over chunks of a large file. At the end, all chunks are +#' appended together. +#' +#' @param large_file_path description +#' @param chunk_size description +#' @param action A function passed to act on each chunk +process_in_chunks= function(large_file_path, chunk_size= 10^6, action, ...) { + + header_col_names= data.table::fread(large_file_path, header= TRUE, sep= ',', nrow= 0) %>% colnames() + chunk_idx= 1 # Counter to keep track of chunks in a loop + current_chunk_size= chunk_size # Variable for loop exit condition + chunk_collector= list() # List to collect processed chunks + + # For each chunk, call collate + while(current_chunk_size == chunk_size) { + current_chunk= data.table::fread(large_file_path, header= FALSE, sep= ',', + col.names= header_col_names, + nrow= chunk_size, skip= chunk_size * (chunk_idx - 1) + 1) + + current_chunk_size= nrow(current_chunk) # set current chunk size to stop loop + print(paste('Working on chunk', chunk_idx, 'with', current_chunk_size, 'rows.', sep= ' ')) + + chunk_collector[[chunk_idx]]= do.call(action, list(current_chunk, ...)) + chunk_idx= chunk_idx + 1 + } + + output_table= data.table::rbindlist(chunk_collector) + return(output_table) +} + #' collate_fastq_reads #' #' This function takes in the fastq reads (uncollapsed_raw_counts) and @@ -100,12 +132,14 @@ validate_detected_flowcells= function(detected_flowcells, expected_flowcells) { #' @param barcode_col String name of the column in uncollapsed_raw_counts that contains the sequences. #' @returns Returns a dataframe with columns specified by the id_cols along with barcode_col, and "n". 
#' @import tidyverse +#' @import data.table collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, sequencing_index_cols= c('index_1', 'index_2'), id_cols= c('pcr_plate', 'pcr_well'), reverse_index2= FALSE, barcode_col= 'forward_read_cl_barcode') { require(tidyverse) + require(data.table) # Reverse index 2 if specified ---- if(reverse_index2) { @@ -117,6 +151,9 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, } } + # Create sequence map ---- + sequencing_map= sample_meta %>% dplyr::distinct(pick(all_of(c(sequencing_index_cols, id_cols)))) + # Validation: Check that flowcell_names and flowcell_lanes exist in the sample meta ---- if(!validate_columns_exist(c('flowcell_names', 'flowcell_lanes'), sample_meta)) { stop('The above column(s) are NOT present in the sample meta.') @@ -125,8 +162,7 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # Validation: Check that sequencing_index_cols exist in the sample meta ---- if(!validate_columns_exist(sequencing_index_cols, sample_meta)) { print('The following sequencing_index_cols are not present in the sample meta.') - print(sequencing_index_cols[!sequencing_index_cols %in% colnames(sample_meta)]) - stop('One or more sequencing_index_cols is NOT present in the sample meta.') + stop('The above sequencing_index_cols are NOT present in the sample meta.') } # Validation: Check that id_cols exist in the sample meta ---- @@ -141,6 +177,15 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, stop('One or more sequencing_index_cols in the sample meta is not filled out.') } + # Validation: Check that mapping is one to one ---- + check_mapping= sequencing_map %>% dplyr::group_by(pick(all_of(sequencing_index_cols))) %>% + dplyr::filter(dplyr::n() > 1) %>% dplyr::ungroup() + if(nrow(check_mapping) > 0) { + print('The following sequencing locations map to multiple conditions.') + print(check_mapping) + stop('The sequencing index columns do not map 1 to 1 to the ID columns.') + } + # If "flowcell_name" and "flowcell_lane" are present, filter for valid flowcells ---- # Note: Can this switch be tied to the sequencer type? if(base::all(c('flowcell_name', 'flowcell_lane') %in% colnames(uncollapsed_raw_counts))) { @@ -197,39 +242,46 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, print('Proceeding without filtering flowcells ...') } - # Create sequence map ---- - sequencing_map= sample_meta %>% dplyr::distinct(pick(all_of(c(sequencing_index_cols, id_cols)))) - - # Validation: Check that mapping is one to one ---- - check_mapping= sequencing_map %>% dplyr::group_by(pick(all_of(sequencing_index_cols))) %>% - dplyr::filter(dplyr::n() > 1) %>% dplyr::ungroup() - if(nrow(check_mapping) > 0) { - print('The following sequening locations map to multiple conditions.') - print(check_mapping) - stop('The sequencing index columns do not map 1 to 1 to the ID columns.') - } - - # Create raw counts file ---- + # Create summed_reads file ---- # Filter for the expected flowcells and summed up the reads over the ID cols. 
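# [Editor's note] The merge-and-sum just below is an inner join followed by a
# grouped aggregation; a self-contained illustration with invented indices and
# wells (not the pipeline's real metadata):
library(data.table)
reads= data.table::data.table(index_1= c('A', 'A', 'B', 'Z'),
                              forward_read_cl_barcode= c('x', 'x', 'y', 'x'),
                              n= c(2L, 3L, 4L, 9L))
seq_map= data.table::data.table(index_1= c('A', 'B'), pcr_well= c('A01', 'A02'))
# The inner join drops index 'Z' (no assigned well), which is the read loss
# that the chunk-level index purity check reports.
merged= data.table::merge.data.table(reads, seq_map, by= 'index_1')
summed= merged[, .(n= sum(n)), by= c('pcr_well', 'forward_read_cl_barcode')]
# summed: A01/x n= 5 and A02/y n= 4; the 9 reads from 'Z' are excluded.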
print('Summing up reads ...')
-  raw_counts= data.table::merge.data.table(uncollapsed_raw_counts, sequencing_map, by= sequencing_index_cols)
-  raw_counts= raw_counts[, .(n= sum(n)), by= c(id_cols, barcode_col)]
+  summed_reads= data.table::merge.data.table(uncollapsed_raw_counts, sequencing_map, by= sequencing_index_cols)
+  summed_reads= summed_reads[, .(n= sum(n)), by= c(id_cols, barcode_col)]
 
   # Escape for when a chunk contains invalid sequencing locations
-  if(nrow(raw_counts) == 0) {
-    print('WARNING: raw_counts is empty!')
-    return(raw_counts)
+  if(nrow(summed_reads) == 0) {
+    print('WARNING: summed_reads is empty!')
+    return(summed_reads)
   }
 
-  # Calculate index purity ----
-  index_purity= sum(raw_counts$n) / sum(uncollapsed_raw_counts$n)
-  print(paste0('Index purity: ', round(index_purity, 4)))
+  # Calculate index purity in a chunk ----
+  index_purity= sum(summed_reads$n) / sum(uncollapsed_raw_counts$n)
+  print(paste0('Index purity in chunk: ', round(index_purity, 4)))
   if(index_purity > 1) {
-    stop('ERROR: Index purity is greater than 1!')
+    stop('ERROR: Chunk index purity is greater than 1!')
   } else if(index_purity < 0.5) {
     print('Warning: Low index purity!')
   } else {}
   
   print('Collate_fastq_reads has completed!')
-  return(raw_counts)
+  return(summed_reads)
+}
+
+#' extract_known_barcodes
+#'
+#' This function splits a table of summed reads into reads that map to known
+#' PRISM barcodes and reads that do not.
+#'
+#' @param summed_reads Dataframe of summed reads containing the column specified by barcode_col and the column "n".
+#' @param known_barcodes A vector of known barcode sequences.
+#' @param barcode_col String name of the column in summed_reads that contains the sequences.
+extract_known_barcodes= function(summed_reads, known_barcodes, barcode_col= 'forward_read_cl_barcode') {
+  # Create boolean column of known or unknown
+  summed_reads[, known := get(barcode_col) %chin% known_barcodes]
+
+  # Filter using that boolean column
+  unknown_reads= summed_reads[known == FALSE,][order(-n)][, known := NULL]
+  summed_reads= summed_reads[known == TRUE,][, known := NULL]
+
+  return(list(unknown_reads= unknown_reads, known_reads= summed_reads))
 }

From 3fd1f90786403b1d848b09e86a138ec5e6dada22 Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Tue, 24 Sep 2024 17:37:33 -0400
Subject: [PATCH 069/127] Call chunking function and extract known reads
---
 scripts/collate_fastq_reads.R | 76 +++++++++++++++++------------------
 1 file changed, 37 insertions(+), 39 deletions(-)

diff --git a/scripts/collate_fastq_reads.R b/scripts/collate_fastq_reads.R
index f0cd6aaa..86a82298 100644
--- a/scripts/collate_fastq_reads.R
+++ b/scripts/collate_fastq_reads.R
@@ -2,6 +2,7 @@ options(cli.unicode = FALSE)
 library(argparse)
 library(magrittr)
 library(tidyverse)
+library(data.table)
 source("./src/collate_fastq_reads.R")
 
 # Argument parser ----
@@ -38,53 +39,50 @@
 sample_meta= data.table::fread(args$sample_meta, header= T, sep= ',')
 
 # Parse vector inputs
 sequencing_index_cols= unlist(strsplit(args$sequencing_index_cols, ","))
 id_cols= unlist(strsplit(args$id_cols, ","))
 
-# Validation: Check that sequencing_index_cols are from sample meta column names ----
-if(!all(sequencing_index_cols %in% colnames(sample_meta))) {
-  stop(paste('The following sequencing_index_cols were not found in the sample meta: ',
-             sequencing_index_cols[!sequencing_index_cols %in% colnames(sample_meta)]))
+# Validation: Check that sequencing_index_cols present in the sample meta ----
+if(!validate_columns_exist(sequencing_index_cols, sample_meta)) {
+  print('The following sequencing_index_cols are not present in the sample meta.')
+  stop('One or more 
sequencing_index_cols is NOT present in the sample meta.')
 }
 
-# Validation: Check that id_cols are from sample meta column names ----
-if(!all(id_cols %in% colnames(sample_meta))) {
-  stop(paste('The following id_cols were not found in the sample meta: ',
-             id_cols[!id_cols %in% colnames(sample_meta)]))
+# Validation: Check that id_cols are present in the sample meta ----
+if(!validate_columns_exist(id_cols, sample_meta)) {
+  stop('One or more id_cols is NOT present in the sample meta.')
 }
 
-# Run collate_fastq_reads on chunks ----
-# Set up loop to process chunks
-header_col_names= data.table::fread(args$raw_counts_uncollapsed, header=T, sep= ',', nrow= 0) %>% colnames()
-chunk_size= 10^6 # Maximum number of rows in a chunk
-chunk_idx= 1 # Counter to keep track of chunks in a loop
-current_chunk_size= chunk_size # Variable for loop exit condition
-chunk_collector= list() # List to collect processed chunks
+# Run collate_fastq_reads on chunks of raw_counts_uncollapsed.csv ----
+summed_reads= process_in_chunks(large_file_path= args$raw_counts_uncollapsed,
+                                chunk_size= 10^6,
+                                action= collate_fastq_reads,
+                                sample_meta= sample_meta,
+                                sequencing_index_cols= sequencing_index_cols,
+                                id_cols= id_cols,
+                                reverse_index2= args$reverse_index2,
+                                barcode_col= args$barcode_col)
 
-# For each chunk, call collate
-while(current_chunk_size == chunk_size) {
-  nori_chunk= data.table::fread(args$raw_counts_uncollapsed, header= F, sep= ',',
-                                col.names= header_col_names,
-                                nrow= chunk_size, skip= chunk_size * (chunk_idx - 1) + 1)
-  
-  current_chunk_size= nrow(nori_chunk) # set current chunk size to stop loop
-  print(paste('Working on chunk', chunk_idx, 'with', current_chunk_size, 'rows.', sep= ' '))
-  
-  chunk_collector[[chunk_idx]]= collate_fastq_reads(nori_chunk, sample_meta,
-                                                    sequencing_index_cols= sequencing_index_cols,
-                                                    id_cols= id_cols,
-                                                    reverse_index2= args$reverse_index2,
-                                                    barcode_col= args$barcode_col)
-  
-  chunk_idx= chunk_idx + 1
-}
+# Sum up the reads across the chunks afterwards!
+summed_reads= summed_reads[, .(n= sum(n)), by= c(id_cols, args$barcode_col)]
+
+# Split reads by either known or unknown ----
+# Reads are separated by whether or not the barcode exists in the PRISM library
+# Read in metadata to get list of all known barcodes
+cell_line_meta= data.table::fread(args$cell_line_meta, header= TRUE, sep= ',')
+CB_meta= data.table::fread(args$CB_meta, header= TRUE, sep= ',')
 
-raw_counts= data.table::rbindlist(chunk_collector)
-raw_counts= raw_counts[, .(n= sum(n)), by= c(id_cols, args$barcode_col)]
+# Call function to separate barcodes
+split_reads= extract_known_barcodes(summed_reads, unique(c(cell_line_meta$Sequence, CB_meta$Sequence)),
+                                    barcode_col= args$barcode_col)
 
 # Validation: Basic file size check ----
-if(nrow(raw_counts) == 0) {
+if(nrow(split_reads$known_reads) == 0) {
   stop('ERROR: Empty file generated. 
No rows in raw_counts output.')
 }
 
-# Write out file ----
-rc_out_file= paste(args$out, 'raw_counts.csv', sep='/')
-print(paste("Writing raw_counts.csv to ", rc_out_file))
-write.csv(raw_counts, rc_out_file, row.names= FALSE, quote= FALSE)
+# Write out files ----
+out_file= paste(args$out, 'unknown_reads.csv', sep='/')
+print(paste("Writing unknown_reads.csv to ", out_file))
+write.csv(split_reads$unknown_reads, out_file, row.names= FALSE, quote= FALSE)
+
+out_file= paste(args$out, 'known_reads.csv', sep='/')
+print(paste("Writing known_reads.csv to ", out_file))
+write.csv(split_reads$known_reads, out_file, row.names= FALSE, quote= FALSE)

From 48aac7435d8a609793116a5b809d7a1cb6fbbffb Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Tue, 24 Sep 2024 17:38:14 -0400
Subject: [PATCH 070/127] Removed unmapped reads part
---
 scripts/src/filter_raw_reads.R | 38 +++++++++++++++++++++-----------
 1 file changed, 26 insertions(+), 12 deletions(-)

diff --git a/scripts/src/filter_raw_reads.R b/scripts/src/filter_raw_reads.R
index cd94c692..4484f7d7 100755
--- a/scripts/src/filter_raw_reads.R
+++ b/scripts/src/filter_raw_reads.R
@@ -96,8 +96,8 @@ filter_raw_reads = function(raw_counts,
                             sample_meta, cell_line_meta, cell_set_meta, CB_meta,
                             id_cols= c('pcr_plate', 'pcr_well'),
                             count_threshold= 40) {
-  require(tidyverse)
   require(magrittr)
+  require(tidyverse)
   
   # Processing metadata and inputs ----
   # CB meta is in log10 and should be converted to log2.
@@ -122,14 +122,6 @@ filter_raw_reads = function(raw_counts,
   # This currently does NOT result in an error. Error avoided using a distinct when creating the template.
   validate_cell_set_luas(sample_meta, cell_set_meta)
   
-  # Split off unmapped reads ----
-  # Unmapped reads are defined as reads that are identified from valid PCR locations,
-  # but do not map to known barcodes in PRISM.
-  # Also sorted reads in descending order by read count.
-  print('Splitting off unmapped reads ...')
-  raw_counts[, mapped := forward_read_cl_barcode %in% c(cell_line_meta$Sequence, CB_meta$Sequence)]
-  unmapped_reads= raw_counts[mapped==FALSE,][order(-n)][, mapped:= NULL]
-  
   # Creating a template of all expected reads in the run ----
   # Use all 4 meta data files to create a "template" dataframe where
   # every row is a cell line that is expected in a PCR well.
@@ -157,7 +149,30 @@ filter_raw_reads = function(raw_counts,
   # Reads that do not match to the template are contaminants and,
   # reads that are only present in the template are missing/not detected by PCR. 
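# [Editor's note] A compact sketch of that annotate-against-template idea: a
# full join keeps contaminants (n > 0, expected_read FALSE) as well as expected
# barcodes that were never sequenced (n = 0). Column names here are illustrative.
library(dplyr)
library(tidyr)
observed= data.frame(barcode= c('AAA', 'GGG'), n= c(10L, 2L))
template= data.frame(barcode= c('AAA', 'CCC'), expected_read= TRUE)
annotated= observed %>%
  dplyr::full_join(template, by= 'barcode') %>%
  dplyr::mutate(n= tidyr::replace_na(n, 0L),
                expected_read= tidyr::replace_na(expected_read, FALSE))
# AAA: expected and detected; GGG: contaminant; CCC: expected but missing.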
print("Annotating reads ...") - annotated_counts= raw_counts %>% dplyr::filter(mapped) %>% + # Data.table version # + # # Left join cell_line_meta using data.table inplace left join + # raw_counts[cell_line_meta, base::setdiff(colnames(cell_line_meta), c('Sequence')) := + # base::mget(base::setdiff(colnames(cell_line_meta), c('Sequence'))), + # on= c('forward_read_cl_barcode' = 'Sequence')] + # # Left join CB_meta using data.table inplace left join + # raw_counts[CB_meta, base::setdiff(colnames(CB_meta), c('Sequence')) := + # base::mget(base::setdiff(colnames(CB_meta), c('Sequence'))), + # on= c('forward_read_cl_barcode' = 'Sequence')] + # # Left join CB_meta using data.table inplace left join + # raw_counts[sample_meta, base::setdiff(colnames(sample_meta), id_cols) := + # base::mget(base::setdiff(colnames(sample_meta), id_cols)), + # on= id_cols] + # data.table::setnames(raw_counts, 'forward_read_cl_barcode', 'Sequence') + # + # annotated_counts= data.table::merge.data.table( + # raw_counts, data.table::setDT(template %>% dplyr::mutate(expected_read= T)), + # by= intersect(colnames(template), colnames(raw_counts)), all.x= TRUE, all.y= TRUE, + # allow.cartesian= FALSE) %>% + # dplyr::select(!any_of(c('prism_cell_set', 'members', 'mapped'))) %>% + # dplyr::mutate(n= replace_na(n, 0), expected_read= replace_na(expected_read, F)) + + # Dplyr version # + annotated_counts= raw_counts %>% dplyr::left_join(cell_line_meta, by= join_by('forward_read_cl_barcode'=='Sequence'), relationship= 'many-to-one') %>% dplyr::left_join(CB_meta, by= join_by('forward_read_cl_barcode'=='Sequence'), @@ -191,8 +206,7 @@ filter_raw_reads = function(raw_counts, } print('Filter_raw_reads has completed!') - return(list(unmapped_reads= unmapped_reads, - annotated_counts= annotated_counts, + return(list(annotated_counts= annotated_counts, filtered_counts= filtered_counts)) } From dac0ca7c05da5884f0a717175aaddb165f261ac6 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Tue, 24 Sep 2024 17:46:30 -0400 Subject: [PATCH 071/127] Added cell line meta and CB meta --- scripts/collate_fastq_reads.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/collate_fastq_reads.R b/scripts/collate_fastq_reads.R index 86a82298..ca00205f 100644 --- a/scripts/collate_fastq_reads.R +++ b/scripts/collate_fastq_reads.R @@ -21,6 +21,8 @@ parser$add_argument("--reverse_index2", type="logical", default=FALSE, help= "Reverse complement of index 2 for NovaSeq and NextSeq") parser$add_argument("--barcode_col", default= "forward_read_cl_barcode", help= "Name of the column in uncollapsed_raw_counts that contains the barcode sequences.") +parser$add_argument("--cell_line_meta", default="cell_line_meta.csv", help= "Cell Line metadata") +parser$add_argument("--CB_meta", default="CB_meta.csv", help = "Control Barcode metadata") parser$add_argument("-o", "--out", default=getwd(), help = "Output path. 
Default is working directory") # get command line options, if help option encountered print help and exit From f8cd88be6f2150818095f840cb7c0d260558f737 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Tue, 24 Sep 2024 17:58:36 -0400 Subject: [PATCH 072/127] Added cell line meta and cb meta as inputs --- scripts/collate_fastq_reads.sh | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/scripts/collate_fastq_reads.sh b/scripts/collate_fastq_reads.sh index 7fe46214..16735e99 100644 --- a/scripts/collate_fastq_reads.sh +++ b/scripts/collate_fastq_reads.sh @@ -79,6 +79,22 @@ else SAMPLE_META=$BUILD_DIR/$SAMPLE_META fi +#Enforces abs paths +if [[ "$CELL_LINE_META" = /* ]] +then + CELL_LINE_META=$(ls $CELL_LINE_META) +else + CELL_LINE_META=$BUILD_DIR/$CELL_LINE_META +fi + +#Enforces abs paths +if [[ "$CONTROL_BARCODE_META" = /* ]] +then + CONTROL_BARCODE_META=$(ls $CONTROL_BARCODE_META) +else + CONTROL_BARCODE_META=$BUILD_DIR/$CONTROL_BARCODE_META +fi + echo Build dir is: $BUILD_DIR PROJECT_DIR=$(dirname "$BUILD_DIR") @@ -86,6 +102,8 @@ PROJECT_CODE=$(basename "$PROJECT_DIR") echo Project Code: $PROJECT_CODE echo REVERSE_INDEX2 is: $REVERSE_INDEX2 +echo CONTROL_BARCODE_META is: $CONTROL_BARCODE_META +echo CELL_LINE_META is: $CELL_LINE_META args=( --raw_counts_uncollapsed "$RAW_COUNTS_UNCOLLAPSED" @@ -94,6 +112,8 @@ args=( --sequencing_index_cols="$SEQUENCING_INDEX_COLS" --id_cols "$ID_COLS" --reverse_index2 "$REVERSE_INDEX2" +--cell_line_meta "$CELL_LINE_META" +--CB_meta "$CONTROL_BARCODE_META" ) echo Rscript collate_fastq_reads.R "${args[@]}" From 29a999c88e6803658c1d821f40e998d4d4719b61 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 25 Sep 2024 14:15:56 -0400 Subject: [PATCH 073/127] Fixed a comment --- scripts/src/collate_fastq_reads.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index 23ce723a..8cbd25b0 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -88,7 +88,7 @@ process_in_chunks= function(large_file_path, chunk_size= 10^6, action, ...) 
{ current_chunk_size= chunk_size # Variable for loop exit condition chunk_collector= list() # List to collect processed chunks - # For each chunk, call collate + # For each chunk, call an action while(current_chunk_size == chunk_size) { current_chunk= data.table::fread(large_file_path, header= FALSE, sep= ',', col.names= header_col_names, From 3a268ce1c092cf0f066fcc486eecf6170f485ec5 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 25 Sep 2024 14:16:11 -0400 Subject: [PATCH 074/127] Added chunking for some QC figures --- scripts/src/QC_images.R | 146 +++++++++++++++++++++------------------- 1 file changed, 78 insertions(+), 68 deletions(-) diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R index 3ad0db66..b0c0c1dd 100755 --- a/scripts/src/QC_images.R +++ b/scripts/src/QC_images.R @@ -39,7 +39,7 @@ get_index_summary= function(df, index_col, valid_indices) { output_summary= df %>% dplyr::group_by(pick(all_of(index_col))) %>% dplyr::summarise(idx_n= sum(n)) %>% dplyr::ungroup() %>% dplyr::mutate(fraction= round(idx_n/sum(idx_n), 5), - expected= ifelse(.[[index_col]] %in% valid_indices, T, F), + expected= ifelse(.[[index_col]] %chin% valid_indices, T, F), contains_n= ifelse(grepl('N', .[[index_col]]), T, F), lv_dist= apply(stringdist::stringdistmatrix(.[[index_col]], valid_indices, method="lv"), 1, min), @@ -59,22 +59,37 @@ get_index_summary= function(df, index_col, valid_indices) { #' @param value_col String name of the counts column present all three dataframes. #' @param file_path Location to write out the output. #' @returns Writes out a QC_table to the file_path. -create_qc_table= function(raw_counts_uncollapsed, raw_counts, filtered_counts, value_col= 'n', file_path) { - # Validation: Check that value_col is present in the three files. 
-  if(!validate_columns_exist(value_col, raw_counts_uncollapsed)) {
+create_qc_table= function(raw_counts_uncollapsed_filepath, unknown_reads, known_reads, filtered_counts,
+                          value_col= 'n', file_path) {
+  # Validations: Check that the path works and that value_col exists in all tables
+  if(!file.exists(raw_counts_uncollapsed_filepath)) {
+    stop('Cannot find the raw counts uncollapsed file.')
+  }
+  rcu_headers= data.table::fread(raw_counts_uncollapsed_filepath, header= TRUE, sep= ',', nrow= 0)
+  if(!validate_columns_exist(value_col, rcu_headers)) {
     stop(paste0('The column ', value_col, " was not detected in uncollapsed raw counts."))
   }
-  if(!validate_columns_exist(value_col, raw_counts)) {
-    stop(paste0('The column ', value_col, " was not detected in raw counts."))
+  if(!validate_columns_exist(value_col, unknown_reads)) {
+    stop(paste0('The column ', value_col, " was not detected in unknown_reads.csv"))
+  }
+  if(!validate_columns_exist(value_col, known_reads)) {
+    stop(paste0('The column ', value_col, " was not detected in known_reads.csv"))
   }
   if(!validate_columns_exist(value_col, filtered_counts)) {
-    stop(paste0('The column ', value_col, " was not detected in filtered counts."))
+    stop(paste0('The column ', value_col, " was not detected in filtered_counts.csv"))
   }
   
-  # Calculate purities
-  index_purity= sum(raw_counts[[value_col]]) / sum(raw_counts_uncollapsed[[value_col]])
+  # Determine total number of reads
+  chunk_sum= process_in_chunks(large_file_path= raw_counts_uncollapsed_filepath,
+                               chunk_size= 10^6,
+                               action= function(x) data.table::as.data.table(sum(x[[value_col]])))
+  total_num_reads= sum(unlist(chunk_sum))
+
+  # Calculate purities
+  index_purity= (sum(unknown_reads[[value_col]]) + sum(known_reads[[value_col]])) / total_num_reads
   print(paste0('Index purity: ', round(index_purity, 4)))
-  cell_line_purity= sum(filtered_counts[[value_col]]) / sum(raw_counts[[value_col]])
+  cell_line_purity= sum(filtered_counts[[value_col]]) / (sum(unknown_reads[[value_col]]) + sum(known_reads[[value_col]]))
   print(paste0('Cell line purity: ', round(cell_line_purity, 4)))
   
   qc_table= data.frame(index_purity= index_purity, cell_line_purity= cell_line_purity)
@@ -83,6 +98,30 @@ create_qc_table= function(raw_counts_uncollapsed, raw_counts, filtered_counts, v
   qc_table %>% write.csv(file_path, row.names= FALSE, quote= FALSE)
 }
 
+process_in_chunks= function(large_file_path, chunk_size= 10^6, action, ...) 
{ + + header_col_names= data.table::fread(large_file_path, header= TRUE, sep= ',', nrow= 0) %>% colnames() + chunk_idx= 1 # Counter to keep track of chunks in a loop + current_chunk_size= chunk_size # Variable for loop exit condition + chunk_collector= list() # List to collect processed chunks + + # For each chunk, call an action + while(current_chunk_size == chunk_size) { + current_chunk= data.table::fread(large_file_path, header= FALSE, sep= ',', + col.names= header_col_names, + nrow= chunk_size, skip= chunk_size * (chunk_idx - 1) + 1) + + current_chunk_size= nrow(current_chunk) # set current chunk size to stop loop + print(paste('Working on chunk', chunk_idx, 'with', current_chunk_size, 'rows.', sep= ' ')) + + chunk_collector[[chunk_idx]]= do.call(action, list(current_chunk, ...)) + chunk_idx= chunk_idx + 1 + } + + output_table= data.table::rbindlist(chunk_collector) + return(output_table) +} + #' Total counts barplot #' #' Creates the total counts barplot with bars colored by the barcode type, @@ -468,6 +507,7 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, # Required packages ---- require(tidyverse) require(magrittr) + require(data.table) require(reshape2) require(WGCNA) require(scales) @@ -495,30 +535,44 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, # Sequencing QCs ____________________ ---- ## 1. Purity metrics ---- print('1. Generating QC table ...') - create_qc_table(raw_counts_uncollapsed, raw_counts, filtered_counts, + create_qc_table(raw_counts_uncollapsed, + unknown_reads= unknown_reads, + known_reads= known_reads, + filtered_counts, value_col= 'n', file_path= paste0(out, '/QC_table.csv')) ## 2. Index count summaries ---- print("2. Generating index counts tables ...") - # Check that "IndexBarcode1" and "index_1" columns are present. - # If so, calculate index summary and write out. - if('index_1' %in% colnames(sample_meta) & 'index_1' %in% colnames(raw_counts_uncollapsed)) { + + # Pull out headers to perform checks + raw_counts_uncollapsed_headers= data.table::fread(raw_counts_file_path, header= TRUE, sep= ',', nrow= 0) + + # Check that "index_1" is present. If so, calculate index summary and write out. + if('index_1' %in% colnames(sample_meta) & 'index_1' %in% colnames(raw_counts_uncollapsed_headers)) { expected_index1= unique(sample_meta$index_1) - index1_counts= get_index_summary(raw_counts_uncollapsed, 'index_1', expected_index1) + # Aggregate by index_1 using chunks + index1_chunks= process_in_chunks(large_file_path= raw_counts_file_path, chunk_size= 10^6, + action= function(x) x[, list(n= sum(n)), by= index_1]) + index1_counts= get_index_summary(index1_chunks, 'index_1', expected_index1) index1_counts %>% write.csv(file= paste(out, 'index1_counts.csv', sep='/'), row.names=F) } else { print('Column "index_1" not detected. Skipping index 1 summaries ...', quote= FALSE) } # Do the same for index 2. 
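# [Editor's note] The index_2 handling below reuses the reverse-complement
# one-liner from collate_fastq_reads; a quick worked example ('ACGTT' is an
# arbitrary barcode, not a real index):
library(stringi)
chartr("ATGC", "TACG", stringi::stri_reverse('ACGTT'))
#> [1] "AACGT"   # reverse of ACGTT is TTGCA; complementing each base gives AACGT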
   # Do the same for index 2.
-  # Reverse index 2 barcodes if it is indicated and if "index_2" exisits
+  # Reverse index 2 barcodes if it is indicated and if "index_2" exists
   if(reverse_index2 & 'index_2' %in% colnames(sample_meta) ) {
     print("Reverse-complementing index 2 barcode.")
     sample_meta$index_2= chartr("ATGC", "TACG", stringi::stri_reverse(sample_meta$index_2))
   }
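The reverse complement above is built from stringi::stri_reverse plus chartr; a quick sanity check on a toy barcode:

chartr("ATGC", "TACG", stringi::stri_reverse("ACCGT"))
# stri_reverse gives "TGCCA"; chartr then swaps A<->T and G<->C, yielding "ACGGT",
# which is the reverse complement of "ACCGT".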
-  if('index_2' %in% colnames(sample_meta) & 'index_2' %in% colnames(raw_counts_uncollapsed)) {
+  if('index_2' %in% colnames(sample_meta) & 'index_2' %in% colnames(raw_counts_uncollapsed_headers)) {
     expected_index2= unique(sample_meta$index_2)
+
+    # Aggregate by index_2 using chunks
+    index2_chunks= process_in_chunks(large_file_path= raw_counts_file_path, chunk_size= 10^6,
+                                     action= function(x) x[, list(n= sum(n)), by= index_2])
+
     index2_counts= get_index_summary(index2_chunks, 'index_2', expected_index2)
     index2_counts %>% write.csv(file= paste(out, 'index2_counts.csv', sep='/'), row.names=F)
   } else {
@@ -594,59 +648,15 @@ QC_images= function(raw_counts_uncollapsed, raw_counts,
   ## 6. Contaminant reads ----
   print('6. Generating contaminant reads ...')
   potential_error= base::tryCatch({
-    pcr_locations= c('pcr_plate', 'pcr_well')
-
-    # Validation: Check that the PCR columns are present in raw_counts.
-    if(!validate_columns_exist(pcr_locations, raw_counts)) {
-      stop('pcr_plate and pcr_well are required in raw_counts.csv for this to work.')
-    }
-
-    # count number of wells a cell_set appears in.
-    pcr_plate_map= sample_meta %>% dplyr::distinct(pick(any_of(c(pcr_locations, 'cell_set')))) %>%
-      dplyr::group_by(pcr_plate) %>% dplyr::mutate(num_wells_in_plate= dplyr::n()) %>% dplyr::ungroup() %>%
-      dplyr::group_by(cell_set) %>% dplyr::mutate(num_wells_in_set= dplyr::n()) %>% dplyr::ungroup()
-
-    # index filter and identify reads as mapped or not
-    sequencing_filter= raw_counts %>%
-      dplyr::mutate(mapped= forward_read_cl_barcode %in% unique(annotated_counts$forward_read_cl_barcode))
-
-    # total counts per well - used to calculate fractions
-    counts_per_well= sequencing_filter %>% dplyr::group_by(pick(all_of(pcr_locations))) %>%
-      dplyr::summarise(well_total_n= sum(n)) %>% dplyr::ungroup()
-
-    # mapped contaminates to bind
-    mapped_contams= annotated_counts %>% dplyr::filter(!expected_read) %>%
-      dplyr::mutate(barcode_name= ifelse(is.na(CCLE_name), Name, CCLE_name)) %>%
-      dplyr::select(all_of(c(pcr_locations, 'forward_read_cl_barcode', 'n', 'barcode_name')))
+    # Simplified approach: summarize unknown reads and unexpected annotated reads by barcode.
+    summed_unknown_reads= unknown_reads[, list(num_reads = sum(n), num_wells= .N),
+                                        by= base::mget('forward_read_cl_barcode')]
+    summed_contams= annotated_counts[expected_read == FALSE, list(num_reads = sum(n), num_wells= .N),
+                                     by= base::mget(c('forward_read_cl_barcode', 'DepMap_ID', 'cb_name'))]
+    summed_contams[, barcode_name:= ifelse(is.na(DepMap_ID), cb_name, DepMap_ID)][,DepMap_ID:= NULL]
 
-    contam_reads= sequencing_filter %>% dplyr::filter(mapped == FALSE) %>% dplyr::select(-mapped) %>%
-      dplyr::bind_rows(mapped_contams) %>%
-      dplyr::left_join(counts_per_well, by= pcr_locations) %>%
-      dplyr::left_join(pcr_plate_map, by= pcr_locations) %>%
-      # filter out barcodes that only appear in one well
-      dplyr::group_by(forward_read_cl_barcode) %>% dplyr::filter(dplyr::n() >1) %>% dplyr::ungroup() %>%
-      # number of wells in a pcr plate a barcode is detected in
-      dplyr::group_by(forward_read_cl_barcode, pcr_plate) %>%
-      dplyr::mutate(num_wells_detected_plate= dplyr::n()) %>% dplyr::ungroup() %>%
-      # number of wells in a cell set a barcode is detected in
-      dplyr::group_by(forward_read_cl_barcode, cell_set) %>%
-      dplyr::mutate(num_wells_detected_set= dplyr::n()) %>% dplyr::ungroup() %>%
-      # determine if contamination is project, plate, or set
-      dplyr::group_by(forward_read_cl_barcode) %>%
-      dplyr::mutate(num_wells_detected= dplyr::n(),
-                    project_code= unique(sample_meta$project_code),
-                    fraction= n/well_total_n,
-                    type1= ifelse(sum(num_wells_detected== nrow(pcr_plate_map))>1, 'project_contam', NA),
-                    type2= ifelse(sum(num_wells_detected== num_wells_detected_plate &
-                                        num_wells_detected_plate == num_wells_in_plate)>1, 'plate_contam', NA),
-                    type3= ifelse(sum(num_wells_detected == num_wells_detected_set &
-                                        num_wells_detected_set== num_wells_in_set)>1, 'set_contam', NA)) %>%
-      dplyr::ungroup() %>%
-      tidyr::unite(scope, all_of(c('type1', 'type2', 'type3')), sep=',', remove = T, na.rm = T) %>%
-      dplyr::group_by(project_code, forward_read_cl_barcode, barcode_name, scope, num_wells_detected) %>%
-      dplyr::summarise(min_n= min(n), med_n= median(n), max_n= max(n),
-                       min_fraction= min(fraction), med_fraction= median(fraction), max_fraction=max(fraction)) %>%
-      dplyr::arrange(desc(max_fraction))
+    contam_reads= data.table::rbindlist(list(summed_contams, summed_unknown_reads), fill= TRUE) %>%
+      dplyr::arrange(dplyr::desc(num_reads))
 
     # write out
     contam_reads %>% write.csv(paste0(out, 'contam_reads.csv'), row.names=F)
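Since summed_contams carries a barcode_name column that summed_unknown_reads lacks, rbindlist(fill= TRUE) pads the missing column with NA. A minimal sketch with hypothetical values:

a= data.table::data.table(forward_read_cl_barcode= 'AACTG', num_reads= 12L, barcode_name= 'ACH-000001')
b= data.table::data.table(forward_read_cl_barcode= 'GGTTA', num_reads= 3L)
data.table::rbindlist(list(a, b), fill= TRUE)
# Row 2 gets barcode_name= NA because the column is absent from b.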
From b914add9d959fb27503cc2b9b4e26128afae9d99 Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Thu, 3 Oct 2024 17:29:40 -0400
Subject: [PATCH 075/127] Sum up reads and split on known or unknown reads

---
 scripts/src/collate_fastq_reads.R | 269 ++++++++++--------------------
 1 file changed, 87 insertions(+), 182 deletions(-)

diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R
index 8cbd25b0..1aba70a9 100644
--- a/scripts/src/collate_fastq_reads.R
+++ b/scripts/src/collate_fastq_reads.R
@@ -1,71 +1,15 @@
-#' validate_columns_exist
-#'
-#' This function checks that a list of columns are present in a dataframe.
-#' Columns that were not found in the dataframe are printed out.
-#'
-#' @param selected_columns A vector of strings each representing a column name
-#' @param df A dataframe to check against
-#' @return Boolean
-validate_columns_exist= function(selected_cols, df) {
-  # Check that all of selected_columns are in df - base::setdiff(A, B) = A[!A %in% B].
-  unmatched_cols= base::setdiff(selected_cols, colnames(df))
-
-  if(length(unmatched_cols) > 0) {
-    print('The following columns are missing: ')
-    print(unmatched_cols)
-    return(FALSE)
-  } else {
-    return(TRUE)
-  }
-}
-
-#' validate_columns_entries
-#'
-#' This function checks that for a list of columns, all entries are filled in.
-#' It checks all column entries against a list of potential empty values.
-#'
-#' @param selected_columns A vector of strings each representing a column name
-#' @param df A dataframe to check against
-#' @param empty_values Optional vector of values that equate to empty. Defaults to NA, "NA", "", and " ".
-#' @return Boolean
-validate_columns_entries= function(selected_columns, df, empty_values= c(NA, 'NA', '', ' ')) {
-  # Check for rows in selected_columns that equate to predefined empty values.
-  missing_rows= df %>% dplyr::filter(if_any(all_of(selected_columns), ~ . %in% empty_values))
-  if(nrow(missing_rows) > 0) {
-    print('The following rows in the sample meta are not filled out for the sequencing index columns.')
-    print(missing_rows) # show the empty rows
-    return(FALSE)
-  } else {
-    return(TRUE)
-  }
-}
-
-#' validate_unique_samples
-#'
-#' This function checks that a list of columns uniquely identifies all entries of a dataframe.
-#'
-#' @param selected_columns A vector of strings each representing a column name
-#' @param df A dataframe to check against
-#' @return Boolean
-validate_unique_samples= function(selected_columns, df) {
-  unique_column_values= df %>% dplyr::distinct(pick(all_of(selected_columns)))
-  if(nrow(unique_column_values) != nrow(df)) {
-    return(FALSE)
-  } else {
-    return(TRUE)
-  }
-}
-
 #' validate_detected_flowcells
 #'
-#' This function checks that all the expected flowcells are present in a table of detected flowcells.
-#' There can be more detected flowcells than there are expected flowcells.
+#' This function checks the table of expected flowcells against the table of detected flowcells.
+#' Expected flowcells that were not detected are printed along with a warning.
 #'
 #' @param detected_flowcells A dataframe with the columns "flowcell_name" and "flowcell_lane".
 #' @param expected_flowcells A dataframe with the columns "flowcell_name" and "flowcell_lane".
 validate_detected_flowcells= function(detected_flowcells, expected_flowcells) {
+  # Use dplyr::anti_join to filter out rows in expected_flowcells that appear in detected_flowcells.
   missing_flowcells= expected_flowcells %>%
     dplyr::anti_join(detected_flowcells, by= c('flowcell_name', 'flowcell_lane'))
 
+  # Print a warning if there are expected flowcells that were not detected!
   if(nrow(missing_flowcells) != 0) {
     print('WARNING: The following flowcells/lanes specified in the sample meta were not detected in the fastq reads.')
     print(missing_flowcells)
   }
 }
 
-#' process_in_chunks
-#'
-#' This function runs some action over chunks of a large file. At the end, all chunks are
-#' appended together.
-#'
-#' @param large_file_path description
-#' @param chunk_size description
-#' @param action A function passed to act on each chunk
-process_in_chunks= function(large_file_path, chunk_size= 10^6, action, ...) {
-
-  header_col_names= data.table::fread(large_file_path, header= TRUE, sep= ',', nrow= 0) %>% colnames()
-  chunk_idx= 1 # Counter to keep track of chunks in a loop
-  current_chunk_size= chunk_size # Variable for loop exit condition
-  chunk_collector= list() # List to collect processed chunks
-
-  # For each chunk, call an action
-  while(current_chunk_size == chunk_size) {
-    current_chunk= data.table::fread(large_file_path, header= FALSE, sep= ',',
-                                     col.names= header_col_names,
-                                     nrow= chunk_size, skip= chunk_size * (chunk_idx - 1) + 1)
-
-    current_chunk_size= nrow(current_chunk) # set current chunk size to stop loop
-    print(paste('Working on chunk', chunk_idx, 'with', current_chunk_size, 'rows.', sep= ' '))
-
-    chunk_collector[[chunk_idx]]= do.call(action, list(current_chunk, ...))
-    chunk_idx= chunk_idx + 1
-  }
-
-  output_table= data.table::rbindlist(chunk_collector)
-  return(output_table)
-}
-
 #' collate_fastq_reads
 #'
 #' This function takes in the fastq reads (uncollapsed_raw_counts) and
@@ -127,6 +39,8 @@ process_in_chunks= function(large_file_path, chunk_size= 10^6, action, ...) {
#' include any sequencing related columns. This parameter defaults to "pcr_plate", "pcr_well". This
#' parameter can also be a list of the sample conditions columns as long as they uniquely identify every
#' PCR well. For example "cell_set", "treatment", "dose", "day", "bio_rep", "tech_rep" can also be used.
+#' @param known_barcodes A vector of known PRISM barcodes. If a read does not match a barcode in this list,
+#' then its sequence is reassigned to "unknown_reads".
 #' @param reverse_index2 Index 2 should be reversed if the sequencer uses a reverse complement workflow.
 #' Defaults to FALSE.
 #' @param barcode_col String name of the column in uncollapsed_raw_counts that contains the sequences.
@@ -136,48 +50,61 @@ process_in_chunks= function(large_file_path, chunk_size= 10^6, action, ...) {
 collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta,
                               sequencing_index_cols= c('index_1', 'index_2'),
                               id_cols= c('pcr_plate', 'pcr_well'),
+                              known_barcodes,
                               reverse_index2= FALSE,
                               barcode_col= 'forward_read_cl_barcode') {
   require(tidyverse)
   require(data.table)
 
-  # Reverse index 2 if specified ----
-  if(reverse_index2) {
-    if('index_2' %in% colnames(sample_meta)) {
-      print("Reverse-complementing index 2 barcode ...")
-      sample_meta$index_2= chartr("ATGC", "TACG", stringi::stri_reverse(sample_meta$index_2))
-    } else {
-      stop('Reverse index 2 is set to TRUE, but index_2 does not exists.')
-    }
+  # Validation: Check that sequencing_index_cols exist in the sample_meta ----
+  # Error out if a sequencing_index_col is not in the sample_meta.
+  if(!validate_columns_exist(sequencing_index_cols, sample_meta)) {
+    stop('The above sequencing_index_cols are NOT present in the sample meta.')
   }
 
-  # Create sequence map ----
-  sequencing_map= sample_meta %>% dplyr::distinct(pick(all_of(c(sequencing_index_cols, id_cols))))
-
-  # Validation: Check that flowcell_names and flowcell_lanes exist in the sample meta ----
-  if(!validate_columns_exist(c('flowcell_names', 'flowcell_lanes'), sample_meta)) {
-    stop('The above column(s) are NOT present in the sample meta.')
+  # Validation: Check that sequencing_index_cols in the sample meta are filled out ----
+  # Check for rows in sequencing_index_cols that equate to empty - NA, "NA", "", " "
+  # Error out if the sequencing_index_cols are not filled out in the sample meta.
+  if(!validate_columns_entries(sequencing_index_cols, sample_meta)) {
+    stop('One or more sequencing_index_cols in the sample meta is not filled out.')
   }
 
-  # Validation: Check that sequencing_index_cols exist in the sample meta ----
-  if(!validate_columns_exist(sequencing_index_cols, sample_meta)) {
-    print('The following sequencing_index_cols are not present in the sample meta.')
-    stop('The above sequencing_index_cols are NOT present in the sample meta.')
+  # Validation: Check that sequencing_index_cols uniquely identify every row of the sample_meta ----
+  # Error out if the sequencing_index_cols do not uniquely identify every row of the sample_meta.
+  if(!validate_unique_samples(sequencing_index_cols, sample_meta)) {
+    print('There may be multiple entries in the sample meta that have the same combination of sequencing index columns.')
+    stop('The specified sequencing index columns do NOT uniquely identify every PCR well.')
   }
 
   # Validation: Check that id_cols exist in the sample meta ----
+  # Error out if an id_col is not detected in the sample_meta.
  if(!validate_columns_exist(id_cols, sample_meta)) {
     stop('One or more id_cols is NOT present in the sample meta.')
   }
 
-  # Validation: Check that sequencing_index_cols in the sample meta are filled out ----
-  # Check for rows in sequencing_index_cols that equate to empty - NA, "NA", "", " "
-  # Error out of the sequencing_index_cols are not filled out in the sample meta.
-  if(!validate_columns_entries(sequencing_index_cols, sample_meta)) {
-    stop('One or more sequencing_index_cols in the sample meta is not filled out.')
+  # Validation: Check that id_cols uniquely identify every row of the sample_meta ----
+  if(!validate_unique_samples(id_cols, sample_meta)) {
+    print('There may be multiple entries in the sample meta that have the same combination of ID columns.')
+    stop('The specified ID columns do NOT uniquely identify every PCR well.')
   }
 
+  # Reverse index 2 if specified ----
+  if(reverse_index2) {
+    if('index_2' %in% colnames(sample_meta)) {
+      print("Reverse-complementing index 2 barcode ...")
+      sample_meta$index_2= chartr("ATGC", "TACG", stringi::stri_reverse(sample_meta$index_2))
+    } else {
+      stop('Reverse index 2 is set to TRUE, but index_2 does not exist.')
+    }
+  }
+
+  # Create sequence map ----
+  # Sequencing map is used to map combinations of the sequencing_index_cols to combinations of the id_cols.
+  sequencing_map= sample_meta %>% dplyr::distinct(pick(all_of(c(sequencing_index_cols, id_cols))))
+
   # Validation: Check that mapping is one to one ----
+  # Make sure that the mapping from sequencing_index_cols to id_cols is 1 to 1.
+  # Code below groups on the sequencing_index_cols and filters for combinations that map to more than one id_col combination.
   check_mapping= sequencing_map %>% dplyr::group_by(pick(all_of(sequencing_index_cols))) %>%
     dplyr::filter(dplyr::n() > 1) %>% dplyr::ungroup()
   if(nrow(check_mapping) > 0) {
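To see what this one-to-one check catches, consider a hypothetical sequencing map in which one index pair points at two different wells:

sequencing_map= tibble::tribble(~index_1, ~index_2, ~pcr_plate, ~pcr_well,
                                'ACTG',   'GGAA',   1,          'A1',
                                'ACTG',   'GGAA',   1,          'A2')
# Grouping by the index columns and keeping groups with n() > 1 retains both rows,
# so nrow(check_mapping) > 0 and the function stops with the error below.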
@@ -186,76 +113,71 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta,
     stop('The sequencing index columns do not map 1 to 1 to the ID columns.')
   }
 
-  # If "flowcell_name" and "flowcell_lane" are present, filter for valid flowcells ----
-  # Note: Can this switch be tied to the sequencer type?
+  # Determine if 'flowcell_name' and 'flowcell_lane' are present in the uncollapsed raw counts file ----
+  # If the columns are present, assume that uncollapsed_raw_counts is from Nori and filter for only valid flowcells.
+  # If not, the file could be from a MiSeq run or something else outside of Nori, so skip this filter step.
   if(base::all(c('flowcell_name', 'flowcell_lane') %in% colnames(uncollapsed_raw_counts))) {
-    print('Detecting flowcells. Filtering for valid flowcells ...')
+    print('Detecting flowcell_name and flowcell_lane. Filtering for valid flowcells.')
+    # Validation: Check that flowcell_names and flowcell_lanes exist in the sample meta ----
+    if(!validate_columns_exist(c('flowcell_names', 'flowcell_lanes'), sample_meta)) {
+      stop('The above column(s) are NOT present in the sample meta.')
+    }
 
     # Determine which flowcell names + lanes are expected ----
     # "flowcell_names" and "flowcell_lanes" are strings that can contain more than one item.
-    # Columns can be parsed by splitting on the chars , ; :
-    # If there are multiple lane names and lane numbers, this uses the Cartesian product!
+    # Columns can be parsed by splitting on the characters , ; :
+    # If there are multiple lane names and lane numbers, this will take the Cartesian product!
     # Note: fread and read.csv keep commas, read_csv DROPS commas
     expected_flowcells= sample_meta %>% dplyr::distinct(flowcell_names, flowcell_lanes) %>%
       dplyr::mutate(flowcell_name= base::strsplit(flowcell_names, split='[,;:]', fixed=F),
                     flowcell_lane= base::strsplit(flowcell_lanes, split='[,;:]', fixed=F)) %>%
      tidyr::unnest(cols= flowcell_name) %>% tidyr::unnest(cols= flowcell_lane) %>%
      dplyr::mutate(flowcell_lane= as.numeric(flowcell_lane))
+    # Note: This code uses base::strsplit and tidyr::unnest from an older version of tidyverse.
+    # If there is any update to the tidyverse version, this can be refactored to use
+    # tidyr::separate_longer_delim
 
-    # Print out expected flowcells from the sample meta.
-    print(paste0('Identified ', nrow(expected_flowcells), ' unique flowcell + lane combos in the sample meta ...'))
-    print(expected_flowcells)
-
-    # Print warning if there are multiple flowcell names with multiple flowcell lanes.
-    multi_name_and_lanes= expected_flowcells %>% dplyr::filter(grepl(',:;', flowcell_names) & grepl(',:;', flowcell_names))
-    if(nrow(multi_name_and_lanes) > 0) {
-      print('WARNING: Detected sample(s) sequenced over multiple flowcells and flowcell lanes.')
-      print('The function assumes that the same lanes were used for both flowcells.')
-    }
-
-    # Validation: Check that all expected flowcell name + lanes are detected ----
-    # Check that all expected flowcell name + lanes are present in uncollapsed raw counts.
+    # Validation: Check if all expected flowcell name + lanes are detected ----
+    # Check that all expected flowcell name + lanes are present in uncollapsed_raw_counts.
+    # Print warning if a flowcell is expected but not detected.
     detected_flowcells= uncollapsed_raw_counts %>% dplyr::distinct(flowcell_name, flowcell_lane)
-    print(paste0('Identified ', nrow(detected_flowcells), ' unique flowcell + lane combos in the uncollapsed raw counts ...'))
+    print(paste0('Identified ', nrow(detected_flowcells), ' unique flowcell + lane combos in the uncollapsed raw counts.'))
    print(detected_flowcells)
    validate_detected_flowcells(detected_flowcells, expected_flowcells)
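A sketch of how one sample meta row expands under the split-and-unnest above (toy values): 'FCA;FCB' crossed with lanes '1,2' yields four flowcell/lane rows.

data.frame(flowcell_names= 'FCA;FCB', flowcell_lanes= '1,2') %>%
  dplyr::mutate(flowcell_name= base::strsplit(flowcell_names, split='[,;:]'),
                flowcell_lane= base::strsplit(flowcell_lanes, split='[,;:]')) %>%
  tidyr::unnest(cols= flowcell_name) %>% tidyr::unnest(cols= flowcell_lane)
# Four rows: FCA/1, FCA/2, FCB/1, FCB/2 - the Cartesian product the comments warn about.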
-    # Validation: Check that sequencing_index_cols uniquely identify rows of sample meta ----
-    if(!validate_unique_samples(sequencing_index_cols, sample_meta)) {
-      print('There may be multiple entries in the sample meta that have the same combination of sequencing index columns.')
-      stop('The specified sequencing index columns do NOT uniquely identify every PCR well.')
-    }
-
-    # Validation: Check that id_cols uniquely identify rows of sample meta ----
-    if(!validate_unique_samples(id_cols, sample_meta)) {
-      print('There may be multiple entries in the sample meta that have the same combination of ID columns.')
-      stop('The specified ID columns do NOT uniquely identify every PCR well.')
-    }
-
-    # Filter for expected flowcells ----
-    uncollapsed_raw_counts= data.table::merge.data.table(
-      uncollapsed_raw_counts, data.table::setDT(expected_flowcells),
-      by= c('flowcell_name', 'flowcell_lane'), allow.cartesian= FALSE)
+    # Filter for expected flowcells and add names/lanes columns ----
+    # Filter using inner join with merge from data.table instead of dplyr join to improve performance.
+    uncollapsed_raw_counts= data.table::merge.data.table(uncollapsed_raw_counts, data.table::setDT(expected_flowcells),
+                                                         by= c('flowcell_name', 'flowcell_lane'), allow.cartesian= FALSE)
   } else {
-    print('Flowcell_name and/or flowcell_lane were not detected in raw_counts_uncollapsed.')
-    print('Proceeding without filtering flowcells ...')
+    print('Flowcell_name and/or flowcell_lane are not detected in raw_counts_uncollapsed.')
+    print('Proceeding without filtering flowcells.')
+  }
+
+  # Validation: Check that sequencing_index_cols exist in uncollapsed_raw_counts ----
+  if(!validate_columns_exist(sequencing_index_cols, uncollapsed_raw_counts)) {
+    stop('Some sequencing_index_cols are NOT present in the uncollapsed_raw_counts.')
   }
 
   # Create summed_reads file ----
-  # Filter for the expected flowcells and summed up the reads over the ID cols.
-  print('Summing up reads ...')
+  print('Summing up reads.')
+  # Performing inner join with data.table instead of dplyr
   summed_reads= data.table::merge.data.table(uncollapsed_raw_counts, sequencing_map, by= sequencing_index_cols)
+  # Code below is checking if a barcode is in the list of known barcodes.
+  # If the barcode is not in the list of known barcodes, then the barcode is replaced with the string "unknown_reads".
+  # Function := performs the mutate inplace without copying the dataframe.
+  # Functions fifelse and %chin% are just faster data.table versions of ifelse and %in%.
+  summed_reads[, c(barcode_col) := data.table::fifelse(get(barcode_col) %chin% unique(known_barcodes),
                                                        get(barcode_col), 'unknown_reads')]
+  # Use data.table to group by id_cols and barcode_col and sum up reads across flowcells.
   summed_reads= summed_reads[, .(n= sum(n)), by= c(id_cols, barcode_col)]
 
-  # Escape for when a chunk contains invalid sequencing locations
-  if(nrow(summed_reads) == 0) {
-    print('WARNING: summed_reads is empty!')
-    return(summed_reads)
-  }
-
-  # Calculate index purity in a chunk----
+  # Calculate index purity ----
+  # This is only accurate if the Nori input file is small enough to fit into a chunk.
   index_purity= sum(summed_reads$n) / sum(uncollapsed_raw_counts$n)
+  # Throw an error if the purity is greater than 1.
+  # Throw a warning if the purity is below 0.5.
   print(paste0('Index purity in chunk: ', round(index_purity, 4)))
   if(index_purity > 1) {
     stop('ERROR: Chunk index purity is greater than 1!')
@@ -263,25 +185,8 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta,
     print('Warning: Low index purity!')
   } else {}
 
-  print('Collate_fastq_reads has completed!')
-  return(summed_reads)
-}
-
-#' extract_known_barcodes
-#'
-#' This function runs some action over chunks of a large file. At the end, all chunks are
-#' appended together.
-#'
-#' @param raw_counts description
-#' @param known_barcodes A vector known barcodes.
-#' @param barcode_col String name of the column in uncollapsed_raw_counts that contains the sequences.
-extract_known_barcodes= function(summed_reads, known_barcodes, barcode_col= 'forward_read_cl_barcode') { - # Create boolean column of known or unknown - summed_reads[, known := get(barcode_col) %chin% known_barcodes] - - # Filter using that boolean column - unknown_reads= summed_reads[known == FALSE,][order(-n)][, known := NULL] - summed_reads= summed_reads[known == TRUE,][, known := NULL] - - return(list(unknown_reads= unknown_reads, known_reads= summed_reads)) + # Return list of two dfs with known or unknown read counts ---- + print('Completing collate_fastq_reads.') + return(list(prism_barcode_counts= summed_reads[summed_reads[[barcode_col]] != 'unknown_reads',], + unknown_barcode_counts= summed_reads[summed_reads[[barcode_col]] == 'unknown_reads',])) } From 6d8f858a0bfe34659e820b6a5f51a92a98a6558b Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Thu, 3 Oct 2024 17:30:04 -0400 Subject: [PATCH 076/127] Updated names of output files --- scripts/collate_fastq_reads.R | 80 ++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 39 deletions(-) diff --git a/scripts/collate_fastq_reads.R b/scripts/collate_fastq_reads.R index ca00205f..850094a5 100644 --- a/scripts/collate_fastq_reads.R +++ b/scripts/collate_fastq_reads.R @@ -4,25 +4,26 @@ library(magrittr) library(tidyverse) library(data.table) source("./src/collate_fastq_reads.R") +source("./src/kitchen_utensils.R") # Argument parser ---- parser <- ArgumentParser() # specify desired options parser$add_argument("-v", "--verbose", action="store_true", default=TRUE, help="Print extra output [default]") parser$add_argument("-q", "--quietly", action="store_false", dest="verbose", help="Print little output") -parser$add_argument('--raw_counts_uncollapsed', default="raw_counts_uncollapsed.csv", - help="path to file containing uncollapsed raw counts file") -parser$add_argument("--sample_meta", default="sample_meta.csv", help = "Sample metadata") -parser$add_argument("--sequencing_index_cols", default= "index_1,index_2", - help = "Sequencing columns in the sample meta") +parser$add_argument('--raw_counts_uncollapsed', default= "raw_counts_uncollapsed.csv", + help= "path to file containing uncollapsed raw counts file") +parser$add_argument("--sample_meta", default="sample_meta.csv", help= "Sample metadata") +parser$add_argument("--cell_line_meta", default="cell_line_meta.csv", help= "Cell line metadata") +parser$add_argument("--CB_meta", default= "CB_meta.csv", help= "Control Barcode metadata") +parser$add_argument('--sequencing_index_cols', default= 'index_1,index_2', + help= 'List of sequencing columns in the sample meta.') parser$add_argument("--id_cols", default= "pcr_plate,pcr_well", help = "Columns that identify a unique PCR well") parser$add_argument("--reverse_index2", type="logical", default=FALSE, help= "Reverse complement of index 2 for NovaSeq and NextSeq") parser$add_argument("--barcode_col", default= "forward_read_cl_barcode", help= "Name of the column in uncollapsed_raw_counts that contains the barcode sequences.") -parser$add_argument("--cell_line_meta", default="cell_line_meta.csv", help= "Cell Line metadata") -parser$add_argument("--CB_meta", default="CB_meta.csv", help = "Control Barcode metadata") parser$add_argument("-o", "--out", default=getwd(), help = "Output path. 
Default is working directory") # get command line options, if help option encountered print help and exit @@ -33,17 +34,17 @@ if (args$out == "") { args$out = args$wkdir } -# Read in sample meta and parse argument strings ---- -# Read in files and parse vector arguments -sample_meta= data.table::fread(args$sample_meta, header= T, sep= ',') +# Read in metadata files as data.table objects ---- +sample_meta= data.table::fread(args$sample_meta, header= TRUE, sep= ',') +cell_line_meta= data.table::fread(args$cell_line_meta, header= TRUE, sep= ',') +CB_meta= data.table::fread(args$CB_meta, header= TRUE, sep= ',') -# Parse vector inputs +# Parse some parameters into vectors ---- sequencing_index_cols= unlist(strsplit(args$sequencing_index_cols, ",")) id_cols= unlist(strsplit(args$id_cols, ",")) # Validation: Check that sequencing_index_cols present in the sample meta ---- if(!validate_columns_exist(sequencing_index_cols, sample_meta)) { - print('The following sequencing_index_cols are not present in the sample meta.') stop('One or more sequencing_index_cols is NOT present in the sample meta.') } @@ -53,38 +54,39 @@ if(!validate_columns_exist(id_cols, sample_meta)) { } # Run collate_fastq_reads on chunks of raw_counts_uncollapsed.csv ---- -summed_reads= process_in_chunks(large_file_path= args$raw_counts_uncollapsed, - chunk_size= 10^6, - action= collate_fastq_reads, - sample_meta= sample_meta, - sequencing_index_cols= sequencing_index_cols, - id_cols= id_cols, - reverse_index2= args$reverse_index2, - barcode_col= args$barcode_col) - -# Sum up the read across the chunks afterwards! -summed_reads= summed_reads[, .(n= sum(n)), by= c(id_cols, args$barcode_col)] +# raw_counts_uncollapsed can be too large to read into memory, +# so collate_fastq_reads is performed on chunks of the large file. +chunked_results= process_in_chunks(large_file_path= args$raw_counts_uncollapsed, + chunk_size= 10^6, + action= collate_fastq_reads, + # Parameters for collate_fastq_reads + sample_meta= sample_meta, + sequencing_index_cols= sequencing_index_cols, + id_cols= id_cols, + known_barcodes= unique(cell_line_meta$Sequence, CB_meta$Sequence), + reverse_index2= args$reverse_index2, + barcode_col= args$barcode_col) -# Split reads by either known or unknown ---- -# Reads are separated by whether or not the barcode exists in the PRISM library -# Read in metadata to get list of all known barcodes -cell_line_meta= data.table::fread(args$cell_line_meta, header= TRUE, sep= ',') -CB_meta= data.table::fread(args$CB_meta, header= TRUE, sep= ',') +# From each chunk, extract prism_barcode_counts and bind the rows together into one dataframe. +prism_barcode_counts= data.table::rbindlist(lapply(chunked_results, function(x) x$prism_barcode_counts)) +# Use data.table to group_by id_cols and barcode_col to sum up reads across all chunks. +prism_barcode_counts= prism_barcode_counts[, .(n= sum(n)), by= c(id_cols, args$barcode_col)] -# Call function to separate barcodes -split_reads= extract_known_barcodes(summed_reads, unique(c(cell_line_meta$Sequence, CB_meta$Sequence)), - barcode_col= args$barcode) +# From each chunk, extract unknown_barcode_counts and bind the rows together into one dataframe. +unknown_barcode_counts= data.table::rbindlist(lapply(chunked_results, function(x) x$unknown_barcode_counts)) +# Use data.table to group_by id_cols and barcode_col to sum up reads across all chunks. 
-# Call function to separate barcodes
-split_reads= extract_known_barcodes(summed_reads, unique(c(cell_line_meta$Sequence, CB_meta$Sequence)),
-                                    barcode_col= args$barcode)
+# From each chunk, extract unknown_barcode_counts and bind the rows together into one dataframe.
+unknown_barcode_counts= data.table::rbindlist(lapply(chunked_results, function(x) x$unknown_barcode_counts))
+# Use data.table to group_by id_cols and barcode_col to sum up reads across all chunks.
+unknown_barcode_counts= unknown_barcode_counts[, .(n= sum(n)), by= c(id_cols, args$barcode_col)]

# Validation: Basic file size check ----
-if(nrow(split_reads$mapped_reads) == 0) {
-  stop('ERROR: Empty file generated. No rows in raw_counts output.')
+if(nrow(prism_barcode_counts) == 0) {
+  stop('ERROR: Empty file generated. No rows in prism_barcode_counts output.')
}

# Write out files ----
-out_file= paste(args$out, 'unknown_reads.csv', sep='/')
-print(paste("Writing unknown_reads.csv to ", out_file))
-write.csv(split_reads$unknown_reads, out_file, row.names= FALSE, quote= FALSE)
+out_file= paste(args$out, 'prism_barcode_counts.csv', sep='/')
+print(paste("Writing prism_barcode_counts.csv to ", out_file))
+write.csv(prism_barcode_counts, out_file, row.names= FALSE, quote= FALSE)

-out_file= paste(args$out, 'known_reads.csv', sep='/')
-print(paste("Writing known_reads.csv to ", out_file))
-write.csv(split_reads$known_reads, out_file, row.names= FALSE, quote= FALSE)
+out_file= paste(args$out, 'unknown_barcode_counts.csv', sep='/')
+print(paste("Writing unknown_barcode_counts.csv to ", out_file))
+write.csv(unknown_barcode_counts, out_file, row.names= FALSE, quote= FALSE)

From c65e5e86747558a1fb94ee988feec778a272c767 Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Fri, 4 Oct 2024 12:36:24 -0400
Subject: [PATCH 077/127] Added more comments

---
 scripts/src/filter_raw_reads.R | 121 ++++++---------------------------
 1 file changed, 22 insertions(+), 99 deletions(-)

diff --git a/scripts/src/filter_raw_reads.R b/scripts/src/filter_raw_reads.R
index 4484f7d7..1a3384f7 100755
--- a/scripts/src/filter_raw_reads.R
+++ b/scripts/src/filter_raw_reads.R
@@ -1,52 +1,5 @@
 options(cli.unicode = FALSE)
 
-#' validate_columns_exist
-#'
-#' This function checks that a list of columns are present in a dataframe.
-#' Columns that were not found in the dataframe are printed out.
-#'
-#' @param selected_columns A vector of strings each representing a column name
-#' @param df A dataframe to check against
-#' @return Boolean
-validate_columns_exist= function(selected_cols, df) {
-  # Check that all of selected_columns are in df - base::setdiff(A, B) = A[!A %in% B].
-  unmatched_cols= base::setdiff(selected_cols, colnames(df))
-
-  if(length(unmatched_cols) > 0) {
-    print('The following columns are missing: ')
-    print(unmatched_cols)
-    return(FALSE)
-  } else {
-    return(TRUE)
-  }
-}
-
-#' validate_unique_samples
-#'
-#' This function checks that a list of columns uniquely identifies all entries of a dataframe.
-#'
-#' @param selected_columns A vector of strings each representing a column name
-#' @param df A dataframe to check against
-#' @return Boolean
-validate_unique_samples= function(selected_columns, df) {
-  message= paste0('The following columns do not uniquely identify every row of the dataframe: ',
-                  paste(selected_columns, collapse=', '))
-  print(message)
-  unique_column_values= df %>% dplyr::distinct(pick(all_of(selected_columns)))
-  if(nrow(unique_column_values) != nrow(df)) {
-    print('The selected columns do not uniquely identify all rows.')
-
-    dups= df %>% dplyr::group_by(pick(all_of(selected_columns))) %>%
-      dplyr::filter(dplyr::n() > 1) %>% dplyr::ungroup() %>%
-      dplyr::arrange(pick(all_of(selected_columns)))
-    print(dups)
-
-    return(FALSE)
-  } else {
-    return(TRUE)
-  }
-}
-
 #' validate_cell_set_luas
 #'
 #' This function checks that every cell set in the sample meta does not contain duplicate members.
@@ -76,33 +29,28 @@ validate_cell_set_luas= function(sample_meta, cell_set_meta) { #' "forward_read_cl_barcode", and "n". #' @param sample_meta Dataframe of the metadata for the sequencing run. This file should contain the id_cols, #' "cell_set", "control_barcodes", etc. -#' @param cell_line_meta Master metadata of cell lines with the following required columns - "CCLE_name", -#' "DepMap_ID", "LUA", and "Sequence". #' @param cell_set_meta Master metadata of cell sets and their contents with the following required columns - #' "cell_set" and "members". +#' @param cell_line_meta Master metadata of cell lines with the following required columns - "CCLE_name", +#' "DepMap_ID", "LUA", and "Sequence". #' @param CB_meta Master metadata of control barcodes, their sequences, and their doses. The file should contain #' the columns - "Sequence", "Name", and "log_dose". #' @param id_cols Columns present in both raw_counts and sample_meta that uniquely identify each PCR well. #' This defaults to "pcr_plate", "pcr_well". -#' @param count_threshold Threshold to call low counts. This defaults to 40. #' @returns List with the following elements: #' #' \itemize{ -#' \item unmapped_reads: table of reads with valid index pairs but did not map to any known barcode. -#' The table contains the following columns - id_cols, "forward_read_cl_barcode", and "n". #' \item annotated_counts: table of reads and the associated well and well conditions. #' \item filtered_counts: table of all expected reads for the project, this is a subset of annotated counts. #' } -filter_raw_reads = function(raw_counts, - sample_meta, cell_line_meta, cell_set_meta, CB_meta, - id_cols= c('pcr_plate', 'pcr_well'), - count_threshold= 40) { +filter_raw_reads = function(prism_barcode_counts, + sample_meta, cell_set_meta, cell_line_meta, CB_meta, + id_cols= c('pcr_plate', 'pcr_well')) { require(magrittr) require(tidyverse) - # Processing metadata and inputs ---- - # CB meta is in log10 and should be converted to log2. + # CB meta is in log10 and should be converted to log2 ---- if('log_dose' %in% colnames(CB_meta)) { - print("Converting CB_meta from log10 to log2 ...") + print("Converting CB_meta from log10 to log2.") CB_meta= CB_meta %>% dplyr::mutate(log2_dose= log_dose/log10(2)) %>% dplyr::select(-log_dose) } @@ -119,13 +67,13 @@ filter_raw_reads = function(raw_counts, # Validation: Check that cell sets do not contain duplicate LUAs ---- # This will produce a warning if a LUA appears in a cell set more than once! - # This currently does NOT result in an error. Error avoided using a distinct when creating the template. + # This currently does NOT result in an error. Error avoided using a distinct when creating the template of expected reads. validate_cell_set_luas(sample_meta, cell_set_meta) # Creating a template of all expected reads in the run ---- - # Use all 4 meta data files to create a "template" dataframe where + # Use all 4 metadata files to create a "template" dataframe where # every row is a cell line that is expected in a PCR well. - print('Creating template of expected reads ...') + print('Creating template of expected reads.') # Join cell_set_meta and cell_line_meta. The cell_set can be a name "P939" or a list of LUAs. template= sample_meta %>% dplyr::left_join(cell_set_meta, by= 'cell_set') %>% dplyr::mutate(members= ifelse(is.na(members), str_split(cell_set, ';'), str_split(members, ';'))) %>% @@ -136,6 +84,8 @@ filter_raw_reads = function(raw_counts, # Check for control barcodes and add them to the template. 
  if(any(unique(sample_meta$control_barcodes) %in% c('Y', 'T', T))) {
+    # Filter for wells with control barcodes and perform a many-to-many join.
+    # This will expand each well entry to the number of control barcodes for that well.
     cb_template= sample_meta %>% dplyr::filter(control_barcodes %in% c('Y', 'T', T)) %>%
       dplyr::mutate(joiner= 'temp') %>%
       dplyr::inner_join(CB_meta %>% dplyr::mutate(joiner= 'temp'), by='joiner',
                         relationship= 'many-to-many') %>%
@@ -144,35 +94,11 @@ filter_raw_reads = function(raw_counts,
   }
 
   # Annotating reads ----
-  # From the set of reads that have the valid sequencing_index_cols combinations and map to the PRISM seq library,
-  # join in metadata to give each read a name and PCR location.
-  # Reads that to not match to the template are contaminants and,
-  # reads that are only present in the template are missing/not detected by PCR.
-  print("Annotating reads ...")
-  # Data.table version #
-  # # Left join cell_line_meta using data.table inplace left join
-  # raw_counts[cell_line_meta, base::setdiff(colnames(cell_line_meta), c('Sequence')) :=
-  #              base::mget(base::setdiff(colnames(cell_line_meta), c('Sequence'))),
-  #            on= c('forward_read_cl_barcode' = 'Sequence')]
-  # # Left join CB_meta using data.table inplace left join
-  # raw_counts[CB_meta, base::setdiff(colnames(CB_meta), c('Sequence')) :=
-  #              base::mget(base::setdiff(colnames(CB_meta), c('Sequence'))),
-  #            on= c('forward_read_cl_barcode' = 'Sequence')]
-  # # Left join CB_meta using data.table inplace left join
-  # raw_counts[sample_meta, base::setdiff(colnames(sample_meta), id_cols) :=
-  #              base::mget(base::setdiff(colnames(sample_meta), id_cols)),
-  #            on= id_cols]
-  # data.table::setnames(raw_counts, 'forward_read_cl_barcode', 'Sequence')
-  #
-  # annotated_counts= data.table::merge.data.table(
-  #   raw_counts, data.table::setDT(template %>% dplyr::mutate(expected_read= T)),
-  #   by= intersect(colnames(template), colnames(raw_counts)), all.x= TRUE, all.y= TRUE,
-  #   allow.cartesian= FALSE) %>%
-  #   dplyr::select(!any_of(c('prism_cell_set', 'members', 'mapped'))) %>%
-  #   dplyr::mutate(n= replace_na(n, 0), expected_read= replace_na(expected_read, F))
-
-  # Dplyr version #
-  annotated_counts= raw_counts %>%
+  # From prism_barcode_counts, left join metadata to annotate all reads.
+  # Perform a full join with the template of expected reads so that there is a row entry for
+  # cell lines not detected in sequencing.
+  print("Annotating reads.")
+  annotated_counts= prism_barcode_counts %>%
     dplyr::left_join(cell_line_meta, by= join_by('forward_read_cl_barcode'=='Sequence'),
                      relationship= 'many-to-one') %>%
     dplyr::left_join(CB_meta, by= join_by('forward_read_cl_barcode'=='Sequence'),
@@ -186,17 +112,14 @@ filter_raw_reads = function(raw_counts,
     dplyr::mutate(n= replace_na(n, 0), expected_read= replace_na(expected_read, F))
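To make the full-join behavior concrete, a toy sketch with hypothetical values: template rows with no matching read survive the join with n= NA, which replace_na then turns into an explicit zero count.

reads=    data.frame(DepMap_ID= 'ACH-000001', n= 25)
template= data.frame(DepMap_ID= c('ACH-000001', 'ACH-000002'), expected_read= TRUE)
dplyr::full_join(reads, template, by= 'DepMap_ID') %>%
  dplyr::mutate(n= tidyr::replace_na(n, 0))
# ACH-000002 was not detected in sequencing, so it is kept with n= 0 and expected_read= TRUE.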
print("Filtering reads ...") filtered_counts= annotated_counts %>% dplyr::filter(expected_read) %>% dplyr::select(!any_of(c('flowcell_names', 'flowcell_lanes', 'index_1', 'index_2', - 'forward_read_cl_barcode', 'LUA', 'expected_read'))) %>% - dplyr::mutate(flag= ifelse(n==0, 'Missing', NA), - flag= ifelse(n!=0 & n < count_threshold, 'low counts', flag)) + 'forward_read_cl_barcode', 'LUA', 'expected_read'))) # Calculate cell line purity ---- - cell_line_purity= sum(filtered_counts$n)/ sum(raw_counts$n) + cell_line_purity= sum(filtered_counts$n)/ sum(prism_barcode_counts$n) print(paste0('Cell line purity: ', round(cell_line_purity, 4))) if(cell_line_purity > 1) { stop('ERROR: Cell line purity is greater than 1!') @@ -205,9 +128,9 @@ filter_raw_reads = function(raw_counts, print('Warning: Low cell line purity!') } + # Return both annotated_counts and filtered_counts ---- print('Filter_raw_reads has completed!') - return(list(annotated_counts= annotated_counts, - filtered_counts= filtered_counts)) + return(list(annotated_counts= annotated_counts, filtered_counts= filtered_counts)) } # checks is a string can be numeric From f1d7c50c55a28d7dea873edc79c3efd6c266b974 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 12:36:42 -0400 Subject: [PATCH 078/127] Create kitchen_utensils.R File of functions used across modules --- scripts/src/kitchen_utensils.R | 92 ++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 scripts/src/kitchen_utensils.R diff --git a/scripts/src/kitchen_utensils.R b/scripts/src/kitchen_utensils.R new file mode 100644 index 00000000..cfe9c0f4 --- /dev/null +++ b/scripts/src/kitchen_utensils.R @@ -0,0 +1,92 @@ +# Kitchen Utensils - +# This file contains functions for the pipeline. +# The functions are sorted alphabetically + +#' process_in_chunks +#' +#' This function runs some action over chunks of a large file. At the end, returns a list of all the chunks +#' +#' @param large_file_path description +#' @param chunk_size description +#' @param action A function passed to act on each chunk +#' @param ... Additional parameters to be passed into the action parameter +process_in_chunks= function(large_file_path, chunk_size= 10^6, action, ...) { + header_col_names= data.table::fread(large_file_path, header= TRUE, sep= ',', nrow= 0) %>% colnames() + chunk_idx= 1 # Counter to keep track of chunks in a loop + current_chunk_size= chunk_size # Variable for loop exit condition + chunk_collector= list() # List to collect processed chunks + + # For each chunk, call an action + while(current_chunk_size == chunk_size) { + current_chunk= data.table::fread(large_file_path, header= FALSE, sep= ',', + col.names= header_col_names, + nrow= chunk_size, skip= chunk_size * (chunk_idx - 1) + 1) + + current_chunk_size= nrow(current_chunk) # set current chunk size to stop loop + print(paste('Working on chunk', chunk_idx, 'with', current_chunk_size, 'rows.', sep= ' ')) + + chunk_collector[[chunk_idx]]= do.call(action, list(current_chunk, ...)) + chunk_idx= chunk_idx + 1 + } + + # Return a list of all the chunks + return(chunk_collector) +} + +#' validate_columns_entries +#' +#' This function checks that for a list of columns, all entries are filled in. +#' It checks all column entries against a list of potential empty values. +#' +#' @param selected_columns A vector of strings each representing a column name +#' @param df A dataframe to check against +#' @param empty_values Optional vector of values that equate to empty. Defaults to NA, "NA", "", and " ". 
+
+#' validate_columns_entries
+#'
+#' This function checks that for a list of columns, all entries are filled in.
+#' It checks all column entries against a list of potential empty values.
+#'
+#' @param selected_columns A vector of strings each representing a column name
+#' @param df A dataframe to check against
+#' @param empty_values Optional vector of values that equate to empty. Defaults to NA, "NA", "", and " ".
+#' @return Boolean
+validate_columns_entries= function(selected_columns, df, empty_values= c(NA, 'NA', '', ' ')) {
+  # Check for rows in selected_columns that equate to predefined empty values.
+  missing_rows= df %>% dplyr::filter(if_any(all_of(selected_columns), ~ . %in% empty_values))
+  if(nrow(missing_rows) > 0) {
+    print('The following rows in the sample meta are not filled out for the sequencing index columns.')
+    print(missing_rows) # show the empty rows
+    return(FALSE)
+  } else {
+    return(TRUE)
+  }
+}
+
+#' validate_columns_exist
+#'
+#' This function checks that a list of columns are present in a dataframe.
+#' Columns that were not found in the dataframe are printed out.
+#'
+#' @param selected_columns A vector of strings each representing a column name
+#' @param df A dataframe to check against
+#' @return Boolean
+validate_columns_exist= function(selected_columns, df) {
+  # Check that all of selected_columns are in df - base::setdiff(A, B) = A[!A %in% B].
+  unmatched_cols= base::setdiff(selected_columns, colnames(df))
+
+  if(length(unmatched_cols) > 0) {
+    print('The following columns are missing: ')
+    print(unmatched_cols)
+    return(FALSE)
+  } else {
+    return(TRUE)
+  }
+}
+
+#' validate_unique_samples
+#'
+#' This function checks that a list of columns uniquely identifies every row of a dataframe.
+#'
+#' @param selected_columns A vector of strings each representing a column name
+#' @param df A dataframe to check against
+#' @return Boolean
+validate_unique_samples= function(selected_columns, df) {
+  unique_column_values= df %>% dplyr::distinct(pick(all_of(selected_columns)))
+  if(nrow(unique_column_values) != nrow(df)) {
+    return(FALSE)
+  } else {
+    return(TRUE)
+  }
+}
\ No newline at end of file

From a1606d4b67fb22b748ee812f51b5a379e193af80 Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Fri, 4 Oct 2024 12:37:16 -0400
Subject: [PATCH 079/127] Comments and style changes

---
 scripts/src/normalize.R | 37 ++++++++++---------------------------
 1 file changed, 10 insertions(+), 27 deletions(-)

diff --git a/scripts/src/normalize.R b/scripts/src/normalize.R
index 511a0cbc..1c41270d 100755
--- a/scripts/src/normalize.R
+++ b/scripts/src/normalize.R
@@ -1,19 +1,3 @@
-#' validate_columns_exist
-#'
-#' This function checks that a list of columns are present in a dataframe.
-#'
-#' @param selected_columns A vector of strings each representing a column name
-#' @param df A dataframe to check against
-#' @return Boolean
-validate_columns_exist= function(selected_columns, df) {
-  # Check that all of selected_columns are in df
-  if(any(!selected_columns %in% colnames(df))) {
-    return(FALSE)
-  } else {
-    return(TRUE)
-  }
-}
-
 #' normalize
 #'
 #' takes a filtered dataframe of raw read counts and normalizes
@@ -40,21 +24,20 @@ normalize <- function(X, id_cols, barcodes, pseudocount) {
   X %<>% dplyr::mutate(log2_n = log2(n + pseudocount))
 
   # Validation: Check that id_cols are present in the dataframe ----
-  if(validate_columns_exist(id_cols, X) == FALSE) {
-    print(id_cols)
+  if(!validate_columns_exist(id_cols, X)) {
     stop('One or more id_cols (printed above) is NOT present in the supplied dataframe.')
   }
 
   # Identify valid profiles and valid control barcodes to determine intercept ----
-  # dropping invalid trt_type, wells without control barcodes, cell line entries or other CBs, cbs with zero reads,
-  # and profiles with fewer than 4 CBs.
+  # Drop wells with invalid trt_type, wells without control barcodes, cell line entries or other CBs,
+  # cbs with zero reads, and profiles with 4 or fewer CBs.
valid_profiles= X %>% dplyr::filter(!trt_type %in% c("empty", "", "CB_only"), !is.na(trt_type), control_barcodes %in% c('Y', 'T', T), Name %in% barcodes, n!= 0) %>% dplyr::group_by(pick(all_of(id_cols))) %>% dplyr::filter(dplyr::n() > 4) %>% dplyr::ungroup() # Validation: Check which wells/profiles were dropped ---- - distinct_all_profiles = X %>% dplyr::distinct(pick(all_of(id_cols))) - distinct_valid_profiles = valid_profiles %>% dplyr::distinct(pick(all_of(id_cols))) + distinct_all_profiles= X %>% dplyr::distinct(pick(all_of(id_cols))) + distinct_valid_profiles= valid_profiles %>% dplyr::distinct(pick(all_of(id_cols))) if(nrow(distinct_all_profiles) != nrow(distinct_valid_profiles)) { # Print error if all profiles were dropped if(nrow(valid_profiles) == 0) { @@ -80,17 +63,17 @@ normalize <- function(X, id_cols, barcodes, pseudocount) { fit_stats= valid_profiles %>% dplyr::inner_join(fit_intercepts, by=id_cols) %>% dplyr::group_by(pick(all_of(id_cols))) %>% dplyr::mutate(log2_normalized_n= log2_n + cb_intercept, - norm_mae= median(abs(log2_dose- log2_normalized_n)), + norm_mae= median(abs(log2_dose - log2_normalized_n)), mean_y= mean(log2_dose), - residual2= (log2_dose- log2_normalized_n)^2, - squares2= (log2_dose- mean_y)^2, - norm_r2= 1- sum(residual2)/sum(squares2)) %>% dplyr::ungroup() %>% + residual2= (log2_dose - log2_normalized_n)^2, + squares2= (log2_dose - mean_y)^2, + norm_r2= 1 - sum(residual2) / sum(squares2)) %>% dplyr::ungroup() %>% dplyr::distinct(pick(all_of(c(id_cols, 'cb_intercept', 'norm_mae', 'norm_r2')))) # Normalize entries ---- normalized= X %>% dplyr::inner_join(fit_stats, by=id_cols) %>% dplyr::mutate(log2_normalized_n= log2_n + cb_intercept, - normalized_n = 2^log2_normalized_n) + normalized_n= 2^log2_normalized_n) return(normalized) } From 9d910a2bc9f84fdf3cb9a5eace09d6f6d49e7cfc Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 12:38:13 -0400 Subject: [PATCH 080/127] Reordered parameters --- scripts/make_config_file.groovy | 64 +++++++++++++++++++++------------ 1 file changed, 42 insertions(+), 22 deletions(-) diff --git a/scripts/make_config_file.groovy b/scripts/make_config_file.groovy index 6c1a60aa..513998b1 100644 --- a/scripts/make_config_file.groovy +++ b/scripts/make_config_file.groovy @@ -13,6 +13,8 @@ pipeline { booleanParam(name: 'CBNORMALIZE', defaultValue: true, description: 'Check this to trigger the CBnormalize job.') booleanParam(name: 'COMPUTE_LFC', defaultValue: true, description: 'Check this to trigger the compute_l2fc job.') booleanParam(name: 'COLLAPSE', defaultValue: true, description: 'Check this to trigger the collapse job.') + booleanParam(name: 'FILTER_COUNTS_QC', defaultValue: true, description: 'Check this to trigger the QC job.') + booleanParam(name: 'JOIN_METADATA', defaultValue: true, description: 'Check this to trigger the join_metadata job.') booleanParam(name: 'REMOVE_DATA', defaultValue: false, description: 'Select if there is experimental data that needs to be removed before normalization. TODO: expand on this.') booleanParam(name: 'RUN_NORM', defaultValue: true, description: 'Run normalization module on data.') booleanParam(name: 'PULL_POOL_ID', defaultValue: false, description: 'Flag indicating whether to pull pool IDs from CellDB - only applicable to cell sets (i.e. 
EXT.PR500.CS01.1.A, EXT.PR500.CS01.1.B, etc).') @@ -22,31 +24,40 @@ pipeline { string(name: 'BUILD_NAME', defaultValue: '', description: 'Build name') string(name: 'SCREEN', defaultValue: '', description: 'Screen name from COMET, necessary if using COMET for sample metadata.') string(name: 'SEQ_TYPE', defaultValue: 'DRAGEN', description: 'Choose DRAGEN, MiSeq, HiSeq, or NovaSeq. MiSeq and HiSeq/NovaSeq return files named differently. This setting sets the INDEX_1, INDEX_2, and BARCODE_SUFFIX parameters in fastq2readcount. Select DRAGEN if fastq files are from the DRAGEN pipeline from GP. Choosing NovaSeq reverses index 2.') - string(name: 'CTL_TYPES', defaultValue: 'negcon', description: 'Type to mark as control in compute_LFC') string(name: 'DAYS', defaultValue: '', description: 'If running the sushi_to_mts module, provide any days/timepoints (separated by commas) that should be dropped from output data. No quotes needed (ie, 2,8).') string(name: 'GIT_BRANCH', defaultValue: 'main', description: 'Pipeline branch to use') booleanParam(name: 'USE_LATEST', defaultValue: true, description: 'Check this to use the most up to date version from the specified branch. If not checked, will use the specified commit.') string(name: 'COMMIT_ID', defaultValue: '', description: 'Specific commit ID to use (leave empty if using the latest commit in the branch or if already specified in the config file.)') - string(name: 'CELL_SET_META', defaultValue: 'cell_set_meta.csv', description: 'Cell set metadata') - string(name: 'ID_COLS', defaultValue: 'cell_set,treatment,dose,dose_unit,day,bio_rep,tech_rep', description: 'Columns to concat to create unique ID for each sample-replicate') - string(name: 'CONTROL_COLS', defaultValue: 'cell_set,day', description: 'Set of columns that define individual controls') - string(name: 'SIG_COLS', defaultValue: 'cell_set,treatment,dose,dose_unit,day', description: 'Signature columns') - string(name: 'SEQUENCING_INDEX_COLS', defaultValue: 'index_1,index_2,flowcell_names', description: 'Sequencing index columns') - string(name: 'CONTROL_BARCODE_META', defaultValue: 'CB_meta.csv', description: 'Metadata for control barcodes.') - string(name: 'COUNT_COL_NAME', defaultValue: 'normalized_n', description: 'Field used to calculate L2FC') - string(name: 'CELL_SET_META', defaultValue: 'cell_set_meta.csv', description: 'Cell Set Metadata. Static cell_line_meta location: /data/vdb/prismSeq/cell_set_meta.csv') + + // Metadata files used by sushi string(name: 'SAMPLE_META', defaultValue: 'sample_meta.csv', description: 'File name of sample metadata within the BUILD_DIR directory.') - string(name: 'COUNT_THRESHOLD', defaultValue: '40', description: 'Minimum threshold to filter cell line counts by.') - string(name: 'PSEUDOCOUNT', defaultValue: '20', description: 'Pseudocount for normalization.') + string(name: 'CELL_SET_META', defaultValue: 'cell_set_meta.csv', description: 'Cell Set Metadata. 
Static cell_set_meta location: /data/vdb/prismSeq/cell_set_meta.csv')
+    string(name: 'CELL_LINE_META', defaultValue: 'cell_line_meta.csv', description: 'File in BUILD_DIR containing cell line metadata')
+    string(name: 'CONTROL_BARCODE_META', defaultValue: 'CB_meta.csv', description: 'Metadata for control barcodes.')
+    string(name: 'ASSAY_POOL_META', defaultValue: 'assay_pool_meta.txt', description: 'File in BUILD_DIR containing assay pool metadata')
+
+    // Files consumed and created by sushi
     string(name: 'RAW_COUNTS_UNCOLLAPSED', defaultValue: 'raw_counts_uncollapsed.csv', description: 'Filename in BUILD_DIR containing nori output')
-    string(name: 'RAW_COUNTS', defaultValue: 'raw_counts.csv', description: 'Filename in BUILD_DIR containing raw counts')
-    string(name: 'FILTERED_COUNTS', defaultValue: 'filtered_counts.csv', description: 'File in BUILD_DIR containing filtered counts')
-    string(name: 'LFC', defaultValue: 'l2fc.csv', description: 'File containing log2 fold change values')
+    string(name: 'PRISM_BARCODE_COUNTS', defaultValue: 'prism_barcode_counts.csv', description: 'Filename in BUILD_DIR containing PRISM barcode counts')
+    string(name: 'UNKNOWN_BARCODE_COUNTS', defaultValue: 'unknown_barcode_counts.csv', description: 'Filename in BUILD_DIR containing unknown barcode counts')
     string(name: 'ANNOTATED_COUNTS', defaultValue: 'annotated_counts.csv', description: 'File in BUILD_DIR containing annotated counts')
+    string(name: 'FILTERED_COUNTS', defaultValue: 'filtered_counts.csv', description: 'File in BUILD_DIR containing filtered counts')
     string(name: 'NORMALIZED_COUNTS', defaultValue: 'normalized_counts.csv', description: 'File in BUILD_DIR containing normalized counts')
-    string(name: 'COLLAPSED_VALUES', defaultValue: 'collapsed_l2fc.csv', description: 'File in BUILD_DIR containing replicate collapsed l2fc values')
-    string(name: 'ASSAY_POOL_META', defaultValue: 'assay_pool_meta.txt', description: 'File in BUILD_DIR containing assay pool metadata')
+    string(name: 'LFC', defaultValue: 'l2fc.csv', description: 'File containing log2 fold change values')
+    string(name: 'COLLAPSED_LFC', defaultValue: 'collapsed_l2fc.csv', description: 'File in BUILD_DIR containing replicate collapsed l2fc values')
+
+    // Column names parameters
+    string(name: 'SEQUENCING_INDEX_COLS', defaultValue: 'flowcell_names,index_1,index_2', description: 'Sequencing index columns used in COLLATE_FASTQ_READS')
+    string(name: 'ID_COLS', defaultValue: 'pcr_plate,pcr_well', description: 'Columns to concat to create unique ID for each sample-replicate')
+    string(name: 'CELL_LINE_COLS', defaultValue: 'DepMap_ID', description: 'Columns in intermediate files that describe a read or cell line')
+    string(name: 'SIG_COLS', defaultValue: 'cell_set,treatment,dose,dose_unit,day', description: 'Signature columns')
+    string(name: 'CONTROL_COLS', defaultValue: 'cell_set,day', description: 'Set of columns that define individual controls in COMPUTE_LFC')
+
+    // Additional parameters
+    string(name: 'PSEUDOCOUNT', defaultValue: '20', description: 'In CBNORMALIZE, the pseudocount value for log transformations.')
+    string(name: 'COUNT_COL_NAME', defaultValue: 'normalized_n', description: 'In COMPUTE_LFC, the name of the numeric column to use for calculations')
+    string(name: 'CTL_TYPES', defaultValue: 'negcon', description: 'In COMPUTE_LFC, the value in trt_type that indicates the negative controls')
+    string(name: 'COUNT_THRESHOLD', defaultValue: '40', description: 'In FILTER_COUNTS_QC, the threshold for calling reads with low counts')
string(name: 'API_URL', defaultValue: 'https://api.clue.io/api/', description: 'API URL') } @@ -100,12 +111,21 @@ pipeline { script { def paramList = [ 'SEQ_TYPE', 'API_URL', 'BUILD_DIR', 'INDEX_1', 'INDEX_2', 'BARCODE_SUFFIX', 'REVERSE_INDEX2', - 'SAMPLE_META', 'CONTROL_BARCODE_META', 'CTL_TYPES', 'ID_COLS', 'SIG_COLS', - 'RUN_NORM', 'CONTROL_COLS', 'COUNT_THRESHOLD', 'COUNT_COL_NAME', 'BUILD_NAME', 'CONVERT_SUSHI', - 'PULL_POOL_ID', 'RUN_EPS_QC', 'PSEUDOCOUNT', 'REMOVE_DATA', 'DAYS', 'SEQUENCING_INDEX_COLS', - 'RAW_COUNTS', 'CELL_SET_META', 'CELL_LINE_META', 'FILTERED_COUNTS', 'LFC', 'COUNTS', 'ANNOTATED_COUNTS', - 'COLLAPSED_VALUES', 'NORMALIZED_COUNTS', 'API_URL', 'FILTER_COUNTS_QC', 'ASSAY_POOL_META', 'SCREEN', - 'RAW_COUNTS_UNCOLLAPSED' + 'RUN_NORM', 'BUILD_NAME', 'CONVERT_SUSHI', 'PULL_POOL_ID', 'RUN_EPS_QC', 'REMOVE_DATA', 'DAYS', + 'COUNTS', 'SCREEN', + + // metadata files + 'SAMPLE_META', 'CELL_SET_META', 'CELL_LINE_META', 'CONTROL_BARCODE_META', 'ASSAY_POOL_META' + + // sushi files + 'RAW_COUNTS_UNCOLLAPSED', 'PRISM_BARCODE_COUNTS', 'UNKNOWN_BARCODE_COUNTS', + 'ANNOTATED_COUNTS', 'FILTERED_COUNTS', 'NORMALIZED_COUNTS', 'LFC', 'COLLAPSED_LFC' + + // column name parameters + 'SEQUENCING_INDEX_COLS', 'ID_COLS', 'CELL_LINE_COLS', 'SIG_COLS', 'CONTROL_COLS', + + // additional parameters + 'PSEUDOCOUNT', 'COUNT_COL_NAME', 'CTL_TYPES', 'COUNT_THRESHOLD', 'API_URL' ] def config = [:] From 7255c533cb8c43fc0d3004c51d05515cccc81ef8 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 12:56:08 -0400 Subject: [PATCH 081/127] Also source kitchen utensils --- scripts/CBnormalize.R | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/CBnormalize.R b/scripts/CBnormalize.R index c0f47c4f..3775d32e 100755 --- a/scripts/CBnormalize.R +++ b/scripts/CBnormalize.R @@ -2,6 +2,7 @@ options(cli.unicode = FALSE) library(argparse) library(magrittr) source("./src/normalize.R") +source("./src/kitchen_utensils.R") # Argument parser ---- parser <- ArgumentParser() From 32f91324ed1db626b236dc7dc0f4d08baab6b6b9 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 12:56:47 -0400 Subject: [PATCH 082/127] Update with new collate outputs --- scripts/filter_counts.R | 93 ++++++++++++++++------------------------- 1 file changed, 37 insertions(+), 56 deletions(-) diff --git a/scripts/filter_counts.R b/scripts/filter_counts.R index fb55a52f..7cd0326f 100755 --- a/scripts/filter_counts.R +++ b/scripts/filter_counts.R @@ -9,28 +9,25 @@ suppressPackageStartupMessages(library(tidyr)) #pivot_wider suppressPackageStartupMessages(library(sets)) suppressPackageStartupMessages(library(tidyverse)) # load last - after dplyr source("./src/filter_raw_reads.R") +source("./src/kitchen_utensils.R") # Argument parser ---- parser <- ArgumentParser() # specify desired options -parser$add_argument("-v", "--verbose", action="store_true", default=TRUE, - help="Print extra output [default]") -parser$add_argument("-q", "--quietly", action="store_false", - dest="verbose", help="Print little output") +parser$add_argument("-v", "--verbose", action="store_true", default=TRUE, help="Print extra output [default]") +parser$add_argument("-q", "--quietly", action="store_false", dest="verbose", help="Print little output") parser$add_argument("--wkdir", default=getwd(), help="Working directory") -parser$add_argument("-c", "--raw_counts", default="raw_counts.csv", help = "path to file containing raw counts") -parser$add_argument("-o", "--out", default="", help = "Output path. 
Default is working directory") -parser$add_argument("-s", "--sample_meta", default="sample_meta.csv", help= "Sample metadata") -parser$add_argument("--cell_line_meta", default="cell_line_meta.csv", help= "Cell Line metadata") -parser$add_argument("--cell_set_meta", default="cell_set_meta.csv", help= "Cell set metadata") +parser$add_argument('--prism_barcode_counts', default= 'prism_barcode_counts.csv', help= 'Path to prism_barcode_counts.csv') +parser$add_argument('--sample_meta', default= 'sample_meta.csv', help= 'Path to sample_meta.csv') +parser$add_argument('--cell_set_meta', default= 'cell_set_meta.csv', help= 'Path to cell_set_meta.csv') +parser$add_argument('--cell_line_meta', default= 'cell_line_meta.csv', help= 'Path to cell_line_meta.csv') parser$add_argument("--assay_pool_meta", default="assay_pool_meta.txt", help = "Assay pool metadata") -parser$add_argument("--id_cols", default= "pcr_plate,pcr_well", help = "Columns to identify each PCR well") -parser$add_argument("--CB_meta", default="CB_meta.csv", help = "Control Barcode metadata") -parser$add_argument("--count_threshold", default= 40, help = "Low counts threshold") +parser$add_argument('--CB_meta', default= 'CB_meta.csv', help= 'Path to CB_meta.csv') +parser$add_argument('--id_cols', default= 'pcr_plate,pcr_well', + help= 'List of sample_meta column names used to identify every PCR well') parser$add_argument("--rm_data", type="logical", help = "Remove bad experimental data") parser$add_argument("--pool_id", type="logical", help = "Pull pool IDs from CellDB.") -parser$add_argument("--control_type", default="negcon", - help = "negative control wells in trt_type column in sample metadata") +parser$add_argument("-o", "--out", default="", help = "Output path. Default is working directory") # get command line options, if help option encountered print help and exit args <- parser$parse_args() @@ -41,17 +38,17 @@ if (args$out == ""){ } #print_args(args) -# Read in files and set up parameters ---- -cell_set_meta= data.table::fread(args$cell_set_meta, header= T, sep= ',', data.table= F) -cell_line_meta= data.table::fread(args$cell_line_meta, header= T, sep= ',', data.table= F) -CB_meta= data.table::fread(args$CB_meta, header= T, sep= ',', data.table= F) -sample_meta= data.table::fread(args$sample_meta, header= T, sep= ',', data.table= F) -raw_counts= data.table::fread(args$raw_counts, header= T, sep= ',', data.table= F) +# Read in all input files ---- +prism_barcode_counts= data.table::fread(args$prism_barcode_counts, header= TRUE, sep= ',') +sample_meta= data.table::fread(args$sample_meta, header= TRUE, sep= ',') +cell_set_meta= data.table::fread(args$cell_set_meta, header= TRUE, sep= ',') +cell_line_meta= data.table::fread(args$cell_line_meta, header= TRUE, sep= ',') +CB_meta= data.table::fread(args$CB_meta, header= TRUE, sep= ',') -# Convert strings to vectors ---- +# Convert input strings into vectors ---- id_cols= unlist(strsplit(args$id_cols, ",")) -count_threshold = as.numeric(args$count_threshold) +# What is this check doing? 
-YL ---- # make sure LUA codes in cell line meta are unique cell_line_meta %<>% dplyr::group_by(LUA) %>% @@ -66,58 +63,38 @@ cell_line_meta %<>% dplyr::filter(!duplicated(cell_line_meta$LUA, fromLast = TRUE)) %>% dplyr::select(-LUA.duplicity) -# Remove flowcell_name and lane columns from sample_meta because -# there is a profile_id duplicate when there are more than 1 seq runs -#sample_meta %<>% select(-flowcell_name, -flowcell_lane) %>% - # distinct() # This needs to be removed for sequencing_index_cols to work! - YL - # Run filter_raw_reads ----- print('Calling filter_raw_reads ...') -filtered_counts= filter_raw_reads(raw_counts= raw_counts, sample_meta= sample_meta, - cell_line_meta= cell_line_meta, - cell_set_meta= cell_set_meta, - CB_meta= CB_meta, - id_cols= id_cols, - count_threshold= as.numeric(args$count_threshold)) - -# Pulling pool_id when db_flag and pool_id flags are passed +module_outputs= filter_raw_reads(prism_barcode_counts= prism_barcode_counts, + sample_meta= sample_meta, + cell_set_meta= cell_set_meta, + cell_line_meta= cell_line_meta, + CB_meta= CB_meta, + id_cols= id_cols) + +# Pulling pool_id when db_flag and pool_id flags are passed ---- if (args$pool_id) { assay_pool_meta = read.delim(args$assay_pool_meta) unique_cell_sets <- unique(sample_meta$cell_set[sample_meta$cell_set != ""]) assay_pool_meta <- assay_pool_meta[assay_pool_meta$davepool_id %in% unique_cell_sets,] %>% select(pool_id, ccle_name, davepool_id, depmap_id) - filtered_counts$filtered_counts = filtered_counts$filtered_counts %>% + module_outputs$filtered_counts = module_outputs$filtered_counts %>% merge(assay_pool_meta, by.x=c("CCLE_name", "cell_set", "DepMap_ID"), by.y=c("ccle_name", "davepool_id", "depmap_id"), all.x=T) - filtered_counts$annotated_counts = filtered_counts$annotated_counts %>% + module_outputs$annotated_counts = module_outputs$annotated_counts %>% merge(assay_pool_meta, by.x=c("CCLE_name", "cell_set", "DepMap_ID"), by.y=c("ccle_name", "davepool_id", "depmap_id"), all.x=T) } # Validation: Basic file size check ---- -if(sum(filtered_counts$filtered_counts$n) == 0) { +if(sum(module_outputs$filtered_counts$n) == 0) { stop('All entries in filtered counts are missing!') } -cl_entries= filtered_counts$filtered_counts %>% dplyr::filter(!is.na(CCLE_name)) -if(sum(cl_entries$n) == 0) { - stop('All cell line counts are zero!') -} - -# Write out module outputs ---- -unmapped_reads= filtered_counts$unmapped_reads -unmapped_out = paste(args$out, 'unmapped_reads.csv', sep='/') -print(paste("Writing unmapped reads to: ", unmapped_out)) -write.csv(unmapped_reads, unmapped_out, row.names=F) - -annotated_counts = filtered_counts$annotated_counts -annot_out_file = paste(args$out, 'annotated_counts.csv', sep='/') -print(paste("Writing annotated counts to: ", annot_out_file)) -write.csv(annotated_counts, annot_out_file, row.names=F) - -filtered_counts = filtered_counts$filtered_counts +# Remove data ---- +filtered_counts= module_outputs$filtered_counts print(paste("rm_data:", args$rm_data)) # Remove data if needed @@ -137,7 +114,11 @@ if(args$rm_data == TRUE){ paste("Number of rows removed: ", rows_removed) } +# Write out files ---- +annot_out_file= paste0(args$out, '/annotated_counts.csv') +print(paste('Writing annotated counts to: ', annot_out_file)) +module_outputs$annotated_counts %>% write.csv(annot_out_file, row.names= FALSE) + filtrc_out_file = paste(args$out, 'filtered_counts.csv', sep='/') print(paste("Writing filtered counts csv to: ", filtrc_out_file)) write.csv(filtered_counts, filtrc_out_file, 
row.names=F, quote=F) - From 3bc331f45fea62535e161e1a61e5c40de54625af Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 12:57:02 -0400 Subject: [PATCH 083/127] Update with new outputs --- scripts/filteredCounts_QC.R | 73 +++++++++++++++++-------------------- 1 file changed, 34 insertions(+), 39 deletions(-) diff --git a/scripts/filteredCounts_QC.R b/scripts/filteredCounts_QC.R index e98ca966..525452f3 100755 --- a/scripts/filteredCounts_QC.R +++ b/scripts/filteredCounts_QC.R @@ -10,34 +10,33 @@ suppressPackageStartupMessages(library(scales)) # for out of bound handling in p suppressPackageStartupMessages(library(ggpmisc)) # with ggplot to add linear fit labels suppressPackageStartupMessages(library(WGCNA)) # for faster correlations source("/workspace/scripts/src/QC_images.R") +source("./src/kitchen_utensils.R") # Argument parser ---- parser <- ArgumentParser() + # specify desired options parser$add_argument("-v", "--verbose", action="store_true", default=TRUE, help="Print extra output [default]") parser$add_argument("-q", "--quietly", action="store_false", dest="verbose", help="Print little output") parser$add_argument("--wkdir", default=getwd(), help="Working directory") -parser$add_argument("-s", "--sample_meta", default="sample_meta.csv", help= "Sample metadata") -parser$add_argument("-c", "--raw_counts_uncollapsed", default="raw_counts_uncollapsed.csv", - help="path to file containing uncollapsed raw counts file") -parser$add_argument("--raw_counts", default= "raw_counts.csv", help="path to raw counts file") -parser$add_argument("--annotated_counts", default= "annotated_counts.csv", - help= "path to file containing annotated counts") -parser$add_argument("--normalized_counts", default="normalized_counts.csv", - help="path to file containing normalized counts") -parser$add_argument("--lfc", default="l2fc.csv", help= "path to l2fc file") -parser$add_argument("--cell_line_cols", default= 'DepMap_ID,CCLE_name', - help= "Columns that identify cell lines or barcodes") -parser$add_argument("--id_cols", default= 'pcr_plate,pcr_well', - help= "Columns to identify each PCR well") -parser$add_argument("--sig_cols", default="cell_set,treatment,dose,dose_unit,day", - help= 'Columns used to identify the treatment conditions') -parser$add_argument("--control_type", default = "negcon", - help= "how negative control wells are distinguished in the trt_type column") -parser$add_argument("--count_threshold", default=40, help= "Low counts threshold") -parser$add_argument("--reverse_index2", type="logical", default=FALSE, - help= "Reverse complement of index 2 for NovaSeq and NextSeq") -parser$add_argument("-o","--out", default="", help = "Output path. 
Default is working directory") +parser$add_argument('--raw_counts_uncollapsed', default= "raw_counts_uncollapsed.csv", + help= 'Path to file containing uncollapsed raw counts') +parser$add_argument('--prism_barcode_counts', default= "prism_barcode_counts.csv", help= 'Path to prism_barcode_counts.csv') +parser$add_argument('--unknown_barcode_counts', default= "unknown_barcode_counts.csv", + help= 'Path to unknown_barcode_counts.csv') +parser$add_argument('--annotated_counts', default= 'annotated_counts.csv', help= 'Path to annotated_counts.csv') +parser$add_argument('--normalized_counts', default= 'normalized_counts.csv', help= 'Path to normalized_counts.csv') +parser$add_argument('--lfc', default= 'l2fc.csv', help= 'Path to l2fc.csv') +parser$add_argument('-s', '--sample_meta', default= 'sample_meta.csv', help= 'Path to sample_meta.csv') +parser$add_argument('--id_cols', default= 'pcr_plate,pcr_well', help= 'Sample meta columns used to identify every PCR well') +parser$add_argument('--cell_line_cols', default= 'DepMap_ID', help= 'Sushi columns used to identify a read') +parser$add_argument('--sig_cols', default= 'cell_set,treatment,dose,dose_unit,day', + help= 'Sample meta columns used to identify unique treatment conditions') +parser$add_argument('--control_type', default= 'negcon', help= 'Value used in trt_type column to denote negative controls') +parser$add_argument('--count_threshold', default= 40, help= 'Low counts theshold used in some plots') +parser$add_argument('--reverse_index2', type= "logical", default= FALSE, + help= 'Switch to reverse complement index_2 for some sequencers') +parser$add_argument('-o', '--out', default= '', help= 'Output path, defaults to working directory') # get command line options, if help option encountered print help and exit args <- parser$parse_args() @@ -47,40 +46,36 @@ if (args$out == ""){ args$out = args$wkdir } -# Read in files and pull out parameters ---- +# Read in input files ---- sample_meta= data.table::fread(args$sample_meta, header= TRUE, sep= ',') - -# Pipeline outputs -raw_counts_uncollapsed= data.table::fread(args$raw_counts_uncollapsed, header= TRUE, sep= ',') -raw_counts= data.table::fread(args$raw_counts, header= TRUE, sep= ',') +prism_barcode_counts= data.table::fread(args$prism_barcode_counts, header= TRUE, sep= ',') +unknown_barcode_counts= data.table::fread(args$unknown_barcode_counts, header= TRUE, sep= ',') annotated_counts= data.table::fread(args$annotated_counts, header= TRUE, sep= ',') if(file.exists(args$normalized_counts)) { - normalized_counts= data.table::fread(args$normalized_counts, header=TRUE, sep=',', data.table=FALSE) + normalized_counts= data.table::fread(args$normalized_counts, header=TRUE, sep=',') } else { normalized_counts= NA } l2fc= data.table::fread(args$lfc, header= TRUE, sep= ',') -# Parameters -cell_line_cols = unlist(strsplit(args$cell_line_cols, ",")) -id_cols= unlist(strsplit(args$id_cols, ",")) -sig_cols= unlist(strsplit(args$sig_cols, ",")) -control_type = args$control_type -count_threshold= as.numeric(args$count_threshold) -# +# Parse some input parameters ---- +id_cols= unlist(strsplit(args$id_cols, ',')) +cell_line_cols= unlist(strsplit(args$cell_line_cols, ',')) +sig_cols= unlist(strsplit(args$sig_cols, ',')) # Call QC images function ---- print("Calling QC images ...") -QC_images(raw_counts_uncollapsed= raw_counts_uncollapsed, - raw_counts= raw_counts, +QC_images(raw_counts_uncollapsed_filepath= raw_counts_uncollapsed, + prism_barcode_counts= prism_barcode_counts, + unknown_barcode_counts= 
unknown_barcode_counts, annotated_counts= annotated_counts, normalized_counts= normalized_counts, l2fc= l2fc, sample_meta= sample_meta, - cell_line_cols= c('DepMap_ID', 'CCLE_name'), id_cols= id_cols, + cell_line_cols= cell_line_cols, sig_cols= sig_cols, - control_type= control_type, - count_threshold= count_threshold, + control_type= args$control_type, + count_threshold= as.numeric(args$count_threshold), reverse_index2= args$reverse_index2, out= args$out) From eed79e18560c020884616238fe6d6c6ebb86c25b Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 12:57:12 -0400 Subject: [PATCH 084/127] Update with new upstream outputs --- scripts/src/QC_images.R | 345 +++++++++++++++++----------------------- 1 file changed, 150 insertions(+), 195 deletions(-) diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R index b0c0c1dd..a88bb019 100755 --- a/scripts/src/QC_images.R +++ b/scripts/src/QC_images.R @@ -1,54 +1,3 @@ -#' validate_columns_exist -#' -#' This function checks that a list of columns are present in a dataframe. -#' Columns that were not found in the dataframe are printed out. -#' -#' @param selected_columns A vector of strings each representing a column name -#' @param df A dataframe to check against -#' @returns Boolean -validate_columns_exist= function(selected_cols, df) { - # Check that all of selected_columns are in df - base::setdiff(A, B) = A[!A %in% B]. - unmatched_cols= base::setdiff(selected_cols, colnames(df)) - - if(length(unmatched_cols) > 0) { - print('The following columns are missing: ') - print(unmatched_cols) - return(FALSE) - } else { - return(TRUE) - } -} - -#' Calculate index summaries -#' -#' Generates some simple summaries for each unique index. -#' -#' @import tidyverse -#' @param df A dataframe which must contain the column "n" which represents the count of a read. -#' @param index_col The name of the column contain the index barcodes as a string. This column must be present in "df". -#' @param valid_indices. A vector of all the valid indices for "index_col". -#' @returns A dataframe with the follow columns: -#' - index_col: String, The column containing the index barcodes. -#' - idx_n: Numeric, Number of reads associated with a specific index barcode. -#' - fraction: Numeric, "idx_n" divided by the total number of reads in the run. -#' - expected: Boolean, True if the index barcode is in "valid_indices" otherwise False. -#' - contains_n: Boolean, True if the index barcode contains "N" in its sequence, otherwise False. -#' - lv_dist: Numeric, Edit distance from a valid index barcode. -#' - ham_dist: Numeric, Hamming distance from a valid index barcode. -get_index_summary= function(df, index_col, valid_indices) { - output_summary= df %>% dplyr::group_by(pick(all_of(index_col))) %>% - dplyr::summarise(idx_n= sum(n)) %>% dplyr::ungroup() %>% - dplyr::mutate(fraction= round(idx_n/sum(idx_n), 5), - expected= ifelse(.[[index_col]] %chin% valid_indices, T, F), - contains_n= ifelse(grepl('N', .[[index_col]]), T, F), - lv_dist= apply(stringdist::stringdistmatrix(.[[index_col]], valid_indices, method="lv"), - 1, min), - ham_dist= apply(stringdist::stringdistmatrix(.[[index_col]], valid_indices, method="hamming"), - 1, min)) %>% - dplyr::arrange(desc(fraction)) - return(output_summary) -} - #' Calculate purity metrics #' #' Create the qc table with index purity and cell line purity. @@ -59,67 +8,87 @@ get_index_summary= function(df, index_col, valid_indices) { #' @param value_col String name of the counts column present all three dataframes. 
#' @param file_path Location to write out the output. #' @returns Writes out a QC_table to the file_path. -create_qc_table= function(raw_counts_uncollapsed_filepath, unknown_reads, known_reads, filtered_counts, - value_col= 'n', file_path) { - # Validations: Check that the path works and that value_col exists in all tables +create_qc_table= function(raw_counts_uncollapsed_filepath, unknown_barcode_counts, + prism_barcode_counts, filtered_counts, + value_col= 'n', output_path) { + # Validation: Check that the file at the path exists if(!file.exists(raw_counts_uncollapsed_filepath)) { stop('Cannot find the raw counts uncollapsed file.') } + + # Pull out only the headers of the large file for validation rcu_headers= data.table::fread(raw_counts_uncollapsed_filepath, header= TRUE, sep= ',', nrow= 0) + + # Validation: Check that value_col exists in raw_counts_uncollapsed if(!validate_columns_exist(value_col, rcu_headers)) { - stop(paste0('The column ', value_col, " was not detected in uncollapsed raw counts.")) + stop(paste0('The column ', value_col, ' was not detected in uncollapsed raw counts.')) } - if(!validate_columns_exist(value_col, unknown_reads)) { - stop(paste0('The column ', value_col, " was not detected in unknown_reads.csv")) + + # Validation: Check that value_col exists in unknown_barcode_counts + if(!validate_columns_exist(value_col, unknown_barcode_counts)) { + stop(paste0('The column ', value_col, ' was not detected in unknown_barcode_counts.csv')) } - if(!validate_columns_exist(value_col, known_reads)) { - stop(paste0('The column ', value_col, " was not detected in known_reads.csv")) + + # Validation: Check that value_col exists in prism_barcode_counts + if(!validate_columns_exist(value_col, prism_barcode_counts)) { + stop(paste0('The column ', value_col, ' was not detected in prism_barcode_counts.csv')) } + + # Validation: Check that value_col exists in filtered_counts if(!validate_columns_exist(value_col, filtered_counts)) { - stop(paste0('The column ', value_col, " was not detected in filtered_counts.csv")) + stop(paste0('The column ', value_col, ' was not detected in filtered_counts.csv')) } - # Calculate purities - # Determine total number of reads + # Determine total number of reads in raw_counts_uncollapsed using chunking chunk_sum= process_in_chunks(large_file_path= raw_counts_uncollapsed_filepath, chunk_size= 10^6, action= function(x) data.table::as.data.table(sum(x[[value_col]]))) - total_num_reads= sum(unlist(test_sum)) + total_num_reads= sum(unlist(chunk_sum)) + + # Determine number of reads that mapped to valid PCR locations + # These reads have the correct index barcodes + total_valid_pcr_reads= sum(unknown_barcode_counts[[value_col]]) + sum(prism_barcode_counts[[value_col]]) # Calculate purities - index_purity= (sum(unknown_reads[[value_col]]) + sum(known_reads[[value_col]])) / total_num_reads + # Index purity is the fraction of reads that mapped to valid PCR locations out of the total number of reads. + index_purity= total_valid_pcr_reads / total_num_reads print(paste0('Index purity: ', round(index_purity, 4))) - cell_line_purity= sum(filtered_counts[[value_col]]) / (sum(unknown_reads[[value_col]]) + sum(known_reads[[value_col]])) + # Cell line purity is the fraction of reads that are identified as cell lines or control barcodes out of valid PCR reads.
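+  # Illustrative numbers (hypothetical, not from a real run): with 1,000,000 total
+  # reads, 900,000 reads at valid PCR locations, and 855,000 of those matching known
+  # cell line or control barcodes, index_purity would be 0.9 and cell_line_purity 0.95.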
+ cell_line_purity= sum(filtered_counts[[value_col]]) / total_valid_pcr_reads print(paste0('Cell line purity: ', round(cell_line_purity, 4))) - qc_table= data.frame(index_purity= index_purity, cell_line_purity= cell_line_purity) - # Write out table - print(paste0('Writing QC table out to ', file_path)) - qc_table %>% write.csv(file_path, row.names= FALSE, quote= FALSE) + # Write out QC table + qc_table= data.frame(index_purity= index_purity, cell_line_purity= cell_line_purity) + print(paste0('Writing QC table out to ', output_path)) + qc_table %>% write.csv(output_path, row.names= FALSE, quote= FALSE) } -process_in_chunks= function(large_file_path, chunk_size= 10^6, action, ...) { - - header_col_names= data.table::fread(large_file_path, header= TRUE, sep= ',', nrow= 0) %>% colnames() - chunk_idx= 1 # Counter to keep track of chunks in a loop - current_chunk_size= chunk_size # Variable for loop exit condition - chunk_collector= list() # List to collect processed chunks - - # For each chunk, call an action - while(current_chunk_size == chunk_size) { - current_chunk= data.table::fread(large_file_path, header= FALSE, sep= ',', - col.names= header_col_names, - nrow= chunk_size, skip= chunk_size * (chunk_idx - 1) + 1) - - current_chunk_size= nrow(current_chunk) # set current chunk size to stop loop - print(paste('Working on chunk', chunk_idx, 'with', current_chunk_size, 'rows.', sep= ' ')) - - chunk_collector[[chunk_idx]]= do.call(action, list(current_chunk, ...)) - chunk_idx= chunk_idx + 1 - } - - output_table= data.table::rbindlist(chunk_collector) - return(output_table) } +#' Calculate index summaries +#' +#' Generates some simple summaries for each unique index. +#' +#' @import tidyverse +#' @param df A dataframe which must contain the column "n" which represents the count of a read. +#' @param index_col The name of the column containing the index barcodes as a string. This column must be present in "df". +#' @param valid_indices A vector of all the valid indices for "index_col". +#' @returns A dataframe with the following columns: +#' - index_col: String, The column containing the index barcodes. +#' - idx_n: Numeric, Number of reads associated with a specific index barcode. +#' - fraction: Numeric, "idx_n" divided by the total number of reads in the run. +#' - expected: Boolean, True if the index barcode is in "valid_indices" otherwise False. +#' - contains_n: Boolean, True if the index barcode contains "N" in its sequence, otherwise False. +#' - lv_dist: Numeric, Edit distance from a valid index barcode. +#' - ham_dist: Numeric, Hamming distance from a valid index barcode.
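+#' @examples
+#' # Hypothetical sketch (toy values, not pipeline data):
+#' # toy_df= data.table::data.table(index_1= c('ACTGACTG', 'ACTGACTN'), n= c(950, 50))
+#' # get_index_summary(toy_df, 'index_1', valid_indices= c('ACTGACTG'))
+#' # The second barcode would be reported with expected= FALSE, contains_n= TRUE, and
+#' # lv_dist= ham_dist= 1, since it is one substitution away from a valid index.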
+get_index_summary= function(df, index_col, valid_indices) { + output_summary= df %>% dplyr::group_by(pick(all_of(index_col))) %>% + dplyr::summarise(idx_n= sum(n)) %>% dplyr::ungroup() %>% + dplyr::mutate(fraction= round(idx_n/sum(idx_n), 5), + expected= ifelse(.[[index_col]] %chin% valid_indices, T, F), + contains_n= ifelse(grepl('N', .[[index_col]]), T, F), + lv_dist= apply(stringdist::stringdistmatrix(.[[index_col]], valid_indices, method="lv"), 1, min), + ham_dist= apply(stringdist::stringdistmatrix(.[[index_col]], valid_indices, method="hamming"), 1, min)) %>% + dplyr::arrange(desc(fraction)) + return(output_summary) } #' Total counts barplot @@ -142,18 +111,18 @@ create_total_counts_barplot= function(filtered_counts, id_cols, facet_col= NA) { # Sum up reads total_counts= filtered_counts %>% dplyr::mutate(barcode_type= case_when(!is.na(CCLE_name) ~ 'cell line', - !is.na(Name) ~ 'ctrl barcode')) %>% + !is.na(cb_name) ~ 'ctrl barcode')) %>% tidyr::unite(all_of(id_cols), col= 'sample_id', sep= ':', remove= FALSE, na.rm= FALSE) %>% dplyr::group_by(pick(all_of(na.omit(c('sample_id', facet_col, 'barcode_type'))))) %>% dplyr::summarise(total_counts= sum(n)) %>% dplyr::ungroup() # Create total counts plot total_counts_plot= total_counts %>% - ggplot(aes(x=sample_id, y=total_counts, fill=barcode_type)) + - geom_col(alpha=0.75, position='identity') + - geom_hline(yintercept= 10^4, linetype=2) + + ggplot(aes(x= sample_id, y= total_counts, fill= barcode_type)) + + geom_col(alpha= 0.75, position= 'identity') + + geom_hline(yintercept= 10^4, linetype= 2) + {if(!is.na(facet_col)) facet_wrap(~.data[[facet_col]], scale= 'free_x')} + - labs(x= "Sample constructed using id_cols", y="Total counts", fill= 'Barcode\ntype', + labs(x= 'Sample constructed using id_cols', y= 'Total counts', fill= 'Barcode\ntype', title= 'Filtered counts - unstacked') + theme_bw() + theme(axis.text.x = element_text(angle=70, hjust=1)) @@ -181,7 +150,7 @@ create_recovery_barplot= function(filtered_counts, id_cols, facet_col= NA, value plot_type= 'percent', include_ctrl_bcs= FALSE) { # Validation: Check that id_cols, facet_col, or value_col exist in filtered counts. if(!validate_columns_exist(na.omit(c(id_cols, facet_col, value_col)), filtered_counts)) { - stop('Some input columns were not detected in filtered counts.') + stop('In create_recovery_barplot, some required input columns were not detected.') } # Filter out control barcodes if it is specified. @@ -215,9 +184,9 @@ create_recovery_barplot= function(filtered_counts, id_cols, facet_col= NA, value # Create recovery plot. recov_plot= recovery %>% ggplot(aes(x= sample_id, y= .data[[y_col]], fill= reorder(detect_type, dplyr::desc(detect_type)))) + - geom_col(alpha=0.75, position='stack') + + geom_col(position= 'stack', alpha= 0.75) + {if(!is.na(facet_col)) facet_wrap(~.data[[facet_col]], scale= 'free_x')} + - labs(x= "Sample constructed using id_cols", y= y_text, fill= '', title= 'Cell line recovery') + + labs(x= 'Sample constructed using id_cols', y= y_text, fill= '', title= 'Cell line recovery') + theme_bw() + theme(axis.text.x = element_text(angle=70, hjust=1)) return(recov_plot) @@ -245,7 +214,12 @@ create_cdf_plot= function(input_df, id_cols, counts_col= 'n', mark1= 0.5, mark2= contains_cbs= FALSE, order_aucs= FALSE) { # Validation: Check that id_cols and counts_col are in the input dataframe. 
if(!validate_columns_exist(c(id_cols, counts_col), input_df)) { - stop('Some input columns were not detected in the cdf input dataframe.') + stop('In create_cdf_plot, some required input columns were not detected.') + } + + # Validation: mark1 should be less than mark2. + if(mark1 > mark2 | mark1 < 0 | mark1 > 1) { + stop('Mark values must be between 0 and 1. mark1 should be less than mark2') } # Determine percentages, ranks and cumulative percentages @@ -254,12 +228,7 @@ create_cdf_plot= function(input_df, id_cols, counts_col= 'n', mark1= 0.5, mark2= dplyr::mutate(expected_num_cls= dplyr::n(), total_counts= sum(.data[[counts_col]]), pct_counts= .data[[counts_col]]/total_counts, cum_pct= cumsum(pct_counts), - rank= row_number(), rank_pct= rank/expected_num_cls) %>% dplyr::ungroup() - - # Validation: mark1 should be less than mark2. - if(mark1 > mark2 | mark1 < 0 | mark1 > 1) { - stop('Mark values must be between 0 and 1 Mark1 should be less than mark2') - } + rank= row_number(), rank_pct= rank / expected_num_cls) %>% dplyr::ungroup() # Find the number of cell lines needed to reach mark1 and mark2 mark1_values= calc_cummulative %>% dplyr::filter(cum_pct >= mark1) %>% @@ -286,7 +255,7 @@ create_cdf_plot= function(input_df, id_cols, counts_col= 'n', mark1= 0.5, mark2= # Create plot output_plot= data_for_plot %>% - ggplot(aes(x= rank_pct, y=cum_pct)) + + ggplot(aes(x= rank_pct, y= cum_pct)) + # Color control barcodes if specified {if(contains_cbs) geom_point(. %>% dplyr::filter(!is.na(Name)), mapping= aes(x= rank_pct, y=cum_pct, color=reorder(Name, log2_dose)), size= 2)} + @@ -304,7 +273,7 @@ create_cdf_plot= function(input_df, id_cols, counts_col= 'n', mark1= 0.5, mark2= geom_label(. %>% dplyr::filter(!is.na(auc)), mapping= aes(label= paste0('AUC ', round(auc, 3))), x= 1, y= 0.25, hjust= 'inward', vjust= 'inward', color= 'black') + facet_wrap(~facet_name) + - labs(x='% rank of unique reads', y='Cumulative percentage', color= 'CBs') + theme_bw() + labs(x= '% rank of unique reads', y= 'Cumulative percentage', color= 'CBs') + theme_bw() return(output_plot) } @@ -408,11 +377,11 @@ create_cor_heatmap= function(input_df, row_id_cols, col_id_cols, value_col, cor_heatmap= correlation_mx %>% reshape2::melt() %>% ggplot(aes(x= Var1, y= Var2, fill= value)) + geom_tile() + - labs(x= '', y= '', fill= '', title= paste0('Correlations using ', value_col)) + scale_fill_gradientn(breaks= c(0, 0.5, 1), colours= c('blue', 'white','red'), limits=c(0, 1), oob= scales::squish) + - theme(axis.text.x = element_text(angle=70, hjust=1)) + labs(x= '', y= '', fill= '', title= paste0('Correlations using ', value_col)) + + theme_bw() + theme(axis.text.x = element_text(angle=70, hjust=1)) return(cor_heatmap) } @@ -497,13 +466,16 @@ create_replicate_scatterplots= function(input_df, cell_line_cols, replicate_grou #' @param reverse_index2 Boolean set to TRUE if the sequencing involved the reverse complement workflow. #' @param out Path to the directory to save the QC images. #' @returns NA. QC images are written out to the specified folder. 
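#' @examples
#' # Hypothetical call (a sketch only; assumes the counts tables are already read in):
#' # QC_images(raw_counts_uncollapsed_filepath= 'raw_counts_uncollapsed.csv',
#' #           prism_barcode_counts= prism_barcode_counts,
#' #           unknown_barcode_counts= unknown_barcode_counts,
#' #           annotated_counts= annotated_counts, l2fc= l2fc, sample_meta= sample_meta,
#' #           sig_cols= c('cell_set', 'treatment', 'dose', 'dose_unit', 'day'), out= getwd())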
-QC_images= function(raw_counts_uncollapsed, raw_counts, +QC_images= function(raw_counts_uncollapsed_filepath, + prism_barcode_counts, unknown_barcode_counts, annotated_counts, normalized_counts= NA, l2fc, sample_meta, - cell_line_cols, - id_cols= c('pcr_plate', 'pcr_well'), sig_cols, + id_cols= c('pcr_plate', 'pcr_well'), + cell_line_cols= c('DepMap_ID'), + sig_cols, control_type= 'negcon', count_threshold= 40, reverse_index2= FALSE, out = NA) { + # Required packages ---- require(tidyverse) require(magrittr) @@ -515,46 +487,51 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, # Some preprocessing ---- # Set out directory if none is specified. - if(is.na(out)) {out = getwd()} + if(is.na(out)) {out= getwd()} - # Create empty vector to collect potential errors. + # Create empty vector to collect potential errors when running QCs skipped_qcs= c() # Count number of distinct profile to help scale some plots. num_profiles= annotated_counts %>% dplyr::distinct(pick(all_of(id_cols))) %>% nrow() - # Detect control barcodes + # Detect if there are wells with control barcodes. cb_check= sample_meta %>% dplyr::filter(control_barcodes %in% c("Y", "T", T), !(trt_type %in% c("empty", "", "CB_only")) & !is.na(trt_type)) contains_cbs= ifelse(nrow(cb_check)!= 0, TRUE, FALSE) - # Pull filtered counts from annotated counts + # Create filtered_counts df from annotated_counts filtered_counts= annotated_counts %>% dplyr::filter(expected_read) # Sequencing QCs ____________________ ---- ## 1. Purity metrics ---- print('1. Generating QC table ...') - create_qc_table(raw_counts_uncollapsed, - unknown_reads= unknown_reads, - known_reads= known_reads, - filtered_counts, + create_qc_table(raw_counts_uncollapsed_path= raw_counts_uncollapsed_path, + unknown_barcode_counts= unknown_barcode_counts, + prism_barcode_counts= prism_barcode_counts, + filtered_counts= filtered_counts, value_col= 'n', file_path= paste0(out, '/QC_table.csv')) ## 2. Index count summaries ---- - print("2. Generating index counts tables ...") + print('2. Generating index counts tables ...') # Pull out headers to perform checks - raw_counts_uncollapsed_headers= data.table::fread(raw_counts_file_path, header= TRUE, sep= ',', nrow= 0) + raw_counts_uncollapsed_headers= data.table::fread(raw_counts_uncollapsed_path, header= TRUE, sep= ',', nrow= 0) # Check that "index_1" is present. If so, calculate index summary and write out. 
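  # Note: the fread above uses nrow= 0, which returns only the column names, so this
  # check stays cheap even when the uncollapsed raw counts file cannot fit in memory.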
- if('index_1' %in% colnames(sample_meta) & 'index_1' %in% colnames(raw_counts_uncollapsed_headers)) { + if('index_1' %in% colnames(sample_meta) & 'index_1' %in% colnames(raw_counts_uncollapsed_headers)) { + # Aggregate over index_1 using chunks + # Action is set to a data.table summarize with summing + index1_chunks= process_in_chunks(large_file_path= raw_counts_uncollapsed_path, chunk_size= 10^6, + action= function(x) x[, list(n= sum(n)), by= index_1]) + + # Create vector of unique index_1 values expected_index1= unique(sample_meta$index_1) - # Aggregate by index_1 using chunks - index1_chunks= process_in_chunks(large_file_path= raw_counts_file_path, chunk_size= 10^6, - action= function(x) x[, list(n= sum(n)), by= index_1]) - index1_counts= get_index_summary(index1_chunks, 'index_1', expected_index1) - index1_counts %>% write.csv(file= paste(out, 'index1_counts.csv', sep='/'), row.names=F) + + # Call get_index_summary over index1_chunks as a full table, then write out table + index1_counts= get_index_summary(data.table::rbindlist(index1_chunks), 'index_1', expected_index1) + index1_counts %>% write.csv(file= paste(out, 'index1_counts.csv', sep= '/'), row.names= FALSE, quote= FALSE) } else { print('Column "index_1" not detected. Skipping index 1 summaries ...', quote= FALSE) } @@ -567,20 +544,23 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, } if('index_2' %in% colnames(sample_meta) & 'index_2' %in% colnames(raw_counts_uncollapsed_headers)) { - expected_index2= unique(sample_meta$index_2) - - # Aggregate by index_2 using chunks - index2_chunks= process_in_chunks(large_file_path= raw_counts_file_path, chunk_size= 10^6, + # Aggregate over index_2 using chunks + # Action is set to a data.table summarize with summing + index2_chunks= process_in_chunks(large_file_path= raw_counts_uncollapsed_path, chunk_size= 10^6, action= function(x) x[, list(n= sum(n)), by= index_2]) - index2_counts= get_index_summary(raw_counts_uncollapsed, 'index_2', expected_index2) - index2_counts %>% write.csv(file= paste(out, 'index2_counts.csv', sep='/'), row.names=F) + # Create vector of unique index_2 values + expected_index2= unique(sample_meta$index_2) + + # Call get_index_summary over index2_chunks as a full table, then write out table + index2_counts= get_index_summary(data.table::rbindlist(raw_counts_uncollapsed), 'index_2', expected_index2) + index2_counts %>% write.csv(file= paste(out, 'index2_counts.csv', sep= '/'), row.names= FALSE, quote= FALSE) } else { print('Column "index_2" not detected. Skipping index 2 summaries ...', quote= FALSE) } ## 3. Total counts ---- - print("3. Generating total_counts image ...") + print('3. Generating total_counts image ...') potential_error= base::tryCatch({ tc= create_total_counts_barplot(filtered_counts, id_cols, facet_col= 'pcr_plate') @@ -602,13 +582,13 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, # Assay QCs _________________________ ---- ## 4. Cell lines recovered ---- - print("4. Generating cell_lines_present image ...") + print('4.
Generating cell_lines_present image ...') potential_error= base::tryCatch({ cl_rec= create_recovery_barplot(filtered_counts, id_cols= id_cols, facet_col= 'pcr_plate', count_threshold= count_threshold, plot_type= 'percent') - pdf(file=paste(out, "cell_lines_present.pdf", sep="/"), - width=sqrt(num_profiles)*2, height=sqrt(num_profiles)) + pdf(file= paste(out, "cell_lines_present.pdf", sep="/"), + width= sqrt(num_profiles)*2, height= sqrt(num_profiles)) print(cl_rec) dev.off() rm(cl_rec) @@ -626,13 +606,13 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, ## 5. Cell line contaminants ---- print('5. Generating cell line contaminants ...') potential_error= base::tryCatch({ - contams= annotated_counts %>% dplyr::filter(expected_read == F) %>% + contams= annotated_counts %>% dplyr::filter(expected_read == FALSE) %>% dplyr::mutate(barcode_id= ifelse(is.na(CCLE_name), Name, CCLE_name)) %>% dplyr::group_by(forward_read_cl_barcode, barcode_id) %>% - dplyr::summarise(num_wells= n(), median_n=median(n), max_n= max(n)) %>% ungroup() %>% + dplyr::summarise(num_wells= n(), median_n= median(n), max_n= max(n)) %>% ungroup() %>% dplyr::arrange(desc(num_wells)) - contams %>% write.csv(file= paste(out, 'contam_cell_lines.csv', sep='/'), row.names=F) + contams %>% write.csv(file= paste(out, 'contam_cell_lines.csv', sep='/'), row.names= FALSE, quote= FALSE) rm(contams) }, error= function(e) { print(e) @@ -645,34 +625,8 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, skipped_qcs = c(skipped_qcs, potential_error) } - ## 6. Contaminant reads ---- - print('6. Generating contaminant reads ...') - potential_error= base::tryCatch({ - # watered down version - summed_unknown_reads= unknown_reads[, list(num_reads = sum(n), num_wells= .N), - by= base::mget('forward_read_cl_barcode')] - summed_contams= annotated_counts[expected_read == FALSE, list(num_reads = sum(n), num_wells= .N), - by= base::mget(c('forward_read_cl_barcode', 'DepMap_ID', 'cb_name'))] - summed_contams[, barcode_name:= ifelse(is.na(DepMap_ID), cb_name, DepMap_ID)][,DepMap_ID:= NULL] - - contam_reads= data.table::rbindlist(list(summed_contams, summed_unknown_reads), fill= TRUE) %>% - dplyr::arrange(dplyr::desc(num_reads)) - - # write out - contam_reads %>% write.csv(paste0(out, 'contam_reads.csv'), row.names=F) - }, error= function(e) { - print(e) - print('Encountered an error when creating the contams reads file. Skipping this output ...') - return('contam reads') - }) - - # Collect returned string if an error occurred - if(!is.null(potential_error)) { - skipped_qcs = c(skipped_qcs, potential_error) - } - - ## 7. Cumulative counts by lines in negcons ---- - print("7. Generating cumulative image ...") + ## 6. Cumulative counts by lines in negcons ---- + print('6. 
Generating cumulative image ...') potential_error= base::tryCatch({ cdf_plot= create_cdf_plot(filtered_counts %>% dplyr::filter(trt_type == control_type), id_cols= id_cols, @@ -681,8 +635,8 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, contains_cbs= contains_cbs, order_aucs= TRUE) + labs(title= 'Cumulative reads in negative controls.') - pdf(file=paste(out, "cdf_plot.pdf", sep="/"), - width=sqrt(num_profiles)*2, height=sqrt(num_profiles)) + pdf(file=paste(out, 'cdf_plot.pdf', sep= '/'), + width= sqrt(num_profiles) * 2, height= sqrt(num_profiles)) print(cdf_plot) dev.off() rm(cdf_plot) @@ -694,18 +648,18 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, # Collect returned string if an error occurred if(!is.null(potential_error)) { - skipped_qcs = c(skipped_qcs, potential_error) + skipped_qcs= c(skipped_qcs, potential_error) } - ## 8. Control barcode trends ---- + ## 7. Control barcode trends ---- if(contains_cbs & is.data.frame(normalized_counts)) { - print("8. Generating control_barcode_trend image") + print('7. Generating control_barcode_trend image') potential_error= base::tryCatch({ trend_sc= create_ctrlBC_scatterplots(normalized_counts %>% dplyr::filter(control_barcodes %in% c("Y", "T", T)), id_cols, value_col= 'log2_n') pdf(file=paste(out, "control_barcode_trend.pdf", sep="/"), - width=sqrt(num_profiles)*2, height=sqrt(num_profiles)*2) + width= sqrt(num_profiles) * 2, height= sqrt(num_profiles) * 2) print(trend_sc) dev.off() rm(cb_trend, trend_sc) @@ -717,25 +671,25 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, # Collect returned string if an error occurred if(!is.null(potential_error)) { - skipped_qcs = c(skipped_qcs, potential_error) + skipped_qcs= c(skipped_qcs, potential_error) } } else { - print('8. No control barcodes detected. Skipping control_barcode_trend image.') + print('7. No control barcodes detected. Skipping control_barcode_trend image.') } - ## 9. Sample correlation ----- - print("9. Generating sample_cor image ...") + ## 8. Sample correlation ----- + print('8. Generating sample_cor image ...') potential_error= base::tryCatch({ cor_df= filtered_counts %>% - dplyr::filter(!is.na(DepMap_ID), !is.na(trt_type), !trt_type %in% c("empty", "", "CB_only")) %>% + dplyr::filter(!is.na(DepMap_ID), !is.na(trt_type), !trt_type %in% c('empty', '', 'CB_only')) %>% dplyr::mutate(log2_n= log2(n + 1)) cp= create_cor_heatmap(input_df= cor_df, row_id_cols= c('DepMap_ID'), col_id_cols= c(sig_cols, id_cols), value_col= 'log2_n') - pdf(file=paste(out, "sample_cor.pdf", sep="/"), - width=sqrt(num_profiles)*2, height=sqrt(num_profiles)*2) + pdf(file= paste(out, 'sample_cor.pdf', sep= '/'), + width= sqrt(num_profiles) * 2, height= sqrt(num_profiles) * 2) print(cp) dev.off() rm(correlation_matrix, cp) @@ -747,16 +701,16 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, # Collect returned string if an error occurred if(!is.null(potential_error)) { - skipped_qcs = c(skipped_qcs, potential_error) + skipped_qcs= c(skipped_qcs, potential_error) } - ## 10. Tech rep correlations ---- + ## 9. Tech rep correlations ---- if(is.data.frame(normalized_counts) & 'tech_rep' %in% colnames(normalized_counts)) { # Check if there are more at least two tech reps unique_tech_reps= na.omit(unique(normalized_counts$tech_rep)) if(length(unique_tech_reps) >= 2) { - print("10. Generating tech rep correlations image ...") + print('9. 
Generating tech rep correlations image ...') # Set up replicate groups depending "bio_rep" column if('bio_rep' %in% colnames(normalized_counts) & !'bio_rep' %in% sig_cols) { replicate_group_cols= c(sig_cols, 'bio_rep') @@ -794,17 +748,17 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, # Collect returned string if an error occurred if(!is.null(potential_error)) { - skipped_qcs = c(skipped_qcs, potential_error) + skipped_qcs= c(skipped_qcs, potential_error) } } else { - print('10. No technical replicates detected. Skipping tech_reps scatter plot.') + print('9. No technical replicates detected. Skipping tech_reps scatter plot.') } } else { - print('10. No technical replicates detected. Skipping tech_reps scatter plot.') + print('9. No technical replicates detected. Skipping tech_reps scatter plot.') } - ## 11. Bio rep correlations ---- + ## 10. Bio rep correlations ---- if('bio_rep' %in% colnames(l2fc)) { unique_bio_reps= na.omit(unique(l2fc$bio_rep)) @@ -812,6 +766,7 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, l2fc_with_log2= l2fc %>% dplyr::mutate(log2_mean_normalized_n= log2(mean_normalized_n)) # Bio replicate scatter plots + # This is just another visualization that isn't being used. # bio_reps_plt= create_replicate_scatterplots(input_df= l2fc_with_log2s, # cell_line_cols= cell_line_cols, # replicate_group_cols= sig_cols, @@ -827,15 +782,15 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, # } # Bio replicate heatmap - print("11. Generating bio rep correlations heatmap ...") + print('10. Generating bio rep correlations heatmap ...') potential_error= base::tryCatch({ bio_corr_hm= create_cor_heatmap(input_df= l2fc_with_log2, row_id_cols= cell_line_cols, col_id_cols= c(sig_cols, 'bio_rep'), value_col= 'l2fc', cor_method= 'pearson') - pdf(file=paste(out, "bio_corr_hm.pdf", sep="/"), - width=sqrt(num_profiles), height=sqrt(num_profiles)) + pdf(file= paste(out, 'bio_corr_hm.pdf', sep= '/'), + width= sqrt(num_profiles), height= sqrt(num_profiles)) print(bio_corr_hm) dev.off() }, error= function(e) { @@ -846,11 +801,11 @@ QC_images= function(raw_counts_uncollapsed, raw_counts, # Collect returned string if an error occurred if(!is.null(potential_error)) { - skipped_qcs = c(skipped_qcs, potential_error) + skipped_qcs= c(skipped_qcs, potential_error) } } else { - print('11. No biological replicates detected. Skipping bio_rep heatmap.') + print('10. No biological replicates detected. 
Skipping bio_rep heatmap.') } } From a2cd665567a0fb854992f1df8d9c7f25de56e38d Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 13:51:51 -0400 Subject: [PATCH 085/127] Fixed a bug --- scripts/collate_fastq_reads.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/collate_fastq_reads.R b/scripts/collate_fastq_reads.R index 850094a5..89421e57 100644 --- a/scripts/collate_fastq_reads.R +++ b/scripts/collate_fastq_reads.R @@ -63,7 +63,7 @@ chunked_results= process_in_chunks(large_file_path= args$raw_counts_uncollapsed, sample_meta= sample_meta, sequencing_index_cols= sequencing_index_cols, id_cols= id_cols, - known_barcodes= unique(cell_line_meta$Sequence, CB_meta$Sequence), + known_barcodes= unique(c(cell_line_meta$Sequence, CB_meta$Sequence)), reverse_index2= args$reverse_index2, barcode_col= args$barcode_col) From ce39711d653249fed97e7b449dc12c4017a33348 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 13:52:00 -0400 Subject: [PATCH 086/127] Corrected naming --- scripts/src/QC_images.R | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R index a88bb019..5b16b2c7 100755 --- a/scripts/src/QC_images.R +++ b/scripts/src/QC_images.R @@ -8,16 +8,16 @@ #' @param value_col String name of the counts column present all three dataframes. #' @param file_path Location to write out the output. #' @returns Writes out a QC_table to the file_path. -create_qc_table= function(raw_counts_uncollapsed_filepath, unknown_barcode_counts, +create_qc_table= function(raw_counts_uncollapsed_path, unknown_barcode_counts, prism_barcode_counts, filtered_counts, value_col= 'n', output_path) { # Validation: Check that the file at the path exists - if(!file.exists(raw_counts_uncollapsed_filepath)) { + if(!file.exists(raw_counts_uncollapsed_path)) { stop('Cannot find the raw counts uncollapsed file.') } # Pull out only the headers of the large file for validation - rcu_headers= data.table::fread(raw_counts_uncollapsed_filepath, header= TRUE, sep= ',', nrow= 0) + rcu_headers= data.table::fread(raw_counts_uncollapsed_path, header= TRUE, sep= ',', nrow= 0) # Validation: Check that value_col exists in raw_counts_uncollapsed if(!validate_columns_exist(value_col, rcu_headers)) { @@ -40,7 +40,7 @@ create_qc_table= function(raw_counts_uncollapsed_filepath, unknown_barcode_count } # Determine total number of reads in raw_counts_uncollapsed using chunking - chunk_sum= process_in_chunks(large_file_path= raw_counts_uncollapsed_filepath, + chunk_sum= process_in_chunks(large_file_path= raw_counts_uncollapsed_path, chunk_size= 10^6, action= function(x) data.table::as.data.table(sum(x[[value_col]]))) total_num_reads= sum(unlist(chunk_sum)) @@ -111,7 +111,7 @@ create_total_counts_barplot= function(filtered_counts, id_cols, facet_col= NA) { # Sum up reads total_counts= filtered_counts %>% dplyr::mutate(barcode_type= case_when(!is.na(CCLE_name) ~ 'cell line', - !is.na(cb_name) ~ 'ctrl barcode')) %>% + !is.na(Name) ~ 'ctrl barcode')) %>% tidyr::unite(all_of(id_cols), col= 'sample_id', sep= ':', remove= FALSE, na.rm= FALSE) %>% dplyr::group_by(pick(all_of(na.omit(c('sample_id', facet_col, 'barcode_type'))))) %>% dplyr::summarise(total_counts= sum(n)) %>% dplyr::ungroup() @@ -391,7 +391,7 @@ create_cor_heatmap= function(input_df, row_id_cols, col_id_cols, value_col, #' From a long table, creates scatter plots to two replicates. 
#' #' @import tidyverse -#' @import ggmisc +#' @import ggpmisc #' @param input_df Dataframe. #' @param cell_line_cols List of column names used to identify each cell line or control barcode. #' @param replicate_group_cols List of column names that describe a group of similar conditions. @@ -466,7 +466,7 @@ create_replicate_scatterplots= function(input_df, cell_line_cols, replicate_grou #' @param reverse_index2 Boolean set to TRUE if the sequencing involved the reverse complement workflow. #' @param out Path to the directory to save the QC images. #' @returns NA. QC images are written out to the specified folder. -QC_images= function(raw_counts_uncollapsed_filepath, +QC_images= function(raw_counts_uncollapsed_path, prism_barcode_counts, unknown_barcode_counts, annotated_counts, normalized_counts= NA, l2fc, sample_meta, @@ -483,7 +483,7 @@ QC_images= function(raw_counts_uncollapsed_filepath, require(reshape2) require(WGCNA) require(scales) - require(ggmisc) + require(ggpmisc) # Some preprocessing ---- # Set out directory if none is specified. @@ -511,7 +511,7 @@ QC_images= function(raw_counts_uncollapsed_filepath, unknown_barcode_counts= unknown_barcode_counts, prism_barcode_counts= prism_barcode_counts, filtered_counts= filtered_counts, - value_col= 'n', file_path= paste0(out, '/QC_table.csv')) + value_col= 'n', output_path= paste0(out, '/QC_table.csv')) ## 2. Index count summaries ---- print('2. Generating index counts tables ...') @@ -553,7 +553,7 @@ QC_images= function(raw_counts_uncollapsed_filepath, expected_index2= unique(sample_meta$index_2) # Call get_index_summary over index2_chunks as a full table, then write out table - index2_counts= get_index_summary(data.table::rbindlist(raw_counts_uncollapsed), 'index_2', expected_index2) + index2_counts= get_index_summary(data.table::rbindlist(index2_chunks), 'index_2', expected_index2) index2_counts %>% write.csv(file= paste(out, 'index2_counts.csv', sep= '/'), row.names= FALSE, quote= FALSE) } else { print('Column "index_2" not detected. Skipping index 2 summaries ...', quote= FALSE) @@ -572,7 +572,7 @@ QC_images= function(raw_counts_uncollapsed_filepath, }, error= function(e) { print(e) print('Encountered an error when creating the total counts barplot. 
Skipping this output ...') - return('QC table') + return('Total counts image') }) # Collect returned string if an error occurred if(!is.null(potential_error)) { skipped_qcs = c(skipped_qcs, potential_error) } From d230b495aa578bb86dd08b3fef7b9b32a9bbbbab Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 13:52:13 -0400 Subject: [PATCH 087/127] Corrected name --- scripts/filteredCounts_QC.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/filteredCounts_QC.R b/scripts/filteredCounts_QC.R index 525452f3..1287745b 100755 --- a/scripts/filteredCounts_QC.R +++ b/scripts/filteredCounts_QC.R @@ -65,7 +65,7 @@ sig_cols= unlist(strsplit(args$sig_cols, ',')) # Call QC images function ---- print("Calling QC images ...") -QC_images(raw_counts_uncollapsed_filepath= raw_counts_uncollapsed, +QC_images(raw_counts_uncollapsed_path= args$raw_counts_uncollapsed, prism_barcode_counts= prism_barcode_counts, unknown_barcode_counts= unknown_barcode_counts, annotated_counts= annotated_counts, From d3b413aaff8f5600eebd26a7003e73092dc0c834 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 13:52:26 -0400 Subject: [PATCH 088/127] Changed flag name --- scripts/join_metadata.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/join_metadata.R b/scripts/join_metadata.R index 877509f1..226858ba 100644 --- a/scripts/join_metadata.R +++ b/scripts/join_metadata.R @@ -8,7 +8,7 @@ parser <- ArgumentParser() parser$add_argument('--sample_meta', default= 'sample_meta.csv', help= 'Sample meta data for the sequencing run.') parser$add_argument("--assay_pool_meta", default="assay_pool_meta.txt", help = "Assay pool metadata") parser$add_argument('--lfc', default= 'l2fc.csv', help= 'L2FC data.') # level 4 -parser$add_argument('--collapsed_l2fc', default= 'collapsed_l2fc.csv', help= 'Collapsed l2fc data.') # level 5 +parser$add_argument('--collapsed_lfc', default= 'collapsed_l2fc.csv', help= 'Collapsed l2fc data.') # level 5 parser$add_argument('--sig_cols', default= 'cell_set,treatment,dose,dose_unit,day', help= 'Columns that uniquely identify a condition.') parser$add_argument('--out', default= getwd(), help= 'Path to the output directory.') From de5c54e1386e1fb92ff543b9bb6edc51ef05b7a6 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 14:15:31 -0400 Subject: [PATCH 089/127] Update and reorder parameters --- scripts/collapse_replicates.sh | 6 ++-- scripts/collate_fastq_reads.sh | 7 +++-- scripts/compute_l2fc.sh | 6 ++-- scripts/filter_counts.sh | 11 ++++--- scripts/filteredCounts_QC.sh | 31 +++++++++----------- scripts/join_metadata.sh | 12 ++++---- scripts/launch_job.sh | 51 +++++++++++++++++++-------------- scripts/make_config_file.groovy | 3 +- 8 files changed, 68 insertions(+), 59 deletions(-) diff --git a/scripts/collapse_replicates.sh b/scripts/collapse_replicates.sh index 0329b863..b2db2a75 100644 --- a/scripts/collapse_replicates.sh +++ b/scripts/collapse_replicates.sh @@ -29,9 +29,11 @@ echo LFC is: $LFC echo Rscript collapse_replicates.R -c $LFC \ --out $BUILD_DIR \ ---sig_cols $SIG_COLS +--sig_cols $SIG_COLS \ +--cell_line_cols $CELL_LINE_COLS Rscript collapse_replicates.R -c $LFC \ --out $BUILD_DIR \ ---sig_cols $SIG_COLS +--sig_cols $SIG_COLS \ +--cell_line_cols $CELL_LINE_COLS diff --git a/scripts/collate_fastq_reads.sh b/scripts/collate_fastq_reads.sh index 16735e99..6acea4ea 100644 --- a/scripts/collate_fastq_reads.sh +++ b/scripts/collate_fastq_reads.sh @@ -108,12 +108,13 @@ echo CELL_LINE_META is: $CELL_LINE_META args=( --raw_counts_uncollapsed "$RAW_COUNTS_UNCOLLAPSED" --sample_meta "$SAMPLE_META"
---out "$BUILD_DIR"
+--cell_line_meta "$CELL_LINE_META"
+--CB_meta "$CONTROL_BARCODE_META"
 --sequencing_index_cols="$SEQUENCING_INDEX_COLS"
 --id_cols "$ID_COLS"
 --reverse_index2 "$REVERSE_INDEX2"
---cell_line_meta "$CELL_LINE_META"
---CB_meta "$CONTROL_BARCODE_META"
+--barcode_col "$BARCODE_COL"
+--out "$BUILD_DIR"
 )
 
 echo Rscript collate_fastq_reads.R "${args[@]}"
diff --git a/scripts/compute_l2fc.sh b/scripts/compute_l2fc.sh
index 270a9651..6bbf9f27 100644
--- a/scripts/compute_l2fc.sh
+++ b/scripts/compute_l2fc.sh
@@ -32,7 +32,8 @@ echo Rscript compute_l2fc.R -c $NORMALIZED_COUNTS \
 --sig_cols $SIG_COLS \
 --ctrl_cols $CONTROL_COLS \
 --count_threshold $COUNT_THRESHOLD \
---normalized_counts $NORMALIZED_COUNTS
+--normalized_counts $NORMALIZED_COUNTS \
+--cell_line_cols $CELL_LINE_COLS
 
 Rscript compute_l2fc.R -c $NORMALIZED_COUNTS \
 --out $BUILD_DIR \
@@ -41,4 +42,5 @@ Rscript compute_l2fc.R -c $NORMALIZED_COUNTS \
 --sig_cols $SIG_COLS \
 --ctrl_cols $CONTROL_COLS \
 --count_threshold $COUNT_THRESHOLD \
---normalized_counts $NORMALIZED_COUNTS
+--normalized_counts $NORMALIZED_COUNTS \
+--cell_line_cols $CELL_LINE_COLS
diff --git a/scripts/filter_counts.sh b/scripts/filter_counts.sh
index 0130677f..76f72f55 100644
--- a/scripts/filter_counts.sh
+++ b/scripts/filter_counts.sh
@@ -29,11 +29,11 @@ else
 fi
 
 #Enforces abs paths
-if [[ "$RAW_COUNTS" = /* ]]
+if [[ "$PRISM_BARCODE_COUNTS" = /* ]]
 then
-    RAW_COUNTS=$(ls $RAW_COUNTS)
+    PRISM_BARCODE_COUNTS=$(ls $PRISM_BARCODE_COUNTS)
 else
-    RAW_COUNTS=$BUILD_DIR/$RAW_COUNTS
+    PRISM_BARCODE_COUNTS=$BUILD_DIR/$PRISM_BARCODE_COUNTS
 fi
 
 echo $CELL_LINE_META
@@ -74,21 +74,20 @@ fi
 
 echo Build dir is: $BUILD_DIR
 echo SAMPLE_META is: $SAMPLE_META
-echo RAW_COUNTS is: $RAW_COUNTS
+echo PRISM_BARCODE_COUNTS is: $PRISM_BARCODE_COUNTS
 echo CELL_LINE_META is: $CELL_LINE_META
 echo CONTROL_BARCODE_META is: $CONTROL_BARCODE_META
 echo CELL_SET_META is: $CELL_SET_META
 echo ID_COLS is: $ID_COLS
 
 args=(
--c "$RAW_COUNTS"
+--prism_barcode_counts "$PRISM_BARCODE_COUNTS"
 --sample_meta "$SAMPLE_META"
 --cell_line_meta "$CELL_LINE_META"
 --CB_meta "$CONTROL_BARCODE_META"
 --cell_set_meta "$CELL_SET_META"
 --id_cols "$ID_COLS"
 --out "$BUILD_DIR"
---count_threshold "$COUNT_THRESHOLD"
 --pool_id "$PULL_POOL_ID"
 --rm_data "$REMOVE_DATA"
 --assay_pool_meta "$ASSAY_POOL_META"
diff --git a/scripts/filteredCounts_QC.sh b/scripts/filteredCounts_QC.sh
index c2653172..bc826a64 100644
--- a/scripts/filteredCounts_QC.sh
+++ b/scripts/filteredCounts_QC.sh
@@ -65,11 +65,19 @@ else
 fi
 
 #Enforces abs paths
-if [[ "$RAW_COUNTS" = /* ]]
+if [[ "$PRISM_BARCODE_COUNTS" = /* ]]
 then
-    RAW_COUNTS=$(ls $RAW_COUNTS)
+    PRISM_BARCODE_COUNTS=$(ls $PRISM_BARCODE_COUNTS)
 else
-    RAW_COUNTS=$BUILD_DIR/$RAW_COUNTS
+    PRISM_BARCODE_COUNTS=$BUILD_DIR/$PRISM_BARCODE_COUNTS
+fi
+
+#Enforces abs paths
+if [[ "$UNKNOWN_BARCODE_COUNTS" = /* ]]
+then
+    UNKNOWN_BARCODE_COUNTS=$(ls $UNKNOWN_BARCODE_COUNTS)
+else
+    UNKNOWN_BARCODE_COUNTS=$BUILD_DIR/$UNKNOWN_BARCODE_COUNTS
 fi
 
 #Enforces abs paths
@@ -105,7 +113,6 @@ echo CONTROL_BARCODE_META is: $CONTROL_BARCODE_META
 echo COUNT_THRESHOLD is: $COUNT_THRESHOLD
 echo RAW_COUNTS_UNCOLLAPSED is: $RAW_COUNTS_UNCOLLAPSED
 echo LFC is: $LFC
-echo RAW_COUNTS is: $RAW_COUNTS
 echo REVERSE_INDEX2 is: $REVERSE_INDEX2
 
 args=(
@@ -117,23 +124,13 @@ args=(
 --count_threshold "$COUNT_THRESHOLD"
 --control_type "$CTL_TYPES"
 --raw_counts_uncollapsed "$RAW_COUNTS_UNCOLLAPSED"
---raw_counts "$RAW_COUNTS"
+--prism_barcode_counts "$PRISM_BARCODE_COUNTS"
+--unknown_barcode_counts "$UNKNOWN_BARCODE_COUNTS"
 --lfc "$LFC"
 --id_cols "$ID_COLS"
 --reverse_index2 "$REVERSE_INDEX2"
 )
 
-echo Rscript filteredCounts_QC.R --sample_meta $SAMPLE_META \
---annotated_counts $ANNOTATED_COUNTS \
---normalized_counts $NORMALIZED_COUNTS \
---sig_cols $SIG_COLS \
---out $BUILD_DIR \
---count_threshold $COUNT_THRESHOLD \
---reverse_index2 $REVERSE_INDEX2 \
---control_type $CTL_TYPES \
---raw_counts_uncollapsed $RAW_COUNTS_UNCOLLAPSED \
---raw_counts $RAW_COUNTS \
---lfc $LFC \
---id_cols $ID_COLS
+echo Rscript filteredCounts_QC.R "${args[@]}"
 
 Rscript filteredCounts_QC.R "${args[@]}"
\ No newline at end of file
diff --git a/scripts/join_metadata.sh b/scripts/join_metadata.sh
index d13f1698..dd736dd8 100644
--- a/scripts/join_metadata.sh
+++ b/scripts/join_metadata.sh
@@ -36,11 +36,11 @@ else
 fi
 
 #Enforces abs paths
-if [[ "$COLLAPSED_VALUES" = /* ]]
+if [[ "$COLLAPSED_LFC" = /* ]]
 then
-    COLLAPSED_VALUES=$(ls $COLLAPSED_VALUES)
+    COLLAPSED_LFC=$(ls $COLLAPSED_LFC)
 else
-    COLLAPSED_VALUES=$BUILD_DIR/$COLLAPSED_VALUES
+    COLLAPSED_LFC=$BUILD_DIR/$COLLAPSED_LFC
 fi
 
 #Enforces abs paths
@@ -61,18 +61,18 @@ fi
 
 echo Build dir is: $BUILD_DIR
 echo LFC is: $LFC
-echo COLLAPSED_VALUES is: $COLLAPSED_VALUES
+echo COLLAPSED_LFC is: $COLLAPSED_LFC
 echo SAMPLE_META is: $SAMPLE_META
 
 echo Rscript join_metadata.R --lfc $LFC \
---collapsed_l2fc $COLLAPSED_VALUES \
+--collapsed_lfc $COLLAPSED_LFC \
 --assay_pool_meta $ASSAY_POOL_META \
 --out $BUILD_DIR \
 --sig_cols $SIG_COLS \
 --sample_meta $SAMPLE_META
 
 Rscript join_metadata.R --lfc $LFC \
---collapsed_l2fc $COLLAPSED_VALUES \
+--collapsed_lfc $COLLAPSED_LFC \
 --assay_pool_meta $ASSAY_POOL_META \
 --out $BUILD_DIR \
 --sig_cols $SIG_COLS \
 
diff --git a/scripts/launch_job.sh b/scripts/launch_job.sh
index e2378644..1ed0d9d9 100644
--- a/scripts/launch_job.sh
+++ b/scripts/launch_job.sh
@@ -23,13 +23,17 @@ fi
 
 # List of parameters
 PARAMS=(
-    SEQ_TYPE API_URL BUILD_DIR INDEX_1 INDEX_2 BARCODE_SUFFIX REVERSE_INDEX2
-    SAMPLE_META CONTROL_BARCODE_META CTL_TYPES ID_COLS SIG_COLS
-    RUN_NORM CONTROL_COLS COUNT_THRESHOLD COUNT_COL_NAME BUILD_NAME
-    CONVERT_SUSHI PULL_POOL_ID RUN_EPS_QC PSEUDOCOUNT REMOVE_DATA DAYS
-    SEQUENCING_INDEX_COLS RAW_COUNTS CELL_SET_META CELL_LINE_META FILTERED_COUNTS
-    LFC COUNTS ANNOTATED_COUNTS COLLAPSED_VALUES NORMALIZED_COUNTS ASSAY_POOL_META
-    RAW_COUNTS_UNCOLLAPSED
+    SEQ_TYPE BUILD_DIR INDEX_1 INDEX_2 BARCODE_SUFFIX REVERSE_INDEX2 RUN_NORM BUILD_NAME
+    CONVERT_SUSHI PULL_POOL_ID RUN_EPS_QC REMOVE_DATA DAYS COUNTS
+    # metadata files
+    SAMPLE_META CELL_SET_META CELL_LINE_META CONTROL_BARCODE_META ASSAY_POOL_META
+    # sushi files
+    RAW_COUNTS_UNCOLLAPSED PRISM_BARCODE_COUNTS UNKNOWN_BARCODE_COUNTS ANNOTATED_COUNTS
+    FILTERED_COUNTS NORMALIZED_COUNTS LFC COLLAPSED_LFC
+    # column name parameters
+    SEQUENCING_INDEX_COLS ID_COLS CELL_LINE_COLS SIG_COLS CONTROL_COLS
+    # additional parameters
+    BARCODE_COL PSEUDOCOUNT COUNT_COL_NAME CTL_TYPES COUNT_THRESHOLD API_URL
 )
 
 # Load parameters
@@ -79,33 +83,36 @@ echo "Running in container:"
     -e BARCODE_SUFFIX="$BARCODE_SUFFIX" \
     -e REVERSE_INDEX2="$REVERSE_INDEX2" \
     -e SAMPLE_META="$SAMPLE_META" \
+    -e CELL_SET_META="$CELL_SET_META" \
+    -e CELL_LINE_META="$CELL_LINE_META" \
     -e CONTROL_BARCODE_META="$CONTROL_BARCODE_META" \
-    -e CTL_TYPES="$CTL_TYPES" \
+    -e ASSAY_POOL_META="$ASSAY_POOL_META" \
+    -e RAW_COUNTS_UNCOLLAPSED="$RAW_COUNTS_UNCOLLAPSED"\
+    -e PRISM_BARCODE_COUNTS="$PRISM_BARCODE_COUNTS"\
+    -e UNKNOWN_BARCODE_COUNTS="$UNKNOWN_BARCODE_COUNTS"\
+    -e ANNOTATED_COUNTS="$ANNOTATED_COUNTS" \
+    -e FILTERED_COUNTS="$FILTERED_COUNTS" \
+    -e
NORMALIZED_COUNTS="$NORMALIZED_COUNTS" \
+    -e LFC="$LFC" \
+    -e COLLAPSED_LFC="$COLLAPSED_LFC" \
+    -e SEQUENCING_INDEX_COLS="$SEQUENCING_INDEX_COLS" \
     -e ID_COLS="$ID_COLS" \
+    -e CELL_LINE_COLS="$CELL_LINE_COLS" \
     -e SIG_COLS="$SIG_COLS" \
-    -e RUN_NORM="$RUN_NORM" \
     -e CONTROL_COLS="$CONTROL_COLS" \
-    -e COUNT_THRESHOLD="$COUNT_THRESHOLD" \
+    -e BARCODE_COL="$BARCODE_COL" \
+    -e PSEUDOCOUNT="$PSEUDOCOUNT" \
     -e COUNT_COL_NAME="$COUNT_COL_NAME" \
+    -e CTL_TYPES="$CTL_TYPES" \
+    -e COUNT_THRESHOLD="$COUNT_THRESHOLD" \
+    -e RUN_NORM="$RUN_NORM" \
     -e BUILD_NAME="$BUILD_NAME" \
     -e CONVERT_SUSHI="$CONVERT_SUSHI" \
     -e PULL_POOL_ID="$PULL_POOL_ID" \
     -e RUN_EPS_QC="$RUN_EPS_QC" \
-    -e PSEUDOCOUNT="$PSEUDOCOUNT" \
     -e REMOVE_DATA="$REMOVE_DATA" \
     -e DAYS="$DAYS" \
-    -e SEQUENCING_INDEX_COLS="$SEQUENCING_INDEX_COLS" \
-    -e RAW_COUNTS="$RAW_COUNTS" \
-    -e CELL_SET_META="$CELL_SET_META" \
-    -e CELL_LINE_META="$CELL_LINE_META" \
-    -e FILTERED_COUNTS="$FILTERED_COUNTS" \
-    -e LFC="$LFC" \
     -e COUNTS="$COUNTS" \
-    -e ANNOTATED_COUNTS="$ANNOTATED_COUNTS" \
-    -e COLLAPSED_VALUES="$COLLAPSED_VALUES" \
-    -e NORMALIZED_COUNTS="$NORMALIZED_COUNTS" \
-    -e ASSAY_POOL_META="$ASSAY_POOL_META" \
-    -e RAW_COUNTS_UNCOLLAPSED="$RAW_COUNTS_UNCOLLAPSED"\
     -v "$WORKSPACE:/workspace" \
     -v /cmap/tools/analysis2clue/credentials:/root/.aws/credentials:ro \
     -v /local/jenkins/.clue_api_key:/local/jenkins/.clue_api_key:ro \
diff --git a/scripts/make_config_file.groovy b/scripts/make_config_file.groovy
index 513998b1..5b7ea040 100644
--- a/scripts/make_config_file.groovy
+++ b/scripts/make_config_file.groovy
@@ -54,6 +54,7 @@ pipeline {
     string(name: 'CONTROL_COLS', defaultValue: 'cell_set,day', description: 'Set of columns that define individual controls in COMPUTE_LFC')
 
     // Additional parameters
+    string(name: 'BARCODE_COL', defaultValue: 'forward_read_cl_barcode', description: 'In COLLATE_FASTQ_READS, the name of the column containing the read')
     string(name: 'PSEUDOCOUNT', defaultValue: '20', description: 'In CBNORMALIZE, the pseudocount value for log transformations.')
     string(name: 'COUNT_COL_NAME', defaultValue: 'normalized_n', description: 'In COMPUTE_LFC, the name of the numeric column to use for calculations')
     string(name: 'CTL_TYPES', defaultValue: 'negcon', description: 'In COMPUTE_LFC, the value in trt_type that indicates the negative controls')
@@ -125,7 +126,7 @@ pipeline {
         'SEQUENCING_INDEX_COLS', 'ID_COLS', 'CELL_LINE_COLS', 'SIG_COLS', 'CONTROL_COLS',
 
         // additional parameters
-        'PSEUDOCOUNT', 'COUNT_COL_NAME', 'CTL_TYPES', 'COUNT_THRESHOLD', 'API_URL'
+        'BARCODE_COL', 'PSEUDOCOUNT', 'COUNT_COL_NAME', 'CTL_TYPES', 'COUNT_THRESHOLD', 'API_URL'
       ]
 
       def config = [:]

From cb31403a0cac50afa7fcebf9a0e44549d1311808 Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Fri, 4 Oct 2024 14:20:25 -0400
Subject: [PATCH 090/127] Added commas

---
 scripts/make_config_file.groovy | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/make_config_file.groovy b/scripts/make_config_file.groovy
index 5b7ea040..eb039213 100644
--- a/scripts/make_config_file.groovy
+++ b/scripts/make_config_file.groovy
@@ -116,11 +116,11 @@ pipeline {
         'COUNTS', 'SCREEN',
 
         // metadata files
-        'SAMPLE_META', 'CELL_SET_META', 'CELL_LINE_META', 'CONTROL_BARCODE_META', 'ASSAY_POOL_META'
+        'SAMPLE_META', 'CELL_SET_META', 'CELL_LINE_META', 'CONTROL_BARCODE_META', 'ASSAY_POOL_META',
 
         // sushi files
         'RAW_COUNTS_UNCOLLAPSED', 'PRISM_BARCODE_COUNTS', 'UNKNOWN_BARCODE_COUNTS',
-        'ANNOTATED_COUNTS', 'FILTERED_COUNTS', 'NORMALIZED_COUNTS', 'LFC',
'COLLAPSED_LFC' + 'ANNOTATED_COUNTS', 'FILTERED_COUNTS', 'NORMALIZED_COUNTS', 'LFC', 'COLLAPSED_LFC', // column name parameters 'SEQUENCING_INDEX_COLS', 'ID_COLS', 'CELL_LINE_COLS', 'SIG_COLS', 'CONTROL_COLS', From b22ac03e54558a92620054da9e3e547800ff26ad Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 14:24:24 -0400 Subject: [PATCH 091/127] Removed raw_counts --- scripts/filter_counts.sh | 6 ------ 1 file changed, 6 deletions(-) diff --git a/scripts/filter_counts.sh b/scripts/filter_counts.sh index 76f72f55..fb8a2d9a 100644 --- a/scripts/filter_counts.sh +++ b/scripts/filter_counts.sh @@ -8,12 +8,6 @@ then exit -1 fi -if [ -z "$RAW_COUNTS" ] -then - echo RAW_COUNTS parameter empty - exit -1 -fi - if [ -z "$SAMPLE_META" ] then echo SAMPLE_META parameter empty From b171ff820cf56d6a2086db22719518d98e1a9fcd Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 14:32:15 -0400 Subject: [PATCH 092/127] Update name --- scripts/join_metadata.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/join_metadata.sh b/scripts/join_metadata.sh index dd736dd8..b4159ec1 100644 --- a/scripts/join_metadata.sh +++ b/scripts/join_metadata.sh @@ -15,7 +15,7 @@ then fi -if [ -z "$COLLAPSED_VALUES" ] +if [ -z "$COLLAPSED_LFC" ] then echo Collapsed l2fc parameter empty exit -1 From 05780aadeac274870d5c7640ff7efc5210c1dc1c Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 14:36:32 -0400 Subject: [PATCH 093/127] Dropped CCLE_name as a key --- scripts/join_metadata.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/join_metadata.R b/scripts/join_metadata.R index 226858ba..63433c8a 100644 --- a/scripts/join_metadata.R +++ b/scripts/join_metadata.R @@ -56,7 +56,7 @@ if(file.exists(args$lfc)) { if(assay_pool_meta_exists) { l2fc_with_meta_columns= join_metadata(input_df= l2fc_with_meta_columns, metadata= input_assay_pool_meta, - key_cols= c('DepMap_ID', 'CCLE_name', 'cell_set')) + key_cols= c('DepMap_ID', 'cell_set')) } else { print('WARNING: Assay pool meta not detected and will not be joined onto l2fc.') } @@ -83,7 +83,7 @@ if(file.exists(args$collapsed_l2fc)) { print('Attempting to add assay_pool_meta to collapsed l2fc.') collapsed_l2fc_with_meta_columns= join_metadata(input_df= collapsed_l2fc_with_meta_columns, metadata= input_assay_pool_meta, - key_cols= c('DepMap_ID', 'CCLE_name', 'cell_set')) + key_cols= c('DepMap_ID', 'cell_set')) } else { print('WARNING: Assay pool meta not detected and will not be joined onto collapsed l2fc.') } From c6dc30c406b6534c68fa9befa4e709dafdad3939 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 14:43:24 -0400 Subject: [PATCH 094/127] Fixed renaming --- scripts/join_metadata.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/join_metadata.R b/scripts/join_metadata.R index 63433c8a..63910922 100644 --- a/scripts/join_metadata.R +++ b/scripts/join_metadata.R @@ -64,13 +64,13 @@ if(file.exists(args$lfc)) { # Write out outpath= paste(args$out, 'l2fc_with_meta_columns.csv', sep='/') print(paste("Writing l2fc_with_meta_columns.csv to ", outpath)) - write.csv(l2fc_with_meta_columns, outpath, row.names= FALSE, quote= FALSE) + l2fc_with_meta_columns %>% write.csv(outpath, row.names= FALSE, quote= FALSE) } else { print('WARNING: l2fc.csv does not exist. 
Skipping this file.') } # Add sample meta and assay pool meta to collapsed_l2fc table ---- -if(file.exists(args$collapsed_l2fc)) { +if(file.exists(args$collapsed_lfc)) { print('Attempting to add sample_meta to collapsed l2fc.') collapsed_l2fc= data.table::fread(args$collapsed_l2fc, header= T, sep= ',') @@ -91,7 +91,7 @@ if(file.exists(args$collapsed_l2fc)) { # Write out outpath= paste(args$out, 'collapsed_l2fc_with_meta_columns.csv', sep='/') print(paste("Writing collapsed_l2fc_with_meta_columns.csv to ", outpath)) - write.csv(collapsed_l2fc_with_meta_columns, outpath, row.names= FALSE, quote= FALSE) + collapsed_l2fc_with_meta_columns %>% write.csv(outpath, row.names= FALSE, quote= FALSE) } else { print('WARNING: collapsed_l2fc.csv does not exist. Skipping this file.') } From e924827a02a0aa56e5726b285bd14b0e687b6447 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 14:45:20 -0400 Subject: [PATCH 095/127] Caught last bug --- scripts/join_metadata.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/join_metadata.R b/scripts/join_metadata.R index 63910922..79d442f6 100644 --- a/scripts/join_metadata.R +++ b/scripts/join_metadata.R @@ -72,7 +72,7 @@ if(file.exists(args$lfc)) { # Add sample meta and assay pool meta to collapsed_l2fc table ---- if(file.exists(args$collapsed_lfc)) { print('Attempting to add sample_meta to collapsed l2fc.') - collapsed_l2fc= data.table::fread(args$collapsed_l2fc, header= T, sep= ',') + collapsed_l2fc= data.table::fread(args$collapsed_lfc, header= T, sep= ',') # Add sample meta columns to collapsed l2fc collapsed_l2fc_with_meta_columns= join_metadata(input_df= collapsed_l2fc, metadata= sample_meta, From ec143ea824aa3e64765183910f037991a65ab028 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 15:06:42 -0400 Subject: [PATCH 096/127] Added kitchen utensils --- scripts/collapse_replicates.R | 1 + scripts/compute_l2fc.R | 1 + 2 files changed, 2 insertions(+) diff --git a/scripts/collapse_replicates.R b/scripts/collapse_replicates.R index 771b08f9..d48d54b1 100755 --- a/scripts/collapse_replicates.R +++ b/scripts/collapse_replicates.R @@ -5,6 +5,7 @@ library(tidyverse) suppressPackageStartupMessages(library(argparse)) suppressPackageStartupMessages(library(magrittr)) source("./src/collapse_bio_reps.R") +source("./src/kitchen_utensils.R") # Argument parser ---- parser <- ArgumentParser() diff --git a/scripts/compute_l2fc.R b/scripts/compute_l2fc.R index 38b51ddd..7ef42653 100755 --- a/scripts/compute_l2fc.R +++ b/scripts/compute_l2fc.R @@ -5,6 +5,7 @@ library(tidyverse) suppressPackageStartupMessages(library(argparse)) suppressPackageStartupMessages(library(dplyr)) source("./src/compute_l2fc.R") +source("./src/kitchen_utensils.R") # Argument parser ---- parser <- ArgumentParser() From 10f7c9e24d5e09d688a6e33f05a876d5f08abd75 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 15:07:00 -0400 Subject: [PATCH 097/127] Removed counts flag filter --- scripts/src/collapse_bio_reps.R | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/scripts/src/collapse_bio_reps.R b/scripts/src/collapse_bio_reps.R index 223d3d68..5c09eecb 100755 --- a/scripts/src/collapse_bio_reps.R +++ b/scripts/src/collapse_bio_reps.R @@ -1,19 +1,3 @@ -#' validate_columns_exist -#' -#' This function checks that a list of columns are present in a dataframe. 
-#' -#' @param selected_columns A vector of strings each representing a column name -#' @param df A dataframe to check against -#' @return Boolean -validate_columns_exist= function(selected_columns, df) { - # Check that all of selected_columns are in df - if(any(!selected_columns %in% colnames(df))) { - return(FALSE) - } else { - return(TRUE) - } -} - #' validate_num_bio_reps #' #' This function checks that all the expected flowcells are present in a table of detected flowcells. @@ -57,8 +41,7 @@ collapse_bio_reps= function(l2fc, sig_cols, cell_line_cols= c('project_code', 'D } # Median collapsing bio replicates ---- - collapsed_counts= l2fc %>% dplyr::filter(is.na(counts_flag)) %>% - tidyr::unite(col= 'sig_id', all_of(sig_cols), sep= ':', na.rm= FALSE, remove= FALSE) %>% + collapsed_counts= l2fc %>% tidyr::unite(col= 'sig_id', all_of(sig_cols), sep= ':', na.rm= FALSE, remove= FALSE) %>% dplyr::group_by(pick(all_of(c(cell_line_cols, 'sig_id', sig_cols)))) %>% dplyr::summarise(trt_median_n= median(mean_n), trt_median_normalized_n= median(mean_normalized_n), trt_mad_sqrtN= mad(log2(mean_normalized_n)) / sqrt(dplyr::n()), From a2259cf502a01a6c06d7e2e692662b6fc47cd922 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 15:09:14 -0400 Subject: [PATCH 098/127] Remove some columns Dropped num_tech_reps, a num_tech_rep check, control_MAD_QC, and counts_flag --- scripts/src/compute_l2fc.R | 31 ++++++------------------------- 1 file changed, 6 insertions(+), 25 deletions(-) diff --git a/scripts/src/compute_l2fc.R b/scripts/src/compute_l2fc.R index f89d9797..3007253f 100755 --- a/scripts/src/compute_l2fc.R +++ b/scripts/src/compute_l2fc.R @@ -1,19 +1,3 @@ -#' validate_columns_exist -#' -#' This function checks that a list of columns are present in a dataframe. 
-#' -#' @param selected_columns A vector of strings each representing a column name -#' @param df A dataframe to check against -#' @return Boolean -validate_columns_exist= function(selected_columns, df) { - # Check that all of selected_columns are in df - if(any(!selected_columns %in% colnames(df))) { - return(FALSE) - } else { - return(TRUE) - } -} - #' compute_l2fc #' #' takes normalized counts and computes log-fold change values as compared to the designated control condition @@ -68,13 +52,12 @@ compute_l2fc= function(normalized_counts, dplyr::filter(!(trt_type %in% c("empty", "", "CB_only")) & !is.na(trt_type), !is.na(CCLE_name)) %>% dplyr::group_by(pick(all_of(c(cell_line_cols, 'trt_type', bio_rep_id_cols, ctrl_cols)))) %>% dplyr::summarise(mean_n= mean(n), - mean_normalized_n = mean(!!rlang::sym(count_col_name)), - num_tech_reps= dplyr::n()) %>% dplyr::ungroup() + mean_normalized_n = mean(!!rlang::sym(count_col_name))) %>% dplyr::ungroup() # Print out the occurrence of each count of tech_reps - print('Number of technical replicate collapsed across all cell lines and biological replicates:') - print(collapsed_tech_rep %>% dplyr::group_by(num_tech_reps) %>% - dplyr::summarise(count= dplyr::n()) %>% dplyr::ungroup()) + # print('Number of technical replicate collapsed across all cell lines and biological replicates:') + # print(collapsed_tech_rep %>% dplyr::group_by(num_tech_reps) %>% + # dplyr::summarise(count= dplyr::n()) %>% dplyr::ungroup()) # Pull out negative controls and collapse any biological replicates ---- print('Collapsing control conditions on the following columns: ') @@ -84,8 +67,7 @@ compute_l2fc= function(normalized_counts, dplyr::summarise(control_median_n= median(mean_n), control_median_normalized_n = median(mean_normalized_n), control_mad_sqrtN = mad(log2(mean_normalized_n))/sqrt(dplyr::n()), - num_ctrl_bio_reps = dplyr::n()) %>% dplyr::ungroup() %>% - dplyr::mutate(control_MAD_QC = (control_mad_sqrtN <= 0.5/log10(2))) #%>% # New: adjusted cut off to log2 + num_ctrl_bio_reps = dplyr::n()) %>% dplyr::ungroup() # Validation: Check that negative controls were extracted ---- if(nrow(controls)==0) { @@ -95,8 +77,7 @@ compute_l2fc= function(normalized_counts, # Join neg_cons and compute l2fc ---- l2fc= collapsed_tech_rep %>% dplyr::filter(!trt_type %in% c(control_type, 'day_0')) %>% dplyr::inner_join(controls, by= c(cell_line_cols, ctrl_cols), relationship='many-to-one') %>% - dplyr::mutate(l2fc= log2(mean_normalized_n/control_median_normalized_n), - counts_flag= ifelse(control_median_n < count_threshold, paste0('negcon<', count_threshold), NA)) + dplyr::mutate(l2fc= log2(mean_normalized_n/control_median_normalized_n)) return(l2fc) } From cbbe3a7bc0ca1d1f3c50fe38cecdca7e65cef139 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 4 Oct 2024 17:12:53 -0400 Subject: [PATCH 099/127] Updated QC job name Also added a text description test --- scripts/make_config_file.groovy | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/scripts/make_config_file.groovy b/scripts/make_config_file.groovy index eb039213..f122fc61 100644 --- a/scripts/make_config_file.groovy +++ b/scripts/make_config_file.groovy @@ -13,7 +13,7 @@ pipeline { booleanParam(name: 'CBNORMALIZE', defaultValue: true, description: 'Check this to trigger the CBnormalize job.') booleanParam(name: 'COMPUTE_LFC', defaultValue: true, description: 'Check this to trigger the compute_l2fc job.') booleanParam(name: 'COLLAPSE', defaultValue: true, description: 'Check this to trigger the collapse 
job.') - booleanParam(name: 'FILTER_COUNTS_QC', defaultValue: true, description: 'Check this to trigger the QC job.') + booleanParam(name: 'QC_IMAGES', defaultValue: true, description: 'Check this to trigger the QC job.') booleanParam(name: 'JOIN_METADATA', defaultValue: true, description: 'Check this to trigger the join_metadata job.') booleanParam(name: 'REMOVE_DATA', defaultValue: false, description: 'Select if there is experimental data that needs to be removed before normalization. TODO: expand on this.') booleanParam(name: 'RUN_NORM', defaultValue: true, description: 'Run normalization module on data.') @@ -30,7 +30,7 @@ pipeline { string(name: 'COMMIT_ID', defaultValue: '', description: 'Specific commit ID to use (leave empty if using the latest commit in the branch or if already specified in the config file.)') // Metadata files used by sushi - string(name: 'SAMPLE_META', defaultValue: 'sample_meta.csv', description: 'File name of sample metadata within the BUILD_DIR directory.') + string(name: 'SAMPLE_META', defaultValue: 'sample_meta.csv', description: 'File name of sample metadata within the BUILD_DIR directory.\n can text be formated here?') string(name: 'CELL_SET_META', defaultValue: 'cell_set_meta.csv', description: 'Cell Set Metadata. Static cell_line_meta location: /data/vdb/prismSeq/cell_set_meta.csv') string(name: 'CELL_LINE_META', defaultValue: 'cell_line_meta.csv', description: 'File in BUILD_DIR containing cell line metadata') string(name: 'CONTROL_BARCODE_META', defaultValue: 'CB_meta.csv', description: 'Metadata for control barcodes.') @@ -40,11 +40,11 @@ pipeline { string(name: 'RAW_COUNTS_UNCOLLAPSED', defaultValue: 'raw_counts_uncollapsed.csv', description: 'Filename in BUILD_DIR containing nori output') string(name: 'PRISM_BARCODE_COUNTS', defaultValue: 'prism_barcode_counts.csv', description: 'Filename in BUILD_DIR containing PRISM barcode counts') string(name: 'UNKNOWN_BARCODE_COUNTS', defaultValue: 'unknown_barcode_counts.csv', description: 'Filename in BUILD_DIR containing unknown barcode counts') - string(name: 'ANNOTATED_COUNTS', defaultValue: 'annotated_counts.csv', description: 'File in BUILD_DIR containing annotated counts') - string(name: 'FILTERED_COUNTS', defaultValue: 'filtered_counts.csv', description: 'File in BUILD_DIR containing filtered counts') - string(name: 'NORMALIZED_COUNTS', defaultValue: 'normalized_counts.csv', description: 'File in BUILD_DIR containing normalized counts') - string(name: 'LFC', defaultValue: 'l2fc.csv', description: 'File containing log2 fold change values') - string(name: 'COLLAPSED_LFC', defaultValue: 'collapsed_l2fc.csv', description: 'File in BUILD_DIR containing replicate collapsed l2fc values') + string(name: 'ANNOTATED_COUNTS', defaultValue: 'annotated_counts.csv', description: 'Filename in BUILD_DIR containing annotated counts') + string(name: 'FILTERED_COUNTS', defaultValue: 'filtered_counts.csv', description: 'Filename in BUILD_DIR containing filtered counts') + string(name: 'NORMALIZED_COUNTS', defaultValue: 'normalized_counts.csv', description: 'Filename in BUILD_DIR containing normalized counts') + string(name: 'LFC', defaultValue: 'l2fc.csv', description: 'Filename containing log2 fold change values') + string(name: 'COLLAPSED_LFC', defaultValue: 'collapsed_l2fc.csv', description: 'Filename in BUILD_DIR containing replicate collapsed l2fc values') // Column names parameters string(name: 'SEQUENCING_INDEX_COLS', defaultValue: 'flowcell_names,index_1,index_2', description: 'Sequencing index columns used in 
COLLATE_FASTQ_READS') @@ -58,7 +58,7 @@ pipeline { string(name: 'PSEUDOCOUNT', defaultValue: '20', description: 'In CBNORMALIZE, the pesudocount value for log transformations.') string(name: 'COUNT_COL_NAME', defaultValue: 'normalized_n', description: 'In COMPUTE_LFC, the name of the numeric column to use for calculations') string(name: 'CTL_TYPES', defaultValue: 'negcon', description: 'In COMPUTE_LFC, the value in trt_type that indicates the negative controls') - string(name: 'COUNT_THRESHOLD', defaultValue: '40', description: 'In FILTER_COUNTS_QC, the threshold for calling reads with low counts') + string(name: 'COUNT_THRESHOLD', defaultValue: '40', description: 'In QC_IMAGES, the threshold for calling reads with low counts') string(name: 'API_URL', defaultValue: 'https://api.clue.io/api/', description: 'API URL') } @@ -219,7 +219,7 @@ pipeline { if (params.COLLAPSE) { scriptsToRun.add('collapse_replicates.sh') } - if (params.FILTER_COUNTS_QC) { + if (params.QC_IMAGES) { scriptsToRun.add('filteredCounts_QC.sh') } if (params.JOIN_METADATA) { From 9291889f7eba96cc1dd95de7a24967806f0e19a9 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Tue, 8 Oct 2024 17:06:49 -0400 Subject: [PATCH 100/127] Moved most common params up --- scripts/make_config_file.groovy | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/scripts/make_config_file.groovy b/scripts/make_config_file.groovy index f122fc61..529d7861 100644 --- a/scripts/make_config_file.groovy +++ b/scripts/make_config_file.groovy @@ -29,8 +29,16 @@ pipeline { booleanParam(name: 'USE_LATEST', defaultValue: true, description: 'Check this to use the most up to date version from the specified branch. If not checked, will use the specified commit.') string(name: 'COMMIT_ID', defaultValue: '', description: 'Specific commit ID to use (leave empty if using the latest commit in the branch or if already specified in the config file.)') + // Most common parameters + // Column names parameters + string(name: 'SEQUENCING_INDEX_COLS', defaultValue: 'flowcell_names,index_1,index_2', description: 'Sequencing index columns used in COLLATE_FASTQ_READS') + string(name: 'ID_COLS', defaultValue: 'pcr_plate,pcr_well', description: 'Columns to concat to create unique ID for each sample-replicate') + string(name: 'CELL_LINE_COLS', defaultValue: 'DepMap_ID', description: 'Columns in intermediate files that describe a read or cell line') + string(name: 'SIG_COLS', defaultValue: 'cell_set,treatment,dose,dose_unit,day', description: 'Signature columns') + string(name: 'CONTROL_COLS', defaultValue: 'cell_set,day', description: 'Set of columns that define individual controls in COMPUTE_LFC') + // Metadata files used by sushi - string(name: 'SAMPLE_META', defaultValue: 'sample_meta.csv', description: 'File name of sample metadata within the BUILD_DIR directory.\n can text be formated here?') + string(name: 'SAMPLE_META', defaultValue: 'sample_meta.csv', description: 'File name of sample metadata within the BUILD_DIR directory.') string(name: 'CELL_SET_META', defaultValue: 'cell_set_meta.csv', description: 'Cell Set Metadata. 
Static cell_line_meta location: /data/vdb/prismSeq/cell_set_meta.csv') string(name: 'CELL_LINE_META', defaultValue: 'cell_line_meta.csv', description: 'File in BUILD_DIR containing cell line metadata') string(name: 'CONTROL_BARCODE_META', defaultValue: 'CB_meta.csv', description: 'Metadata for control barcodes.') @@ -46,19 +54,12 @@ pipeline { string(name: 'LFC', defaultValue: 'l2fc.csv', description: 'Filename containing log2 fold change values') string(name: 'COLLAPSED_LFC', defaultValue: 'collapsed_l2fc.csv', description: 'Filename in BUILD_DIR containing replicate collapsed l2fc values') - // Column names parameters - string(name: 'SEQUENCING_INDEX_COLS', defaultValue: 'flowcell_names,index_1,index_2', description: 'Sequencing index columns used in COLLATE_FASTQ_READS') - string(name: 'ID_COLS', defaultValue: 'pcr_plate,pcr_well', description: 'Columns to concat to create unique ID for each sample-replicate') - string(name: 'CELL_LINE_COLS', defaultValue: 'DepMap_ID', description: 'Columns in intermediate files that describe a read or cell line') - string(name: 'SIG_COLS', defaultValue: 'cell_set,treatment,dose,dose_unit,day', description: 'Signature columns') - string(name: 'CONTROL_COLS', defaultValue: 'cell_set,day', description: 'Set of columns that define individual controls in COMPUTE_LFC') - // Additional parameters string(name: 'BARCODE_COL', defaultValue: 'forward_read_cl_barcode', description: 'In COLLATE_FASTQ_READS, the name of the column containing the read') string(name: 'PSEUDOCOUNT', defaultValue: '20', description: 'In CBNORMALIZE, the pesudocount value for log transformations.') string(name: 'COUNT_COL_NAME', defaultValue: 'normalized_n', description: 'In COMPUTE_LFC, the name of the numeric column to use for calculations') string(name: 'CTL_TYPES', defaultValue: 'negcon', description: 'In COMPUTE_LFC, the value in trt_type that indicates the negative controls') - string(name: 'COUNT_THRESHOLD', defaultValue: '40', description: 'In QC_IMAGES, the threshold for calling reads with low counts') + string(name: 'COUNT_THRESHOLD', defaultValue: '40', description: 'Drops cell lines below this threshold in the negative controls') string(name: 'API_URL', defaultValue: 'https://api.clue.io/api/', description: 'API URL') } From aeeb837060fe3b98765ddec2530a81856e2c81fd Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 9 Oct 2024 09:39:16 -0400 Subject: [PATCH 101/127] Updated unknown barcode identification Added a low_abundance_threshold filter to rename reads --- scripts/src/collate_fastq_reads.R | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index 1aba70a9..aeb3aff6 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -52,7 +52,8 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, id_cols= c('pcr_plate', 'pcr_well'), known_barcodes, reverse_index2= FALSE, - barcode_col= 'forward_read_cl_barcode') { + barcode_col= 'forward_read_cl_barcode', + low_abundance_threshold= 20) { require(tidyverse) require(data.table) @@ -165,11 +166,14 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # Performing inner join with data.table instead of dplyr summed_reads= data.table::merge.data.table(uncollapsed_raw_counts, sequencing_map, by= sequencing_index_cols) # Code below is checking if a barcode is in the list of known barcodes. 
- # If the barcode is not in the list of known barcodes, then the barcode is replaced with the string "unknown_reads". - # Function := performs the mutate inplace without copying the dataframe. + # If the barcode is not in the list of known barcodes and its counts is below the low_abundance_threshold, + # then the barcode is replaced with the string "unknown_low_abundance_barcode". + # Function := performs the mutate in place without copying the dataframe. # Functions fifelse and %chin% are just faster data.table versions of ifelse and %in%. - summed_reads[, c(barcode_col) := data.table::fifelse(get(barcode_col) %chin% unique(known_barcodes), - get(barcode_col), 'unknown_reads')] + summed_reads[, c(barcode_col) := data.table::fifelse( + !get(barcode_col) %chin% unique(known_barcodes) & n < low_abundance_threshold, + 'unknown_low_abundance_barcode', get(barcode_col))] + # Use data.table to group by id_cols and barcode_col and sum up reads across flowcells. summed_reads= summed_reads[, .(n= sum(n)), by= c(id_cols, barcode_col)] @@ -187,6 +191,6 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # Return list of two dfs with known or unknown read counts ---- print('Completing collate_fastq_reads.') - return(list(prism_barcode_counts= summed_reads[summed_reads[[barcode_col]] != 'unknown_reads',], - unknown_barcode_counts= summed_reads[summed_reads[[barcode_col]] == 'unknown_reads',])) + return(list(prism_barcode_counts= summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,], + unknown_barcode_counts= summed_reads[!summed_reads[[barcode_col]] %chin% known_barcodes,])) } From 4e8f6773291ee7c087220e09774fc82703bedcbd Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 9 Oct 2024 09:50:39 -0400 Subject: [PATCH 102/127] Added back unknown reads summary --- scripts/src/QC_images.R | 73 ++++++++++++++++++++++++++++++----------- 1 file changed, 54 insertions(+), 19 deletions(-) diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R index 5b16b2c7..a45b911f 100755 --- a/scripts/src/QC_images.R +++ b/scripts/src/QC_images.R @@ -470,6 +470,7 @@ QC_images= function(raw_counts_uncollapsed_path, prism_barcode_counts, unknown_barcode_counts, annotated_counts, normalized_counts= NA, l2fc, sample_meta, + barcode_col= 'forward_read_cl_barcode', id_cols= c('pcr_plate', 'pcr_well'), cell_line_cols= c('DepMap_ID'), sig_cols, @@ -490,7 +491,7 @@ QC_images= function(raw_counts_uncollapsed_path, if(is.na(out)) {out= getwd()} # Create empty vector to collect potential errors when running QCs - skipped_qcs= c() + skipped_qcs= c() # Count number of distinct profile to help scale some plots. num_profiles= annotated_counts %>% dplyr::distinct(pick(all_of(id_cols))) %>% nrow() @@ -580,9 +581,43 @@ QC_images= function(raw_counts_uncollapsed_path, skipped_qcs = c(skipped_qcs, potential_error) } + ## 4. Unknown barcodes ---- + print('4. 
Generating table of unknown barcode reads ...')
+  potential_error= base::tryCatch({
+    unknown_totals= unknown_barcode_counts[, .(well_total= sum(n)), by= id_cols]
+    prism_totals= prism_barcode_counts[, .(well_total= sum(n)), by= id_cols]
+    well_totals= data.table::rbindlist(list(unknown_totals, prism_totals))[, .(well_total= sum(well_total)), by= id_cols]
+
+    unknown_barcodes= unknown_barcode_counts %>%
+      dplyr::filter(.data[[barcode_col]] != 'unknown_low_abundance_barcode') %>%
+      dplyr::left_join(well_totals, by= id_cols) %>%
+      dplyr::mutate(read_percent= n / well_total) %>%
+      dplyr::group_by(pick(all_of(barcode_col))) %>%
+      dplyr::summarise(total= sum(n),
+                       median_read= median(n),
+                       median_percent= median(read_percent),
+                       max_read= max(n),
+                       max_percent= max(read_percent),
+                       num_wells= dplyr::n()) %>% dplyr::ungroup() %>%
+      dplyr::arrange(dplyr::desc(median_percent))
+
+    unknown_barcodes %>% write.csv(file= paste(out, 'unknown_barcodes_summary.csv', sep= '/'),
+                                   row.names= FALSE, quote= FALSE)
+  }, error= function(e) {
+    print(e)
+    print('Encountered an error when creating the unknown barcode summary table. Skipping this output ...')
+    return('Unknown barcodes table')
+  })
+
+  # Collect returned string if an error occurred
+  if(!is.null(potential_error)) {
+    skipped_qcs = c(skipped_qcs, potential_error)
+  }
+  #
+
   # Assay QCs _________________________ ----
-  ## 4. Cell lines recovered ----
-  print('4. Generating cell_lines_present image ...')
+  ## 5. Cell lines recovered ----
+  print('5. Generating cell_lines_present image ...')
   potential_error= base::tryCatch({
     cl_rec= create_recovery_barplot(filtered_counts, id_cols= id_cols, facet_col= 'pcr_plate',
                                     count_threshold= count_threshold, plot_type= 'percent')
@@ -603,8 +638,8 @@
     skipped_qcs = c(skipped_qcs, potential_error)
   }
 
-  ## 5. Cell line contaminants ----
-  print('5. Generating cell line contaminants ...')
+  ## 6. Cell line contaminants ----
+  print('6. Generating cell line contaminants ...')
   potential_error= base::tryCatch({
     contams= annotated_counts %>% dplyr::filter(expected_read == FALSE) %>%
       dplyr::mutate(barcode_id= ifelse(is.na(CCLE_name), Name, CCLE_name)) %>%
@@ -625,8 +660,8 @@
     skipped_qcs = c(skipped_qcs, potential_error)
   }
 
-  ## 6. Cumulative counts by lines in negcons ----
-  print('6. Generating cumulative image ...')
+  ## 7. Cumulative counts by lines in negcons ----
+  print('7. Generating cumulative image ...')
   potential_error= base::tryCatch({
     cdf_plot= create_cdf_plot(filtered_counts %>% dplyr::filter(trt_type == control_type),
                               id_cols= id_cols,
@@ -651,9 +686,9 @@
     skipped_qcs= c(skipped_qcs, potential_error)
   }
 
-  ## 7. Control barcode trends ----
+  ## 8. Control barcode trends ----
   if(contains_cbs & is.data.frame(normalized_counts)) {
-    print('7. Generating control_barcode_trend image')
+    print('8. Generating control_barcode_trend image')
     potential_error= base::tryCatch({
       trend_sc= create_ctrlBC_scatterplots(normalized_counts %>% dplyr::filter(control_barcodes %in% c("Y", "T", T)),
                                            id_cols, value_col= 'log2_n')
@@ -674,11 +709,11 @@
       skipped_qcs= c(skipped_qcs, potential_error)
     }
   } else {
-    print('7. No control barcodes detected. Skipping control_barcode_trend image.')
+    print('8. No control barcodes detected. Skipping control_barcode_trend image.')
   }
 
-  ## 8. Sample correlation -----
-  print('8. Generating sample_cor image ...')
+  ## 9.
Sample correlation ----- + print('9. Generating sample_cor image ...') potential_error= base::tryCatch({ cor_df= filtered_counts %>% dplyr::filter(!is.na(DepMap_ID), !is.na(trt_type), !trt_type %in% c('empty', '', 'CB_only')) %>% @@ -704,13 +739,13 @@ QC_images= function(raw_counts_uncollapsed_path, skipped_qcs= c(skipped_qcs, potential_error) } - ## 9. Tech rep correlations ---- + ## 10. Tech rep correlations ---- if(is.data.frame(normalized_counts) & 'tech_rep' %in% colnames(normalized_counts)) { # Check if there are more at least two tech reps unique_tech_reps= na.omit(unique(normalized_counts$tech_rep)) if(length(unique_tech_reps) >= 2) { - print('9. Generating tech rep correlations image ...') + print('10. Generating tech rep correlations image ...') # Set up replicate groups depending "bio_rep" column if('bio_rep' %in% colnames(normalized_counts) & !'bio_rep' %in% sig_cols) { replicate_group_cols= c(sig_cols, 'bio_rep') @@ -752,13 +787,13 @@ QC_images= function(raw_counts_uncollapsed_path, } } else { - print('9. No technical replicates detected. Skipping tech_reps scatter plot.') + print('10. No technical replicates detected. Skipping tech_reps scatter plot.') } } else { - print('9. No technical replicates detected. Skipping tech_reps scatter plot.') + print('10. No technical replicates detected. Skipping tech_reps scatter plot.') } - ## 10. Bio rep correlations ---- + ## 11. Bio rep correlations ---- if('bio_rep' %in% colnames(l2fc)) { unique_bio_reps= na.omit(unique(l2fc$bio_rep)) @@ -782,7 +817,7 @@ QC_images= function(raw_counts_uncollapsed_path, # } # Bio replicate heatmap - print('10. Generating bio rep correlations heatmap ...') + print('11. Generating bio rep correlations heatmap ...') potential_error= base::tryCatch({ bio_corr_hm= create_cor_heatmap(input_df= l2fc_with_log2, row_id_cols= cell_line_cols, @@ -805,7 +840,7 @@ QC_images= function(raw_counts_uncollapsed_path, } } else { - print('10. No biological replicates detected. Skipping bio_rep heatmap.') + print('11. No biological replicates detected. Skipping bio_rep heatmap.') } } From 8ef52039b58c0632054dc9764fdf3c983ca33831 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 9 Oct 2024 11:05:46 -0400 Subject: [PATCH 103/127] Updated scripts for new params --- scripts/collate_fastq_reads.R | 5 ++++- scripts/collate_fastq_reads.sh | 1 + scripts/filter_counts.R | 1 - scripts/filteredCounts_QC.R | 3 +++ scripts/filteredCounts_QC.sh | 1 + scripts/launch_job.sh | 2 +- scripts/make_config_file.groovy | 3 ++- 7 files changed, 12 insertions(+), 4 deletions(-) diff --git a/scripts/collate_fastq_reads.R b/scripts/collate_fastq_reads.R index 89421e57..5dce7d7e 100644 --- a/scripts/collate_fastq_reads.R +++ b/scripts/collate_fastq_reads.R @@ -24,6 +24,8 @@ parser$add_argument("--reverse_index2", type="logical", default=FALSE, help= "Reverse complement of index 2 for NovaSeq and NextSeq") parser$add_argument("--barcode_col", default= "forward_read_cl_barcode", help= "Name of the column in uncollapsed_raw_counts that contains the barcode sequences.") +parser$add_argument('--low_abundance_threshold', default= 20, + help= 'For unknown barcodes, counts below this threshold will be marked as an unknown barcode.') parser$add_argument("-o", "--out", default=getwd(), help = "Output path. 
Default is working directory") # get command line options, if help option encountered print help and exit @@ -65,7 +67,8 @@ chunked_results= process_in_chunks(large_file_path= args$raw_counts_uncollapsed, id_cols= id_cols, known_barcodes= unique(c(cell_line_meta$Sequence, CB_meta$Sequence)), reverse_index2= args$reverse_index2, - barcode_col= args$barcode_col) + barcode_col= args$barcode_col, + low_abundance_threshold= as.numeric(args$low_abundance_threshold)) # From each chunk, extract prism_barcode_counts and bind the rows together into one dataframe. prism_barcode_counts= data.table::rbindlist(lapply(chunked_results, function(x) x$prism_barcode_counts)) diff --git a/scripts/collate_fastq_reads.sh b/scripts/collate_fastq_reads.sh index 6acea4ea..358444e0 100644 --- a/scripts/collate_fastq_reads.sh +++ b/scripts/collate_fastq_reads.sh @@ -114,6 +114,7 @@ args=( --id_cols "$ID_COLS" --reverse_index2 "$REVERSE_INDEX2" --barcode_col "$BARCODE_COL" +--low_abundance_threshold "$LOW_ABUNDANCE_THRESHOLD" --out "$BUILD_DIR" ) diff --git a/scripts/filter_counts.R b/scripts/filter_counts.R index 7cd0326f..15ed449f 100755 --- a/scripts/filter_counts.R +++ b/scripts/filter_counts.R @@ -48,7 +48,6 @@ CB_meta= data.table::fread(args$CB_meta, header= TRUE, sep= ',') # Convert input strings into vectors ---- id_cols= unlist(strsplit(args$id_cols, ",")) -# What is this check doing? -YL ---- # make sure LUA codes in cell line meta are unique cell_line_meta %<>% dplyr::group_by(LUA) %>% diff --git a/scripts/filteredCounts_QC.R b/scripts/filteredCounts_QC.R index 1287745b..52a9efa0 100755 --- a/scripts/filteredCounts_QC.R +++ b/scripts/filteredCounts_QC.R @@ -28,6 +28,8 @@ parser$add_argument('--annotated_counts', default= 'annotated_counts.csv', help= parser$add_argument('--normalized_counts', default= 'normalized_counts.csv', help= 'Path to normalized_counts.csv') parser$add_argument('--lfc', default= 'l2fc.csv', help= 'Path to l2fc.csv') parser$add_argument('-s', '--sample_meta', default= 'sample_meta.csv', help= 'Path to sample_meta.csv') +parser$add_argument("--barcode_col", default= "forward_read_cl_barcode", + help= "Name of the column in uncollapsed_raw_counts that contains the barcode sequences.") parser$add_argument('--id_cols', default= 'pcr_plate,pcr_well', help= 'Sample meta columns used to identify every PCR well') parser$add_argument('--cell_line_cols', default= 'DepMap_ID', help= 'Sushi columns used to identify a read') parser$add_argument('--sig_cols', default= 'cell_set,treatment,dose,dose_unit,day', @@ -73,6 +75,7 @@ QC_images(raw_counts_uncollapsed_path= args$raw_counts_uncollapsed, l2fc= l2fc, sample_meta= sample_meta, id_cols= id_cols, + barcode_col= args$barcode_col, cell_line_cols= cell_line_cols, sig_cols= sig_cols, control_type= args$control_type, diff --git a/scripts/filteredCounts_QC.sh b/scripts/filteredCounts_QC.sh index bc826a64..6a013270 100644 --- a/scripts/filteredCounts_QC.sh +++ b/scripts/filteredCounts_QC.sh @@ -128,6 +128,7 @@ args=( --unknown_barcode_counts "$UNKNOWN_BARCODE_COUNTS" --lfc "$LFC" --id_cols "$ID_COLS" +--barcode_col "$BARCODE_COL" --reverse_index2 "$REVERSE_INDEX2" ) diff --git a/scripts/launch_job.sh b/scripts/launch_job.sh index 1ed0d9d9..48ba689f 100644 --- a/scripts/launch_job.sh +++ b/scripts/launch_job.sh @@ -33,7 +33,7 @@ PARAMS=( # column name paramters SEQUENCING_INDEX_COLS ID_COLS CELL_LINE_COLS SIG_COLS CONTROL_COLS # additional parameters - BARCODE_COL PSEUDOCOUNT COUNT_COL_NAME CTL_TYPES COUNT_THRESHOLD API_URL + BARCODE_COL 
LOW_ABUNDANCE_THRESHOLD PSEUDOCOUNT COUNT_COL_NAME CTL_TYPES COUNT_THRESHOLD API_URL ) # Load parameters diff --git a/scripts/make_config_file.groovy b/scripts/make_config_file.groovy index 529d7861..59b6dfa7 100644 --- a/scripts/make_config_file.groovy +++ b/scripts/make_config_file.groovy @@ -56,6 +56,7 @@ pipeline { // Additional parameters string(name: 'BARCODE_COL', defaultValue: 'forward_read_cl_barcode', description: 'In COLLATE_FASTQ_READS, the name of the column containing the read') + string(name: 'LOW_ABUNDANCE_THRESHOLD', defaultValue: '20', description: 'In COLLATE_FASTQ_READS, threshold for unknown barcodes') string(name: 'PSEUDOCOUNT', defaultValue: '20', description: 'In CBNORMALIZE, the pesudocount value for log transformations.') string(name: 'COUNT_COL_NAME', defaultValue: 'normalized_n', description: 'In COMPUTE_LFC, the name of the numeric column to use for calculations') string(name: 'CTL_TYPES', defaultValue: 'negcon', description: 'In COMPUTE_LFC, the value in trt_type that indicates the negative controls') @@ -127,7 +128,7 @@ pipeline { 'SEQUENCING_INDEX_COLS', 'ID_COLS', 'CELL_LINE_COLS', 'SIG_COLS', 'CONTROL_COLS', // additional parameters - 'BARCODE_COL', 'PSEUDOCOUNT', 'COUNT_COL_NAME', 'CTL_TYPES', 'COUNT_THRESHOLD', 'API_URL' + 'BARCODE_COL', 'LOW_ABUNDANCE_THRESHOLD', 'PSEUDOCOUNT', 'COUNT_COL_NAME', 'CTL_TYPES', 'COUNT_THRESHOLD', 'API_URL' ] def config = [:] From ae8f4e54139b894df2c601cf0262d8e74a1b781b Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 9 Oct 2024 11:53:49 -0400 Subject: [PATCH 104/127] for troubleshooting --- scripts/collate_fastq_reads.R | 7 ++++++- scripts/src/collate_fastq_reads.R | 3 +++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/scripts/collate_fastq_reads.R b/scripts/collate_fastq_reads.R index 5dce7d7e..01ecc129 100644 --- a/scripts/collate_fastq_reads.R +++ b/scripts/collate_fastq_reads.R @@ -83,7 +83,12 @@ unknown_barcode_counts= unknown_barcode_counts[, .(n= sum(n)), by= c(id_cols, ar # Validation: Basic file size check ---- if(nrow(prism_barcode_counts) == 0) { stop('ERROR: Empty file generated. 
No rows in prism_barcode_counts output.') -} +} + +# Trouble shooting ---- +nrow(prism_barcode_counts) +nrow(unknown_barcode_counts) +# # Write out files ---- out_file= paste(args$out, 'prism_barcode_counts.csv', sep='/') diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index aeb3aff6..c589ec1a 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -189,6 +189,9 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, print('Warning: Low index purity!') } else {} + print(nrow(summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,])) + print(nrow(summed_reads[!summed_reads[[barcode_col]] %chin% known_barcodes,])) + # Return list of two dfs with known or unknown read counts ---- print('Completing collate_fastq_reads.') return(list(prism_barcode_counts= summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,], From a4ac5b2e18b8261bd2819ef45c7b31d332e33799 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 9 Oct 2024 11:59:22 -0400 Subject: [PATCH 105/127] Jenkins troubleshooting --- scripts/collate_fastq_reads.R | 5 ----- scripts/src/collate_fastq_reads.R | 7 ++++--- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/scripts/collate_fastq_reads.R b/scripts/collate_fastq_reads.R index 01ecc129..782f3cbd 100644 --- a/scripts/collate_fastq_reads.R +++ b/scripts/collate_fastq_reads.R @@ -85,11 +85,6 @@ if(nrow(prism_barcode_counts) == 0) { stop('ERROR: Empty file generated. No rows in prism_barcode_counts output.') } -# Trouble shooting ---- -nrow(prism_barcode_counts) -nrow(unknown_barcode_counts) -# - # Write out files ---- out_file= paste(args$out, 'prism_barcode_counts.csv', sep='/') print(paste("Writing prism_barcode_counts.csv to ", out_file)) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index c589ec1a..36ea45ee 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -171,7 +171,7 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # Function := performs the mutate in place without copying the dataframe. # Functions fifelse and %chin% are just faster data.table versions of ifelse and %in%. summed_reads[, c(barcode_col) := data.table::fifelse( - !get(barcode_col) %chin% unique(known_barcodes) & n < low_abundance_threshold, + !(get(barcode_col) %chin% unique(known_barcodes)) & (n < low_abundance_threshold), 'unknown_low_abundance_barcode', get(barcode_col))] # Use data.table to group by id_cols and barcode_col and sum up reads across flowcells. 
@@ -189,11 +189,12 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, print('Warning: Low index purity!') } else {} + # troubleshooting print(nrow(summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,])) - print(nrow(summed_reads[!summed_reads[[barcode_col]] %chin% known_barcodes,])) + print(nrow(summed_reads[!(summed_reads[[barcode_col]] %chin% known_barcodes),])) # Return list of two dfs with known or unknown read counts ---- print('Completing collate_fastq_reads.') return(list(prism_barcode_counts= summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,], - unknown_barcode_counts= summed_reads[!summed_reads[[barcode_col]] %chin% known_barcodes,])) + unknown_barcode_counts= summed_reads[!(summed_reads[[barcode_col]] %chin% known_barcodes),])) } From f062978b992741d2b4e3b83bb7a47d479877e56a Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 9 Oct 2024 12:07:30 -0400 Subject: [PATCH 106/127] jenkins troubleshoot --- scripts/src/collate_fastq_reads.R | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index 36ea45ee..b2be11bd 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -171,7 +171,7 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # Function := performs the mutate in place without copying the dataframe. # Functions fifelse and %chin% are just faster data.table versions of ifelse and %in%. summed_reads[, c(barcode_col) := data.table::fifelse( - !(get(barcode_col) %chin% unique(known_barcodes)) & (n < low_abundance_threshold), + !get(barcode_col) %chin% unique(known_barcodes) & n < low_abundance_threshold, 'unknown_low_abundance_barcode', get(barcode_col))] # Use data.table to group by id_cols and barcode_col and sum up reads across flowcells. 
@@ -189,12 +189,13 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, print('Warning: Low index purity!') } else {} - # troubleshooting + # troubleshooting ---- + head(summed_reads) print(nrow(summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,])) print(nrow(summed_reads[!(summed_reads[[barcode_col]] %chin% known_barcodes),])) # Return list of two dfs with known or unknown read counts ---- print('Completing collate_fastq_reads.') return(list(prism_barcode_counts= summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,], - unknown_barcode_counts= summed_reads[!(summed_reads[[barcode_col]] %chin% known_barcodes),])) + unknown_barcode_counts= summed_reads[!summed_reads[[barcode_col]] %chin% known_barcodes,])) } From 5923da0f3bf091b887f34b5995abf71a2c177073 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 9 Oct 2024 12:08:12 -0400 Subject: [PATCH 107/127] more jenkins troubleshoot --- scripts/src/collate_fastq_reads.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index b2be11bd..61b5b95d 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -190,7 +190,7 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, } else {} # troubleshooting ---- - head(summed_reads) + print(head(summed_reads)) print(nrow(summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,])) print(nrow(summed_reads[!(summed_reads[[barcode_col]] %chin% known_barcodes),])) From 3480402dbbe2774aab0dc8f6ea6ec1bacfe6cfad Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 9 Oct 2024 12:12:20 -0400 Subject: [PATCH 108/127] jenkin troubleshoot --- scripts/src/collate_fastq_reads.R | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index 61b5b95d..6fc5df95 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -170,9 +170,16 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # then the barcode is replaced with the string "unknown_low_abundance_barcode". # Function := performs the mutate in place without copying the dataframe. # Functions fifelse and %chin% are just faster data.table versions of ifelse and %in%. - summed_reads[, c(barcode_col) := data.table::fifelse( - !get(barcode_col) %chin% unique(known_barcodes) & n < low_abundance_threshold, - 'unknown_low_abundance_barcode', get(barcode_col))] + + # works? + summed_reads[, c(barcode_col) := data.table::fifelse(get(barcode_col) %chin% unique(known_barcodes) | + n >= low_abundance_threshold, + get(barcode_col), 'unknown_low_abundance_barcode')] + + # not working + # summed_reads[, c(barcode_col) := data.table::fifelse( + # !get(barcode_col) %chin% unique(known_barcodes) & n < low_abundance_threshold, + # 'unknown_low_abundance_barcode', get(barcode_col))] # Use data.table to group by id_cols and barcode_col and sum up reads across flowcells. 
summed_reads= summed_reads[, .(n= sum(n)), by= c(id_cols, barcode_col)] From f8972d7db84da04381a279f7169f093006cb57a7 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 9 Oct 2024 12:13:23 -0400 Subject: [PATCH 109/127] jenkins troubleshoot --- scripts/src/collate_fastq_reads.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index 6fc5df95..d3b53e72 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -172,8 +172,7 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # Functions fifelse and %chin% are just faster data.table versions of ifelse and %in%. # works? - summed_reads[, c(barcode_col) := data.table::fifelse(get(barcode_col) %chin% unique(known_barcodes) | - n >= low_abundance_threshold, + summed_reads[, c(barcode_col) := data.table::fifelse(get(barcode_col) %chin% unique(known_barcodes), get(barcode_col), 'unknown_low_abundance_barcode')] # not working From bd20c4f9103f387a71cc878555b5717ce49a5324 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 9 Oct 2024 13:28:00 -0400 Subject: [PATCH 110/127] jenkins troubleshoot --- scripts/src/collate_fastq_reads.R | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index d3b53e72..f59e2ed8 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -172,12 +172,15 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # Functions fifelse and %chin% are just faster data.table versions of ifelse and %in%. # works? - summed_reads[, c(barcode_col) := data.table::fifelse(get(barcode_col) %chin% unique(known_barcodes), + # summed_reads[, c(barcode_col) := data.table::fifelse(get(barcode_col) %chin% unique(known_barcodes), + # get(barcode_col), 'unknown_low_abundance_barcode')] + summed_reads[, c(barcode_col) := data.table::fifelse(get(barcode_col) %chin% unique(known_barcodes) | + n > low_abundance_threshold, get(barcode_col), 'unknown_low_abundance_barcode')] # not working # summed_reads[, c(barcode_col) := data.table::fifelse( - # !get(barcode_col) %chin% unique(known_barcodes) & n < low_abundance_threshold, + # !get(barcode_col) %chin% unique(known_barcodes) & n < low_abundance_threshold, # 'unknown_low_abundance_barcode', get(barcode_col))] # Use data.table to group by id_cols and barcode_col and sum up reads across flowcells. From 297ab1b3ceaa3fc054c4c8c16d9ca9630dd687de Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 9 Oct 2024 13:51:00 -0400 Subject: [PATCH 111/127] Jenkins test --- scripts/src/collate_fastq_reads.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index f59e2ed8..aaa7bc52 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -174,8 +174,8 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # works? 
# summed_reads[, c(barcode_col) := data.table::fifelse(get(barcode_col) %chin% unique(known_barcodes), # get(barcode_col), 'unknown_low_abundance_barcode')] - summed_reads[, c(barcode_col) := data.table::fifelse(get(barcode_col) %chin% unique(known_barcodes) | - n > low_abundance_threshold, + summed_reads[, c(barcode_col) := ifelse(get(barcode_col) %chin% unique(known_barcodes) | + n >= low_abundance_threshold, get(barcode_col), 'unknown_low_abundance_barcode')] # not working From 3e1651c69b1b21a4ad905b56602eb244242d300b Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 9 Oct 2024 14:20:08 -0400 Subject: [PATCH 112/127] Potential jenkins fix --- scripts/src/collate_fastq_reads.R | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index aaa7bc52..0cebb234 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -172,16 +172,20 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # Functions fifelse and %chin% are just faster data.table versions of ifelse and %in%. # works? - # summed_reads[, c(barcode_col) := data.table::fifelse(get(barcode_col) %chin% unique(known_barcodes), - # get(barcode_col), 'unknown_low_abundance_barcode')] - summed_reads[, c(barcode_col) := ifelse(get(barcode_col) %chin% unique(known_barcodes) | - n >= low_abundance_threshold, - get(barcode_col), 'unknown_low_abundance_barcode')] - - # not working - # summed_reads[, c(barcode_col) := data.table::fifelse( - # !get(barcode_col) %chin% unique(known_barcodes) & n < low_abundance_threshold, - # 'unknown_low_abundance_barcode', get(barcode_col))] + # summed_reads[, c(barcode_col) := ifelse(get(barcode_col) %chin% unique(known_barcodes), + # get(barcode_col), 'unknown_low_abundance_barcode')] + + # two columns? + summed_reads[, temp := ifelse(get(barcode_col) %chin% unique(known_barcodes) | n >= low_abundance_threshold, + TRUE, FALSE)] + summed_reads[, c(barcode_col) := data.table::fifelse(temp, get(barcode_col), 'unknown_low_abundance_barcode')] + summed_reads[, temp := NULL] + + + # doesnt work + # summed_reads[, c(barcode_col) := ifelse(get(barcode_col) %chin% unique(known_barcodes) | + # n >= low_abundance_threshold, + # get(barcode_col), 'unknown_low_abundance_barcode')] # Use data.table to group by id_cols and barcode_col and sum up reads across flowcells. summed_reads= summed_reads[, .(n= sum(n)), by= c(id_cols, barcode_col)] From c7b7488f238fce2af52f89360648658d6c50b7a2 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 9 Oct 2024 14:26:27 -0400 Subject: [PATCH 113/127] Jenkins test --- scripts/src/collate_fastq_reads.R | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index 0cebb234..2edbe350 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -178,13 +178,14 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # two columns? 
summed_reads[, temp := ifelse(get(barcode_col) %chin% unique(known_barcodes) | n >= low_abundance_threshold,
                                 TRUE, FALSE)]
-  summed_reads[, c(barcode_col) := data.table::fifelse(temp, get(barcode_col), 'unknown_low_abundance_barcode')]
-  summed_reads[, temp := NULL]
-
+  #summed_reads[, c(barcode_col) := data.table::fifelse(temp, get(barcode_col), 'unknown_low_abundance_barcode')]
+  #summed_reads[, temp := NULL]
+  print(head(summed_reads))
 
-  # doesn't work
+  # This code is more efficient, but doesn't work in Jenkins; it only works locally.
+  # The problem appears after adding a second condition in the ifelse - not sure why this is happening.
   # summed_reads[, c(barcode_col) := ifelse(get(barcode_col) %chin% unique(known_barcodes) |
-  #                                  n >= low_abundance_threshold,
+  #                                  n >= low_abundance_threshold, 
   #                                  get(barcode_col), 'unknown_low_abundance_barcode')]
 
 # Use data.table to group by id_cols and barcode_col and sum up reads across flowcells.

From ee2b1e3d810f0baab7f0b7cfe0077973976cc58c Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Wed, 9 Oct 2024 14:34:58 -0400
Subject: [PATCH 114/127] Jenkins test

---
 scripts/src/collate_fastq_reads.R | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R
index 2edbe350..07d3eabf 100644
--- a/scripts/src/collate_fastq_reads.R
+++ b/scripts/src/collate_fastq_reads.R
@@ -176,7 +176,7 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta,
   #                                  get(barcode_col), 'unknown_low_abundance_barcode')]
 
   # two columns?
-  summed_reads[, temp := ifelse(get(barcode_col) %chin% unique(known_barcodes) | n >= low_abundance_threshold,
+  summed_reads[, temp := ifelse(get(barcode_col) %in% unique(known_barcodes) | n >= low_abundance_threshold,
                                 TRUE, FALSE)]
   #summed_reads[, c(barcode_col) := data.table::fifelse(temp, get(barcode_col), 'unknown_low_abundance_barcode')]
   #summed_reads[, temp := NULL]
@@ -203,11 +203,6 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta,
     print('Warning: Low index purity!')
   } else {}
 
-  # troubleshooting ----
-  print(head(summed_reads))
-  print(nrow(summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,]))
-  print(nrow(summed_reads[!(summed_reads[[barcode_col]] %chin% known_barcodes),]))
-
   # Return list of two dfs with known or unknown read counts ----
   print('Completing collate_fastq_reads.')
   return(list(prism_barcode_counts= summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,],

From 454a2356f7ad842ed8822ea1bce14991c1b70812 Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Wed, 9 Oct 2024 14:54:02 -0400
Subject: [PATCH 115/127] test jenkins

---
 scripts/src/collate_fastq_reads.R | 37 ++++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 13 deletions(-)

diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R
index 07d3eabf..456acdb3 100644
--- a/scripts/src/collate_fastq_reads.R
+++ b/scripts/src/collate_fastq_reads.R
@@ -171,25 +171,34 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta,
 
   # Function := performs the mutate in place without copying the dataframe.
   # Functions fifelse and %chin% are just faster data.table versions of ifelse and %in%.
+  # summed_reads[, c(barcode_col) := data.table::fifelse(get(barcode_col) %chin% unique(known_barcodes) |
+  #                                  n >= low_abundance_threshold,
+  #                                  get(barcode_col), 'unknown_low_abundance_barcode')]
+
+  # The code above is the initial implementation. It works locally, but not on Jenkins.
+ # The problem appears to occur when adding a second condition in the ifelse - not sure why this is happening. + # %chin% to %in% - error persists + # data.table::fifelse to base::ifelse - error persists + # Jenkins and local are working with the same version of data.table. + # works? # summed_reads[, c(barcode_col) := ifelse(get(barcode_col) %chin% unique(known_barcodes), # get(barcode_col), 'unknown_low_abundance_barcode')] - # two columns? - summed_reads[, temp := ifelse(get(barcode_col) %in% unique(known_barcodes) | n >= low_abundance_threshold, - TRUE, FALSE)] + # wasted + summed_reads= summed_reads[, .(n= sum(n)), by= c(id_cols, barcode_col)] + prism_barcode_counts= summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,] + + unknown_barcode_counts= summed_reads[!summed_reads[[barcode_col]] %chin% known_barcodes,] + unknown_barcode_counts[, c(barcode_col) := data.table::fifelse(n >= low_abundance_threshold, + get(barcode_col), 'unknown_low_abundance_barcode')] + unknown_barcode_counts= unknown_barcode_counts[, .(n= sum(n)), by= c(id_cols, barcode_col)] + #summed_reads[, c(barcode_col) := data.table::fifelse(temp, get(barcode_col), 'unknown_low_abundance_barcode')] #summed_reads[, temp := NULL] - print(head(summed_reads)) - - # This code is more efficient, but doesn't work in Jenkins only works locally - # The problem appears after adding a second condition in the ifelse - not sure why this is happening. - # summed_reads[, c(barcode_col) := ifelse(get(barcode_col) %chin% unique(known_barcodes) | - # n >= low_abundance_threshold, - # get(barcode_col), 'unknown_low_abundance_barcode')] # Use data.table to group by id_cols and barcode_col and sum up reads across flowcells. - summed_reads= summed_reads[, .(n= sum(n)), by= c(id_cols, barcode_col)] + #summed_reads= summed_reads[, .(n= sum(n)), by= c(id_cols, barcode_col)] # Calculate index purity ---- # This is only accurate if the Nori input file is small enough to fit into a chunk. @@ -205,6 +214,8 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # Return list of two dfs with known or unknown read counts ---- print('Completing collate_fastq_reads.') - return(list(prism_barcode_counts= summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,], - unknown_barcode_counts= summed_reads[!summed_reads[[barcode_col]] %chin% known_barcodes,])) + return(list(prism_barcode_counts= prism_barcode_counts, + unknown_barcode_counts= unknown_barcode_counts)) + #return(list(prism_barcode_counts= summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,], + # unknown_barcode_counts= summed_reads[!summed_reads[[barcode_col]] %chin% known_barcodes,])) } From b72e9e05cdc599b2db81da47c6fd7b27c2937b55 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 9 Oct 2024 14:59:13 -0400 Subject: [PATCH 116/127] jenkins test --- scripts/src/collate_fastq_reads.R | 33 +++++++++++++++---------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index 456acdb3..16ec2b98 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -171,9 +171,9 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # Function := performs the mutate in place without copying the dataframe. # Functions fifelse and %chin% are just faster data.table versions of ifelse and %in%. 
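+  # A hedged illustration of the `:=` + get() idiom used throughout this block
+  # (toy data; `dt` and `col` are made-up names for the example, not pipeline objects):
+  # dt= data.table::data.table(barcode= c('AAA', 'TTT'))
+  # col= 'barcode'
+  # dt[, c(col) := paste0(get(col), '_x')]   # updates the column named by `col` in place, no copy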
- # summed_reads[, c(barcode_col) := data.table::fifelse(get(barcode_col) %chin% unique(known_barcodes) | - # n >= low_abundance_threshold, - # get(barcode_col), 'unknown_low_abundance_barcode')] + summed_reads[, c(barcode_col) := data.table::fifelse(get(barcode_col) %chin% unique(known_barcodes) | + n >= 20, + get(barcode_col), 'unknown_low_abundance_barcode')] # The code above is the initial implementation. It works locally, but not on Jenkins. # The problem appears to occur when adding a second condition in the ifelse - not sure why this is happening. @@ -186,19 +186,18 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # get(barcode_col), 'unknown_low_abundance_barcode')] # wasted - summed_reads= summed_reads[, .(n= sum(n)), by= c(id_cols, barcode_col)] - prism_barcode_counts= summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,] - - unknown_barcode_counts= summed_reads[!summed_reads[[barcode_col]] %chin% known_barcodes,] - unknown_barcode_counts[, c(barcode_col) := data.table::fifelse(n >= low_abundance_threshold, - get(barcode_col), 'unknown_low_abundance_barcode')] - unknown_barcode_counts= unknown_barcode_counts[, .(n= sum(n)), by= c(id_cols, barcode_col)] + # print(low_abundance_threshold) + # summed_reads= summed_reads[, .(n= sum(n)), by= c(id_cols, barcode_col)] + # prism_barcode_counts= summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,] + # + # unknown_barcode_counts= summed_reads[!summed_reads[[barcode_col]] %chin% known_barcodes,] + # unknown_barcode_counts[, c(barcode_col) := data.table::fifelse(n >= low_abundance_threshold, + # get(barcode_col), 'unknown_low_abundance_barcode')] + # unknown_barcode_counts= unknown_barcode_counts[, .(n= sum(n)), by= c(id_cols, barcode_col)] - #summed_reads[, c(barcode_col) := data.table::fifelse(temp, get(barcode_col), 'unknown_low_abundance_barcode')] - #summed_reads[, temp := NULL] # Use data.table to group by id_cols and barcode_col and sum up reads across flowcells. - #summed_reads= summed_reads[, .(n= sum(n)), by= c(id_cols, barcode_col)] + summed_reads= summed_reads[, .(n= sum(n)), by= c(id_cols, barcode_col)] # Calculate index purity ---- # This is only accurate if the Nori input file is small enough to fit into a chunk. 
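+  # Hedged, toy illustration of the caveat above: if one in-memory chunk holds 1e6
+  # reads and 9.6e5 of them carry expected indices, the ratio computed here is
+  # 9.6e5 / 1e6 = 0.96 for that chunk alone; a file split across several chunks
+  # needs the ratio recomputed after the per-chunk results are recombined.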
@@ -214,8 +213,8 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # Return list of two dfs with known or unknown read counts ---- print('Completing collate_fastq_reads.') - return(list(prism_barcode_counts= prism_barcode_counts, - unknown_barcode_counts= unknown_barcode_counts)) - #return(list(prism_barcode_counts= summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,], - # unknown_barcode_counts= summed_reads[!summed_reads[[barcode_col]] %chin% known_barcodes,])) + # return(list(prism_barcode_counts= prism_barcode_counts, + # unknown_barcode_counts= unknown_barcode_counts)) + return(list(prism_barcode_counts= summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,], + unknown_barcode_counts= summed_reads[!summed_reads[[barcode_col]] %chin% known_barcodes,])) } From fb3ced7ff4983c6c40363c57c20b23e7a1c1d7da Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 9 Oct 2024 15:05:38 -0400 Subject: [PATCH 117/127] Found and fixed a bug --- scripts/launch_job.sh | 1 + scripts/src/collate_fastq_reads.R | 26 +------------------------- 2 files changed, 2 insertions(+), 25 deletions(-) diff --git a/scripts/launch_job.sh b/scripts/launch_job.sh index 48ba689f..a843d8a3 100644 --- a/scripts/launch_job.sh +++ b/scripts/launch_job.sh @@ -105,6 +105,7 @@ echo "Running in container:" -e COUNT_COL_NAME="$COUNT_COL_NAME" \ -e CTL_TYPES="$CTL_TYPES" \ -e COUNT_THRESHOLD="$COUNT_THRESHOLD" \ + -e LOW_ABUNDANCE_THRESHOLD="$LOW_ABUNDANCE_THRESHOLD" -e RUN_NORM="$RUN_NORM" \ -e BUILD_NAME="$BUILD_NAME" \ -e CONVERT_SUSHI="$CONVERT_SUSHI" \ diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index 16ec2b98..f3001fae 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -170,32 +170,10 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # then the barcode is replaced with the string "unknown_low_abundance_barcode". # Function := performs the mutate in place without copying the dataframe. # Functions fifelse and %chin% are just faster data.table versions of ifelse and %in%. - summed_reads[, c(barcode_col) := data.table::fifelse(get(barcode_col) %chin% unique(known_barcodes) | - n >= 20, + n >= low_abundance_threshold, get(barcode_col), 'unknown_low_abundance_barcode')] - - # The code above is the initial implementation. It works locally, but not on Jenkins. - # The problem appears to occur when adding a second condition in the ifelse - not sure why this is happening. - # %chin% to %in% - error persists - # data.table::fifelse to base::ifelse - error persists - # Jenkins and local are working with the same version of data.table. - - # works? - # summed_reads[, c(barcode_col) := ifelse(get(barcode_col) %chin% unique(known_barcodes), - # get(barcode_col), 'unknown_low_abundance_barcode')] - - # wasted - # print(low_abundance_threshold) - # summed_reads= summed_reads[, .(n= sum(n)), by= c(id_cols, barcode_col)] - # prism_barcode_counts= summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,] - # - # unknown_barcode_counts= summed_reads[!summed_reads[[barcode_col]] %chin% known_barcodes,] - # unknown_barcode_counts[, c(barcode_col) := data.table::fifelse(n >= low_abundance_threshold, - # get(barcode_col), 'unknown_low_abundance_barcode')] - # unknown_barcode_counts= unknown_barcode_counts[, .(n= sum(n)), by= c(id_cols, barcode_col)] - # Use data.table to group by id_cols and barcode_col and sum up reads across flowcells. 
summed_reads= summed_reads[, .(n= sum(n)), by= c(id_cols, barcode_col)] @@ -213,8 +191,6 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # Return list of two dfs with known or unknown read counts ---- print('Completing collate_fastq_reads.') - # return(list(prism_barcode_counts= prism_barcode_counts, - # unknown_barcode_counts= unknown_barcode_counts)) return(list(prism_barcode_counts= summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,], unknown_barcode_counts= summed_reads[!summed_reads[[barcode_col]] %chin% known_barcodes,])) } From 4352d59d204e6b063731eb8834b1ab73ef216bbb Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 9 Oct 2024 15:08:17 -0400 Subject: [PATCH 118/127] Added missing slash --- scripts/launch_job.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/launch_job.sh b/scripts/launch_job.sh index a843d8a3..9196b5c5 100644 --- a/scripts/launch_job.sh +++ b/scripts/launch_job.sh @@ -105,7 +105,7 @@ echo "Running in container:" -e COUNT_COL_NAME="$COUNT_COL_NAME" \ -e CTL_TYPES="$CTL_TYPES" \ -e COUNT_THRESHOLD="$COUNT_THRESHOLD" \ - -e LOW_ABUNDANCE_THRESHOLD="$LOW_ABUNDANCE_THRESHOLD" + -e LOW_ABUNDANCE_THRESHOLD="$LOW_ABUNDANCE_THRESHOLD" \ -e RUN_NORM="$RUN_NORM" \ -e BUILD_NAME="$BUILD_NAME" \ -e CONVERT_SUSHI="$CONVERT_SUSHI" \ From 12138256f5e1ae05efdf8fdacf1abf7901780a6a Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Wed, 9 Oct 2024 15:52:54 -0400 Subject: [PATCH 119/127] Remove run_norm parameter --- scripts/launch_job.sh | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/scripts/launch_job.sh b/scripts/launch_job.sh index 9196b5c5..ca354fd6 100644 --- a/scripts/launch_job.sh +++ b/scripts/launch_job.sh @@ -23,17 +23,19 @@ fi # List of parameters PARAMS=( - SEQ_TYPE BUILD_DIR INDEX_1 INDEX_2 BARCODE_SUFFIX REVERSE_INDEX2 RUN_NORM BUILD_NAME - CONVERT_SUSHI PULL_POOL_ID RUN_EPS_QC REMOVE_DATA DAYS COUNTS + SEQ_TYPE BUILD_DIR INDEX_1 INDEX_2 BARCODE_SUFFIX REVERSE_INDEX2 BUILD_NAME + CONVERT_SUSHI PULL_POOL_ID RUN_EPS_QC REMOVE_DATA DAYS COUNTS API_URL # metadata files SAMPLE_META CELL_SET_META CELL_LINE_META CONTROL_BARCODE_META ASSAY_POOL_META # susi files RAW_COUNTS_UNCOLLAPSED PRISM_BARCODE_COUNTS UNKNOWN_BARCODE_COUNTS ANNOTATED_COUNTS FILTERED_COUNTS NORMALIZED_COUNTS LFC COLLAPSED_LFC - # column name paramters - SEQUENCING_INDEX_COLS ID_COLS CELL_LINE_COLS SIG_COLS CONTROL_COLS - # additional parameters - BARCODE_COL LOW_ABUNDANCE_THRESHOLD PSEUDOCOUNT COUNT_COL_NAME CTL_TYPES COUNT_THRESHOLD API_URL + # collate_fastq_reads parameters + SEQUENCING_INDEX_COLS ID_COLS LOW_ABUNDANCE_THRESHOLD BARCODE_COL + # normalize parameters + PSEUDOCOUNT + # compute_l2fc parameters + CELL_LINE_COLS SIG_COLS CONTROL_COLS COUNT_COL_NAME CTL_TYPES COUNT_THRESHOLD ) # Load parameters @@ -106,7 +108,6 @@ echo "Running in container:" -e CTL_TYPES="$CTL_TYPES" \ -e COUNT_THRESHOLD="$COUNT_THRESHOLD" \ -e LOW_ABUNDANCE_THRESHOLD="$LOW_ABUNDANCE_THRESHOLD" \ - -e RUN_NORM="$RUN_NORM" \ -e BUILD_NAME="$BUILD_NAME" \ -e CONVERT_SUSHI="$CONVERT_SUSHI" \ -e PULL_POOL_ID="$PULL_POOL_ID" \ From 2a997c4e3fe10e82d82c5bf58369235675470930 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Thu, 10 Oct 2024 16:07:06 -0400 Subject: [PATCH 120/127] Updated error message --- scripts/src/QC_images.R | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R index a45b911f..8bff0b0f 100755 --- a/scripts/src/QC_images.R +++ 
b/scripts/src/QC_images.R
@@ -2,10 +2,11 @@
 #'
 #' Create the qc table with index purity and cell line purity.
 #'
-#' @param raw_counts_uncollapsed Dataframe output from nori.
-#' @param raw_counts Raw counts dataframe outputed from collate_fastq_reads.
-#' @param filtered_counts Filtered counts dataframe outputed from filter_raw_reads.
-#' @param value_col String name of the counts column present all three dataframes.
+#' @param raw_counts_uncollapsed_path Path to the nori raw_counts_uncollapsed file.
+#' @param unknown_barcode_counts Dataframe of unknown barcodes.
+#' @param prism_barcode_counts Dataframe of prism barcodes extracted from the nori output.
+#' @param filtered_counts Filtered counts dataframe created from filter_raw_reads.
+#' @param value_col String name of the counts column present in all four input dataframes.
 #' @param file_path Location to write out the output.
 #' @returns Writes out a QC_table to the file_path.
 create_qc_table= function(raw_counts_uncollapsed_path, unknown_barcode_counts,
@@ -129,7 +130,6 @@ create_total_counts_barplot= function(filtered_counts, id_cols, facet_col= NA) {
   return(total_counts_plot)
 }
 
-
 #' Cell line recover barplot
 #'
 #' Creates barplots of the cell lines recovered. The parameter "plot_type" can be used to plot the percentage or
@@ -141,7 +141,7 @@ create_total_counts_barplot= function(filtered_counts, id_cols, facet_col= NA) {
 #' @param id_cols Vector of column names that identify each sample.
 #' @param facet_col String name of the column in filtered_counts to facet the plot.
 #' @param value_col String name of the column in filtered_counts that contains the counts.
-#' @param counts_threshold Threshold used to determine low counts.
+#' @param count_threshold Threshold used to determine low counts.
 #' @param plot_type String of either "percent" or "count" to adjust the y axis to be either the percentage or the
 #'   total number of cell lines.
 #' @param include_ctrl_bcs Boolean. Set to TRUE if control barcodes are to be counted.
@@ -449,14 +449,14 @@ create_replicate_scatterplots= function(input_df, cell_line_cols, replicate_grou
 #'
 #' Takes in various pipeline outputs and generates 11 QC files.
 #'
-#' @param raw_counts_uncollapsed Dataframe output from nori. This is used to generate purity metrics and
-#'   the index summaries.
-#' @param raw_counts Raw counts dataframe from the collate_fastq_reads modules. This is used to generate puritu metrics.
+#' @param raw_counts_uncollapsed_path Path to the raw_counts_uncollapsed file.
+#' @param prism_barcode_counts Dataframe of prism barcodes identified in the run.
+#' @param unknown_barcode_counts Dataframe of unknown barcodes.
 #' @param annotated_counts Annotated counts dataframe from the filter_raw_reads module.
-#' @param filtered_counts Filtered counts dataframe from the filter_raw_reads module.
 #' @param normalized_counts Normalized counts dataframe from the normalize module. This is an optional parameter.
 #' @param l2fc L2FC dataframe from the compute_l2fc module. This is used for the bio_reps plot.
 #' @param sample_meta Dataframe of the sample metadata for the sequencing run.
+#' @param barcode_col String name of the column containing the barcode sequences.
 #' @param cell_line_cols Vector of sample meta column names used to describe a cell line or barcode.
 #' @param id_cols Vector of sample meta column names used to identify each PCR well.
 #'   This defaults to "pcr_plate", "pcr_well".
@@ -605,8 +605,8 @@ QC_images= function(raw_counts_uncollapsed_path,
                   row.names= FALSE, quote= FALSE)
   }, error= function(e) {
     print(e)
-    print('Encountered an error when creating the total counts barplot. Skipping this output ...')
-    return('Totalc ounts image')
+    print('Encountered an error when creating the summary of unknown barcode reads. Skipping this output ...')
+    return('unknown barcode reads')
   })
 
   # Collect returned string if an error occurred

From 42e7a93c0b4753e0b07175550002bf0d3b2c4c35 Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Thu, 10 Oct 2024 16:08:37 -0400
Subject: [PATCH 121/127] Fixed flag

---
 scripts/src/collapse_bio_reps.R | 3 ++-
 scripts/src/compute_l2fc.R | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/scripts/src/collapse_bio_reps.R b/scripts/src/collapse_bio_reps.R
index 5c09eecb..c93537e6 100755
--- a/scripts/src/collapse_bio_reps.R
+++ b/scripts/src/collapse_bio_reps.R
@@ -41,7 +41,8 @@ collapse_bio_reps= function(l2fc, sig_cols, cell_line_cols= c('project_code', 'D
   }
 
   # Median collapsing bio replicates ----
-  collapsed_counts= l2fc %>% tidyr::unite(col= 'sig_id', all_of(sig_cols), sep= ':', na.rm= FALSE, remove= FALSE) %>%
+  collapsed_counts= l2fc %>% dplyr::filter(is.na(counts_flag)) %>%
+    tidyr::unite(col= 'sig_id', all_of(sig_cols), sep= ':', na.rm= FALSE, remove= FALSE) %>%
     dplyr::group_by(pick(all_of(c(cell_line_cols, 'sig_id', sig_cols)))) %>%
     dplyr::summarise(trt_median_n= median(mean_n), trt_median_normalized_n= median(mean_normalized_n),
                      trt_mad_sqrtN= mad(log2(mean_normalized_n)) / sqrt(dplyr::n()),
diff --git a/scripts/src/compute_l2fc.R b/scripts/src/compute_l2fc.R
index 3007253f..5db25f2d 100755
--- a/scripts/src/compute_l2fc.R
+++ b/scripts/src/compute_l2fc.R
@@ -77,7 +77,8 @@ compute_l2fc= function(normalized_counts,
   # Join neg_cons and compute l2fc ----
   l2fc= collapsed_tech_rep %>% dplyr::filter(!trt_type %in% c(control_type, 'day_0')) %>%
     dplyr::inner_join(controls, by= c(cell_line_cols, ctrl_cols), relationship='many-to-one') %>%
-    dplyr::mutate(l2fc= log2(mean_normalized_n/control_median_normalized_n))
+    dplyr::mutate(l2fc= log2(mean_normalized_n/control_median_normalized_n),
+                  counts_flag= ifelse(control_median_n < count_threshold, paste0('negcon<', count_threshold), NA))
 
   return(l2fc)
 }

From 19bb90f07d07178660a5b95e798f3512d4412dbb Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Thu, 10 Oct 2024 16:08:52 -0400
Subject: [PATCH 122/127] Updated comments

---
 scripts/collate_fastq_reads.R | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/scripts/collate_fastq_reads.R b/scripts/collate_fastq_reads.R
index 782f3cbd..2ccfbaf5 100644
--- a/scripts/collate_fastq_reads.R
+++ b/scripts/collate_fastq_reads.R
@@ -56,8 +56,8 @@ if(!validate_columns_exist(id_cols, sample_meta)) {
 }
 
 # Run collate_fastq_reads on chunks of raw_counts_uncollapsed.csv ----
-# raw_counts_uncollapsed can be too large to read into memory,
-# so collate_fastq_reads is performed on chunks of the large file.
+# raw_counts_uncollapsed could be too large to read into memory,
+# so collate_fastq_reads is performed on chunks of the raw_counts_uncollapsed file.
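+# A hedged, toy sketch of how the per-chunk results recombine below (`sample_id`
+# and `bc` are illustrative column names, not the pipeline's real ones):
+# chunk1= data.table::data.table(sample_id= 'A', bc= 'ACGT', n= 10)
+# chunk2= data.table::data.table(sample_id= 'A', bc= 'ACGT', n= 5)
+# combined= data.table::rbindlist(list(chunk1, chunk2))
+# combined[, .(n= sum(n)), by= c('sample_id', 'bc')]   # one row: 'A', 'ACGT', n= 15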
chunked_results= process_in_chunks(large_file_path= args$raw_counts_uncollapsed,
                                   chunk_size= 10^6,
                                   action= collate_fastq_reads,
@@ -70,14 +70,13 @@ chunked_results= process_in_chunks(large_file_path= args$raw_counts_uncollapsed,
                                    barcode_col= args$barcode_col,
                                    low_abundance_threshold= as.numeric(args$low_abundance_threshold))
 
-# From each chunk, extract prism_barcode_counts and bind the rows together into one dataframe.
+# From each chunk, extract prism_barcode_counts or unknown_barcode_counts and bind those rows together.
+# Then use data.table to aggregate and sum up reads across the chunks.
+# data.table functions are faster and less memory intensive.
 prism_barcode_counts= data.table::rbindlist(lapply(chunked_results, function(x) x$prism_barcode_counts))
-# Use data.table to group_by id_cols and barcode_col to sum up reads across all chunks.
 prism_barcode_counts= prism_barcode_counts[, .(n= sum(n)), by= c(id_cols, args$barcode_col)]
 
-# From each chunk, extract unknown_barcode_counts and bind the rows together into one dataframe.
 unknown_barcode_counts= data.table::rbindlist(lapply(chunked_results, function(x) x$unknown_barcode_counts))
-# Use data.table to group_by id_cols and barcode_col to sum up reads across all chunks.
 unknown_barcode_counts= unknown_barcode_counts[, .(n= sum(n)), by= c(id_cols, args$barcode_col)]
 
 # Validation: Basic file size check ----

From ad2b61f3115766d14243fc2344a4dd00c06a5e4f Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Thu, 10 Oct 2024 16:09:11 -0400
Subject: [PATCH 123/127] Reordered params and added comments

---
 scripts/make_config_file.groovy | 56 ++++++++++++++++++---------------
 1 file changed, 30 insertions(+), 26 deletions(-)

diff --git a/scripts/make_config_file.groovy b/scripts/make_config_file.groovy
index 59b6dfa7..49ac31a6 100644
--- a/scripts/make_config_file.groovy
+++ b/scripts/make_config_file.groovy
@@ -8,16 +8,15 @@ pipeline {
   parameters {
     booleanParam(name: 'TRIGGER_BUILD', defaultValue: true, description: 'Check this to trigger the build. If unchecked, the build will not be triggered and only the config.json will be generated.')
     booleanParam(name: 'CREATE_CELLDB_METADATA', defaultValue: true, description: 'Check this to trigger the create_celldb_metadata job.')
+    booleanParam(name: 'PULL_POOL_ID', defaultValue: false, description: 'Flag indicating whether to pull pool IDs from CellDB - only applicable to cell sets (i.e. EXT.PR500.CS01.1.A, EXT.PR500.CS01.1.B, etc).')
     booleanParam(name: 'COLLATE_FASTQ_READS', defaultValue: true, description: 'Check this to trigger the collate_fastq_reads job.')
     booleanParam(name: 'FILTER_COUNTS', defaultValue: true, description: 'Check this to trigger the filter_counts job.')
+    booleanParam(name: 'REMOVE_DATA', defaultValue: false, description: 'Select if there is experimental data that needs to be removed before normalization. 
TODO: expand on this.')
     booleanParam(name: 'CBNORMALIZE', defaultValue: true, description: 'Check this to trigger the CBnormalize job.')
     booleanParam(name: 'COMPUTE_LFC', defaultValue: true, description: 'Check this to trigger the compute_l2fc job.')
     booleanParam(name: 'COLLAPSE', defaultValue: true, description: 'Check this to trigger the collapse job.')
     booleanParam(name: 'QC_IMAGES', defaultValue: true, description: 'Check this to trigger the QC job.')
-    booleanParam(name: 'JOIN_METADATA', defaultValue: true, description: 'Check this to trigger the join_metadata job.')
-    booleanParam(name: 'REMOVE_DATA', defaultValue: false, description: 'Select if there is experimental data that needs to be removed before normalization. TODO: expand on this.')
-    booleanParam(name: 'RUN_NORM', defaultValue: true, description: 'Run normalization module on data.')
-    booleanParam(name: 'PULL_POOL_ID', defaultValue: false, description: 'Flag indicating whether to pull pool IDs from CellDB - only applicable to cell sets (i.e. EXT.PR500.CS01.1.A, EXT.PR500.CS01.1.B, etc).')
+    booleanParam(name: 'JOIN_METADATA', defaultValue: true, description: 'Check this to trigger the join_metadata job. This should be checked if you are planning on running CONVERT_SUSHI.')
     booleanParam(name: 'CONVERT_SUSHI', defaultValue: false, description: 'Convert output column headers to format for MTS pipeline and upload to s3.')
     booleanParam(name: 'RUN_EPS_QC', defaultValue: false, description: 'Run EPS QC')
     string(name: 'BUILD_DIR', defaultValue: '/cmap/obelix/pod/prismSeq/', description: 'Output path to deposit build. Format should be /directory/PROJECT_CODE/BUILD_NAME')
@@ -25,27 +24,36 @@ pipeline {
     string(name: 'SCREEN', defaultValue: '', description: 'Screen name from COMET, necessary if using COMET for sample metadata.')
     string(name: 'SEQ_TYPE', defaultValue: 'DRAGEN', description: 'Choose DRAGEN, MiSeq, HiSeq, or NovaSeq. MiSeq and HiSeq/NovaSeq return files named differently. This setting sets the INDEX_1, INDEX_2, and BARCODE_SUFFIX parameters in fastq2readcount. Select DRAGEN if fastq files are from the DRAGEN pipeline from GP. Choosing NovaSeq reverses index 2.')
     string(name: 'DAYS', defaultValue: '', description: 'If running the sushi_to_mts module, provide any days/timepoints (separated by commas) that should be dropped from output data. No quotes needed (ie, 2,8).')
+
+    // pipeline version
     string(name: 'GIT_BRANCH', defaultValue: 'main', description: 'Pipeline branch to use')
     booleanParam(name: 'USE_LATEST', defaultValue: true, description: 'Check this to use the most up to date version from the specified branch. 
If not checked, will use the specified commit.')
     string(name: 'COMMIT_ID', defaultValue: '', description: 'Specific commit ID to use (leave empty if using the latest commit in the branch or if already specified in the config file.)')
 
     // Most common parameters
-    // Column names parameters
-    string(name: 'SEQUENCING_INDEX_COLS', defaultValue: 'flowcell_names,index_1,index_2', description: 'Sequencing index columns used in COLLATE_FASTQ_READS')
-    string(name: 'ID_COLS', defaultValue: 'pcr_plate,pcr_well', description: 'Columns to concat to create unique ID for each sample-replicate')
+    string(name: 'SEQUENCING_INDEX_COLS', defaultValue: 'flowcell_names,index_1,index_2', description: 'Used in COLLATE_FASTQ_READS, this is a comma-separated list of sequencing columns in the sample meta that are needed to identify every PCR well in the run.')
+    string(name: 'ID_COLS', defaultValue: 'pcr_plate,pcr_well', description: 'Used in COLLATE_FASTQ_READS, columns to concat to create a unique ID for each sample-replicate')
     string(name: 'CELL_LINE_COLS', defaultValue: 'DepMap_ID', description: 'Columns in intermediate files that describe a read or cell line')
-    string(name: 'SIG_COLS', defaultValue: 'cell_set,treatment,dose,dose_unit,day', description: 'Signature columns')
-    string(name: 'CONTROL_COLS', defaultValue: 'cell_set,day', description: 'Set of columns that define individual controls in COMPUTE_LFC')
+    string(name: 'SIG_COLS', defaultValue: 'cell_set,treatment,dose,dose_unit,day', description: 'Signature columns; these describe unique treatment conditions and generally should not include replicate information.')
+    string(name: 'CONTROL_COLS', defaultValue: 'cell_set,day', description: 'Set of columns that define individual negative control conditions.')
 
-    // Metadata files used by sushi
+    // Sushi input files
+    string(name: 'RAW_COUNTS_UNCOLLAPSED', defaultValue: 'raw_counts_uncollapsed.csv', description: 'Filename in BUILD_DIR containing nori output')
     string(name: 'SAMPLE_META', defaultValue: 'sample_meta.csv', description: 'File name of sample metadata within the BUILD_DIR directory.')
     string(name: 'CELL_SET_META', defaultValue: 'cell_set_meta.csv', description: 'Cell Set Metadata. 
Static cell_line_meta location: /data/vdb/prismSeq/cell_set_meta.csv')
     string(name: 'CELL_LINE_META', defaultValue: 'cell_line_meta.csv', description: 'File in BUILD_DIR containing cell line metadata')
     string(name: 'CONTROL_BARCODE_META', defaultValue: 'CB_meta.csv', description: 'Metadata for control barcodes.')
     string(name: 'ASSAY_POOL_META', defaultValue: 'assay_pool_meta.txt', description: 'File in BUILD_DIR containing assay pool metadata')
 
-    // Files consumed and created by sushi
-    string(name: 'RAW_COUNTS_UNCOLLAPSED', defaultValue: 'raw_counts_uncollapsed.csv', description: 'Filename in BUILD_DIR containing nori output')
+    // Additional parameters ordered by when they first appear
+    string(name: 'BARCODE_COL', defaultValue: 'forward_read_cl_barcode', description: 'Used in COLLATE_FASTQ_READS, the name of the column containing the read')
+    string(name: 'LOW_ABUNDANCE_THRESHOLD', defaultValue: '20', description: 'Used in COLLATE_FASTQ_READS, threshold for unknown barcodes')
+    string(name: 'PSEUDOCOUNT', defaultValue: '20', description: 'Used in CBNORMALIZE, the pseudocount value for log transformations.')
+    string(name: 'COUNT_COL_NAME', defaultValue: 'normalized_n', description: 'Used in COMPUTE_LFC, the name of the numeric column to use for calculations')
+    string(name: 'CTL_TYPES', defaultValue: 'negcon', description: 'Used in COMPUTE_LFC, the value in trt_type that indicates the negative controls')
+    string(name: 'COUNT_THRESHOLD', defaultValue: '40', description: 'Used in COMPUTE_LFC, the count threshold for the collapsed negative controls. Cell lines in the negative controls below this threshold will be dropped from log2 fold change calculations.')
+
+    // Files created by sushi
     string(name: 'PRISM_BARCODE_COUNTS', defaultValue: 'prism_barcode_counts.csv', description: 'Filename in BUILD_DIR containing PRISM barcode counts')
     string(name: 'UNKNOWN_BARCODE_COUNTS', defaultValue: 'unknown_barcode_counts.csv', description: 'Filename in BUILD_DIR containing unknown barcode counts')
     string(name: 'ANNOTATED_COUNTS', defaultValue: 'annotated_counts.csv', description: 'Filename in BUILD_DIR containing annotated counts')
@@ -53,14 +61,7 @@ pipeline {
     string(name: 'NORMALIZED_COUNTS', defaultValue: 'normalized_counts.csv', description: 'Filename in BUILD_DIR containing normalized counts')
     string(name: 'LFC', defaultValue: 'l2fc.csv', description: 'Filename containing log2 fold change values')
     string(name: 'COLLAPSED_LFC', defaultValue: 'collapsed_l2fc.csv', description: 'Filename in BUILD_DIR containing replicate collapsed l2fc values')
-
-    // Additional parameters
-    string(name: 'BARCODE_COL', defaultValue: 'forward_read_cl_barcode', description: 'In COLLATE_FASTQ_READS, the name of the column containing the read')
-    string(name: 'LOW_ABUNDANCE_THRESHOLD', defaultValue: '20', description: 'In COLLATE_FASTQ_READS, threshold for unknown barcodes')
-    string(name: 'PSEUDOCOUNT', defaultValue: '20', description: 'In CBNORMALIZE, the pesudocount value for log transformations.')
-    string(name: 'COUNT_COL_NAME', defaultValue: 'normalized_n', description: 'In COMPUTE_LFC, the name of the numeric column to use for calculations')
-    string(name: 'CTL_TYPES', defaultValue: 'negcon', description: 'In COMPUTE_LFC, the value in trt_type that indicates the negative controls')
-    string(name: 'COUNT_THRESHOLD', defaultValue: '40', description: 'Drops cell lines below this threshold in the negative controls')
+
     // Other
     string(name: 'API_URL', defaultValue: 'https://api.clue.io/api/', description: 'API URL') 
}
 
@@ -113,8 +114,8 @@ pipeline {
       steps {
         script {
           def paramList = [
-            'SEQ_TYPE', 'API_URL', 'BUILD_DIR', 'INDEX_1', 'INDEX_2', 'BARCODE_SUFFIX', 'REVERSE_INDEX2',
-            'RUN_NORM', 'BUILD_NAME', 'CONVERT_SUSHI', 'PULL_POOL_ID', 'RUN_EPS_QC', 'REMOVE_DATA', 'DAYS',
+            'SEQ_TYPE', 'API_URL', 'BUILD_DIR', 'INDEX_1', 'INDEX_2', 'BARCODE_SUFFIX',
+            'BUILD_NAME', 'CONVERT_SUSHI', 'PULL_POOL_ID', 'RUN_EPS_QC', 'REMOVE_DATA', 'DAYS',
             'COUNTS', 'SCREEN',
 
             // metadata files
            'SAMPLE_META', 'CELL_SET_META', 'CELL_LINE_META', 'CONTROL_BARCODE_META', 'ASSAY_POOL_META',
 
            // sushi files
            'RAW_COUNTS_UNCOLLAPSED', 'PRISM_BARCODE_COUNTS', 'UNKNOWN_BARCODE_COUNTS', 'ANNOTATED_COUNTS',
            'FILTERED_COUNTS', 'NORMALIZED_COUNTS', 'LFC', 'COLLAPSED_LFC',
 
-            // column name parameters
-            'SEQUENCING_INDEX_COLS', 'ID_COLS', 'CELL_LINE_COLS', 'SIG_COLS', 'CONTROL_COLS',
+            // collate_fastq_reads parameters
+            'SEQUENCING_INDEX_COLS', 'ID_COLS', 'BARCODE_COL', 'LOW_ABUNDANCE_THRESHOLD', 'REVERSE_INDEX2',
+
+            // normalize parameters
+            'PSEUDOCOUNT',
 
-            // additional parameters
-            'BARCODE_COL', 'LOW_ABUNDANCE_THRESHOLD', 'PSEUDOCOUNT', 'COUNT_COL_NAME', 'CTL_TYPES', 'COUNT_THRESHOLD', 'API_URL'
+            // compute_l2fc parameters
+            'SIG_COLS', 'CONTROL_COLS', 'CELL_LINE_COLS', 'COUNT_COL_NAME', 'CTL_TYPES', 'COUNT_THRESHOLD',
           ]
 
           def config = [:]

From b3294fac0c020fbd53f561fa528a96db18aa22fb Mon Sep 17 00:00:00 2001
From: YuhJong Liu
Date: Thu, 10 Oct 2024 17:07:24 -0400
Subject: [PATCH 124/127] Added more comments

---
 scripts/src/kitchen_utensils.R | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/scripts/src/kitchen_utensils.R b/scripts/src/kitchen_utensils.R
index cfe9c0f4..d11eddd2 100644
--- a/scripts/src/kitchen_utensils.R
+++ b/scripts/src/kitchen_utensils.R
@@ -6,11 +6,14 @@
 #'
 #' This function runs some action over chunks of a large file. At the end, returns a list of all the chunks
 #'
-#' @param large_file_path description
-#' @param chunk_size description
-#' @param action A function passed to act on each chunk
+#' @param large_file_path Path to a large csv file. This file may be too large to read into R.
+#' @param chunk_size The number of rows in a chunk.
+#' @param action A function to perform over a chunk.
 #' @param ... Additional parameters to be passed into the action parameter
 process_in_chunks= function(large_file_path, chunk_size= 10^6, action, ...) {
+  # Read in the column names. These names will be passed on to each chunk.
+  # When reading a file in chunks, the column names in the first line are not always passed.
+  # Use data.table to read in just the headers with nrow= 0.
   header_col_names= data.table::fread(large_file_path, header= TRUE, sep= ',', nrow= 0) %>% colnames()
   chunk_idx= 1 # Counter to keep track of chunks in a loop
   current_chunk_size= chunk_size # Variable for loop exit condition
 
   # For each chunk, call an action
   while(current_chunk_size == chunk_size) {
+    # Read in a chunk of the large file and set the column names.
+    # nrow - the number of rows to read in
+    # skip - the number of rows to skip before starting to read in.
     current_chunk= data.table::fread(large_file_path, header= FALSE, sep= ',', col.names= header_col_names,
                                      nrow= chunk_size, skip= chunk_size * (chunk_idx - 1) + 1)
 
@@ -25,6 +31,7 @@ process_in_chunks= function(large_file_path, chunk_size= 10^6, action, ...) 
{ current_chunk_size= nrow(current_chunk) # set current chunk size to stop loop print(paste('Working on chunk', chunk_idx, 'with', current_chunk_size, 'rows.', sep= ' ')) + # Call the action over the chunk chunk_collector[[chunk_idx]]= do.call(action, list(current_chunk, ...)) chunk_idx= chunk_idx + 1 } From 84d7dc1e99eaef22f5500c88ccffcb8a63f02aba Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Thu, 10 Oct 2024 17:27:29 -0400 Subject: [PATCH 125/127] Dropped run_norm parameter --- scripts/CBnormalize.sh | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/scripts/CBnormalize.sh b/scripts/CBnormalize.sh index e23c1979..4af2fdd7 100644 --- a/scripts/CBnormalize.sh +++ b/scripts/CBnormalize.sh @@ -52,27 +52,18 @@ echo SAMPLE_META is: $SAMPLE_META echo $RUN_NORM -if [[ "$RUN_NORM" == "true" ]] -then - echo "Running normalization module" - - echo Rscript CBnormalize.R -c $FILTERED_COUNTS \ - --CB_meta $CONTROL_BARCODE_META \ - --pseudocount $PSEUDOCOUNT \ - --id_cols $ID_COLS \ - --out $BUILD_DIR +echo "Running normalization module" - Rscript CBnormalize.R -c $FILTERED_COUNTS \ - --CB_meta $CONTROL_BARCODE_META \ - --pseudocount $PSEUDOCOUNT \ - --id_cols $ID_COLS \ - --out $BUILD_DIR +echo Rscript CBnormalize.R -c $FILTERED_COUNTS \ +--CB_meta $CONTROL_BARCODE_META \ +--pseudocount $PSEUDOCOUNT \ +--id_cols $ID_COLS \ +--out $BUILD_DIR - COUNTS="normalized_counts.csv" +Rscript CBnormalize.R -c $FILTERED_COUNTS \ +--CB_meta $CONTROL_BARCODE_META \ +--pseudocount $PSEUDOCOUNT \ +--id_cols $ID_COLS \ +--out $BUILD_DIR -else - echo "Not running normalization module" - COUNTS=$FILTERED_COUNTS - COUNT_COL_NAME="n" - echo $COUNTS -fi +COUNTS="normalized_counts.csv" From 3469baee5aa65d52d558e277f0ead632ff40af7b Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Thu, 10 Oct 2024 17:28:48 -0400 Subject: [PATCH 126/127] Added back CCLE_name Added back CCLE_name in order to get the adapter script to work --- scripts/join_metadata.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/join_metadata.R b/scripts/join_metadata.R index 79d442f6..78aaa44d 100644 --- a/scripts/join_metadata.R +++ b/scripts/join_metadata.R @@ -56,7 +56,7 @@ if(file.exists(args$lfc)) { if(assay_pool_meta_exists) { l2fc_with_meta_columns= join_metadata(input_df= l2fc_with_meta_columns, metadata= input_assay_pool_meta, - key_cols= c('DepMap_ID', 'cell_set')) + key_cols= c('DepMap_ID', 'CCLE_name', 'cell_set')) } else { print('WARNING: Assay pool meta not detected and will not be joined onto l2fc.') } @@ -83,7 +83,7 @@ if(file.exists(args$collapsed_lfc)) { print('Attempting to add assay_pool_meta to collapsed l2fc.') collapsed_l2fc_with_meta_columns= join_metadata(input_df= collapsed_l2fc_with_meta_columns, metadata= input_assay_pool_meta, - key_cols= c('DepMap_ID', 'cell_set')) + key_cols= c('DepMap_ID', 'CCLE_name', 'cell_set')) } else { print('WARNING: Assay pool meta not detected and will not be joined onto collapsed l2fc.') } From 0f1bb43ba352086e107557df4c58c97491257993 Mon Sep 17 00:00:00 2001 From: YuhJong Liu Date: Fri, 11 Oct 2024 08:54:51 -0400 Subject: [PATCH 127/127] Drop wells without a cell set --- scripts/src/filter_raw_reads.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/src/filter_raw_reads.R b/scripts/src/filter_raw_reads.R index 1a3384f7..83d036c6 100755 --- a/scripts/src/filter_raw_reads.R +++ b/scripts/src/filter_raw_reads.R @@ -75,7 +75,8 @@ filter_raw_reads = function(prism_barcode_counts, # every row is a 
cell line that is expected in a PCR well.
   print('Creating template of expected reads.')
   # Join cell_set_meta and cell_line_meta. The cell_set can be a name "P939" or a list of LUAs.
-  template= sample_meta %>% dplyr::left_join(cell_set_meta, by= 'cell_set') %>%
+  template= sample_meta %>% dplyr::filter(!is.na(cell_set)) %>% # drop wells without a cell set to prevent NA rows
+    dplyr::left_join(cell_set_meta, by= 'cell_set') %>%
     dplyr::mutate(members= ifelse(is.na(members), str_split(cell_set, ';'), str_split(members, ';'))) %>%
     tidyr::unnest(cols= members) %>%
     dplyr::left_join(cell_line_meta, by= dplyr::join_by('members'=='LUA'), relationship= 'many-to-one') %>%
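
A hedged, self-contained sketch (toy tibbles; the values are made up, not from a real build) of
the NA-row behavior that [PATCH 127/127] guards against: dplyr::left_join() still emits a row
when the key column is NA, so a sample-meta well with no cell_set would survive the join with
NA metadata and flow into the template. Filtering those wells out first, as the patch does,
keeps the template clean.

    library(dplyr)

    sample_meta= tibble::tibble(pcr_well= c('A1', 'A2'), cell_set= c('CS1', NA))
    cell_set_meta= tibble::tibble(cell_set= 'CS1', members= 'LUA1;LUA2')

    # Without the filter, the A2 row joins through with members= NA and
    # produces NA rows downstream of the unnest.
    sample_meta %>% dplyr::left_join(cell_set_meta, by= 'cell_set')

    # With the filter, wells that have no cell set are dropped up front.
    sample_meta %>% dplyr::filter(!is.na(cell_set)) %>%
      dplyr::left_join(cell_set_meta, by= 'cell_set')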