Skip to content

Commit

Permalink
Merge branch 'dev' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
jdavis3141 authored Oct 17, 2024
2 parents af61ab1 + 86ab176 commit 74a4326
Show file tree
Hide file tree
Showing 26 changed files with 1,694 additions and 995 deletions.
1 change: 1 addition & 0 deletions scripts/CBnormalize.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ options(cli.unicode = FALSE)
library(argparse)
library(magrittr)
source("./src/normalize.R")
source("./src/kitchen_utensils.R")

# Argument parser ----
parser <- ArgumentParser()
Expand Down
33 changes: 12 additions & 21 deletions scripts/CBnormalize.sh
Original file line number Diff line number Diff line change
Expand Up @@ -52,27 +52,18 @@ echo SAMPLE_META is: $SAMPLE_META
echo $RUN_NORM


if [[ "$RUN_NORM" == "true" ]]
then
echo "Running normalization module"

echo Rscript CBnormalize.R -c $FILTERED_COUNTS \
--CB_meta $CONTROL_BARCODE_META \
--pseudocount $PSEUDOCOUNT \
--id_cols $ID_COLS \
--out $BUILD_DIR
echo "Running normalization module"

Rscript CBnormalize.R -c $FILTERED_COUNTS \
--CB_meta $CONTROL_BARCODE_META \
--pseudocount $PSEUDOCOUNT \
--id_cols $ID_COLS \
--out $BUILD_DIR
echo Rscript CBnormalize.R -c $FILTERED_COUNTS \
--CB_meta $CONTROL_BARCODE_META \
--pseudocount $PSEUDOCOUNT \
--id_cols $ID_COLS \
--out $BUILD_DIR

COUNTS="normalized_counts.csv"
Rscript CBnormalize.R -c $FILTERED_COUNTS \
--CB_meta $CONTROL_BARCODE_META \
--pseudocount $PSEUDOCOUNT \
--id_cols $ID_COLS \
--out $BUILD_DIR

else
echo "Not running normalization module"
COUNTS=$FILTERED_COUNTS
COUNT_COL_NAME="n"
echo $COUNTS
fi
COUNTS="normalized_counts.csv"
1 change: 1 addition & 0 deletions scripts/collapse_replicates.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ library(tidyverse)
suppressPackageStartupMessages(library(argparse))
suppressPackageStartupMessages(library(magrittr))
source("./src/collapse_bio_reps.R")
source("./src/kitchen_utensils.R")

# Argument parser ----
parser <- ArgumentParser()
Expand Down
6 changes: 4 additions & 2 deletions scripts/collapse_replicates.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,11 @@ echo LFC is: $LFC

echo Rscript collapse_replicates.R -c $LFC \
--out $BUILD_DIR \
--sig_cols $SIG_COLS
--sig_cols $SIG_COLS \
--cell_line_cols $CELL_LINE_COLS


Rscript collapse_replicates.R -c $LFC \
--out $BUILD_DIR \
--sig_cols $SIG_COLS
--sig_cols $SIG_COLS \
--cell_line_cols $CELL_LINE_COLS
104 changes: 72 additions & 32 deletions scripts/collate_fastq_reads.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,30 @@ options(cli.unicode = FALSE)
library(argparse)
library(magrittr)
library(tidyverse)
library(data.table)
source("./src/collate_fastq_reads.R")
source("./src/kitchen_utensils.R")

# Argument parser ----
parser <- ArgumentParser()
# specify desired options
parser$add_argument("-v", "--verbose", action="store_true", default=TRUE, help="Print extra output [default]")
parser$add_argument("-q", "--quietly", action="store_false", dest="verbose", help="Print little output")
parser$add_argument("-c", "--uncollapsed_raw_counts", default="raw_counts_uncollapsed.csv",
help="path to file containing uncollapsed raw counts file")
parser$add_argument("--sample_meta", default="sample_meta.csv", help = "Sample metadata")
parser$add_argument("--sequencing_index_cols", default= "index_1,index_2",
help = "Sequencing columns in the sample meta")
parser$add_argument('--raw_counts_uncollapsed', default= "raw_counts_uncollapsed.csv",
help= "path to file containing uncollapsed raw counts file")
parser$add_argument("--sample_meta", default="sample_meta.csv", help= "Sample metadata")
parser$add_argument("--cell_line_meta", default="cell_line_meta.csv", help= "Cell line metadata")
parser$add_argument("--CB_meta", default= "CB_meta.csv", help= "Control Barcode metadata")
parser$add_argument('--sequencing_index_cols', default= 'index_1,index_2',
help= 'List of sequencing columns in the sample meta.')
parser$add_argument("--id_cols", default= "pcr_plate,pcr_well",
help = "Columns that identify a unique PCR well")
parser$add_argument("--reverse_index2", type="logical", default=FALSE,
help= "Reverse complement of index 2 for NovaSeq and NextSeq")
parser$add_argument("--barcode_col", default= "forward_read_cl_barcode",
help= "Name of the column in uncollapsed_raw_counts that contains the barcode sequences.")
parser$add_argument('--low_abundance_threshold', default= 20,
help= 'For unknown barcodes, counts below this threshold will be marked as an unknown barcode.')
parser$add_argument("-o", "--out", default=getwd(), help = "Output path. Default is working directory")

# get command line options, if help option encountered print help and exit
Expand All @@ -24,31 +36,59 @@ if (args$out == "") {
args$out = args$wkdir
}

# Run collate_fastq_reads if uncollapsed file exists ----
expected_file_path <- paste(args$out, "raw_counts_uncollapsed.csv", sep='/')

if(file.exists(expected_file_path)) {
sample_meta= data.table::fread(args$sample_meta, header= T, sep= ',', data.table= F)
uncollapsed_raw_counts= data.table::fread(expected_file_path, header= T, sep= ',', data.table= F)
sequencing_index_cols= unlist(strsplit(args$sequencing_index_cols, ","))

# Validation: Check if sequencing_index_cols is composed of sample meta column names
if (!all(sequencing_index_cols %in% colnames(sample_meta))) {
stop(paste("Colnames not found in sample_meta, check metadata or --sequencing_index_cols argument:",
args$sequencing_index_cols))
}

print("Collating fastq reads")
raw_counts= collate_fastq_reads(uncollapsed_raw_counts, sample_meta, sequencing_index_cols)

# Validation: Basic file size check
if(nrow(raw_counts) == 0) {
stop('ERROR: Empty file generated. No rows in raw_counts output.')
}

rc_out_file = paste(args$out, 'raw_counts.csv', sep='/')
print(paste("Writing to file: ", rc_out_file))
write.csv(raw_counts, rc_out_file, row.names=F, quote=T)
} else {
print("Uncollapsed raw counts file not detected. Proceeding with generating filtered counts file.")
# Load the metadata tables ----
# fread is called without data.table= FALSE, so each table is a data.table.
sample_meta <- data.table::fread(args$sample_meta, header = TRUE, sep = ",")
cell_line_meta <- data.table::fread(args$cell_line_meta, header = TRUE, sep = ",")
CB_meta <- data.table::fread(args$CB_meta, header = TRUE, sep = ",")

# Split the comma-separated column arguments into character vectors ----
sequencing_index_cols <- unlist(strsplit(args$sequencing_index_cols, split = ","))
id_cols <- unlist(strsplit(args$id_cols, split = ","))

# Validation: stop unless the sequencing index columns pass the sample meta check ----
has_sequencing_index_cols <- validate_columns_exist(sequencing_index_cols, sample_meta)
if (!has_sequencing_index_cols) {
  stop('One or more sequencing_index_cols is NOT present in the sample meta.')
}

# Validation: stop unless the id columns pass the sample meta check ----
has_id_cols <- validate_columns_exist(id_cols, sample_meta)
if (!has_id_cols) {
  stop('One or more id_cols is NOT present in the sample meta.')
}

# Collate the uncollapsed counts one chunk at a time ----
# The uncollapsed counts file may be too large to hold in memory, so
# process_in_chunks is given collate_fastq_reads as the action to perform on
# each chunk of the file.

# Barcode sequences taken from the cell line and control barcode metadata.
known_barcodes <- unique(c(cell_line_meta$Sequence, CB_meta$Sequence))
# The threshold argument is declared without a type, so coerce it to numeric here.
low_abundance_threshold <- as.numeric(args$low_abundance_threshold)

chunked_results <- process_in_chunks(
  large_file_path = args$raw_counts_uncollapsed,
  chunk_size = 10^6,
  action = collate_fastq_reads,
  # Everything below is a parameter for collate_fastq_reads
  sample_meta = sample_meta,
  sequencing_index_cols = sequencing_index_cols,
  id_cols = id_cols,
  known_barcodes = known_barcodes,
  reverse_index2 = args$reverse_index2,
  barcode_col = args$barcode_col,
  low_abundance_threshold = low_abundance_threshold
)

# Combine the per-chunk results ----
# Pull prism_barcode_counts and unknown_barcode_counts out of every chunk
# result, stack the rows, then sum n within each combination of the id columns
# and the barcode column. The stacking and summing are done with data.table.
prism_chunks <- lapply(chunked_results, function(chunk) chunk$prism_barcode_counts)
prism_barcode_counts <- data.table::rbindlist(prism_chunks)
prism_barcode_counts <- prism_barcode_counts[, .(n = sum(n)), by = c(id_cols, args$barcode_col)]

unknown_chunks <- lapply(chunked_results, function(chunk) chunk$unknown_barcode_counts)
unknown_barcode_counts <- data.table::rbindlist(unknown_chunks)
unknown_barcode_counts <- unknown_barcode_counts[, .(n = sum(n)), by = c(id_cols, args$barcode_col)]

# Validation: refuse to continue with an empty prism_barcode_counts table ----
prism_row_count <- nrow(prism_barcode_counts)
if (prism_row_count == 0) {
  stop('ERROR: Empty file generated. No rows in prism_barcode_counts output.')
}

# Write both count tables into the output directory ----
# Each table is written unquoted and without row names; the destination path
# is printed before the write.
out_file <- file.path(args$out, 'prism_barcode_counts.csv')
print(paste("Writing prism_barcode_counts.csv to ", out_file))
write.csv(prism_barcode_counts, file = out_file, quote = FALSE, row.names = FALSE)

out_file <- file.path(args$out, 'unknown_barcode_counts.csv')
print(paste("Writing unknown_barcode_counts.csv to ", out_file))
write.csv(unknown_barcode_counts, file = out_file, quote = FALSE, row.names = FALSE)
40 changes: 37 additions & 3 deletions scripts/collate_fastq_reads.sh
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,36 @@ then
exit -1
fi

# Make RAW_COUNTS_UNCOLLAPSED an absolute path.
# A value that already starts with "/" is passed through ls (for an existing
# regular file ls prints the same path back); any other value is taken to be
# relative to BUILD_DIR.
# The expansion handed to ls is quoted so a path containing spaces or glob
# characters stays a single argument.
if [[ "$RAW_COUNTS_UNCOLLAPSED" = /* ]]
then
    RAW_COUNTS_UNCOLLAPSED=$(ls "$RAW_COUNTS_UNCOLLAPSED")
else
    RAW_COUNTS_UNCOLLAPSED="$BUILD_DIR/$RAW_COUNTS_UNCOLLAPSED"
fi

# Make SAMPLE_META an absolute path.
# A value that already starts with "/" is passed through ls (for an existing
# regular file ls prints the same path back); any other value is taken to be
# relative to BUILD_DIR.
# Each assignment appears exactly once: repeating the relative-path assignment
# would prepend BUILD_DIR a second time.
# The expansion handed to ls is quoted so a path containing spaces or glob
# characters stays a single argument.
if [[ "$SAMPLE_META" = /* ]]
then
    SAMPLE_META=$(ls "$SAMPLE_META")
else
    SAMPLE_META="$BUILD_DIR/$SAMPLE_META"
fi

# Make CELL_LINE_META an absolute path.
# A value that already starts with "/" is passed through ls (for an existing
# regular file ls prints the same path back); any other value is taken to be
# relative to BUILD_DIR.
# The expansion handed to ls is quoted so a path containing spaces or glob
# characters stays a single argument.
if [[ "$CELL_LINE_META" = /* ]]
then
    CELL_LINE_META=$(ls "$CELL_LINE_META")
else
    CELL_LINE_META="$BUILD_DIR/$CELL_LINE_META"
fi

# Make CONTROL_BARCODE_META an absolute path.
# A value that already starts with "/" is passed through ls (for an existing
# regular file ls prints the same path back); any other value is taken to be
# relative to BUILD_DIR.
# The expansion handed to ls is quoted so a path containing spaces or glob
# characters stays a single argument.
if [[ "$CONTROL_BARCODE_META" = /* ]]
then
    CONTROL_BARCODE_META=$(ls "$CONTROL_BARCODE_META")
else
    CONTROL_BARCODE_META="$BUILD_DIR/$CONTROL_BARCODE_META"
fi

echo Build dir is: $BUILD_DIR
Expand All @@ -77,11 +101,21 @@ PROJECT_DIR=$(dirname "$BUILD_DIR")
PROJECT_CODE=$(basename "$PROJECT_DIR")

echo Project Code: $PROJECT_CODE
echo REVERSE_INDEX2 is: $REVERSE_INDEX2
echo CONTROL_BARCODE_META is: $CONTROL_BARCODE_META
echo CELL_LINE_META is: $CELL_LINE_META

args=(
--raw_counts_uncollapsed "$RAW_COUNTS_UNCOLLAPSED"
--sample_meta "$SAMPLE_META"
--out "$BUILD_DIR"
--cell_line_meta "$CELL_LINE_META"
--CB_meta "$CONTROL_BARCODE_META"
--sequencing_index_cols="$SEQUENCING_INDEX_COLS"
--id_cols "$ID_COLS"
--reverse_index2 "$REVERSE_INDEX2"
--barcode_col "$BARCODE_COL"
--low_abundance_threshold "$LOW_ABUNDANCE_THRESHOLD"
--out "$BUILD_DIR"
)

echo Rscript collate_fastq_reads.R "${args[@]}"
Expand Down
1 change: 1 addition & 0 deletions scripts/compute_l2fc.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ library(tidyverse)
suppressPackageStartupMessages(library(argparse))
suppressPackageStartupMessages(library(dplyr))
source("./src/compute_l2fc.R")
source("./src/kitchen_utensils.R")

# Argument parser ----
parser <- ArgumentParser()
Expand Down
6 changes: 4 additions & 2 deletions scripts/compute_l2fc.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ echo Rscript compute_l2fc.R -c $NORMALIZED_COUNTS \
--sig_cols $SIG_COLS \
--ctrl_cols $CONTROL_COLS \
--count_threshold $COUNT_THRESHOLD \
--normalized_counts $NORMALIZED_COUNTS
--normalized_counts $NORMALIZED_COUNTS \
--cell_line_cols $CELL_LINE_COLS

Rscript compute_l2fc.R -c $NORMALIZED_COUNTS \
--out $BUILD_DIR \
Expand All @@ -41,4 +42,5 @@ Rscript compute_l2fc.R -c $NORMALIZED_COUNTS \
--sig_cols $SIG_COLS \
--ctrl_cols $CONTROL_COLS \
--count_threshold $COUNT_THRESHOLD \
--normalized_counts $NORMALIZED_COUNTS
--normalized_counts $NORMALIZED_COUNTS \
--cell_line_cols $CELL_LINE_COLS
Loading

0 comments on commit 74a4326

Please sign in to comment.