Skip to content

Commit

Permalink
Merge pull request #4 from cmap/dev
Browse files Browse the repository at this point in the history
Merge dev into main
  • Loading branch information
AnupJonchhe authored Oct 12, 2021
2 parents 07228da + e5bcb04 commit 1f8701e
Show file tree
Hide file tree
Showing 6 changed files with 127 additions and 45 deletions.
5 changes: 1 addition & 4 deletions CBnormalize.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ suppressPackageStartupMessages(library(reshape2))
## barcodes - a vector of control barcode Name identifiers
normalize <- function(X, barcodes) {
normalized <- X %>%
dplyr::group_by(sample_ID) %>%
dplyr::group_by(profile_id) %>%
dplyr::mutate(log_normalized_n = glm(y ~ x,
data = tibble(
y = log_dose[Name %in% barcodes],
Expand All @@ -37,15 +37,12 @@ parser$add_argument("-v", "--verbose", action="store_true", default=TRUE,
parser$add_argument("-q", "--quietly", action="store_false",
dest="verbose", help="Print little output")


parser$add_argument("--wkdir", default=getwd(), help="Working directory")
parser$add_argument("-c", "--filtered_counts", default="filtered_counts.csv",
help="path to file containing filtered counts")
parser$add_argument("--CB_meta", default="../metadata/CB_meta.csv", help = "Control Barcode metadata")
parser$add_argument("-o", "--out", default="", help = "Output path. Default is working directory")



# get command line options, if help option encountered print help and exit
args <- parser$parse_args()

Expand Down
85 changes: 85 additions & 0 deletions bcl2fastq.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#!/bin/bash
# bcl2fastq.sh - convert one sequencer run folder to fastq files and stage them.
#
# Steps performed:
#   1. Make bcl2fastq2 available through the site's `reuse` command.
#   2. Parse the command-line options (see usage below).
#   3. Resolve the run folder "/xchip/prism/MiSeq Outputs/*-<SEQ_CODE>".
#   4. Run bcl2fastq, writing into <OUT_DIR>/<SEQ_CODE>.
#   5. Copy <OUT_DIR>/<SEQ_CODE>/*.fastq.gz into <BUILD_DIR>/fastq/, where
#      BUILD_DIR defaults to <PSEQ_obelix>/<PROJ_CODE>.
#
# Exit status: 0 on success or --help, 1 on usage or processing errors.

source /broad/software/scripts/useuse
# NOTE(review): two bcl2fastq2 versions are requested; which one ends up
# active depends on the `reuse` environment. Kept exactly as before.
reuse .bcl2fastq2-v2.20.0 > /dev/null
reuse .bcl2fastq2-2.20.0.422 > /dev/null

OUT_DIR=/xchip/prism/bcl2fastq/
PSEQ_obelix=/cmap/obelix/pod/prismSeq/

# Print the help text (single copy, shared by the no-argument and --help paths).
usage() {
    printf "Usage ./bcl2fastq.sh [options]\nOptions include:\n"
    printf -- "-s, --seq_code \t\t Sequencer run code E.g. JNV9V \n"
    printf -- "-p, --proj_code \t Project code to prep project directory in /cmap/obelix/pod/prismSeq/ \n"
    printf -- "-b, --build_dir \t Build directory, usually on /cmap/obelix/. Overrides PROJ_CODE\n"
    printf -- "-o, --out_dir \t\t Path to temp storage of fastq files on /xchip/prism/ \n"
    printf -- "-h, --help \t\t Print this help text\n"
}

# Abort when an option that needs a value is the last word on the command line.
#   $1 - option name as typed
#   $2 - number of remaining arguments, including the option itself
require_value() {
    if [ "$2" -lt 2 ]; then
        printf "Missing value for option: %s\n" "$1" >&2
        usage >&2
        exit 1
    fi
}

#optional
if test $# -lt 1; then
    usage
    exit 1
fi

while test $# -gt 0; do
    case "$1" in
        -h|--help)
            usage
            exit 0
            ;;
        -s|--seq_code)
            require_value "$1" "$#"
            shift
            SEQ_CODE=$1
            ;;
        -b|--build_dir)
            require_value "$1" "$#"
            shift
            BUILD_DIR=$1
            ;;
        -p|--proj_code)
            require_value "$1" "$#"
            shift
            PROJ_CODE=$1
            ;;
        -o|--out_dir)
            require_value "$1" "$#"
            shift
            OUT_DIR=$1
            ;;
        *)
            # Only report it; the single shift after `esac` consumes it.
            # (An extra shift here used to swallow the following argument.)
            printf "Unknown parameter: %s \n" "$1" >&2
            ;;
    esac
    shift
done

# Without a run code the glob below would match any folder ending in "-".
if [ -z "$SEQ_CODE" ]; then
    printf "Error: a sequencer run code (-s, --seq_code) is required\n" >&2
    usage >&2
    exit 1
fi

# -p also creates missing parents and is a no-op when the directory exists.
mkdir -p "$OUT_DIR" || exit 1

# Expand the glob into an array so that zero or several matches can be
# detected; an unmatched pattern stays literal and fails the -d test.
run_folders=( "/xchip/prism/MiSeq Outputs/"*-"$SEQ_CODE" )
if [ "${#run_folders[@]}" -ne 1 ] || [ ! -d "${run_folders[0]}" ]; then
    printf "Error: expected exactly one run folder matching '/xchip/prism/MiSeq Outputs/*-%s', got:\n" "$SEQ_CODE" >&2
    printf "  %s\n" "${run_folders[@]}" >&2
    exit 1
fi
RUNFOLDER_DIR=${run_folders[0]}

echo "$RUNFOLDER_DIR"

# Stop here on failure so a partial conversion is never copied onward.
if ! bcl2fastq --runfolder-dir "$RUNFOLDER_DIR" \
        --output-dir "$OUT_DIR/$SEQ_CODE" \
        --minimum-trimmed-read-length 35 \
        --mask-short-adapter-reads 22 \
        --create-fastq-for-index-reads
then
    printf "Error: bcl2fastq failed for %s\n" "$RUNFOLDER_DIR" >&2
    exit 1
fi

if [ -z "$BUILD_DIR" ]; then
    if [ -z "$PROJ_CODE" ]; then
        # No destination was given: do not dump files into the shared
        # prismSeq root, just leave them where bcl2fastq wrote them.
        printf "No --build_dir or --proj_code given; fastq files left in %s\n" "$OUT_DIR/$SEQ_CODE" >&2
        exit 0
    fi
    BUILD_DIR=$PSEQ_obelix/$PROJ_CODE
fi

# Creates BUILD_DIR and its fastq/ subdirectory in one step.
mkdir -p "$BUILD_DIR/fastq/" || exit 1

echo "Copying fastq files from $OUT_DIR/$SEQ_CODE/ to $BUILD_DIR/fastq/"
cp "$OUT_DIR/$SEQ_CODE/"*.fastq.gz "$BUILD_DIR/fastq/"
35 changes: 20 additions & 15 deletions compute_l2fc.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,34 +12,39 @@ suppressPackageStartupMessages(library(dplyr)) #n()
## takes:
## normalized_counts - table with normalized_n column and control_sample column that designates the
## name of the control sample for each treatment sample
compute_l2fc = function(normalized_counts, control_types) {
compute_l2fc = function(normalized_counts, control_type) {
treatments = normalized_counts %>%
filter(!(trt_type %in% control_types),
filter(trt_type!=control_type, trt_type!="day_0",
is.na(Name)) %>%
dplyr::select(-Name, -log_dose, -n, -log_n, -log_normalized_n) %>%
group_by_at(setdiff(names(.), c("normalized_n", "tech_rep", "profile_id"))) %>%
group_by_at(setdiff(names(.), c("normalized_n", "tech_rep"))) %>%
dplyr::summarise(sum_normalized_n = sum(normalized_n)) %>%
ungroup()

controls = normalized_counts %>%
filter(trt_type %in% control_types,
filter(trt_type==control_type,
is.na(Name)) %>%
mutate(control_sample=sample_ID) %>%
dplyr::select(-Name, -log_dose, -n, -log_n, -log_normalized_n) %>%
group_by_at(setdiff(names(.), c("normalized_n", "tech_rep", "profile_id"))) %>%
group_by_at(setdiff(names(.), c("normalized_n", "tech_rep"))) %>%
dplyr::summarise(sum_normalized_n = sum(normalized_n)) %>%
ungroup() %>%
#group_by_at(setdiff(names(.), c("sum_normalized_n", "bio_rep"))) %>%
group_by(CCLE_name, DepMap_ID, prism_cell_set, control_sample) %>%
group_by(CCLE_name, DepMap_ID, prism_cell_set) %>%
dplyr::summarise(control_median_normalized_n = median(sum_normalized_n),
control_mad_sqrtN = mad(log10(sum_normalized_n))/sqrt(n())) %>%
ungroup() %>%
mutate(control_pass_QC = ifelse(control_mad_sqrtN > 0.5, F, T)) %>%
dplyr::select(CCLE_name, DepMap_ID, prism_cell_set, control_sample, control_median_normalized_n, control_mad_sqrtN, control_pass_QC)
dplyr::select(CCLE_name, DepMap_ID, prism_cell_set, control_median_normalized_n, control_mad_sqrtN, control_pass_QC)

if(nrow(controls)==0) {
print("No samples found for indicated control type.")
stop()
}

l2fc = treatments %>%
merge(controls, by=c("CCLE_name", "DepMap_ID", "prism_cell_set", "control_sample"), all.x=T, all.y=T) %>%
merge(controls, by=c("CCLE_name", "DepMap_ID", "prism_cell_set"), all.x=T, all.y=T) %>%
mutate(l2fc=log2(sum_normalized_n/control_median_normalized_n)) %>%
dplyr::relocate(project_code, CCLE_name, DepMap_ID, prism_cell_set, sample_ID, trt_type, control_sample, control_barcodes,
bio_rep)
dplyr::relocate(project_code, CCLE_name, DepMap_ID, prism_cell_set, profile_id, trt_type, control_barcodes,
bio_rep)

return(l2fc)
}
Expand All @@ -54,7 +59,7 @@ parser$add_argument("-q", "--quietly", action="store_false",
parser$add_argument("--wkdir", default=getwd(), help="Working directory")
parser$add_argument("-c", "--normalized_counts", default="normalized_counts.csv",
help="path to file containing normalized counts")
parser$add_argument("-ct", "--control_types", default="trt_ctrl,negcon", help="trt_types to use as control")
parser$add_argument("-ct", "--control_type", default="negcon", help="trt_type to use as control")
parser$add_argument("-o","--out", default="", help = "Output path. Default is working directory")

# get command line options, if help option encountered print help and exit
Expand All @@ -64,12 +69,12 @@ if (args$out == ""){
args$out = args$wkdir
}

control_types = unlist(strsplit(args$control_types, ","))
control_type = args$control_type

normalized_counts = read.csv(args$normalized_counts)

print("computing log-fold change")
l2fc = compute_l2fc(normalized_counts, control_types)
l2fc = compute_l2fc(normalized_counts, control_type)

l2fc_out = paste(args$out, "l2fc.csv", sep="/")
write.csv(l2fc, l2fc_out, row.names=F, quote=F)
4 changes: 2 additions & 2 deletions filter_counts.R
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ filter_raw_reads = function(raw_counts, sample_meta, cell_line_meta, cell_set_me
dplyr::select(-any_of(c("flowcell_name", "flowcell_lane", "index_1", "index_2", "members",
"lysate_well", "lysate_plate", "pcr_well", "pcr_plate",
"forward_read_cl_barcode", "LUA"))) %>%
dplyr::relocate(project_code, CCLE_name, DepMap_ID, prism_cell_set, Name, log_dose, profile_id, sample_ID, trt_type, control_sample, control_barcodes,
dplyr::relocate(project_code, CCLE_name, DepMap_ID, prism_cell_set, Name, log_dose, profile_id, trt_type, control_barcodes,
bio_rep, tech_rep) %>%
dplyr::relocate(n, .after=last_col())

Expand Down Expand Up @@ -86,7 +86,7 @@ parser$add_argument("-s", "--sample_meta", default="", help = "Sample metadata")
parser$add_argument("--cell_line_meta", default="../metadata/cell_line_meta.csv", help = "Cell Line metadata")
parser$add_argument("--cell_set_meta", default="../metadata/cell_set_meta.csv", help = "Cell set metadata")
parser$add_argument("--CB_meta", default="../metadata/CB_meta.csv", help = "Control Barcode metadata")
parser$add_argument("--id_cols", default="sample_ID,pcr_well,tech_rep", help = "Columns used to generate profile ids, comma-separated colnames from --sample_meta")
parser$add_argument("--id_cols", default="treatment,dose,dose_unit,day", help = "Columns used to generate profile ids, comma-separated colnames from --sample_meta")

# get command line options, if help option encountered print help and exit
args <- parser$parse_args()
Expand Down
2 changes: 1 addition & 1 deletion generate_biomarkers.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ suppressPackageStartupMessages(library(cdsrbiomarker))
generate_biomarkers = function(collapsed_values) {
bio_in = collapsed_values %>%
filter(trt_pass_QC) %>%
dcast(DepMap_ID~sample_ID+control_sample, value.var="median_l2fc") %>%
dcast(DepMap_ID~profile_id, value.var="median_l2fc") %>%
column_to_rownames("DepMap_ID")

bio_out = cdsrbiomarker::get_biomarkers(bio_in)
Expand Down
41 changes: 18 additions & 23 deletions replicate_QC.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ suppressPackageStartupMessages(library(magrittr))
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(reshape2))
suppressPackageStartupMessages(library(tibble))
suppressPackageStartupMessages(library(stringr))

## check_replicate_cor
## checks that technical and biological replicates are all well correlated with each other
Expand All @@ -18,7 +19,7 @@ suppressPackageStartupMessages(library(tibble))
check_replicate_cor = function(normalized_counts, out) {
tech_rep_cor = normalized_counts %>%
filter(is.na(Name)) %>%
dcast(CCLE_name~sample_ID+bio_rep+tech_rep, value.var="log_normalized_n") %>%
dcast(CCLE_name~profile_id+bio_rep+tech_rep, value.var="log_normalized_n") %>%
dplyr::select(-CCLE_name) %>%
cor(use="complete.obs") %>% as.data.frame()

Expand All @@ -27,31 +28,25 @@ check_replicate_cor = function(normalized_counts, out) {

tech_rep_cor_long = tech_rep_cor %>%
rownames_to_column("sample_1") %>%
melt(id.vars="sample_1", variable.name="sample_2", value.name="cor") %>%
mutate(sample_ID_1 = as.character(sample_1) %>% purrr::map(strsplit, "_") %>% purrr::map(`[[`, 1) %>% purrr::map(`[`, 1) %>% unlist(),
sample_ID_2 = as.character(sample_2) %>% purrr::map(strsplit, "_") %>% purrr::map(`[[`, 1) %>% purrr::map(`[`, 1) %>% unlist()) %>%
filter(sample_ID_1 == sample_ID_2) %>%
mutate(bio_rep_1 = as.character(sample_1) %>% purrr::map(strsplit, "_") %>% purrr::map(`[[`, 1) %>% purrr::map(`[`, 2) %>% unlist(),
bio_rep_2 = as.character(sample_2) %>% purrr::map(strsplit, "_") %>% purrr::map(`[[`, 1) %>% purrr::map(`[`, 2) %>% unlist()) %>%
filter(bio_rep_1 == bio_rep_2) %>%
mutate(tech_rep_1 = as.character(sample_1) %>% purrr::map(strsplit, "_") %>% purrr::map(`[[`, 1) %>% purrr::map(`[`, 3) %>% unlist(),
tech_rep_2 = as.character(sample_2) %>% purrr::map(strsplit, "_") %>% purrr::map(`[[`, 1) %>% purrr::map(`[`, 3) %>% unlist()) %>%
filter(tech_rep_2>tech_rep_1) %>%
dplyr::rename(sample_ID = sample_ID_1, bio_rep = bio_rep_1) %>%
dcast(sample_ID+bio_rep~tech_rep_1+tech_rep_2, value.var="cor")
melt(id.vars="sample_1", variable.name="sample_2", value.name="tech_rep_cor") %>%
mutate(sample_1 = gsub('.{2}$', '', sample_1),
sample_2 = gsub('.{2}$', '', sample_2)) %>%
filter(sample_1 == sample_2) %>%
dplyr::rename(profile_id = sample_1) %>%
dplyr::select(profile_id, tech_rep_cor)

trep_long_out = paste(args$out, "tech_rep_cor_long.csv", sep='/')
write.csv(tech_rep_cor_long, trep_long_out, row.names=T, quote=F)
write.csv(tech_rep_cor_long, trep_long_out, row.names=F, quote=F)

tech_collapsed_counts = normalized_counts %>%
filter(is.na(Name)) %>%
dplyr::select(-Name, -log_dose, -n, -log_n, -log_normalized_n, -profile_id) %>%
dplyr::select(-Name, -log_dose, -n, -log_n, -log_normalized_n) %>%
group_by_at(setdiff(names(.), c("normalized_n", "tech_rep"))) %>%
dplyr::summarise(sum_normalized_n = sum(normalized_n)) %>%
ungroup()

bio_rep_cor = tech_collapsed_counts %>%
dcast(CCLE_name~sample_ID+bio_rep, value.var="sum_normalized_n") %>%
dcast(CCLE_name~profile_id+bio_rep, value.var="sum_normalized_n") %>%
dplyr::select(-CCLE_name) %>%
cor(use="complete.obs") %>%
as.data.frame()
Expand All @@ -62,17 +57,17 @@ check_replicate_cor = function(normalized_counts, out) {
bio_rep_cor_long = bio_rep_cor %>%
rownames_to_column("sample_1") %>%
melt(id.vars="sample_1", variable.name="sample_2", value.name="cor") %>%
mutate(sample_ID_1 = as.character(sample_1) %>% purrr::map(strsplit, "_") %>% purrr::map(`[[`, 1) %>% purrr::map(`[`, 1) %>% unlist(),
sample_ID_2 = as.character(sample_2) %>% purrr::map(strsplit, "_") %>% purrr::map(`[[`, 1) %>% purrr::map(`[`, 1) %>% unlist()) %>%
mutate(sample_ID_1 = gsub('.{2}$', '', sample_1),
sample_ID_2 = gsub('.{2}$', '', sample_2)) %>%
filter(sample_ID_1 == sample_ID_2) %>%
mutate(bio_rep_1 = as.character(sample_1) %>% purrr::map(strsplit, "_") %>% purrr::map(`[[`, 1) %>% purrr::map(`[`, 2) %>% unlist(),
bio_rep_2 = as.character(sample_2) %>% purrr::map(strsplit, "_") %>% purrr::map(`[[`, 1) %>% purrr::map(`[`, 2) %>% unlist()) %>%
mutate(bio_rep_1 = as.character(sample_1) %>% purrr::map(str_sub, -1, -1) %>% unlist(),
bio_rep_2 = as.character(sample_2) %>% purrr::map(str_sub, -1, -1) %>% unlist()) %>%
filter(bio_rep_2>bio_rep_1) %>%
dplyr::rename(sample_ID = sample_ID_1) %>%
dcast(sample_ID~bio_rep_1+bio_rep_2, value.var="cor")
dplyr::rename(profile_id = sample_ID_1) %>%
dcast(profile_id~bio_rep_1+bio_rep_2, value.var="cor")

brep_long_out = paste(args$out, "bio_rep_cor_long.csv", sep='/')
write.csv(bio_rep_cor_long, brep_long_out, row.names=T, quote=F)
write.csv(bio_rep_cor_long, brep_long_out, row.names=F, quote=F)
}


Expand Down

0 comments on commit 1f8701e

Please sign in to comment.