-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
begin exploring markdown for qc readout
- Loading branch information
1 parent
8aa2371
commit 427c9d2
Showing
5 changed files
with
171 additions
and
201 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
name: Build Rmarkdown container (env/qc_markdown.Dockerfile) | ||
|
||
on: | ||
push: | ||
paths: | ||
- 'env/qc_markdown.Dockerfile' | ||
- '.github/workflows/qc_markdown.yml' | ||
pull_request: | ||
paths: | ||
- 'env/qc_markdown.Dockerfile' | ||
- '.github/workflows/qc_markdown.yml' | ||
|
||
jobs: | ||
build: | ||
runs-on: ubuntu-latest | ||
steps: | ||
- uses: actions/checkout@v2 | ||
|
||
# Build Tools | ||
- name: Build and Publish | ||
uses: elgohr/Publish-Docker-Github-Action@v5 | ||
with: | ||
name: sjwidmay/haplotype_reconstruction_qtl_nf | ||
username: ${{ secrets.SJW_DOCKER_USER }} | ||
password: ${{ secrets.SJW_DOCKER_PASS }} | ||
snapshot: true | ||
dockerfile: qc_markdown.Dockerfile | ||
workdir: "env" | ||
tags: "qc_markdown" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,16 +5,33 @@ library(ggplot2) | |
library(qtlcharts) | ||
library(broman) | ||
library(fst) | ||
DO_cross <- qtl2::read_cross2("data/DOforqtl2.json") | ||
DO_cross <- qtl2::drop_nullmarkers(DO_cross) | ||
################################################################################ | ||
# Perform marker and sample QC using cross object and intensities upstream in | ||
# haplotype reconstruction pipeline. | ||
# | ||
# Sam Widmayer | ||
# [email protected] | ||
# 20231208 | ||
################################################################################ | ||
test_dir <- "/fastscratch/QC_HAP_outputDir/work/b8/fdfd8ce7bdd497445041b0ac27a8b7" | ||
setwd(test_dir) | ||
args <- commandArgs(trailingOnly = TRUE) | ||
|
||
# import cross object | ||
# cross <- args[1] | ||
cross <- "/projects/compsci/vmp/USERS/widmas/haplotype_reconstruction_qtl-nf/projects/do_oocyte/geno_probs/cross.RData" | ||
load(cross) | ||
|
||
# import intensities | ||
# intensities <- args[2] | ||
intensities <- "/projects/compsci/vmp/USERS/widmas/haplotype_reconstruction_qtl-nf/projects/do_oocyte/qtl2genos/intensities.fst" | ||
print(intensities) | ||
|
||
X4WC_cross <- qtl2::read_cross2("data/4WCforqtl2.json") | ||
X4WC_cross <- qtl2::drop_nullmarkers(X4WC_cross) | ||
|
||
# Reordering genotypes so that most common allele in founders is first | ||
for(chr in seq_along(DO_cross$founder_geno)) { | ||
fg <- DO_cross$founder_geno[[chr]] | ||
g <- DO_cross$geno[[chr]] | ||
for(chr in seq_along(cross$founder_geno)) { | ||
fg <- cross$founder_geno[[chr]] | ||
g <- cross$geno[[chr]] | ||
f1 <- colSums(fg==1)/colSums(fg != 0) | ||
|
||
fg[fg==0] <- NA | ||
|
@@ -26,178 +43,56 @@ for(chr in seq_along(DO_cross$founder_geno)) { | |
fg[is.na(fg)] <- 0 | ||
g[is.na(g)] <- 0 | ||
|
||
DO_cross$founder_geno[[chr]] <- fg | ||
DO_cross$geno[[chr]] <- g | ||
} | ||
for(chr in seq_along(X4WC_cross$founder_geno)) { | ||
fg <- X4WC_cross$founder_geno[[chr]] | ||
g <- X4WC_cross$geno[[chr]] | ||
f1 <- colSums(fg==1)/colSums(fg != 0) | ||
|
||
fg[fg==0] <- NA | ||
g[g==0] <- NA | ||
|
||
fg[,f1 < 0.5] <- 4 - fg[,f1 < 0.5] | ||
g[,f1 < 0.5] <- 4 - g[,f1 < 0.5] | ||
|
||
fg[is.na(fg)] <- 0 | ||
g[is.na(g)] <- 0 | ||
|
||
X4WC_cross$founder_geno[[chr]] <- fg | ||
X4WC_cross$geno[[chr]] <- g | ||
cross$founder_geno[[chr]] <- fg | ||
cross$geno[[chr]] <- g | ||
} | ||
|
||
# percent missing genotypes - DO | ||
percent_missing <- qtl2::n_missing(DO_cross, "ind", "prop")*100 | ||
missing_genos_df <- data.frame(names(percent_missing), percent_missing) %>% | ||
`colnames<-`(c("sample","percent_missing")) | ||
missing_genos_plot <- ggplot(data = missing_genos_df, mapping = aes(x = sample, | ||
y = percent_missing)) + | ||
theme_bw() + | ||
geom_point(shape = 21) + | ||
ylim(c(0,100)) + | ||
labs(title = "DO Missing Genotypes") + | ||
theme(legend.position = "bottom", | ||
panel.grid = element_blank(), | ||
axis.text.x = element_blank(), | ||
axis.ticks.x = element_blank()) | ||
ggsave(missing_genos_plot, filename = "plots/DO_missing_genos.png", width = 6, height = 6) | ||
percent_missing <- qtl2::n_missing(cross, "ind", "prop")*100 | ||
|
||
# percent missing genotypes - 4WC | ||
percent_missing_4WC <- qtl2::n_missing(X4WC_cross, "ind", "prop")*100 | ||
missing_genos_df <- data.frame(names(percent_missing_4WC), percent_missing_4WC) %>% | ||
`colnames<-`(c("sample","percent_missing")) | ||
missing_genos_plot <- ggplot(data = missing_genos_df, mapping = aes(x = sample, | ||
y = percent_missing_4WC)) + | ||
theme_bw() + | ||
geom_point(shape = 21) + | ||
ylim(c(0,100)) + | ||
labs(title = "4WC Missing Genotypes") + | ||
theme(legend.position = "bottom", | ||
panel.grid = element_blank(), | ||
axis.text.x = element_blank(), | ||
axis.ticks.x = element_blank()) | ||
missing_genos_plot | ||
ggsave(missing_genos_plot, filename = "plots/4WC_missing_genos.png", width = 6, height = 6) | ||
# Sample Duplicates | ||
cg <- compare_geno(cross, cores=0) | ||
|
||
## DO Sex checks | ||
## Reading in all probe intensities | ||
int <- fst::read.fst("data/DO4WC_intensities.fst") | ||
# Sex checks | ||
# Reading in all probe intensities | ||
int <- fst::read.fst(intensities) | ||
int <- int[seq(1, nrow(int), by=2),-(1:2)] + int[-seq(1, nrow(int), by=2),-(1:2)] | ||
DOint <- int[,which(colnames(int) %in% qtl2::ind_ids(DO_cross))] | ||
X4WCint <- int[,which(colnames(int) %in% qtl2::ind_ids(X4WC_cross))] | ||
int <- int[,which(colnames(int) %in% qtl2::ind_ids(cross))] | ||
|
||
# Interactive plot of DO array intensities per sample | ||
n <- names(sort(percent_missing, decreasing=TRUE)) | ||
DO_iboxplot <- iboxplot(log10(t(DOint[,n])+1), orderByMedian=FALSE, chartOpts=list(ylab="log10(SNP intensity + 1)")) | ||
save(DO_iboxplot, file = "plots/DO_array_interactiveboxplot.RData") | ||
# Interactive plot of 4WC array intensities per sample | ||
n <- names(sort(percent_missing_4WC, decreasing=TRUE)) | ||
X4WC_iboxplot <- iboxplot(log10(t(X4WCint[,n])+1), orderByMedian=FALSE, chartOpts=list(ylab="log10(SNP intensity + 1)")) | ||
save(X4WC_iboxplot, file = "plots/4WC_array_interactiveboxplot.RData") | ||
|
||
## Reading in sex chromosome intensities | ||
xint <- qtl2::read_csv_numer(filename = "data/all_genos/DO_4WC_chrXint.csv") | ||
yint <- qtl2::read_csv_numer(filename = "data/all_genos/DO_4WC_chrYint.csv") | ||
DO_covar <- read.csv("data/DO_covar.csv") | ||
DOxint <- xint[,colnames(xint)[which(colnames(xint) %in% DO_covar$SampleID)]] | ||
DOyint <- yint[,colnames(yint)[which(colnames(yint) %in% DO_covar$SampleID)]] | ||
sex <- substr(colnames(DOxint), 1, 1) | ||
|
||
## Testing for uninformative markers and filtering those out | ||
x_pval <- apply(DOxint, 1, function(a) t.test(a ~ sex)$p.value) | ||
y_pval <- apply(DOyint, 1, function(a) t.test(a ~ sex)$p.value) | ||
DOxint_ave <- colMeans(DOxint[x_pval < 0.05/length(x_pval),], na.rm=TRUE) | ||
DOyint_ave <- colMeans(DOyint[y_pval < 0.05/length(y_pval),], na.rm=TRUE) | ||
|
||
## Plotting sex chromosome intensities to verify sexes | ||
xyints <- data.frame(DOxint_ave, DOyint_ave) %>% | ||
dplyr::mutate(sample = rownames(.)) %>% | ||
dplyr::left_join(., DO_covar %>% dplyr::rename(sample = SampleID)) | ||
rownames(xyints) <- NULL | ||
labels <- paste0(names(DOxint_ave), " (", round(percent_missing), "%)") | ||
DOsexcheck_plot <- ggplot(data = xyints, mapping = aes(x = DOxint_ave, | ||
y = DOyint_ave, | ||
fill = Sex, | ||
label = labels)) + | ||
theme_bw() + | ||
geom_point(shape = 21, size = 4) + | ||
scale_fill_manual(values = c("green","purple")) + | ||
labs(x = "Average X chr intensity", | ||
y = "Average Y chr intensity") | ||
plotly::ggplotly(DOsexcheck_plot, tooltip = "label") | ||
|
||
|
||
## 4WC Sex Checks | ||
## Reading in sex chromosome intensities | ||
X4WC_covar <- read.csv("data/4WC_covar.csv") | ||
X4WCxint <- xint[,colnames(xint)[which(colnames(xint) %in% X4WC_covar$SampleID)]] | ||
X4WCyint <- yint[,colnames(yint)[which(colnames(yint) %in% X4WC_covar$SampleID)]] | ||
X4WCmetadata <- data.frame(colnames(X4WCxint)) %>% | ||
`colnames<-`(c("sample")) %>% | ||
dplyr::left_join(., X4WC_covar %>% | ||
dplyr::rename(sample = SampleID)) | ||
sex <- X4WCmetadata$Sex | ||
|
||
## Testing for uninformative markers and filtering those out | ||
x_pval <- apply(X4WCxint, 1, function(a) t.test(a ~ sex)$p.value) | ||
y_pval <- apply(X4WCyint, 1, function(a) t.test(a ~ sex)$p.value) | ||
X4WCxint_ave <- colMeans(X4WCxint[x_pval < 0.05/length(x_pval),], na.rm=TRUE) | ||
X4WCyint_ave <- colMeans(X4WCyint[y_pval < 0.05/length(y_pval),], na.rm=TRUE) | ||
|
||
## Plotting sex chromosome intensities to verify sexes | ||
xyints <- data.frame(X4WCxint_ave, X4WCyint_ave) %>% | ||
dplyr::mutate(sample = rownames(.)) %>% | ||
dplyr::left_join(., X4WC_covar %>% dplyr::rename(sample = SampleID)) | ||
rownames(xyints) <- NULL | ||
labels <- paste0(names(X4WCxint_ave), " (", round(percent_missing_4WC), "%)") | ||
X4WCsexcheck_plot <- ggplot(data = xyints, mapping = aes(x = X4WCxint_ave, | ||
y = X4WCyint_ave, | ||
fill = Sex, | ||
label = labels)) + | ||
theme_bw() + | ||
geom_point(shape = 21, size = 4) + | ||
scale_fill_manual(values = c("blue","orange")) + | ||
labs(x = "Average X chr intensity", | ||
y = "Average Y chr intensity") | ||
plotly::ggplotly(X4WCsexcheck_plot, tooltip = "label") | ||
|
||
|
||
## Sample Duplicates | ||
cg <- compare_geno(DO_cross, cores=0) | ||
summary(cg) | ||
cg <- compare_geno(X4WC_cross, cores=0) | ||
summary(cg) | ||
|
||
save(DO_cross, file = "data/DO_cross.RData") | ||
save(X4WC_cross, file = "data/4WC_cross.RData") | ||
|
||
iboxplot <- iboxplot(log10(t(int)+1), | ||
orderByMedian=TRUE, | ||
chartOpts=list(ylab="log10(SNP intensity + 1)")) | ||
|
||
## Genotype Frequencies | ||
## DO_cross | ||
g <- do.call("cbind", DO_cross$geno[1:19]) | ||
fg <- do.call("cbind", DO_cross$founder_geno[1:19]) | ||
g <- do.call("cbind", cross$geno[1:19]) | ||
fg <- do.call("cbind", cross$founder_geno[1:19]) | ||
|
||
# find markers with missing genotypes in samples | ||
g <- g[,colSums(fg==0)==0] | ||
|
||
# find markers with missing genotypes in the founders | ||
fg <- fg[,colSums(fg==0)==0] | ||
fgn <- colSums(fg==3) | ||
gf_ind <- vector("list", 4) | ||
for(i in 1:4) { | ||
gf_ind[[i]] <- t(apply(g[,fgn==i], 1, function(a) table(factor(a, 1:3))/sum(a != 0))) | ||
} | ||
|
||
png(file=paste0("plots/DOcross_genotype_frequencies.png")) | ||
# plot genotype frequencies | ||
par(mfrow=c(2,2), mar=c(0.6, 0.6, 2.6, 0.6)) | ||
for(i in 1:4) { | ||
triplot(c("AA", "AB", "BB"), main=paste0("MAF = ", i, "/8")) | ||
tripoints(gf_ind[[i]], pch=21, bg="lightblue") | ||
tripoints(c((1-i/8)^2, 2*i/8*(1-i/8), (i/8)^2), pch=21, bg="violetred") | ||
|
||
if(i>=3) { # label mouse with lowest het | ||
wh <- which(gf_ind[[i]][,2] == min(gf_ind[[i]][,2])) | ||
tritext(gf_ind[[i]][wh,,drop=FALSE] + c(0.02, -0.02, 0), | ||
names(wh), adj=c(0, 1)) | ||
} | ||
|
||
# label other mice | ||
if(i==1) { | ||
lab <- rownames(gf_ind[[i]])[gf_ind[[i]][,2]>0.3] | ||
|
@@ -211,7 +106,7 @@ for(i in 1:4) { | |
else if(i==4) { | ||
lab <- rownames(gf_ind[[i]])[gf_ind[[i]][,2]>0.6] | ||
} | ||
|
||
for(ind in lab) { | ||
if(grepl("^F", ind) && i != 3) { | ||
tritext(gf_ind[[i]][ind,,drop=FALSE] + c(-0.01, 0, +0.01), ind, adj=c(1,0.5)) | ||
|
@@ -220,54 +115,65 @@ for(i in 1:4) { | |
} | ||
} | ||
} | ||
dev.off() | ||
|
||
|
||
# ## 4WC ## NOT GENERALIZABLE TO THINGS OTHER THAN DO (probably because different MAF distribution) | ||
# g <- do.call("cbind", X4WC_cross$geno[1:19]) | ||
# fg <- do.call("cbind", X4WC_cross$founder_geno[1:19]) | ||
# g <- g[,colSums(fg==0)==0] | ||
# fg <- fg[,colSums(fg==0)==0] | ||
# fgn <- colSums(fg==3) | ||
# gf_ind <- vector("list", 4) | ||
# for(i in 1:4) { | ||
# gf_ind[[i]] <- t(apply(g[,fgn==i], 1, function(a) table(factor(a, 1:3))/sum(a != 0))) | ||
# } | ||
# ## Reading in sex chromosome intensities | ||
# xint <- qtl2::read_csv_numer(filename = "data/all_genos/chrXint.csv") | ||
# yint <- qtl2::read_csv_numer(filename = "data/all_genos/chrYint.csv") | ||
# | ||
# png(file=paste0("plots/4WC_genotype_frequencies.png")) | ||
# par(mfrow=c(2,2), mar=c(0.6, 0.6, 2.6, 0.6)) | ||
# for(i in 1:4) { | ||
# triplot(c("AA", "AB", "BB"), main=paste0("MAF = ", i, "/8")) | ||
# tripoints(gf_ind[[i]], pch=21, bg="lightblue") | ||
# tripoints(c((1-i/8)^2, 2*i/8*(1-i/8), (i/8)^2), pch=21, bg="violetred") | ||
# DOxint <- xint[,colnames(xint)[which(colnames(xint) %in% metadata$sample)]] | ||
# DOyint <- yint[,colnames(yint)[which(colnames(yint) %in% metadata$sample)]] | ||
# sex <- cross$is_female | ||
# sex <- dplyr::if_else(condition = sex == TRUE, true = "F", false = "M") | ||
# names(sex) <- names(cross$is_female) | ||
# | ||
# if(length(levels(as.factor(sex))) == 1){ | ||
# print("Skipping t-test; only 1 level to t-test") | ||
# DOxint_ave <- colMeans(DOxint, na.rm=TRUE) | ||
# DOyint_ave <- colMeans(DOyint, na.rm=TRUE) | ||
# xyints <- data.frame(DOxint_ave, DOyint_ave) | ||
# | ||
# # test | ||
# xyints <- cbind(xyints, sex) | ||
# labels <- paste0(rownames(xyints), " (", round(percent_missing), "%)") | ||
# DOsexcheck_plot <- ggplot(data = xyints, mapping = aes(x = DOxint_ave, | ||
# y = DOyint_ave, | ||
# fill = sex, | ||
# label = labels)) + | ||
# theme_bw() + | ||
# geom_point(shape = 21, size = 4) + | ||
# scale_fill_manual(values = c("green","purple")) + | ||
# labs(x = "Average X chr intensity", | ||
# y = "Average Y chr intensity") | ||
# | ||
# if(i>=3) { # label mouse with lowest het | ||
# wh <- which(gf_ind[[i]][,2] == min(gf_ind[[i]][,2])) | ||
# tritext(gf_ind[[i]][wh,,drop=FALSE] + c(0.02, -0.02, 0), | ||
# names(wh), adj=c(0, 1)) | ||
# } | ||
# plotly::ggplotly(DOsexcheck_plot, tooltip = "label") | ||
# } else { | ||
# | ||
# # label other mice | ||
# if(i==1) { | ||
# lab <- rownames(gf_ind[[i]])[gf_ind[[i]][,2]>0.3] | ||
# } | ||
# else if(i==2) { | ||
# lab <- rownames(gf_ind[[i]])[gf_ind[[i]][,2]>0.48] | ||
# } | ||
# else if(i==3) { | ||
# lab <- rownames(gf_ind[[i]])[gf_ind[[i]][,2]>0.51] | ||
# } | ||
# else if(i==4) { | ||
# lab <- rownames(gf_ind[[i]])[gf_ind[[i]][,2]>0.6] | ||
# } | ||
# print("Testing for uninformative markers and filtering those out") | ||
# x_pval <- apply(DOxint, 1, function(a) t.test(a ~ sex)$p.value) | ||
# y_pval <- apply(DOyint, 1, function(a) t.test(a ~ sex)$p.value) | ||
# DOxint_ave <- colMeans(DOxint[x_pval < 0.05/length(x_pval),], na.rm=TRUE) | ||
# DOyint_ave <- colMeans(DOyint[y_pval < 0.05/length(y_pval),], na.rm=TRUE) | ||
# | ||
# for(ind in lab) { | ||
# if(grepl("^F", ind) && i != 3) { | ||
# tritext(gf_ind[[i]][ind,,drop=FALSE] + c(-0.01, 0, +0.01), ind, adj=c(1,0.5)) | ||
# } else { | ||
# tritext(gf_ind[[i]][ind,,drop=FALSE] + c(0.01, 0, -0.01), ind, adj=c(0,0.5)) | ||
# } | ||
# } | ||
# ## Plotting sex chromosome intensities to verify sexes | ||
# xyints <- data.frame(DOxint_ave, DOyint_ave) %>% | ||
# dplyr::mutate(sample = rownames(.)) | ||
# rownames(xyints) <- NULL | ||
# labels <- paste0(names(DOxint_ave), " (", round(percent_missing), "%)") | ||
# DOsexcheck_plot <- ggplot(data = xyints, mapping = aes(x = DOxint_ave, | ||
# y = DOyint_ave, | ||
# fill = Sex, | ||
# label = labels)) + | ||
# theme_bw() + | ||
# geom_point(shape = 21, size = 4) + | ||
# scale_fill_manual(values = c("green","purple")) + | ||
# labs(x = "Average X chr intensity", | ||
# y = "Average Y chr intensity") | ||
# plotly::ggplotly(DOsexcheck_plot, tooltip = "label") | ||
# } | ||
# dev.off() | ||
# | ||
# | ||
|
||
# # save(cross, file = "data/cross.RData") | ||
# | ||
# |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,7 +7,7 @@ library(qtl2convert) | |
# | ||
# Sam Widmayer | ||
# [email protected] | ||
# 20230714 | ||
# 20231208 | ||
################################################################################ | ||
# test_dir <- "/fastscratch/QC_HAP_outputDir/work/b2/23a342cd54730a19c9efab3e958378" | ||
# setwd(test_dir) | ||
|
Oops, something went wrong.