diff --git a/Scripts/Ancestry_identifier/Ancestry_identifier.R b/Scripts/Ancestry_identifier/Ancestry_identifier.R index d596b705..0368eb38 100644 --- a/Scripts/Ancestry_identifier/Ancestry_identifier.R +++ b/Scripts/Ancestry_identifier/Ancestry_identifier.R @@ -1,340 +1,340 @@ -#!/usr/bin/Rscript -# This script was written by Oliver Pain whilst at King's College London University. -start.time <- Sys.time() -library("optparse") - -option_list = list( -make_option("--target_plink_chr", action="store", default=NULL, type='character', - help="Path to per chromosome target PLINK2 files [required]"), -make_option("--target_keep", action="store", default=NULL, type='character', - help="Path to file listing individuals in the target sample to retain [optional]"), -make_option("--ref_plink_chr", action="store", default=NULL, type='character', - help="Path to per chromosome reference PLINK2 files [required]"), -make_option("--ref_keep", action="store", default=NULL, type='character', - help="Path to file listing individuals in the reference sample to retain [optional]"), -make_option("--maf", action="store", default=0.05, type='numeric', - help="Minor allele frequency threshold [optional]"), -make_option("--geno", action="store", default=0.02, type='numeric', - help="Variant missingness threshold [optional]"), -make_option("--hwe", action="store", default=1e-6, type='numeric', - help="Hardy Weinberg p-value threshold. [optional]"), -make_option("--n_pcs", action="store", default=6, type='numeric', - help="Number of PCs (min=4) [optional]"), -make_option("--plink2", action="store", default='plink2', type='character', - help="Path PLINK software binary [optional]"), -make_option("--output", action="store", default=NULL, type='character', - help="Path for output files [required]"), -make_option("--pop_data", action="store", default=NULL, type='character', - help="Population data for the reference samples [required]"), -make_option("--model_method", action="store", default='glmnet', type='character', - help="Method used for generate prediction model [optional]"), -make_option("--sd_rule", action="store", default=F, type='logical', - help="Logical indicating whether the 3SD rule should be used to define ancestry, or the model-based approach [optional]"), -make_option("--prob_thresh", action="store", default=0.95, type='numeric', - help="Indicates whether probability threshold should be used when defining ancestry [optional]"), -make_option("--test", action="store", default=NA, type='character', - help="Specify test mode [optional]"), -make_option("--memory", action="store", default=5000, type='numeric', - help="Memory limit [optional]") -) - -opt = parse_args(OptionParser(option_list=option_list)) - -# Load dependencies -library(GenoUtils) -source('../functions/misc.R') -source_all('../functions') -library(data.table) -library(caret) -library(pROC) -library(verification) -library(ggplot2) -library(cowplot) - -# Check required inputs -if(is.null(opt$target_plink_chr)){ - stop('--target_plink_chr must be specified.\n') -} -if(is.null(opt$ref_plink_chr)){ - stop('--ref_plink_chr must be specified.\n') -} -if(is.null(opt$output)){ - stop('--output must be specified.\n') -} -if(is.null(opt$pop_data)){ - stop('--pop_data must be specified.\n') -} - -# Create output directory -opt$out_dir<-paste0(dirname(opt$output),'/') -system(paste0('mkdir -p ',opt$out_dir)) - -# Create temp directory -tmp_dir<-tempdir() - -# Initiate log file -log_file <- paste0(opt$output,'.log') -log_header(log_file = log_file, opt = opt, script = 'Ancestry_identifier.R', start.time = start.time) - - -# If testing, change CHROMS to chr value, and lower ancestry probability threshold -if(!is.na(opt$test) && opt$test == 'NA'){ - opt$test<-NA -} -if(!is.na(opt$test)){ - CHROMS <- as.numeric(gsub('chr','',opt$test)) - opt$prob_thresh <- 0.5 - log_add(log_file = log_file, message = 'Lowering prob_thresh parameter to 0.5 for testing.') -} - -if(nrow(fread(paste0(opt$ref_plink_chr, CHROMS[1],'.psam'))) < 100){ - stop('opt$ref_plink_chr must contain at least 100 individuals.') -} - -########### -# Extract target_keep -########### - -if(!is.null(opt$target_keep)){ - plink_subset(keep = opt$target_keep, chr = CHROMS, plink2 = opt$plink2, pfile = opt$target_plink_chr, out = paste0(tmp_dir,'/target_subset.chr')) - opt$target_plink_chr_subset<-paste0(tmp_dir,'/target_subset') -} else { - opt$target_plink_chr_subset<-opt$target_plink_chr -} - -########### -# Extract ref_keep -########### - -if(!is.null(opt$ref_keep)){ - plink_subset(keep = opt$ref_keep, chr = CHROMS, plink2 = opt$plink2, pfile = opt$ref_plink_chr, out = paste0(tmp_dir,'/ref_subset.chr')) - opt$ref_plink_chr_subset<-paste0(tmp_dir,'/ref_subset.chr') -} else { - opt$ref_plink_chr_subset<-opt$ref_plink_chr -} - -########### -# QC target -########### - -# If target sample size is <100, only apply SNP missingness parameter -psam<-fread(paste0(opt$target_plink_chr_subset, CHROMS[1], '.psam')) - -if(nrow(psam) > 100){ - target_qc_snplist<-plink_qc_snplist(pfile = opt$target_plink_chr_subset, chr = CHROMS, plink2 = opt$plink2, geno = opt$geno, maf = opt$maf, hwe = opt$hwe) -} else { - target_qc_snplist<-plink_qc_snplist(pfile = opt$target_plink_chr_subset, chr = CHROMS, plink2 = opt$plink2, geno = opt$geno) - log_add(log_file = log_file, message = 'Target sample size is <100 so only checking genotype missingness.') -} - -########### -# QC reference -########### - -ref_qc_snplist<-plink_qc_snplist(pfile = opt$ref_plink_chr_subset, chr = CHROMS, plink2 = opt$plink2, geno = opt$geno, maf = opt$maf, hwe = opt$hwe) - -########### -# Harmonise target and reference genetic data -########### - -# read in target pvar file -targ_pvar<-read_pvar(opt$target_plink_chr_subset, chr = CHROMS) - -# read in reference pvar file -ref_pvar<-read_pvar(opt$ref_plink_chr_subset, chr = CHROMS) - -# retain variants surviving QC -targ_pvar<-targ_pvar[targ_pvar$SNP %in% intersect(target_qc_snplist, ref_qc_snplist), ] -ref_pvar<-ref_pvar[ref_pvar$SNP %in% intersect(target_qc_snplist, ref_qc_snplist), ] - -# insert IUPAC codes -targ_pvar$IUPAC <- snp_iupac(targ_pvar$A1, targ_pvar$A2) -ref_pvar$IUPAC <- snp_iupac(ref_pvar$A1, ref_pvar$A2) - -# Identify SNPs present in both samples (allowing for strand flips) -# Identify SNPs that need to be flipped -target_ref<-merge(targ_pvar, ref_pvar, by='SNP') -flip <- detect_strand_flip(target_ref$IUPAC.x, target_ref$IUPAC.y) - -flip_snplist<-NULL -if(sum(flip) > 0){ - flip_snplist<-target_ref$SNP.y[flip] - log_add(log_file = log_file, message = paste0(sum(flip), 'variants will be flipped.')) -} - -# Remove variants where IUPAC codes do not match (allowing for strand flips) -matched <- which((target_ref$IUPAC.x == target_ref$IUPAC.y) | flip) -target_ref<-target_ref[matched,] - -log_add(log_file = log_file, message = paste0(nrow(target_ref),' variants match between target and reference after QC.')) - -########### -# Identify list of LD independent SNPs -########### - -log_add(log_file = log_file, message = 'Identifying LD independent SNPs based on reference data.') - -# Subset ref_pvar to contain QC'd variants -ref_pvar<-ref_pvar[ref_pvar$SNP %in% target_ref$SNP,] - -# Remove regions of high LD -ref_pvar <- remove_regions(dat = ref_pvar, regions = long_ld_coord) -log_add(log_file = log_file, message = paste0(nrow(ref_pvar),' variants after removal of LD high regions.')) - -# Perform LD pruning -ld_indep <- plink_prune(pfile = opt$ref_plink_chr_subset, plink2 = opt$plink2, extract = ref_pvar$SNP, chr = CHROMS) -log_add(log_file = log_file, message = paste0(length(ld_indep),' independent variants retained.')) - -########### -# Perform PCA based on reference -########### - -log_add(log_file = log_file, message = 'Performing PCA based on reference.') - -snp_weights<-plink_pca(pfile = opt$ref_plink_chr_subset, chr = CHROMS, plink2 = opt$plink2, extract = ld_indep, flip = flip_snplist, n_pc = opt$n_pcs) -fwrite(snp_weights, paste0(tmp_dir,'/ref.eigenvec.var'), row.names = F, quote=F, sep=' ', na='NA') - -### -# Calculate PCs in the reference sample for scaling the target sample factor scores. -### - -log_add(log_file = log_file, message = 'Computing reference PCs.') - -# Calculate PCs in the reference -ref_pcs<-plink_score(pfile = opt$ref_plink_chr_subset, chr = CHROMS, plink2 = opt$plink2, score = paste0(tmp_dir,'/ref.eigenvec.var')) - -# Scale across all individuals -ref_pcs_centre_scale <- score_mean_sd(scores = ref_pcs) - -### -# Create model predicting reference populations -### - -log_add(log_file = log_file, message = 'Deriving model predicting ref_pop groups.') - -# Read in reference pop_data -pop_data <- read_pop_data(opt$pop_data) - -# Scale the reference PCs -ref_pcs_scaled<-score_scale(score = ref_pcs, ref_scale = ref_pcs_centre_scale) -ref_pcs_scaled_pop<-merge(ref_pcs_scaled, pop_data, by=c('FID','IID')) - -# Build model -model <- train(y=as.factor(ref_pcs_scaled_pop$POP), x=ref_pcs_scaled_pop[, grepl('PC',names(ref_pcs_scaled_pop)), with=F], method=opt$model_method, metric='logLoss', trControl=trainControl(method="cv", number=5, classProbs= TRUE, savePredictions = 'final', summaryFunction = multiClassSummary)) - -saveRDS(model$finalModel, paste0(opt$output,'.model.rds')) - -##### -# Calculate PCs in target sample -##### - -log_add(log_file = log_file, message = 'Calculating PCs in the target sample.') -targ_pcs<-plink_score(pfile = opt$target_plink_chr_subset, chr = CHROMS, plink2 = opt$plink2, score = paste0(tmp_dir,'/ref.eigenvec.var')) -targ_pcs_scaled<-score_scale(score = targ_pcs, ref_scale = ref_pcs_centre_scale) - -### -# Create plot PC scores of target sample compared to the reference -### - -log_add(log_file = log_file, message = 'Plotting target sample PCs on reference.') - -# Combine ref and targ PCs -targ_pcs_scaled$POP<-'Target' -ref_pcs_targ_pcs<-rbind(ref_pcs_scaled_pop,targ_pcs_scaled) - -PC_1_2<-ggplot(ref_pcs_targ_pcs[ref_pcs_targ_pcs$POP != 'Target',], aes(x=PC1,y=PC2, colour=POP)) + - geom_point() + - geom_point(data=ref_pcs_targ_pcs[ref_pcs_targ_pcs$POP == 'Target',], aes(x=PC1,y=PC2), colour='black', shape=21) + - labs(title = "PCs 1 and 2", colour="") + - theme_half_open() + - background_grid() + - theme(plot.title = element_text(hjust = 0.5)) - -PC_3_4<-ggplot(ref_pcs_targ_pcs[ref_pcs_targ_pcs$POP != 'Target',], aes(x=PC3,y=PC4, colour=POP)) + - geom_point() + - geom_point(data=ref_pcs_targ_pcs[ref_pcs_targ_pcs$POP == 'Target',], aes(x=PC3,y=PC4), colour='black', shape=21) + - labs(title = "PCs 3 and 4", colour="") + - theme_half_open() + - background_grid() + - theme(plot.title = element_text(hjust = 0.5)) - -png(paste0(opt$output,'.pc_plot.png'), units='px', res=300, width=4000, height=2000) - plot_grid(PC_1_2,PC_3_4) -dev.off() - -### -# Estimate probability of outcomes in model -### - -log_add(log_file = log_file, message = 'Inferring population membership in target.') - -# Read in model -pop_model_pred<-predict(object = model$finalModel, newx = data.matrix(targ_pcs_scaled[, grepl('PC',names(targ_pcs_scaled)), with=F]), type = "response", s=model$finalModel$lambdaOpt) -pop_model_pred<-as.data.frame.table(pop_model_pred) -pop_model_pred<-data.table( FID=targ_pcs_scaled$FID, - IID=targ_pcs_scaled$IID, - pop=as.character(pop_model_pred$Var2), - prob=round(pop_model_pred$Freq,3)) - -pop_model_pred<-dcast.data.table(pop_model_pred, formula=FID + IID~pop, value.var = "prob") - -fwrite(pop_model_pred, paste0(opt$output,'.model_pred'), sep='\t') - -# Create keep files based on the results -dir.create(paste0(opt$out_dir,'/keep_files/model_based'), recursive = T) -if(!is.na(opt$prob_thresh)){ - pop_model_pred$max_prob<-apply(pop_model_pred[,-1:-2], 1, max) - pop_model_pred<-pop_model_pred[pop_model_pred$max_prob > opt$prob_thresh,] - pop_model_pred$max_prob<-NULL -} - -N_group<-NULL -for(i in names(pop_model_pred[,-1:-2])){ - tmp_keep<-pop_model_pred[apply(pop_model_pred[,-1:-2], 1, function(x) x[i] == max(x)),1:2] - N_group<-rbind(N_group, data.frame(Group=i, N=nrow(tmp_keep))) - fwrite(tmp_keep, paste0(opt$out_dir,'/keep_files/model_based/',i,'.keep'), sep=' ', col.names=F) -} - -N_group<-rbind(N_group, data.frame(Group='Unassigned', N=nrow(targ_pcs_scaled) - nrow(pop_model_pred))) - -sink(file = log_file, append = T) -cat('----------\n') -cat('N per group based on model:\n') -print.data.frame(N_group, row.names = FALSE, quote = FALSE, right = FALSE) -cat('----------\n') -sink() - -if(opt$sd_rule){ - dir.create(paste0(opt$out_dir,'/keep_files/sd_based'), recursive = T) - N_group<-NULL - for(pop_i in unique(pop_data$POP)){ - - # Calculate scale of PCs within reference population - ref_pcs_scaled_i <- score_mean_sd(scores = ref_pcs, keep = pop_data[pop_data$POP == pop_i, c('FID','IID'), with=F]) - - # Scale the target PC based on reference mean and SD - targ_pcs_scaled_i<-score_scale(score = targ_pcs, ref_scale = ref_pcs_scaled_i) - - # Identify individuals with PCs <3 - targ_pcs_scaled_i<-targ_pcs_scaled_i[!apply(targ_pcs_scaled_i[,-1:-2], 1, function(x) any(x > 3 | x < -3)),] - - N_group<-rbind(N_group, data.frame(Group=pop_i, N=nrow(targ_pcs_scaled_i))) - - # Save keep file of individuals that fit the population - fwrite(targ_pcs_scaled_i[,1:2], paste0(opt$out_dir,'/keep_files/sd_based/',pop_i,'.keep'), col.names=F, sep='\t') - } - - sink(file = log_file, append = T) - cat('----------\n') - cat('N per group based on SD rule:\n') - print.data.frame(N_group, row.names = FALSE, quote = FALSE, right = FALSE) - cat('----------\n') - sink() -} - -end.time <- Sys.time() -time.taken <- end.time - start.time -sink(file = paste(opt$output,'.log',sep=''), append = T) -cat('Analysis finished at',as.character(end.time),'\n') -cat('Analysis duration was',as.character(round(time.taken,2)),attr(time.taken, 'units'),'\n') -sink() +#!/usr/bin/Rscript +# This script was written by Oliver Pain whilst at King's College London University. +start.time <- Sys.time() +library("optparse") + +option_list = list( +make_option("--target_plink_chr", action="store", default=NULL, type='character', + help="Path to per chromosome target PLINK2 files [required]"), +make_option("--target_keep", action="store", default=NULL, type='character', + help="Path to file listing individuals in the target sample to retain [optional]"), +make_option("--ref_plink_chr", action="store", default=NULL, type='character', + help="Path to per chromosome reference PLINK2 files [required]"), +make_option("--ref_keep", action="store", default=NULL, type='character', + help="Path to file listing individuals in the reference sample to retain [optional]"), +make_option("--maf", action="store", default=0.05, type='numeric', + help="Minor allele frequency threshold [optional]"), +make_option("--geno", action="store", default=0.02, type='numeric', + help="Variant missingness threshold [optional]"), +make_option("--hwe", action="store", default=1e-6, type='numeric', + help="Hardy Weinberg p-value threshold. [optional]"), +make_option("--n_pcs", action="store", default=6, type='numeric', + help="Number of PCs (min=4) [optional]"), +make_option("--plink2", action="store", default='plink2', type='character', + help="Path PLINK software binary [optional]"), +make_option("--output", action="store", default=NULL, type='character', + help="Path for output files [required]"), +make_option("--pop_data", action="store", default=NULL, type='character', + help="Population data for the reference samples [required]"), +make_option("--model_method", action="store", default='glmnet', type='character', + help="Method used for generate prediction model [optional]"), +make_option("--sd_rule", action="store", default=F, type='logical', + help="Logical indicating whether the 3SD rule should be used to define ancestry, or the model-based approach [optional]"), +make_option("--prob_thresh", action="store", default=0.95, type='numeric', + help="Indicates whether probability threshold should be used when defining ancestry [optional]"), +make_option("--test", action="store", default=NA, type='character', + help="Specify test mode [optional]"), +make_option("--memory", action="store", default=5000, type='numeric', + help="Memory limit [optional]") +) + +opt = parse_args(OptionParser(option_list=option_list)) + +# Load dependencies +library(GenoUtils) +source('../functions/misc.R') +source_all('../functions') +library(data.table) +library(caret) +library(pROC) +library(verification) +library(ggplot2) +library(cowplot) + +# Check required inputs +if(is.null(opt$target_plink_chr)){ + stop('--target_plink_chr must be specified.\n') +} +if(is.null(opt$ref_plink_chr)){ + stop('--ref_plink_chr must be specified.\n') +} +if(is.null(opt$output)){ + stop('--output must be specified.\n') +} +if(is.null(opt$pop_data)){ + stop('--pop_data must be specified.\n') +} + +# Create output directory +opt$out_dir<-paste0(dirname(opt$output),'/') +system(paste0('mkdir -p ',opt$out_dir)) + +# Create temp directory +tmp_dir<-tempdir() + +# Initiate log file +log_file <- paste0(opt$output,'.log') +log_header(log_file = log_file, opt = opt, script = 'Ancestry_identifier.R', start.time = start.time) + + +# If testing, change CHROMS to chr value, and lower ancestry probability threshold +if(!is.na(opt$test) && opt$test == 'NA'){ + opt$test<-NA +} +if(!is.na(opt$test)){ + CHROMS <- as.numeric(gsub('chr','',opt$test)) + opt$prob_thresh <- 0.5 + log_add(log_file = log_file, message = 'Lowering prob_thresh parameter to 0.5 for testing.') +} + +if(nrow(fread(paste0(opt$ref_plink_chr, CHROMS[1],'.psam'))) < 100){ + stop('opt$ref_plink_chr must contain at least 100 individuals.') +} + +########### +# Extract target_keep +########### + +if(!is.null(opt$target_keep)){ + plink_subset(keep = opt$target_keep, chr = CHROMS, plink2 = opt$plink2, pfile = opt$target_plink_chr, out = paste0(tmp_dir,'/target_subset.chr')) + opt$target_plink_chr_subset<-paste0(tmp_dir,'/target_subset') +} else { + opt$target_plink_chr_subset<-opt$target_plink_chr +} + +########### +# Extract ref_keep +########### + +if(!is.null(opt$ref_keep)){ + plink_subset(keep = opt$ref_keep, chr = CHROMS, plink2 = opt$plink2, pfile = opt$ref_plink_chr, out = paste0(tmp_dir,'/ref_subset.chr')) + opt$ref_plink_chr_subset<-paste0(tmp_dir,'/ref_subset.chr') +} else { + opt$ref_plink_chr_subset<-opt$ref_plink_chr +} + +########### +# QC target +########### + +# If target sample size is <100, only apply SNP missingness parameter +psam<-fread(paste0(opt$target_plink_chr_subset, CHROMS[1], '.psam')) + +if(nrow(psam) > 100){ + target_qc_snplist<-plink_qc_snplist(pfile = opt$target_plink_chr_subset, chr = CHROMS, plink2 = opt$plink2, geno = opt$geno, maf = opt$maf, hwe = opt$hwe) +} else { + target_qc_snplist<-plink_qc_snplist(pfile = opt$target_plink_chr_subset, chr = CHROMS, plink2 = opt$plink2, geno = opt$geno) + log_add(log_file = log_file, message = 'Target sample size is <100 so only checking genotype missingness.') +} + +########### +# QC reference +########### + +ref_qc_snplist<-plink_qc_snplist(pfile = opt$ref_plink_chr_subset, chr = CHROMS, plink2 = opt$plink2, geno = opt$geno, maf = opt$maf, hwe = opt$hwe) + +########### +# Harmonise target and reference genetic data +########### + +# read in target pvar file +targ_pvar<-read_pvar(opt$target_plink_chr_subset, chr = CHROMS) + +# read in reference pvar file +ref_pvar<-read_pvar(opt$ref_plink_chr_subset, chr = CHROMS) + +# retain variants surviving QC +targ_pvar<-targ_pvar[targ_pvar$SNP %in% intersect(target_qc_snplist, ref_qc_snplist), ] +ref_pvar<-ref_pvar[ref_pvar$SNP %in% intersect(target_qc_snplist, ref_qc_snplist), ] + +# insert IUPAC codes +targ_pvar$IUPAC <- snp_iupac(targ_pvar$A1, targ_pvar$A2) +ref_pvar$IUPAC <- snp_iupac(ref_pvar$A1, ref_pvar$A2) + +# Identify SNPs present in both samples (allowing for strand flips) +# Identify SNPs that need to be flipped +target_ref<-merge(targ_pvar, ref_pvar, by='SNP') +flip <- detect_strand_flip(target_ref$IUPAC.x, target_ref$IUPAC.y) + +flip_snplist<-NULL +if(sum(flip) > 0){ + flip_snplist<-target_ref$SNP.y[flip] + log_add(log_file = log_file, message = paste0(sum(flip), 'variants will be flipped.')) +} + +# Remove variants where IUPAC codes do not match (allowing for strand flips) +matched <- which((target_ref$IUPAC.x == target_ref$IUPAC.y) | flip) +target_ref<-target_ref[matched,] + +log_add(log_file = log_file, message = paste0(nrow(target_ref),' variants match between target and reference after QC.')) + +########### +# Identify list of LD independent SNPs +########### + +log_add(log_file = log_file, message = 'Identifying LD independent SNPs based on reference data.') + +# Subset ref_pvar to contain QC'd variants +ref_pvar<-ref_pvar[ref_pvar$SNP %in% target_ref$SNP,] + +# Remove regions of high LD +ref_pvar <- remove_regions(dat = ref_pvar, regions = long_ld_coord) +log_add(log_file = log_file, message = paste0(nrow(ref_pvar),' variants after removal of LD high regions.')) + +# Perform LD pruning +ld_indep <- plink_prune(pfile = opt$ref_plink_chr_subset, plink2 = opt$plink2, extract = ref_pvar$SNP, chr = CHROMS) +log_add(log_file = log_file, message = paste0(length(ld_indep),' independent variants retained.')) + +########### +# Perform PCA based on reference +########### + +log_add(log_file = log_file, message = 'Performing PCA based on reference.') + +snp_weights<-plink_pca(pfile = opt$ref_plink_chr_subset, chr = CHROMS, plink2 = opt$plink2, extract = ld_indep, flip = flip_snplist, n_pc = opt$n_pcs) +fwrite(snp_weights, paste0(tmp_dir,'/ref.eigenvec.var'), row.names = F, quote=F, sep=' ', na='NA') + +### +# Calculate PCs in the reference sample for scaling the target sample factor scores. +### + +log_add(log_file = log_file, message = 'Computing reference PCs.') + +# Calculate PCs in the reference +ref_pcs<-plink_score(pfile = opt$ref_plink_chr_subset, chr = CHROMS, plink2 = opt$plink2, score = paste0(tmp_dir,'/ref.eigenvec.var')) + +# Scale across all individuals +ref_pcs_centre_scale <- score_mean_sd(scores = ref_pcs) + +### +# Create model predicting reference populations +### + +log_add(log_file = log_file, message = 'Deriving model predicting ref_pop groups.') + +# Read in reference pop_data +pop_data <- read_pop_data(opt$pop_data) + +# Scale the reference PCs +ref_pcs_scaled<-score_scale(score = ref_pcs, ref_scale = ref_pcs_centre_scale) +ref_pcs_scaled_pop<-merge(ref_pcs_scaled, pop_data, by=c('FID','IID')) + +# Build model +model <- train(y=as.factor(ref_pcs_scaled_pop$POP), x=ref_pcs_scaled_pop[, grepl('PC',names(ref_pcs_scaled_pop)), with=F], method=opt$model_method, metric='logLoss', trControl=trainControl(method="cv", number=5, classProbs= TRUE, savePredictions = 'final', summaryFunction = multiClassSummary)) + +saveRDS(model$finalModel, paste0(opt$output,'.model.rds')) + +##### +# Calculate PCs in target sample +##### + +log_add(log_file = log_file, message = 'Calculating PCs in the target sample.') +targ_pcs<-plink_score(pfile = opt$target_plink_chr_subset, chr = CHROMS, plink2 = opt$plink2, score = paste0(tmp_dir,'/ref.eigenvec.var')) +targ_pcs_scaled<-score_scale(score = targ_pcs, ref_scale = ref_pcs_centre_scale) + +### +# Create plot PC scores of target sample compared to the reference +### + +log_add(log_file = log_file, message = 'Plotting target sample PCs on reference.') + +# Combine ref and targ PCs +targ_pcs_scaled$POP<-'Target' +ref_pcs_targ_pcs<-rbind(ref_pcs_scaled_pop,targ_pcs_scaled) + +PC_1_2<-ggplot(ref_pcs_targ_pcs[ref_pcs_targ_pcs$POP != 'Target',], aes(x=PC1,y=PC2, colour=POP)) + + geom_point() + + geom_point(data=ref_pcs_targ_pcs[ref_pcs_targ_pcs$POP == 'Target',], aes(x=PC1,y=PC2), colour='black', shape=21) + + labs(title = "PCs 1 and 2", colour="") + + theme_half_open() + + background_grid() + + theme(plot.title = element_text(hjust = 0.5)) + +PC_3_4<-ggplot(ref_pcs_targ_pcs[ref_pcs_targ_pcs$POP != 'Target',], aes(x=PC3,y=PC4, colour=POP)) + + geom_point() + + geom_point(data=ref_pcs_targ_pcs[ref_pcs_targ_pcs$POP == 'Target',], aes(x=PC3,y=PC4), colour='black', shape=21) + + labs(title = "PCs 3 and 4", colour="") + + theme_half_open() + + background_grid() + + theme(plot.title = element_text(hjust = 0.5)) + +png(paste0(opt$output,'.pc_plot.png'), units='px', res=300, width=4000, height=2000) + plot_grid(PC_1_2,PC_3_4) +dev.off() + +### +# Estimate probability of outcomes in model +### + +log_add(log_file = log_file, message = 'Inferring population membership in target.') + +# Read in model +pop_model_pred<-predict(object = model$finalModel, newx = data.matrix(targ_pcs_scaled[, grepl('PC',names(targ_pcs_scaled)), with=F]), type = "response", s=model$finalModel$lambdaOpt) +pop_model_pred<-as.data.frame.table(pop_model_pred) +pop_model_pred<-data.table( FID=targ_pcs_scaled$FID, + IID=targ_pcs_scaled$IID, + pop=as.character(pop_model_pred$Var2), + prob=round(pop_model_pred$Freq,3)) + +pop_model_pred<-dcast.data.table(pop_model_pred, formula=FID + IID~pop, value.var = "prob") + +fwrite(pop_model_pred, paste0(opt$output,'.model_pred'), sep='\t') + +# Create keep files based on the results +dir.create(paste0(opt$out_dir,'/keep_files/model_based'), recursive = T) +if(!is.na(opt$prob_thresh)){ + pop_model_pred$max_prob<-apply(pop_model_pred[,-1:-2], 1, max) + pop_model_pred<-pop_model_pred[pop_model_pred$max_prob > opt$prob_thresh,] + pop_model_pred$max_prob<-NULL +} + +N_group<-NULL +for(i in names(pop_model_pred[,-1:-2])){ + tmp_keep<-pop_model_pred[apply(pop_model_pred[,-1:-2], 1, function(x) x[i] == max(x)),1:2] + N_group<-rbind(N_group, data.frame(Group=i, N=nrow(tmp_keep))) + fwrite(tmp_keep, paste0(opt$out_dir,'/keep_files/model_based/',i,'.keep'), sep=' ', col.names=F) +} + +N_group<-rbind(N_group, data.frame(Group='Unassigned', N=nrow(targ_pcs_scaled) - nrow(pop_model_pred))) + +sink(file = log_file, append = T) +cat('----------\n') +cat('N per group based on model:\n') +print.data.frame(N_group, row.names = FALSE, quote = FALSE, right = FALSE) +cat('----------\n') +sink() + +if(opt$sd_rule){ + dir.create(paste0(opt$out_dir,'/keep_files/sd_based'), recursive = T) + N_group<-NULL + for(pop_i in unique(pop_data$POP)){ + + # Calculate scale of PCs within reference population + ref_pcs_scaled_i <- score_mean_sd(scores = ref_pcs, keep = pop_data[pop_data$POP == pop_i, c('FID','IID'), with=F]) + + # Scale the target PC based on reference mean and SD + targ_pcs_scaled_i<-score_scale(score = targ_pcs, ref_scale = ref_pcs_scaled_i) + + # Identify individuals with PCs <3 + targ_pcs_scaled_i<-targ_pcs_scaled_i[!apply(targ_pcs_scaled_i[,-1:-2], 1, function(x) any(x > 3 | x < -3)),] + + N_group<-rbind(N_group, data.frame(Group=pop_i, N=nrow(targ_pcs_scaled_i))) + + # Save keep file of individuals that fit the population + fwrite(targ_pcs_scaled_i[,1:2], paste0(opt$out_dir,'/keep_files/sd_based/',pop_i,'.keep'), col.names=F, sep='\t') + } + + sink(file = log_file, append = T) + cat('----------\n') + cat('N per group based on SD rule:\n') + print.data.frame(N_group, row.names = FALSE, quote = FALSE, right = FALSE) + cat('----------\n') + sink() +} + +end.time <- Sys.time() +time.taken <- end.time - start.time +sink(file = paste(opt$output,'.log',sep=''), append = T) +cat('Analysis finished at',as.character(end.time),'\n') +cat('Analysis duration was',as.character(round(time.taken,2)),attr(time.taken, 'units'),'\n') +sink() diff --git a/Scripts/format_target/format_target.R b/Scripts/format_target/format_target.R index 124c5b8c..69055d68 100644 --- a/Scripts/format_target/format_target.R +++ b/Scripts/format_target/format_target.R @@ -151,11 +151,9 @@ targ_pvar<-targ_pvar[,c('CHR','BP','SNP','A2','A1'),with=F] # Label SNP with _dup if the RSID is duplicated, so these variants are removed. dup_snp<-duplicated(targ_pvar$SNP) -log_add(log_file = log_file, message = paste0('Removing ', sum(dup_snp),' duplicate variants.')) +log_add(log_file = log_file, message = paste0('Removing ', sum(dup_snp),' duplicate variants - May have IUPAC NA.')) targ_pvar$SNP[dup_snp]<-paste0(targ_pvar$SNP[dup_snp],'_dup') -log_add(log_file = log_file, message = paste0(sum(!dup_snp)," of ", nrow(ref)," reference variants are in the target.")) - # Write out new bim file names(targ_pvar)<-c('#CHROM','POS','ID','REF','ALT') fwrite(targ_pvar, paste0(tmp_dir,'/subset.pvar'), col.names=T, row.names=F, quote=F, na='NA', sep=' ') @@ -180,7 +178,7 @@ ref_psam<-fread(paste0(opt$ref,'.psam')) names(ref_psam)<-gsub('\\#', '', names(ref_psam)) ref_psam <- ref_psam[, names(ref_psam) %in% c('FID', 'IID'), with = F] if(ncol(ref_psam) == 1){ - ref_ID_update<-data.frame(ref_psam$`IID`, paste0(ref_psam$`#IID`,'_REF')) + ref_ID_update<-data.frame(ref_psam$`IID`, paste0(ref_psam$`IID`,'_REF')) } else { ref_ID_update<-data.frame(ref_psam$`FID`, ref_psam$`IID`, paste0(ref_psam$`FID`,'_REF'), paste0(ref_psam$`IID`,'_REF')) } @@ -188,10 +186,14 @@ fwrite(ref_ID_update, paste0(tmp_dir,'/ref_ID_update.txt'), sep=' ', col.names=F system(paste0(opt$plink2,' --pfile ',opt$ref,' --make-pgen --update-ids ',tmp_dir,'/ref_ID_update.txt --out ',tmp_dir,'/REF --memory 5000 --threads 1')) # Merge target and reference plink files to insert missing SNPs -system(paste0(opt$plink2,' --pfile ',tmp_dir,'/subset --pmerge ',tmp_dir,'/REF --make-pgen --memory 5000 --threads 1 --out ',tmp_dir,'/subset')) +# plink2's pmerge only handles concatenation for the time being +# In the meantime, convert the ref and target into plink1 binaries, merge, and then convert back to plink2 binaries +system(paste0(opt$plink2,' --pfile ',tmp_dir,'/subset --make-bed --memory 5000 --threads 1 --out ',tmp_dir,'/subset')) +system(paste0(opt$plink2,' --pfile ',tmp_dir,'/REF --make-bed --out ',tmp_dir,'/REF --memory 5000 --threads 1')) +system(paste0(opt$plink,' --bfile ',tmp_dir,'/subset --bmerge ',tmp_dir,'/REF --make-bed --allow-no-sex --out ',tmp_dir,'/ref_targ')) # Extract only target individuals -system(paste0(opt$plink2,' --pfile ',tmp_dir,'/subset --remove ',tmp_dir,'/REF.psam --make-pgen --memory 5000 --threads 1 --out ',opt$output)) +system(paste0(opt$plink2,' --bfile ',tmp_dir,'/ref_targ --remove ',tmp_dir,'/REF.psam --make-pgen --memory 5000 --threads 1 --out ',opt$output)) end.time <- Sys.time() time.taken <- end.time - start.time diff --git a/Scripts/pipeline_reports/indiv_report_creator.Rmd b/Scripts/pipeline_reports/indiv_report_creator.Rmd index f780da3b..33f543df 100644 --- a/Scripts/pipeline_reports/indiv_report_creator.Rmd +++ b/Scripts/pipeline_reports/indiv_report_creator.Rmd @@ -124,11 +124,11 @@ for(chr in CHROMS){ # Count the number of variants in the target sample data that match reference variants nvar_in_target <- sum(sapply(format_target_logs, function(x) - as.numeric(gsub(' .*', '', x[grepl('reference variants are in the target.$', x)])))) + as.numeric(gsub('.* ','', gsub(' reference variants.', '', x[grepl('^Target contains', x) & grepl('reference variants.$', x)]))))) nvar_in_ref <- sum(sapply(format_target_logs, function(x) - as.numeric(gsub('.* ','', gsub(' reference variants are in the target.', '', x[grepl('reference variants are in the target.$', x)]))))) + as.numeric(gsub('.* ','', gsub(' variants', '', x[grepl('^Reference data contains ', x)]))))) if(imp_incl){ diff --git a/Scripts/pipeline_reports/samp_report_creator.Rmd b/Scripts/pipeline_reports/samp_report_creator.Rmd index f95391e5..b986b5fe 100644 --- a/Scripts/pipeline_reports/samp_report_creator.Rmd +++ b/Scripts/pipeline_reports/samp_report_creator.Rmd @@ -105,11 +105,11 @@ for(chr in CHROMS){ # Count the number of variants in the target sample data that match reference variants nvar_in_target <- sum(sapply(format_target_logs, function(x) - as.numeric(gsub(' .*', '', x[grepl('reference variants are in the target.$', x)])))) + as.numeric(gsub('.* ','', gsub(' reference variants.', '', x[grepl('^Target contains', x) & grepl('reference variants.$', x)]))))) nvar_in_ref <- sum(sapply(format_target_logs, function(x) - as.numeric(gsub('.* ','', gsub(' reference variants are in the target.', '', x[grepl('reference variants are in the target.$', x)]))))) + as.numeric(gsub('.* ','', gsub(' variants', '', x[grepl('^Reference data contains ', x)]))))) cat0("- The target sample contains ", nrow(target_psam), " individuals.\n") cat0("- The target genotype data was provided in ", target_list$type[target_list$name == params$name], " format.\n") diff --git a/Scripts/target_scoring/target_scoring.R b/Scripts/target_scoring/target_scoring.R index 97ea7a36..e145b240 100644 --- a/Scripts/target_scoring/target_scoring.R +++ b/Scripts/target_scoring/target_scoring.R @@ -80,7 +80,16 @@ if(!is.na(opt$test)){ ##### log_add(log_file = log_file, message = 'Calculating polygenic scores in the target sample.') -scores<-plink_score(pfile = opt$target_plink_chr, chr = CHROMS, plink2 = opt$plink2, score = opt$ref_score, keep = opt$target_keep, frq = opt$ref_freq_chr, threads = opt$n_cores) +scores <- + plink_score( + pfile = opt$target_plink_chr, + chr = CHROMS, + plink2 = opt$plink2, + score = opt$ref_score, + keep = opt$target_keep, + frq = opt$ref_freq_chr, + threads = opt$n_cores + ) ### # Scale the polygenic scores based on the reference diff --git a/docs/Images/OpenSNP/genopred-v1-comp-yengo_eur.png b/docs/Images/OpenSNP/genopred-v1-comp-yengo_eur.png index 29af679a..e616c4b6 100644 Binary files a/docs/Images/OpenSNP/genopred-v1-comp-yengo_eur.png and b/docs/Images/OpenSNP/genopred-v1-comp-yengo_eur.png differ diff --git a/docs/Images/OpenSNP/genopred-v1-comp_strict-yengo_eur.png b/docs/Images/OpenSNP/genopred-v1-comp_strict-yengo_eur.png index 05adbc33..4cb55eff 100644 Binary files a/docs/Images/OpenSNP/genopred-v1-comp_strict-yengo_eur.png and b/docs/Images/OpenSNP/genopred-v1-comp_strict-yengo_eur.png differ diff --git a/docs/Images/OpenSNP/genopred-v1-comp_strict_eur-yengo_eur.png b/docs/Images/OpenSNP/genopred-v1-comp_strict_eur-yengo_eur.png index 226613ee..00492dd4 100644 Binary files a/docs/Images/OpenSNP/genopred-v1-comp_strict_eur-yengo_eur.png and b/docs/Images/OpenSNP/genopred-v1-comp_strict_eur-yengo_eur.png differ diff --git a/docs/Images/OpenSNP/genopred-yengo_eas.png b/docs/Images/OpenSNP/genopred-yengo_eas.png index 62a3041d..ed7f95be 100644 Binary files a/docs/Images/OpenSNP/genopred-yengo_eas.png and b/docs/Images/OpenSNP/genopred-yengo_eas.png differ diff --git a/docs/Images/OpenSNP/genopred-yengo_eur-external.png b/docs/Images/OpenSNP/genopred-yengo_eur-external.png index c9ed4d23..802e6290 100644 Binary files a/docs/Images/OpenSNP/genopred-yengo_eur-external.png and b/docs/Images/OpenSNP/genopred-yengo_eur-external.png differ diff --git a/docs/Images/OpenSNP/genopred-yengo_eur.png b/docs/Images/OpenSNP/genopred-yengo_eur.png index e9629c5b..d8f5d193 100644 Binary files a/docs/Images/OpenSNP/genopred-yengo_eur.png and b/docs/Images/OpenSNP/genopred-yengo_eur.png differ diff --git a/docs/Images/OpenSNP/time_cpu_bench_pgs_methods.csv b/docs/Images/OpenSNP/time_cpu_bench_pgs_methods.csv index e579e199..7879ac56 100644 --- a/docs/Images/OpenSNP/time_cpu_bench_pgs_methods.csv +++ b/docs/Images/OpenSNP/time_cpu_bench_pgs_methods.csv @@ -1,8 +1,8 @@ method,s,h:m:s,max_rss,max_vms,max_uss,max_pss,io_in,io_out,mean_load,cpu_time,file,rule,label -dbslmm,670.2083,0:11:10,1150.19,518875.48,1127.88,1128.51,0,792.55,7.77,272.6,prep_pgs_dbslmm_i-yengo_eur.txt,prep_pgs_dbslmm_i,DBSLMM -lassosum,234.1869,0:03:54,4270.06,521977.98,4255.07,4255.75,0,1635.82,43.3,102.93,prep_pgs_lassosum_i-yengo_eur.txt,prep_pgs_lassosum_i,lassosum -ldpred2,1895.106,0:31:35,20009.57,537189.04,19997.63,19998.87,0,18987.26,23.69,495.81,prep_pgs_ldpred2_i-yengo_eur.txt,prep_pgs_ldpred2_i,LDpred2 -megaprs,3974.4036,1:06:14,12626.76,523095.61,12613.95,12615.65,0,21801.18,293.76,11922.15,prep_pgs_megaprs_i-yengo_eur.txt,prep_pgs_megaprs_i,MegaPRS -prscs,13389.7369,3:43:09,11977.42,16895.49,7635.95,8028.87,158.48,438.68,938.84,127317.72,prep_pgs_prscs_i-yengo_eur.txt,prep_pgs_prscs_i,PRS-CS -ptclump,38.5799,0:00:38,391.48,4519.65,378.23,378.57,0,213.75,25.98,12.91,prep_pgs_ptclump_i-yengo_eur.txt,prep_pgs_ptclump_i,pT+clump -sbayesr,1850.8346,0:30:50,23888.48,45932.89,20929.11,21196.32,0,2811.34,828.8,15532.77,prep_pgs_sbayesr_i-yengo_eur.txt,prep_pgs_sbayesr_i,SBayesR +dbslmm,584.3754,0:09:44,1250.93,518932.27,1228.86,1235.85,0,832.08,9.55,350.71,prep_pgs_dbslmm_i-yengo_eur.txt,prep_pgs_dbslmm_i,DBSLMM +lassosum,222.346,0:03:42,4320.55,522076.59,4307.17,4307.96,0,1600.7,45.6,107.35,prep_pgs_lassosum_i-yengo_eur.txt,prep_pgs_lassosum_i,lassosum +ldpred2,4677.1611,1:17:57,25796.93,537282.12,25580.52,25582.33,0,18703.18,36.04,1752.48,prep_pgs_ldpred2_i-yengo_eur.txt,prep_pgs_ldpred2_i,LDpred2 +megaprs,3931.2688,1:05:31,12622.96,523078.8,12610.11,12610.78,0,21771.48,293.97,11783.88,prep_pgs_megaprs_i-yengo_eur.txt,prep_pgs_megaprs_i,MegaPRS +prscs,13365.5906,3:42:45,11930.78,16875.75,7588.81,7982.41,0,421.89,942.42,127479.74,prep_pgs_prscs_i-yengo_eur.txt,prep_pgs_prscs_i,PRS-CS +ptclump,27.0675,0:00:27,399.71,4519.64,385.74,388.07,0,243.38,32.97,12.32,prep_pgs_ptclump_i-yengo_eur.txt,prep_pgs_ptclump_i,pT+clump +sbayesr,2255.2539,0:37:35,23888.02,45932.63,20929.12,21196.56,0,2877.13,864.48,19629.33,prep_pgs_sbayesr_i-yengo_eur.txt,prep_pgs_sbayesr_i,SBayesR diff --git a/docs/Images/OpenSNP/time_cpu_bench_pgs_methods.png b/docs/Images/OpenSNP/time_cpu_bench_pgs_methods.png index e094f76f..e8070372 100644 Binary files a/docs/Images/OpenSNP/time_cpu_bench_pgs_methods.png and b/docs/Images/OpenSNP/time_cpu_bench_pgs_methods.png differ diff --git a/docs/opensnp_benchmark.Rmd b/docs/opensnp_benchmark.Rmd index bf7b5289..3bbcc76d 100644 --- a/docs/opensnp_benchmark.Rmd +++ b/docs/opensnp_benchmark.Rmd @@ -134,7 +134,7 @@ library(data.table) # Create config file conf <- c( - 'outdir: /users/k1806347/oliverpainfel/Data/OpenSNP/GenoPred/test4', + 'outdir: /users/k1806347/oliverpainfel/Data/OpenSNP/GenoPred/test5', 'config_file: misc/opensnp/config.yaml', 'gwas_list: misc/opensnp/gwas_list.txt', 'score_list: misc/opensnp/score_list.txt', @@ -228,8 +228,8 @@ library(cowplot) # Read in configuration specific benchmark files bm_files_i <- paste0( - '/users/k1806347/oliverpainfel/Data/OpenSNP/GenoPred/test4/reference/benchmarks/', - list.files('/users/k1806347/oliverpainfel/Data/OpenSNP/GenoPred/test4/reference/benchmarks/') + '/users/k1806347/oliverpainfel/Data/OpenSNP/GenoPred/test5/reference/benchmarks/', + list.files('/users/k1806347/oliverpainfel/Data/OpenSNP/GenoPred/test5/reference/benchmarks/') ) # Read in benchmark files @@ -315,7 +315,7 @@ source_all('../functions') pheno <- fread('/users/k1806347/oliverpainfel/Data/OpenSNP/processed/pheno/height.txt') # Define pgs_methods used -pgs_methods <- c('external','ptclump', 'dbslmm', 'prscs', 'sbayesr', 'lassosum', 'ldpred2', 'megaprs') +pgs_methods <- c('external',read_param(config = 'misc/opensnp/config.yaml', param = 'pgs_methods', return_obj = F)) # Define pgs_methods applied to non-EUR GWAS pgs_methods_eur <- c('ptclump','lassosum','megaprs','prscs','dbslmm') @@ -673,6 +673,7 @@ keep_list <- fread('/users/k1806347/oliverpainfel/Data/OpenSNP/GenoPred/test1/op # Read in pgs gwas_list <- fread('misc/opensnp/gwas_list.txt') +gwas_list$name<-gsub('_','',gwas_list$name) pgs_methods <- c('pt_clump','dbslmm','prscs','sbayesr','lassosum','ldpred2','megaprs') pgs_methods_eur <- c('pt_clump','lassosum','megaprs') @@ -731,6 +732,7 @@ write.csv( row.names = F ) +cor<-fread('/scratch/prj/oliverpainfel/Data/OpenSNP/assoc/genopred-v1-yengo-assoc.csv') # Restrict to best only cor_subset <- NULL for(pop_i in unique(cor$pop)){ @@ -907,6 +909,7 @@ write.csv( '/scratch/prj/oliverpainfel/Data/OpenSNP/assoc/genopred-v1-strict-yengo-assoc.csv', row.names = F ) +cor<-fread('/scratch/prj/oliverpainfel/Data/OpenSNP/assoc/genopred-v1-strict-yengo-assoc.csv') # Restrict to best only cor_subset <- NULL @@ -1005,7 +1008,7 @@ dev.off() # Plot the same plot only using the EUR target population in OpenSNP ###### -tmp <- cor_both[cor_both$gwas == 'yengoeur' & cor_both$pop == 'EUR\n N = 653', ] +tmp <- cor_both[cor_both$gwas == 'yengoeur' & grepl('EUR', cor_both$pop), ] y_lim <- c(min(tmp$r - tmp$se), max(tmp$r + tmp$se)) v1_plot <- @@ -1028,7 +1031,7 @@ v1_plot <- background_grid() + theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5, size=12)) + - facet_grid(pop ~ Version) + + facet_grid(. ~ Version) + panel_border() v2_plot <- @@ -1051,7 +1054,7 @@ v2_plot <- background_grid() + theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5, size=12)) + - facet_grid(pop ~ Version) + + facet_grid(. ~ Version) + panel_border() png('/users/k1806347/oliverpainfel/Software/MyGit/GenoPred/docs/Images/OpenSNP/genopred-v1-comp_strict_eur-yengo_eur.png', @@ -1078,12 +1081,16 @@ dev.off() +
+
![Using same ancestry classification threshold](Images/OpenSNP/genopred-v1-comp_strict-yengo_eur.png)
+
+
![Showing results in European OpenSNP data only](Images/OpenSNP/genopred-v1-comp_strict_eur-yengo_eur.png) diff --git a/docs/opensnp_benchmark.html b/docs/opensnp_benchmark.html index 6e39cab3..b0381025 100644 --- a/docs/opensnp_benchmark.html +++ b/docs/opensnp_benchmark.html @@ -13,7 +13,7 @@ OpenSNP Benchmark - + @@ -35,8 +35,8 @@ - - + +