Merge pull request #82 from opain/dev

Dev
opain · Feb 22, 2024 · d4f0d80 · d4f0d80
2 parents 37e10d9 + fd1e4af
commit d4f0d80
Show file tree

Hide file tree

Showing 82 changed files with 8,171 additions and 2,296 deletions.
diff --git a/Scripts/Ancestry_identifier/Ancestry_identifier.R b/Scripts/Ancestry_identifier/Ancestry_identifier.R
diff --git a/Scripts/Ancestry_identifier/README.md b/Scripts/Ancestry_identifier/README.md
@@ -1,64 +1,62 @@
-# Ancestry_identifier.R
-
-This is script is used to infer the global ancestry of a target sample based on a reference-projected principal component (PC) elastic net model.
-
-## Pre-requisites
-The following software is required for the prediction pipeline:
-
-* PLINK v1.9 (https://www.cog-genomics.org/plink2/).
-* PLINK v2 (https://www.cog-genomics.org/plink/2.0/).
-* Per chromosome files for the desired reference genotype data in binary PLINK format (e.g. 1000 Genomes).
-* Per chromsome or genome-wide target genotype data in binary PLINK format.
-* Per chromsome or genome-wide target genotype data in binary PLINK format.
-* A file listing the path to a series of keep files for the reference genotype data used for scaling PCs.
-* A file indicating which population reference individuals correspond to.
-
-See example code below and corresponding files to see examples.
-
-* R packages:
-```R
-install.packages(c('data.table','caret','pROC','verification','ggplot2','cowplot'))
-```
-
-## Parameters
-
-| Flag     | Description                                                  | Default |
-| :------- | ------------------------------------------------------------ | :-----: |
-| --target_plink_chr | Path to per chromosome target PLINK files [required] | NA |
-| --target_plink | Path to genome-wide target PLINK files [required] | NA |
-| --ref_plink_chr | Path to per chromosome reference PLINK files [required] | NA |
-| --target_fam | Target sample fam file. [optional] | NA |
-| --maf | Minor allele frequency threshold [optional] | NA |
-| --geno | Variant missingness threshold [optional] | 0.02 |
-| --hwe | Hardy Weinberg p-value threshold. [optional] | NA |
-| --n_pcs | Number of PCs (min=4) [optional] | 10 |
-| --plink | Path PLINK software binary [required] | plink |
-| --plink2 | Path PLINK software binary [required] | plink |
-| --output | Path for output files [required] | ./PC_projector_output/Output |
-| --ref_pop_scale | List of keep files for ancestry specific scaling [optional] | NA |
-| --pop_data | Population data for the reference samples [optional] | NA |
-| --model_method | Method used for generate prediction model. Only glmnet tested. [optional] | glmnet |
-| --SD_rule | Logical indicating whether the 3SD rule should be used to define ancestry, or the model-based approach [optional] | FALSE |
-| --prob_thresh | Indicates whether probability threshold should be used when defining ancestry [optional] | NA |
-| --memory | Memory limit [optional] | 5000 |
-
-
-## Output files
-
-The script will calculate PCs for reference individuals (output.eigenvec), the SNP weights for the PCs (output.eigenvec.var), the mean and SD of each PC in the full reference (output.scale), and the mean and SD of each PC in each reference group specified in ref_pop_scale (output.group.scale). The script will also output the elastic net model predicting groups in pop_data as an .rds, along with some accuracy metrics (output.pop_data_prediction _details.txt). 
-
-The script will also files indicating the ancestry of the target sample. The script will output keep files indicating the ancestry group of each individual, determined using the 3SD rule (output.ref_pop_scale.keep), and the PC scores scaled and centred using the reference mean and SD (output.pop_data.eigenvec). The script will also output keep files indicating the ancestry of the target samples based on the elastic net model (output.model_pred.pop_data). The script will output .png files showing the PCs of target samples relative to the reference samples.
-
-## Examples
-```sh
-Rscript Ancestry_identifier.R \
-  --target_plink_chr target/target.chr \
-  --ref_plink_chr ref/ref.chr \
-  --n_pcs 6 \
-  --plink plink1.9 \
-  --plink2 plink2 \
-  --output test \
-  --ref_pop_scale ref_super_pop_keep_list.txt \
-  --pop_data ref_pop_dat.txt \
-  --prob_thresh 0.5
-```
+# Ancestry_identifier.R
+
+This script is used to infer the global ancestry of a target sample based on a reference-projected principal component (PC) elastic net model.
+
+## Pre-requisites
+The following software is required for the prediction pipeline:
+
+* PLINK v1.9 (https://www.cog-genomics.org/plink2/).
+* PLINK v2 (https://www.cog-genomics.org/plink/2.0/).
+* Per chromosome files for the desired reference genotype data in binary PLINK format (e.g. 1000 Genomes).
+* Per chromosome or genome-wide target genotype data in binary PLINK format.
+* A file listing the path to a series of keep files for the reference genotype data used for scaling PCs.
+* A file indicating which population reference individuals correspond to.
+
+See the example code below, and the corresponding files, for examples.
+
+* R packages:
+```R
+install.packages(c('data.table','caret','pROC','verification','ggplot2','cowplot'))
+```
+
+## Parameters
+
+| Flag     | Description                                                  | Default |
+| :------- | ------------------------------------------------------------ | :-----: |
+| --target_plink_chr | Path to per chromosome target PLINK files [required] | NA |
+| --ref_plink_chr | Path to per chromosome reference PLINK files [required] | NA |
+| --maf | Minor allele frequency threshold [optional] | NA |
+| --geno | Variant missingness threshold [optional] | 0.02 |
+| --hwe | Hardy Weinberg p-value threshold. [optional] | NA |
+| --n_pcs | Number of PCs (min=4) [optional] | 10 |
+| --plink | Path to PLINK v1.9 software binary [required] | plink |
+| --plink2 | Path to PLINK v2 software binary [required] | plink |
+| --output | Path for output files [required] | ./PC_projector_output/Output |
+| --ref_pop_scale | List of keep files for ancestry specific scaling [optional] | NA |
+| --pop_data | Population data for the reference samples [optional] | NA |
+| --model_method | Method used to generate the prediction model. Only glmnet tested. [optional] | glmnet |
+| --SD_rule | Logical indicating whether the 3SD rule should be used to define ancestry, or the model-based approach [optional] | FALSE |
+| --prob_thresh | Indicates whether probability threshold should be used when defining ancestry [optional] | NA |
+| --memory | Memory limit [optional] | 5000 |
+
+
+## Output files
+
+The script will calculate PCs for reference individuals (output.eigenvec), the SNP weights for the PCs (output.eigenvec.var), the mean and SD of each PC in the full reference (output.scale), and the mean and SD of each PC in each reference group specified in ref_pop_scale (output.group.scale). The script will also output the elastic net model predicting groups in pop_data as an .rds, along with some accuracy metrics (output.pop_data_prediction_details.txt).
+
+The script will also output files indicating the ancestry of the target sample. The script will output keep files indicating the ancestry group of each individual, determined using the 3SD rule (output.ref_pop_scale.keep), and the PC scores scaled and centred using the reference mean and SD (output.pop_data.eigenvec). The script will also output keep files indicating the ancestry of the target samples based on the elastic net model (output.model_pred.pop_data). The script will output .png files showing the PCs of target samples relative to the reference samples.
+
+## Examples
+```sh
+Rscript Ancestry_identifier.R \
+  --target_plink_chr target/target.chr \
+  --ref_plink_chr ref/ref.chr \
+  --n_pcs 6 \
+  --plink plink1.9 \
+  --plink2 plink2 \
+  --output test \
+  --ref_pop_scale ref_super_pop_keep_list.txt \
+  --pop_data ref_pop_dat.txt \
+  --prob_thresh 0.5
+```
diff --git a/Scripts/external_score_processor/external_score_processor.R b/Scripts/external_score_processor/external_score_processor.R
@@ -260,10 +260,15 @@ if(nrow(targ_matched) < 0.75*n_snp_orig){
 	log_add(log_file = log_file, message = 'Calculating polygenic scores in reference.')
 
 	# Calculate scores in the full reference
-	ref_pgs <- plink_score(bfile = opt$ref_plink_chr, chr = CHROMS, plink2 = opt$plink2, score = paste0(opt$output,'.score.gz'))
+	ref_pgs <- plink_score(pfile = opt$ref_plink_chr, chr = CHROMS, plink2 = opt$plink2, score = paste0(opt$output,'.score.gz'))
 
 	# Calculate scale within each reference population
 	pop_data <- fread(opt$pop_data)
+	# Rebuild pop_data with explicit FID and IID columns (both copied from the
+	# `#IID` column) plus POP, because the per-population keep selection below
+	# subsets pop_data by c('FID','IID').
+	pop_data<-data.table(
+	  FID=pop_data$`#IID`,
+	  IID=pop_data$`#IID`,
+	  POP=pop_data$POP
+	)
 
 	for(pop_i in unique(pop_data$POP)){
 	ref_pgs_scale_i <- score_mean_sd(scores = ref_pgs, keep = pop_data[pop_data$POP == pop_i, c('FID','IID'), with=F])

diff --git a/Scripts/format_target/format_target.R b/Scripts/format_target/format_target.R
@@ -115,39 +115,50 @@ write.table(ref_target$SNP.y, paste0(tmp_dir,'/extract_list_2.txt'), col.names =
 
 # First extract variants based on original ID
 if(opt$format == 'plink1'){
-  system(paste0(opt$plink2,' --bfile ',opt$target, ' --extract ', tmp_dir,'/extract_list_1.txt --make-bed --memory 5000 --threads 1 --out ', tmp_dir,'/subset'))
+  system(paste0(opt$plink2,' --bfile ',opt$target, ' --extract ', tmp_dir,'/extract_list_1.txt --make-pgen --memory 5000 --threads 1 --out ', tmp_dir,'/subset'))
 }
 if(opt$format == 'plink2'){
-  system(paste0(opt$plink2,' --pfile ',opt$target, ' --extract ', tmp_dir,'/extract_list_1.txt --make-bed --memory 5000 --threads 1 --out ', tmp_dir,'/subset'))
+  system(paste0(opt$plink2,' --pfile ',opt$target, ' --extract ', tmp_dir,'/extract_list_1.txt --make-pgen --memory 5000 --threads 1 --out ', tmp_dir,'/subset'))
 }
 if(opt$format == 'bgen'){
-  system(paste0(opt$plink2,' --bgen ',opt$target,'.bgen ref-last --sample ',gsub('.chr.*','',opt$target),'.sample --hard-call-threshold 0.1 --extract ', tmp_dir,'/extract_list_1.txt --make-bed --memory 5000 --threads 1 --out ', tmp_dir,'/subset'))
+  system(paste0(opt$plink2,' --bgen ',opt$target,'.bgen ref-last --sample ',gsub('.chr.*','',opt$target),'.sample --extract ', tmp_dir,'/extract_list_1.txt --make-pgen --memory 5000 --threads 1 --out ', tmp_dir,'/subset'))
 }
 if(opt$format == 'vcf'){
-  system(paste0(opt$plink2,' --vcf ',opt$target,'.vcf.gz --vcf-min-gq 10 --extract ', tmp_dir,'/extract_list_1.txt --make-bed --memory 5000 --threads 1 --out ', tmp_dir,'/subset'))
+  system(paste0(opt$plink2,' --vcf ',opt$target,'.vcf.gz --extract ', tmp_dir,'/extract_list_1.txt --make-pgen --memory 5000 --threads 1 --out ', tmp_dir,'/subset'))
+}
+
+# Ensure both FID and IID are present in the .psam file.
+# Read the sample file written by the extraction step above and strip '#' from
+# the header so the ID columns can be referenced as plain FID / IID.
+targ_psam <- fread(paste0(tmp_dir,'/subset.psam'))
+names(targ_psam)<-gsub('\\#', '', names(targ_psam))
+# Exactly one of FID/IID is present (presumably IID only, since FID is copied
+# from IID below): add FID and rewrite the file. When both columns are already
+# present the file on disk is left untouched.
+if(sum(names(targ_psam) %in% c('FID', 'IID')) == 1){
+  targ_psam$FID <- targ_psam$IID
+  # Put FID and IID first, followed by every other column in its existing order
+  targ_psam <- targ_psam[, c('FID','IID', names(targ_psam)[!(names(targ_psam) %in% c('FID','IID'))]), with=F]
+  # Restore the leading '#' on the header line (FID is the first column after reordering)
+  names(targ_psam)[1]<-paste0('#FID')
+  fwrite(targ_psam, paste0(tmp_dir,'/subset.psam'), col.names=T, row.names=F, quote=F, na='NA', sep=' ')
 }
 
 # Now edit the variant file (.pvar after this change; previously .bim) to update IDs to reference IDs
-targ_bim<-fread(paste0(tmp_dir,'/subset.bim'))
-names(targ_bim)<-c('CHR','SNP','POS','BP','A1','A2')
+targ_pvar<-fread(paste0(tmp_dir,'/subset.pvar'))
+# Column order matches the header written back further down
+# ('#CHROM','POS','ID','REF','ALT'), so A2 holds REF and A1 holds ALT.
+# NOTE(review): assumes the .pvar has exactly these five columns - confirm for
+# inputs (e.g. VCF) that may carry extra columns.
+names(targ_pvar)<-c('CHR','BP','SNP','A2','A1')
 
 # Update SNP with reference SNP value based on CHR:BP:IUPAC in the previously matched ref and target data
-targ_bim$IUPAC<-snp_iupac(targ_bim$A1, targ_bim$A2)
-targ_bim$ID<-paste0(targ_bim$CHR,':',targ_bim$BP,':',targ_bim$IUPAC)
-targ_bim$SNP<-targ_bim$ID # Give SNP column a unique value before updating to reference value
+targ_pvar$IUPAC<-snp_iupac(targ_pvar$A1, targ_pvar$A2)
+targ_pvar$ID<-paste0(targ_pvar$CHR,':',targ_pvar$BP,':',targ_pvar$IUPAC)
+targ_pvar$SNP<-targ_pvar$ID # Give SNP column a unique value before updating to reference value
 ref_target$ID<-paste0(ref_target$CHR,':',ref_target$BP,':',ref_target$IUPAC.x)
-targ_bim[ref_target, on=.(ID), SNP := i.SNP.y]
-targ_bim<-targ_bim[,c('CHR','SNP','POS','BP','A1','A2'),with=F]
+targ_pvar[ref_target, on=.(ID), SNP := i.SNP.y]
+targ_pvar<-targ_pvar[,c('CHR','BP','SNP','A2','A1'),with=F]
 
 # Label SNP with _dup if the RSID is duplicated, so these variants are removed.
-dup_snp<-duplicated(targ_bim$SNP)
+dup_snp<-duplicated(targ_pvar$SNP)
 log_add(log_file = log_file, message = paste0('Removing ', sum(dup_snp),' duplicate variants.'))
-targ_bim$SNP[dup_snp]<-paste0(targ_bim$SNP[dup_snp],'_dup')
+targ_pvar$SNP[dup_snp]<-paste0(targ_pvar$SNP[dup_snp],'_dup')
 
 log_add(log_file = log_file, message = paste0(sum(!dup_snp)," of ", nrow(ref)," reference variants are in the target."))
 
 # Write out the updated variant file (.pvar with a header row after this change; previously a headerless .bim)
-fwrite(targ_bim, paste0(tmp_dir,'/subset.bim'), col.names=F, row.names=F, quote=F, na='NA', sep=' ')
+# Restore the .pvar column names before writing, since the header is written out (col.names=T)
+names(targ_pvar)<-c('#CHROM','POS','ID','REF','ALT')
+fwrite(targ_pvar, paste0(tmp_dir,'/subset.pvar'), col.names=T, row.names=F, quote=F, na='NA', sep=' ')
 
 # Extract variants based on new reference RSIDs
 # and flip variants if there are any to be flipped
@@ -156,7 +167,7 @@ if(sum(flip) > 0){
   plink_opt<-paste0(plink_opt, paste0('--flip ',tmp_dir,'/flip_list.txt '))
 }
 
-system(paste0(opt$plink,' --bfile ',tmp_dir,'/subset ',plink_opt,'--extract ', tmp_dir,'/extract_list_2.txt --make-bed --memory 5000 --threads 1 --out ', tmp_dir,'/subset'))
+# NOTE(review): plink_opt may contain '--flip <file>' (appended above when variants
+# need flipping). '--flip' is a PLINK 1.9 flag and this command now runs the
+# PLINK 2 binary - confirm PLINK 2 accepts it, otherwise this call fails whenever flips exist.
+system(paste0(opt$plink2,' --pfile ',tmp_dir,'/subset ',plink_opt,'--extract ', tmp_dir,'/extract_list_2.txt --make-pgen --memory 5000 --threads 1 --out ', tmp_dir,'/subset'))
 
 ##################
 # Insert missing SNPs into the reference data
@@ -165,16 +176,22 @@ system(paste0(opt$plink,' --bfile ',tmp_dir,'/subset ',plink_opt,'--extract ', t
 log_add(log_file = log_file, message = 'Inserting missing reference variants.')
 
 # Update IDs in reference to avoid conflict with the target
-ref_fam<-fread(paste0(opt$ref,'.fam'))
-ref_ID_update<-data.frame(ref_fam$V1, ref_fam$V2, paste0(ref_fam$V1,'_REF'),paste0(ref_fam$V2,'_REF'))
+# Read the reference sample file and strip '#' from the header so the ID
+# columns can be referenced as plain FID / IID; keep only those ID columns.
+ref_psam<-fread(paste0(opt$ref,'.psam'))
+names(ref_psam)<-gsub('\\#', '', names(ref_psam))
+ref_psam <- ref_psam[, names(ref_psam) %in% c('FID', 'IID'), with = F]
+# Build the old-ID -> new-ID table, appending '_REF' to every reference ID.
+if(ncol(ref_psam) == 1){
+  # Only one ID column (presumably IID). The '#' was stripped from the header
+  # above, so the column must be referenced as IID: `#IID` no longer exists,
+  # evaluates to NULL, and paste0(NULL,'_REF') would give every reference
+  # sample the same new ID '_REF'.
+  ref_ID_update<-data.frame(ref_psam$`IID`, paste0(ref_psam$`IID`,'_REF'))
+} else {
+  # Both FID and IID present: old FID, old IID, new FID, new IID
+  ref_ID_update<-data.frame(ref_psam$`FID`, ref_psam$`IID`, paste0(ref_psam$`FID`,'_REF'), paste0(ref_psam$`IID`,'_REF'))
+}
 fwrite(ref_ID_update, paste0(tmp_dir,'/ref_ID_update.txt'), sep=' ', col.names=F)
-system(paste0(opt$plink,' --bfile ',opt$ref,' --make-bed --update-ids ',tmp_dir,'/ref_ID_update.txt --out ',tmp_dir,'/REF --memory 7000'))
+system(paste0(opt$plink2,' --pfile ',opt$ref,' --make-pgen --update-ids ',tmp_dir,'/ref_ID_update.txt --out ',tmp_dir,'/REF --memory 5000 --threads 1'))
 
 # Merge target and reference plink files to insert missing SNPs
-system(paste0(opt$plink,' --bfile ',tmp_dir,'/subset --bmerge ',tmp_dir,'/REF --make-bed --out ',tmp_dir,'/subset'))
+system(paste0(opt$plink2,' --pfile ',tmp_dir,'/subset --pmerge ',tmp_dir,'/REF --make-pgen --memory 5000 --threads 1 --out ',tmp_dir,'/subset'))
 
 # Extract only target individuals
-system(paste0(opt$plink,' --bfile ',tmp_dir,'/subset --remove ',tmp_dir,'/REF.fam --make-bed --out ',opt$output))
+system(paste0(opt$plink2,' --pfile ',tmp_dir,'/subset --remove ',tmp_dir,'/REF.psam --make-pgen --memory 5000 --threads 1 --out ',opt$output))
 
 end.time <- Sys.time()
 time.taken <- end.time - start.time