diff --git a/DESCRIPTION b/DESCRIPTION index bbbd0e0..85e5435 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -12,7 +12,7 @@ Description: Use a 'glmmkin' class object (GMMAT package) from the null model to License: GPL-3 Copyright: See COPYRIGHTS for details. Imports: Rcpp, Matrix, parallel, MASS, SeqArray, SeqVarTools, foreach, - GMMAT, CompQuadForm + GMMAT, CompQuadForm, data.table Suggests: doMC, testthat LinkingTo: Rcpp, RcppArmadillo Encoding: UTF-8 diff --git a/NAMESPACE b/NAMESPACE index 53fee81..f50fbac 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,6 +7,7 @@ importFrom("stats", "as.formula", "binomial", "dbeta", "glm", "model.frame", "pnorm", "uniroot", "integrate", "weights", "vcov") importFrom("utils", "read.table", "write.table") importFrom("CompQuadForm", "davies", "liu") +importFrom("data.table", "fread") importFrom("SeqArray", "seqOpen", "seqGetData", "seqClose", "seqSetFilter") importFrom("SeqVarTools", "missingGenotypeRate", "alleleFrequency", "altDosage") importFrom("GMMAT", "glmmkin", "glmm.score", "SMMAT") diff --git a/R/MAGEE.R b/R/MAGEE.R index 25136c2..ba43bef 100644 --- a/R/MAGEE.R +++ b/R/MAGEE.R @@ -1,4 +1,4 @@ -MAGEE <- function(null.obj, interaction, geno.file, group.file, group.file.sep = "\t", bgen.samplefile = NULL, interaction.covariates = NULL, meta.file.prefix = NULL, MAF.range = c(1e-7, 0.5), MAF.weights.beta = c(1, 25), miss.cutoff = 1, missing.method = "impute2mean", method = "davies", tests = "JF", use.minor.allele = FALSE, auto.flip = FALSE, Garbage.Collection = FALSE, is.dosage = FALSE, ncores = 1){ +MAGEE <- function(null.obj, interaction, geno.file, group.file, group.file.sep = "\t", bgen.samplefile = NULL, interaction.covariates = NULL, meta.file.prefix = NULL, MAF.range = c(1e-7, 0.5), AF.strata.range = c(0, 1), MAF.weights.beta = c(1, 25), miss.cutoff = 1, missing.method = "impute2mean", method = "davies", tests = "JF", use.minor.allele = FALSE, auto.flip = FALSE, Garbage.Collection = FALSE, is.dosage = FALSE, ncores = 1){ 
if(Sys.info()["sysname"] == "Windows" && ncores > 1) { warning("The package doMC is not available on Windows... Switching to single thread...") ncores <- 1 @@ -86,7 +86,7 @@ MAGEE <- function(null.obj, interaction, geno.file, group.file, group.file.sep = } variant.id <- paste(chr, pos, ref, alt, sep = ":") rm(chr, pos, ref, alt); gc() - group.info <- try(read.table(group.file, header = FALSE, col.names = c("group", "chr", "pos", "ref", "alt", "weight"), colClasses = c("character","character","integer","character","character","numeric"), sep = group.file.sep), silent = TRUE) + group.info <- try(fread(group.file, header = FALSE, data.table = FALSE, col.names = c("group", "chr", "pos", "ref", "alt", "weight"), colClasses = c("character","character","integer","character","character","numeric"), sep = group.file.sep), silent = TRUE) if (inherits(group.info, "try-error")) { stop("Error: cannot read group.file!") } @@ -176,7 +176,7 @@ MAGEE <- function(null.obj, interaction, geno.file, group.file, group.file.sep = if(!is.null(strata)) { # E is not continuous freq.tmp <- sapply(strata.list, function(x) colMeans(geno[x, , drop = FALSE], na.rm = TRUE)/2) # freq.tmp is a matrix, each column is a strata, and each row is a varirant if (length(dim(freq.tmp)) == 2) freq_strata <- apply(freq.tmp, 1, range) else freq_strata <- as.matrix(range(freq.tmp)) # freq_strata is the range of allele freq across strata.list - include <- include & !is.na(freq_strata[1,]) & !is.na(freq_strata[2,]) & freq_strata[1,] >= MAF.range[1] & freq_strata[2,] <= 1-MAF.range[1] + include <- include & !is.na(freq_strata[1,]) & !is.na(freq_strata[2,]) & freq_strata[1,] >= AF.strata.range[1] & freq_strata[2,] <= AF.strata.range[2] rm(freq.tmp) } n.p <- sum(include) @@ -385,7 +385,7 @@ MAGEE <- function(null.obj, interaction, geno.file, group.file, group.file.sep = if(!is.null(strata)) { # E is not continuous freq.tmp <- sapply(strata.list, function(x) colMeans(geno[x, , drop = FALSE], na.rm = TRUE)/2) # 
freq.tmp is a matrix, each column is a strata, and each row is a varirant if (length(dim(freq.tmp)) == 2) freq_strata <- apply(freq.tmp, 1, range) else freq_strata <- as.matrix(range(freq.tmp)) # freq_strata is the range of allele freq across strata.list - include <- include & !is.na(freq_strata[1,]) & !is.na(freq_strata[2,]) & freq_strata[1,] >= MAF.range[1] & freq_strata[2,] <= 1-MAF.range[1] + include <- include & !is.na(freq_strata[1,]) & !is.na(freq_strata[2,]) & freq_strata[1,] >= AF.strata.range[1] & freq_strata[2,] <= AF.strata.range[2] rm(freq.tmp) } n.p <- sum(include) @@ -551,7 +551,7 @@ MAGEE <- function(null.obj, interaction, geno.file, group.file, group.file.sep = if (is.null(bgen.samplefile)) { stop("Error: bgen file does not contain sample identifiers. A .sample file (bgen.samplefile) is needed.") } - sample.id <- read.table(bgen.samplefile, header = TRUE, sep = " ") + sample.id <- fread(bgen.samplefile, header = TRUE, data.table = FALSE) if ((nrow(sample.id)-1) != bgenInfo$N){ stop(paste0("Error: Number of sample identifiers in BGEN sample file (", nrow(sample.id)-1, ") does not match number of samples in BGEN file (", bgenInfo$N,").")) } @@ -594,7 +594,7 @@ MAGEE <- function(null.obj, interaction, geno.file, group.file, group.file.sep = variant.id <- paste(bgenVariant$VariantInfo$CHR, bgenVariant$VariantInfo$POS, bgenVariant$VariantInfo$A1, bgenVariant$VariantInfo$A2, sep = ":") gc() variant.idx <- 1:length(variant.id) - group.info <- try(read.table(group.file, header = FALSE, col.names = c("group", "chr", "pos", "ref", "alt", "weight"), colClasses = c("character","character","integer","character","character","numeric"), sep = group.file.sep), silent = TRUE) + group.info <- try(fread(group.file, header = FALSE, data.table = FALSE, col.names = c("group", "chr", "pos", "ref", "alt", "weight"), colClasses = c("character","character","integer","character","character","numeric"), sep = group.file.sep), silent = TRUE) if (inherits(group.info, 
"try-error")) { stop("Error: cannot read group.file!") } @@ -683,7 +683,7 @@ MAGEE <- function(null.obj, interaction, geno.file, group.file, group.file.sep = if(!is.null(strata)) { # E is not continuous freq.tmp <- sapply(strata.list, function(x) colMeans(geno[x, , drop = FALSE], na.rm = TRUE)/2) # freq.tmp is a matrix, each column is a strata, and each row is a varirant if (length(dim(freq.tmp)) == 2) freq_strata <- apply(freq.tmp, 1, range) else freq_strata <- as.matrix(range(freq.tmp)) # freq_strata is the range of allele freq across strata.list - include <- include & !is.na(freq_strata[1,]) & !is.na(freq_strata[2,]) & freq_strata[1,] >= MAF.range[1] & freq_strata[2,] <= 1-MAF.range[1] + include <- include & !is.na(freq_strata[1,]) & !is.na(freq_strata[2,]) & freq_strata[1,] >= AF.strata.range[1] & freq_strata[2,] <= AF.strata.range[2] rm(freq.tmp) } n.p <- sum(include) @@ -886,7 +886,7 @@ MAGEE <- function(null.obj, interaction, geno.file, group.file, group.file.sep = if(!is.null(strata)) { # E is not continuous freq.tmp <- sapply(strata.list, function(x) colMeans(geno[x, , drop = FALSE], na.rm = TRUE)/2) # freq.tmp is a matrix, each column is a strata, and each row is a varirant if (length(dim(freq.tmp)) == 2) freq_strata <- apply(freq.tmp, 1, range) else freq_strata <- as.matrix(range(freq.tmp)) # freq_strata is the range of allele freq across strata.list - include <- include & !is.na(freq_strata[1,]) & !is.na(freq_strata[2,]) & freq_strata[1,] >= MAF.range[1] & freq_strata[2,] <= 1-MAF.range[1] + include <- include & !is.na(freq_strata[1,]) & !is.na(freq_strata[2,]) & freq_strata[1,] >= AF.strata.range[1] & freq_strata[2,] <= AF.strata.range[2] rm(freq.tmp) } n.p <- sum(include) @@ -1217,7 +1217,7 @@ MAGEE.prep <- function(null.obj, interaction, geno.file, group.file, interaction } variant.id <- paste(chr, pos, ref, alt, sep = ":") rm(chr, pos, ref, alt); gc() - group.info <- try(read.table(group.file, header = FALSE, col.names = c("group", "chr", "pos", 
"ref", "alt", "weight"), colClasses = c("character","character","integer","character","character","numeric"), sep = group.file.sep), silent = TRUE) + group.info <- try(fread(group.file, header = FALSE, data.table = FALSE, col.names = c("group", "chr", "pos", "ref", "alt", "weight"), colClasses = c("character","character","integer","character","character","numeric"), sep = group.file.sep), silent = TRUE) if (inherits(group.info, "try-error")) { stop("Error: cannot read group.file!") } @@ -1258,7 +1258,7 @@ MAGEE.prep <- function(null.obj, interaction, geno.file, group.file, interaction return(out) } -MAGEE.lowmem <- function(MAGEE.prep.obj, geno.file = NULL, meta.file.prefix = NULL, MAF.range = c(1e-7, 0.5), MAF.weights.beta = c(1, 25), miss.cutoff = 1, missing.method = "impute2mean", method = "davies", tests = "JF", use.minor.allele = FALSE, Garbage.Collection = FALSE, is.dosage = FALSE, ncores = 1) +MAGEE.lowmem <- function(MAGEE.prep.obj, geno.file = NULL, meta.file.prefix = NULL, MAF.range = c(1e-7, 0.5), AF.strata.range = c(0, 1), MAF.weights.beta = c(1, 25), miss.cutoff = 1, missing.method = "impute2mean", method = "davies", tests = "JF", use.minor.allele = FALSE, Garbage.Collection = FALSE, is.dosage = FALSE, ncores = 1) { if(!inherits(MAGEE.prep.obj, "MAGEE.prep")) stop("Error: MAGEE.prep.obj must be a class MAGEE.prep object!") is.Windows <- Sys.info()["sysname"] == "Windows" @@ -1354,7 +1354,7 @@ MAGEE.lowmem <- function(MAGEE.prep.obj, geno.file = NULL, meta.file.prefix = NU if(!is.null(strata)) { # E is not continuous freq.tmp <- sapply(strata.list, function(x) colMeans(geno[x, , drop = FALSE], na.rm = TRUE)/2) # freq.tmp is a matrix, each column is a strata, and each row is a varirant if (length(dim(freq.tmp)) == 2) freq_strata <- apply(freq.tmp, 1, range) else freq_strata <- as.matrix(range(freq.tmp)) # freq_strata is the range of allele freq across strata.list - include <- include & !is.na(freq_strata[1,]) & !is.na(freq_strata[2,]) & freq_strata[1,] 
>= MAF.range[1] & freq_strata[2,] <= 1-MAF.range[1] + include <- include & !is.na(freq_strata[1,]) & !is.na(freq_strata[2,]) & freq_strata[1,] >= AF.strata.range[1] & freq_strata[2,] <= AF.strata.range[2] rm(freq.tmp) } n.p <- sum(include) @@ -1557,7 +1557,7 @@ MAGEE.lowmem <- function(MAGEE.prep.obj, geno.file = NULL, meta.file.prefix = NU if(!is.null(strata)) { # E is not continuous freq.tmp <- sapply(strata.list, function(x) colMeans(geno[x, , drop = FALSE], na.rm = TRUE)/2) # freq.tmp is a matrix, each column is a strata, and each row is a varirant if (length(dim(freq.tmp)) == 2) freq_strata <- apply(freq.tmp, 1, range) else freq_strata <- as.matrix(range(freq.tmp)) # freq_strata is the range of allele freq across strata.list - include <- include & !is.na(freq_strata[1,]) & !is.na(freq_strata[2,]) & freq_strata[1,] >= MAF.range[1] & freq_strata[2,] <= 1-MAF.range[1] + include <- include & !is.na(freq_strata[1,]) & !is.na(freq_strata[2,]) & freq_strata[1,] >= AF.strata.range[1] & freq_strata[2,] <= AF.strata.range[2] rm(freq.tmp) } n.p <- sum(include) diff --git a/R/MAGEE.meta.R b/R/MAGEE.meta.R index 7fffe92..5ebe7dc 100644 --- a/R/MAGEE.meta.R +++ b/R/MAGEE.meta.R @@ -16,7 +16,7 @@ MAGEE.meta <- function(meta.files.prefix, n.files = rep(1, length(meta.files.pre JV <- "JV" %in% tests JF <- "JF" %in% tests JD <- "JD" %in% tests - group.info <- try(read.table(group.file, header = FALSE, col.names = c("group", "chr", "pos", "ref", "alt", "weight"), colClasses = c("character","character","integer","character","character","numeric"), sep = group.file.sep), silent = TRUE) + group.info <- try(fread(group.file, header = FALSE, data.table = FALSE, col.names = c("group", "chr", "pos", "ref", "alt", "weight"), colClasses = c("character","character","integer","character","character","numeric"), sep = group.file.sep), silent = TRUE) if (inherits(group.info, "try-error")) { stop("Error: cannot read group.file!") } @@ -32,7 +32,7 @@ MAGEE.meta <- function(meta.files.prefix, 
n.files = rep(1, length(meta.files.pre for(i in 1:n.cohort) { # Read the scores for each study from each core tmp.scores <- NULL for(j in 1:n.files[i]) { # n.files[i] is the number of cores for the i-th study - tmp <- try(read.table(paste0(meta.files.prefix[i], ".score.", j), header = TRUE, as.is = TRUE)) + tmp <- try(fread(paste0(meta.files.prefix[i], ".score.", j), header = TRUE, data.table = FALSE)) if (inherits(tmp,"try-error")) { stop(paste0("Error: cannot read ", meta.files.prefix[i], ".score.", j, "!")) } diff --git a/R/glmm.gei.R b/R/glmm.gei.R index 5d77a83..1186b82 100644 --- a/R/glmm.gei.R +++ b/R/glmm.gei.R @@ -651,7 +651,7 @@ glmm.gei <- function(null.obj, interaction, geno.file, outfile, bgen.samplefile= if (is.null(bgen.samplefile)) { stop("Error: bgen file does not contain sample identifiers. A .sample file (bgen.samplefile) is needed.") } - sample.id <- read.table(bgen.samplefile, header = TRUE, sep = " ") + sample.id <- fread(bgen.samplefile, header = TRUE, data.table = FALSE) if ((nrow(sample.id)-1) != bgenInfo$N){ stop(paste0("Error: Number of sample identifiers in BGEN sample file (", nrow(sample.id)-1, ") does not match number of samples in BGEN file (", bgenInfo$N,").")) } diff --git a/R/glmm.gei.meta.R b/R/glmm.gei.meta.R index 47999f8..f6251e1 100644 --- a/R/glmm.gei.meta.R +++ b/R/glmm.gei.meta.R @@ -6,7 +6,7 @@ glmm.gei.meta <- function(files, outfile, interaction, SNPID = rep("SNPID", leng if(length(Non_Effect_Allele) != k) stop("Error: \"Non_Effect_Allele\" must have the same length as \"files\"!") if(length(Effect_Allele) != k) stop("Error: \"Effect_Allele\" must have the same length as \"files\"!") col.include <- c("Beta_Marginal", "SE_Beta_Marginal", "P_Value_Marginal", "Beta_G", paste0("Beta_G.",interaction), "SE_Beta_G", paste0("SE_Beta_G.", interaction), paste0("Cov_Beta_G_G.", interaction), "P_Value_Interaction", "P_Value_Joint") - master <- read.table(files[1], header=T, as.is=T)[, c(SNPID[1], CHR[1], 
POS[1],Non_Effect_Allele[1], Effect_Allele[1], "N_Samples", "AF", col.include)] + master <- fread(files[1], header=T, data.table = FALSE)[, c(SNPID[1], CHR[1], POS[1],Non_Effect_Allele[1], Effect_Allele[1], "N_Samples", "AF", col.include)] names(master)[1:5] <- c("SNPID", "CHR", "POS", "Non_Effect_Allele", "Effect_Allele") master <- master[apply(!is.na(master[, col.include]), 1, all),] master$SNPID <- paste(master$CHR, master$POS, master$Non_Effect_Allele, master$Effect_Allele, sep = ":") @@ -23,7 +23,7 @@ glmm.gei.meta <- function(files, outfile, interaction, SNPID = rep("SNPID", leng flag <- rep(0, nrow(master)) if(k > 1) { for(i in 2:k) { - tmp <- read.table(files[i], header=T, as.is=T)[, c(SNPID[i], CHR[i],POS[i],Non_Effect_Allele[i], Effect_Allele[i], "N_Samples", "AF", col.include)] + tmp <- fread(files[i], header=T, data.table = FALSE)[, c(SNPID[i], CHR[i],POS[i],Non_Effect_Allele[i], Effect_Allele[i], "N_Samples", "AF", col.include)] names(tmp)[1:5] <- c("SNPID", "CHR", "POS", "Non_Effect_Allele", "Effect_Allele") tmp <- tmp[apply(!is.na(tmp[, col.include]), 1, all),] tmp$SNPID <- paste(tmp$CHR, tmp$POS, tmp$Non_Effect_Allele, tmp$Effect_Allele, sep = ":") diff --git a/README.md b/README.md index 9288fd2..b9168b0 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ See Section 3.2 of the instructions on building R with Intel MKL. ## Version -The current version is 1.2.0 (June 2, 2022). +The current version is 1.2.1 (March 23, 2023). ## License This software is licensed under GPL-3. 
diff --git a/man/MAGEE.Rd b/man/MAGEE.Rd index 64c84ba..293bd5d 100644 --- a/man/MAGEE.Rd +++ b/man/MAGEE.Rd @@ -12,7 +12,7 @@ Use a glmmkin class object from the null GLMM to perform variant set-based main \usage{ MAGEE(null.obj, interaction, geno.file, group.file, group.file.sep = "\t", bgen.samplefile = NULL, interaction.covariates = NULL, meta.file.prefix = NULL, - MAF.range = c(1e-7, 0.5), MAF.weights.beta = c(1, 25), miss.cutoff = 1, + MAF.range = c(1e-7, 0.5), AF.strata.range = c(0, 1), MAF.weights.beta = c(1, 25), miss.cutoff = 1, missing.method = "impute2mean", method = "davies", tests = "JF", use.minor.allele = FALSE, auto.flip = FALSE, Garbage.Collection = FALSE, is.dosage = FALSE, ncores = 1) @@ -21,7 +21,7 @@ MAGEE.prep(null.obj, interaction, geno.file, group.file, interaction.covariates group.file.sep = "\t", auto.flip = FALSE) MAGEE.lowmem(MAGEE.prep.obj, geno.file = NULL, meta.file.prefix = NULL, - MAF.range = c(1e-7, 0.5), MAF.weights.beta = c(1, 25), miss.cutoff = 1, + MAF.range = c(1e-7, 0.5), AF.strata.range = c(0, 1), MAF.weights.beta = c(1, 25), miss.cutoff = 1, missing.method = "impute2mean", method = "davies", tests = "JF", use.minor.allele = FALSE, Garbage.Collection = FALSE, is.dosage = FALSE, ncores = 1) @@ -54,6 +54,9 @@ the prefix for meta-analysis (default = \code{"NULL"}). } \item{MAF.range}{ a numeric vector of length 2 defining the minimum and maximum minor allele frequencies of variants that should be included in the analysis (default = c(1e-7, 0.5)). +} + \item{AF.strata.range}{ +a numeric vector of length 2 defining the minimum and maximum coding allele frequencies allowed within each stratum when the environmental factor is categorical; a variant is included only if its coding allele frequency lies within this range in every stratum (default = c(0, 1), which applies no stratum-specific frequency filter). } \item{MAF.weights.beta}{ a numeric vector of length 2 defining the beta probability density function parameters on the minor allele frequencies. 
This internal minor allele frequency weight is multiplied by the external weight given by the group.file. To turn off internal minor allele frequency weight and only use the external weight given by the group.file, use c(1, 1) to assign flat weights (default = c(1, 25)). @@ -116,10 +119,10 @@ mean coding allele frequency for variants in the test unit group. maximum coding allele frequency for variants in the test unit group. } \item{freq.strata.min}{ -minimum coding allele frequency of each strata if the environmental factor is categorical. +minimum coding allele frequency of each stratum if the environmental factor is categorical. } \item{freq.strata.max}{ -maximum coding allele frequency of each strata if the environmental factor is categorical. +maximum coding allele frequency of each stratum if the environmental factor is categorical. } \item{MV.pval}{ MV test p-value.