From cccd5b3b759c52e3951e5c8b019c663134e232b1 Mon Sep 17 00:00:00 2001 From: Gibran Hemani Date: Tue, 20 Feb 2024 11:10:30 +0000 Subject: [PATCH 1/2] added arguments to clump_data to allow local clumping --- R/ld.R | 6 ++++-- man/clump_data.Rd | 8 +++++++- tests/testthat/test_ld.R | 10 ++++++++-- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/R/ld.R b/R/ld.R index 08a6d824..6ac2c36b 100644 --- a/R/ld.R +++ b/R/ld.R @@ -18,10 +18,12 @@ #' @param clump_p1 Clumping sig level for index SNPs, default is `1`. #' @param clump_p2 Clumping sig level for secondary SNPs, default is `1`. #' @param pop Super-population to use as reference panel. Default = "EUR". Options are EUR, SAS, EAS, AFR, AMR. 'legacy' also available - which is a previously used version of the EUR panel with a slightly different set of markers +#' @param bfile Path to a local plink reference panel (the fileset prefix, without the .bed/.bim/.fam extension). If this is provided then clumping is performed locally using plink instead of the API. Default = ‘NULL’ +#' @param plink_bin If ‘NULL’ and ‘bfile’ is not ‘NULL’ then will detect packaged plink binary for specific OS. Otherwise specify path to plink binary. 
Default = ‘NULL’ #' #' @export #' @return Data frame -clump_data <- function(dat, clump_kb=10000, clump_r2=0.001, clump_p1=1, clump_p2=1, pop="EUR") +clump_data <- function(dat, clump_kb=10000, clump_r2=0.001, clump_p1=1, clump_p2=1, pop="EUR", bfile=NULL, plink_bin=NULL) { # .Deprecated("ieugwasr::ld_clump()") @@ -53,7 +55,7 @@ clump_data <- function(dat, clump_kb=10000, clump_r2=0.001, clump_p1=1, clump_p2 } d <- data.frame(rsid=dat$SNP, pval=dat[[pval_column]], id=dat$id.exposure) - out <- ieugwasr::ld_clump(d, clump_kb=clump_kb, clump_r2=clump_r2, clump_p=clump_p1, pop=pop) + out <- ieugwasr::ld_clump(d, clump_kb=clump_kb, clump_r2=clump_r2, clump_p=clump_p1, pop=pop, bfile=bfile, plink_bin=plink_bin) keep <- paste(dat$SNP, dat$id.exposure) %in% paste(out$rsid, out$id) return(dat[keep, ]) } diff --git a/man/clump_data.Rd b/man/clump_data.Rd index e2d03ee0..f93048e0 100644 --- a/man/clump_data.Rd +++ b/man/clump_data.Rd @@ -10,7 +10,9 @@ clump_data( clump_r2 = 0.001, clump_p1 = 1, clump_p2 = 1, - pop = "EUR" + pop = "EUR", + bfile = NULL, + plink_bin = NULL ) } \arguments{ @@ -25,6 +27,10 @@ clump_data( \item{clump_p2}{Clumping sig level for secondary SNPs, default is \code{1}.} \item{pop}{Super-population to use as reference panel. Default = "EUR". Options are EUR, SAS, EAS, AFR, AMR. 'legacy' also available - which is a previously used version of the EUR panel with a slightly different set of markers} + +\item{bfile}{Path to a local plink reference panel (the fileset prefix, without the .bed/.bim/.fam extension). If this is provided then clumping is performed locally using plink instead of the API. Default = ‘NULL’} + +\item{plink_bin}{If ‘NULL’ and ‘bfile’ is not ‘NULL’ then will detect packaged plink binary for specific OS. Otherwise specify path to plink binary. 
Default = ‘NULL’} } \value{ Data frame diff --git a/tests/testthat/test_ld.R b/tests/testthat/test_ld.R index 33c85d21..47c257f6 100644 --- a/tests/testthat/test_ld.R +++ b/tests/testthat/test_ld.R @@ -18,11 +18,17 @@ test_that("matrix", { expect_equal(ncol(b), nrow(out)) }) - - a <- extract_instruments(c("ieu-a-2", "ieu-a-1001"), clump=FALSE) out <- clump_data(a) test_that("clump multiple", { expect_equal(length(unique(a$id.exposure)), length(unique(out$id.exposure))) }) + +test_that("clump local", { + skip_on_ci() + skip_on_cran() + skip_if_not(file.exists("/Users/gh13047/repo/opengwas-api-internal/opengwas-api/app/ld_files/EUR.bim")) + aclump <- clump_data(a, bfile="/Users/gh13047/repo/opengwas-api-internal/opengwas-api/app/ld_files/EUR", plink_bin="plink") +}) + From 97ed057e959527b17204f34604116fa951c469c5 Mon Sep 17 00:00:00 2001 From: Gibran Hemani Date: Tue, 20 Feb 2024 15:23:07 +0000 Subject: [PATCH 2/2] updated local version of multivariable MR --- DESCRIPTION | 2 +- R/multivariable_mr.R | 98 +++++++++++++++++++++++-------- R/read_data.R | 2 +- man/mv_extract_exposures_local.Rd | 10 +++- tests/testthat/test_mvmr_local.R | 31 ++++++++++ 5 files changed, 116 insertions(+), 27 deletions(-) create mode 100644 tests/testthat/test_mvmr_local.R diff --git a/DESCRIPTION b/DESCRIPTION index fb0ab9bb..b10ae18d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: TwoSampleMR Title: Two Sample MR Functions and Interface to MR Base Database -Version: 0.5.9 +Version: 0.5.10 Authors@R: c( person("Gibran", "Hemani", , "g.hemani@bristol.ac.uk", role = c("aut", "cre"), comment = c(ORCID = "0000-0003-0920-1055")), diff --git a/R/multivariable_mr.R b/R/multivariable_mr.R index efabd08f..040f1eb9 100644 --- a/R/multivariable_mr.R +++ b/R/multivariable_mr.R @@ -56,10 +56,12 @@ mv_extract_exposures <- function(id_exposure, clump_r2=0.001, clump_kb=10000, ha #' Attempt to perform MVMR using local data #' -#' Under construction +#' Allows you to read in summary 
data from text files to format the multivariable exposure dataset. +#' +#' Note that you can provide a vector of column names for each column argument, with the same length as `filenames_exposure` (one entry per file). #' #' @param filenames_exposure Filenames for each exposure dataset. Must have header with at least SNP column present. Following arguments are used for determining how to read the filename and clumping etc. -#' @param sep Specify delimeter in file. The default is space, i.e. `sep=" "`. +#' @param sep Specify delimiter in file. The default is space, i.e. `sep=" "`. If length is 1 it will use the same `sep` value for each exposure dataset. You can provide a vector of values, one for each exposure dataset, if the values are different across datasets. The same applies to all dataset-formatting options listed below. #' @param phenotype_col Optional column name for the column with phenotype name corresponding the the SNP. If not present then will be created with the value `"Outcome"`. Default is `"Phenotype"`. #' @param snp_col Required name of column with SNP rs IDs. The default is `"SNP"`. #' @param beta_col Required for MR. Name of column with effect sizes. THe default is `"beta"`. 
@@ -83,36 +85,84 @@ mv_extract_exposures <- function(id_exposure, clump_r2=0.001, clump_kb=10000, ha #' #' @export #' @return List -mv_extract_exposures_local <- function(filenames_exposure, sep = " ", phenotype_col = "Phenotype", snp_col = "SNP", beta_col = "beta", se_col = "se", eaf_col = "eaf", effect_allele_col = "effect_allele", other_allele_col = "other_allele", pval_col = "pval", units_col = "units", ncase_col = "ncase", ncontrol_col = "ncontrol", samplesize_col = "samplesize", gene_col = "gene", id_col = "id", min_pval = 1e-200, log_pval = FALSE, pval_threshold=5e-8, clump_r2=0.001, clump_kb=10000, harmonise_strictness=2) -{ +mv_extract_exposures_local <- function( + filenames_exposure, + sep = " ", + phenotype_col = "Phenotype", + snp_col = "SNP", + beta_col = "beta", + se_col = "se", + eaf_col = "eaf", + effect_allele_col = "effect_allele", + other_allele_col = "other_allele", + pval_col = "pval", + units_col = "units", + ncase_col = "ncase", + ncontrol_col = "ncontrol", + samplesize_col = "samplesize", + gene_col = "gene", + id_col = "id", + min_pval = 1e-200, + log_pval = FALSE, + pval_threshold=5e-8, + plink_bin=NULL, + bfile=NULL, + clump_r2=0.001, + clump_kb=10000, + pop="EUR", + harmonise_strictness=2 +) { message("WARNING: Experimental function") + + n <- length(filenames_exposure) + if(length(sep) == 1) {sep <- rep(sep, n)} + if(length(phenotype_col) == 1) {phenotype_col <- rep(phenotype_col, n)} + if(length(snp_col) == 1) {snp_col <- rep(snp_col, n)} + if(length(beta_col) == 1) {beta_col <- rep(beta_col, n)} + if(length(se_col) == 1) {se_col <- rep(se_col, n)} + if(length(eaf_col) == 1) {eaf_col <- rep(eaf_col, n)} + if(length(effect_allele_col) == 1) {effect_allele_col <- rep(effect_allele_col, n)} + if(length(other_allele_col) == 1) {other_allele_col <- rep(other_allele_col, n)} + if(length(pval_col) == 1) {pval_col <- rep(pval_col, n)} + if(length(units_col) == 1) {units_col <- rep(units_col, n)} + if(length(ncase_col) == 1) {ncase_col <- 
rep(ncase_col, n)} + if(length(ncontrol_col) == 1) {ncontrol_col <- rep(ncontrol_col, n)} + if(length(samplesize_col) == 1) {samplesize_col <- rep(samplesize_col, n)} + if(length(gene_col) == 1) {gene_col <- rep(gene_col, n)} + if(length(id_col) == 1) {id_col <- rep(id_col, n)} + if(length(min_pval) == 1) {min_pval <- rep(min_pval, n)} + if(length(log_pval) == 1) {log_pval <- rep(log_pval, n)} + l_full <- list() l_inst <- list() for(i in 1:length(filenames_exposure)) { l_full[[i]] <- read_outcome_data(filenames_exposure[i], - sep = sep, - phenotype_col = phenotype_col, - snp_col = snp_col, - beta_col = beta_col, - se_col = se_col, - eaf_col = eaf_col, - effect_allele_col = effect_allele_col, - other_allele_col = other_allele_col, - pval_col = pval_col, - units_col = units_col, - ncase_col = ncase_col, - ncontrol_col = ncontrol_col, - samplesize_col = samplesize_col, - gene_col = gene_col, - id_col = id_col, - min_pval = min_pval, - log_pval = log_pval + sep = sep[i], + phenotype_col = phenotype_col[i], + snp_col = snp_col[i], + beta_col = beta_col[i], + se_col = se_col[i], + eaf_col = eaf_col[i], + effect_allele_col = effect_allele_col[i], + other_allele_col = other_allele_col[i], + pval_col = pval_col[i], + units_col = units_col[i], + ncase_col = ncase_col[i], + ncontrol_col = ncontrol_col[i], + samplesize_col = samplesize_col[i], + gene_col = gene_col[i], + id_col = id_col[i], + min_pval = min_pval[i], + log_pval = log_pval[i] ) + if(l_full[[i]]$outcome[1] == "outcome") l_full[[i]]$outcome <- paste0("exposure", i) l_inst[[i]] <- subset(l_full[[i]], pval.outcome < pval_threshold) + l_inst[[i]] <- subset(l_inst[[i]], !duplicated(SNP)) l_inst[[i]] <- convert_outcome_to_exposure(l_inst[[i]]) l_inst[[i]] <- subset(l_inst[[i]], pval.exposure < pval_threshold) - l_inst[[i]] <- clump_data(l_inst[[i]], clump_p1=pval_threshold, clump_r2=clump_r2, clump_kb=clump_kb) + l_inst[[i]] <- clump_data(l_inst[[i]], clump_p1=pval_threshold, clump_r2=clump_r2, clump_kb=clump_kb, 
bfile=bfile, plink_bin=plink_bin, pop=pop) + message("Identified ", nrow(l_inst[[i]]), " hits for trait ", l_inst[[i]]$exposure[1]) } exposure_dat <- dplyr::bind_rows(l_inst) @@ -121,9 +171,11 @@ mv_extract_exposures_local <- function(filenames_exposure, sep = " ", phenotype_ temp$id.exposure <- 1 temp <- temp[order(temp$pval.exposure, decreasing=FALSE), ] temp <- subset(temp, !duplicated(SNP)) - temp <- clump_data(temp, clump_p1=pval_threshold, clump_r2=clump_r2, clump_kb=clump_kb) + temp <- clump_data(temp, clump_p1=pval_threshold, clump_r2=clump_r2, clump_kb=clump_kb, bfile=bfile, plink_bin=plink_bin, pop=pop) exposure_dat <- subset(exposure_dat, SNP %in% temp$SNP) + message("Identified ", length(unique(temp$SNP)), " variants to include") + d1 <- lapply(l_full, function(x) { subset(x, SNP %in% exposure_dat$SNP) }) %>% dplyr::bind_rows() diff --git a/R/read_data.R b/R/read_data.R index 0cba52e3..28e11cb6 100644 --- a/R/read_data.R +++ b/R/read_data.R @@ -202,7 +202,7 @@ format_data <- function(dat, type="exposure", snps=NULL, header=TRUE, } } - if ( log_pval ) + if(log_pval) { dat$pval <- 10^-dat[[pval_col]] } diff --git a/man/mv_extract_exposures_local.Rd b/man/mv_extract_exposures_local.Rd index f1838575..e7d677bd 100644 --- a/man/mv_extract_exposures_local.Rd +++ b/man/mv_extract_exposures_local.Rd @@ -24,15 +24,18 @@ mv_extract_exposures_local( min_pval = 1e-200, log_pval = FALSE, pval_threshold = 5e-08, + plink_bin = NULL, + bfile = NULL, clump_r2 = 0.001, clump_kb = 10000, + pop = "EUR", harmonise_strictness = 2 ) } \arguments{ \item{filenames_exposure}{Filenames for each exposure dataset. Must have header with at least SNP column present. Following arguments are used for determining how to read the filename and clumping etc.} -\item{sep}{Specify delimeter in file. The default is space, i.e. \code{sep=" "}.} +\item{sep}{Specify delimeter in file. The default is space, i.e. \code{sep=" "}. 
If length is 1 it will use the same \code{sep} value for each exposure dataset. You can provide a vector of values, one for each exposure dataset, if the values are different across datasets. The same applies to all dataset-formatting options listed below.} \item{phenotype_col}{Optional column name for the column with phenotype name corresponding the the SNP. If not present then will be created with the value \code{"Outcome"}. Default is \code{"Phenotype"}.} @@ -78,5 +81,8 @@ mv_extract_exposures_local( List } \description{ -Under construction +Allows you to read in summary data from text files to format the multivariable exposure dataset. +} +\details{ +Note that you can provide a vector of column names for each column argument, with the same length as \code{filenames_exposure} (one entry per file). } diff --git a/tests/testthat/test_mvmr_local.R b/tests/testthat/test_mvmr_local.R new file mode 100644 index 00000000..0c220c61 --- /dev/null +++ b/tests/testthat/test_mvmr_local.R @@ -0,0 +1,31 @@ +context("mvmr local") + +test_that("mv exposure local", { + skip_on_ci() + skip_on_cran() + + a <- ieugwasr::tophits("ieu-a-2") + b <- ieugwasr::tophits("ieu-a-1001") + rsid <- unique(c(a$rsid, b$rsid)) + a1 <- ieugwasr::associations(rsid, "ieu-a-2") + a2 <- ieugwasr::associations(rsid, "ieu-a-1001") + + f1 <- tempfile() + f2 <- tempfile() + write.table(a1, file=f1, row=F, col=T, qu=F, sep="\t") + write.table(a2, file=f2, row=F, col=T, qu=F, sep="\t") + + exposure_dat <- mv_extract_exposures_local( + c(f1, f2), + sep = "\t", + snp_col=c("rsid"), + beta_col=c("beta"), + se_col=c("se"), + effect_allele_col=c("ea"), + other_allele_col=c("nea"), + pval_col=c("p") + ) + + expect_true(nrow(exposure_dat) > 100) +}) +