From cccd5b3b759c52e3951e5c8b019c663134e232b1 Mon Sep 17 00:00:00 2001
From: Gibran Hemani <explodecomputer@gmail.com>
Date: Tue, 20 Feb 2024 11:10:30 +0000
Subject: [PATCH 1/2] added arguments to clump_data to allow local clumping

---
 R/ld.R                   |  6 ++++--
 man/clump_data.Rd        |  8 +++++++-
 tests/testthat/test_ld.R | 10 ++++++++--
 3 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/R/ld.R b/R/ld.R
index 08a6d824..6ac2c36b 100644
--- a/R/ld.R
+++ b/R/ld.R
@@ -18,10 +18,12 @@
 #' @param clump_p1 Clumping sig level for index SNPs, default is `1`.
 #' @param clump_p2 Clumping sig level for secondary SNPs, default is `1`.
 #' @param pop Super-population to use as reference panel. Default = "EUR". Options are EUR, SAS, EAS, AFR, AMR. 'legacy' also available - which is a previously used version of the EUR panel with a slightly different set of markers
+#' @param bfile Path prefix of a local PLINK bed/bim/fam reference panel. If this is provided then clumping is performed locally with PLINK instead of through the API. Default = `NULL`
+#' @param plink_bin If `NULL` and `bfile` is not `NULL` then will detect packaged plink binary for specific OS. Otherwise specify path to plink binary. Default = `NULL`
 #'
 #' @export
 #' @return Data frame
-clump_data <- function(dat, clump_kb=10000, clump_r2=0.001, clump_p1=1, clump_p2=1, pop="EUR")
+clump_data <- function(dat, clump_kb=10000, clump_r2=0.001, clump_p1=1, clump_p2=1, pop="EUR", bfile=NULL, plink_bin=NULL)
 {
 	# .Deprecated("ieugwasr::ld_clump()")
 
@@ -53,7 +55,7 @@ clump_data <- function(dat, clump_kb=10000, clump_r2=0.001, clump_p1=1, clump_p2
 	}
 
 	d <- data.frame(rsid=dat$SNP, pval=dat[[pval_column]], id=dat$id.exposure)
-	out <- ieugwasr::ld_clump(d, clump_kb=clump_kb, clump_r2=clump_r2, clump_p=clump_p1, pop=pop)
+	out <- ieugwasr::ld_clump(d, clump_kb=clump_kb, clump_r2=clump_r2, clump_p=clump_p1, pop=pop, bfile=bfile, plink_bin=plink_bin)
 	keep <- paste(dat$SNP, dat$id.exposure) %in% paste(out$rsid, out$id)
 	return(dat[keep, ])
 }
diff --git a/man/clump_data.Rd b/man/clump_data.Rd
index e2d03ee0..f93048e0 100644
--- a/man/clump_data.Rd
+++ b/man/clump_data.Rd
@@ -10,7 +10,9 @@ clump_data(
   clump_r2 = 0.001,
   clump_p1 = 1,
   clump_p2 = 1,
-  pop = "EUR"
+  pop = "EUR",
+  bfile = NULL,
+  plink_bin = NULL
 )
 }
 \arguments{
@@ -25,6 +27,10 @@ clump_data(
 \item{clump_p2}{Clumping sig level for secondary SNPs, default is \code{1}.}
 
 \item{pop}{Super-population to use as reference panel. Default = "EUR". Options are EUR, SAS, EAS, AFR, AMR. 'legacy' also available - which is a previously used version of the EUR panel with a slightly different set of markers}
+
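+\item{bfile}{Path prefix of a local PLINK bed/bim/fam reference panel. If this is provided then clumping is performed locally with PLINK instead of through the API. Default = \code{NULL}}
+
+\item{plink_bin}{If \code{NULL} and \code{bfile} is not \code{NULL} then will detect packaged plink binary for specific OS. Otherwise specify path to plink binary. Default = \code{NULL}}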
 }
 \value{
 Data frame
diff --git a/tests/testthat/test_ld.R b/tests/testthat/test_ld.R
index 33c85d21..47c257f6 100644
--- a/tests/testthat/test_ld.R
+++ b/tests/testthat/test_ld.R
@@ -18,11 +18,17 @@ test_that("matrix", {
 	expect_equal(ncol(b), nrow(out))
 })
 
-
-
 a <- extract_instruments(c("ieu-a-2", "ieu-a-1001"), clump=FALSE)
 out <- clump_data(a)
 
 test_that("clump multiple", {
 	expect_equal(length(unique(a$id.exposure)), length(unique(out$id.exposure)))
 })
+
+test_that("clump local", {
+	skip_on_ci()
+	skip_on_cran()
+	skip_if_not(file.exists("/Users/gh13047/repo/opengwas-api-internal/opengwas-api/app/ld_files/EUR.bim")) # needs this local PLINK reference panel
+	aclump <- clump_data(a, bfile="/Users/gh13047/repo/opengwas-api-internal/opengwas-api/app/ld_files/EUR", plink_bin="plink")
+	expect_equal(length(unique(a$id.exposure)), length(unique(aclump$id.exposure))) # same check as "clump multiple"
+})

From 97ed057e959527b17204f34604116fa951c469c5 Mon Sep 17 00:00:00 2001
From: Gibran Hemani <explodecomputer@gmail.com>
Date: Tue, 20 Feb 2024 15:23:07 +0000
Subject: [PATCH 2/2] updated local version of multivariable MR

---
 DESCRIPTION                       |  2 +-
 R/multivariable_mr.R              | 98 +++++++++++++++++++++++--------
 R/read_data.R                     |  2 +-
 man/mv_extract_exposures_local.Rd | 10 +++-
 tests/testthat/test_mvmr_local.R  | 31 ++++++++++
 5 files changed, 116 insertions(+), 27 deletions(-)
 create mode 100644 tests/testthat/test_mvmr_local.R

diff --git a/DESCRIPTION b/DESCRIPTION
index fb0ab9bb..b10ae18d 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: TwoSampleMR
 Title: Two Sample MR Functions and Interface to MR Base Database
-Version: 0.5.9
+Version: 0.5.10
 Authors@R: c(
     person("Gibran", "Hemani", , "g.hemani@bristol.ac.uk", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0003-0920-1055")),
diff --git a/R/multivariable_mr.R b/R/multivariable_mr.R
index efabd08f..040f1eb9 100644
--- a/R/multivariable_mr.R
+++ b/R/multivariable_mr.R
@@ -56,10 +56,12 @@ mv_extract_exposures <- function(id_exposure, clump_r2=0.001, clump_kb=10000, ha
 
 #' Attempt to perform MVMR using local data
 #'
-#' Under construction
+#' Allows you to read in summary data from text files to format the multivariable exposure dataset. 
+#' 
+#' Note that you can provide a vector of column names for each column argument, with one entry per file in `filenames_exposure`.
 #'
 #' @param filenames_exposure Filenames for each exposure dataset. Must have header with at least SNP column present. Following arguments are used for determining how to read the filename and clumping etc.
-#' @param sep Specify delimeter in file. The default is space, i.e. `sep=" "`.
+#' @param sep Specify delimiter in file. The default is space, i.e. `sep=" "`. If length is 1 it will use the same `sep` value for each exposure dataset. You can provide a vector of values, one for each exposure dataset, if the values are different across datasets. The same applies to all dataset-formatting options listed below.
 #' @param phenotype_col Optional column name for the column with phenotype name corresponding the the SNP. If not present then will be created with the value `"Outcome"`. Default is `"Phenotype"`.
 #' @param snp_col Required name of column with SNP rs IDs. The default is `"SNP"`.
 #' @param beta_col Required for MR. Name of column with effect sizes. THe default is `"beta"`.
@@ -83,36 +85,84 @@ mv_extract_exposures <- function(id_exposure, clump_r2=0.001, clump_kb=10000, ha
 #'
 #' @export
 #' @return List
-mv_extract_exposures_local <- function(filenames_exposure, sep = " ", phenotype_col = "Phenotype", snp_col = "SNP", beta_col = "beta", se_col = "se", eaf_col = "eaf", effect_allele_col = "effect_allele", other_allele_col = "other_allele", pval_col = "pval", units_col = "units", ncase_col = "ncase", ncontrol_col = "ncontrol", samplesize_col = "samplesize", gene_col = "gene", id_col = "id", min_pval = 1e-200, log_pval = FALSE, pval_threshold=5e-8, clump_r2=0.001, clump_kb=10000, harmonise_strictness=2)
-{
+mv_extract_exposures_local <- function(
+	filenames_exposure, 
+	sep = " ", 
+	phenotype_col = "Phenotype",
+	snp_col = "SNP",
+	beta_col = "beta",
+	se_col = "se",
+	eaf_col = "eaf",
+	effect_allele_col = "effect_allele",
+	other_allele_col = "other_allele",
+	pval_col = "pval",
+	units_col = "units",
+	ncase_col = "ncase",
+	ncontrol_col = "ncontrol",
+	samplesize_col = "samplesize",
+	gene_col = "gene",
+	id_col = "id",
+	min_pval = 1e-200,
+	log_pval = FALSE,
+	pval_threshold=5e-8,
+	plink_bin=NULL,
+	bfile=NULL,
+	clump_r2=0.001,
+	clump_kb=10000,
+	pop="EUR",
+	harmonise_strictness=2
+) {
 	message("WARNING: Experimental function")
+
+	n <- length(filenames_exposure)
+	if(length(sep) == 1) {sep <- rep(sep, n)}
+	if(length(phenotype_col) == 1) {phenotype_col <- rep(phenotype_col, n)}
+	if(length(snp_col) == 1) {snp_col <- rep(snp_col, n)}
+	if(length(beta_col) == 1) {beta_col <- rep(beta_col, n)}
+	if(length(se_col) == 1) {se_col <- rep(se_col, n)}
+	if(length(eaf_col) == 1) {eaf_col <- rep(eaf_col, n)}
+	if(length(effect_allele_col) == 1) {effect_allele_col <- rep(effect_allele_col, n)}
+	if(length(other_allele_col) == 1) {other_allele_col <- rep(other_allele_col, n)}
+	if(length(pval_col) == 1) {pval_col <- rep(pval_col, n)}
+	if(length(units_col) == 1) {units_col <- rep(units_col, n)}
+	if(length(ncase_col) == 1) {ncase_col <- rep(ncase_col, n)}
+	if(length(ncontrol_col) == 1) {ncontrol_col <- rep(ncontrol_col, n)}
+	if(length(samplesize_col) == 1) {samplesize_col <- rep(samplesize_col, n)}
+	if(length(gene_col) == 1) {gene_col <- rep(gene_col, n)}
+	if(length(id_col) == 1) {id_col <- rep(id_col, n)}
+	if(length(min_pval) == 1) {min_pval <- rep(min_pval, n)}
+	if(length(log_pval) == 1) {log_pval <- rep(log_pval, n)}
+
 	l_full <- list()
 	l_inst <- list()
 	for(i in 1:length(filenames_exposure))
 	{
 		l_full[[i]] <- read_outcome_data(filenames_exposure[i], 
-			sep = sep,
-			phenotype_col = phenotype_col,
-			snp_col = snp_col,
-			beta_col = beta_col,
-			se_col = se_col,
-			eaf_col = eaf_col,
-			effect_allele_col = effect_allele_col,
-			other_allele_col = other_allele_col,
-			pval_col = pval_col,
-			units_col = units_col,
-			ncase_col = ncase_col,
-			ncontrol_col = ncontrol_col,
-			samplesize_col = samplesize_col,
-			gene_col = gene_col,
-			id_col = id_col,
-			min_pval = min_pval,
-			log_pval = log_pval
+			sep = sep[i],
+			phenotype_col = phenotype_col[i],
+			snp_col = snp_col[i],
+			beta_col = beta_col[i],
+			se_col = se_col[i],
+			eaf_col = eaf_col[i],
+			effect_allele_col = effect_allele_col[i],
+			other_allele_col = other_allele_col[i],
+			pval_col = pval_col[i],
+			units_col = units_col[i],
+			ncase_col = ncase_col[i],
+			ncontrol_col = ncontrol_col[i],
+			samplesize_col = samplesize_col[i],
+			gene_col = gene_col[i],
+			id_col = id_col[i],
+			min_pval = min_pval[i],
+			log_pval = log_pval[i]
 		)
+		if(l_full[[i]]$outcome[1] == "outcome") l_full[[i]]$outcome <- paste0("exposure", i)
 		l_inst[[i]] <- subset(l_full[[i]], pval.outcome < pval_threshold)
+		l_inst[[i]] <- subset(l_inst[[i]], !duplicated(SNP))
 		l_inst[[i]] <- convert_outcome_to_exposure(l_inst[[i]])
 		l_inst[[i]] <- subset(l_inst[[i]], pval.exposure < pval_threshold)
-		l_inst[[i]] <- clump_data(l_inst[[i]], clump_p1=pval_threshold, clump_r2=clump_r2, clump_kb=clump_kb)
+		l_inst[[i]] <- clump_data(l_inst[[i]], clump_p1=pval_threshold, clump_r2=clump_r2, clump_kb=clump_kb, bfile=bfile, plink_bin=plink_bin, pop=pop)
+		message("Identified ", nrow(l_inst[[i]]), " hits for trait ", l_inst[[i]]$exposure[1])
 	}
 
 	exposure_dat <- dplyr::bind_rows(l_inst)
@@ -121,9 +171,11 @@ mv_extract_exposures_local <- function(filenames_exposure, sep = " ", phenotype_
 	temp$id.exposure <- 1
 	temp <- temp[order(temp$pval.exposure, decreasing=FALSE), ]
 	temp <- subset(temp, !duplicated(SNP))
-	temp <- clump_data(temp, clump_p1=pval_threshold, clump_r2=clump_r2, clump_kb=clump_kb)
+	temp <- clump_data(temp, clump_p1=pval_threshold, clump_r2=clump_r2, clump_kb=clump_kb, bfile=bfile, plink_bin=plink_bin, pop=pop)
 	exposure_dat <- subset(exposure_dat, SNP %in% temp$SNP)
 
+	message("Identified ", length(unique(temp$SNP)), " variants to include")
+
 	d1 <- lapply(l_full, function(x) {
 		subset(x, SNP %in% exposure_dat$SNP)
 		}) %>% dplyr::bind_rows()
diff --git a/R/read_data.R b/R/read_data.R
index 0cba52e3..28e11cb6 100644
--- a/R/read_data.R
+++ b/R/read_data.R
@@ -202,7 +202,7 @@ format_data <- function(dat, type="exposure", snps=NULL, header=TRUE,
 		}
 	}
 
-	if ( log_pval )
+	if(log_pval)
 	{
 		dat$pval <- 10^-dat[[pval_col]]
 	}
diff --git a/man/mv_extract_exposures_local.Rd b/man/mv_extract_exposures_local.Rd
index f1838575..e7d677bd 100644
--- a/man/mv_extract_exposures_local.Rd
+++ b/man/mv_extract_exposures_local.Rd
@@ -24,15 +24,18 @@ mv_extract_exposures_local(
   min_pval = 1e-200,
   log_pval = FALSE,
   pval_threshold = 5e-08,
+  plink_bin = NULL,
+  bfile = NULL,
   clump_r2 = 0.001,
   clump_kb = 10000,
+  pop = "EUR",
   harmonise_strictness = 2
 )
 }
 \arguments{
 \item{filenames_exposure}{Filenames for each exposure dataset. Must have header with at least SNP column present. Following arguments are used for determining how to read the filename and clumping etc.}
 
-\item{sep}{Specify delimeter in file. The default is space, i.e. \code{sep=" "}.}
+\item{sep}{Specify delimiter in file. The default is space, i.e. \code{sep=" "}. If length is 1 it will use the same \code{sep} value for each exposure dataset. You can provide a vector of values, one for each exposure dataset, if the values are different across datasets. The same applies to all dataset-formatting options listed below.}
 
 \item{phenotype_col}{Optional column name for the column with phenotype name corresponding the the SNP. If not present then will be created with the value \code{"Outcome"}. Default is \code{"Phenotype"}.}
 
@@ -78,5 +81,8 @@ mv_extract_exposures_local(
 List
 }
 \description{
-Under construction
+Allows you to read in summary data from text files to format the multivariable exposure dataset.
+}
+\details{
+Note that you can provide a vector of column names for each column argument, with one entry per file in \code{filenames_exposure}.
 }
diff --git a/tests/testthat/test_mvmr_local.R b/tests/testthat/test_mvmr_local.R
new file mode 100644
index 00000000..0c220c61
--- /dev/null
+++ b/tests/testthat/test_mvmr_local.R
@@ -0,0 +1,31 @@
+context("mvmr local")
+
+test_that("mv exposure local", {
+	skip_on_ci()
+	skip_on_cran()
+	# Collect the union of top-hit rsids for both traits, then each trait's associations at those rsids.
+	a <- ieugwasr::tophits("ieu-a-2")
+	b <- ieugwasr::tophits("ieu-a-1001")
+	rsid <- unique(c(a$rsid, b$rsid))
+	a1 <- ieugwasr::associations(rsid, "ieu-a-2")
+	a2 <- ieugwasr::associations(rsid, "ieu-a-1001")
+	# Write one tab-delimited summary-statistics file per trait.
+	f1 <- tempfile()
+	f2 <- tempfile()
+	write.table(a1, file=f1, row.names=FALSE, col.names=TRUE, quote=FALSE, sep="\t")
+	write.table(a2, file=f2, row.names=FALSE, col.names=TRUE, quote=FALSE, sep="\t")
+	# Length-1 column arguments are recycled across both files.
+	exposure_dat <- mv_extract_exposures_local(
+		c(f1, f2),
+		sep = "\t",
+		snp_col="rsid",
+		beta_col="beta",
+		se_col="se",
+		effect_allele_col="ea",
+		other_allele_col="nea",
+		pval_col="p"
+	)
+
+	expect_gt(nrow(exposure_dat), 100)
+})
+