diff --git a/DESCRIPTION b/DESCRIPTION index c71448f..9ccd4fc 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,31 +1,34 @@ Package: kalis Type: Package Title: High Performance Li & Stephens Local Ancestry Inference -Version: 1.0.0 +Version: 2.0.0 Authors@R: c(person("Louis", "Aslett", role = c("aut", "cre"), - email = "louis.aslett@durham.ac.uk"), + email = "louis.aslett@durham.ac.uk", + comment = c(ORCID = "0000-0003-2211-233X")), person("Ryan", "Christ", role = "aut", - email = "rchrist@wustl.edu")) + email = "rchrist@wustl.edu", + comment = c(ORCID = "0000-0002-2049-3389"))) Author: Louis Aslett [aut, cre], Ryan Christ [aut] Maintainer: Louis Aslett -Description: kalis provides a high performance implementation of the Li & - Stephens model for local - ancestry inference (local referring to a region of the genome). For a set of N - phased haplotypes, kalis computes the posterior marginal probability of each - haplotype copying every other haplotype by running N hidden Markov models in - parallel. This yields an N x N distance matrix that summarizes the recent - local ancestry at each variant of interest. The package provides functionality - for specifying a recombination map, site-specific mutation rates, and - differing prior copying probabilities for each recipient haplotype. Extensive - use is made of low level threading and CPU vector instructions. +Description: kalis provides a high performance + implementation of the Li & Stephens model + for local ancestry inference (local referring to a region of the genome). For + a set of N phased haplotypes, kalis computes the posterior marginal + probability of each haplotype copying every other haplotype by running N + hidden Markov models in parallel. This yields an N x N distance matrix that + summarizes the recent local ancestry at each variant of interest. The package + provides functionality for specifying a recombination map, site-specific + mutation rates, and differing prior copying probabilities for each recipient + haplotype. Extensive use is made of low level threading and CPU vector + instructions. License: GPL (>= 3) +BugReports: https://github.com/louisaslett/kalis/issues URL: https://kalis.louisaslett.com/, https://github.com/louisaslett/kalis LazyData: TRUE Depends: R (>= 3.5.0) Imports: utils, - methods, stats, parallel, dplyr, @@ -47,7 +50,8 @@ Suggests: rmarkdown, fastcluster, lattice, - testthat (>= 3.0.0) + testthat (>= 3.0.0), + data.table VignetteBuilder: knitr Encoding: UTF-8 Config/testthat/edition: 3 diff --git a/NAMESPACE b/NAMESPACE index e67a690..9b46aee 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,31 +1,49 @@ # Generated by roxygen2: do not edit by hand S3method(plot,kalisDistanceMatrix) +S3method(plot,kalisIterator) S3method(print,kalisBackwardTable) +S3method(print,kalisCheckpointTable) S3method(print,kalisForwardTable) +S3method(print,kalisIterator) S3method(print,kalisParameters) +S3method(targets,kalisIterator) export(Backward) export(CacheHaplotypes) export(CacheSummary) +export(CalcCheckpointTables) export(CalcRho) +export(CalcTraces) +export(CladeMat) export(ClearHaplotypeCache) export(CopyTable) +export(CreateForwardTableCache) export(DistMat) +export(FillTableCache) export(Forward) +export(ForwardIterator) +export(ForwardUsingTableCache) export(L) export(MakeBackwardTable) export(MakeForwardTable) export(N) export(Parameters) export(PostProbs) +export(PruneCladeMat) export(QueryCache) export(ReadHaplotypes) export(ResetTable) +export(Sprigs) export(WriteHaplotypes) import(checkmate) import(dplyr) importFrom(digest,digest) importFrom(glue,glue) importFrom(glue,glue_collapse) +importFrom(graphics,axis) +importFrom(prettyunits,pretty_bytes) importFrom(rlang,duplicate) +importFrom(stats,sd) +importFrom(utils,getFromNamespace) +importFrom(utils,tail) useDynLib(kalis, .registration = TRUE, .fixes = "CCall_") diff --git a/R/CacheHaplotypes.R b/R/CacheHaplotypes.R index d900e11..fe5e967 100644 --- a/R/CacheHaplotypes.R +++ b/R/CacheHaplotypes.R @@ -36,7 +36,7 @@ assign("L", NA, envir = pkgVars) # must be integer #' #' (num rows)x(num cols) = (num variants)x(num haplotypes). #' -#' It is fine to delete this matrix from R after calling \code{CacheHaplotypes}. +#' It is fine to delete this matrix from R after calling [CacheHaplotypes()]. #' #' #' **HDF5 format** @@ -48,6 +48,9 @@ assign("L", NA, envir = pkgVars) # must be integer #' #' #' +#' @references +#' Aslett, L.J.M. and Christ, R.R. (2024) "kalis: a modern implementation of the Li & Stephens model for local ancestry inference in R", *BMC Bioinformatics*, **25**(1). Available at: \doi{10.1186/s12859-024-05688-8}. +#' #' @param haps can be the name of a file from which the haplotypes are to be read, or can be an R matrix containing only 0/1s. #' See Details section for supported file types. #' @param loci.idx an optional vector of indices specifying the variants to load into the cache, indexed from 1. @@ -219,6 +222,9 @@ CacheHaplotypes.err <- function(err) { #' To achieve higher performance, kalis internally represents haplotypes in an efficient raw binary format in memory which cannot be directly viewed or manipulated in R. #' This function enables you to copy whole or partial views of haplotypes/variants out of this low-level format and into a standard R matrix of 0's and 1's. #' +#' @references +#' Aslett, L.J.M. and Christ, R.R. (2024) "kalis: a modern implementation of the Li & Stephens model for local ancestry inference in R", *BMC Bioinformatics*, **25**(1). Available at: \doi{10.1186/s12859-024-05688-8}. +#' #' @param loci.idx which variants to retrieve from the cache, specified as a (vector) index. #' This enables specifying variants by offset in the order they were loaded into the cache (from 1 to the number of variants). #' @param hap.idx which haplotypes to retrieve from the cache, specified as a (vector) index. @@ -295,6 +301,9 @@ QueryCache <- function(loci.idx = NULL, hap.idx = NULL) { #' In particular, this cache sits outside R's memory management and will never be garbage collected (unless R is quit or the package is unloaded). #' Therefore, this function is provided to enable freeing the memory used by this cache. #' +#' @references +#' Aslett, L.J.M. and Christ, R.R. (2024) "kalis: a modern implementation of the Li & Stephens model for local ancestry inference in R", *BMC Bioinformatics*, **25**(1). Available at: \doi{10.1186/s12859-024-05688-8}. +#' #' @return Nothing is returned. #' #' @seealso [CacheHaplotypes()] to create a haplotype cache; diff --git a/R/CacheHaplotypes_hdf5.R b/R/CacheHaplotypes_hdf5.R index 41fda9e..d5934a9 100644 --- a/R/CacheHaplotypes_hdf5.R +++ b/R/CacheHaplotypes_hdf5.R @@ -112,6 +112,7 @@ CacheHaplotypes.hdf5.hdf5r <- function(hdf5.file, else res <- t(h5.haps[hap.idx[current.step:upto],loci.idx]) current.step <<- upto + 1 + storage.mode(res) <- "integer" res } } @@ -201,10 +202,11 @@ CacheHaplotypes.hdf5.rhdf5 <- function(hdf5.file, } upto <- min(current.step + step.size - 1, N) if(!transpose) - res <- matrix(as.integer(rhdf5::h5read(hdf5.file, haps.path, index = list(loci.idx, hap.idx[current.step:upto]))), nrow = length(loci.idx)) + res <- rhdf5::h5read(hdf5.file, haps.path, index = list(loci.idx, hap.idx[current.step:upto])) else - res <- t(matrix(as.integer(rhdf5::h5read(hdf5.file, haps.path, index = list(hap.idx[current.step:upto], loci.idx))), ncol = length(loci.idx))) + res <- t(rhdf5::h5read(hdf5.file, haps.path, index = list(hap.idx[current.step:upto], loci.idx))) current.step <<- upto + 1 + storage.mode(res) <- "integer" res } } diff --git a/R/CacheSummary.R b/R/CacheSummary.R index b89510a..c923096 100644 --- a/R/CacheSummary.R +++ b/R/CacheSummary.R @@ -1,12 +1,15 @@ #' Retrieve information about the haplotype cache #' +#' @references +#' Aslett, L.J.M. and Christ, R.R. (2024) "kalis: a modern implementation of the Li & Stephens model for local ancestry inference in R", *BMC Bioinformatics*, **25**(1). Available at: \doi{10.1186/s12859-024-05688-8}. +#' #' @return -#' \code{CacheSummary()} prints information about the current state of the kalis cache. -#' Also invisibly returns a vector giving the dimensions of the cached haplotype data (num variants, num haplotypes), or \code{NULL} if the cache is empty. +#' `CacheSummary()` prints information about the current state of the kalis cache. +#' Also invisibly returns a vector giving the dimensions of the cached haplotype data (num variants, num haplotypes), or `NULL` if the cache is empty. #' -#' \code{N()} returns the number of haplotypes currently in the kalis cache, or \code{NULL} if the cache is empty. +#' `N()` returns the number of haplotypes currently in the kalis cache, or `NULL` if the cache is empty. #' -#' \code{L()} returns the number of variants currently in the kalis cache, or \code{NULL} if the cache is empty. +#' `L()` returns the number of variants currently in the kalis cache, or `NULL` if the cache is empty. #' #' @examples #' # First fill the cache with the toy data included in the package @@ -24,6 +27,7 @@ #' N() #' L() #' +#' @importFrom prettyunits pretty_bytes #' @export CacheSummary <- function() { N <- get("N", envir = pkgVars) diff --git a/R/CalcTraces.R b/R/CalcTraces.R new file mode 100644 index 0000000..0c105a7 --- /dev/null +++ b/R/CalcTraces.R @@ -0,0 +1,41 @@ +#' Fast Calculation of Matrix Trace and Hilbert Schmidt Norm +#' +#' Provides multithreaded calculation of trace and Hilbert Schmidt Norm of a matrix \eqn{PMP} (where \eqn{P} is a projection matrix and \eqn{M} is real symmetric) without explicitly forming \eqn{PMP}. +#' +#' \eqn{P} here is assumed to have the form \eqn{I-QQ'} for some matrix \eqn{Q} of orthogonal columns. +#' +#' @references +#' Christ, R.R., Wang, X., Aslett, L.J.M., Steinsaltz, D. and Hall, I. (2024) "Clade Distillation for Genome-wide Association Studies", bioRxiv 2024.09.30.615852. Available at: \doi{10.1101/2024.09.30.615852}. +#' +#' @param M +#' a real symmetric R matrix +#' @param tX +#' `t((Q %*% (J%*%Q)) - (M %*% Q))` +#' @param tQ +#' `t(Q)` +#' @param J +#' `crossprod(Q, M)` +#' @param from_recipient +#' haplotype index at which to start trace calculation --- useful for distributed computation (experimental feature, more documentation to come) +#' @param nthreads +#' the number of CPU cores to use. +#' By default uses the `parallel` package to detect the number of physical cores. +#' +#' @return +#' A list containing three elements: +#' +#' \describe{ +#' \item{`trace`}{the trace, \eqn{\mathrm{tr}(PMP)};} +#' \item{`hsnorm2`}{the *squared* Hilbert Schmidt Norm of \eqn{PMP}, \eqn{\mathrm{tr}((PMP)'PMP)};} +#' \item{`diag`}{the diagonal of \eqn{PMP}.} +#' } +#' +#' @examples +#' # TODO +#' +#' @export +CalcTraces <- function(M, tX, tQ, J, + from_recipient = 1L, + nthreads = min(parallel::detectCores(logical = FALSE), ncol(M))) { + .Call(CCall_CalcTraces, M, tX, tQ, J, from_recipient, nthreads) +} diff --git a/R/CladeMat.R b/R/CladeMat.R new file mode 100644 index 0000000..7bda476 --- /dev/null +++ b/R/CladeMat.R @@ -0,0 +1,83 @@ +#' Fast clade matrix construction +#' +#' Constructs a clade matrix using forward and backward tables. +#' The clade matrix captures genetic relatedness information in the distances from the Li & Stephens model that are not captured in the called clades. +#' +#' `CladeMat()` uses the forward and backward tables to construct the corresponding clade matrix which can then be tested, for example using a standard quadratic form score statistic. +#' +#' @references +#' Christ, R.R., Wang, X., Aslett, L.J.M., Steinsaltz, D. and Hall, I. (2024) "Clade Distillation for Genome-wide Association Studies", bioRxiv 2024.09.30.615852. Available at: \doi{10.1101/2024.09.30.615852}. +#' +#' @param fwd +#' a `kalisForwardTable` object, as returned by [MakeForwardTable()] and propagated to a target variant by [Forward()]. +#' This table must be at the same variant location as argument `bck`. +#' @param bck +#' a `kalisBackwardTable` object, as returned by [MakeBackwardTable()] and propagated to a target variant by [Backward()]. +#' This table must be at the same variant location as argument `fwd`. +#' @param M +#' a matrix with half the number of rows and columns as the corresponding forward/backward tables. +#' This matrix is overwritten in place with the clade matrix result for performance reasons. +#' @param unit.dist +#' the change in distance that is expected to correspond to a single mutation (typically \eqn{-\log(\mu)}) for the LS model) +#' @param thresh +#' a regularization parameter: differences of distances must exceed this threshold (in `unit.dist` units) in order to cause the introduction of a probabilistic clade. +#' Defaults to `0.2`. +#' @param max1var +#' a logical regularization parameter. +#' When `TRUE`, differences in distances exceeding 1 `unit.dist` are set to 1 (so that any edge in the latent ancestral tree with multiple mutations on them are treated as if only one mutation was on it). +#' @param nthreads +#' the number of CPU cores to use. +#' By default uses the `parallel` package to detect the number of physical cores. +#' +#' @return +#' A list, the first element contains a list of tied nearest neighbours (one for each haplotype). +#' Other elements of the returned list are for internal use by [PruneCladeMat()] to allow for efficient removal of singletons and sprigs. +#' +#' @examples +#' # TODO +#' +#' +#' @export CladeMat +CladeMat <- function(fwd, bck, M, unit.dist, thresh = 0.2, max1var = FALSE, + nthreads = min(parallel::detectCores(logical = FALSE), fwd$to_recipient-fwd$from_recipient+1)){ + + # input checks + ######################### + input_checks_for_probs_and_dist_mat(fwd,bck) + + if(nrow(fwd$alpha)%%2 !=0 || ncol(fwd$alpha)%%2 !=0 || nrow(bck$beta)%%2 !=0 || ncol(bck$beta)%%2 !=0 ){ + stop("fwd and bck must both have an even number of recipient haplotypes and an even number of donor haplotypes") + } + + if(!is.matrix(M) || !is.double(M) || nrow(M) != nrow(fwd$alpha)/2 || ncol(M) != ncol(fwd$alpha)/2){ + stop("M must be a matrix of doubles with nrow(fwd$alpha)/2 rows and ncol(fwd$alpha)/2 columns")} + + if(!is.atomic(unit.dist) || length(unit.dist)!=1L || !is.finite(unit.dist) || unit.dist <= 0){ + stop("unit.dist must be a number greater than 0")} + + if(is.integer(unit.dist)){ + unit.dist <- as.double(unit.dist) + } else { + if(!is.double(unit.dist)){stop("unit.dist must be a number greater than 0")}} + + if(!is.atomic(thresh) || length(thresh)!=1L || !is.finite(thresh) || thresh < 0 || thresh > 1){ + stop("thresh must be a number in [0,1]")} + + if(is.integer(thresh)){ + thresh <- as.double(thresh) + } else { + if(!is.double(thresh)){stop("thresh must be a number in [0,1]")}} + + if(!is.logical(max1var) || length(max1var) > 1){ + stop("max1var must be TRUE or FALSE")} + + nthreads <- as.integer(nthreads) + if(!is.integer(nthreads) || length(nthreads)!=1L || !is.finite(nthreads) || nthreads <= 0){ + stop("nthreads must be a positive integer")} + + if(nthreads > ncol(fwd$alpha)/2){ + stop("nthreads cannot be greater than the number of recipient haplotypes divided by 2.") + } + + invisible(.Call(CCall_CladeMat, fwd, bck, M, unit.dist, thresh, max1var, nthreads)) +} diff --git a/R/Clades.R b/R/Clades.R new file mode 100644 index 0000000..ef6fdcb --- /dev/null +++ b/R/Clades.R @@ -0,0 +1,693 @@ + + + +get_neigh <- function(x,i){ + idx <- x[[1]][c(i,i+1L)] + x[[2]][seq.int(idx[1],idx[2]-1L)] +} + +get_neigh_seq <- function(x, i, return.lengths = FALSE){ + from <- x[[1]][i] + nvec <- x[[1]][i+1] - from + if(return.lengths){ + list("seq" = x[[2]][sequence(nvec,from)], + "lengths" = nvec) + } else { + x[[2]][sequence(nvec,from)] + } +} + +#' Greedy sprig calling based on nearest neighbourhoods +#' +#' Infer sprigs (very small clades) within a local phylogeny based on one-directional nearest neighbourhoods assigned to each haplotype. +#' +#' Call maximal cliques within a directed graph where edges correspond to nearest neighbour relationships. +#' +#' @references +#' Christ, R.R., Wang, X., Aslett, L.J.M., Steinsaltz, D. and Hall, I. (2024) "Clade Distillation for Genome-wide Association Studies", bioRxiv 2024.09.30.615852. Available at: \doi{10.1101/2024.09.30.615852}. +#' +#' @param x +#' list of integers where each entry is a nearest neighbourhood as returned by [CladeMat()] +#' @param old.sprigs +#' If `TRUE`, use an earlier (undocumented) version of maximal clique calling, defaults to `FALSE`. +#' +#' @return +#' a list containing: +#' \describe{ +#' \item{`assignments`}{a vector of integers such that `$assignments[i]` gives the sprig to which haplotype `i` belongs. `NA` if a haplotype was not assigned to a sprig.} +#' \item{`to.prune`}{a vector of logicals, `$to.prune[i]==TRUE` when haplotype i has been assigned to a sprig} +#' \item{`num.sprigs`}{total number of sprigs calls, equal to `max($assignements, rm.na = TRUE)`} +#' } +#' +#' @examples +#' # TODO +#' +#' @export Sprigs +Sprigs <- function(x, old.sprigs = FALSE){ + + N <- length(x[[1]])-1L + roster <- rep(NA_integer_,N) + label <- 0L + done <- neighborhood.is.sprig.ind <- rep(FALSE,N) + + if(old.sprigs){ + + xx <- as.list(1:N) + for(i in 1:N){xx[[i]] <- get_neigh(x,i)} + + roster <- Sprigs_old(xx, use.forking = FALSE, nthreads = 1L, add.self = FALSE) + + label <- attr(roster,"n.sprigs") + + attributes(roster) <- NULL + + neighborhood.is.sprig.ind <- !is.na(roster) + + + # for(i in sample.int(N)){ + # + # if(done[i]){next} + # + # C = get_neigh(x,i) + # lC <- length(C) + # + # neigh.list <- get_neigh_seq(x, C, return.lengths = TRUE) + # + # temp.table <- table(neigh.list[[1]]) + # proposed.set <- as.integer(names(temp.table)[which(temp.table == lC)]) + # + # # in case the neighborhood of i overshoots into previously established cliques, it has a BIG effect in real data + # proposed.set <- proposed.set[!done[proposed.set]] + # + # if(length(proposed.set) > 1 && i %in% proposed.set){ + # + # label <- label + 1L + # # this repeated intersection step has truly a small effect but + # # helps guard us against the case where i might have erroneously added some candidates in its neighborhood that + # # do not belong in the clade and do not include some clade members. This steps helps us recover those clade members + # + # neigh.list <- get_neigh_seq(x, proposed.set, return.lengths = TRUE) + # temp.table <- table(neigh.list[[1]]) + # proposed.set <- as.integer(names(temp.table)[which(temp.table == lC)]) + # proposed.set <- proposed.set[!done[proposed.set]] + # + # roster[proposed.set] <- label + # done[proposed.set] <- TRUE + # neighborhood.is.sprig.ind[proposed.set] <- neigh.list[[2]] == lC # indicator that a haplotype's neighborhood is exactly the proposed sprig + # } + # } + + + } else { + + # the randomness in indices here is not really essential but + for(i in seq_len(N)){ #sample.int(N)){ + + if(done[i]){next} + + C = get_neigh(x,i) # keep simple get_neigh here because about 2x faster than get_neigh_seq when only querying one index + C <- C[!done[C]] # we know this will at least include i because of the if(!done[i]) check above and every neighborhood includes self + + lC <- length(C) + + if(lC == 1){ # C is an orphan that could never be a part of another clade because all of it's neighbors are already assigned + done[i] <- TRUE + next} + + neigh.list <- get_neigh_seq(x, C, return.lengths = TRUE) + + if(all(tabulate(factor(neigh.list[[1]],C),nbins = lC)==lC)){ + label <- label + 1L + roster[C] <- label + done[C] <- TRUE + neighborhood.is.sprig.ind[C] <- neigh.list[[2]] == lC # indicator that a haplotype's neighborhood is exactly the proposed sprig + } + } + + } + + list("assignments" = roster, + "to.prune" = neighborhood.is.sprig.ind, + "num.sprigs" = label) +} + + +#' @importFrom utils getFromNamespace +UpdateMatrixInPlace <- function(M,row.idx,col.idx,x){ + invisible(.Call(getFromNamespace("CCall_UpdateRealInPlace","kalis"),M, + as.integer(row.idx + (col.idx-1L)*nrow(M)),x)) +} + +# test <- matrix(as.double(1:144),12,12) +# UpdateMatrixInPlace(test,c(5,12,12),c(1,3,5),as.double(c(100,200,300))) + +#' Prune called sprigs or singletons from inferred clade matrix +#' +#' Use haplotype nearest neighbourhoods and other information returned by [CladeMat()] to efficiently remove structure corresponding to singletons or called sprigs from clade matrix 'M' +#' +#' @references +#' Christ, R.R., Wang, X., Aslett, L.J.M., Steinsaltz, D. and Hall, I. (2024) "Clade Distillation for Genome-wide Association Studies", bioRxiv 2024.09.30.615852. Available at: \doi{10.1101/2024.09.30.615852}. +#' +#' @param M +#' clade matrix such as that updated by [CladeMat()] +#' @param neigh +#' a list of nearest neighborhoods as retuned by [CladeMat()] +#' @param sprigs +#' a sprigs object as returned by [Sprigs()] +#' @param prune +#' a character indicating the type of information to be removed from the [CladeMat()]. +#' See Details. +#' @param from_recipient +#' haplotype index at which to start trace calculation --- useful for distributed computation (experimental feature, more documentation to come) +#' +#' @return +#' There is nothing returned. +#' +#' **NOTE:** for performance reasons, `M` is updated in-place. +#' +#' @examples +#' # TODO +#' +#' @export PruneCladeMat +PruneCladeMat <- function(M, neigh, sprigs, prune = "singleton.info", from_recipient = 1L){ + + if(!from_recipient%%2){stop("from_recipient must be odd and encode the index of the first recipient haplotype")} + + N.recipients <- 2 * ncol(M) + + if(prune=="singleton.info"){ + + v <- neigh[[2]][[2]] - neigh[[2]][[1]] + v <- v[seq.int(1,N.recipients,2)] + v[seq.int(2,N.recipients,2)] + UpdateMatrixInPlace(M, + seq.int(from = (from_recipient+1L)/2L,length.out = ncol(M)), + seq.int(from = 1, to = ncol(M)), + v) + + } else if(prune=="sprigs"){ + + neigh.list <- get_neigh_seq(neigh[[1]], which(sprigs$to.prune), return.lengths = TRUE) + + hap.idx <- cbind(neigh.list[[1]], rep(which(sprigs$to.prune), times = neigh.list[[2]])) + + key <- rep(0L,nrow(hap.idx)) + + hap.idx.odd <- hap.idx%%2 + temp.hap.idx.odd <- hap.idx.odd[,1] + hap.idx.odd[,2] + key[temp.hap.idx.odd==2] <- 1L + key[temp.hap.idx.odd==0] <- 4L + temp.hap.idx.odd <- hap.idx.odd[,1] - hap.idx.odd[,2] + key[temp.hap.idx.odd==1] <- 3L + key[temp.hap.idx.odd==-1] <- 2L + + sim.updates <- rep((neigh[[2]][[3]]-neigh[[2]][[2]])[sprigs$to.prune], times = neigh.list[[2]]) + + # if M was not already dediploided: + # M[hap.idx] <- M[hap.idx] - sim.updates + + # since M is dediploided, we run + + if(!is.na(match(1L,key))){ + to.fetch <- key==1L + UpdateMatrixInPlace(M, + (hap.idx[to.fetch,1L]+1L)/2L, + (hap.idx[to.fetch,2L]+1L)/2L, + sim.updates[to.fetch])} + + if(!is.na(match(2L,key))){ + to.fetch <- key==2L + UpdateMatrixInPlace(M, + hap.idx[to.fetch,1L]/2L, + (hap.idx[to.fetch,2L]+1L)/2L, + sim.updates[to.fetch])} + + if(!is.na(match(3L,key))){ + to.fetch <- key==3L + UpdateMatrixInPlace(M, + (hap.idx[to.fetch,1L]+1L)/2L, + (hap.idx[to.fetch,2L])/2L, + sim.updates[to.fetch])} + + if(!is.na(match(4L,key))){ + to.fetch <- key==4L + UpdateMatrixInPlace(M, + hap.idx[to.fetch,1L]/2L, + hap.idx[to.fetch,2L]/2L, + sim.updates[to.fetch])} + + } else { + stop("unrecognized option for prune") + } + invisible(NULL) +} + + + + + +# Predecessor to CladeMat with some extra functionality, kept for future reference. +# #' Probabilistic Clades +# #' +# #' Utility for calling probabilistic clades at, in between, or excluding variants. +# #' +# #' ...? longer description +# #' +# #' @references +# #' Christ, R.R., Wang, X., Aslett, L.J.M., Steinsaltz, D. and Hall, I. (2024) "Clade Distillation for Genome-wide Association Studies", bioRxiv 2024.09.30.615852. Available at: \doi{10.1101/2024.09.30.615852}. +# #' +# #' @param fwd +# #' a `kalisForwardTable` object, as returned by [MakeForwardTable()] and propagated to a target variant by [Forward()]. +# #' This table must be at the same variant location as argument `bck`. +# #' @param bck +# #' a `kalisBackwardTable` object, as returned by [MakeBackwardTable()] and propagated to a target variant by [Backward()]. +# #' This table must be at the same variant location as argument `fwd`. +# #' @param pars +# #' a `kalisParameters` object, as returned by [Parameters()]. +# #' @param beta.theta.opts +# #' a list; see Details in [DistMat()] documentation page. +# #' @param safety.checks +# #' a logical, should safety checks be applied to the distances? +# #' See [DistMat()]. +# #' @param neighbors +# #' a logical, should nearest neighbors be pre-calculated? +# #' See [Neighbors()]. +# #' @param use.forking +# #' a logical, should forked processes be used? +# #' @param forking.chunk.size +# #' ...? +# #' @param mc.preschedule +# #' ...? +# #' @param nthreads +# #' the number of CPU cores to use. +# #' By default no parallelism is used. +# #' +# #' @return +# #' a `kalisClades` object encoding probabilistic clade calls +# #' +# #' @importFrom data.table frank +# #' @export Clades +# Clades <- function(fwd, bck, pars, beta.theta.opts = NULL, +# safety.checks = FALSE, neighbors = FALSE, +# #use.bettermc = FALSE, +# use.forking = FALSE, +# forking.chunk.size = 100L, +# mc.preschedule = FALSE, # FALSE is more conservative of memory but means many new forked processes need to be launched so it's slower than TRUE +# nthreads = 1L){ +# # currently only outputs a list but should eventually also output a matrix of integers and an attribute list of clades +# +# unit.mut.dist <- -log(pars$pars$mu) +# +# M <- DistMat(fwd, bck, beta.theta.opts = beta.theta.opts, nthreads = nthreads) +# +# if(safety.checks){ +# M[!is.finite(M)] <- 0 +# diag(M) <- NA_real_ +# } +# +# rank_donors_func <- function(x, type="linear_20", neighbors = FALSE, mac.range = c(NA,NA)){ +# rank_donors_func_res <- as.list(1:length(x)) +# for(j in 1:length(x)){ +# d.ranks <- data.table::frank(M[,x[j]], na.last = FALSE, ties.method = "first") +# phi <- c(diff(M[order(d.ranks),x[j]]),0) +# phi[1] <- 0 +# phi <- phi / unit.mut.dist # an N-long vector +# if(type == "linear_20"){ +# phi[phi > 1] <- 1 +# phi[phi < 0.2] <- 0 +# } else if(type == "step_80"){ +# phi[phi < 0.8] <- 0 +# phi[phi > 0] <- 1 +# } +# +# if(!is.na(mac.range[1])){phi[1:mac.range[1]] <- 0} +# if(!is.na(mac.range[2])){phi[mac.range[2]:nrow(fwd$alpha)] <- 0} +# +# i <- which(phi!=0) +# +# # compress phi +# clades <- cbind(i,phi[i]) # if i = integer(0) (no clades called), clades will be a 0 x 2 matrix. +# attr(d.ranks,"clades") <- clades +# +# if(neighbors){ +# attr(d.ranks,"neighbors") <- if(nrow(clades)){ +# match(2:clades[1,1],d.ranks) +# } else { +# NA_integer_ +# } +# } +# +# rank_donors_func_res[[j]] <- d.ranks +# } +# rank_donors_func_res +# } +# +# +# chunks <- chunk_int(ncol(M), chunk.size = forking.chunk.size) +# +# if(use.forking){ +# # if(use.bettermc){ +# # rank.list <- bettermc::mclapply(chunks, rank_donors_func, neighbors = neighbors, mc.preschedule = mc.preschedule, mc.cores=nthreads, mc.share.copy = FALSE) +# # } else { +# rank.list <- parallel::mclapply(chunks, rank_donors_func, neighbors = neighbors, mc.preschedule = mc.preschedule, mc.cores=nthreads) +# #} +# } else { +# rank.list <- lapply(chunks, rank_donors_func, neighbors = neighbors) # this matrix is ranked in each column, not scaled by Ne or Mu +# } +# +# rank.list <- unlist(rank.list,recursive = FALSE) +# +# attr(rank.list,"from_recipient") <- fwd$from_recipient +# attr(rank.list,"to_recipient") <- fwd$to_recipient +# +# class(rank.list) <- c("kalisClades","list") # rank.list is a list where each element is a vector of ranks with attributes clades +# +# rank.list +# } +# +# +# #' Neighbors +# #' +# #' Utility for calling tied nearest neighbors for each recipient haplotype +# #' @param x a `kalisClades` object returned by [Clades()] +# #' @param use.forking a logical, should forked processes be used? +# #' @param nthreads the number of CPU cores to use. Currently, no parallelism is used. +# #' @return +# #' a `kalisNeighbors` encoding the nearest neighbors for each recipient haplotype +# #' +# #' @export Neighbors +# Neighbors <- function(x, +# #use.bettermc = FALSE, +# use.forking = FALSE, nthreads = 1L){ +# # currently only supports list x but should support matrix x as well +# +# if(!is.null(attr(x[[1]],"neighbors"))){ +# +# neighbors <- lapply(x,function(z){attr(z,"neighbors")}) +# +# } else { +# +# +# call_neighbors <- function(z){ +# # x should be a vector of ranks with attribute "clades" +# clades <- attr(z,"clades") +# if(nrow(clades)){ +# match(2:clades[1,1],z) +# } else { +# NA_integer_ +# } +# } +# +# if(use.forking){ +# # if(use.bettermc){ +# # neighbors <- bettermc::mclapply(x, call_neighbors, mc.cores = nthreads, mc.share.copy = FALSE) +# # } else { +# neighbors <- parallel::mclapply(x, call_neighbors, mc.cores = nthreads) +# #} +# } else { +# neighbors <- lapply(x,call_neighbors) +# } +# } +# +# attr(neighbors,"from_recipient") <- attr(x,"from_recipient") +# attr(neighbors,"to_recipient") <- attr(x,"to_recipient") +# class(neighbors) <- c("kalisNeighbors","list") +# +# neighbors +# } + + +# Old sprigs and clademat have many ideas (including adjustments for pop structure) +# that are not incorporated into new functions -- ideas to come back to +Sprigs_old <- function(x, use.forking = FALSE, nthreads = 1L, add.self = TRUE){ + + # this version of Sprigs still has a bit of randomness in it's sprig building between runs on the same input + # which can be seen by running table(is.na(s),is.na(s1)) where s and s1 are the output of Sprigs + # for the same data run twice. it's relatively minor + + if(inherits(x,"kalisClades")){ + if(!is.null(attr(x[[1]],"neighbors"))){ + x <- lapply(x,function(z){attr(z,"neighbors")}) + } else { + stop("The kalisClades provided do not have the Neighbors pre-calculated, use kalis::Neighbors to obtain them and then pass them to Sprigs") + } + } + + # x here is a list that's N long st x[[i]] gives the indices of the (tied) nearest neighbors of i + roster <- rep(NA_integer_,length(x)) + + label <- 0L + # add self to own neighborhood + if(add.self){x <- mapply(c,x,1:length(x))} + + done <- rep(FALSE,length(x)) + to.prune <- rep(NA_integer_,length(x)) + + # the randomness in indices here is not really essential but + for(i in sample.int(length(x))){ + if(!done[i]){ + + # pulling out cliques in the graph that are fully connected bi-directionally: + # if i is in a clique, rather trivially, this will return the full clique + # Note, we require i %in% proposed.set to prevent called cliques from being broken up later in the for loop: + # if i is not in a clique, then it's still possible for a partial clique to be returned that doesn't include i if i projects onto + # a superset or subset of a clique. If i supercedes a clique member and projects + # onto a subset, this clique subset will be overwritten later by the larger clique. However, it would still be possible for a i that comes + # after all of the clade members in our for loop to break up the clique by projecting onto a subset of them. + # Enforcing i %in% proposed.set avoids that possibility. + + # we also require that length(proposed.set) > 1 so that we don't end up with solo cliques being called that are just i by itself. + + #missing_sprig_6 <- c(6103,1804, 6015, 4726, 4752, 807,3118,3991,6466,6068, 10,1250, 3669, 3658, 1997, 1399, 1116, 3738, 5015) + proposed.set <- Reduce(intersect,x[x[[i]]]) + # we can really speed up this Reduce by using table and looking for entries that are present in all groups + + # in case the neighborhood of i overshoots into previously established cliques, it has a BIG effect in real data + proposed.set <- proposed.set[!done[proposed.set]] + + if(length(proposed.set) > 1 && i %in% proposed.set){ + + label <- label + 1L + # this repeated intersection step has truly a small effect but + # helps guard us against the case where i might have erroneously added some candidates in its neighborhood that + # do not belong in the clade and do not include some clade members. This steps helps us recover those clade members + proposed.set <- Reduce(intersect,x[proposed.set]) + proposed.set <- proposed.set[!done[proposed.set]] + + # if(!all(is.na(roster[missing_sprig_6])) && !all(roster[missing_sprig_6]==6L)){ + # print(i) + # print(label) + # print(roster[missing_sprig_6]) + # browser() + # } + roster[proposed.set] <- label + done[proposed.set] <- TRUE + } + } + + # individuals that are not part of a fully connected clique are left with NA_integer_ on the roster + } + + # Size frequency spectrum: table(table(roster)) + + attr(roster,"n.sprigs") <- label + attr(roster,"from_recipient") <- attr(x,"from_recipient") + attr(roster,"to_recipient") <- attr(x,"to_recipient") + class(roster) <- c("kalisSprigs","integer") + + roster +} + +##Testing Sprigs +## kalis::Sprigs(list( +## 1:2, +## 1:2, +## 3:7, +## 1:10, +## 1:10, +## 1:10, +## 5:11 +## )) +# +# #' CladeMat OLD +# #' +# #' Utility for contructing a probabilistic clade matrix +# #' @param x a `kalisClades` object returned by [Clades()] +# #' @param ploidy an integer, the ploidy of the organism +# #' @param sprigs.to.prune a `kalisSprigs` object returned by [Sprigs()] encoding sprigs that should be excluded from the matrix returned +# #' @param assemble a logical, if `FALSE` return the clade matrix as a list of columns rather than as a symmetrized matrix +# #' @param use.forking a logical, should forked processes be used? +# #' @param nthreads the number of CPU cores to use. Currently, no parallelism is used. +# #' @return +# #' a matrix representation of the probabilistic clades provided +# #' +# #' @export CladeMat_old +#CladeMat_old <- function(x, ploidy = 2L, sprigs.to.prune = NULL, assemble = TRUE, +# #use.bettermc = FALSE, +# use.forking = FALSE, forking.chunk.size = 100L, mc.preschedule = FALSE, nthreads = 1L){ +# +# # prepare sprigs +# if(is.null(sprigs.to.prune)){sprigs.to.prune <- integer()} +# sl <- length(sprigs.to.prune) +# if(sl){sprig.sizes <- tabulate(sprigs.to.prune)} +# +# n.recipient.samples <- as.integer(length(x)/ploidy) +# +# chunks <- chunk_int(n.recipient.samples, chunk.size = forking.chunk.size) +# +# if(ploidy == 1){ +# omega_func <- function(s){ +# omega_func_res <- as.list(1:length(s)) +# for(j in 1:length(s)){ +# +# N <- length(x[[s[j]]]) +# +# idx <- attr(x[[s[j]]],"clades")[,1] +# phi <- attr(x[[s[j]]],"clades")[,2] +# +# # prune sprig +# if(sl && !is.na(sprigs.to.prune[s[j]]) && length(idx) && sprig.sizes[sprigs.to.prune[s[j]]] == idx[1]){ +# idx <- idx[-1] +# phi <- phi[-1] +# } +# +# # we know that phi[N] = 0, so there must always be a 0 appended +# omega_func_res[[j]] <- inverse.rle(list("values" = c(rev(cumsum(rev(phi/idx))),0), +# "lengths" = diff(c(0,idx,N))))[x[[s[j]]]] +# +# } +# omega_func_res +# } +# +# } else if(ploidy == 2){ +# +# omega_func <- function(s){ +# omega_func_res <- as.list(1:length(s)) +# for(j in 1:length(s)){ +# N <- length(x[[s[j]]]) +# +# idx <- attr(x[[s[j]*2-1]],"clades")[,1] +# phi <- attr(x[[s[j]*2-1]],"clades")[,2] +# +# idx2 <- attr(x[[s[j]*2]],"clades")[,1] +# phi2 <- attr(x[[s[j]*2]],"clades")[,2] +# +# +# if(sl && !is.na(sprigs.to.prune[s[j]*2-1]) && length(idx) && sprig.sizes[sprigs.to.prune[s[j]*2-1]] == idx[1]){ +# idx <- idx[-1] +# phi <- phi[-1] +# } +# +# if(sl && !is.na(sprigs.to.prune[s[j]*2]) && length(idx2) && sprig.sizes[sprigs.to.prune[s[j]*2]] == idx2[1]){ +# idx2 <- idx2[-1] +# phi2 <- phi2[-1] +# } +# +# # we know that phi[N] = 0, so there must always be a 0 appended +# w <- inverse.rle(list("values" = c(rev(cumsum(rev(phi/idx))),0), +# "lengths" = diff(c(0,idx,N))))[x[[s[j]*2-1]]] + +# inverse.rle(list("values" = c(rev(cumsum(rev(phi2/idx2))),0), +# "lengths" = diff(c(0,idx2,N))))[x[[s[j]*2]]] +# +# omega_func_res[[j]] <- w[seq(1,N,by=2)] + w[seq(2,N,by=2)] +# } +# omega_func_res +# } +# +# } else { +# stop("Relatedness currently only supports ploidy = 1 or 2") +# } +# +# # we don't simplify this list to a matrix at this stage to help preserve memory. +# if(use.forking){ +# # if(use.bettermc){ +# # res <- bettermc::mclapply(chunks, omega_func, mc.preschedule = mc.preschedule, mc.cores=nthreads, mc.share.copy = FALSE) +# # } else { +# res <- parallel::mclapply(chunks, omega_func, mc.preschedule = mc.preschedule, mc.cores=nthreads) +# #} +# } else { +# res <- lapply(chunks, omega_func) +# } +# +# res <- unlist(res, recursive = FALSE) +# +# if(assemble){ +# res <- do.call(cbind,res) +# res <- 0.5 * (res + t(res)) +# } +# +# res +#} + +chunk_int <- function(n, chunk.size = 100){ + # subdivide 1:n into chunks of size at most chunk.size + if(n < 1){stop("n must be an integer >= 1")} + interval.starts <- seq(1,n,by=chunk.size) + interval.ends <- c(interval.starts[-1]-1,n) + res <- as.list(1:length(interval.starts)) + for(i in 1:length(interval.starts)){ + res[[i]] <- seq.int(interval.starts[i],interval.ends[i])} + res +} +# +# use.forking <- FALSE +# use.forking <- TRUE +# nthreads <- 8L +# +# require(kalis) +# haps.file <-"~/Dropbox/Benchmarking_StatGen/kalis_benchmarking_tests/benchmark_on_msprime_simulations/msprime_sim_N_100000_L_10000.hdf5" +# CacheHaplotypes(haps = haps.file,loci.idx = 1:1000,hap.idx = 1:24000)#SmallHaps) +# #CacheHaplotypes(SmallHaps) +# pars <- Parameters(rep(1e-2, L() - 1)) +# fwd <- MakeForwardTable(pars) +# bck <- MakeBackwardTable(pars) +# Forward(fwd,pars,floor(L()/2),1) +# Backward(bck,pars,floor(L()/2),1) +# # +# start <- proc.time() +# rl2 <- Clades(fwd, bck, pars, neighbors = TRUE, safety.checks = FALSE, use.forking = use.forking, nthreads = nthreads) +# finish <- proc.time() - start +# print(finish) +# # +# sprigs <- Sprigs(rl2) +# start <- proc.time() +# M <- CladeMat(rl2, sprigs.to.prune = sprigs, use.forking = use.forking, nthreads=nthreads) +# finish <- proc.time() - start +# print(finish) +# + +# rl<- readRDS("~/Downloads/clades_test.rds") +# all.equal(rl,rl2) + +# sprigs <- CallSprigs(rl, use.forking = use.forking, nthreads = nthreads) +# +# #hist(sapply(rl,function(x){nrow(attr(x,"clades"))}),breaks=20) +# #mean(unlist(lapply(rl,function(x){attr(x,"clades")[,2]}))>0.5) +# +# rl <- CladeMat(rl, ploidy = 2L, sprigs.to.prune = sprigs, assemble = FALSE, use.forking = use.forking, nthreads = nthreads) +# rl <- do.call(cbind,rl) +# rl <- 0.5 * (rl + t(rl)) +# +# r2 <- -r2 +# class(r2) <- c("kalisDistanceMatrix","matrix") +# plot(r2) +# +# M <- DistMat(fwd,bck) +# M <- M + t(M) +# +# perm <- fastcluster::hclust(stats::as.dist(M),method="average")$order +# +# layout(matrix(1:3,1)) +# print(lattice::levelplot(M[perm,][,rev(perm)], +# useRaster = TRUE, +# col.regions = grDevices::colorRampPalette(RColorBrewer::brewer.pal(9,name = "BuPu"))(100), +# yaxt = "n", xaxt = "n", xlab = "", ylab = "", xaxt = "n")) +# print(lattice::levelplot(r1[perm,][,rev(perm)], +# useRaster = TRUE, +# col.regions = grDevices::colorRampPalette(RColorBrewer::brewer.pal(9,name = "BuPu"))(100), +# yaxt = "n", xaxt = "n", xlab = "", ylab = "", xaxt = "n")) +# print(lattice::levelplot(r2[perm,][,rev(perm)], +# useRaster = TRUE, +# col.regions = grDevices::colorRampPalette(RColorBrewer::brewer.pal(9,name = "BuPu"))(100), +# yaxt = "n", xaxt = "n", xlab = "", ylab = "", xaxt = "n")) +# + diff --git a/R/FB.R b/R/FB.R index 51476d3..1023c80 100644 --- a/R/FB.R +++ b/R/FB.R @@ -6,9 +6,12 @@ #' `Forward` implements the forward algorithm to advance the Li and Stephens rescaled hidden Markov model forward probabilities to a new target variant. #' Naturally, this can only propagate a table to variants downstream of its current position. #' -#' For mathematical details please see Section 2 of the kalis paper (TODO: ref). +#' For mathematical details please see Section 2 of the kalis paper (Aslett and Christ, 2024). #' Note that the precise formulation of the forward equation is determined by whether the flag `use.spiedel` is set in the parameters provided in `pars`. #' +#' @references +#' Aslett, L.J.M. and Christ, R.R. (2024) "kalis: a modern implementation of the Li & Stephens model for local ancestry inference in R", *BMC Bioinformatics*, **25**(1). Available at: \doi{10.1186/s12859-024-05688-8}. +#' #' @param fwd a `kalisForwardTable` object, as returned by #' [MakeForwardTable()]. #' @param pars a `kalisParameters` object, as returned by @@ -111,7 +114,7 @@ Forward <- function(fwd, #' variant. #' Naturally, this can only propagate a table to variants upstream of its current position. #' -#' For mathematical details please see Section 2 of the kalis paper (TODO: ref). +#' For mathematical details please see Section 2 of the kalis paper (Aslett and Christ, 2024). #' Note that the precise formulation of the backward equation is determined by whether the flag `use.spiedel` is set in the parameters provided in `pars`. #' #' **Beta-theta space** @@ -123,6 +126,8 @@ Forward <- function(fwd, #' A backward table in beta-theta space (with `beta.theta = TRUE`) can be propagated to an upstream variant without incorporating that variant, thereby moving to beta space (`beta.theta = FALSE`), and vice versa. #' However, while a backward table in beta space (`beta.theta = FALSE`) can be updated to incorporate the current variant, a backward table that is already in beta-theta space can not move to beta space without changing variants -- that would involve "forgetting" the current variant (see Examples). #' +#' @references +#' Aslett, L.J.M. and Christ, R.R. (2024) "kalis: a modern implementation of the Li & Stephens model for local ancestry inference in R", *BMC Bioinformatics*, **25**(1). Available at: \doi{10.1186/s12859-024-05688-8}. #' #' @param bck a `kalisBackwardTable` object, as returned by #' [MakeBackwardTable()]. diff --git a/R/GoldMasterClades.R b/R/GoldMasterClades.R new file mode 100644 index 0000000..96e6e09 --- /dev/null +++ b/R/GoldMasterClades.R @@ -0,0 +1,58 @@ +##### Gold Master ##### + +goldmaster.blobby <- function(alpha,beta,recipient_hap, unit.dist = 1, thresh = 0.2){ + f <- function(x,c){ifelse(x nrow(alpha)-1 || as.integer(left_recipient_hap)!=left_recipient_hap){ + stop("left_recipient_hap must be an integer in [1,nrow(alpha)-1]") + } + if(c < 0 || c > 1){stop("c must be in [0,1]")} + + v <- goldmaster.blobby(alpha[,1],beta[,1],recipient_hap=left_recipient_hap,unit.dist,thresh) + v <- v + goldmaster.blobby(alpha[,2],beta[,2],recipient_hap=left_recipient_hap+1L,unit.dist,thresh) + v[seq.int(1,length(v),2)] + v[seq.int(2,length(v),2)] +} + +goldmaster.blobby.full <- function(alpha,beta,left_recipient_hap, unit.dist, thresh){ + if(ncol(alpha) != ncol(beta)){stop("alpha and beta must have the same number of columns")} + if(nrow(alpha) != nrow(beta)){stop("alpha and beta must have the same number of rows")} + + if( nrow(alpha)%%2 || ncol(alpha)%%2 ){stop("alpha must be a matrix with an even number of rows and columns")} + + if(left_recipient_hap <= 0 || left_recipient_hap > nrow(alpha)-1 || as.integer(left_recipient_hap)!=left_recipient_hap){ + stop("left_recipient_hap must be an integer in [1,nrow(alpha)-1]") + } + + if(thresh < 0 || thresh > 1){stop("thresh must be in [0,1]")} + + n.samps <- ncol(alpha)/2 + + M <- matrix(0,n.samps,n.samps) + + for(i in 1:n.samps){ + v <- goldmaster.blobby(alpha[,2*i-1],beta[,2*i-1],recipient_hap=left_recipient_hap+2*i-2L,unit.dist,thresh) + v <- v + goldmaster.blobby(alpha[,2*i],beta[,2*i],recipient_hap=left_recipient_hap+2*i-1L,unit.dist,thresh) + M[,i] <- v[seq.int(1,length(v),2)] + v[seq.int(2,length(v),2)] + } + M +} + + +CladeMat.GM <- function(fwd,bck,unit.dist,thresh = 0.2){ + M <- goldmaster.blobby.full(fwd$alpha,bck$beta,left_recipient_hap = bck$from_recipient,unit.dist,thresh) + M +} \ No newline at end of file diff --git a/R/IndividualSequenceIO_H5.R b/R/IndividualSequenceIO_H5.R index c06ca58..d6b593c 100644 --- a/R/IndividualSequenceIO_H5.R +++ b/R/IndividualSequenceIO_H5.R @@ -11,6 +11,9 @@ #' #' Note that if `hdf5.file` exists but does not contain a dataset named `haps`, then `WriteHaplotypes` will simply create a `haps` dataset within the existing file. #' +#' @references +#' Aslett, L.J.M. and Christ, R.R. (2024) "kalis: a modern implementation of the Li & Stephens model for local ancestry inference in R", *BMC Bioinformatics*, **25**(1). Available at: \doi{10.1186/s12859-024-05688-8}. +#' #' @param hdf5.file the name of the file which the haplotypes are to be written to. #' @param haps a vector or a matrix where each column is a haplotype to be stored in the file `hdf5.file`. #' @param hap.ids a character vector naming haplotypes when writing, or which haplotypes are to be read. @@ -38,7 +41,8 @@ #' haps <- matrix(sample(0:1, n.haps*n.vars, replace = TRUE), #' nrow = n.vars, ncol = n.haps) #' -#' # ... write them to a file, giving alphabetic letters "A" through "T" as the #' # haplotype names ... +#' # ... write them to a file, giving alphabetic letters "A" through "T" as the +#' # haplotype names ... #' WriteHaplotypes("~/myhaps.h5", haps, hap.ids = LETTERS[1:20]) #' #' # ... and confirm we can read a chosen portion back. Try to read back @@ -99,6 +103,7 @@ WriteHaplotypes <- function(hdf5.file, haps, write.loci.ids <- TRUE } + # NOTE: this previously had to be removed to allow running on WashU cluster -- if error triggers here again LA and RC to discuss if(length(find.package("rhdf5", quiet = TRUE)) == 0) { stop("The WriteHaplotypes function requires the optional rhdf5 package to be installed (see Bioconductor https://bioconductor.org/packages/rhdf5)") } @@ -248,12 +253,12 @@ WriteHaplotypes <- function(hdf5.file, haps, # Write message(glue("Writing {N} haplotype(s) of size {L} ...\n")) - rhdf5::h5write(as.array(haps), h5, haps.name, index = list(NULL, (N.old+1):(N.old+N))) + rhdf5::h5writeDataset(as.array(haps), h5, haps.name, index = list(NULL, (N.old+1):(N.old+N))) if(write.hap.ids) { - rhdf5::h5write(as.array(as.character(hap.ids)), h5, hap.ids.name, index = list((N.old+1):(N.old+N))) + rhdf5::h5writeDataset(as.array(as.character(hap.ids)), h5, hap.ids.name, index = list((N.old+1):(N.old+N))) } if(write.loci.ids) { - rhdf5::h5write(as.array(as.character(loci.ids)), h5, loci.ids.name, index = list(1:L)) + rhdf5::h5writeDataset(as.array(as.character(loci.ids)), h5, loci.ids.name, index = list(1:L)) } rhdf5::h5closeAll() @@ -351,6 +356,7 @@ ReadHaplotypes <- function(hdf5.file, if(!identical(loci.ids, NA) && !identical(loci.idx, NA)) { stop("Can only specify one of loci.ids or loci.idx argument.") } + # NOTE: this previously had to be removed to allow running on WashU cluster -- if error triggers here again LA and RC to discuss if(length(find.package("rhdf5", quiet = TRUE)) == 0) { stop("The ReadHaplotypes function requires the optional rhdf5 package to be installed (see Bioconductor https://bioconductor.org/packages/rhdf5)") } diff --git a/R/Iterator.R b/R/Iterator.R new file mode 100644 index 0000000..4a2984d --- /dev/null +++ b/R/Iterator.R @@ -0,0 +1,538 @@ +#' Build an efficient iterator over loci +#' +#' Function factory to create a `kalisForwardIterator` for propagating a forward table iteratively over target variants using a table cache and optimal checkpointing. +#' +#' See example. +#' +#' @references +#' Christ, R.R., Wang, X., Aslett, L.J.M., Steinsaltz, D. and Hall, I. (2024) "Clade Distillation for Genome-wide Association Studies", bioRxiv 2024.09.30.615852. Available at: \doi{10.1101/2024.09.30.615852}. +#' +#' @param pars +#' a `kalisParameters` object, as returned by [Parameters()]. +#' @param ram.ckpts +#' an integer specifying the number of checkpoints to store in RAM. +#' @param targets +#' a vector of variants to iterate over (starting with the most downstream target). +#' @param base.fwd.table +#' a `kalisForwardTable` either at the most upstream target, or if the targets are evenly spaced, one interval upstream of the most upstream target. +#' If `NULL` (the default), this is interpreted as the prior `Pi`, see [Parameters()]. +#' @param disk.ckpts +#' an integer specifying the number of checkpoints to store on disk. +#' @param disk.dir +#' a path to a directory where a temporary folder may be made to store checkpoints on disk. +#' @param from_recipient +#' first recipient haplotype included in the tables of the cache, if creating a partial forward table. +#' By default all are included from the first recipient haplotype. +#' Haplotypes are indexed from 1. +#' @param to_recipient +#' last recipient haplotype included in the tables of the cache, if creating a partial forward table. +#' By default all are included upto the last recipient haplotype. +#' Haplotypes are indexed from 1. +#' @param lookup.tables +#' an optional list as returned by [CalcCheckpointTables()]. +#' @param cache +#' a `kalisCheckpointTable` object, as returned by [CreateForwardTableCache()] or this function. +#' By default `NULL`, which causes this function to create a new cache. +#' @param save.cache +#' a logical. +#' When `TRUE` does not reliquish the table cache upon exhaustion of the iterator. +#' Defaults to `FALSE`. +#' @param force.unif +#' a logical, if `TRUE` iterate over targets as if they were uniformly spaced. +#' WARNING: DO NOT use this in conjunction with the targets method, still experimental. +#' With `force.unif = TRUE`, the resulting iterator will appear to be targeting the first `length(targets)` variants with all methods, but in fact will be silently iterating over the original targets. +#' +#' @return +#' A function for iterating over the set of target variants. +#' The returned function has prototype: +#' +#' `function(fwd, pars, t, nthreads = 1)` +#' +#' which matches the standard [Forward()] function, but which uses the table cache to speed up propagation to the target variant. +#' See [Forward()] for an explanation of arguments. +#' +#' @seealso +#' [MakeForwardTable()] to create a `kalisForwardTable`; +#' [CreateForwardTableCache()] to create a cache which can be used with this function. +#' +#' @examples +#' \dontrun{ +#' data("SmallHaps") +#' CacheHaplotypes(SmallHaps) +#' pars <- Parameters() +#' fwd <- MakeForwardTable(pars) +#' bck <- MakeBackwardTable(pars) +#' Iter <- ForwardIterator(2) +#' for(t in targets(Iter)){ +#' Iter(fwd,pars,t) +#' Backward(bck,pars,t) +#' print(paste("Mean Distance at locus",t,"is",mean(DistMat(fwd,bck)))) +#' } +#' } +#' +#' @export +ForwardIterator <- function(pars, + ram.ckpts = 1L, + targets = 1:L(), + base.fwd.table = NULL, + disk.ckpts = 0, + disk.dir = NULL, + from_recipient = 1, + to_recipient = Inf, + lookup.tables = NULL, + cache = NULL, + save.cache = FALSE, + force.unif = FALSE){ + + force(force.unif) + + # Sanity checks + #################### + ram.ckpts <- as.integer(ram.ckpts) + if(ram.ckpts <= 0){stop("ram.ckpts must be a positive integer")} + + + # Check to ensure that the cache provided can actually be recycled for this problem + + if(!is.null(cache)){ + for(i in 1:length(cache)){ + if(cache[[i]]$from_recipient!=from_recipient){stop("The provided cache must have the same from_recipient as currently requested.")} + if(cache[[i]]$to_recipient!=min(to_recipient,N())){stop("The provided cache must have the same to_recipient as currently requested.")} + } + } + + + + # Only RAM checkpoints for now + ################## + # for now we ignore disk checkpoints: + num.available.ckpts <- ram.ckpts + + if(disk.ckpts != 0 | !is.null(disk.dir)){ + warning("disk checkpoints not yet implemnted, proceeding ignoring disk.ckpts and disk.dir") + } + + + # Cover case when base.fwd.table provided + ################## + + if(is.null(base.fwd.table)){ + use.pi <- TRUE + }else{ + if( !("kalisForwardTable" %in% class(base.fwd.table)) ){stop("base.fwd.table is not a kalisForwardTable")} + if(any(targets < base.fwd.table$l)){stop("no targets may be less than base.fwd.table$l")} + use.pi <- FALSE + } + + + if(force.unif){ + + if(!use.pi){ + if(targets[1] != base.fwd.table$l){stop("When using force.unif, for now, base.fwd.table$l must be AT the first target")} + } + + targets.idx <- targets + targets <- 1:length(targets) + base.fwd.table.l <- 1 + + } else { + targets.idx <- NULL + base.fwd.table.l <- base.fwd.table$l + } + + + # Figure out whether using uniform or general checkpointing + #################### + # by default + uniform.ckpts <- FALSE + first.target.given <- FALSE + + intervals <- unique(diff(targets)) + + if(length(intervals) == 1){ + if(use.pi){ + if(targets[1] == intervals){ uniform.ckpts <- TRUE } + }else{ + if(targets[1] == base.fwd.table.l){ uniform.ckpts <- TRUE; first.target.given <- TRUE} + if( (targets[1] - intervals) == base.fwd.table.l){ uniform.ckpts <- TRUE; first.target.given <- FALSE} + } + } + rm(intervals) + + + # Perform Table Benchmarking + #################### + # bench <- BenchmarkTables() + + propagation.cost <- 1:length(targets) + + + # Solve Optimal Checkpointing + ############################### + + if(uniform.ckpts){ + + if(is.null(lookup.tables)){ + message("Calculating Optimal Checkpoint Schedule") + lookup.tables <- CalcCheckpointTables(propagation.cost,num.available.ckpts) + } + + cost.table <- lookup.tables$cost + index.table <- lookup.tables$index + + + + + if(!first.target.given){ + + uniform_SolveSchedule <- uniform_MakeSolveSchedule(targets,cost.table,index.table) + assign("uniform_SolveSchedule",uniform_SolveSchedule,envir = environment(uniform_SolveSchedule)) + + uniform_SolveSchedule(1,length(targets),num.available.ckpts) + + cost <- uniform_LookupCost(length(targets),num.available.ckpts,cost.table) + + }else{ + + uniform_SolveSchedule <- uniform_MakeSolveSchedule(targets[-1],cost.table,index.table) + assign("uniform_SolveSchedule",uniform_SolveSchedule,envir = environment(uniform_SolveSchedule)) + + uniform_SolveSchedule(1,length(targets)-1,num.available.ckpts) + cost <- uniform_LookupCost(length(targets)-1,num.available.ckpts,cost.table) + + } + + sch <- uniform_trim.sch(uniform_SolveSchedule) + + # I don't believe we need to modify this schedule in order to still request the baseline locus as our last target + + }else{ + + stop("Solving the non-uniform checkpointing problem is not yet implemented.") + + } + + + # Construct Table Cache + ######################## + + max.tables <- max(sch$k) + + if(is.null(cache) || length(cache) < max.tables){ + cache <- CreateForwardTableCache(pars = pars,size = Inf, from_recipient = from_recipient, to_recipient = to_recipient, max.tables = max.tables) + + }else{ + + for(i in 1:length(cache)){ + + if(i > max.tables){ + cache[[i]] <- NULL + }else{ + # check if parameters match, if not, overwrite with warning + if(cache[[i]]$pars.sha256!=pars$sha256){ + warning("The provided cache was initialized with parameters that are different from those currently in pars. Overwritting the pars in the provided cache...") + cache[[i]]$pars.sha256 <- pars$sha256 + } + ResetTable(cache[[i]]) + } + } + + } + + rm(pars); gc() + + + # Construct Iterator + ###################### + + UpdateCache <- MakeUpdateCache(sch, use.pi, targets.idx = targets.idx) + + current.sch <- data.frame("i" = Inf) + + current.target.index <- length(targets) + + iter.internal <- function(fwd, pars, t, nthreads = 1){ + + if(force.unif){t <- match(t,targets.idx)} + + if(current.target.index == 0){warning("This iterator has been exhausted."); return()} + + if(t == targets[current.target.index]){ + current.target.index <<- current.target.index - 1 + }else{ + stop(paste("The next target locus for this iterator is",targets[current.target.index],"not",t)) + } + + if(identical(lobstr::obj_addr(fwd),lobstr::obj_addr(base.fwd.table))){ + stop("base.fwd.table cannot point to the same table as fwd: they must be created independently.") + } + + #print(c(current.sch$i, t)) + if(current.sch$i > t){ current.sch <<- UpdateCache(cache, pars, nthreads, base.fwd.table) } + + if(current.sch$k != 0){ + CopyTable(to = fwd, from = cache[[current.sch$k]]) + }else{ + if(use.pi){ + ResetTable(fwd) + }else{ + CopyTable(to = fwd, from = base.fwd.table) + } + } + + # Clean Up cache unless we're instructed to save it + if(current.target.index == 0){ + if(!save.cache){ + cache <<- NULL + gc() + } + } + + if(force.unif){ + Forward(fwd = fwd, pars = pars, t = targets.idx[t], nthreads = nthreads) + } else { + Forward(fwd = fwd, pars = pars, t = t, nthreads = nthreads) + } + } + + class(iter.internal) <- c("kalisIterator",class(iter.internal)) + + iter.internal +} + + +targets <- function(x) { # put this declaration above and below because it seems that order determines whether targets is recognized + UseMethod("targets") +} + +#' @export +targets.kalisIterator <- function(x){ + if(!"kalisIterator"%in%class(x)){stop("argument must be a kalisIterator")} + rev(get("targets", envir = environment(x))) +} + +targets <- function(x) { + UseMethod("targets") +} + +#' @export +print.kalisIterator <- function(x, ...){ + if(!"kalisIterator"%in%class(x)){stop("argument must be a kalisIterator")} + + if(get("current.target.index", envir=environment(x)) == 0){ + "This is an exhausted kalisIterator." + }else{ + target.range <- range(get("targets", envir = environment(x))) + message(paste("A kalisIterator for",length(get("targets", envir = environment(x))),"targets ranging from",target.range[1],"to",target.range[2]),appendLF = TRUE) + message(paste("Contains",get("max.tables", envir = environment(x)),"checkpoints using ~",utils::object.size(get("cache", envir = environment(x)))/1e9,"Gb of RAM"),appendLF = TRUE) + message(paste("Next target locus:",get("targets", envir = environment(x))[get("current.target.index", envir = environment(x))]),appendLF = TRUE) + message("",appendLF = TRUE) + } +} + +#' @importFrom graphics axis +#' @export +plot.kalisIterator <- function(x, ...){ + if(!"kalisIterator"%in%class(x)){stop("argument must be a kalisIterator")} + sch <- get("sch",envir = environment(x)) + loci <- get("targets",envir = environment(x)) + plot(sch$i[-c(1,nrow(sch))],sch$k[-c(1,nrow(sch))],type="h",lwd=1,bty="n",ylab="K",xlab="locus",las=1,ylim=c(0,max(sch$k)),xlim=range(loci),xaxt="n",yaxt="n", ...) + p.loci <- pretty(loci) + axis(1,at = p.loci ,pos=0) + axis(2,at = pretty(0:max(sch$k)),pos=p.loci[1],las=2) +} + + +#' Calculate Checkpoint Tables +#' +#' Calculate look up tables for solving optimal checkpointing problems with dynamic programming. +#' +#' @references +#' Christ, R.R., Wang, X., Aslett, L.J.M., Steinsaltz, D. and Hall, I. (2024) "Clade Distillation for Genome-wide Association Studies", bioRxiv 2024.09.30.615852. Available at: \doi{10.1101/2024.09.30.615852}. +#' +#' @param propagation.cost +#' a non-negative vector such that `propagation.cost[i]` gives the relative amount of time or cost required to propagate `i` steps. +#' @param max.num.checkpoints +#' the maximum number of checkpoints that should be considered when building the checkpoint table. +#' @param use.R +#' a logical, when `TRUE` use base R rather than C implementation of table building. +#' Defaults to `FALSE`. +#' +#' @return +#' A list containing: +#' +#' \describe{ +#' \item{`cost`}{the matrix \eqn{F} in Christ et al. (2024)} +#' \item{`index`}{the matrix \eqn{H} in Christ et al. (2024)} +#' } +#' +#' @export +CalcCheckpointTables <- function(propagation.cost,max.num.checkpoints, use.R = FALSE){ + start <- proc.time() + + propagation.cost <- as.numeric(propagation.cost) + + max.n <- length(propagation.cost) + + # the first row corresponds to solving a 0 locus problem + cost.table <- matrix(0,nrow=max.n + 1,ncol= max.num.checkpoints + 1) + index.table <- matrix(0L,nrow=max.n,ncol= max.num.checkpoints) + + cost.table[,1] <- c(0,cumsum(as.numeric(propagation.cost))) + + if(use.R){ + + for(k in 1:max.num.checkpoints){ + for(n in 1:max.n){ + # now solving a n long problem with k checkpoints + v <- cost.table[1:n,k + 1] + propagation.cost[1:n] + cost.table[n:1,k] + x <- which.min(v) + index.table[n, k] <- x + cost.table[n + 1,k + 1] <- v[x] + } + print(paste(k,"done at",c(proc.time() - start)[3]/3600,"hours from start.")) + } + }else{ + invisible(.Call(CCall_OptCkpt, cost.table, index.table, propagation.cost)) + } + + cost.table <- cost.table[-1,] + return(list("cost" = cost.table,"index" = index.table)) +} + + +MakeUpdateCache <- function(sch, use.pi, targets.idx = NULL){ + + force(targets.idx) + + exhausted <- FALSE + current.ins <- leading.ins <- 1 + ancestor <- 1 + cost <- 0 + + function(cache, pars, nthreads, base.fwd.table){ + + if(exhausted){ + warning("This iterator has been exhausted.") + return(data.frame("k" = 0L,"i" = 0L)) + } + + repeat{ + + candidates <- which(sch$i[1:(current.ins-1)] <= sch$i[current.ins]) + ancestor <<- candidates[which.max(sch$i[candidates])] + + if(sch$i[leading.ins + 1] < sch$i[ancestor]){ # if the next checkpoint destination is on the left side of the current ancestor + current.ins <<- ancestor + return(sch[current.ins,]) + } else { + leading.ins <<- leading.ins + 1 + current.ins <<- leading.ins + } + + candidates <- which(sch$i[1:(current.ins-1)] <= sch$i[current.ins]) + ancestor <<- candidates[which.max(sch$i[candidates])] + + + if(sch$i[current.ins] != 0){ # we're not at the end yet + + # Update Cache + kk <- sch$k[current.ins] + akk <- sch$k[ancestor] + + if(akk != 0){ + CopyTable(cache[[ kk ]],cache[[ akk ]]) + }else{ + if(use.pi){ + ResetTable(cache[[kk]]) # Pi could also be the baseline table here for the entire interval + }else{ + CopyTable(to = cache[[kk]],base.fwd.table) + } + } + + # advance cache table from ancestor to current checkpoint destination + if(!is.null(targets.idx)){ + Forward(cache[[kk]],pars,targets.idx[sch$i[current.ins]],nthreads) + } else { + Forward(cache[[kk]],pars,sch$i[current.ins],nthreads) + } + if(sch$i[current.ins + 1] > sch$i[current.ins]){ + next + }else{ + return(sch[current.ins,]) + } + + }else{ # we are at the end + exhausted <<- TRUE + rm(cache) + rm(sch, envir = parent.env(environment())) # remove large objects from memory + gc() + return(data.frame("k" = 0L,"i" = 0L)) + } + } + } +} + + + + +uniform_MakeSolveSchedule <- function(loci,cost.table,index.table){ + + uniform_SolveSchedule <- function(){NULL} + + sch.k <- 0L + sch.i <- 0L + nrow.sch <- 1 + + function(i,j,num.available.ckpts){ # i is the index of the first locus and j is the index of the last locus in the problem to solve (from indicies[i] to indicies[j]) + + l.d <- j-i+1 + + k <- as.integer(min(l.d-1,num.available.ckpts)) + if(k==0){return(cost.table[l.d,1])} + + # at this point, we know that num.available.ckpts is at least 1 + # and l.d is at least 2 + + # If neither of the above cases, create a new instruction + ins <- which.max(sch.k < 0) # this is the first emtpy slot for an instruction + if(ins == nrow.sch){ # then we're about to assign to the last schedule entry and need to add on space for instructions before we can call obj.func + sch.k <<- c(sch.k, rep(-1L,50)) + sch.i <<- c(sch.i, rep(-1L,50)) + nrow.sch <<- length(sch.k) + ins <- which.max(sch.k < 0) + } + + sch.k[ins] <<- k + ckpt.location <- index.table[l.d,k] + sch.i[ins] <<- loci[i-1+ckpt.location] + + # solve right problem if the interval to the right contains at least one target locus + if(l.d > ckpt.location){ uniform_SolveSchedule(i+ckpt.location, j, num.available.ckpts - 1) } + #if(l.d > ckpt.location){ get("uniform_SolveSchedule", envir = parent.frame())(i+ckpt.location, j, num.available.ckpts - 1) } + + + # solve left problem if the interval to the left contains at least one target locus + if(ckpt.location > 1){ uniform_SolveSchedule(i, i-2+ckpt.location,num.available.ckpts) } + #if(ckpt.location > 1){ get("uniform_SolveSchedule", envir = parent.frame())(i, i-2+ckpt.location,num.available.ckpts) } + + + return() + } +} + +uniform_LookupCost<- function(L,num.available.ckpts,cost.table){cost.table[L,as.integer(min(L-1,num.available.ckpts)) + 1]} + +uniform_trim.sch <- function(f){ + sch.k <- get("sch.k",envir = environment(f)) + sch.i <- get("sch.i",envir = environment(f)) + # prune + if(any(sch.k == -1)){ + upper.limit <- which.max(sch.k == -1) - 1 + sch.k <- sch.k[1:upper.limit] + sch.i <- sch.i[1:upper.limit] + } + + # create dataframe schedule + sch <- data.frame("k" = c(sch.k,0L), "i" = c(sch.i,0L)) +} diff --git a/R/Parameters.R b/R/Parameters.R index bf40ca1..ecc4693 100644 --- a/R/Parameters.R +++ b/R/Parameters.R @@ -6,11 +6,16 @@ #' #' **NOTE:** the corresponding haplotype data *must* have already been inserted into the kalis cache by a call to [CacheHaplotypes()], since this function performs checks to confirm the dimensionality matches. #' -#' TODO: add kalis paper cross ref. #' See page 3 in Supplemental Information for the original ChromoPainter paper (Lawson et al., 2012) for motivation behind our parameterisation, which is as follows: #' #' \deqn{\rho = 1 - \exp(-s \times cM^\gamma)}{\rho = 1 - exp(-s * cM^\gamma)} #' +#' For a complete description, see the main kalis paper, Aslett and Christ (2024). +#' +#' @references +#' Aslett, L.J.M. and Christ, R.R. (2024) "kalis: a modern implementation of the Li & Stephens model for local ancestry inference in R", *BMC Bioinformatics*, **25**(1). Available at: \doi{10.1186/s12859-024-05688-8}. +#' +#' Lawson, D.J., Hellenthal, G., Myers, S. and Falush, D. (2012) "Inference of population structure using dense haplotype data", *PLoS genetics*, **8**(1). Available at: \doi{10.1371/journal.pgen.1002453}. #' #' @param cM a vector specifying the recombination distance between variants in centimorgans. #' Note element i of this vector should be the distance between variants `i` and `i+1` (not `i` and `i-1`), and thus length one less than the number of variants. @@ -25,10 +30,6 @@ #' #' @seealso [Parameters()] to use the resulting recombination probabilities to construct a `kalisParameters` object. #' -#' @references -#' Lawson, D. J., Hellenthal, G., Myers, S., & Falush, D. (2012). Inference of -#' population structure using dense haplotype data. *PLoS genetics*, **8**(1). -#' #' @examples #' # Load the mini example data and recombination map from the package built-in #' # dataset #' data("SmallHaps") @@ -119,6 +120,13 @@ CalcRho <- function(cM = 0, s = 1, gamma = 1, floor = TRUE) { #' #' Note that there is a computational cost associated with non-uniform copying probabilities, so it is recommended to leave the default of uniform probabilities when appropriate (**Note:** *do not* specify a uniform matrix when uniform probabilities are intended, since this would end up incurring the computational cost of non-uniform probabilities). #' +#' @references +#' Aslett, L.J.M. and Christ, R.R. (2024) "kalis: a modern implementation of the Li & Stephens model for local ancestry inference in R", *BMC Bioinformatics*, **25**(1). Available at: \doi{10.1186/s12859-024-05688-8}. +#' +#' Lawson, D.J., Hellenthal, G., Myers, S.R. and Falush, D. (2012) "Inference of population structure using dense haplotype data", *PLoS Genetics*, **8**(1). Available at: \doi{10.1371/journal.pgen.1002453}. +#' +#' Speidel, L., Forest, M., Shi, S. and Myers, S.R. (2019) "A method for genome-wide genealogy estimation for thousands of samples", *Nature Genetics*, **51**, p. 1321-1329. Available at: \doi{10.1038/s41588-019-0484-x}. +#' #' @param rho recombination probability vector (must be \eqn{L-1} long). #' See [CalcRho()] for assistance constructing this from a recombination #' map/distances. @@ -140,13 +148,6 @@ CalcRho <- function(cM = 0, s = 1, gamma = 1, floor = TRUE) { #' @seealso [MakeForwardTable()] and [MakeBackwardTable()] which construct table objects which internally reference a parameters environment; #' [Forward()] and [Backward()] which propagate those tables according to the Li and Stephens model. #' -#' @references -#' Lawson, D. J., Hellenthal, G., Myers, S., & Falush, D. (2012). Inference of -#' population structure using dense haplotype data. *PLoS genetics*, **8**(1). -#' -#' Speidel, L., Forest, M., Shi, S., & Myers, S. (2019). A method for -#' genome-wide genealogy estimation for thousands of samples. *Nature Genetics*, **51**(1321–1329). -#' #' @examples #' # Load the mini example data and recombination map from the package built-in #' # dataset #' data("SmallHaps") @@ -238,6 +239,7 @@ Parameters <- function(rho = rep(0, get("L", envir = pkgVars)-1), res } +#' @importFrom utils tail #' @export print.kalisParameters <- function(x, ...) { if(is.matrix(x$pars$Pi)) { diff --git a/R/Probs.R b/R/Probs.R index 3396a07..fd0a30b 100644 --- a/R/Probs.R +++ b/R/Probs.R @@ -41,7 +41,10 @@ #' #' When provided, `M` must have dimensions matching that of `fwd$alpha`. #' Typically, that is simply \eqn{N \times N}{N x N} for \eqn{N} haplotypes. -#' However, if kalis is being run in a distributed manner, \code{M} will be a \eqn{N \times R}{N x R} matrix where \eqn{R} is the number of recipient haplotypes on the current machine. +#' However, if kalis is being run in a distributed manner, `M` will be a \eqn{N \times R}{N x R} matrix where \eqn{R} is the number of recipient haplotypes on the current machine. +#' +#' @references +#' Aslett, L.J.M. and Christ, R.R. (2024) "kalis: a modern implementation of the Li & Stephens model for local ancestry inference in R", *BMC Bioinformatics*, **25**(1). Available at: \doi{10.1186/s12859-024-05688-8}. #' #' @param fwd a forward table as returned by [MakeForwardTable()] and propagated to a target variant by [Forward()]. #' Must be at the same variant as `bck` (unless `bck` is in "beta-theta space" in which case if must be downstream ... see [Backward()] for details). @@ -162,7 +165,12 @@ PostProbs <- function(fwd, bck, unif.on.underflow = FALSE, M = NULL, beta.theta. #' #' When provided, `M` must have dimensions matching that of `fwd$alpha`. #' Typically, that is simply \eqn{N \times N}{N x N} for \eqn{N} haplotypes. -#' However, if kalis is being run in a distributed manner, \code{M} will be a \eqn{N \times R}{N x R} matrix where \eqn{R} is the number of recipient haplotypes on the current machine. +#' However, if kalis is being run in a distributed manner, `M` will be a \eqn{N \times R}{N x R} matrix where \eqn{R} is the number of recipient haplotypes on the current machine. +#' +#' @references +#' Aslett, L.J.M. and Christ, R.R. (2024) "kalis: a modern implementation of the Li & Stephens model for local ancestry inference in R", *BMC Bioinformatics*, **25**(1). Available at: \doi{10.1186/s12859-024-05688-8}. +#' +#' Speidel, L., Forest, M., Shi, S. and Myers, S.R. (2019) "A method for genome-wide genealogy estimation for thousands of samples", *Nature Genetics*, **51**, p. 1321-1329. Available at: \doi{10.1038/s41588-019-0484-x}. #' #' @param fwd a forward table as returned by [MakeForwardTable()] and propagated to a target variant by [Forward()]. #' Must be at the same variant as `bck` (unless `bck` is in "beta-theta space" in which case if must be downstream ... see [Backward()] for details). @@ -184,9 +192,6 @@ PostProbs <- function(fwd, bck, unif.on.underflow = FALSE, M = NULL, beta.theta. #' #' If you wish to plot this matrix or perform clustering, you may want to symmetrize the matrix first. #' -#' @references -#' Speidel, L., Forest, M., Shi, S., & Myers, S. (2019). A method for genome-wide genealogy estimation for thousands of samples. *Nature Genetics*, **51**(1321–1329). -#' #' @seealso #' [PostProbs()] to calculate the posterior marginal probabilities \eqn{p_{ji}}{p_(j,i)}; #' [Forward()] to propagate a Forward table to a new variant; @@ -221,7 +226,7 @@ DistMat <- function(fwd, bck, type = "raw", M = NULL, beta.theta.opts = NULL, nthreads = min(parallel::detectCores(logical = FALSE), fwd$to_recipient-fwd$from_recipient+1)){ if(identical(nthreads, "R")) { - if(!is.null(M)){stop("M cannot be NULL when requesting the gold master R version with R nthreads")} + if(!is.null(M)){stop("M must be NULL when requesting the gold master R version with R nthreads")} warning("Warning: using gold master R implementation.") return(invisible(DistMat.GM(fwd, bck, type, beta.theta.opts))) } @@ -256,7 +261,7 @@ DistMat <- function(fwd, bck, type = "raw", M = NULL, beta.theta.opts = NULL, -input_checks_for_probs_and_dist_mat <- function(fwd,bck,beta.theta.opts){ +input_checks_for_probs_and_dist_mat <- function(fwd,bck,beta.theta.opts = NULL){ # RUN GENERAL CHECKS if(fwd$l == 2147483647L){stop("forward table has not been initialized but not propagated to a variant in {1,...,L}.")} @@ -295,7 +300,7 @@ input_checks_for_probs_and_dist_mat <- function(fwd,bck,beta.theta.opts){ if(!inherits(beta.theta.opts$pars,"kalisParameters")){stop("beta.theta.opts$pars must be kalisParameters object.")} - if(!is.numeric(beta.theta.opts$bias) || beta.theta.opts$bias<=0 || beta.theta.opts$bias>=1 ){stop("bias must be numeric and within (0,1). To obtain a distance matrix AT a particular variant, advance bck to that variant in beta space.")} + if(!is.numeric(beta.theta.opts$bias) || beta.theta.opts$bias<0 || beta.theta.opts$bias>1 ){stop("bias must be numeric and within [0,1]. To obtain a distance matrix AT a particular variant, advance bck to that variant in beta space.")} total.rho <- sum(beta.theta.opts$pars$pars$rho[fwd$l:(bck$l - 1)]) @@ -304,24 +309,37 @@ input_checks_for_probs_and_dist_mat <- function(fwd,bck,beta.theta.opts){ } - return(list("rho.fwd" = rho.fwd, "rho.bck" = rho.bck)) + return(invisible(list("rho.fwd" = rho.fwd, "rho.bck" = rho.bck))) }else{ if(bck$l != fwd$l){stop("variant position of the forward table and backward table do not match.")} - return(NULL) + return(invisible(NULL)) } } -#' Plotting function for a kalisDistanceMatrix object +#' Plotting function for a distance matrix #' #' Clusters the given distance matrix and generates a heatmap to display it. #' -#' @param d a kalisDistanceMatrix +#' @references +#' Aslett, L.J.M. and Christ, R.R. (2024) "kalis: a modern implementation of the Li & Stephens model for local ancestry inference in R", *BMC Bioinformatics*, **25**(1). Available at: \doi{10.1186/s12859-024-05688-8}. +#' +#' @param x +#' a distance matrix, such as returned by [DistMat()]. +#' @param cluster.method +#' the agglomeration method to be used, which is passed to the [fastcluster::hclust()] function. +#' This must be (an unambiguous abbreviation of) one of "single", "complete", "average", "mcquitty", "ward.D", "ward.D2", "centroid" or "median". +#' @param ... +#' further arguments to be passed on to the underlying [lattice::levelplot()] plotting function. #' -#' @return There is nothing returned. +#' @return +#' No return value, called for side effects. +#' +#' @examples +#' # TODO #' #' @export plot.kalisDistanceMatrix <- function(x, cluster.method = "average", ...) { diff --git a/R/SmallHaps-data.R b/R/SmallHaps-data.R index 26dfcfa..fe57838 100644 --- a/R/SmallHaps-data.R +++ b/R/SmallHaps-data.R @@ -14,7 +14,9 @@ #' @keywords datasets #' #' @references -#' Kelleher, J., Etheridge, A. M., & McVean, G. (2016). Efficient coalescent simulation and genealogical analysis for large sample sizes. *PLoS computational biology*, **12**(5). +#' Aslett, L.J.M. and Christ, R.R. (2024) "kalis: a modern implementation of the Li & Stephens model for local ancestry inference in R", *BMC Bioinformatics*, **25**(1). Available at: \doi{10.1186/s12859-024-05688-8}. +#' +#' Kelleher, J., Etheridge, A.M. and McVean, G. (2016) "Efficient coalescent simulation and genealogical analysis for large sample sizes", *PLoS Computational Biology*, **12**(5). Available at: \doi{10.1371/journal.pcbi.1004842}. #' #' @examples #' data("SmallHaps") diff --git a/R/TableCache.R b/R/TableCache.R new file mode 100644 index 0000000..be31026 --- /dev/null +++ b/R/TableCache.R @@ -0,0 +1,413 @@ +#' Create cache for forward tables +#' +#' Create an in-memory cache for forward tables to improve efficiency when iterating in reverse along the haplotype sequences. +#' +#' If the objective is to run the Li and Stephens hidden Markov model both forwards and backwards to the same locus (and to do so for every possible locus), then considerable efficiency can be achieved by first performing a full scan forwards, filling a geometrically spaced cache whilst doing so. +#' Then, by working backwards, the backward propagation moves one locus at a time and the forward propagation can move backwards by moving forward from a recently cached local table. +#' +#' Memory for a cache can be allocated using this function and should then be filled with [FillTableCache()]. +#' To use the cache, then instead of using the [Forward()] function, use [ForwardUsingTableCache()]. +#' +#' @references +#' Christ, R.R., Wang, X., Aslett, L.J.M., Steinsaltz, D. and Hall, I. (2024) "Clade Distillation for Genome-wide Association Studies", bioRxiv 2024.09.30.615852. Available at: \doi{10.1101/2024.09.30.615852}. +#' +#' @param pars +#' a `kalisParameters` object, as returned by [Parameters()]. +#' @param size +#' the maximum amount of RAM (in GB) to devote to this cache. +#' By default, 1GB. +#' @param from_recipient +#' first recipient haplotype if creating a partial forward table cache. +#' By default includes from the first recipient haplotype. +#' @param to_recipient +#' last recipient haplotype if creating a partial forward table cache. +#' By default includes to the last recipient haplotype. +#' @param max.tables +#' positive integer indicating the maximum number of tables to use in the cache. +#' Both this and `size` will be honoured, so the number of tables may be smaller than this. +#' By default, equals \eqn{\lfloor\log_2(L)\rfloor}. +#' +#' @return +#' A list of forward tables representing a cache and ready to be filled is returned. +#' +#' @seealso +#' [MakeForwardTable()] to make a forward table; +#' [FillTableCache()] to fill a cache; +#' [ForwardUsingTableCache()] to use a cache; +#' [Forward()] for forward function without using a cache. +#' +#' Alternatively, see [ForwardIterator()] to create an iterator which internally creates or uses a table cache. +#' +#' @examples +#' \dontrun{ +#' # This code assumes you have already: +#' # i) cached the haplotypes using CacheHaplotypes function +#' # ii) setup parameters in a variable called pars +#' # iii) set the number of loci in a variable called L +#' +#' # Allocate up to 10GB to a cache, with parameters already setup in pars ... +#' cache <- CreateForwardTableCache(pars, 10) +#' # ... and fill it +#' FillTableCache(cache, pars, nthreads = 8) +#' +#' # Create forward and backward tables +#' fwd <- MakeForwardTable(pars) +#' bck <- MakeBackwardTable(pars) +#' +#' # Then reach every locus faster by iterating backwards, using the cache to +#' # move the forward table into position faster +#' for(l in L:1) { +#' Backward(bck, pars, l, nthreads = 8) +#' ForwardUsingTableCache(fwd, pars, cache, l, nthreads = 8) +#' # Do whatever work is required at +#' # every locus here using fwd and bck +#' } +#' } +#' +#' @export +CreateForwardTableCache <- function(pars, size = 1, from_recipient = 1, to_recipient = Inf, max.tables = 0) { + if(!("kalisParameters" %in% class(pars))) { + stop("The pars argument is not a valid parameters object.") + } + + N <- get("N", envir = pkgVars) + L <- get("L", envir = pkgVars) + + if(anyNA(N)) { + stop("No haplotypes cached ... cannot determine table size until cache is loaded with CacheAllHaplotypes().") + } + + if(from_recipient>to_recipient) { + stop("from_recipient must be smaller than to_recipient.") + } + if(from_recipient < 1) { + from_recipient <- 1 + } + if(to_recipient > N) { + to_recipient <- N + } + delN <- to_recipient-from_recipient+1 + if(!is.vector(max.tables) || !is.numeric(max.tables) || length(max.tables) != 1 || max.tables < 0) { + stop("max.tables must be a positive scalar.") + } + + cat("Found", N, "haplotypes in the cache.") + if((delN*N+2*delN+1)*8/1e9 > size) { + stop(size, "GB is not big enough for even 1 table.") + } + cat(" Constructing table cache of appropriate size ...\n") + + if(max.tables == 0) { + max.tables <- floor(log2(L)) + } + cache <- list() + i <- 1 + while((length(cache) == 0 || ((utils::object.size(cache)*(length(cache)+1))/length(cache))/1e9 < size) && length(cache)