From 6c9f9f648ad09e7c66302c9b9d397544b0e79f59 Mon Sep 17 00:00:00 2001
From: fabian-s
Date: Mon, 20 Jun 2016 18:23:10 +0200
Subject: [PATCH] fix code formatting

---
 R/fpca.sc.R | 324 +++++++++++++++++++++++++++------------------------
 1 file changed, 168 insertions(+), 156 deletions(-)

diff --git a/R/fpca.sc.R b/R/fpca.sc.R
index 622f0d93..b56e11f2 100644
--- a/R/fpca.sc.R
+++ b/R/fpca.sc.R
@@ -10,7 +10,7 @@
 ##'
 ##' FPCA via kernel smoothing of the covariance function, with the diagonal
 ##' treated separately, was proposed in Staniswalis and Lee (1998) and much
-##' extended by Yao et al. (2005), who introduced the "PACE" method.
+##' extended by Yao et al. (2005), who introduced the 'PACE' method.
 ##' \code{fpca.sc} uses penalized splines to smooth the covariance function, as
 ##' developed by Di et al. (2009) and Goldsmith et al. (2013).
 ##'
@@ -57,7 +57,7 @@
 ##' values. This can be very slow. If set to \code{2} (the default), a two-step
 ##' method is used that obtains a naive covariance estimate which is then smoothed.
 ##' @param integration quadrature method for numerical integration; only
-##' \code{"trapezoidal"} is currently supported.
+##' \code{'trapezoidal'} is currently supported.
 ##' @return An object of class \code{fpca} containing:
 ##' \item{Yhat}{FPC approximation (projection onto leading components)
 ##' of \code{Y.pred} if specified, or else of \code{Y}.}
@@ -117,22 +117,22 @@
 ##' d = as.numeric(colnames(cd4)))
 ##'
 ##' ## plot data for one subject, with curve and interval estimates
-##' EX.MM.m = melt(EX.MM, id = "d")
+##' EX.MM.m = melt(EX.MM, id = 'd')
 ##' ggplot(EX.MM.m, aes(x = d, y = value, group = variable, color = variable, linetype = variable)) +
 ##' geom_path() +
 ##' scale_linetype_manual(values = c(fitted = 1, ptwise.UB = 2,
 ##' ptwise.LB = 2, simul.UB = 3, simul.LB = 3)) +
 ##' scale_color_manual(values = c(fitted = 1, ptwise.UB = 2,
 ##' ptwise.LB = 2, simul.UB = 3, simul.LB = 3)) +
-##' labs(x = "Months since seroconversion", y = "Total CD4 Cell Count")
+##' labs(x = 'Months since seroconversion', y = 'Total CD4 Cell Count')
 ##'
 ##' ## plot estimated mean function
 ##' ggplot(Fit.mu, aes(x = d, y = mu)) + geom_path() +
-##' labs(x = "Months since seroconversion", y = "Total CD4 Cell Count")
+##' labs(x = 'Months since seroconversion', y = 'Total CD4 Cell Count')
 ##'
 ##' ## plot the first two estimated basis functions
-##' Fit.basis.m = melt(Fit.basis, id = "d")
-##' ggplot(subset(Fit.basis.m, variable %in% c("phi.1", "phi.2")), aes(x = d,
+##' Fit.basis.m = melt(Fit.basis, id = 'd')
+##' ggplot(subset(Fit.basis.m, variable %in% c('phi.1', 'phi.2')), aes(x = d,
 ##' y = value, group = variable, color = variable)) + geom_path()
 ##'
 ##' ## input a dataframe instead of a matrix
@@ -156,168 +156,180 @@
 ##' @importFrom Matrix nearPD Matrix t as.matrix
 ##' @importFrom mgcv gam predict.gam
 ##' @importFrom gamm4 gamm4
-fpca.sc <- function(Y = NULL, ydata = NULL, Y.pred=NULL, argvals = NULL, random.int = FALSE,
-    nbasis = 10, pve = .99, npc = NULL, var = FALSE, simul = FALSE, sim.alpha = .95,
-    useSymm = FALSE, makePD = FALSE, center=TRUE, cov.est.method = 2,
-    integration="trapezoidal") {
+fpca.sc <- function(Y = NULL, ydata = NULL, Y.pred = NULL, argvals = NULL, random.int = FALSE,
+  nbasis = 10, pve = 0.99, npc = NULL, var = FALSE, simul = FALSE, sim.alpha = 0.95,
+  useSymm = FALSE, makePD = FALSE, center = TRUE, cov.est.method = 2, integration = "trapezoidal") {
 
-    stopifnot((!is.null(Y) && is.null(ydata))||(is.null(Y) && !is.null(ydata)))
+  stopifnot((!is.null(Y) && is.null(ydata)) || (is.null(Y) && !is.null(ydata)))
 
-    # if data.frame version of ydata is provided
-    sparseOrNongrid <- !is.null(ydata)
-    if (sparseOrNongrid) {
-        stopifnot(ncol(ydata) == 3)
-        stopifnot(c(".id", ".index", ".value") == colnames(ydata))
-        stopifnot(is.null(argvals))
-        Y = irreg2mat(ydata)
-        argvals = sort(unique(ydata$.index))
-    }
+  # if data.frame version of ydata is provided
+  sparseOrNongrid <- !is.null(ydata)
+  if (sparseOrNongrid) {
+    stopifnot(ncol(ydata) == 3)
+    stopifnot(c(".id", ".index", ".value") == colnames(ydata))
+    stopifnot(is.null(argvals))
+    Y = irreg2mat(ydata)
+    argvals = sort(unique(ydata$.index))
+  }
 
-    if (is.null(Y.pred)) Y.pred = Y
-    D = NCOL(Y)
-    I = NROW(Y)
-    I.pred = NROW(Y.pred)
+  if (is.null(Y.pred)) 
+    Y.pred = Y
+  D = NCOL(Y)
+  I = NROW(Y)
+  I.pred = NROW(Y.pred)
 
-    if (is.null(argvals)) argvals = seq(0, 1, length = D)
+  if (is.null(argvals)) 
+    argvals = seq(0, 1, length = D)
 
-    d.vec = rep(argvals, each = I)
-    id = rep(1:I, rep(D, I))
+  d.vec = rep(argvals, each = I)
+  id = rep(1:I, rep(D, I))
 
-    if (center) {
-        if (random.int){
-            ri_data <- data.frame(y = as.vector(Y), d.vec=d.vec, id=factor(id))
-            gam0 = gamm4(y ~ s(d.vec, k = nbasis), random=~(1|id), data = ri_data)$gam
-            rm(ri_data)
-        }
-        else gam0 = gam(as.vector(Y) ~ s(d.vec, k = nbasis))
-        mu = predict(gam0, newdata = data.frame(d.vec = argvals))
-        Y.tilde = Y - matrix(mu, I, D, byrow = TRUE)
-    }
-    else {
-        Y.tilde = Y
-        mu = rep(0, D)
-    }
+  if (center) {
+    if (random.int) {
+      ri_data <- data.frame(y = as.vector(Y), d.vec = d.vec, id = factor(id))
+      gam0 = gamm4(y ~ s(d.vec, k = nbasis), random = ~(1 | id), data = ri_data)$gam
+      rm(ri_data)
+    } else gam0 = gam(as.vector(Y) ~ s(d.vec, k = nbasis))
+    mu = predict(gam0, newdata = data.frame(d.vec = argvals))
+    Y.tilde = Y - matrix(mu, I, D, byrow = TRUE)
+  } else {
+    Y.tilde = Y
+    mu = rep(0, D)
+  }
 
-    if (cov.est.method==2) { # smooth raw covariance estimate
-        cov.sum = cov.count = cov.mean = matrix(0, D, D)
-        for (i in 1:I) {
-            obs.points = which(!is.na(Y[i, ]))
-            cov.count[obs.points, obs.points] = cov.count[obs.points, obs.points] + 1
-            cov.sum[obs.points, obs.points] = cov.sum[obs.points, obs.points] + tcrossprod(Y.tilde[i, obs.points])
-        }
-        G.0 = ifelse(cov.count == 0, NA, cov.sum/cov.count)
-        diag.G0 = diag(G.0)
-        diag(G.0) = NA
-        if (!useSymm) {
-            row.vec = rep(argvals, each = D)
-            col.vec = rep(argvals, D)
-            npc.0 = matrix(predict(gam(as.vector(G.0) ~ te(row.vec, col.vec, k = nbasis), weights =as.vector(cov.count)), newdata = data.frame(row.vec = row.vec,col.vec = col.vec)), D, D)
-            npc.0 = (npc.0 + t(npc.0))/2
-        }
-        else {
-            use <- upper.tri(G.0, diag = TRUE)
-            use[2, 1] <- use[ncol(G.0), ncol(G.0) - 1] <- TRUE
-            usecov.count <- cov.count
-            usecov.count[2, 1] <- usecov.count[ncol(G.0), ncol(G.0) - 1] <- 0
-            usecov.count <- as.vector(usecov.count)[use]
-            use <- as.vector(use)
-            vG.0 <- as.vector(G.0)[use]
-            row.vec <- rep(argvals, each = D)[use]
-            col.vec <- rep(argvals, times = D)[use]
-            mCov <- gam(vG.0 ~ te(row.vec, col.vec, k = nbasis), weights = usecov.count)
-            npc.0 <- matrix(NA, D, D)
-            spred <- rep(argvals, each = D)[upper.tri(npc.0, diag = TRUE)]
-            tpred <- rep(argvals, times = D)[upper.tri(npc.0, diag = TRUE)]
-            smVCov <- predict(mCov, newdata = data.frame(row.vec = spred, col.vec = tpred))
-            npc.0[upper.tri(npc.0, diag = TRUE)] <- smVCov
-            npc.0[lower.tri(npc.0)] <- t(npc.0)[lower.tri(npc.0)]
-        }
+  if (cov.est.method == 2) {
+    # smooth raw covariance estimate
+    cov.sum = cov.count = cov.mean = matrix(0, D, D)
+    for (i in 1:I) {
+      obs.points = which(!is.na(Y[i, ]))
+      cov.count[obs.points, obs.points] = cov.count[obs.points, obs.points] + 
+        1
+      cov.sum[obs.points, obs.points] = cov.sum[obs.points, obs.points] + tcrossprod(Y.tilde[i, 
+        obs.points])
+    }
+    G.0 = ifelse(cov.count == 0, NA, cov.sum/cov.count)
+    diag.G0 = diag(G.0)
+    diag(G.0) = NA
+    if (!useSymm) {
+      row.vec = rep(argvals, each = D)
+      col.vec = rep(argvals, D)
+      npc.0 = matrix(predict(gam(as.vector(G.0) ~ te(row.vec, col.vec, k = nbasis), 
+        weights = as.vector(cov.count)), newdata = data.frame(row.vec = row.vec, 
+        col.vec = col.vec)), D, D)
+      npc.0 = (npc.0 + t(npc.0))/2
+    } else {
+      use <- upper.tri(G.0, diag = TRUE)
+      use[2, 1] <- use[ncol(G.0), ncol(G.0) - 1] <- TRUE
+      usecov.count <- cov.count
+      usecov.count[2, 1] <- usecov.count[ncol(G.0), ncol(G.0) - 1] <- 0
+      usecov.count <- as.vector(usecov.count)[use]
+      use <- as.vector(use)
+      vG.0 <- as.vector(G.0)[use]
+      row.vec <- rep(argvals, each = D)[use]
+      col.vec <- rep(argvals, times = D)[use]
+      mCov <- gam(vG.0 ~ te(row.vec, col.vec, k = nbasis), weights = usecov.count)
+      npc.0 <- matrix(NA, D, D)
+      spred <- rep(argvals, each = D)[upper.tri(npc.0, diag = TRUE)]
+      tpred <- rep(argvals, times = D)[upper.tri(npc.0, diag = TRUE)]
+      smVCov <- predict(mCov, newdata = data.frame(row.vec = spred, col.vec = tpred))
+      npc.0[upper.tri(npc.0, diag = TRUE)] <- smVCov
+      npc.0[lower.tri(npc.0)] <- t(npc.0)[lower.tri(npc.0)]
+    }
-    }
-
-    else if (cov.est.method==1) { # smooth y(s1)y(s2) values to obtain covariance estimate
-        row.vec = col.vec = G.0.vec = c()
-        cov.sum = cov.count = cov.mean = matrix(0, D, D)
-        for (i in 1:I) {
-            obs.points = which(!is.na(Y[i, ]))
-            temp = tcrossprod(Y.tilde[i, obs.points])
-            diag(temp) = NA
-            row.vec = c(row.vec, rep(argvals[obs.points], each = length(obs.points)))
-            col.vec = c(col.vec, rep(argvals[obs.points], length(obs.points)))
-            G.0.vec = c(G.0.vec, as.vector(temp))
-            # still need G.O raw to calculate to get the raw to get the diagonal
-            cov.count[obs.points, obs.points] = cov.count[obs.points, obs.points] + 1
-            cov.sum[obs.points, obs.points] = cov.sum[obs.points, obs.points] + tcrossprod(Y.tilde[i, obs.points])
-        }
-        row.vec.pred = rep(argvals, each = D)
-        col.vec.pred = rep(argvals, D)
-        npc.0 = matrix(predict(gam(G.0.vec ~ te(row.vec, col.vec, k = nbasis)), newdata = data.frame(row.vec = row.vec.pred, col.vec = col.vec.pred)), D, D)
-        npc.0 = (npc.0 + t(npc.0))/2
-        G.0 = ifelse(cov.count == 0, NA, cov.sum/cov.count)
-        diag.G0 = diag(G.0)
-    }
-
+  } else if (cov.est.method == 1) {
+    # smooth y(s1)y(s2) values to obtain covariance estimate
+    row.vec = col.vec = G.0.vec = c()
+    cov.sum = cov.count = cov.mean = matrix(0, D, D)
+    for (i in 1:I) {
+      obs.points = which(!is.na(Y[i, ]))
+      temp = tcrossprod(Y.tilde[i, obs.points])
+      diag(temp) = NA
+      row.vec = c(row.vec, rep(argvals[obs.points], each = length(obs.points)))
+      col.vec = c(col.vec, rep(argvals[obs.points], length(obs.points)))
+      G.0.vec = c(G.0.vec, as.vector(temp))
+      # still need the raw covariance G.0 to get its diagonal
+      cov.count[obs.points, obs.points] = cov.count[obs.points, obs.points] + 
+        1
+      cov.sum[obs.points, obs.points] = cov.sum[obs.points, obs.points] + tcrossprod(Y.tilde[i, 
+        obs.points])
+    }
-    if (makePD) {
-        npc.0 <- {
-            tmp <- Matrix::nearPD(npc.0, corr = FALSE, keepDiag = FALSE, do2eigen = TRUE, trace = TRUE)
-            as.matrix(tmp$mat)
-        }
-    }
-    ### numerical integration for calculation of eigenvalues (see Ramsay & Silverman, Chapter 8)
-    w <- quadWeights(argvals, method=integration)
-    Wsqrt <- diag(sqrt(w))
-    Winvsqrt <- diag(1/(sqrt(w)))
-    V <- Wsqrt %*% npc.0 %*% Wsqrt
-    evalues = eigen(V, symmetric = TRUE, only.values = TRUE)$values
-    ###
-    evalues = replace(evalues, which(evalues <= 0), 0)
-    npc = ifelse(is.null(npc), min(which(cumsum(evalues)/sum(evalues) > pve)), npc)
-    efunctions = matrix(Winvsqrt%*%eigen(V, symmetric = TRUE)$vectors[, seq(len = npc)], nrow = D, ncol = npc)
-    evalues = eigen(V, symmetric = TRUE, only.values = TRUE)$values[1:npc] # use correct matrix for eigenvalue problem
-    cov.hat = efunctions %*% tcrossprod(diag(evalues, nrow = npc, ncol = npc), efunctions)
-    ### numerical integration for estimation of sigma2
-    T.len <- argvals[D] - argvals[1] # total interval length
-    T1.min <- min(which(argvals >= argvals[1] + 0.25*T.len)) # left bound of narrower interval T1
-    T1.max <- max(which(argvals <= argvals[D] - 0.25*T.len)) # right bound of narrower interval T1
-    DIAG = (diag.G0 - diag(cov.hat))[T1.min :T1.max] # function values
-    w2 <- quadWeights(argvals[T1.min:T1.max], method = integration)
-    sigma2 <- max(weighted.mean(DIAG, w=w2, na.rm = TRUE), 0)
+    row.vec.pred = rep(argvals, each = D)
+    col.vec.pred = rep(argvals, D)
+    npc.0 = matrix(predict(gam(G.0.vec ~ te(row.vec, col.vec, k = nbasis)), newdata = data.frame(row.vec = row.vec.pred, 
+      col.vec = col.vec.pred)), D, D)
+    npc.0 = (npc.0 + t(npc.0))/2
+    G.0 = ifelse(cov.count == 0, NA, cov.sum/cov.count)
+    diag.G0 = diag(G.0)
+  }
 
-    ####
-    D.inv = diag(1/evalues, nrow = npc, ncol = npc)
-    Z = efunctions
-    Y.tilde = Y.pred - matrix(mu, I.pred, D, byrow = TRUE)
-    Yhat = matrix(0, nrow = I.pred, ncol = D)
-    rownames(Yhat) = rownames(Y.pred); colnames(Yhat) = colnames(Y.pred)
-    scores = matrix(NA, nrow = I.pred, ncol = npc)
-    VarMats = vector("list", I.pred)
-    for (i in 1:I.pred) VarMats[[i]] = matrix(NA, nrow = D, ncol = D)
-    diag.var = matrix(NA, nrow = I.pred, ncol = D)
-    crit.val = rep(0, I.pred)
-    for (i.subj in 1:I.pred) {
-        obs.points = which(!is.na(Y.pred[i.subj, ]))
-        if (sigma2 == 0 & length(obs.points) < npc)
-            stop("Measurement error estimated to be zero and there are fewer observed points than PCs; scores cannot be estimated.")
-        Zcur = matrix(Z[obs.points, ], nrow = length(obs.points), ncol = dim(Z)[2])
-        ZtZ_sD.inv = solve(crossprod(Zcur) + sigma2 * D.inv)
-        scores[i.subj, ] = ZtZ_sD.inv %*% t(Zcur) %*% (Y.tilde[i.subj, obs.points])
-        Yhat[i.subj, ] = t(as.matrix(mu)) + scores[i.subj, ] %*% t(efunctions)
-        if (var) {
-            VarMats[[i.subj]] = sigma2 * Z %*% ZtZ_sD.inv %*% t(Z)
-            diag.var[i.subj, ] = diag(VarMats[[i.subj]])
-            if (simul & sigma2 != 0) {
-                norm.samp = mvrnorm(2500, mu = rep(0, D), Sigma = VarMats[[i.subj]])/matrix(sqrt(diag(VarMats[[i.subj]])), nrow = 2500, ncol = D, byrow = TRUE)
-                crit.val[i.subj] = quantile(apply(abs(norm.samp), 1, max), sim.alpha)
-            }
-        }
-    }
 
+  if (makePD) {
+    npc.0 <- {
+      tmp <- Matrix::nearPD(npc.0, corr = FALSE, keepDiag = FALSE, do2eigen = TRUE, 
+        trace = TRUE)
+      as.matrix(tmp$mat)
+    }
+  }
+  ### numerical integration for calculation of eigenvalues (see Ramsay & Silverman, 
+  ### Chapter 8)
+  w <- quadWeights(argvals, method = integration)
+  Wsqrt <- diag(sqrt(w))
+  Winvsqrt <- diag(1/(sqrt(w)))
+  V <- Wsqrt %*% npc.0 %*% Wsqrt
+  evalues = eigen(V, symmetric = TRUE, only.values = TRUE)$values
+  ###
+  evalues = replace(evalues, which(evalues <= 0), 0)
+  npc = ifelse(is.null(npc), min(which(cumsum(evalues)/sum(evalues) > pve)), npc)
+  efunctions = matrix(Winvsqrt %*% eigen(V, symmetric = TRUE)$vectors[, seq(len = npc)], 
+    nrow = D, ncol = npc)
+  evalues = eigen(V, symmetric = TRUE, only.values = TRUE)$values[1:npc]  # use correct matrix for eigenvalue problem
+  cov.hat = efunctions %*% tcrossprod(diag(evalues, nrow = npc, ncol = npc), efunctions)
+  ### numerical integration for estimation of sigma2
+  T.len <- argvals[D] - argvals[1]  # total interval length
+  T1.min <- min(which(argvals >= argvals[1] + 0.25 * T.len))  # left bound of narrower interval T1
+  T1.max <- max(which(argvals <= argvals[D] - 0.25 * T.len))  # right bound of narrower interval T1
+  DIAG = (diag.G0 - diag(cov.hat))[T1.min:T1.max]  # function values
+  w2 <- quadWeights(argvals[T1.min:T1.max], method = integration)
+  sigma2 <- max(weighted.mean(DIAG, w = w2, na.rm = TRUE), 0)
 
-    ret.objects = c("Yhat", "Y", "scores", "mu", "efunctions", "evalues", "npc", "argvals")
+  ####
+  D.inv = diag(1/evalues, nrow = npc, ncol = npc)
+  Z = efunctions
+  Y.tilde = Y.pred - matrix(mu, I.pred, D, byrow = TRUE)
+  Yhat = matrix(0, nrow = I.pred, ncol = D)
+  rownames(Yhat) = rownames(Y.pred)
+  colnames(Yhat) = colnames(Y.pred)
+  scores = matrix(NA, nrow = I.pred, ncol = npc)
+  VarMats = vector("list", I.pred)
+  for (i in 1:I.pred) VarMats[[i]] = matrix(NA, nrow = D, ncol = D)
+  diag.var = matrix(NA, nrow = I.pred, ncol = D)
+  crit.val = rep(0, I.pred)
+  for (i.subj in 1:I.pred) {
+    obs.points = which(!is.na(Y.pred[i.subj, ]))
+    if (sigma2 == 0 & length(obs.points) < npc) 
+      stop("Measurement error estimated to be zero and there are fewer observed points than PCs; scores cannot be estimated.")
+    Zcur = matrix(Z[obs.points, ], nrow = length(obs.points), ncol = dim(Z)[2])
+    ZtZ_sD.inv = solve(crossprod(Zcur) + sigma2 * D.inv)
+    scores[i.subj, ] = ZtZ_sD.inv %*% t(Zcur) %*% (Y.tilde[i.subj, obs.points])
+    Yhat[i.subj, ] = t(as.matrix(mu)) + scores[i.subj, ] %*% t(efunctions)
     if (var) {
-        ret.objects = c(ret.objects, "sigma2", "diag.var", "VarMats")
-        if (simul) ret.objects = c(ret.objects, "crit.val")
+      VarMats[[i.subj]] = sigma2 * Z %*% ZtZ_sD.inv %*% t(Z)
+      diag.var[i.subj, ] = diag(VarMats[[i.subj]])
+      if (simul & sigma2 != 0) {
+        norm.samp = mvrnorm(2500, mu = rep(0, D), Sigma = VarMats[[i.subj]])/matrix(sqrt(diag(VarMats[[i.subj]])), 
+          nrow = 2500, ncol = D, byrow = TRUE)
+        crit.val[i.subj] = quantile(apply(abs(norm.samp), 1, max), sim.alpha)
+      }
     }
-    ret = lapply(1:length(ret.objects), function(u) get(ret.objects[u]))
-    names(ret) = ret.objects
-    class(ret) = "fpca"
-    return(ret)
+  }
+
+  ret.objects = c("Yhat", "Y", "scores", "mu", "efunctions", "evalues", "npc", 
+    "argvals")
+  if (var) {
+    ret.objects = c(ret.objects, "sigma2", "diag.var", "VarMats")
+    if (simul) 
+      ret.objects = c(ret.objects, "crit.val")
+  }
+  ret = lapply(1:length(ret.objects), function(u) get(ret.objects[u]))
+  names(ret) = ret.objects
+  class(ret) = "fpca"
+  return(ret)
 }
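
Reviewer note, not part of the patch: the reformatted code calls the internal helper quadWeights(argvals, method = integration), which this diff does not touch. For reference, a minimal standalone sketch of the standard trapezoidal-rule weights such a helper would produce on a (possibly irregular) grid; the function name trap_weights is hypothetical and the actual refund implementation may differ.

    # trapezoidal weights: w_1 = (t_2 - t_1)/2, w_i = (t_{i+1} - t_{i-1})/2,
    # w_D = (t_D - t_{D-1})/2; requires at least 3 grid points
    trap_weights <- function(argvals) {
      D <- length(argvals)
      c(argvals[2] - argvals[1],
        argvals[3:D] - argvals[1:(D - 2)],
        argvals[D] - argvals[D - 1]) / 2
    }
    trap_weights(seq(0, 1, length = 5))  # 0.125 0.250 0.250 0.250 0.125, sums to 1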
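The Wsqrt %*% npc.0 %*% Wsqrt step implements the quadrature trick from Ramsay & Silverman (Chapter 8) referenced in the comments: the integral eigenproblem int K(s,t) phi(t) dt = lambda phi(s) discretizes to K W phi = lambda phi with W = diag(w), and multiplying through by W^{1/2} symmetrizes it to (W^{1/2} K W^{1/2}) u = lambda u with phi = W^{-1/2} u. A self-contained illustration on a covariance whose spectrum is known in closed form; all object names are local to this sketch, which reuses trap_weights from above.

    argvals <- seq(0, 1, length = 100)
    K <- outer(argvals, argvals, pmin)  # Brownian-motion covariance min(s, t)
    w <- trap_weights(argvals)
    Wsqrt <- diag(sqrt(w))
    Winvsqrt <- diag(1/sqrt(w))
    eig <- eigen(Wsqrt %*% K %*% Wsqrt, symmetric = TRUE)
    evalues <- eig$values                # estimates of the operator eigenvalues
    efuns <- Winvsqrt %*% eig$vectors    # eigenfunctions, orthonormal under the weights
    evalues[1:3]  # approx. 0.405, 0.045, 0.016, matching 4/((2k - 1)^2 * pi^2)

Solving the weighted problem rather than eigen(npc.0) directly is what makes the eigenvalues approximate those of the integral operator, which is what the 'use correct matrix for eigenvalue problem' comment alludes to.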
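The per-subject loop estimates scores by their conditional expectation given the observed points, in the spirit of the 'PACE' estimator of Yao et al. (2005): with Zcur holding the eigenfunctions at the observed points, scores = (Z'Z + sigma2 * Lambda^{-1})^{-1} Z'(y - mu), a ridge-type system that shrinks the scores of sparsely observed curves towards zero. A toy illustration on simulated data, reusing argvals, evalues, and efuns from the sketch above; all other names are local to the example.

    npc <- 2
    lam <- evalues[1:npc]
    phi <- efuns[, 1:npc]
    sigma2 <- 0.05
    set.seed(1)
    scores.true <- rnorm(npc, sd = sqrt(lam))
    obs.points <- sort(sample(length(argvals), 8))   # one sparsely observed curve
    y.tilde <- drop(phi[obs.points, ] %*% scores.true) +
      rnorm(length(obs.points), sd = sqrt(sigma2))   # demeaned noisy observations
    Zcur <- phi[obs.points, , drop = FALSE]
    D.inv <- diag(1/lam, nrow = npc)
    scores.hat <- solve(crossprod(Zcur) + sigma2 * D.inv, crossprod(Zcur, y.tilde))
    cbind(scores.true, scores.hat)  # estimates are shrunk towards 0, as expected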