diff --git a/DESCRIPTION b/DESCRIPTION index d261335..dee50fe 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -9,6 +9,7 @@ Maintainer: Owen S. Vallis , Jordan Hochenbaum Description: A technique for detecting anomalies in seasonal univariate time series. +ByteCompile: yes Imports: ggplot2, stringr, lubridate Depends: R (>= 2.10.0) Suggests: testthat diff --git a/R/detect_anoms.R b/R/detect_anoms.R index 0a550e1..bd52ba9 100644 --- a/R/detect_anoms.R +++ b/R/detect_anoms.R @@ -28,70 +28,65 @@ detect_anoms <- function(data, k = 0.49, alpha = 0.05, num_obs_per_period = NULL } # Check if our timestamps are posix - posix_timestamp <- if (class(data[[1]])[1] == "POSIXlt") TRUE else FALSE + posix_timestamp <- if (class(data[[1L]])[1L] == "POSIXlt") TRUE else FALSE # -- Step 1: Decompose data. This returns a univarite remainder which will be used for anomaly detection. Optionally, we might NOT decompose. - data_decomp <- stl(ts(data[[2]], frequency = num_obs_per_period), + data_decomp <- stl(ts(data[[2L]], frequency = num_obs_per_period), s.window = "periodic", robust = TRUE) - data <- data.frame(timestamp = data[[1]], count = (data[[2]]-data_decomp$time.series[,"seasonal"]-median(data[[2]]))) - data_decomp <- data.frame(timestamp=data[[1]], count=(as.numeric(trunc(data_decomp$time.series[,"trend"]+data_decomp$time.series[,"seasonal"])))) + data <- data.frame(timestamp = data[[1L]], count = (data[[2L]]-data_decomp$time.series[,"seasonal"]-median(data[[2L]]))) + data_decomp <- data.frame(timestamp=data[[1L]], count=(as.numeric(trunc(data_decomp$time.series[,"trend"]+data_decomp$time.series[,"seasonal"])))) if(posix_timestamp){ data_decomp <- format_timestamp(data_decomp) } - # Maximum number of outliers that S-H-ESD can detect (i.e. 49% of data) + # Maximum number of outliers that S-H-ESD can detect (e.g. 49% of data) max_outliers <- trunc(num_obs*k) - dataNAs <- sum(is.na(data[[2]])) + dataNAs <- sum(is.na(data[[2L]])) if (dataNAs > 0) { - if (any(is.na(data[[2]][-(1:dataNAs)]))) + if (any(is.na(data[[2L]][-(1L:dataNAs)]))) stop("Data contains non-leading NAs") else - data[[2]][1:dataNAs] <- 1 + data[[2L]][1L:dataNAs] <- 1 } func_ma <- match.fun(median) func_sigma <- match.fun(mad) ## Define values and vectors. - n <- length(data[[2]]) - lam <- c(1:max_outliers) - R <- c(1:max_outliers) + n <- length(data[[2L]]) if (posix_timestamp) { - R_idx <- as.POSIXlt(data[[1]][1:max_outliers], tz = "UTC") + R_idx <- as.POSIXlt(data[[1L]][1L:max_outliers], tz = "UTC") } else { - R_idx <- c(1:max_outliers) + R_idx <- 1L:max_outliers } - num_anoms <- 0 - + num_anoms <- 0L + # Compute test statistic until r=max_outliers values have been # removed from the sample. - for (i in 1:max_outliers){ + for (i in 1L:max_outliers){ if(verbose) print(paste(i,"/", max_outliers,"completed")) if(one_tail){ if(upper_tail){ - ares <- data[[2]] - func_ma(data[[2]]) + ares <- data[[2L]] - func_ma(data[[2L]]) } else { - ares <- func_ma(data[[2]]) - data[[2]] + ares <- func_ma(data[[2L]]) - data[[2L]] } } else { - ares = abs(data[[2]] - func_ma(data[[2]])) + ares = abs(data[[2L]] - func_ma(data[[2L]])) } - ares <- ares/func_sigma(data[[2]]) - R[i] <- max(ares) - - temp_max_idx <- which(ares == max(ares)) + ares <- ares/func_sigma(data[[2L]]) + R <- max(ares) - if(length(temp_max_idx) > 1) - temp_max_idx <- temp_max_idx[1] + temp_max_idx <- which(ares == R)[1L] - R_idx[i] <- data[[1]][temp_max_idx] + R_idx[i] <- data[[1L]][temp_max_idx] - data <- data[-which(data[[1]] == R_idx[i]), ] + data <- data[-which(data[[1L]] == R_idx[i]), ] ## Compute critical value. if(one_tail){ @@ -100,12 +95,12 @@ detect_anoms <- function(data, k = 0.49, alpha = 0.05, num_obs_per_period = NULL p <- 1 - alpha/(2*(n-i+1)) } - t <- qt(p,(n-i-1)) - lam[i] <- t*(n-i) / sqrt((n-i-1+t**2)*(n-i+1)) + t <- qt(p,(n-i-1L)) + lam <- t*(n-i) / sqrt((n-i-1+t**2)*(n-i+1)) - if(R[i] > lam[i]) + if(R > lam) num_anoms <- i } - - return(list(anoms = R_idx[1:num_anoms], stl = data_decomp)) + + return(list(anoms = R_idx[1L:num_anoms], stl = data_decomp)) } diff --git a/R/ts_anom_detection.R b/R/ts_anom_detection.R index b7c5ef4..faa8dcb 100644 --- a/R/ts_anom_detection.R +++ b/R/ts_anom_detection.R @@ -193,14 +193,14 @@ AnomalyDetectionTs <- function(x, max_anoms = 0.10, direction = 'pos', for(i in 1:length(all_data)) { anomaly_direction = switch(direction, - "pos" = data.frame(one_tail=T, upper_tail=T), # upper-tail only (positive going anomalies) - "neg" = data.frame(one_tail=T, upper_tail=F), # lower-tail only (negative going anomalies) - "both" = data.frame(one_tail=F, upper_tail=T)) # Both tails. Tail direction is not actually used. + "pos" = data.frame(one_tail=TRUE, upper_tail=TRUE), # upper-tail only (positive going anomalies) + "neg" = data.frame(one_tail=TRUE, upper_tail=FALSE), # lower-tail only (negative going anomalies) + "both" = data.frame(one_tail=FALSE, upper_tail=TRUE)) # Both tails. Tail direction is not actually used. # detect_anoms actually performs the anomaly detection and returns the results in a list containing the anomalies # as well as the decomposed components of the time series for further analysis. - s_h_esd_timestamps <- detect_anoms(all_data[[i]], k=max_anoms, alpha=alpha, num_obs_per_period=period, use_decomp=T, use_esd=F, - one_tail=anomaly_direction$one_tail, upper_tail=anomaly_direction$upper_tail, verbose=F) + s_h_esd_timestamps <- detect_anoms(all_data[[i]], k=max_anoms, alpha=alpha, num_obs_per_period=period, use_decomp=TRUE, use_esd=FALSE, + one_tail=anomaly_direction$one_tail, upper_tail=anomaly_direction$upper_tail, verbose=FALSE) # store decomposed components in local variable and overwrite s_h_esd_timestamps to contain only the anom timestamps data_decomp <- s_h_esd_timestamps$stl diff --git a/R/vec_anom_detection.R b/R/vec_anom_detection.R index 812d738..9ad9edf 100644 --- a/R/vec_anom_detection.R +++ b/R/vec_anom_detection.R @@ -148,14 +148,14 @@ AnomalyDetectionVec = function(x, max_anoms=0.10, direction='pos', alpha=0.05, p for(i in 1:length(all_data)) { anomaly_direction = switch(direction, - "pos" = data.frame(one_tail=T, upper_tail=T), # upper-tail only (positive going anomalies) - "neg" = data.frame(one_tail=T, upper_tail=F), # lower-tail only (negative going anomalies) - "both" = data.frame(one_tail=F, upper_tail=T)) # Both tails. Tail direction is not actually used. + "pos" = data.frame(one_tail=TRUE, upper_tail=TRUE), # upper-tail only (positive going anomalies) + "neg" = data.frame(one_tail=TRUE, upper_tail=FALSE), # lower-tail only (negative going anomalies) + "both" = data.frame(one_tail=FALSE, upper_tail=TRUE)) # Both tails. Tail direction is not actually used. # detect_anoms actually performs the anomaly detection and returns the results in a list containing the anomalies # as well as the decomposed components of the time series for further analysis. - s_h_esd_timestamps <- detect_anoms(all_data[[i]], k=max_anoms, alpha=alpha, num_obs_per_period=period, use_decomp=T, use_esd=F, - one_tail=anomaly_direction$one_tail, upper_tail=anomaly_direction$upper_tail, verbose=F) + s_h_esd_timestamps <- detect_anoms(all_data[[i]], k=max_anoms, alpha=alpha, num_obs_per_period=period, use_decomp=TRUE, use_esd=FALSE, + one_tail=anomaly_direction$one_tail, upper_tail=anomaly_direction$upper_tail, verbose=FALSE) # store decomposed components in local variable and overwrite s_h_esd_timestamps to contain only the anom timestamps data_decomp <- s_h_esd_timestamps$stl