From d37a7e3286f0ae59a05c0bb215663a586d673273 Mon Sep 17 00:00:00 2001 From: owen vallis Date: Sun, 25 Jan 2015 14:59:06 -0800 Subject: [PATCH 1/5] added comments clarifying what seasonal decomp returns/is used for --- R/detect_anoms.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/R/detect_anoms.R b/R/detect_anoms.R index bd52ba9..7e02764 100644 --- a/R/detect_anoms.R +++ b/R/detect_anoms.R @@ -34,7 +34,10 @@ detect_anoms <- function(data, k = 0.49, alpha = 0.05, num_obs_per_period = NULL data_decomp <- stl(ts(data[[2L]], frequency = num_obs_per_period), s.window = "periodic", robust = TRUE) + # Remove the seasonal component, and the median of the data to create the univariate remainder data <- data.frame(timestamp = data[[1L]], count = (data[[2L]]-data_decomp$time.series[,"seasonal"]-median(data[[2L]]))) + + # Store the smoothed seasonal component, plus the trend component for use in determining the "expected values" option data_decomp <- data.frame(timestamp=data[[1L]], count=(as.numeric(trunc(data_decomp$time.series[,"trend"]+data_decomp$time.series[,"seasonal"])))) if(posix_timestamp){ From 61c7686dfd8b5e163b8d5bf301eb65775ad0dcaf Mon Sep 17 00:00:00 2001 From: owen vallis Date: Sun, 25 Jan 2015 14:59:58 -0800 Subject: [PATCH 2/5] leading NAs are replaced with 0s now instead of 1s. 
Addresses issue #19 --- R/detect_anoms.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/detect_anoms.R b/R/detect_anoms.R index 7e02764..9d7786e 100644 --- a/R/detect_anoms.R +++ b/R/detect_anoms.R @@ -51,7 +51,7 @@ detect_anoms <- function(data, k = 0.49, alpha = 0.05, num_obs_per_period = NULL if (any(is.na(data[[2L]][-(1L:dataNAs)]))) stop("Data contains non-leading NAs") else - data[[2L]][1L:dataNAs] <- 1 + data[[2L]][1L:dataNAs] <- 0 } func_ma <- match.fun(median) From 422df9ae96b2c511ce92df1c80b452848718b1c4 Mon Sep 17 00:00:00 2001 From: Arwin Tio Date: Mon, 26 Jan 2015 03:23:39 -0800 Subject: [PATCH 3/5] Issue 20: Issues using daily data with the "longterm" option 1. Added a parameter (piecewise_median_period_weeks) that specifies the piecewise median time window, which was previously fixed at 2 weeks. 2. Added an error message if max_outliers = 0 in a time window, prompting the user to choose a higher piecewise_median_period_weeks. 3. Made the time period passed to detect_anoms 2*num_obs_per_period + 1 instead of 2*num_obs_per_period. --- R/detect_anoms.R | 26 ++++--- R/ts_anom_detection.R | 156 ++++++++++++++++++++++-------------------- 2 files changed, 97 insertions(+), 85 deletions(-) diff --git a/R/detect_anoms.R b/R/detect_anoms.R index bd52ba9..833ae8e 100644 --- a/R/detect_anoms.R +++ b/R/detect_anoms.R @@ -1,5 +1,5 @@ -detect_anoms <- function(data, k = 0.49, alpha = 0.05, num_obs_per_period = NULL, - use_decomp = TRUE, use_esd = FALSE, one_tail = TRUE, +detect_anoms <- function(data, k = 0.49, alpha = 0.05, num_obs_per_period = NULL, + use_decomp = TRUE, use_esd = FALSE, one_tail = TRUE, upper_tail = TRUE, verbose = FALSE) { # Detects anomalies in a time series using S-H-ESD. # @@ -15,11 +15,11 @@ detect_anoms <- function(data, k = 0.49, alpha = 0.05, num_obs_per_period = NULL # verbose: Additionally printing for debugging. # Returns: # A list containing the anomalies (anoms) and decomposition components (stl). 
- + if(is.null(num_obs_per_period)) { stop("must supply period length for time series decomposition") } - + num_obs <- nrow(data) # Check to make sure we have at least two periods worth of data for anomaly context @@ -31,21 +31,25 @@ detect_anoms <- function(data, k = 0.49, alpha = 0.05, num_obs_per_period = NULL posix_timestamp <- if (class(data[[1L]])[1L] == "POSIXlt") TRUE else FALSE # -- Step 1: Decompose data. This returns a univarite remainder which will be used for anomaly detection. Optionally, we might NOT decompose. - data_decomp <- stl(ts(data[[2L]], frequency = num_obs_per_period), + data_decomp <- stl(ts(data[[2L]], frequency = num_obs_per_period), s.window = "periodic", robust = TRUE) - + data <- data.frame(timestamp = data[[1L]], count = (data[[2L]]-data_decomp$time.series[,"seasonal"]-median(data[[2L]]))) data_decomp <- data.frame(timestamp=data[[1L]], count=(as.numeric(trunc(data_decomp$time.series[,"trend"]+data_decomp$time.series[,"seasonal"])))) - + if(posix_timestamp){ data_decomp <- format_timestamp(data_decomp) } # Maximum number of outliers that S-H-ESD can detect (e.g. 49% of data) max_outliers <- trunc(num_obs*k) + if(max_outliers == 0)){ + stop(paste0("With longterm=TRUE, AnomalyDetection splits the data into 2 week periods by default. You have ", num_obs, " observations in a period, which is too few. Set a higher piecewise_median_period_weeks.")) + } + dataNAs <- sum(is.na(data[[2L]])) if (dataNAs > 0) { - if (any(is.na(data[[2L]][-(1L:dataNAs)]))) + if (any(is.na(data[[2L]][-(1L:dataNAs)]))) stop("Data contains non-leading NAs") else data[[2L]][1L:dataNAs] <- 1 @@ -63,7 +67,7 @@ detect_anoms <- function(data, k = 0.49, alpha = 0.05, num_obs_per_period = NULL } num_anoms <- 0L - + # Compute test statistic until r=max_outliers values have been # removed from the sample. 
for (i in 1L:max_outliers){ @@ -86,7 +90,7 @@ detect_anoms <- function(data, k = 0.49, alpha = 0.05, num_obs_per_period = NULL R_idx[i] <- data[[1L]][temp_max_idx] - data <- data[-which(data[[1L]] == R_idx[i]), ] + data <- data[-which(data[[1L]] == R_idx[i]), ] ## Compute critical value. if(one_tail){ @@ -101,6 +105,6 @@ detect_anoms <- function(data, k = 0.49, alpha = 0.05, num_obs_per_period = NULL if(R > lam) num_anoms <- i } - + return(list(anoms = R_idx[1L:num_anoms], stl = data_decomp)) } diff --git a/R/ts_anom_detection.R b/R/ts_anom_detection.R index faa8dcb..c1734b7 100644 --- a/R/ts_anom_detection.R +++ b/R/ts_anom_detection.R @@ -1,22 +1,24 @@ -#' Anomaly Detection Using Seasonal Hybrid ESD Test +#' Anomaly Detection Using Seasonal Hybrid ESD Test #' -#' A technique for detecting anomalies in seasonal univariate time series where the input is a +#' A technique for detecting anomalies in seasonal univariate time series where the input is a #' series of pairs. #' @name AnomalyDetectionTs -#' @param x Time series as a two column data frame where the first column consists of the +#' @param x Time series as a two column data frame where the first column consists of the #' timestamps and the second column consists of the observations. #' @param max_anoms Maximum number of anomalies that S-H-ESD will detect as a percentage of the #' data. -#' @param direction Directionality of the anomalies to be detected. Options are: +#' @param direction Directionality of the anomalies to be detected. Options are: #' \code{'pos' | 'neg' | 'both'}. -#' @param alpha The level of statistical significance with which to accept or reject anomalies. +#' @param alpha The level of statistical significance with which to accept or reject anomalies. #' @param only_last Find and report anomalies only within the last day or hr in the time series. #' \code{NULL | 'day' | 'hr'}. -#' @param threshold Only report positive going anoms above the threshold specified. 
Options are: +#' @param threshold Only report positive going anoms above the threshold specified. Options are: #' \code{'None' | 'med_max' | 'p95' | 'p99'}. #' @param e_value Add an additional column to the anoms output containing the expected value. #' @param longterm Increase anom detection efficacy for time series that are greater than a month. #' See Details below. +#' @param piecewise_median_period_weeks The piecewise median time window as described in Vallis, Hochenbaum, and Kejariwal (2014). +#' Defaults to 2. #' @param plot A flag indicating if a plot with both the time series and the estimated anoms, #' indicated by circles, should also be returned. #' @param y_log Apply log scaling to the y-axis. This helps with viewing plots that have extremely @@ -26,20 +28,20 @@ #' @details #' \code{longterm} This option should be set when the input time series is longer than a month. #' The option enables the approach described in Vallis, Hochenbaum, and Kejariwal (2014).\cr\cr -#' \code{threshold} Filter all negative anomalies and those anomalies whose magnitude is smaller -#' than one of the specified thresholds which include: the median -#' of the daily max values (med_max), the 95th percentile of the daily max values (p95), and the +#' \code{threshold} Filter all negative anomalies and those anomalies whose magnitude is smaller +#' than one of the specified thresholds which include: the median +#' of the daily max values (med_max), the 95th percentile of the daily max values (p95), and the #' 99th percentile of the daily max values (p99). #' @param title Title for the output plot. #' @return The returned value is a list with the following components. #' @return \item{anoms}{Data frame containing timestamps, values, and optionally expected values.} #' @return \item{plot}{A graphical object if plotting was requested by the user. 
The plot contains #' the estimated anomalies annotated on the input time series.} -#' @return One can save \code{anoms} to a file in the following fashion: +#' @return One can save \code{anoms} to a file in the following fashion: #' \code{write.csv([["anoms"]], file=)} -#' @return One can save \code{plot} to a file in the following fashion: +#' @return One can save \code{plot} to a file in the following fashion: #' \code{ggsave(, plot=[["plot"]])} -#' @references Vallis, O., Hochenbaum, J. and Kejariwal, A., (2014) "A Novel Technique for +#' @references Vallis, O., Hochenbaum, J. and Kejariwal, A., (2014) "A Novel Technique for #' Long-Term Anomaly Detection in the Cloud", 6th USENIX, Philadelphia, PA. #' @references Rosner, B., (May 1983), "Percentage Points for a Generalized ESD Many-Outlier Procedure" #' , Technometrics, 25(2), pp. 165-172. @@ -47,7 +49,7 @@ #' @docType data #' @keywords datasets #' @name raw_data -#' +#' #' @examples #' data(raw_data) #' AnomalyDetectionTs(raw_data, max_anoms=0.02, direction='both', plot=TRUE) @@ -55,13 +57,13 @@ #' AnomalyDetectionTs(raw_data, max_anoms=0.02, direction='both', only_last="day", plot=TRUE) #' @seealso \code{\link{AnomalyDetectionVec}} #' @export -#' -AnomalyDetectionTs <- function(x, max_anoms = 0.10, direction = 'pos', - alpha = 0.05, only_last = NULL, threshold = 'None', - e_value = FALSE, longterm = FALSE, plot = FALSE, - y_log = FALSE, xlabel = '', ylabel = 'count', +#' +AnomalyDetectionTs <- function(x, max_anoms = 0.10, direction = 'pos', + alpha = 0.05, only_last = NULL, threshold = 'None', + e_value = FALSE, longterm = FALSE, piecewise_median_period_weeks = 2, plot = FALSE, + y_log = FALSE, xlabel = '', ylabel = 'count', title = NULL){ - + # Check for supported inputs types if(!is.data.frame(x)){ stop("data must be a single data frame.") @@ -78,7 +80,7 @@ AnomalyDetectionTs <- function(x, max_anoms = 0.10, direction = 'pos', if (any((names(x) == c("timestamp", "count")) == FALSE)) { colnames(x) <- 
c("timestamp", "count") } - + # Sanity check all input parameters if(max_anoms > .49){ stop(paste("max_anoms must be less than 50% of the data points (max_anoms =", round(max_anoms*length(x[[2]]), 0), " data_points =", length(x[[2]]),").")) @@ -93,7 +95,7 @@ AnomalyDetectionTs <- function(x, max_anoms = 0.10, direction = 'pos', stop("only_last must be either 'day' or 'hr'") } if(!threshold %in% c('None','med_max','p95','p99')){ - stop("threshold options are: None | med_max | p95 | p99.") + stop("threshold options are: None | med_max | p95 | p99.") } if(!is.logical(e_value)){ stop("e_value must be either TRUE (T) or FALSE (F)") @@ -101,6 +103,9 @@ AnomalyDetectionTs <- function(x, max_anoms = 0.10, direction = 'pos', if(!is.logical(longterm)){ stop("longterm must be either TRUE (T) or FALSE (F)") } + if(piecewise_median_period_weeks < 2){ + stop("piecewise_median_period_weeks must be at greater than 2 weeks") + } if(!is.logical(plot)){ stop("plot must be either TRUE (T) or FALSE (F)") } @@ -121,13 +126,13 @@ AnomalyDetectionTs <- function(x, max_anoms = 0.10, direction = 'pos', } else { title <- paste(title, " : ", sep="") } - + # -- Main analysis: Perform S-H-ESD - + # Derive number of observations in a single day. # Although we derive this in S-H-ESD, we also need it to be minutley later on so we do it here first. 
gran <- get_gran(x, 1) - + if(gran == "day"){ num_days_per_line <- 7 if(is.character(only_last) && only_last == 'hr'){ @@ -136,88 +141,91 @@ AnomalyDetectionTs <- function(x, max_anoms = 0.10, direction = 'pos', } else { num_days_per_line <- 1 } - + # Aggregate data to minutely if secondly if(gran == "sec"){ x <- format_timestamp(aggregate(x[2], format(x[1], "%Y-%m-%d %H:%M:00"), eval(parse(text="sum")))) } - + period = switch(gran, min = 1440, hr = 24, # if the data is daily, then we need to bump the period to weekly to get multiple examples day = 7) num_obs <- length(x[[2]]) - + if(max_anoms < 1/num_obs){ max_anoms <- 1/num_obs } - + # -- Setup for longterm time series - + # If longterm is enabled, break the data into subset data frames and store in all_data if(longterm){ - # Pre-allocate list with size equal to the number of two week chunks in x + any left over chunk + # Pre-allocate list with size equal to the number of piecewise_median_period_weeks chunks in x + any left over chunk # handle edge cases for daily and single column data period lengths if(gran == "day"){ - num_obs_two_week <- period*2 + # STL needs 2*period + 1 observations + num_obs_in_period <- period*piecewise_median_period_weeks + 1 + num_days_in_period <- (7*piecewise_median_period_weeks) + 1 } else { - num_obs_two_week <- period*14 + num_obs_in_period <- period*7*piecewise_median_period_weeks + num_days_in_period <- (7*piecewise_median_period_weeks) } - + # Store last date in time series - last_date <- x[[1]][num_obs] - - all_data <- vector(mode="list", length=ceiling(length(x[[1]])/(num_obs_two_week))) - # Subset x into two week chunks - for(j in seq(1,length(x[[1]]), by=num_obs_two_week)){ + last_date <- x[[1]][num_obs] + + all_data <- vector(mode="list", length=ceiling(length(x[[1]])/(num_obs_in_period))) + # Subset x into piecewise_median_period_weeks chunks + for(j in seq(1,length(x[[1]]), by=num_obs_in_period)){ start_date <- x[[1]][j] - end_date <- min(start_date + 
lubridate::weeks(2), x[[1]][length(x[[1]])]) + end_date <- min(start_date + lubridate::days(num_days_in_period), x[[1]][length(x[[1]])]) # if there is at least 14 days left, subset it, otherwise subset last_date - 14days - if(difftime(end_date, start_date, units = "days") == as.difftime(14, units="days")){ - all_data[[ceiling(j/(num_obs_two_week))]] <- subset(x, x[[1]] >= start_date & x[[1]] < end_date) + if(difftime(end_date, start_date, units = "days") == as.difftime(num_days_in_period, units="days")){ + all_data[[ceiling(j/(num_obs_in_period))]] <- subset(x, x[[1]] >= start_date & x[[1]] < end_date) }else{ - all_data[[ceiling(j/(num_obs_two_week))]] <- subset(x, x[[1]] > (last_date-lubridate::weeks(2)) & x[[1]] <= last_date) + all_data[[ceiling(j/(num_obs_in_period))]] <- subset(x, x[[1]] > (last_date-lubridate::days(num_days_in_period)) & x[[1]] <= last_date) } } }else{ # If longterm is not enabled, then just overwrite all_data list with x as the only item all_data <- list(x) } - + # Create empty data frames to store all anoms and seasonal+trend component from decomposition all_anoms <- data.frame(timestamp=numeric(0), count=numeric(0)) seasonal_plus_trend <- data.frame(timestamp=numeric(0), count=numeric(0)) - + # Detect anomalies on all data (either entire data in one-pass, or in 2 week blocks if longterm=TRUE) for(i in 1:length(all_data)) { - + anomaly_direction = switch(direction, "pos" = data.frame(one_tail=TRUE, upper_tail=TRUE), # upper-tail only (positive going anomalies) "neg" = data.frame(one_tail=TRUE, upper_tail=FALSE), # lower-tail only (negative going anomalies) "both" = data.frame(one_tail=FALSE, upper_tail=TRUE)) # Both tails. Tail direction is not actually used. - + # detect_anoms actually performs the anomaly detection and returns the results in a list containing the anomalies # as well as the decomposed components of the time series for further analysis. 
- s_h_esd_timestamps <- detect_anoms(all_data[[i]], k=max_anoms, alpha=alpha, num_obs_per_period=period, use_decomp=TRUE, use_esd=FALSE, - one_tail=anomaly_direction$one_tail, upper_tail=anomaly_direction$upper_tail, verbose=FALSE) - - # store decomposed components in local variable and overwrite s_h_esd_timestamps to contain only the anom timestamps + s_h_esd_timestamps <- detect_anoms(all_data[[i]], k=max_anoms, alpha=alpha, num_obs_per_period=period, use_decomp=TRUE, use_esd=FALSE, + one_tail=anomaly_direction$one_tail, upper_tail=anomaly_direction$upper_tail, verbose=FALSE) + + # store decomposed components in local variable and overwrite s_h_esd_timestamps to contain only the anom timestamps data_decomp <- s_h_esd_timestamps$stl s_h_esd_timestamps <- s_h_esd_timestamps$anoms - + # -- Step 3: Use detected anomaly timestamps to extract the actual anomalies (timestamp and value) from the data if(length(s_h_esd_timestamps) > 0){ anoms <- subset(all_data[[i]], (all_data[[i]][[1]] %in% s_h_esd_timestamps)) } else { anoms <- data.frame(timestamp=numeric(0), count=numeric(0)) } - + # Filter the anomalies using one of the thresholding functions if applicable if(threshold != "None"){ # Calculate daily max values - periodic_maxs <- tapply(x[[2]],as.Date(x[[1]]),FUN=max) - + periodic_maxs <- tapply(x[[2]],as.Date(x[[1]]),FUN=max) + # Calculate the threshold set by the user if(threshold == 'med_max'){ thresh <- median(periodic_maxs) @@ -232,11 +240,11 @@ AnomalyDetectionTs <- function(x, max_anoms = 0.10, direction = 'pos', all_anoms <- rbind(all_anoms, anoms) seasonal_plus_trend <- rbind(seasonal_plus_trend, data_decomp) } - + # Cleanup potential duplicates all_anoms <- all_anoms[!duplicated(all_anoms[[1]]), ] seasonal_plus_trend <- seasonal_plus_trend[!duplicated(seasonal_plus_trend[[1]]), ] - + # -- If only_last was set by the user, create subset of the data that represent the most recent day if(!is.null(only_last)){ start_date <- x[[1]][num_obs]-lubridate::days(7) @@ 
-253,9 +261,9 @@ AnomalyDetectionTs <- function(x, max_anoms = 0.10, direction = 'pos', start_date <- lubridate::floor_date(x[[1]][num_obs]-lubridate::days(2), "day") start_anoms <- x[[1]][num_obs]-lubridate::hours(1) breaks <- 3 - } + } } - + # subset the last days worth of data x_subset_single_day <- subset(x, (x[[1]] > start_anoms)) # When plotting anoms for the last day only we only show the previous weeks data @@ -263,62 +271,62 @@ AnomalyDetectionTs <- function(x, max_anoms = 0.10, direction = 'pos', all_anoms <- subset(all_anoms, all_anoms[[1]] >= x_subset_single_day[[1]][1]) num_obs <- length(x_subset_single_day[[2]]) } - + # Calculate number of anomalies as a percentage anom_pct <- (length(all_anoms[[2]]) / num_obs) * 100 - + # If there are no anoms, then let's exit if(anom_pct == 0){ print("No anomalies detected.") return (list("anoms"=NULL, "plot"=NULL)) } - + if(plot){ # -- Build title for plots utilizing parameters set by user plot_title <- paste(title, round(anom_pct, digits=2), "% Anomalies (alpha=", alpha, ", direction=", direction,")", sep="") if(longterm){ plot_title <- paste(plot_title, ", longterm=T", sep="") } - + # -- Plot raw time series data color_name <- paste("\"", title, "\"", sep="") alpha <- 0.8 if(!is.null(only_last)){ xgraph <- ggplot2::ggplot(x_subset_week, ggplot2::aes_string(x="timestamp", y="count")) + ggplot2::theme_bw() + ggplot2::theme(panel.grid.major = ggplot2::element_blank(), panel.grid.minor = ggplot2::element_blank(), text=ggplot2::element_text(size = 14)) - xgraph <- xgraph + ggplot2::geom_line(data=x_subset_week, ggplot2::aes_string(colour=color_name), alpha=alpha*.33) + ggplot2::geom_line(data=x_subset_single_day, ggplot2::aes_string(color=color_name), alpha=alpha) + xgraph <- xgraph + ggplot2::geom_line(data=x_subset_week, ggplot2::aes_string(colour=color_name), alpha=alpha*.33) + ggplot2::geom_line(data=x_subset_single_day, ggplot2::aes_string(color=color_name), alpha=alpha) week_rng = get_range(x_subset_week, 
index=2, y_log=y_log) day_rng = get_range(x_subset_single_day, index=2, y_log=y_log) yrange = c(min(week_rng[1],day_rng[1]), max(week_rng[2],day_rng[2])) xgraph <- add_day_labels_datetime(xgraph, breaks=breaks, start=as.POSIXlt(min(x_subset_week[[1]]), tz="UTC"), end=as.POSIXlt(max(x_subset_single_day[[1]]), tz="UTC"), days_per_line=num_days_per_line) - xgraph <- xgraph + ggplot2::labs(x=xlabel, y=ylabel, title=plot_title) + xgraph <- xgraph + ggplot2::labs(x=xlabel, y=ylabel, title=plot_title) }else{ xgraph <- ggplot2::ggplot(x, ggplot2::aes_string(x="timestamp", y="count")) + ggplot2::theme_bw() + ggplot2::theme(panel.grid.major = ggplot2::element_line(colour = "gray60"), panel.grid.major.y = ggplot2::element_blank(), panel.grid.minor = ggplot2::element_blank(), text=ggplot2::element_text(size = 14)) xgraph <- xgraph + ggplot2::geom_line(data=x, ggplot2::aes_string(colour=color_name), alpha=alpha) yrange <- get_range(x, index=2, y_log=y_log) - xgraph <- xgraph + ggplot2::scale_x_datetime(labels=function(x) ifelse(as.POSIXlt(x, tz="UTC")$hour != 0,strftime(x, format="%kh", tz="UTC"), strftime(x, format="%b %e", tz="UTC")), + xgraph <- xgraph + ggplot2::scale_x_datetime(labels=function(x) ifelse(as.POSIXlt(x, tz="UTC")$hour != 0,strftime(x, format="%kh", tz="UTC"), strftime(x, format="%b %e", tz="UTC")), expand=c(0,0)) xgraph <- xgraph + ggplot2::labs(x=xlabel, y=ylabel, title=plot_title) } - + # Add anoms to the plot as circles. # We add zzz_ to the start of the name to ensure that the anoms are listed after the data sets. 
- xgraph <- xgraph + ggplot2::geom_point(data=all_anoms, ggplot2::aes_string(color=paste("\"zzz_",title,"\"",sep="")), size = 3, shape = 1) - + xgraph <- xgraph + ggplot2::geom_point(data=all_anoms, ggplot2::aes_string(color=paste("\"zzz_",title,"\"",sep="")), size = 3, shape = 1) + # Hide legend - xgraph <- xgraph + ggplot2::theme(legend.position="none") - + xgraph <- xgraph + ggplot2::theme(legend.position="none") + # Use log scaling if set by user xgraph <- xgraph + add_formatted_y(yrange, y_log=y_log) - + } - + # Store expected values if set by user if(e_value) { - anoms <- data.frame(timestamp=all_anoms[[1]], anoms=all_anoms[[2]], expected_value=subset(seasonal_plus_trend[[2]], as.POSIXlt(seasonal_plus_trend[[1]], tz="UTC") %in% all_anoms[[1]])) + anoms <- data.frame(timestamp=all_anoms[[1]], anoms=all_anoms[[2]], expected_value=subset(seasonal_plus_trend[[2]], as.POSIXlt(seasonal_plus_trend[[1]], tz="UTC") %in% all_anoms[[1]])) } else { anoms <- data.frame(timestamp=all_anoms[[1]], anoms=all_anoms[[2]]) } - + # Lastly, return anoms and optionally the plot if requested by the user if(plot){ return (list(anoms = anoms, plot = xgraph)) From 60f02c7111cbf8a323ad54b6849882cdb645401d Mon Sep 17 00:00:00 2001 From: owen vallis Date: Wed, 4 Mar 2015 14:31:37 -0800 Subject: [PATCH 4/5] Rebuilt docs and removed extra parenthesis in detect_anoms.R from pull request in Issue #20 --- NAMESPACE | 2 +- R/detect_anoms.R | 2 +- man/AnomalyDetectionTs.Rd | 11 ++++++++--- man/AnomalyDetectionVec.Rd | 3 ++- man/raw_data.Rd | 3 ++- 5 files changed, 14 insertions(+), 7 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 3f1e850..b6fa8d9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,4 +1,4 @@ -# Generated by roxygen2 (4.0.2): do not edit by hand +# Generated by roxygen2 (4.1.0): do not edit by hand export(AnomalyDetectionTs) export(AnomalyDetectionVec) diff --git a/R/detect_anoms.R b/R/detect_anoms.R index 833ae8e..e86911a 100644 --- a/R/detect_anoms.R +++ b/R/detect_anoms.R @@ 
-43,7 +43,7 @@ detect_anoms <- function(data, k = 0.49, alpha = 0.05, num_obs_per_period = NULL # Maximum number of outliers that S-H-ESD can detect (e.g. 49% of data) max_outliers <- trunc(num_obs*k) - if(max_outliers == 0)){ + if(max_outliers == 0){ stop(paste0("With longterm=TRUE, AnomalyDetection splits the data into 2 week periods by default. You have ", num_obs, " observations in a period, which is too few. Set a higher piecewise_median_period_weeks.")) } diff --git a/man/AnomalyDetectionTs.Rd b/man/AnomalyDetectionTs.Rd index 514d238..11cfee3 100644 --- a/man/AnomalyDetectionTs.Rd +++ b/man/AnomalyDetectionTs.Rd @@ -1,12 +1,14 @@ -% Generated by roxygen2 (4.0.2): do not edit by hand +% Generated by roxygen2 (4.1.0): do not edit by hand +% Please edit documentation in R/ts_anom_detection.R \docType{data} \name{AnomalyDetectionTs} \alias{AnomalyDetectionTs} \title{Anomaly Detection Using Seasonal Hybrid ESD Test} \usage{ AnomalyDetectionTs(x, max_anoms = 0.1, direction = "pos", alpha = 0.05, - only_last = NULL, threshold = "None", e_value = F, longterm = F, - plot = F, y_log = F, xlabel = "", ylabel = "count", title = NULL) + only_last = NULL, threshold = "None", e_value = FALSE, + longterm = FALSE, piecewise_median_period_weeks = 2, plot = FALSE, + y_log = FALSE, xlabel = "", ylabel = "count", title = NULL) } \arguments{ \item{x}{Time series as a two column data frame where the first column consists of the @@ -31,6 +33,9 @@ data.} \item{longterm}{Increase anom detection efficacy for time series that are greater than a month. See Details below.} +\item{piecewise_median_period_weeks}{The piecewise median time window as described in Vallis, Hochenbaum, and Kejariwal (2014). 
+Defaults to 2.} + \item{plot}{A flag indicating if a plot with both the time series and the estimated anoms, indicated by circles, should also be returned.} diff --git a/man/AnomalyDetectionVec.Rd b/man/AnomalyDetectionVec.Rd index 9aa77b4..d95f10d 100644 --- a/man/AnomalyDetectionVec.Rd +++ b/man/AnomalyDetectionVec.Rd @@ -1,4 +1,5 @@ -% Generated by roxygen2 (4.0.2): do not edit by hand +% Generated by roxygen2 (4.1.0): do not edit by hand +% Please edit documentation in R/vec_anom_detection.R \docType{data} \name{AnomalyDetectionVec} \alias{AnomalyDetectionVec} diff --git a/man/raw_data.Rd b/man/raw_data.Rd index 33fb3b6..00e9fb9 100644 --- a/man/raw_data.Rd +++ b/man/raw_data.Rd @@ -1,4 +1,5 @@ -% Generated by roxygen2 (4.0.2): do not edit by hand +% Generated by roxygen2 (4.1.0): do not edit by hand +% Please edit documentation in R/raw_data.R \docType{data} \name{raw_data} \alias{raw_data} From aaa43a81da648eec4a62c719fb27338d91b23f53 Mon Sep 17 00:00:00 2001 From: owen vallis Date: Sun, 15 Mar 2015 20:47:21 -0700 Subject: [PATCH 5/5] modified gitignore to ignore r studio files --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 69057df..83a69ee 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,6 @@ # RStudio files .Rproj.user/ +.Rproj.user +*.Rproj +