From 1ae9c39a01c6a809ea0693c40a816d95acbc5fd1 Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Fri, 3 Aug 2018 11:44:17 -0400 Subject: [PATCH 01/32] move test helper data creation into package proper To make the same test data currently used for unit tests available for code examples and more generally to the package, move it into R/ instead of tests/. This has some caveats since it produces an object instead of functions during execution (package build), but should be OK for the time being. --- NAMESPACE | 1 + tests/testthat/helper_data.R => R/zz_helper_data.R | 14 ++++++++++---- man/test_data.Rd | 14 ++++++++++++++ 3 files changed, 25 insertions(+), 4 deletions(-) rename tests/testthat/helper_data.R => R/zz_helper_data.R (91%) create mode 100644 man/test_data.Rd diff --git a/NAMESPACE b/NAMESPACE index 984d24c..6dcd575 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -51,3 +51,4 @@ export(summarize_sample) export(summarize_sample_guided) export(tabulate_allele_names) export(tally_cts_per_locus) +export(test_data) diff --git a/tests/testthat/helper_data.R b/R/zz_helper_data.R similarity index 91% rename from tests/testthat/helper_data.R rename to R/zz_helper_data.R index 522f99a..e9eeb64 100644 --- a/tests/testthat/helper_data.R +++ b/R/zz_helper_data.R @@ -1,14 +1,20 @@ # simulated data for testing ---------------------------------------------- -# I'm shoving all this into a list to keep it separate from all the other -# objects devtools::load_all() dumps into the namespace. Users loading via -# devtools otherwise end up with a bunch of extra objects that are only relevant -# during testing. +# I'm shoving all this into a list to keep it separate from the non-test-related +# objects in the namespace, but still have it available to both the unit tests +# and regular use. +# Note that having an object stored directly in the package like this (and +# forcing it to be the last file loaded when building) isn't ideal since it gets +# created and stored at build time even though the code is mixed in with the +# regular R functions. A better way might be to explicitly build the test_data +# list and store it in data/ as Hadley describes: +# http://r-pkgs.had.co.nz/data.html #' Helper Data for Tests #' #' This list is a bundle of shared data and functions for running unit tests. +#' @export test_data <- within(list(), { txt.locus_attrs <- "Locus LengthMin LengthMax LengthBuffer Motif Primer ReversePrimer A 131 179 20 TAGA TATCACTGGTGTTAGTCCTCTG CACAGTTGTGTGAGCCAGTC diff --git a/man/test_data.Rd b/man/test_data.Rd new file mode 100644 index 0000000..de1bd8e --- /dev/null +++ b/man/test_data.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/zz_helper_data.R +\docType{data} +\name{test_data} +\alias{test_data} +\title{Helper Data for Tests} +\format{An object of class \code{list} of length 17.} +\usage{ +test_data +} +\description{ +This list is a bundle of shared data and functions for running unit tests. 
+} +\keyword{datasets} From 6cfd4b2e4d0845d57b79a62661a5eb38fcbeddc2 Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Fri, 3 Aug 2018 16:28:55 -0400 Subject: [PATCH 02/32] For #19: add first code example (analyze_seqs) Add outline of CHIIMP workflow in package-level documentation and the first working code example for one function (analyze_seqs) --- R/analyze_seqs.R | 17 +++++++++++++ R/chiimp.R | 56 ++++++++++++++++++++++++++++++++++++++++ man/analyze_seqs.Rd | 16 ++++++++++++ man/chiimp-package.Rd | 59 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 148 insertions(+) diff --git a/R/analyze_seqs.R b/R/analyze_seqs.R index ba06d48..2286425 100644 --- a/R/analyze_seqs.R +++ b/R/analyze_seqs.R @@ -40,6 +40,23 @@ #' @return data frame of dereplicated sequences with added annotations. #' #' @export +#' +#' @examples +#' # Starting from non-locus-specific sequences, +#' # a locus attributes table, and requiring +#' # three side-by-side motif repeats to register +#' # as a motif match for a locus, +#' raw_seq_vector <- c(test_data$seqs1$A, test_data$seqs1$B) +#' locus_attrs <- test_data$locus_attrs +#' num_adjacent_repeats <- 3 +#' # Convert the character vector of sequences +#' # into a data frame with one row per +#' # unique sequence. +#' seq_data <- analyze_seqs(raw_seq_vector, +#' locus_attrs, +#' num_adjacent_repeats) +#' +#' @export analyze_seqs <- function(seqs, locus_attrs, nrepeats) { # Dereplicate sequences tbl <- table(seqs) diff --git a/R/chiimp.R b/R/chiimp.R index ddeb218..d5a7906 100644 --- a/R/chiimp.R +++ b/R/chiimp.R @@ -4,6 +4,62 @@ #' Analyze Microsatellites #' #' Analyze DNA microsatellites in high-throughput sequencing datasets. +#' +#' @details +#' +#' Starting from file inputs and producing file outputs, the overall workflow +#' (handled by \code{\link{full_analysis}} as a configuration-driven wrapper for +#' the entire process) is: +#' +#' * Load input data +#' * Load data frame of sample information from a spreadsheet via +#' \code{\link{load_dataset}} or directly from filenames via +#' \code{\link{prepare_dataset}}. +#' * Load data frame of locus attributes via \code{\link{load_locus_attrs}} +#' * Optionally, load data frame of names for allele sequences via +#' \code{\link{load_allele_names}}. +#' * Optionally, load data frame of known genotypes for named individuals via +#' \code{\link{load_genotypes}}. +#' * Analyze dataset via \code{\link{analyze_dataset}} +#' * Load each sequence data file into a character vector with +#' \code{\link{load_seqs}} and process into a dereplicated data frame with +#' \code{\link{analyze_seqs}}. +#' * For each sample, filter the sequences from the relevant per-file data +#' frame to just those matching the expected locus and identify possible +#' alleles, via \code{\link{analyze_sample}}. (There may be a many-to-one +#' relationship of samples to files, for example with sequencer +#' multiplexing.) +#' * Process each per-sample data frames into a summary list of attributes +#' giving alleles identified and related information, via +#' \code{\link{summarize_sample}}. +#' * Organize \code{analyze_dataset} results into a list of per-file data +#' frames, a list of per-sample data frames, and a single summary data +#' frame across all samples. +#' * Summarize results and add additional comparisons (cross-sample and to +#' known-individual) via \code{\link{summarize_dataset}}. +#' * Tabulate sequence counts per sample matching each locus' primer via +#' \code{\link{tally_cts_per_locus}}. 
+#' * Align identified alleles for each locus via
+#' \code{\link{align_alleles}}.
+#' * Create a sample-to-sample distance matrix of allele mismatches via
+#' \code{\link{make_dist_mat}}.
+#' * If genotypes for known individuals were provided, create a
+#' sample-to-known-individual distance matrix via
+#' \code{\link{make_dist_mat_known}}.
+#' * If identities of samples were provided, score genotyping success via
+#' \code{\link{match_known_genotypes}} and
+#' \code{\link{categorize_genotype_results}}.
+#' * Save analysis results to files.
+#' * Create an HTML report document summarizing all results.
+#'
+#' The workflow above outlines CHIIMP's behavior when called as a standalone
+#' program, where \code{\link{main}} loads a configuration file into a nested
+#' list of options and calls \code{\link{full_analysis}}. The public functions
+#' linked above can also be used independently; see the documentation and code
+#' examples for the individual functions for more information.
+#'
+#' @md
+#'
 "_PACKAGE"

 # Analysis ----------------------------------------------------------------
diff --git a/man/analyze_seqs.Rd b/man/analyze_seqs.Rd
index 2120b52..77350fc 100644
--- a/man/analyze_seqs.Rd
+++ b/man/analyze_seqs.Rd
@@ -49,3 +49,19 @@ by each unique sequence compared to the total for that
 particular matching locus.
 }
 }
+\examples{
+# Starting from non-locus-specific sequences,
+# a locus attributes table, and requiring
+# three side-by-side motif repeats to register
+# as a motif match for a locus,
+raw_seq_vector <- c(test_data$seqs1$A, test_data$seqs1$B)
+locus_attrs <- test_data$locus_attrs
+num_adjacent_repeats <- 3
+# Convert the character vector of sequences
+# into a data frame with one row per
+# unique sequence.
+seq_data <- analyze_seqs(raw_seq_vector,
+                         locus_attrs,
+                         num_adjacent_repeats)
+
+}
diff --git a/man/chiimp-package.Rd b/man/chiimp-package.Rd
index bb6a635..a1aa873 100644
--- a/man/chiimp-package.Rd
+++ b/man/chiimp-package.Rd
@@ -8,6 +8,65 @@
 \description{
 Analyze DNA microsatellites in high-throughput sequencing datasets.
 }
+\details{
+Starting from file inputs and producing file outputs, the overall workflow
+(handled by \code{\link{full_analysis}} as a configuration-driven wrapper for
+the entire process) is:
+\itemize{
+\item Load input data
+\itemize{
+\item Load data frame of sample information from a spreadsheet via
+\code{\link{load_dataset}} or directly from filenames via
+\code{\link{prepare_dataset}}.
+\item Load data frame of locus attributes via \code{\link{load_locus_attrs}}
+\item Optionally, load data frame of names for allele sequences via
+\code{\link{load_allele_names}}.
+\item Optionally, load data frame of known genotypes for named individuals via
+\code{\link{load_genotypes}}.
+}
+\item Analyze dataset via \code{\link{analyze_dataset}}
+\itemize{
+\item Load each sequence data file into a character vector with
+\code{\link{load_seqs}} and process into a dereplicated data frame with
+\code{\link{analyze_seqs}}.
+\item For each sample, filter the sequences from the relevant per-file data
+frame to just those matching the expected locus and identify possible
+alleles, via \code{\link{analyze_sample}}. (There may be a many-to-one
+relationship of samples to files, for example with sequencer
+multiplexing.)
+\item Process each per-sample data frame into a summary list of attributes
+giving alleles identified and related information, via
+\code{\link{summarize_sample}}.
+\item Organize \code{analyze_dataset} results into a list of per-file data
+frames, a list of per-sample data frames, and a single summary data
+frame across all samples.
+}
+\item Summarize results and add additional comparisons (cross-sample and to
+known-individual) via \code{\link{summarize_dataset}}.
+\itemize{
+\item Tabulate sequence counts per sample matching each locus' primer via
+\code{\link{tally_cts_per_locus}}.
+\item Align identified alleles for each locus via
+\code{\link{align_alleles}}.
+\item Create a sample-to-sample distance matrix of allele mismatches via
+\code{\link{make_dist_mat}}.
+\item If genotypes for known individuals were provided, create a
+sample-to-known-individual distance matrix via
+\code{\link{make_dist_mat_known}}.
+\item If identities of samples were provided, score genotyping success via
+\code{\link{match_known_genotypes}} and
+\code{\link{categorize_genotype_results}}.
+}
+\item Save analysis results to files.
+\item Create an HTML report document summarizing all results.
+}
+
+The workflow above outlines CHIIMP's behavior when called as a standalone
+program, where \code{\link{main}} loads a configuration file into a nested
+list of options and calls \code{\link{full_analysis}}. The public functions
+linked above can also be used independently; see the documentation and code
+examples for the individual functions for more information.
+}
 \author{
 \strong{Maintainer}: Jesse Connell \email{ancon@upenn.edu}

From 48352f0c013777864dc9dd2ce35aeb512cd202ba Mon Sep 17 00:00:00 2001
From: Jesse Connell
Date: Fri, 3 Aug 2018 16:48:02 -0400
Subject: [PATCH 03/32] comment cleanup

---
 R/analyze_seqs.R    | 2 --
 man/analyze_seqs.Rd | 1 -
 2 files changed, 3 deletions(-)

diff --git a/R/analyze_seqs.R b/R/analyze_seqs.R
index 2286425..bbb8227 100644
--- a/R/analyze_seqs.R
+++ b/R/analyze_seqs.R
@@ -55,8 +55,6 @@
 #' seq_data <- analyze_seqs(raw_seq_vector,
 #'                          locus_attrs,
 #'                          num_adjacent_repeats)
-#'
-#' @export
 analyze_seqs <- function(seqs, locus_attrs, nrepeats) {
   # Dereplicate sequences
   tbl <- table(seqs)
diff --git a/man/analyze_seqs.Rd b/man/analyze_seqs.Rd
index 77350fc..62116cf 100644
--- a/man/analyze_seqs.Rd
+++ b/man/analyze_seqs.Rd
@@ -63,5 +63,4 @@ num_adjacent_repeats <- 3
 seq_data <- analyze_seqs(raw_seq_vector,
                          locus_attrs,
                          num_adjacent_repeats)
-
 }

From eb45c8d850211298ccce74475a13d6d88a8d2b48 Mon Sep 17 00:00:00 2001
From: Jesse Connell
Date: Fri, 3 Aug 2018 17:32:10 -0400
Subject: [PATCH 04/32] more docs for io/configuration

---
 R/io.R                       | 81 ++++++++++++++++++++++++++------------
 man/load_config.Rd           |  8 +++-
 man/load_dataset.Rd          | 21 ++++++----
 man/load_genotypes.Rd        |  4 +-
 man/load_locus_attrs.Rd      |  9 +++-
 man/prepare_dataset.Rd       | 13 +++---
 man/save_alignment_images.Rd |  3 +-
 man/save_alignments.Rd       |  3 +-
 man/save_dataset.Rd          |  3 +-
 man/save_histograms.Rd       |  4 +-
 man/save_sample_data.Rd      |  5 ++-
 man/save_seqfile_data.Rd     |  7 ++--
 12 files changed, 107 insertions(+), 54 deletions(-)

diff --git a/R/io.R b/R/io.R
index e5340a5..208f080 100644
--- a/R/io.R
+++ b/R/io.R
@@ -19,13 +19,19 @@ dataset_cols <- c("Filename", "Replicate", "Sample", "Locus")
 #'
 #' Load a YAML-formatted text file of configuration options for microsatellite
 #' analysis. This is currently just a wrapper around
-#' \code{\link[yaml]{yaml.load}}.
+#' \code{\link[yaml]{yaml.load}}. The \code{\link{main}} function loads
+#' configuration options with this for use by \code{\link{full_analysis}}.
 #'
 #' @param fp path to configuration file.
#' #' @return list of configuration options #' #' @export +#' +#' @examples +#' filename <- system.file("example_config.yml", package = "chiimp") +#' config <- load_config(filename) +#' # And then: full_analysis(config) load_config <- function(fp) { if (is.na(fp)) return(list()) @@ -35,8 +41,12 @@ load_config <- function(fp) { #' Load table of locus attributes #' -#' Load a comma-separated table of locus attributes to use for analysis. +#' Load a comma-separated table of locus attributes to use for analysis. This +#' is called automatically during \code{\link{full_analysis}}, with the data +#' frame then used by \code{\link{analyze_seqs}} within +#' \code{\link{analyze_dataset}}. #' +#' @details #' Columns Required: #' * Locus: Unique identifier for a given locus #' * LengthMin: Minimum known allele sequence length for this locus @@ -54,6 +64,10 @@ load_config <- function(fp) { #' @return data frame of locus attributes #' #' @export +#' +#' @examples +#' filename <- system.file("example_locus_attrs.csv", package = "chiimp") +#' locus_attrs <- load_locus_attrs(filename) load_locus_attrs <- function(fp.locus_attrs, ...) { data <- utils::read.table(fp.locus_attrs, header = TRUE, @@ -99,7 +113,9 @@ load_allele_names <- function(fp, ...) { #' Load table of genotypes #' -#' Load a comma-separated table of genotypes, one pair of alleles per row. +#' Load a comma-separated table of genotypes, one pair of alleles per row. This +#' information is used to compare samples to genotypes of known individuals in +#' \code{\link{summarize_dataset}}. #' #' @param fp path to text file. #' @param ... additional arguments passed to \code{\link[utils]{read.table}}. @@ -127,15 +143,20 @@ load_genotypes <- function(fp, ...) { #' Load table of sample attributes #' #' Load a comma-separated table of sample attributes for the dataset to be -#' anlayzed. Columns should be Filename (the path to each data file), Replicate -#' (an identifier for repeated samples; use blanks if not applicable), Sample -#' (identifier for a given biological sample), and Locus (a locus identifier -#' matching that used in the locus attributes table). +#' analyzed. Alternatively, use \code{\link{prepare_dataset}} to automatically +#' read sample attributes from filenames. If more than one locus is to be +#' analyzed from a single sequencer sample (i.e., multiplexed samples), either +#' the \code{locusmap} argument to \code{prepare_dataset} can be used, or +#' \code{load_dataset} with an explicit mapping of loci to files. #' -#' Alternatively, use \code{\link{prepare_dataset}} to automatically read sample -#' attributes from filenames. If more than one locus is to be analyzed from a -#' single sequencer sample (i.e., multiplexed samples), \code{load_dataset} -#' should be used. +#' @details +#' Columns Required: +#' * Filename: path to each data file +#' * Replicate: identifier for repeated samples; use blanks if not applicable +#' * Sample: identifier for a given biological sample +#' * Locus: locus identifier matching that used in the locus attributes table +#' (see \code{\link{load_locus_attrs}}) +#' @md #' #' @param fp path to text file. #' @param ... additional arguments passed to \code{\link[utils]{read.table}}. @@ -161,7 +182,8 @@ load_dataset <- function(fp, ...) { #' Save table of sample attributes #' -#' Save a comma-separated table of sample attributes. +#' Save a comma-separated table of sample attributes. (This is a convenience +#' function not used automatically in the analysis.) 
#' #' @param data data frame of sample attributes as produced by #' \code{\link{prepare_dataset}} or \code{\link{load_dataset}}. @@ -181,12 +203,13 @@ save_dataset <- function(data, fp, ...) { #' Extract Sample Attributes from Filenames #' #' Find files matching a pattern in a given directory, and build a data frame of -#' standard sample attributes from fields in the filenames. Alternatively, use -#' \code{\link{load_dataset}} to load a spreadsheet of sample attributes -#' explicitly. \code{load_dataset} can be used for cases where more than one -#' locus is to be analyzed from a single sequencer sample (i.e., multiplexed -#' samples), though the \code{locusmap} argument here can allow automatic -#' matching of locus names for multiplexed samples. +#' standard sample attributes from fields in the filenames. Nested directory +#' structures are supported. Alternatively, use \code{\link{load_dataset}} to +#' load a spreadsheet of sample attributes explicitly. \code{load_dataset} can +#' be used for cases where more than one locus is to be analyzed from a single +#' sequencer sample (i.e., multiplexed samples), though the \code{locusmap} +#' argument here can allow automatic matching of locus names for multiplexed +#' samples. #' #' @param dp directory path to search for matching data files. #' @param pattern regular expression to use for parsing filenames. There should @@ -358,9 +381,10 @@ save_allele_seqs <- function(results_summary, dp) { #' Save per-file processed data to text files #' -#' Save each per-file data frame produced by \code{\link{analyze_dataset}} to a -#' separate file in the specified directory path, in CSV format. The directory -#' structure will start at the first shared directory of the input file paths. +#' Save each per-file data frame produced by \code{\link{analyze_dataset}} (via +#' \code{\link{analyze_seqs}}) to a separate file in the specified directory +#' path, in CSV format. The directory structure will start at the first shared +#' directory of the input file paths. #' For example, if the inputs were /data/run1/file.fastq and #' /data/run2/file.fastq there will be run1 and run2 directories inside the #' given `dp` directory. @@ -387,8 +411,9 @@ save_seqfile_data <- function(results_file_data, dp) { #' Save per-sample processed data to text files #' -#' Save each per-sample data frame produced by \code{\link{analyze_dataset}} to -#' a separate file in the specified directory path, in CSV format. +#' Save each per-sample data frame produced by \code{\link{analyze_dataset}} +#' (via \code{\link{analyze_sample}}) to a separate file in the specified +#' directory path, in CSV format. #' #' @param results_data list of per-sample data frames as produced by #' \code{\link{analyze_dataset}}. @@ -408,7 +433,8 @@ save_sample_data <- function(results_data, dp) { #' #' Take a list of alignments, one per locus, and save each to a separate fasta #' file in a specified directory. If any of the per-locus alignment objects is -#' NA it will be skipped. +#' NA it will be skipped. These are produced by \code{\link{summarize_dataset}} +#' via \code{\link{align_alleles}}. #' #' @param alignments list of MSA alignment objects, such as created by #' \code{\link{summarize_dataset}} via \code{\link{align_alleles}}. The name @@ -434,7 +460,8 @@ save_alignments <- function(alignments, dp) { #' #' Take a list of alignments, one per locus, and save a plot of each to a #' separate image file in a specified directory. If any of the per-locus -#' alignment objects is NA it will be skipped. 
+#' alignment objects is NA it will be skipped. These are produced by +#' \code{\link{summarize_dataset}} via \code{\link{align_alleles}}. #' #' @param alignments list of MSA alignment objects, such as created by #' \code{\link{summarize_dataset}} via \code{\link{align_alleles}}. The name @@ -469,8 +496,8 @@ save_alignment_images <- function(alignments, dp, image.func="png", #' Save sequence histogram visualizations to image files #' -#' Take a full results list and save a histogram of each sample to a separate -#' image file in a specified directory. +#' Take a full results list and save a histogram (via \code{\link{histogram}}) +#' of each sample to a separate image file in a specified directory. #' #' @param results list of results as created by \code{\link{analyze_dataset}}. #' @param dp output directory path. diff --git a/man/load_config.Rd b/man/load_config.Rd index ba992e5..d4adf04 100644 --- a/man/load_config.Rd +++ b/man/load_config.Rd @@ -15,5 +15,11 @@ list of configuration options \description{ Load a YAML-formatted text file of configuration options for microsatellite analysis. This is currently just a wrapper around -\code{\link[yaml]{yaml.load}}. +\code{\link[yaml]{yaml.load}}. The \code{\link{main}} function loads +configuration options with this for use by \code{\link{full_analysis}}. +} +\examples{ +filename <- system.file("example_config.yml", package = "chiimp") +config <- load_config(filename) +# And then: full_analysis(config) } diff --git a/man/load_dataset.Rd b/man/load_dataset.Rd index 8e65a50..1f2c5a3 100644 --- a/man/load_dataset.Rd +++ b/man/load_dataset.Rd @@ -16,14 +16,19 @@ data frame of sample attributes for the dataset. } \description{ Load a comma-separated table of sample attributes for the dataset to be -anlayzed. Columns should be Filename (the path to each data file), Replicate -(an identifier for repeated samples; use blanks if not applicable), Sample -(identifier for a given biological sample), and Locus (a locus identifier -matching that used in the locus attributes table). +analyzed. Alternatively, use \code{\link{prepare_dataset}} to automatically +read sample attributes from filenames. If more than one locus is to be +analyzed from a single sequencer sample (i.e., multiplexed samples), either +the \code{locusmap} argument to \code{prepare_dataset} can be used, or +\code{load_dataset} with an explicit mapping of loci to files. } \details{ -Alternatively, use \code{\link{prepare_dataset}} to automatically read sample -attributes from filenames. If more than one locus is to be analyzed from a -single sequencer sample (i.e., multiplexed samples), \code{load_dataset} -should be used. +Columns Required: +\itemize{ +\item Filename: path to each data file +\item Replicate: identifier for repeated samples; use blanks if not applicable +\item Sample: identifier for a given biological sample +\item Locus: locus identifier matching that used in the locus attributes table +(see \code{\link{load_locus_attrs}}) +} } diff --git a/man/load_genotypes.Rd b/man/load_genotypes.Rd index ecae24e..61075f8 100644 --- a/man/load_genotypes.Rd +++ b/man/load_genotypes.Rd @@ -15,5 +15,7 @@ load_genotypes(fp, ...) data frame of genotypes } \description{ -Load a comma-separated table of genotypes, one pair of alleles per row. +Load a comma-separated table of genotypes, one pair of alleles per row. This +information is used to compare samples to genotypes of known individuals in +\code{\link{summarize_dataset}}. 
} diff --git a/man/load_locus_attrs.Rd b/man/load_locus_attrs.Rd index 93a7a3a..d985235 100644 --- a/man/load_locus_attrs.Rd +++ b/man/load_locus_attrs.Rd @@ -15,7 +15,10 @@ load_locus_attrs(fp.locus_attrs, ...) data frame of locus attributes } \description{ -Load a comma-separated table of locus attributes to use for analysis. +Load a comma-separated table of locus attributes to use for analysis. This +is called automatically during \code{\link{full_analysis}}, with the data +frame then used by \code{\link{analyze_seqs}} within +\code{\link{analyze_dataset}}. } \details{ Columns Required: @@ -30,3 +33,7 @@ matching sequences to loci \item ReversePrimer: The reverse PCR primer sequence } } +\examples{ +filename <- system.file("example_locus_attrs.csv", package = "chiimp") +locus_attrs <- load_locus_attrs(filename) +} diff --git a/man/prepare_dataset.Rd b/man/prepare_dataset.Rd index 45f33f3..a003801 100644 --- a/man/prepare_dataset.Rd +++ b/man/prepare_dataset.Rd @@ -34,10 +34,11 @@ data frame of metadata for all files found } \description{ Find files matching a pattern in a given directory, and build a data frame of -standard sample attributes from fields in the filenames. Alternatively, use -\code{\link{load_dataset}} to load a spreadsheet of sample attributes -explicitly. \code{load_dataset} can be used for cases where more than one -locus is to be analyzed from a single sequencer sample (i.e., multiplexed -samples), though the \code{locusmap} argument here can allow automatic -matching of locus names for multiplexed samples. +standard sample attributes from fields in the filenames. Nested directory +structures are supported. Alternatively, use \code{\link{load_dataset}} to +load a spreadsheet of sample attributes explicitly. \code{load_dataset} can +be used for cases where more than one locus is to be analyzed from a single +sequencer sample (i.e., multiplexed samples), though the \code{locusmap} +argument here can allow automatic matching of locus names for multiplexed +samples. } diff --git a/man/save_alignment_images.Rd b/man/save_alignment_images.Rd index e6d2740..1d04653 100644 --- a/man/save_alignment_images.Rd +++ b/man/save_alignment_images.Rd @@ -25,5 +25,6 @@ of each alignment will be used for its filename.} \description{ Take a list of alignments, one per locus, and save a plot of each to a separate image file in a specified directory. If any of the per-locus -alignment objects is NA it will be skipped. +alignment objects is NA it will be skipped. These are produced by +\code{\link{summarize_dataset}} via \code{\link{align_alleles}}. } diff --git a/man/save_alignments.Rd b/man/save_alignments.Rd index eb6997e..8356c93 100644 --- a/man/save_alignments.Rd +++ b/man/save_alignments.Rd @@ -16,5 +16,6 @@ of each alignment will be used for its filename.} \description{ Take a list of alignments, one per locus, and save each to a separate fasta file in a specified directory. If any of the per-locus alignment objects is -NA it will be skipped. +NA it will be skipped. These are produced by \code{\link{summarize_dataset}} +via \code{\link{align_alleles}}. } diff --git a/man/save_dataset.Rd b/man/save_dataset.Rd index 3aedc51..91de23d 100644 --- a/man/save_dataset.Rd +++ b/man/save_dataset.Rd @@ -15,5 +15,6 @@ save_dataset(data, fp, ...) \item{...}{additional arguments passed to \code{\link[utils]{read.table}}.} } \description{ -Save a comma-separated table of sample attributes. +Save a comma-separated table of sample attributes. 
(This is a convenience +function not used automatically in the analysis.) } diff --git a/man/save_histograms.Rd b/man/save_histograms.Rd index 0e988f0..62466de 100644 --- a/man/save_histograms.Rd +++ b/man/save_histograms.Rd @@ -21,6 +21,6 @@ save_histograms(results, dp, image.func = "png", width = 1600, \item{res}{integer resolution of image in PPI.} } \description{ -Take a full results list and save a histogram of each sample to a separate -image file in a specified directory. +Take a full results list and save a histogram (via \code{\link{histogram}}) +of each sample to a separate image file in a specified directory. } diff --git a/man/save_sample_data.Rd b/man/save_sample_data.Rd index c721401..6fa3eb9 100644 --- a/man/save_sample_data.Rd +++ b/man/save_sample_data.Rd @@ -13,6 +13,7 @@ save_sample_data(results_data, dp) \item{dp}{output directory path to use for all files.} } \description{ -Save each per-sample data frame produced by \code{\link{analyze_dataset}} to -a separate file in the specified directory path, in CSV format. +Save each per-sample data frame produced by \code{\link{analyze_dataset}} +(via \code{\link{analyze_sample}}) to a separate file in the specified +directory path, in CSV format. } diff --git a/man/save_seqfile_data.Rd b/man/save_seqfile_data.Rd index 708e01f..c0f8d12 100644 --- a/man/save_seqfile_data.Rd +++ b/man/save_seqfile_data.Rd @@ -13,9 +13,10 @@ save_seqfile_data(results_file_data, dp) \item{dp}{output directory path to use for all files.} } \description{ -Save each per-file data frame produced by \code{\link{analyze_dataset}} to a -separate file in the specified directory path, in CSV format. The directory -structure will start at the first shared directory of the input file paths. +Save each per-file data frame produced by \code{\link{analyze_dataset}} (via +\code{\link{analyze_seqs}}) to a separate file in the specified directory +path, in CSV format. The directory structure will start at the first shared +directory of the input file paths. For example, if the inputs were /data/run1/file.fastq and /data/run2/file.fastq there will be run1 and run2 directories inside the given `dp` directory. From 5ec66c3bcffc114100da871b665c45a057505cf5 Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Fri, 3 Aug 2018 17:32:43 -0400 Subject: [PATCH 05/32] configuration docs --- R/configuration.R | 3 ++- man/config.defaults.Rd | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/R/configuration.R b/R/configuration.R index 4c2dc2e..175498e 100644 --- a/R/configuration.R +++ b/R/configuration.R @@ -4,7 +4,8 @@ #' aspects of the microsatellite analysis. These can be overridden by passing a #' list to \code{\link{full_analysis}} with entries of the same names, or via #' the configuration file passed to \code{\link{main}} and read via -#' \code{\link{load_config}}. +#' \code{\link{load_config}}. Check the contents of \code{config.defaults} +#' itself to see all of the build-time defaults. #' #' Notable Options: #' * dataset_opts: diff --git a/man/config.defaults.Rd b/man/config.defaults.Rd index 731d113..f669f8c 100644 --- a/man/config.defaults.Rd +++ b/man/config.defaults.Rd @@ -13,7 +13,8 @@ The entries in this list show the build-time configuration defaults for all aspects of the microsatellite analysis. These can be overridden by passing a list to \code{\link{full_analysis}} with entries of the same names, or via the configuration file passed to \code{\link{main}} and read via -\code{\link{load_config}}. +\code{\link{load_config}}. 
Check the contents of \code{config.defaults} +itself to see all of the build-time defaults. } \details{ Notable Options: From aac1384a5848574189e5a7b27daa3d91d1beb915 Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Fri, 3 Aug 2018 17:33:11 -0400 Subject: [PATCH 06/32] more package-level docs --- R/chiimp.R | 12 ++++++++++-- man/chiimp-package.Rd | 12 ++++++++++-- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/R/chiimp.R b/R/chiimp.R index d5a7906..6d00ed3 100644 --- a/R/chiimp.R +++ b/R/chiimp.R @@ -11,7 +11,8 @@ #' (handled by \code{\link{full_analysis}} as a configuration-driven wrapper for #' the entire process) is: #' -#' * Load input data +#' * Load input data. The input spreadsheets are text files using +#' comma-separated values (CSV). #' * Load data frame of sample information from a spreadsheet via #' \code{\link{load_dataset}} or directly from filenames via #' \code{\link{prepare_dataset}}. @@ -49,9 +50,16 @@ #' * If identities of samples were provided, score genotyping success via #' \code{\link{match_known_genotypes}} and #' \code{\link{categorize_genotype_results}}. -#' * Save analysis results to files. +#' * Save analysis results to files. Spreadsheets are in CSV format for output +#' as well as input. Some output files are in FASTA format (alignments and +#' alleles) or are PNG images (alignment visualization and sequence count +#' histograms). If specified in the configuration, \code{\link{saveRDS}} is +#' called on the entire output as well, saving to \code{results.rds} by +#' default. #' * Create an HTML report document summarizing all results. #' +#' For defaults used in the configuration, see \code{\link{config.defaults}}. +#' #' The workflow above outlines CHIIMP's behavior when called as a standalone #' program, where \code{\link{main}} loads a configuration file into a nested #' list of options and calls \code{\link{full_analysis}}. The public functions diff --git a/man/chiimp-package.Rd b/man/chiimp-package.Rd index a1aa873..6cd0e5e 100644 --- a/man/chiimp-package.Rd +++ b/man/chiimp-package.Rd @@ -13,7 +13,8 @@ Starting from file inputs and producing file outputs, the overall workflow (handled by \code{\link{full_analysis}} as a configuration-driven wrapper for the entire process) is: \itemize{ -\item Load input data +\item Load input data. The input spreadsheets are text files using +comma-separated values (CSV). \itemize{ \item Load data frame of sample information from a spreadsheet via \code{\link{load_dataset}} or directly from filenames via @@ -57,10 +58,17 @@ sample-to-known-individual distance matrix via \code{\link{match_known_genotypes}} and \code{\link{categorize_genotype_results}}. } -\item Save analysis results to files. +\item Save analysis results to files. Spreadsheets are in CSV format for output +as well as input. Some output files are in FASTA format (alignments and +alleles) or are PNG images (alignment visualization and sequence count +histograms). If specified in the configuration, \code{\link{saveRDS}} is +called on the entire output as well, saving to \code{results.rds} by +default. \item Create an HTML report document summarizing all results. } +For defaults used in the configuration, see \code{\link{config.defaults}}. + The workflow above outlines CHIIMP's behavior when called as a standalone program, where \code{\link{main}} loads a configuration file into a nested list of options and calls \code{\link{full_analysis}}. 
The public functions
 linked above can also be used independently; see the documentation and code

From 2123df1746743136ab8f68be9df22cebfec3954e Mon Sep 17 00:00:00 2001
From: Jesse Connell
Date: Fri, 3 Aug 2018 17:33:19 -0400
Subject: [PATCH 07/32] update README: mention ?chiimp for package docs

---
 README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index d0d1aa7..20145fc 100644
--- a/README.md
+++ b/README.md
@@ -9,4 +9,6 @@ high-throughput sequencing datasets.
 For automated installation and program usage see GUIDE.pdf in a [released
 version](https://github.com/ShawHahnLab/chiimp/releases). The most recent
 released version is [0.2.1](https://github.com/ShawHahnLab/chiimp/releases/tag/0.2.1).
-For usage as an R package also see the built-in package documentation.
+For usage as an R package also see the built-in package documentation. The
+package-level page (`?chiimp`) provides an overview with links to specific
+functions.

From 06dc87dbaf3ed3786a41776ba1f87328989ce600 Mon Sep 17 00:00:00 2001
From: Jesse Connell
Date: Wed, 7 Nov 2018 14:54:01 -0500
Subject: [PATCH 08/32] update guide to clarify installation

---
 GUIDE.Rmd | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/GUIDE.Rmd b/GUIDE.Rmd
index 8704ec5..9fabb63 100644
--- a/GUIDE.Rmd
+++ b/GUIDE.Rmd
@@ -42,18 +42,21 @@ are installed, follow the specific instructions below for your operating
 system.

 On Windows, double-click the `install_windows.cmd` script. This will install
 the package and R dependencies, and create a desktop shortcut.

+### Mac OS
+
+On Mac OS, double-click the `install_mac.command` shell script to automatically
+install the package along with R dependencies and create a desktop alias. If
+the install script won't open because of a security warning, you can
+right-click (control+click) and select "Open" in the menu that appears.
+Apple has specific instructions [here](https://support.apple.com/kb/PH25088?locale=en_US)
+about these security settings.
+
 ### Linux

 On Linux, run the `install_linux.sh` shell script to automatically install the
 package along with R dependencies. An icon for the program is created at
 `$HOME/Desktop/CHIIMP`.

-### Mac OS
-
-On Mac OS, run the `install_mac.command` shell script to automatically install
-the package along with R dependencies. An icon for the program is created at
-`$HOME/Desktop/CHIIMP`.
-
 ## Input Data Organization

 The information CHIIMP uses during analysis is:

From f760ffefe64763c268a2290aa3786c70151cf38f Mon Sep 17 00:00:00 2001
From: Jesse Connell
Date: Wed, 7 Nov 2018 15:00:04 -0500
Subject: [PATCH 09/32] update datestamp on guide

---
 GUIDE.Rmd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GUIDE.Rmd b/GUIDE.Rmd
index 9fabb63..8e35af2 100644
--- a/GUIDE.Rmd
+++ b/GUIDE.Rmd
@@ -4,7 +4,7 @@ title: "CHIIMP User Guide"
 author: "Jesse Connell"
-date: "2018/07/23"
+date: "2018/11/07"
 output:
   pdf_document:
     toc: true

From 828916d7cb85fe0eea4b68ced8f678cb12cde0bf Mon Sep 17 00:00:00 2001
From: Jesse Connell
Date: Wed, 5 Dec 2018 15:34:30 -0500
Subject: [PATCH 10/32] warn in full_analysis of any zero-read input files

---
 .travis.yml            |  1 +
 R/chiimp.R             |  5 +++++
 inst/bin/demo_empty.sh | 20 ++++++++++++++++++++
 3 files changed, 26 insertions(+)
 create mode 100755 inst/bin/demo_empty.sh

diff --git a/.travis.yml b/.travis.yml
index d367744..73836ab 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -11,3 +11,4 @@ script:
 - R CMD build .
- R CMD check *tar.gz - ./inst/bin/demo.sh + - ./inst/bin/demo_empty.sh diff --git a/R/chiimp.R b/R/chiimp.R index ddeb218..8e80135 100644 --- a/R/chiimp.R +++ b/R/chiimp.R @@ -73,6 +73,11 @@ full_analysis <- function(config, dataset=NULL) { summary_function = sample_summary_func, known_alleles = allele.names, name_args = cfg$dataset_analysis$name_args) + empties <- sum(sapply(results$files, nrow) == 0) + if (empties) { + logmsg(paste("WARNING: Zero reads for", empties, "of", + length(results$files), "data files")) + } results$allele.names <- allele.names results$locus_attrs <- locus_attrs if (cfg$verbose) logmsg("Summarizing results...") diff --git a/inst/bin/demo_empty.sh b/inst/bin/demo_empty.sh new file mode 100755 index 0000000..c138b8f --- /dev/null +++ b/inst/bin/demo_empty.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash + +# Wrapper script to set up and execute a demo, using the "chiimp" executable +# R script and the test data. +# +# Special case: all empty input files + +dir=$(readlink -f $(dirname $BASH_SOURCE)) +scratch=$(mktemp -d) +mkdir -p "$scratch/str-dataset" +for samp in {1..3}; do + for locus in A B 1 2; do + touch "$scratch/str-dataset/Replicate1-Sample${samp}-${locus}.fasta" + done +done +cp "$dir/../example_locus_attrs.csv" "$scratch/locus_attrs.csv" +cp "$dir/../example_config.yml" "$scratch/config.yml" +cd "$scratch" +echo "dataset_analysis: { ncores: 1 }" >> "config.yml" +"$dir/chiimp" "config.yml" From 3ba9f053e29307fff345a2e86b886323b19993e3 Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Wed, 5 Dec 2018 16:20:11 -0500 Subject: [PATCH 11/32] add failing tests for load_dataset/prepare_dataset Relates to #20 --- tests/testthat/test_io.R | 79 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 73 insertions(+), 6 deletions(-) diff --git a/tests/testthat/test_io.R b/tests/testthat/test_io.R index 89bbbf3..dfbb012 100644 --- a/tests/testthat/test_io.R +++ b/tests/testthat/test_io.R @@ -3,6 +3,10 @@ context("Test input/output functions") # helpers ------------------------------------------------------------------- +# append an empty string to each the given files +touch <- function(fps) { + lapply(fps, function(fp) cat("", file = fp, append = TRUE)) +} # Take a version of the raw locus_attrs text from helper_data.R, save it to a # temporary file in TSV format, and return the path. @@ -25,7 +29,7 @@ setup_data_dir <- function(replicates, samples, loci, ord=c(1, 2, 3)) { # (hardcoded to match stems) pattern <- "(\\d+)-(\\d+)-([A-Za-z0-9]+)" fps <- file.path(dp, paste0(stems, ".fastq")) - lapply(fps, function(fp) cat("", file = fp)) + touch(fps) return(list(dp = dp, fps = fps, pattern = pattern)) } @@ -113,19 +117,50 @@ with(test_data, { test_that("load_dataset loads sample attributes", { dataset_known <- setup_dataset() + # Touch the input files so they at least exist + data.dir <- tempfile() + dir.create(data.dir) + setwd(data.dir) + touch(dataset_known$Filename) + # Write dataset CSV fp <- tempfile() write.csv(dataset_known, file = fp, na = "", row.names = F) - dataset <- load_dataset(fp) + expect_silent({ + dataset <- load_dataset(fp) + }) + expect_identical(dataset, dataset_known) + }) + + test_that("load_dataset warns of missing files", { + # Here we'll skip writing any actual data files, so load_dataset should + # complain. 
+ dataset_known <- setup_dataset() + data.dir <- tempfile() + dir.create(data.dir) + setwd(data.dir) + fp <- tempfile() + write.csv(dataset_known, file = fp, na = "", row.names = F) + # expect_message and capture_messages both do NOT catch text send to stderr, + # though capture.output(..., type = "message") does. + msg <- capture.output({ + dataset <- load_dataset(fp) + }, type = "message") + expect_true(length(grep("WARNING: Missing 60 of 60 data files", msg)) == 1) expect_identical(dataset, dataset_known) }) + # test save_dataset ------------------------------------------------------- test_that("save_dataset saves sample attributes", { # We'll just make sure that saving and then re-loading provides what was # saved. load_dataset is tested separately above. + data.dir <- tempfile() + dir.create(data.dir) + setwd(data.dir) dataset_known <- setup_dataset() + touch(dataset_known$Filename) fp <- tempfile() save_dataset(dataset_known, fp) dataset <- load_dataset(fp) @@ -198,7 +233,7 @@ with(test_data, { samples <- 1:5 loci <- sort(rownames(locus_attrs)) data <- setup_data_dir(replicates, samples, loci) - cat("", file = paste0(data$fps[3], ".extra")) + touch(paste0(data$fps[3], ".extra")) expect_warning({ dataset <- prepare_dataset(data$dp, data$pattern) }, @@ -211,8 +246,8 @@ with(test_data, { samples <- 1:5 loci <- sort(rownames(locus_attrs)) data <- setup_data_dir(replicates, samples, loci) - cat("", file = paste0(data$fps[3], ".2")) - cat("", file = paste0(data$fps[3], ".3")) + touch(paste0(data$fps[3], ".2")) + touch(paste0(data$fps[3], ".3")) dataset <- prepare_dataset(data$dp, pattern = "()1-(\\d+)-([A-Za-z0-9]+)", autorep = TRUE) @@ -229,7 +264,24 @@ with(test_data, { }) test_that("prepare_dataset works on nested directories", { - skip("test not yet implemented") + ## Create two separate sets of files in subdirectories + replicates <- 1:3 + samples1 <- 1:3 + samples2 <- 4:6 + loci <- sort(rownames(locus_attrs)) + data1 <- setup_data_dir(replicates, samples1, loci) + data2 <- setup_data_dir(replicates, samples2, loci) + # move both sets into a single parent directory + dp <- tempfile() + dir.create(dp) + file.copy(data1$dp, dp, recursive=TRUE) + file.copy(data2$dp, dp, recursive=TRUE) + # build dataset from parent directory + dataset <- prepare_dataset(dp, data1$pattern) + expect_equal(colnames(dataset), + c("Filename", "Replicate", "Sample", "Locus")) + expect_equal(sort(dataset$Filename), + sort(list.files(dp, recursive = TRUE, full.names = TRUE))) }) test_that("prepare_dataset can separate multiplexed samples", { @@ -259,6 +311,21 @@ with(test_data, { expect_equal(dataset, dataset_known) }) + test_that("prepare_dataset handles missing data directory", { + dp <- tempfile() + expect_error({ + prepare_dataset(dp, "(\\d+)-(\\d+)-([A-Za-z0-9]+)") + }, paste("ERROR: directory path for data files does not exist:", dp)) + }) + + test_that("prepare_dataset handles no-samples case", { + dp <- tempfile() + dir.create(dp) + expect_error({ + prepare_dataset(dp, "(\\d+)-(\\d+)-([A-Za-z0-9]+)") + }, paste("ERROR: no data files found:", dp)) + }) + # test save_seqfile_data -------------------------------------------------- From 5faa5f5b9cb5f38019d6678e00511eb66de91038 Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Wed, 5 Dec 2018 16:37:08 -0500 Subject: [PATCH 12/32] Add error handling in load_dataset/prepare_dataset Relates to #20 --- R/chiimp.R | 16 ---------------- R/io.R | 18 ++++++++++++++++-- R/util.R | 18 ++++++++++++++++++ man/logmsg.Rd | 2 +- man/prepare_dataset.Rd | 3 ++- 5 files 
changed, 37 insertions(+), 20 deletions(-) diff --git a/R/chiimp.R b/R/chiimp.R index 8e80135..5093754 100644 --- a/R/chiimp.R +++ b/R/chiimp.R @@ -199,19 +199,3 @@ format_pandoc_args <- function(metadata) { paste("--metadata=", metadata, sep = "") } -#' Write Log Message -#' -#' Print a log message to the standard error stream. -#' -#' @param msg text to print. -#' @param col2 extra text to show at right margin; defaults to current time. -#' @param end ending to concatenate to message; defaults to newline character. -logmsg <- function(msg, col2=as.character(Sys.time()), end="\n") { - if (!is.null(col2)) { - # right-justify col2, aim to fit total msg within 80 characters - pad <- max(1, 80 - nchar(msg) - nchar(col2)) - msg <- paste0(msg, paste(rep(" ", pad), collapse = ""), col2) - } - # stderr: file descriptor 2 - cat(paste0(msg, end), file = 2) -} diff --git a/R/io.R b/R/io.R index e5340a5..98b597b 100644 --- a/R/io.R +++ b/R/io.R @@ -151,8 +151,13 @@ load_dataset <- function(fp, ...) { na.strings = "", ...) col.missing <- is.na(match(dataset_cols, colnames(data))) + files.missing <- ! file.exists(data$Filename) + if (any(files.missing)) { + logmsg(paste("WARNING: Missing", sum(files.missing), "of", + length(files.missing), "data files")) + } if (any(col.missing)) { - warning(paste("Missing columns in genotypes table:", + warning(paste("Missing columns in dataset table:", dataset_cols[col.missing])) } rownames(data) <- make_rownames(data) @@ -186,7 +191,8 @@ save_dataset <- function(data, fp, ...) { #' explicitly. \code{load_dataset} can be used for cases where more than one #' locus is to be analyzed from a single sequencer sample (i.e., multiplexed #' samples), though the \code{locusmap} argument here can allow automatic -#' matching of locus names for multiplexed samples. +#' matching of locus names for multiplexed samples. If the directory path given +#' does not exist or if no matching files are found, an error is thrown. #' #' @param dp directory path to search for matching data files. #' @param pattern regular expression to use for parsing filenames. There should @@ -210,12 +216,20 @@ save_dataset <- function(data, fp, ...) { #' @export prepare_dataset <- function(dp, pattern, ord = c(1, 2, 3), autorep=FALSE, locusmap=NULL) { + if (! dir.exists(dp)) { + stop(paste("ERROR: directory path for data files does not exist:", + dp)) + } # get all matching filenames and extract substrings seq_files <- list.files(path = dp, pattern = pattern, full.names = TRUE, recursive = TRUE, include.dirs = FALSE) + if (! length(seq_files)) { + stop(paste("ERROR: no data files found:", + dp)) + } seq_file_attrs <- stringr::str_match_all(seq_files, pattern) if (! all(sapply(seq_file_attrs, length) == length(ord) + 1)) { warning("Some filenames did not match the given pattern") diff --git a/R/util.R b/R/util.R index 39b4efe..614326e 100644 --- a/R/util.R +++ b/R/util.R @@ -180,3 +180,21 @@ remove_shared_root_dir <- function(fps_full) { # Equivalent of /dev/null for the build platform. fp_devnull <- c(unix = "/dev/null", windows = "nul")[.Platform$OS.type] # nolint + + +#' Write Log Message +#' +#' Print a log message to the standard error stream. +#' +#' @param msg text to print. +#' @param col2 extra text to show at right margin; defaults to current time. +#' @param end ending to concatenate to message; defaults to newline character. 
+logmsg <- function(msg, col2=as.character(Sys.time()), end="\n") { + if (!is.null(col2)) { + # right-justify col2, aim to fit total msg within 80 characters + pad <- max(1, 80 - nchar(msg) - nchar(col2)) + msg <- paste0(msg, paste(rep(" ", pad), collapse = ""), col2) + } + # stderr: file descriptor 2 + cat(paste0(msg, end), file = 2) +} diff --git a/man/logmsg.Rd b/man/logmsg.Rd index 2975e3c..cbf964b 100644 --- a/man/logmsg.Rd +++ b/man/logmsg.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/chiimp.R +% Please edit documentation in R/util.R \name{logmsg} \alias{logmsg} \title{Write Log Message} diff --git a/man/prepare_dataset.Rd b/man/prepare_dataset.Rd index 45f33f3..a623377 100644 --- a/man/prepare_dataset.Rd +++ b/man/prepare_dataset.Rd @@ -39,5 +39,6 @@ standard sample attributes from fields in the filenames. Alternatively, use explicitly. \code{load_dataset} can be used for cases where more than one locus is to be analyzed from a single sequencer sample (i.e., multiplexed samples), though the \code{locusmap} argument here can allow automatic -matching of locus names for multiplexed samples. +matching of locus names for multiplexed samples. If the directory path given +does not exist or if no matching files are found, an error is thrown. } From 38a309bc5b0a77369b482e9cebb8873eb26dd332 Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Wed, 5 Dec 2018 16:53:19 -0500 Subject: [PATCH 13/32] add failing test for wrong loci in analyze_dataset This relates to #21. --- tests/testthat/test_analyze_dataset.R | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/tests/testthat/test_analyze_dataset.R b/tests/testthat/test_analyze_dataset.R index 270b5ba..b85bc67 100644 --- a/tests/testthat/test_analyze_dataset.R +++ b/tests/testthat/test_analyze_dataset.R @@ -153,10 +153,22 @@ with(test_data, { expect_equal(results$samples[["3-B"]]$SeqName[2], "220-fb9a92") }) - test_that("analyze_dataset warns of missing loci", { + test_that("analyze_dataset handles missing loci", { # If there are locus names in dataset$Locus that are not present in the - # rownames of locus_attrs, it should throw a warning. - skip("test not yet implemented") + # rownames of locus_attrs, it should throw an error. + data.dir <- tempfile() + # the names are case-sensitive! + seqs <- lapply(seqs, function(s) {names(s) <- c("a", "b", 1, 2); s}) + write_seqs(seqs, data.dir) + # prepare_dataset tested separately in test_io.R + dataset <- prepare_dataset(data.dir, "()(\\d+)-([A-Za-z0-9]+).fasta") + expect_error({ + results <- analyze_dataset(dataset, locus_attrs, + analysis_opts = list(fraction.min = 0.05), + summary_opts = list(counts.min = 500), + nrepeats = 3, + ncores = 1) + }, "ERROR: Locus names in dataset not in attributes table: a, b") }) }) From 6bf0862b2027e0664c1c7ee35feb40076b92af35 Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Wed, 5 Dec 2018 17:12:49 -0500 Subject: [PATCH 14/32] Check for locus name mismatch in analyze_dataset This relates to #21. --- R/analyze_dataset.R | 9 ++++++++- man/analyze_dataset.Rd | 3 ++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/R/analyze_dataset.R b/R/analyze_dataset.R index e095ad6..14b033c 100644 --- a/R/analyze_dataset.R +++ b/R/analyze_dataset.R @@ -11,7 +11,8 @@ #' in the summary data frame will be sorted according to the ordering of loci in #' \code{locus_attrs} and by the sample attributes. 
Processed files are stored #' separately (as there may be multiple samples per file) and named by input -#' file path. +#' file path. An error is thrown if any locus entries in the given dataset are +#' not found in the locus attributes data frame. #' #' @param dataset data frame of sample details as produced by #' \code{\link{prepare_dataset}}. @@ -53,6 +54,12 @@ analyze_dataset <- function(dataset, summary_function=summarize_sample, known_alleles=NULL, name_args=list()) { + if (! all(dataset$Locus %in% locus_attrs$Locus)) { + rogue_loci <- unique(dataset$Locus[! dataset$Locus %in% locus_attrs$Locus]) + msg <- paste("ERROR: Locus names in dataset not in attributes table:", + paste(rogue_loci, collapse = ", ")) + stop(msg) + } if (ncores == 0) { ncores <- max(1, as.integer(parallel::detectCores() / 2) - 1) } diff --git a/man/analyze_dataset.Rd b/man/analyze_dataset.Rd index c989238..30f0f63 100644 --- a/man/analyze_dataset.Rd +++ b/man/analyze_dataset.Rd @@ -56,5 +56,6 @@ of processed samples. The entries in the processed-samples list and the rows in the summary data frame will be sorted according to the ordering of loci in \code{locus_attrs} and by the sample attributes. Processed files are stored separately (as there may be multiple samples per file) and named by input -file path. +file path. An error is thrown if any locus entries in the given dataset are +not found in the locus attributes data frame. } From f9082e9fc75cd516a20c6826626d6acae1d75b62 Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Mon, 10 Dec 2018 11:48:54 -0500 Subject: [PATCH 15/32] move touch helper function into package --- R/util.R | 6 +++++- tests/testthat/test_io.R | 4 ---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/R/util.R b/R/util.R index 614326e..f7662ff 100644 --- a/R/util.R +++ b/R/util.R @@ -181,7 +181,6 @@ remove_shared_root_dir <- function(fps_full) { # Equivalent of /dev/null for the build platform. fp_devnull <- c(unix = "/dev/null", windows = "nul")[.Platform$OS.type] # nolint - #' Write Log Message #' #' Print a log message to the standard error stream. @@ -198,3 +197,8 @@ logmsg <- function(msg, col2=as.character(Sys.time()), end="\n") { # stderr: file descriptor 2 cat(paste0(msg, end), file = 2) } + +# append an empty string to each the given files +touch <- function(fps) { + lapply(fps, function(fp) cat("", file = fp, append = TRUE)) +} diff --git a/tests/testthat/test_io.R b/tests/testthat/test_io.R index dfbb012..99fec27 100644 --- a/tests/testthat/test_io.R +++ b/tests/testthat/test_io.R @@ -3,10 +3,6 @@ context("Test input/output functions") # helpers ------------------------------------------------------------------- -# append an empty string to each the given files -touch <- function(fps) { - lapply(fps, function(fp) cat("", file = fp, append = TRUE)) -} # Take a version of the raw locus_attrs text from helper_data.R, save it to a # temporary file in TSV format, and return the path. 
From ef5add8db28447ac70ae2a95d235f476e94137da Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Mon, 10 Dec 2018 13:41:26 -0500 Subject: [PATCH 16/32] add failing test (empty files in analyze_dataset) --- tests/testthat/test_analyze_dataset.R | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/testthat/test_analyze_dataset.R b/tests/testthat/test_analyze_dataset.R index b85bc67..ebd6800 100644 --- a/tests/testthat/test_analyze_dataset.R +++ b/tests/testthat/test_analyze_dataset.R @@ -171,4 +171,24 @@ with(test_data, { }, "ERROR: Locus names in dataset not in attributes table: a, b") }) + test_that("analyze_dataset warns of empty input files", { + # If we have no reads at all right from the start, we should warn the user. + data.dir <- tempfile() + write_seqs(seqs, data.dir) + # empty out one file + fps <- list.files(data.dir, full.names = TRUE) + unlink(fps[1]) + touch(fps[1]) + dataset <- prepare_dataset(data.dir, "()(\\d+)-([A-Za-z0-9]+).fasta") + msg <- capture.output({ + results <- analyze_dataset(dataset, locus_attrs, + analysis_opts = list(fraction.min = 0.05), + summary_opts = list(counts.min = 500), + nrepeats = 3, + ncores = 1) + }, type = "message") + msg_exp <- "WARNING: Zero reads for 1 of 12 data files" + expect_true(length(grep(msg_exp, msg)) == 1) + }) + }) From 70c5cf71d7d0bed2ae88531861a85b07d545f349 Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Mon, 10 Dec 2018 13:53:36 -0500 Subject: [PATCH 17/32] move warning for empty files to analyze_dataset This is more applicable directly in analyze_dataset and easier to test (should fix previous failing test) --- R/analyze_dataset.R | 8 ++++++++ R/chiimp.R | 5 ----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/R/analyze_dataset.R b/R/analyze_dataset.R index 14b033c..45e2eaf 100644 --- a/R/analyze_dataset.R +++ b/R/analyze_dataset.R @@ -139,6 +139,14 @@ analyze_dataset <- function(dataset, summary_function = summary_function, analyzed_files = analyzed_files) } + + # Check if any of the raw data files had no reads to start with. + empties <- sum(sapply(analyzed_files, nrow) == 0) + if (empties) { + logmsg(paste("WARNING: Zero reads for", empties, "of", + length(analyzed_files), "data files")) + } + # Recombined results into a summary data frame and a list of full sample data. 
results <- tidy_analyzed_dataset(dataset, raw.results) results$files <- analyzed_files diff --git a/R/chiimp.R b/R/chiimp.R index 5093754..808d9a4 100644 --- a/R/chiimp.R +++ b/R/chiimp.R @@ -73,11 +73,6 @@ full_analysis <- function(config, dataset=NULL) { summary_function = sample_summary_func, known_alleles = allele.names, name_args = cfg$dataset_analysis$name_args) - empties <- sum(sapply(results$files, nrow) == 0) - if (empties) { - logmsg(paste("WARNING: Zero reads for", empties, "of", - length(results$files), "data files")) - } results$allele.names <- allele.names results$locus_attrs <- locus_attrs if (cfg$verbose) logmsg("Summarizing results...") From c7cf032197891f3ac850512af6506196fc6e8b92 Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Mon, 10 Dec 2018 15:52:45 -0500 Subject: [PATCH 18/32] Add failing test set for plot_heatmap (For #22) --- tests/testthat/test_report.R | 76 +++++++++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) diff --git a/tests/testthat/test_report.R b/tests/testthat/test_report.R index 7269883..873fe22 100644 --- a/tests/testthat/test_report.R +++ b/tests/testthat/test_report.R @@ -64,7 +64,81 @@ with(test_data, { }) }) - # test plot_cts_per_locus ------------------------------------------------- + +# test plot_heatmap ------------------------------------------------------- + + # empty out columns of a results summary data frame + zero_summary <- function(results_summary) { + for (x in c("Seq", "Count", "Length", "Name")) { + results_summary[, paste0(c("Allele1", "Allele2"), x)] <- NA + } + for (x in c("Homozygous", "Ambiguous", "Stutter", "Artifact")) { + results_summary[[x]] <- FALSE + } + for (x in c("ProminentSeqs")) { + results_summary[[x]] <- 0 + } + return(results_summary) + } + + test_that("plot_heatmap renders heatmap of attribute", { + # basic test of plot_heatmap. It should return a pheatmap object. 
+ with(results_summary_data, { + fp_img <- tempfile() + png(fp_img) + plot_data <- plot_heatmap(results, "Stutter") + dev.off() + expect_equal(class(plot_data), "pheatmap") + }) + }) + + test_that("plot_heatmap handles empty results", { + # heatmap rendering should still work for a completely empty dataset (all NA + # entries) + + with(results_summary_data, { + fp_img <- tempfile() + # empty out certain columns + results$summary <- zero_summary(results$summary) + # the function should still work as before + png(fp_img) + plot_data <- plot_heatmap(results, "Stutter") + dev.off() + expect_equal(class(plot_data), "pheatmap") + }) + }) + + test_that("plot_heatmap handles single-value case", { + # heatmap rendering should still work for a dataset with only one unique + # value + with(results_summary_data, { + fp_img <- tempfile() + # force all entries to a single value + results$summary$Stutter <- TRUE + png(fp_img) + plot_data <- plot_heatmap(results, "Stutter") + dev.off() + expect_equal(class(plot_data), "pheatmap") + }) + }) + + test_that("plot_heatmap handles single-value case with blanks", { + # heatmap rendering should still work for a dataset with only one unique + # value plus some NA entries + with(results_summary_data, { + fp_img <- tempfile() + # force all entries to a single value + results$summary$Stutter <- TRUE + results$summary[1:4, ] <- zero_summary(results$summary[1:4, ]) + png(fp_img) + plot_data <- plot_heatmap(results, "Stutter") + dev.off() + expect_equal(class(plot_data), "pheatmap") + }) + }) + +# test plot_cts_per_locus ------------------------------------------------- + test_that("plot_cts_per_locus plots heatmap of counts per matched locus", { # It doesn't return anything useful right now, but it should run without From 934c9315dcb2dbdabd2ece1db702643c3c060378 Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Mon, 10 Dec 2018 16:20:19 -0500 Subject: [PATCH 19/32] For #22: fix NA edge case in plot_heatmap --- R/report.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/report.R b/R/report.R index 5918036..07417e8 100644 --- a/R/report.R +++ b/R/report.R @@ -352,7 +352,7 @@ plot_heatmap <- function(results, if (all(is.na(data))) data[, ] <- 0 if (min(data, na.rm = T) == max(data, na.rm = T)) - breaks <- range(c(0, max(data), 1)) + breaks <- range(c(0, max(data, na.rm = TRUE), 1)) pheatmap::pheatmap(data, cluster_rows = F, From a43f0293a270fc88ecfe0657d789cc5e2d8a9a37 Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Tue, 11 Dec 2018 11:04:39 -0500 Subject: [PATCH 20/32] update lint and release-prep helper scripts --- .utils/lint.R | 82 ++++++++++++++++++++++++++++++++++++------ .utils/prep_release.sh | 15 +++++--- 2 files changed, 82 insertions(+), 15 deletions(-) diff --git a/.utils/lint.R b/.utils/lint.R index e32f3c9..997cea4 100755 --- a/.utils/lint.R +++ b/.utils/lint.R @@ -3,15 +3,75 @@ # Lint the package that contains this file's directory, minus some lint # categories that just annoy me. + +# Configure Linters ------------------------------------------------------- + + +# This could be done via a lintr config file if I took the time to figure out +# the syntax. + +linters <- list( + # Linters to add to default list. + # T_and_F_symbol is quite new as of 2018-12-10. + yes = c( + "T_and_F_symbol" # "use TRUE and FALSE, not T and F" + ), + # Linters to remove from default list. + # Some of these are from older versions. 
+ no = c( + "multiple_dots", # "Don't use dots in names" + "camel_case", # "Don't capitalize stuff" + "object_name", # "Don't use dots in names, don't capitalize" + "object_usage" # "I don't see that variable" + ) +) + + +# Detect Package Path ----------------------------------------------------- + + +# If run as a script args <- commandArgs() -f <- gsub("^--file=", "", args[grep("^--file=", args)]) -f <- normalizePath(f) -path <- dirname(dirname(f)) - -linters_no <- c("multiple_dots", # "Don't use dots in names" - "camel_case", # "Don't capitalize stuff" - "object_usage") # "I don't see that variable" -linters_no <- paste0(linters_no, "_linter") -linters <- lintr::default_linters[-match(linters_no, - names(lintr::default_linters))] -lintr::lint_package(path = path, linters = linters) +path_script <- gsub("^--file=", "", args[grep("^--file=", args)]) +path_script <- normalizePath(path_script) +path_pkg <- dirname(dirname(path_script)) + +# If run in Rstudio for example +if (! length(path_pkg)) { + # https://stackoverflow.com/a/16046056 + if (length(sys.frames())) { + path_pkg <- dirname(dirname(sys.frame(1)$ofile)) + } else { + # Last fallback, for example run in separate code chunks + path_pkg <- getwd() + } +} + + +# Run lintr --------------------------------------------------------------- + +library(lintr) +linters$combined <- lintr::default_linters + +# Remove linters +linters$no <- paste0(linters$no, "_linter") +idx <- match(linters$no, names(lintr::default_linters)) +idx <- idx[! is.na(idx)] +linters$combined <- lintr::default_linters[-idx] + +# Add linters +linters$yes <- paste0(linters$yes, "_linter") +linters$yes <- linters$yes[linters$yes %in% ls("package:lintr")] +.names <- names(linters$yes) +linters$yes <- get(linters$yes, "package:lintr") +names(linters$yes) <- .names +linters$combined <- c(linters$combined, linters$yes) + +# Run +results <- lintr::lint_package(path = path_pkg, linters = linters$combined) +results +if (length(path_script) == 1) { + if (length(results) > 0) { + quit(status = 1) + } +} diff --git a/.utils/prep_release.sh b/.utils/prep_release.sh index 1ba6ddf..8110d80 100755 --- a/.utils/prep_release.sh +++ b/.utils/prep_release.sh @@ -1,11 +1,16 @@ #!/usr/bin/env bash +# NOTE: assumes working directory is the project directory + set -e VERSION=$1 chiimp_check='x<-devtools::check();quit(save="no",status=length(c(x$errors,x$warnings)))' +# Run lint script +./.utils/lint.R + # Update version in download link in README VER_MSG="The most recent released version is" TAG_URL="https\\://github.com/ShawHahnLab/chiimp/releases/tag" @@ -26,7 +31,9 @@ zip -r chiimp-v${VERISON}.zip chiimp/* tar czvf chiimp-v${VERSION}.tgz chiimp/* popd -# TODO show reminder of checks before tagging a release: -# * full test on all three platforms -# * make sure NEWS.md contains all updates under a heading matching this version -# * make sure GUIDE.Rmd is up-to-date and the rendered GUIDE.pdf is correct +echo "REMINDER BEFORE TAGGING RELEASE $VERSION:" +echo +echo " * Run full test on Mac OS, Windows, and Linux" +echo " * Update NEWS.md with all updates under a heading matching this version" +echo " * Make sure GUIDE.Rmd is up-to-date and rendered GUIDE.pdf is correct" +echo From c97a1a5bdb6980b418e73bf70bc2726d6100aa53 Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Tue, 11 Dec 2018 11:14:00 -0500 Subject: [PATCH 21/32] update R/ files to pass lint check --- R/analyze_sample.R | 6 ++---- R/analyze_seqs.R | 10 +++++----- R/chiimp.R | 1 - R/histogram.R | 2 +- R/io.R | 10 +++++----- 
R/markdown.R | 4 ++-- R/report.R | 18 +++++++++--------- R/summarize_dataset.R | 24 ++++++++++++------------ R/util.R | 4 ++-- 9 files changed, 38 insertions(+), 41 deletions(-) diff --git a/R/analyze_sample.R b/R/analyze_sample.R index a1eab15..7b615bf 100644 --- a/R/analyze_sample.R +++ b/R/analyze_sample.R @@ -108,9 +108,8 @@ analyze_sample_guided <- function(seq_data, sample.attrs, fraction.min) { switch(length(expected_lengths) + 1, # Zero expected lengths: analyze as usual - analyze_sample(seq_data, sample.attrs, fraction.min), + analyze_sample(seq_data, sample.attrs, fraction.min), { # One expected length: may be homozygous or heterozygous. - { # Find rows of interest, matching expected length. idxl <- chunk$Length == expected_lengths within(chunk, { @@ -128,10 +127,9 @@ analyze_sample_guided <- function(seq_data, sample.attrs, fraction.min) { # And that's it. We make no comment on the remaining entries and # leave them as NA. }) - }, + }, { # Two expected lengths: definitely heterozygous. No need to consider # fractions here. - { within(chunk, { Category <- factor(, levels = categories) # Exclude ambiguous sequences first. diff --git a/R/analyze_seqs.R b/R/analyze_seqs.R index ba06d48..a16fccc 100644 --- a/R/analyze_seqs.R +++ b/R/analyze_seqs.R @@ -52,7 +52,7 @@ analyze_seqs <- function(seqs, locus_attrs, nrepeats) { Count = count, Length = nchar(seqs), stringsAsFactors = FALSE) - data <- data[order(data$Count, decreasing = T), ] + data <- data[order(data$Count, decreasing = TRUE), ] rownames(data) <- NULL # Label rows with the apparent locus by checking primer sequences. Note that # this uses the first matching locus for each row. @@ -74,7 +74,7 @@ analyze_seqs <- function(seqs, locus_attrs, nrepeats) { data$FractionOfTotal <- data$Count / sum(data$Count) data$FractionOfLocus <- with(data, { total_per_locus <- unlist(lapply(levels(MatchingLocus), function(loc) - sum(data[MatchingLocus == loc, "Count"], na.rm = T))) + sum(data[MatchingLocus == loc, "Count"], na.rm = TRUE))) names(total_per_locus) <- levels(MatchingLocus) Count / total_per_locus[MatchingLocus] }) @@ -96,7 +96,7 @@ find_matching_primer <- function(sample.data, locus_attrs) { matches <- do.call(cbind, lapply(rownames(locus_attrs), function(locus_name) { primer <- as.character(locus_attrs[locus_name, "Primer"]) result <- grepl(primer, substr(sample.data$Seq, 1, nchar(primer) + 10)) - c(locus_name)[as.numeric( (! result) + 1)] + c(locus_name)[as.numeric((! result) + 1)] })) # Collapse that set down to just the first match for each entry. first.matches <- apply(matches, 1, function(m) m[match(TRUE, !is.na(m))]) @@ -172,7 +172,7 @@ find_stutter <- function(sample.data, locus_attrs, # across rows for this locus, for (locus_name in rownames(locus_attrs)) { idxl_main <- sample.data$MatchingLocus == locus_name - idxl_main[is.na(idxl_main)] <- F + idxl_main[is.na(idxl_main)] <- FALSE d <- sample.data[idxl_main, ] motif.len <- nchar(as.character(locus_attrs[locus_name, "Motif"])) # Any given index in d could be stutter. @@ -221,7 +221,7 @@ find_artifact <- function(sample.data, locus_attrs, # across rows for this locus, for (locus_name in rownames(locus_attrs)) { idxl_main <- sample.data$MatchingLocus == locus_name - idxl_main[is.na(idxl_main)] <- F + idxl_main[is.na(idxl_main)] <- FALSE d <- sample.data[idxl_main, ] # for each length it's the index of the first (highest-count) entry with # that length. 
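
As an aside on the T and F replacements above, a minimal illustration of the
pitfall the newly enabled T_and_F_symbol linter guards against; this snippet is
illustrative only and is not part of the package:

    T <- 0          # T and F are ordinary variables and can be reassigned
    isTRUE(T)       # FALSE: code written against T now silently misbehaves
    isTRUE(TRUE)    # TRUE: the TRUE and FALSE literals cannot be masked
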
diff --git a/R/chiimp.R b/R/chiimp.R index 808d9a4..23f8f9e 100644 --- a/R/chiimp.R +++ b/R/chiimp.R @@ -193,4 +193,3 @@ format_pandoc_args <- function(metadata) { function(s) paste0("\"", s, "\"")), sep = ":") paste("--metadata=", metadata, sep = "") } - diff --git a/R/histogram.R b/R/histogram.R index b8f5b22..85c4a82 100644 --- a/R/histogram.R +++ b/R/histogram.R @@ -72,7 +72,7 @@ str_hist_setup <- function(seq_data, sample_data = NULL) { lapply(split(sample_data, sample_data$Length), function(chunk) { - chunk <- chunk[ ! is.na(chunk$SeqName), ] + chunk <- chunk[! is.na(chunk$SeqName), ] if (nrow(chunk) == 0) return() idx <- which(chunk$Count == max(chunk$Count)) diff --git a/R/io.R b/R/io.R index 98b597b..7bc1fbc 100644 --- a/R/io.R +++ b/R/io.R @@ -109,7 +109,7 @@ load_allele_names <- function(fp, ...) { #' @export load_genotypes <- function(fp, ...) { data <- utils::read.table(fp, - header = T, + header = TRUE, sep = ",", colClasses = "character", na.strings = "", @@ -145,7 +145,7 @@ load_genotypes <- function(fp, ...) { #' @export load_dataset <- function(fp, ...) { data <- utils::read.table(fp, - header = T, + header = TRUE, sep = ",", colClasses = "character", na.strings = "", @@ -388,9 +388,9 @@ save_seqfile_data <- function(results_file_data, dp) { fps_rel <- remove_shared_root_dir(names(results_file_data)) invisible(lapply(names(results_file_data), function(n) { fp_this <- fps_rel[n] - dp_this <- ifelse (dirname(fp_this) != ".", - file.path(dp, dirname(fp_this)), - dp) + dp_this <- ifelse(dirname(fp_this) != ".", + file.path(dp, dirname(fp_this)), + dp) if (! dir.exists(dp_this)) { dir.create(dp_this, recursive = TRUE) } diff --git a/R/markdown.R b/R/markdown.R index 9fc6658..89e8d2d 100644 --- a/R/markdown.R +++ b/R/markdown.R @@ -40,7 +40,7 @@ kable_genotypes <- function(data, group_samples=FALSE) { k <- knitr::kable(data, row.names = FALSE, format = "html") k <- kableExtra::kable_styling(k, bootstrap_options = bootstrap_options, - full_width = F) + full_width = FALSE) if (group_samples) k <- k_group_rows(k, grouping) k @@ -78,7 +78,7 @@ kable_idents <- function(tbl, closest) { k <- knitr::kable(tbl, row.names = FALSE, format = "html") k <- kableExtra::kable_styling(k, bootstrap_options = bootstrap_options, - full_width = F) + full_width = FALSE) # Group rows by sample obs.select <- tbl$Distance == "" diff --git a/R/report.R b/R/report.R index 07417e8..6534b4a 100644 --- a/R/report.R +++ b/R/report.R @@ -248,7 +248,7 @@ plot_alignment <- function(alignment, labels=NULL, include.blanks=FALSE, ...) { graphics::axis(4, at = 1:length(seqs), labels = labels, - tick = F, + tick = FALSE, padj = -2.5, cex.axis = 0.6) return(list(seqs = seqs, groups = groups, labels = labels)) @@ -259,7 +259,7 @@ plot_alignment <- function(alignment, labels=NULL, include.blanks=FALSE, ...) 
{ # A skewed 0 -> 1 scale for color-coding distance tables make.dist_scale <- function(n) { - ( (0:n) / n) ** (1 / 3) + ((0:n) / n) ** (1 / 3) } #' Plot Distance Matrix @@ -351,12 +351,12 @@ plot_heatmap <- function(results, # Handle edge cases where all the values are the same and/or all NA if (all(is.na(data))) data[, ] <- 0 - if (min(data, na.rm = T) == max(data, na.rm = T)) + if (min(data, na.rm = TRUE) == max(data, na.rm = TRUE)) breaks <- range(c(0, max(data, na.rm = TRUE), 1)) pheatmap::pheatmap(data, - cluster_rows = F, - cluster_cols = F, + cluster_rows = FALSE, + cluster_cols = FALSE, display_numbers = labels, breaks = breaks, color = color, @@ -472,7 +472,7 @@ plot_heatmap_prominent_seqs <- function(results, ...) { #' @export plot_heatmap_proportions <- function(results, ...) { cts <- results$summary[, c("Allele1Count", "Allele2Count")] - prop.counted <- rowSums(cts, na.rm = T) / results$summary$CountLocus + prop.counted <- rowSums(cts, na.rm = TRUE) / results$summary$CountLocus results$summary$ProportionCounted <- prop.counted # A color scale going from red at 0 to white at 1, but values skewed toward # white. @@ -515,7 +515,7 @@ plot_cts_per_locus <- function(cts_per_locus, idx.row=NULL, render=TRUE, ...) { if (all(is.na(cts_per_locus))) { breaks <- 0:1 # handle the all-zero case } else { - breaks <- 0:ceiling(max(cts_per_locus, na.rm = T)) + breaks <- 0:ceiling(max(cts_per_locus, na.rm = TRUE)) } color <- viridis::viridis(max(breaks)) @@ -523,8 +523,8 @@ plot_cts_per_locus <- function(cts_per_locus, idx.row=NULL, render=TRUE, ...) { cts_per_locus <- cts_per_locus[idx.row, ] } pheatmap::pheatmap(cts_per_locus, - cluster_rows = F, - cluster_cols = F, + cluster_rows = FALSE, + cluster_cols = FALSE, gaps_col = c(1, 2), color = color, breaks = breaks, diff --git a/R/summarize_dataset.R b/R/summarize_dataset.R index de40fe0..93bb141 100644 --- a/R/summarize_dataset.R +++ b/R/summarize_dataset.R @@ -184,8 +184,8 @@ calc_genotype_distance <- function(g1, g2, na.reject = TRUE) { alleles <- cbind(alleles1, alleles2) alleles sum(apply(alleles, 1, - function(row) min(sum(row[1:2] != row[3:4], na.rm = T), - sum(row[2:1] != row[3:4]), na.rm = T))) + function(row) min(sum(row[1:2] != row[3:4], na.rm = TRUE), + sum(row[2:1] != row[3:4]), na.rm = TRUE))) } #' Find closest matches in distance matrix @@ -206,8 +206,8 @@ find_closest_matches <- function(dist_mat, range=2, maximum=8) { entries <- lapply(1:nrow(dist_mat), function(nr) { m <- min(dist_mat[nr, ]) nearby <- dist_mat[nr, dist_mat[nr, ] < m + range & - dist_mat[nr, ] < maximum, drop = F] - nearby <- nearby[1, order(nearby), drop = F] + dist_mat[nr, ] < maximum, drop = FALSE] + nearby <- nearby[1, order(nearby), drop = FALSE] nm <- colnames(nearby) nearby <- nearby[1, ] names(nearby) <- nm @@ -318,19 +318,19 @@ summarize_genotypes <- function(results_summary, # integers (otherwise we're just sorting text) if (is.character(combo[[vars[1]]])) { idx <- nchar(combo[[vars[1]]]) > nchar(combo[[vars[2]]]) | - ( nchar(combo[[vars[1]]]) == nchar(combo[[vars[2]]]) & - combo[[vars[1]]] > combo[[vars[2]]] ) + (nchar(combo[[vars[1]]]) == nchar(combo[[vars[2]]]) & + combo[[vars[1]]] > combo[[vars[2]]]) } else { idx <- combo[[vars[1]]] > combo[[vars[2]]] } - idx[is.na(idx)] <- F + idx[is.na(idx)] <- FALSE combo[idx, vars] <- combo[idx, rev(vars)] # Mark those cases that are not homozygous but just happen to have the same # short name (i.e. 
allele sequence length) - combo[, vars[2]] <- ifelse( !(results_summary$Homozygous) & - combo[, vars[1]] == combo[, vars[2]], - paste0(combo[, vars[2]], "*"), - combo[, vars[2]]) + combo[, vars[2]] <- ifelse(!(results_summary$Homozygous) & + combo[, vars[1]] == combo[, vars[2]], + paste0(combo[, vars[2]], "*"), + combo[, vars[2]]) # Reshape into wide table tbl <- stats::reshape(combo, v.names = vars, idvar = "ID", timevar = "Locus", direction = "wide") @@ -392,7 +392,7 @@ summarize_attribute <- function(results_summary, attrib, repeats = 2) { combo$ID <- paste(combo$Sample, combo$Replicate, sep = "-") # reshape into wide table tbl <- stats::reshape(combo, - v.names = make.names(attrib_rep, unique = T), + v.names = make.names(attrib_rep, unique = TRUE), idvar = "ID", timevar = "Locus", direction = "wide") tbl <- tbl[, -3] diff --git a/R/util.R b/R/util.R index f7662ff..6decc8b 100644 --- a/R/util.R +++ b/R/util.R @@ -34,8 +34,8 @@ make_entry_id <- function(data) { cols.idx <- cols.idx[!is.na(cols.idx)] cols.idx <- cols.idx[unlist(lapply(cols.idx, function(x) { !all(is.na(data[, x])) - }) )] - data.names <- data[, cols.idx, drop = F] + }))] + data.names <- data[, cols.idx, drop = FALSE] sapply(1:nrow(data.names), function(nr) { entries <- lapply(data.names[nr, !is.na(data.names[nr, ])], as.character) do.call(paste, as.list(c(entries, sep = "-"))) From 84ce9412e3a748ee79855093f923ad3d1fe01611 Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Tue, 11 Dec 2018 11:19:46 -0500 Subject: [PATCH 22/32] update tests/files to pass lint check, except io --- tests/testthat/test_analyze_dataset.R | 29 +++++++++++++++++------- tests/testthat/test_analyze_seqs.R | 10 ++++----- tests/testthat/test_report.R | 4 ++-- tests/testthat/test_summarize_dataset.R | 30 +++++++++++++------------ 4 files changed, 44 insertions(+), 29 deletions(-) diff --git a/tests/testthat/test_analyze_dataset.R b/tests/testthat/test_analyze_dataset.R index ebd6800..1f72c82 100644 --- a/tests/testthat/test_analyze_dataset.R +++ b/tests/testthat/test_analyze_dataset.R @@ -102,12 +102,21 @@ with(test_data, { write_seqs(seqs, data.dir) dataset <- prepare_dataset(data.dir, "()(\\d+)-([A-Za-z0-9]+).fasta") known_alleles <- data.frame(Locus = c("1", "1", "A"), - Seq = c("ACAGTCAAGAATAACTGCCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCCTGTGGCTCAAAAGCTGAAT", - "ACAGTCAAGAATAACTGCCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCCTGTGGCTCAAAAGCTGAAT", - "TATCACTGGTGTTAGTCCTCTGTAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACACAGTTGTGTGAGCCAGTC"), - Name = c("280-a", - "260-X", - "different_name_format")) + Seq = c(paste0("ACAGTCAAGAATAACTGCCCTATCTATCTATCTATCTATCTATCTATCTATCTA", + "TCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATC", + "TATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA", + "TCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATC", + "TATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCCTGTGGCTCA", + "AAAGCTGAAT"), + paste0("ACAGTCAAGAATAACTGCCCTATCTATCTATCTATCTATCTATCTATCTATCTA", + "TCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATC", + 
"TATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA", + "TCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATC", + "TATCTATCTATCTATCTATCTATCCTGTGGCTCAAAAGCTGAAT"), + paste0("TATCACTGGTGTTAGTCCTCTGTAGATAGATAGATAGATAGATAGATAGATAGA", + "TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATA", + "GATAGATAGATAGATAGATAGATAGATAGATAGACACAGTTGTGTGAGCCAGTC")), + Name = c("280-a", "260-X", "different_name_format")) results <- analyze_dataset(dataset, locus_attrs, analysis_opts = list(fraction.min = 0.05), summary_opts = list(counts.min = 500), @@ -142,7 +151,8 @@ with(test_data, { # Second called allele, if present, will be below the first somewhere. # Remaining seqs will be unnamed. if (! results$summary[nm, "Homozygous"]) { - idx <- match(results$summary[nm, "Allele2Seq"], results$samples[[nm]]$Seq) + idx <- match(results$summary[nm, "Allele2Seq"], + results$samples[[nm]]$Seq) expect_equal(results$summary[nm, "Allele2Name"], results$samples[[nm]]$SeqName[idx]) } @@ -158,7 +168,10 @@ with(test_data, { # rownames of locus_attrs, it should throw an error. data.dir <- tempfile() # the names are case-sensitive! - seqs <- lapply(seqs, function(s) {names(s) <- c("a", "b", 1, 2); s}) + seqs <- lapply(seqs, function(s) { + names(s) <- c("a", "b", 1, 2) + s + }) write_seqs(seqs, data.dir) # prepare_dataset tested separately in test_io.R dataset <- prepare_dataset(data.dir, "()(\\d+)-([A-Za-z0-9]+).fasta") diff --git a/tests/testthat/test_analyze_seqs.R b/tests/testthat/test_analyze_seqs.R index 02342a5..85d3623 100644 --- a/tests/testthat/test_analyze_seqs.R +++ b/tests/testthat/test_analyze_seqs.R @@ -14,9 +14,9 @@ with(test_data, { expect_equal(chunk[2, "Length"], 194) with(chunk, { expect_equal(droplevels(MatchingLocus), factor(c("A", "A"))) - expect_equal(MotifMatch, c(T, T)) - expect_equal(LengthMatch, c(T, T)) - expect_equal(Ambiguous, c(F, F)) + expect_equal(MotifMatch, c(TRUE, TRUE)) + expect_equal(LengthMatch, c(TRUE, TRUE)) + expect_equal(Ambiguous, c(FALSE, FALSE)) expect_equal(Stutter, as.integer(c(NA, NA))) expect_equal(Artifact, as.integer(c(NA, NA))) }) @@ -86,8 +86,8 @@ with(test_data, { test_that("analyze_seqs marks artifact entries", { s <- seqs1$A # Take that first stutter and make it an artifact instead - highest <- names(sort(table(s), decreasing = T)[1]) - stutter <- names(sort(table(s), decreasing = T)[3]) + highest <- names(sort(table(s), decreasing = TRUE)[1]) + stutter <- names(sort(table(s), decreasing = TRUE)[3]) idx <- s == stutter s[idx] <- highest substr(s[idx], nchar(stutter), nchar(stutter)) <- "X" diff --git a/tests/testthat/test_report.R b/tests/testthat/test_report.R index 873fe22..39d9d8c 100644 --- a/tests/testthat/test_report.R +++ b/tests/testthat/test_report.R @@ -42,7 +42,7 @@ with(test_data, { # created but should be completely blank. 
tbl_known <- data.frame(ID = as.character(1:3), stringsAsFactors = FALSE) for (locus in c("A", "B", "1", "2")) { - tbl_known[paste(locus, c("1", "2"), sep="_")] <- "" + tbl_known[paste(locus, c("1", "2"), sep = "_")] <- "" } with(results_summary_data, { results$summary[, c("Allele1Name", "Allele2Name")] <- NA @@ -154,7 +154,7 @@ with(test_data, { # It should still run without errors with(results_summary_data, { results <- summarize_dataset(results) - results$cts_per_locus[,] <- 0 + results$cts_per_locus[, ] <- 0 output <- plot_cts_per_locus(results$cts_per_locus, render = FALSE) expect_null(output) }) diff --git a/tests/testthat/test_summarize_dataset.R b/tests/testthat/test_summarize_dataset.R index 5d47206..630d0d4 100644 --- a/tests/testthat/test_summarize_dataset.R +++ b/tests/testthat/test_summarize_dataset.R @@ -71,7 +71,8 @@ with(test_data, { }) }) - test_that("make_dist_mat_known produces sample-to-individual distance matrix", { + test_that( + "make_dist_mat_known produces sample-to-individual distance matrix", { skip("test not yet implemented") }) @@ -86,7 +87,7 @@ with(test_data, { test_that("calc_genotype_distance handles multiple data types", { # As written it should work with character, numeric, whatever-- it just uses - # the != operator. + # the not-equal operator. g.num <- c(1, 2, 3, 4) g.alpha <- c("a", "b", "c", "d") d <- calc_genotype_distance(g.num, g.alpha) @@ -156,8 +157,8 @@ with(test_data, { }) test_that("align_alleles produces per-allele alignments", { - # By default the allele sequences are dereplicated and then aligned, but this - # tests the other option. + # By default the allele sequences are dereplicated and then aligned, but + # this tests the other option. with(results_summary_data, { alignments <- align_alleles(results$summary, derep = FALSE) expect_equal(names(alignments), levels(results$summary$Locus)) @@ -203,7 +204,8 @@ with(test_data, { with(results_summary_data, { # Overwrite Locus A seqs with same content idx <- which(results$summary$Locus == "A") - results$summary[idx, "Allele1Seq"] <- results$summary[idx[1], "Allele1Seq"] + results$summary[idx, "Allele1Seq"] <- results$summary[idx[1], + "Allele1Seq"] results$summary[idx, "Allele2Seq"] <- NA alignments <- align_alleles(results$summary) # This should still work, just with the same sequence back for A's @@ -222,16 +224,16 @@ with(test_data, { # We should see 5000 reads total, and then 17 split away for each of the # four other loci. Rownames should match sample row IDs. cts_expected <- data.frame(Total = 5000, - Matching = 5000 - 17*3, + Matching = 5000 - 17 * 3, A = as.integer(rep(17, 12)), B = as.integer(rep(17, 12)), `1` = as.integer(rep(17, 12)), `2` = as.integer(rep(17, 12)), check.names = FALSE) - cts_expected$A[1:3] <- as.integer(5000 - 17*3) - cts_expected$B[4:6] <- as.integer(5000 - 17*3) - cts_expected$`1`[7:9] <- as.integer(5000 - 17*3) - cts_expected$`2`[10:12] <- as.integer(5000 - 17*3) + cts_expected$A[1:3] <- as.integer(5000 - 17 * 3) + cts_expected$B[4:6] <- as.integer(5000 - 17 * 3) + cts_expected$`1`[7:9] <- as.integer(5000 - 17 * 3) + cts_expected$`2`[10:12] <- as.integer(5000 - 17 * 3) rownames(cts_expected) <- paste(rep(1:3, times = 4), rep(c("A", "B", "1", "2"), each = 3), sep = "-") @@ -259,14 +261,14 @@ with(test_data, { # We should see 5000 reads total, and then 17 split away for each of the # four other loci. Rownames should match sample row IDs. 
cts_expected <- data.frame(Total = 5000 - 17, - Matching = 5000 - 17*3, + Matching = 5000 - 17 * 3, A = as.integer(rep(17, 9)), B = as.integer(rep(17, 9)), `1` = as.integer(rep(17, 9)), check.names = FALSE) - cts_expected$A[1:3] <- as.integer(5000 - 17*3) - cts_expected$B[4:6] <- as.integer(5000 - 17*3) - cts_expected$`1`[7:9] <- as.integer(5000 - 17*3) + cts_expected$A[1:3] <- as.integer(5000 - 17 * 3) + cts_expected$B[4:6] <- as.integer(5000 - 17 * 3) + cts_expected$`1`[7:9] <- as.integer(5000 - 17 * 3) rownames(cts_expected) <- paste(rep(1:3, times = 3), rep(c("A", "B", "1"), each = 3), sep = "-") From 44ed3ce132bb5e4709d1d2b52844431a8325dbad Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Tue, 11 Dec 2018 11:20:32 -0500 Subject: [PATCH 23/32] update tests to load example locus_attrs from file This should finish clearing the remaining lint as well. --- tests/testthat/helper_data.R | 19 +++++++++---------- tests/testthat/test_io.R | 10 +++++----- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/tests/testthat/helper_data.R b/tests/testthat/helper_data.R index 522f99a..1aa470c 100644 --- a/tests/testthat/helper_data.R +++ b/tests/testthat/helper_data.R @@ -10,15 +10,14 @@ #' #' This list is a bundle of shared data and functions for running unit tests. test_data <- within(list(), { - txt.locus_attrs <- "Locus LengthMin LengthMax LengthBuffer Motif Primer ReversePrimer -A 131 179 20 TAGA TATCACTGGTGTTAGTCCTCTG CACAGTTGTGTGAGCCAGTC -B 194 235 20 TAGA AGTCTCTCTTTCTCCTTGCA TAGGAGCCTGTGGTCCTGTT -1 232 270 20 TATC ACAGTCAAGAATAACTGCCC CTGTGGCTCAAAAGCTGAAT -2 218 337 20 TCCA TTGTCTCCCCAGTTGCTA TCTGTCATAAACCGTCTGCA" - f.locus_attrs <- textConnection(txt.locus_attrs) - locus_attrs <- read.table(f.locus_attrs, header = T, stringsAsFactors = F) + f.locus_attrs <- devtools::package_file("inst", "example_locus_attrs.csv") + txt.locus_attrs <- readChar(f.locus_attrs, + nchars = file.info(f.locus_attrs)$size) + locus_attrs <- read.table(f.locus_attrs, + header = TRUE, + stringsAsFactors = FALSE, + sep = ",") rownames(locus_attrs) <- locus_attrs$Locus - close(f.locus_attrs) rm(f.locus_attrs) sample.data.cols <- c("Seq", "Count", "Length", "MatchingLocus", "MotifMatch", @@ -33,7 +32,7 @@ B 194 235 20 TAGA AGTCTCTCTTTCTCCTTGCA make.seq_junk <- function(N) { nucleotides <- c("A", "T", "C", "G") vapply(runif(N, min = 1, max = 20), function(L) - paste0(sample(nucleotides, L, replace = T), collapse = ""), + paste0(sample(nucleotides, L, replace = TRUE), collapse = ""), "character") } @@ -83,7 +82,7 @@ B 194 235 20 TAGA AGTCTCTCTTTCTCCTTGCA if (cross_contam_ratio > 0) { others <- locus_attrs[-match(locus_name, rownames(locus_attrs)), ] for (i in 1:nrow(others)) { - idx <- seq(i, length(seqs), cross_contam_ratio*nrow(others)) + idx <- seq(i, length(seqs), cross_contam_ratio * nrow(others)) seqs[idx] <- simulate.seqs(locus_name = others$Locus[i], locus_attrs = locus_attrs, N = length(idx), diff --git a/tests/testthat/test_io.R b/tests/testthat/test_io.R index 99fec27..62499c8 100644 --- a/tests/testthat/test_io.R +++ b/tests/testthat/test_io.R @@ -92,7 +92,7 @@ with(test_data, { test_that("load_locus_attrs complains for repeated loci", { # It should throw a warning if any Locus names are repeated. 
- txt.wrong <- gsub("\nB ", "\nA ", txt.locus_attrs) + txt.wrong <- gsub("\nB,", "\nA,", txt.locus_attrs) fp <- write_locus_attrs(txt.wrong) expect_warning({ locus_attrs_test <- load_locus_attrs(fp) @@ -120,7 +120,7 @@ with(test_data, { touch(dataset_known$Filename) # Write dataset CSV fp <- tempfile() - write.csv(dataset_known, file = fp, na = "", row.names = F) + write.csv(dataset_known, file = fp, na = "", row.names = FALSE) expect_silent({ dataset <- load_dataset(fp) }) @@ -135,7 +135,7 @@ with(test_data, { dir.create(data.dir) setwd(data.dir) fp <- tempfile() - write.csv(dataset_known, file = fp, na = "", row.names = F) + write.csv(dataset_known, file = fp, na = "", row.names = FALSE) # expect_message and capture_messages both do NOT catch text send to stderr, # though capture.output(..., type = "message") does. msg <- capture.output({ @@ -270,8 +270,8 @@ with(test_data, { # move both sets into a single parent directory dp <- tempfile() dir.create(dp) - file.copy(data1$dp, dp, recursive=TRUE) - file.copy(data2$dp, dp, recursive=TRUE) + file.copy(data1$dp, dp, recursive = TRUE) + file.copy(data2$dp, dp, recursive = TRUE) # build dataset from parent directory dataset <- prepare_dataset(dp, data1$pattern) expect_equal(colnames(dataset), From fe9dc378c1b6c22606026b6cda73cbae680c3b68 Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Tue, 11 Dec 2018 12:06:09 -0500 Subject: [PATCH 24/32] improve file loading in test helper This checks both with and without the "inst" parent directory and uses system.file instead of devtools::package_file so that we can reliably find the path even when running from /tmp or elsewhere. --- tests/testthat/helper_data.R | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/testthat/helper_data.R b/tests/testthat/helper_data.R index 1aa470c..d517dd7 100644 --- a/tests/testthat/helper_data.R +++ b/tests/testthat/helper_data.R @@ -10,7 +10,13 @@ #' #' This list is a bundle of shared data and functions for running unit tests. test_data <- within(list(), { - f.locus_attrs <- devtools::package_file("inst", "example_locus_attrs.csv") + # Careful! When running via a package check we might be in temporary + # installed copy in /tmp or elsewhere, and probably won't have the "inst" + # directory anymore. Alternatively when running with devtools::test() we + # will. 
+ f.locus_attrs <- unique(system.file(c("inst/example_locus_attrs.csv", + "example_locus_attrs.csv"), + package = getPackageName())) txt.locus_attrs <- readChar(f.locus_attrs, nchars = file.info(f.locus_attrs)$size) locus_attrs <- read.table(f.locus_attrs, @@ -18,7 +24,6 @@ test_data <- within(list(), { stringsAsFactors = FALSE, sep = ",") rownames(locus_attrs) <- locus_attrs$Locus - rm(f.locus_attrs) sample.data.cols <- c("Seq", "Count", "Length", "MatchingLocus", "MotifMatch", "LengthMatch", "Ambiguous", "Stutter", "Artifact", From e37cb13cd779dbc185d79ca40d61c183690ffc70 Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Tue, 11 Dec 2018 12:13:10 -0500 Subject: [PATCH 25/32] tidy echo text in prep release script [skip ci] --- .utils/prep_release.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.utils/prep_release.sh b/.utils/prep_release.sh index 8110d80..11e21b6 100755 --- a/.utils/prep_release.sh +++ b/.utils/prep_release.sh @@ -9,6 +9,7 @@ VERSION=$1 chiimp_check='x<-devtools::check();quit(save="no",status=length(c(x$errors,x$warnings)))' # Run lint script +echo "Running lint check" ./.utils/lint.R # Update version in download link in README @@ -31,6 +32,7 @@ zip -r chiimp-v${VERISON}.zip chiimp/* tar czvf chiimp-v${VERSION}.tgz chiimp/* popd +echo echo "REMINDER BEFORE TAGGING RELEASE $VERSION:" echo echo " * Run full test on Mac OS, Windows, and Linux" From 4e1d7f6b57cc188b3a7938d5d5d8bf9f3f10843c Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Tue, 11 Dec 2018 12:13:55 -0500 Subject: [PATCH 26/32] better docs for default configuration --- R/configuration.R | 4 +++- man/config.defaults.Rd | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/R/configuration.R b/R/configuration.R index 4c2dc2e..519ed39 100644 --- a/R/configuration.R +++ b/R/configuration.R @@ -10,7 +10,9 @@ #' * dataset_opts: #' * dp: directory path to input sequence files #' * pattern: regular expression for the input filename pattern -#' * ord: order of fields in the input filename pattern +#' * ord: order of fields Replicate, Sample, and Locus in in the input +#' filename pattern. For example, if Locus is the first field followed by +#' Replicate and Sample, set \code{ord=c(3, 1, 2)}. #' * output: #' * dp: directory path for saving output data #' * fp_dataset: file path to table of sample attributes to use, rather than diff --git a/man/config.defaults.Rd b/man/config.defaults.Rd index 731d113..5b84b21 100644 --- a/man/config.defaults.Rd +++ b/man/config.defaults.Rd @@ -22,7 +22,9 @@ Notable Options: \itemize{ \item dp: directory path to input sequence files \item pattern: regular expression for the input filename pattern -\item ord: order of fields in the input filename pattern +\item ord: order of fields Replicate, Sample, and Locus in in the input +filename pattern. For example, if Locus is the first field followed by +Replicate and Sample, set \code{ord=c(3, 1, 2)}. } \item output: \itemize{ From ffa36ae12fd3b0aaf1671bf658d5b225f291b8da Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Tue, 11 Dec 2018 15:14:02 -0500 Subject: [PATCH 27/32] explicit call to getPackageName in test helper Travis is claiming it can't find getPackageName for R 3.4.4. Does this make a difference? --- tests/testthat/helper_data.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/helper_data.R b/tests/testthat/helper_data.R index d517dd7..9050ca9 100644 --- a/tests/testthat/helper_data.R +++ b/tests/testthat/helper_data.R @@ -16,7 +16,7 @@ test_data <- within(list(), { # will. 
f.locus_attrs <- unique(system.file(c("inst/example_locus_attrs.csv", "example_locus_attrs.csv"), - package = getPackageName())) + package = methods::getPackageName())) txt.locus_attrs <- readChar(f.locus_attrs, nchars = file.info(f.locus_attrs)$size) locus_attrs <- read.table(f.locus_attrs, From 8ff1c41efd6b5a70ba528a32119570d3337b51e6 Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Tue, 11 Dec 2018 16:35:30 -0500 Subject: [PATCH 28/32] add documentation for source code layout for #19 --- R/chiimp.R | 48 ++++++++++++++++++++++++++++++++++-- man/chiimp-package.Rd | 57 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 101 insertions(+), 4 deletions(-) diff --git a/R/chiimp.R b/R/chiimp.R index 75b1444..8ece2c6 100644 --- a/R/chiimp.R +++ b/R/chiimp.R @@ -1,9 +1,13 @@ # Run a full microsatellite analysis and handle configuration and command-line # execution. -#' Analyze Microsatellites +#' CHIIMP #' -#' Analyze DNA microsatellites in high-throughput sequencing datasets. +#' Computational, High-throughput Individual Identification through +#' Microsatellite Profiling. For a conceptual overview see the latest +#' [user guide](https://shawhahnlab.github.io/chiimp/GUIDE.pdf) and +#' [additional documentation](https://shawhahnlab.github.io/chiimp/docs/) at +#' . #' #' @details #' @@ -66,6 +70,46 @@ #' linked above can also be used idependently; see the documentation and code #' examples for the individual functions for more information. #' +#' +#' **The Package structure of the source files, grouped by topic:** +#' * Main Interface: +#' * chiimp.R: Main entry point for command-line usage (\code{\link{main}}) +#' and R usage (\code{\link{full_analysis}}). +#' * Data Analysis: +#' * analyze_dataset.R: High-level interface to analyze all samples across a +#' given dataset (\code{\link{analyze_dataset}}); used by +#' \code{\link{full_analysis}} to manage the main part of the processing. +#' * summarize_dataset.R: High-level interface to provide inter-sample and +#' inter-locus analyses (\code{\link{summarize_dataset}}); used by +#' \code{\link{full_analysis}} to manage the second stage of the +#' processing. +#' * analyze_seqs.R: Low-level interface to convert raw sequence input to a +#' data frame of unique sequences (\code{\link{analyze_seqs}}); used by +#' \code{\link{analyze_dataset}}. +#' * analyze_sample.R: Low-level interface to extract per-locus details from +#' a data frame of unique sequences (\code{\link{analyze_sample}}); used by +#' \code{\link{analyze_dataset}}. +#' * summarize_sample.R: Low-level interface to condense each sample data +#' frame into a a concise list of consistent attributes, suitable for +#' binding together across samples for a dataset +#' (\code{\link{summarize_sample}}); used by \code{\link{analyze_dataset}}. +#' * categorize.R: Low-level helper functions used by +#' \code{\link{summarize_dataset}} for samples with known identity. +#' * Plotting and reporting: +#' * report.R: Various plotting and summarizing functions used when rendering +#' a report in \code{\link{full_analysis}}. +#' * histogram.R: Sequence histogram plotting tools (\code{\link{histogram}}) +#' as used during \code{\link{full_analysis}}. +#' * markdown.R: Various helper functions for adding tables and plots to an R +#' Markdown report as used in \code{\link{full_analysis}}. +#' * Utility Functions and Configuration: +#' * configuration.R: The default configuration options +#' (\code{\link{config.defaults}}) used by \code{\link{full_analysis}}. 
+#' * io.R: various helper input/output functions used loading and saving +#' sequence data files, spreadsheets, and plots used in multiple parts of the +#' package. +#' * util.R: Various helper functions used in multiple parts of the package. +#' #' @md #' "_PACKAGE" diff --git a/man/chiimp-package.Rd b/man/chiimp-package.Rd index 6cd0e5e..2ba8a89 100644 --- a/man/chiimp-package.Rd +++ b/man/chiimp-package.Rd @@ -4,9 +4,13 @@ \name{chiimp-package} \alias{chiimp} \alias{chiimp-package} -\title{Analyze Microsatellites} +\title{CHIIMP} \description{ -Analyze DNA microsatellites in high-throughput sequencing datasets. +Computational, High-throughput Individual Identification through +Microsatellite Profiling. For a conceptual overview see the latest +\href{https://shawhahnlab.github.io/chiimp/GUIDE.pdf}{user guide} and +\href{https://shawhahnlab.github.io/chiimp/docs/}{additional documentation} at +\url{https://shawhahnlab.github.io/chiimp/}. } \details{ Starting from file inputs and producing file outputs, the overall workflow @@ -74,6 +78,55 @@ program, where \code{\link{main}} loads a configuration file into a nested list of options and calls \code{\link{full_analysis}}. The public functions linked above can also be used idependently; see the documentation and code examples for the individual functions for more information. + +\strong{The Package structure of the source files, grouped by topic:} +\itemize{ +\item Main Interface: +\itemize{ +\item chiimp.R: Main entry point for command-line usage (\code{\link{main}}) +and R usage (\code{\link{full_analysis}}). +} +\item Data Analysis: +\itemize{ +\item analyze_dataset.R: High-level interface to analyze all samples across a +given dataset (\code{\link{analyze_dataset}}); used by +\code{\link{full_analysis}} to manage the main part of the processing. +\item summarize_dataset.R: High-level interface to provide inter-sample and +inter-locus analyses (\code{\link{summarize_dataset}}); used by +\code{\link{full_analysis}} to manage the second stage of the +processing. +\item analyze_seqs.R: Low-level interface to convert raw sequence input to a +data frame of unique sequences (\code{\link{analyze_seqs}}); used by +\code{\link{analyze_dataset}}. +\item analyze_sample.R: Low-level interface to extract per-locus details from +a data frame of unique sequences (\code{\link{analyze_sample}}); used by +\code{\link{analyze_dataset}}. +\item summarize_sample.R: Low-level interface to condense each sample data +frame into a a concise list of consistent attributes, suitable for +binding together across samples for a dataset +(\code{\link{summarize_sample}}); used by \code{\link{analyze_dataset}}. +\item categorize.R: Low-level helper functions used by +\code{\link{summarize_dataset}} for samples with known identity. +} +\item Plotting and reporting: +\itemize{ +\item report.R: Various plotting and summarizing functions used when rendering +a report in \code{\link{full_analysis}}. +\item histogram.R: Sequence histogram plotting tools (\code{\link{histogram}}) +as used during \code{\link{full_analysis}}. +\item markdown.R: Various helper functions for adding tables and plots to an R +Markdown report as used in \code{\link{full_analysis}}. +} +\item Utility Functions and Configuration: +\itemize{ +\item configuration.R: The default configuration options +(\code{\link{config.defaults}}) used by \code{\link{full_analysis}}. 
+\item io.R: various helper input/output functions used loading and saving +sequence data files, spreadsheets, and plots used in multiple parts of the +package. +\item util.R: Various helper functions used in multiple parts of the package. +} +} } \author{ \strong{Maintainer}: Jesse Connell \email{ancon@upenn.edu} From 423ecb86e8d38787cf0df3be48a60d489fb33840 Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Tue, 11 Dec 2018 16:54:23 -0500 Subject: [PATCH 29/32] add code examples for main and full_analysis This also updates the example configuration file to use only one CPU core so that automated checks on the examples don't complain. --- R/chiimp.R | 36 ++++++++++++++++++++++++++++++++++-- inst/example_config.yml | 5 +++++ man/full_analysis.Rd | 18 ++++++++++++++++++ man/main.Rd | 21 +++++++++++++++++++-- 4 files changed, 76 insertions(+), 4 deletions(-) diff --git a/R/chiimp.R b/R/chiimp.R index 8ece2c6..03c8a97 100644 --- a/R/chiimp.R +++ b/R/chiimp.R @@ -129,6 +129,22 @@ #' @return list of results, with the full configuration list included as #' "config." #' +#' @examples +#' # Set up a temporary copy of the CHIIMP test data +#' example_dir <- tempfile() +#' dir.create(example_dir) +#' setwd(example_dir) +#' test_data$write_seqs(test_data$seqs, +#' "str-dataset", +#' "Replicate1-Sample%s-%s.fasta") +#' locus_attrs_path <- system.file("example_locus_attrs.csv", +#' package = "chiimp") +#' file.copy(locus_attrs_path, "locus_attrs.csv") +#' # Run the example analysis +#' config_path <- system.file("example_config.yml", package = "chiimp") +#' config <- load_config(config_path) +#' results <- full_analysis(config) +#' #' @export full_analysis <- function(config, dataset=NULL) { # Overaly explicit configuration onto the default settings @@ -205,8 +221,9 @@ full_analysis <- function(config, dataset=NULL) { #' Handle full microsatellite analysis from command-line #' -#' Read configuration from command-line arguments and run -#' \code{\link{full_analysis}}. +#' A small wrapper function to read a configuration file path from command-line +#' arguments, load the configuration data (see \code{\link{load_config}}), and +#' run \code{\link{full_analysis}}. #' #' @param args optional character vector of arguments to use rather than those #' detected with \code{\link[base]{commandArgs}}. @@ -214,6 +231,21 @@ full_analysis <- function(config, dataset=NULL) { #' @return list of results, with the full configuration list included as #' "config." #' +#' @examples +#' # Set up a temporary copy of the CHIIMP test data +#' example_dir <- tempfile() +#' dir.create(example_dir) +#' setwd(example_dir) +#' test_data$write_seqs(test_data$seqs, +#' "str-dataset", +#' "Replicate1-Sample%s-%s.fasta") +#' locus_attrs_path <- system.file("example_locus_attrs.csv", +#' package = "chiimp") +#' file.copy(locus_attrs_path, "locus_attrs.csv") +#' # Run the example analysis +#' config_path <- system.file("example_config.yml", package = "chiimp") +#' results <- main(config_path) +#' #' @export main <- function(args=NULL) { if (missing(args)) diff --git a/inst/example_config.yml b/inst/example_config.yml index d51c26b..be3befc 100644 --- a/inst/example_config.yml +++ b/inst/example_config.yml @@ -13,6 +13,11 @@ dataset_opts: dp: str-dataset pattern: Replicate(\d+)-Sample(\d+)-([A-Za-z0-9]+) +# "dataset_analysis" defines options passed to analyze_dataset() by +# full_analysis(). Here we'll just ensure that any example analysis run with +# this configuration is single-threaded. 
+dataset_analysis: + ncores: 1 # "output" defines options related to analysis output. # "dp" defines the directory that will contain all output files. output: diff --git a/man/full_analysis.Rd b/man/full_analysis.Rd index f2958fd..2f82d44 100644 --- a/man/full_analysis.Rd +++ b/man/full_analysis.Rd @@ -21,3 +21,21 @@ list of results, with the full configuration list included as Given a list of configuration options, run all aspects of a microsatellite analysis, and save the corresponding output files. } +\examples{ +# Set up a temporary copy of the CHIIMP test data +example_dir <- tempfile() +dir.create(example_dir) +setwd(example_dir) +test_data$write_seqs(test_data$seqs, + "str-dataset", + "Replicate1-Sample\%s-\%s.fasta") +locus_attrs_path <- system.file("example_locus_attrs.csv", + package = "chiimp") +file.copy(locus_attrs_path, "locus_attrs.csv") +# Run the example analysis +config_path <- system.file("example_config.yml", package = "chiimp") +config <- load_config(config_path) +results <- full_analysis(config) + + +} diff --git a/man/main.Rd b/man/main.Rd index 7c8dd8d..890bbb0 100644 --- a/man/main.Rd +++ b/man/main.Rd @@ -15,6 +15,23 @@ list of results, with the full configuration list included as "config." } \description{ -Read configuration from command-line arguments and run -\code{\link{full_analysis}}. +A small wrapper function to read a configuration file path from command-line +arguments, load the configuration data (see \code{\link{load_config}}), and +run \code{\link{full_analysis}}. +} +\examples{ +# Set up a temporary copy of the CHIIMP test data +example_dir <- tempfile() +dir.create(example_dir) +setwd(example_dir) +test_data$write_seqs(test_data$seqs, + "str-dataset", + "Replicate1-Sample\%s-\%s.fasta") +locus_attrs_path <- system.file("example_locus_attrs.csv", + package = "chiimp") +file.copy(locus_attrs_path, "locus_attrs.csv") +# Run the example analysis +config_path <- system.file("example_config.yml", package = "chiimp") +results <- main(config_path) + } From 9815c497446fd1fc94a00aff2fe71588e1b0e124 Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Tue, 11 Dec 2018 17:16:00 -0500 Subject: [PATCH 30/32] update NEWS.md for latest changes --- NEWS.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/NEWS.md b/NEWS.md index 82c55b4..b453c5b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,19 @@ +# chiimp dev + + * Fixed heatmap plotting via updated `plot_heatmap` for cases with blank + results and only one unique non-blank value ([#22]). + * Added check in `analyze_dataset` for locus name mismatches between dataset + table and locus attributes table ([#21]). + * Added check in `prepare_dataset` for missing data directory ([#20]). + * Added check in `prepare_dataset` for zero-detected-files case. + * Added check in `load_dataset` for missing data files. + * Added check in `full_analysis` to warn if any input data files are + completely empty. + +[#22]: https://github.com/ShawHahnLab/chiimp/issues/22 +[#21]: https://github.com/ShawHahnLab/chiimp/issues/21 +[#20]: https://github.com/ShawHahnLab/chiimp/issues/20 + # chiimp 0.2.1 * Minor improvements to release process ([#14]). 
From 34218d4ff6a426bd4596beb6219640b9046d3f98 Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Wed, 12 Dec 2018 08:12:15 -0500 Subject: [PATCH 31/32] remove redundant ncores lines in demo scripts --- inst/bin/demo.sh | 1 - inst/bin/demo_empty.sh | 1 - 2 files changed, 2 deletions(-) diff --git a/inst/bin/demo.sh b/inst/bin/demo.sh index 3fbddda..b2e89f3 100755 --- a/inst/bin/demo.sh +++ b/inst/bin/demo.sh @@ -9,5 +9,4 @@ R --vanilla -q -e "devtools::load_all('.', quiet=T); test_data\$write_seqs(test_ cp "$dir/../example_locus_attrs.csv" "$scratch/locus_attrs.csv" cp "$dir/../example_config.yml" "$scratch/config.yml" cd "$scratch" -echo "dataset_analysis: { ncores: 1 }" >> "config.yml" "$dir/chiimp" "config.yml" diff --git a/inst/bin/demo_empty.sh b/inst/bin/demo_empty.sh index c138b8f..d5a3ce5 100755 --- a/inst/bin/demo_empty.sh +++ b/inst/bin/demo_empty.sh @@ -16,5 +16,4 @@ done cp "$dir/../example_locus_attrs.csv" "$scratch/locus_attrs.csv" cp "$dir/../example_config.yml" "$scratch/config.yml" cd "$scratch" -echo "dataset_analysis: { ncores: 1 }" >> "config.yml" "$dir/chiimp" "config.yml" From d896926ff8225994e4972a182fa0546c6c8f9955 Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Wed, 12 Dec 2018 08:55:51 -0500 Subject: [PATCH 32/32] bump version to 0.2.2 --- DESCRIPTION | 2 +- NEWS.md | 2 +- README.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 01f15d3..e955ea3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: chiimp Title: Computational, High-throughput Individual Identification through Microsatellite Profiling -Version: 0.2.1 +Version: 0.2.2 Authors@R: person("Jesse", "Connell", email = "ancon@upenn.edu", role = c("aut", "cre")) Description: An R package to analyze microsatellites in high-throughput sequencing datasets. Depends: R (>= 3.2.3) diff --git a/NEWS.md b/NEWS.md index b453c5b..82cea8e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -# chiimp dev +# chiimp 0.2.2 * Fixed heatmap plotting via updated `plot_heatmap` for cases with blank results and only one unique non-blank value ([#22]). diff --git a/README.md b/README.md index 20145fc..b5dce79 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ high-throughput sequencing datasets. For automated installation and program usage see GUIDE.pdf in a [released version](https://github.com/ShawHahnLab/chiimp/releases). -The most recent released version is [0.2.1](https://github.com/ShawHahnLab/chiimp/releases/tag/0.2.1). +The most recent released version is [0.2.2](https://github.com/ShawHahnLab/chiimp/releases/tag/0.2.2). For usage as an R package also see the built-in package documentation. The package-level page (`?chiimp`) provides an overview with links to specific functions.