From f800f1b6cd999f141c0a6a89f42896c0ce1bf9f3 Mon Sep 17 00:00:00 2001
From: Jesse Connell
Date: Wed, 12 Dec 2018 18:55:20 -0500
Subject: [PATCH 01/19] add spell-check scripting for docs

Use devtools::spell_check() to check for spelling errors in the files in man/.
---
 .utils/prep_release.sh | 39 ++++++++++++++++++++++++++-------------
 .utils/spellcheck.R    | 12 ++++++++++++
 .utils/wordlist.txt    | 26 ++++++++++++++++++++++++++
 3 files changed, 64 insertions(+), 13 deletions(-)
 create mode 100755 .utils/spellcheck.R
 create mode 100644 .utils/wordlist.txt

diff --git a/.utils/prep_release.sh b/.utils/prep_release.sh
index 11e21b6..ad84f07 100755
--- a/.utils/prep_release.sh
+++ b/.utils/prep_release.sh
@@ -5,37 +5,50 @@
 set -e
 
 VERSION=$1
+SEP="==="
 
 chiimp_check='x<-devtools::check();quit(save="no",status=length(c(x$errors,x$warnings)))'
 
+echo "$SEP Running spell check"
+./.utils/spellcheck.R
+
 # Run lint script
-echo "Running lint check"
+echo "$SEP Running lint check"
 ./.utils/lint.R
 
-# Update version in download link in README
-VER_MSG="The most recent released version is"
-TAG_URL="https\\://github.com/ShawHahnLab/chiimp/releases/tag"
-SED_README="s:$VER_MSG \\[[0-9.]+\\]\\($TAG_URL/[0-9.]+\\)\\.:$VER_MSG [$VERSION]($TAG_URL/$VERSION).:"
-sed -i -r "$SED_README" README.md
+if [[ $VERSION != "" ]]; then
+    # Update version in download link in README
+    VER_MSG="The most recent released version is"
+    TAG_URL="https\\://github.com/ShawHahnLab/chiimp/releases/tag"
+    SED_README="s:$VER_MSG \\[[0-9.]+\\]\\($TAG_URL/[0-9.]+\\)\\.:$VER_MSG [$VERSION]($TAG_URL/$VERSION).:"
+    sed -i -r "$SED_README" README.md
 
-# Update version in DESCRIPTION and NEWS.md
-sed -i "s/Version: .*$/Version: $VERSION/" DESCRIPTION
-sed -i "s/# chiimp dev/# chiimp $VERSION/" NEWS.md
+    # Update version in DESCRIPTION and NEWS.md
+    sed -i "s/Version: .*$/Version: $VERSION/" DESCRIPTION
+    sed -i "s/# chiimp dev/# chiimp $VERSION/" NEWS.md
+fi
 
+echo "$SEP Running devtools::check()"
 R --slave --vanilla -e "$chiimp_check"
+
+echo "$SEP Rendering user guide"
 R --slave --vanilla -e "rmarkdown::render('GUIDE.Rmd', output_file = 'GUIDE.pdf', quiet = TRUE)"
 
 # Create bundled ZIP and TGZ versions without hidden top level files (such as
 # the git and travis stuff) and with the GUIDE.pdf.
-pushd ..
-zip -r chiimp-v${VERISON}.zip chiimp/*
-tar czvf chiimp-v${VERSION}.tgz chiimp/*
-popd
+if [[ $VERSION != "" ]]; then
+    echo "$SEP Creating release archives"
+    pushd ..
+    zip -r chiimp-v${VERSION}.zip chiimp/*
+    tar czvf chiimp-v${VERSION}.tgz chiimp/*
+    popd
+fi
 
 echo
 echo "REMINDER BEFORE TAGGING RELEASE $VERSION:"
 echo
 echo " * Run full test on Mac OS, Windows, and Linux"
 echo " * Update NEWS.md with all updates under a heading matching this version"
+echo " * Check README.md for link to this version"
 echo " * Make sure GUIDE.Rmd is up-to-date and rendered GUIDE.pdf is correct"
 echo
diff --git a/.utils/spellcheck.R b/.utils/spellcheck.R
new file mode 100755
index 0000000..a18399d
--- /dev/null
+++ b/.utils/spellcheck.R
@@ -0,0 +1,12 @@
+#!/usr/bin/env Rscript
+
+# Spell-check the documentation files. Note they'll have to be updated e.g.
+# with devtools::document() first.
+ +ignore <- read.table(".utils/wordlist.txt", + header = FALSE, + stringsAsFactors = FALSE)[, 1] +results <- devtools::spell_check(ignore = ignore) +if (length(results) > 0) { + results +} diff --git a/.utils/wordlist.txt b/.utils/wordlist.txt new file mode 100644 index 0000000..a40be6a --- /dev/null +++ b/.utils/wordlist.txt @@ -0,0 +1,26 @@ +ABCD +ACTG +artifactual +autocalculated +Autogenerate +CHIIMP +CHIIMP's +config +Connell +dereplicated +Dereplicates +FASTA +FASTQ +genotype +Genotype +heterozygous +Heterozygous +homozygous +Homozygous +MSA +pandoc +Pandoc +PPI +seqs +STR +YAML From 11a6e888837e04329c651873fc8158a0351a2d20 Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Wed, 12 Dec 2018 18:57:01 -0500 Subject: [PATCH 02/19] fix typos and unescaped code bits in roxygen docs --- R/analyze_dataset.R | 15 ++++---- R/analyze_sample.R | 13 +++---- R/analyze_seqs.R | 40 ++++++++++----------- R/categorize.R | 20 +++++------ R/chiimp.R | 57 +++++++++++++++--------------- R/configuration.R | 24 ++++++------- R/io.R | 28 +++++++-------- R/report.R | 12 +++---- R/summarize_dataset.R | 26 +++++++------- R/summarize_sample.R | 38 ++++++++++---------- R/util.R | 2 +- man/analyze_sample.Rd | 13 +++---- man/analyze_seqs.Rd | 36 +++++++++---------- man/categorize_genotype_results.Rd | 12 +++---- man/chiimp-package.Rd | 53 +++++++++++++-------------- man/config.defaults.Rd | 24 ++++++------- man/find_artifact.Rd | 4 +-- man/full_analysis.Rd | 1 - man/load_locus_attrs.Rd | 16 ++++----- man/load_seqs.Rd | 4 +-- man/match_known_genotypes.Rd | 8 ++--- man/name_alleles_in_table.Rd | 2 +- man/name_known_sequences.Rd | 15 ++++---- man/plot_dist_mat.Rd | 2 +- man/plot_heatmap.Rd | 6 ++-- man/report_idents.Rd | 2 +- man/save_alignments.Rd | 2 +- man/save_seqfile_data.Rd | 6 ++-- man/summarize_attribute.Rd | 4 +-- man/summarize_dataset.Rd | 20 ++++++----- man/summarize_sample.Rd | 38 ++++++++++---------- man/tabulate_allele_names.Rd | 2 +- 32 files changed, 279 insertions(+), 266 deletions(-) diff --git a/R/analyze_dataset.R b/R/analyze_dataset.R index 45e2eaf..7849d75 100644 --- a/R/analyze_dataset.R +++ b/R/analyze_dataset.R @@ -189,9 +189,9 @@ tidy_analyzed_dataset <- function(dataset, raw.results) { #' For the given results list (pair of summary data frame and list of per-sample #' data frames as produced by \code{\link{tidy_analyzed_dataset}}), add columns #' to all data frames defining names for recognized sequences. For the summary -#' data frame this will be Allele1Name and Allele2Name. For each sample data -#' frame this will be SeqName, defined for any sequences represented in the -#' summary or in a given known alleles set. +#' data frame this will be \code{Allele1Name} and \code{Allele2Name}. For each +#' sample data frame this will be \code{SeqName}, defined for any sequences +#' represented in the summary or in a given known alleles set. #' #' @param results results list as produced by #' \code{\link{tidy_analyzed_dataset}}. @@ -202,10 +202,11 @@ tidy_analyzed_dataset <- function(dataset, raw.results) { #' \code{\link{make_allele_name}}. #' #' @return list of results, with \code{summary} set to the single summary data -#' frame and \code{data} the per-sample data frames. A "SeqName" column in -#' sample data frames and "Allele1Name" and "Allele2Name" columns in the -#' summary data frame will associate any sequence matching a known allele (for -#' either the given table or the current dataset) with a text name. +#' frame and \code{data} the per-sample data frames. 
A \code{SeqName} column +#' in sample data frames and \code{Allele1Name} and \code{Allele2Name} columns +#' in the summary data frame will associate any sequence matching a known +#' allele (for either the given table or the current dataset) with a text +#' name. name_known_sequences <- function(results, known_alleles, name_args) { # Name all of the called alleles across samples results$summary <- name_alleles_in_table(results$summary, known_alleles, diff --git a/R/analyze_sample.R b/R/analyze_sample.R index 7b615bf..ada9b70 100644 --- a/R/analyze_sample.R +++ b/R/analyze_sample.R @@ -76,12 +76,13 @@ analyze_sample <- function(seq_data, sample.attrs, fraction.min) { } #' @describeIn analyze_sample version of sample analysis guided by expected -#' sequence length values. Additional items ExpectedLength1 and optionally -#' ExpectedLength2 can be supplied in the \code{sample.attrs} list. If NA or -#' missing the behavior will match \code{analyze_sample}. If two expected -#' lengths are given, the fraction.min argument is ignored. If at least one -#' expected length is given, the stutter/artifact filtering is disabled. From -#' here use \code{\link{summarize_sample_guided}}. +#' sequence length values. Additional items \code{ExpectedLength1} and +#' optionally \code{ExpectedLength2} can be supplied in the +#' \code{sample.attrs} list. If NA or missing the behavior will match +#' \code{analyze_sample}. If two expected lengths are given, the fraction.min +#' argument is ignored. If at least one expected length is given, the +#' stutter/artifact filtering is disabled. From here use +#' \code{\link{summarize_sample_guided}}. #' #' @export analyze_sample_guided <- function(seq_data, sample.attrs, fraction.min) { diff --git a/R/analyze_seqs.R b/R/analyze_seqs.R index 1586877..ad27310 100644 --- a/R/analyze_seqs.R +++ b/R/analyze_seqs.R @@ -9,27 +9,27 @@ #' #' @details #' Columns in the returned data frame: -#' * Seq: sequence text for each unique sequence -#' * Count: integer count of occurrences of this exact sequence -#' * Length: integer sequence length -#' * MatchingLocus: factor for the name of the locus matching each sequence, -#' by checking the primer -#' * MotifMatch: logical: are there are least \code{nrepeats} perfect +#' * \code{Seq}: sequence text for each unique sequence +#' * \code{Count}: integer count of occurrences of this exact sequence +#' * \code{Length}: integer sequence length +#' * \code{MatchingLocus}: factor for the name of the locus matching each +#' sequence, by checking the primer +#' * \code{MotifMatch}: logical: are there are least \code{nrepeats} perfect #' adjacent repeats of the STR motif for the matching locus? -#' * LengthMatch: logical: is the sequence length within the expected range -#' for the matching locus? -#' * Ambiguous: logical: are there unexpected characters in the sequence +#' * \code{LengthMatch}: logical: is the sequence length within the expected +#' range for the matching locus? +#' * \code{Ambiguous}: logical: are there unexpected characters in the sequence #' content? -#' * Stutter: integer: for any sequence that looks like potential PCR stutter, -#' the index of the row that may be the source of the stutter band. -#' * Artifact: integer: for any sequence that looks like potential PCR artifact -#' (other than stutter), the index of the row that may be the source of the -#' stutter band. 
-#' * FractionOfTotal: numeric fraction of the number of sequences +#' * \code{Stutter}: integer: for any sequence that looks like potential PCR +#' stutter, the index of the row that may be the source of the stutter band. +#' * \code{Artifact}: integer: for any sequence that looks like potential PCR +#' artifact (other than stutter), the index of the row that may be the source +#' of the stutter band. +#' * \code{FractionOfTotal}: numeric fraction of the number of sequences #' represented by each unique sequence compared to the total. -#' * FractionOfLocus: numeric fraction of the number of sequences represented -#' by each unique sequence compared to the total for that particular -#' matching locus. +#' * \code{FractionOfLocus}: numeric fraction of the number of sequences +#' represented by each unique sequence compared to the total for that +#' particular matching locus. #' @md #' #' @param seqs character vector containing sequences. @@ -214,8 +214,8 @@ find_stutter <- function(sample.data, locus_attrs, #' Searches a processed STR sample for entries that may be PCR artifacts, other #' than stutter, from another entry in the sample. Potential artifacts are #' sequences with counts lower than another sequence by a given ratio and -#' sequence length within 1 bp of the other sequence. This only considers -#' STR-labeled rows and requires a given entry to have counts at most +#' sequence length within 1 nucleotide of the other sequence. This only +#' considers STR-labeled rows and requires a given entry to have counts at most #' \code{count.ratio_max} compared to the candidate "source" entry to be #' considered an artifact. Sequence content is not currently considered, just #' relative sequence lengths and counts. diff --git a/R/categorize.R b/R/categorize.R index 00d9c2c..96155cb 100644 --- a/R/categorize.R +++ b/R/categorize.R @@ -4,10 +4,10 @@ #' #' Using the Name column of the given results summary data frame, pair each #' called genotype with the known alleles. A data frame with two columns, -#' CorrectAllele1Seq and CorrectAllele2Seq, is returned. If matching entries are -#' found in Allele1Seq and/or Allele2Seq the order will be preserved, and at -#' this point the two allele entries should match up directly for genotypes that -#' were called correctly. +#' \code{CorrectAllele1Seq} and \code{CorrectAllele2Seq}, is returned. If +#' matching entries are found in \code{Allele1Seq} and/or \code{Allele2Seq} the +#' order will be preserved, and at this point the two allele entries should +#' match up directly for genotypes that were called correctly. #' #' @param results_summary cross-sample summary data frame as produced by #' \code{\link{analyze_dataset}}. @@ -41,10 +41,10 @@ match_known_genotypes <- function(results_summary, genotypes.known) { #' Categorize genotyping results #' -#' For a given results summary data frame that has CorrectAllele1Seq and Correct -#' Allele2Seq columns (such as produced by \code{\link{match_known_genotypes}}) -#' added, create a factor labeling every row of the input data frame by its -#' genotyping outcome. +#' For a given results summary data frame that has \code{CorrectAllele1Seq} and +#' \code{CorrectAllele2Seq} columns (such as produced by +#' \code{\link{match_known_genotypes}}) added, create a factor labeling every +#' row of the input data frame by its genotyping outcome. 
#' #' @details #' Levels in the returned factor, in order: @@ -56,8 +56,8 @@ match_known_genotypes <- function(results_summary, genotypes.known) { #' * Dropped Allele: One called allele is correct for a heterozygous individual, #' but no second allele was called. #' -#' Cases that should not occur, such as CorrectAllele1Seq and CorrectAllele2Seq -#' both set to NA, map to NA in the returned factor. +#' Cases that should not occur, such as \code{CorrectAllele1Seq} and +#' \code{CorrectAllele2Seq} both set to NA, map to NA in the returned factor. #' @md #' #' @param results_summary cross-sample summary data frame as produced by diff --git a/R/chiimp.R b/R/chiimp.R index 03c8a97..6a7df4a 100644 --- a/R/chiimp.R +++ b/R/chiimp.R @@ -67,48 +67,49 @@ #' The workflow above outlines CHIIMP's behavior when called as a standalone #' program, where \code{\link{main}} loads a configuration file into a nested #' list of options and calls \code{\link{full_analysis}}. The public functions -#' linked above can also be used idependently; see the documentation and code +#' linked above can also be used independently; see the documentation and code #' examples for the individual functions for more information. #' #' #' **The Package structure of the source files, grouped by topic:** #' * Main Interface: -#' * chiimp.R: Main entry point for command-line usage (\code{\link{main}}) -#' and R usage (\code{\link{full_analysis}}). +#' * \code{chiimp.R}: Main entry point for command-line usage +#' (\code{\link{main}}) and R usage (\code{\link{full_analysis}}). #' * Data Analysis: -#' * analyze_dataset.R: High-level interface to analyze all samples across a -#' given dataset (\code{\link{analyze_dataset}}); used by +#' * \code{analyze_dataset.R}: High-level interface to analyze all samples +#' across a given dataset (\code{\link{analyze_dataset}}); used by #' \code{\link{full_analysis}} to manage the main part of the processing. -#' * summarize_dataset.R: High-level interface to provide inter-sample and -#' inter-locus analyses (\code{\link{summarize_dataset}}); used by +#' * \code{summarize_dataset.R}: High-level interface to provide inter-sample +#' and inter-locus analyses (\code{\link{summarize_dataset}}); used by #' \code{\link{full_analysis}} to manage the second stage of the #' processing. -#' * analyze_seqs.R: Low-level interface to convert raw sequence input to a -#' data frame of unique sequences (\code{\link{analyze_seqs}}); used by -#' \code{\link{analyze_dataset}}. -#' * analyze_sample.R: Low-level interface to extract per-locus details from -#' a data frame of unique sequences (\code{\link{analyze_sample}}); used by -#' \code{\link{analyze_dataset}}. -#' * summarize_sample.R: Low-level interface to condense each sample data -#' frame into a a concise list of consistent attributes, suitable for +#' * \code{analyze_seqs.R}: Low-level interface to convert raw sequence input +#' to a data frame of unique sequences (\code{\link{analyze_seqs}}); used +#' by \code{\link{analyze_dataset}}. +#' * \code{analyze_sample.R}: Low-level interface to extract per-locus +#' details from a data frame of unique sequences +#' (\code{\link{analyze_sample}}); used by \code{\link{analyze_dataset}}. +#' * \code{summarize_sample.R}: Low-level interface to condense each sample +#' data frame into a a concise list of consistent attributes, suitable for #' binding together across samples for a dataset #' (\code{\link{summarize_sample}}); used by \code{\link{analyze_dataset}}. 
-#' * categorize.R: Low-level helper functions used by +#' * \code{categorize.R}: Low-level helper functions used by #' \code{\link{summarize_dataset}} for samples with known identity. #' * Plotting and reporting: -#' * report.R: Various plotting and summarizing functions used when rendering -#' a report in \code{\link{full_analysis}}. -#' * histogram.R: Sequence histogram plotting tools (\code{\link{histogram}}) -#' as used during \code{\link{full_analysis}}. -#' * markdown.R: Various helper functions for adding tables and plots to an R -#' Markdown report as used in \code{\link{full_analysis}}. +#' * \code{report.R}: Various plotting and summarizing functions used when +#' rendering a report in \code{\link{full_analysis}}. +#' * \code{histogram.R}: Sequence histogram plotting tools +#' (\code{\link{histogram}}) as used during \code{\link{full_analysis}}. +#' * \code{markdown.R}: Various helper functions for adding tables and plots +#' to an R Markdown report as used in \code{\link{full_analysis}}. #' * Utility Functions and Configuration: -#' * configuration.R: The default configuration options -#' (\code{\link{config.defaults}}) used by \code{\link{full_analysis}}. -#' * io.R: various helper input/output functions used loading and saving -#' sequence data files, spreadsheets, and plots used in multiple parts of the -#' package. -#' * util.R: Various helper functions used in multiple parts of the package. +#' * \code{configuration.R}: The default configuration options +#' (\code{\link{config.defaults}}) used by \code{\link{full_analysis}}. +#' * \code{io.R}: various helper input/output functions used loading and +#' saving sequence data files, spreadsheets, and plots used in multiple +#' parts of the package. +#' * \code{util.R}: Various helper functions used in multiple parts of the +#' package. #' #' @md #' diff --git a/R/configuration.R b/R/configuration.R index e0947d3..e4d22f8 100644 --- a/R/configuration.R +++ b/R/configuration.R @@ -8,18 +8,18 @@ #' itself to see all of the build-time defaults. #' #' Notable Options: -#' * dataset_opts: -#' * dp: directory path to input sequence files -#' * pattern: regular expression for the input filename pattern -#' * ord: order of fields Replicate, Sample, and Locus in in the input -#' filename pattern. For example, if Locus is the first field followed by -#' Replicate and Sample, set \code{ord=c(3, 1, 2)}. -#' * output: -#' * dp: directory path for saving output data -#' * fp_dataset: file path to table of sample attributes to use, rather than -#' detecting via dataset_opts -#' * fp_locus_attrs: file path to locus attributes CSV file -#' * fp_genotypes_known: file path to known genotypes CSV file +#' * \code{dataset_opts}: +#' * \code{dp}: directory path to input sequence files +#' * \code{pattern}: regular expression for the input filename pattern +#' * \code{ord}: order of fields Replicate, Sample, and Locus in in the +#' input filename pattern. For example, if Locus is the first field +#' followed by Replicate and Sample, set \code{ord=c(3, 1, 2)}. 
+#' * \code{output}: +#' * \code{dp}: directory path for saving output data +#' * \code{fp_dataset}: file path to table of sample attributes to use, rather +#' than detecting via dataset_opts +#' * \code{fp_locus_attrs}: file path to locus attributes CSV file +#' * \code{fp_genotypes_known}: file path to known genotypes CSV file #' @md #' #' @export diff --git a/R/io.R b/R/io.R index 02ac8e1..31fce4b 100644 --- a/R/io.R +++ b/R/io.R @@ -48,14 +48,14 @@ load_config <- function(fp) { #' #' @details #' Columns Required: -#' * Locus: Unique identifier for a given locus -#' * LengthMin: Minimum known allele sequence length for this locus -#' * LengthMax: Minimum known allele sequence length for this locus -#' * LengthBuffer: Additional sequence length below LengthMin and above -#' LengthMax to accept for a candidate allele -#' * Primer: The forward PCR primer sequence for a given locus, used when -#' matching sequences to loci -#' * ReversePrimer: The reverse PCR primer sequence +#' * \code{Locus}: Unique identifier for a given locus +#' * \code{LengthMin}: Minimum known allele sequence length for this locus +#' * \code{LengthMax}: Minimum known allele sequence length for this locus +#' * \code{LengthBuffer}: Additional sequence length below \code{LengthMin} +#' and above \code{LengthMax} to accept for a candidate allele +#' * \code{Primer}: The forward PCR primer sequence for a given locus, used +#' when matching sequences to loci +#' * \code{ReversePrimer}: The reverse PCR primer sequence #' @md #' #' @param fp.locus_attrs path to text file. @@ -318,8 +318,8 @@ prepare_dataset <- function(dp, pattern, ord = c(1, 2, 3), autorep=FALSE, #' Load vector of sequences from FASTA/FASTQ file #' #' Load a vector of character sequences from the given path. This is just a -#' wrapper around dnar to choose the parser based on filename. Only the -#' sequences are returned, not IDs or quality scores. +#' wrapper around \code{\link[dnar:read.fa]{dnar}} to choose the parser based on +#' filename. Only the sequences are returned, not IDs or quality scores. #' #' @param fp path to sequence file #' @@ -399,9 +399,9 @@ save_allele_seqs <- function(results_summary, dp) { #' \code{\link{analyze_seqs}}) to a separate file in the specified directory #' path, in CSV format. The directory structure will start at the first shared #' directory of the input file paths. -#' For example, if the inputs were /data/run1/file.fastq and -#' /data/run2/file.fastq there will be run1 and run2 directories inside the -#' given `dp` directory. +#' For example, if the inputs were \code{/data/run1/file.fastq} and +#' \code{/data/run2/file.fastq} there will be run1 and run2 directories inside +#' the given \code{dp} directory. #' #' @param results_file_data list of per-file data frames as produced by #' \code{\link{analyze_dataset}}. @@ -445,7 +445,7 @@ save_sample_data <- function(results_data, dp) { #' Save alignments to FASTA files #' -#' Take a list of alignments, one per locus, and save each to a separate fasta +#' Take a list of alignments, one per locus, and save each to a separate FASTA #' file in a specified directory. If any of the per-locus alignment objects is #' NA it will be skipped. These are produced by \code{\link{summarize_dataset}} #' via \code{\link{align_alleles}}. diff --git a/R/report.R b/R/report.R index 6534b4a..75fa38b 100644 --- a/R/report.R +++ b/R/report.R @@ -23,7 +23,7 @@ normalize_alleles <- function(data) { #' Allele pairs are shown in a standardized order with homozygous entries shown #' twice. 
#' -#' @param data data frame containing Allele1Name and Allele2Name colums such as +#' @param data data frame containing Allele1Name and Allele2Name columns such as #' the first list item produced by \code{\link{analyze_dataset}}. If allele #' names are not yet present call \code{\link{name_alleles_in_table}}. #' @param extra_cols names or index values of additional columns from input data @@ -126,7 +126,7 @@ report_genotypes <- function(results, #' @param na.replicates text to replace NA entries with for the Replicates #' column. #' -#' @return data frame showing summary of sample genotypes with interleved +#' @return data frame showing summary of sample genotypes with interleaved #' genotypes for similar known individuals. #' #' @export @@ -268,7 +268,7 @@ make.dist_scale <- function(n) { #' #' @param dist_mat distance matrix as produced by #' \code{\link{summarize_dataset}} via \code{\link{make_dist_mat}}. -#' @param num.alleles the maximum number of matching/mis-matching alleles. Used +#' @param num.alleles the maximum number of matching/mismatching alleles. Used #' to determine color scaling. Defaults to the highest observed distance in #' the matrix. #' @param dist.display_thresh distance value at or below which distances will be @@ -319,9 +319,9 @@ plot_dist_mat <- function(dist_mat, num.alleles=max(dist_mat), #' Render heatmap of STR attribute across samples and loci #' #' Given a cross-sample summary data frame as produced by -#' \code{\link{analyze_dataset}} and the name of a column (e.g., Stutter, -#' Homozygous, ProminentSequences), plot a heatmap of the values for that -#' attribute, with sample identifiers on rows and loci on columns. The +#' \code{\link{analyze_dataset}} and the name of a column (e.g., \code{Stutter}, +#' \code{Homozygous}, \code{ProminentSequences}), plot a heatmap of the values +#' for that attribute, with sample identifiers on rows and loci on columns. The #' attribute will be coerced to numeric. #' #' @param results combined results list diff --git a/R/summarize_dataset.R b/R/summarize_dataset.R index 93bb141..df07ba6 100644 --- a/R/summarize_dataset.R +++ b/R/summarize_dataset.R @@ -9,24 +9,26 @@ #' #' @details #' Additional entries in the returned list: -#' * alignments: inter-allele alignments for each locus, from +#' * \code{alignments}: inter-allele alignments for each locus, from #' \code{\link{align_alleles}}. -#' * dist_mat: inter-sample distance matrix, from \code{\link{make_dist_mat}}. -#' * dist_mat_known: if genotypes.known is given, this distance matrix of -#' sample-to-individual values will be present, from -#' \code{\link{make_dist_mat_known}}. +#' * \code{dist_mat}: inter-sample distance matrix, from +#' \code{\link{make_dist_mat}}. +#' * \code{dist_mat_known}: if genotypes.known is given, this distance matrix +#' of sample-to-individual values will be present, from +#' \code{\link{make_dist_mat_known}}. #' #' If genotypes.known is given *and* a Name column is present in #' \code{results$summary}, samples will be matched with the genotypes in #' genotypes.known and additional columns will be present in the summary data #' frame: -#' * CorrectAllele1Seq: One correct allele sequence for the individual. The -#' order of this and \code{CorrectAllele2Seq} will be matched to +#' * \code{CorrectAllele1Seq}: One correct allele sequence for the individual. +#' The order of this and \code{CorrectAllele2Seq} will be matched to #' \code{Allele1Seq} and \code{Allele2Seq} if possible. See #' \code{\link{match_known_genotypes}}. 
-#' * CorrectAllele2Seq: A second correct allele sequence, as above. -#' * GenotypeResult: Categorization for each entry as Correct, Incorrect, -#' Blank, or Dropped Allele. See \code{\link{categorize_genotype_results}}. +#' * \code{CorrectAllele2Seq}: A second correct allele sequence, as above. +#' * \code{GenotypeResult}: Categorization for each entry as Correct, +#' Incorrect, Blank, or Dropped Allele. See +#' \code{\link{categorize_genotype_results}}. #' #' @md #' @@ -374,8 +376,8 @@ summarize_genotypes_known <- function(genotypes_known, tbl_genotypes=NULL) { #' #' Tabulate a single arbitrary attribute across loci, assuming repeats by two #' for the alleles. This is used for color-coding summary heatmaps (see -#' \code{\link{plot_heatmap}}) on top of the attribute values, like Homozgyous -#' or ProminentSeqs. +#' \code{\link{plot_heatmap}}) on top of the attribute values, like +#' \code{Homozygous} or \code{ProminentSeqs}. #' #' @param results_summary cross-sample summary data frame as produced by #' \code{\link{analyze_dataset}}. diff --git a/R/summarize_sample.R b/R/summarize_sample.R index 09c317f..2f76af7 100644 --- a/R/summarize_sample.R +++ b/R/summarize_sample.R @@ -22,22 +22,24 @@ sample_summary_funcs <- c("summarize_sample", #' @details #' Entries in the returned list: #' * For Allele1 and Allele2: -#' * Seq: sequence text for each allele. -#' * Count: integer count of occrrences of this exact sequence. -#' * Length: integer sequence length. -#' * Homozygous: If the sample appears homozygous (if so, the Allele2 entries -#' will be NA). -#' * Ambiguous: If a potential allele was ignored due to ambiguous bases in -#' sequence content (such as "N"). -#' * Stutter: If a potential allele was ignored due to apparent PCR stutter. -#' * Artifact: If a potential allele was ignored due to apparent PCR artifact -#' (other than stutter). -#' * CountTotal: The total number of sequences in the original sample data. -#' * CountLocus: The number of sequences matching all criteria for the +#' * \code{Seq}: sequence text for each allele. +#' * \code{Count}: integer count of occurrences of this exact sequence. +#' * \code{Length}: integer sequence length. +#' * \code{Homozygous}: If the sample appears homozygous (if so, the Allele2 +#' entries will be NA). +#' * \code{Ambiguous}: If a potential allele was ignored due to ambiguous bases +#' in sequence content (such as "N"). +#' * \code{Stutter}: If a potential allele was ignored due to apparent PCR +#' stutter. +#' * \code{Artifact}: If a potential allele was ignored due to apparent PCR +#' artifact (other than stutter). +#' * \code{CountTotal}: The total number of sequences in the original sample +#' data. +#' * \code{CountLocus}: The number of sequences matching all criteria for the #' specified locus in the original sample data. -#' * ProminentSeqs: The number of entries above the specified threshold after -#' all filtering. This should be either one (for a homozygous sample) or two -#' (for a heterozygous sample) but conditions such as cross-sample +#' * \code{ProminentSeqs}: The number of entries above the specified threshold +#' after all filtering. This should be either one (for a homozygous sample) or +#' two (for a heterozygous sample) but conditions such as cross-sample #' contamination or excessive PCR stutter can lead to more than two. #' @md #' @@ -88,9 +90,9 @@ summarize_sample <- function(sample_data, sample.attrs, counts.min) { } #' @describeIn summarize_sample Summarize a processed STR sample Using known -#' lengths. 
If ExpectedLength1 and optionally ExpectedLength2 are given in -#' \code{sample.attrs}, the \code{counts.min} threshold is ignored. See also -#' \code{\link{analyze_sample_guided}}. +#' lengths. If \code{ExpectedLength1} and optionally \code{ExpectedLength2} +#' are given in \code{sample.attrs}, the \code{counts.min} threshold is +#' ignored. See also \code{\link{analyze_sample_guided}}. #' #' @export summarize_sample_guided <- function(sample_data, sample.attrs, counts.min) { diff --git a/R/util.R b/R/util.R index 6decc8b..9fb13e9 100644 --- a/R/util.R +++ b/R/util.R @@ -124,7 +124,7 @@ order_alleles <- function(nms) { #' the given data frame. Names from the given known_alleles data frame will be #' used for recognized sequences. #' -#' @param data data frame containing Allele1Seq and Allele2Seq colums such as +#' @param data data frame containing Allele1Seq and Allele2Seq columns such as #' the first list item produced by \code{\link{analyze_dataset}}. #' @param known_alleles data frame of custom allele names as defined for #' \code{\link{load_allele_names}}. if NULL only automatically generated diff --git a/man/analyze_sample.Rd b/man/analyze_sample.Rd index 1ad7f38..6ca52c8 100644 --- a/man/analyze_sample.Rd +++ b/man/analyze_sample.Rd @@ -62,12 +62,13 @@ non-stutter artifact sequence criteria as defined by the Artifact column of \code{\link{summarize_sample}}. \item \code{analyze_sample_guided}: version of sample analysis guided by expected -sequence length values. Additional items ExpectedLength1 and optionally -ExpectedLength2 can be supplied in the \code{sample.attrs} list. If NA or -missing the behavior will match \code{analyze_sample}. If two expected -lengths are given, the fraction.min argument is ignored. If at least one -expected length is given, the stutter/artifact filtering is disabled. From -here use \code{\link{summarize_sample_guided}}. +sequence length values. Additional items \code{ExpectedLength1} and +optionally \code{ExpectedLength2} can be supplied in the +\code{sample.attrs} list. If NA or missing the behavior will match +\code{analyze_sample}. If two expected lengths are given, the fraction.min +argument is ignored. If at least one expected length is given, the +stutter/artifact filtering is disabled. From here use +\code{\link{summarize_sample_guided}}. \item \code{analyze_sample_naive}: version of sample analysis without stutter/artifact filtering. From here use \code{\link{summarize_sample}} diff --git a/man/analyze_seqs.Rd b/man/analyze_seqs.Rd index 62116cf..b19096b 100644 --- a/man/analyze_seqs.Rd +++ b/man/analyze_seqs.Rd @@ -26,27 +26,27 @@ all loci are treated equally. \details{ Columns in the returned data frame: \itemize{ -\item Seq: sequence text for each unique sequence -\item Count: integer count of occurrences of this exact sequence -\item Length: integer sequence length -\item MatchingLocus: factor for the name of the locus matching each sequence, -by checking the primer -\item MotifMatch: logical: are there are least \code{nrepeats} perfect +\item \code{Seq}: sequence text for each unique sequence +\item \code{Count}: integer count of occurrences of this exact sequence +\item \code{Length}: integer sequence length +\item \code{MatchingLocus}: factor for the name of the locus matching each +sequence, by checking the primer +\item \code{MotifMatch}: logical: are there are least \code{nrepeats} perfect adjacent repeats of the STR motif for the matching locus? 
-\item LengthMatch: logical: is the sequence length within the expected range -for the matching locus? -\item Ambiguous: logical: are there unexpected characters in the sequence +\item \code{LengthMatch}: logical: is the sequence length within the expected +range for the matching locus? +\item \code{Ambiguous}: logical: are there unexpected characters in the sequence content? -\item Stutter: integer: for any sequence that looks like potential PCR stutter, -the index of the row that may be the source of the stutter band. -\item Artifact: integer: for any sequence that looks like potential PCR artifact -(other than stutter), the index of the row that may be the source of the -stutter band. -\item FractionOfTotal: numeric fraction of the number of sequences +\item \code{Stutter}: integer: for any sequence that looks like potential PCR +stutter, the index of the row that may be the source of the stutter band. +\item \code{Artifact}: integer: for any sequence that looks like potential PCR +artifact (other than stutter), the index of the row that may be the source +of the stutter band. +\item \code{FractionOfTotal}: numeric fraction of the number of sequences represented by each unique sequence compared to the total. -\item FractionOfLocus: numeric fraction of the number of sequences represented -by each unique sequence compared to the total for that particular -matching locus. +\item \code{FractionOfLocus}: numeric fraction of the number of sequences +represented by each unique sequence compared to the total for that +particular matching locus. } } \examples{ diff --git a/man/categorize_genotype_results.Rd b/man/categorize_genotype_results.Rd index 11066fa..7abbefb 100644 --- a/man/categorize_genotype_results.Rd +++ b/man/categorize_genotype_results.Rd @@ -16,10 +16,10 @@ factor defining genotyping result category for every row of the input data frame. } \description{ -For a given results summary data frame that has CorrectAllele1Seq and Correct -Allele2Seq columns (such as produced by \code{\link{match_known_genotypes}}) -added, create a factor labeling every row of the input data frame by its -genotyping outcome. +For a given results summary data frame that has \code{CorrectAllele1Seq} and +\code{CorrectAllele2Seq} columns (such as produced by +\code{\link{match_known_genotypes}}) added, create a factor labeling every +row of the input data frame by its genotyping outcome. } \details{ Levels in the returned factor, in order: @@ -32,6 +32,6 @@ were supplied. but no second allele was called. } -Cases that should not occur, such as CorrectAllele1Seq and CorrectAllele2Seq -both set to NA, map to NA in the returned factor. +Cases that should not occur, such as \code{CorrectAllele1Seq} and +\code{CorrectAllele2Seq} both set to NA, map to NA in the returned factor. } diff --git a/man/chiimp-package.Rd b/man/chiimp-package.Rd index 2ba8a89..3860e32 100644 --- a/man/chiimp-package.Rd +++ b/man/chiimp-package.Rd @@ -76,55 +76,56 @@ For defaults used in the configuration, see \code{\link{config.defaults}}. The workflow above outlines CHIIMP's behavior when called as a standalone program, where \code{\link{main}} loads a configuration file into a nested list of options and calls \code{\link{full_analysis}}. The public functions -linked above can also be used idependently; see the documentation and code +linked above can also be used independently; see the documentation and code examples for the individual functions for more information. 
\strong{The Package structure of the source files, grouped by topic:} \itemize{ \item Main Interface: \itemize{ -\item chiimp.R: Main entry point for command-line usage (\code{\link{main}}) -and R usage (\code{\link{full_analysis}}). +\item \code{chiimp.R}: Main entry point for command-line usage +(\code{\link{main}}) and R usage (\code{\link{full_analysis}}). } \item Data Analysis: \itemize{ -\item analyze_dataset.R: High-level interface to analyze all samples across a -given dataset (\code{\link{analyze_dataset}}); used by +\item \code{analyze_dataset.R}: High-level interface to analyze all samples +across a given dataset (\code{\link{analyze_dataset}}); used by \code{\link{full_analysis}} to manage the main part of the processing. -\item summarize_dataset.R: High-level interface to provide inter-sample and -inter-locus analyses (\code{\link{summarize_dataset}}); used by +\item \code{summarize_dataset.R}: High-level interface to provide inter-sample +and inter-locus analyses (\code{\link{summarize_dataset}}); used by \code{\link{full_analysis}} to manage the second stage of the processing. -\item analyze_seqs.R: Low-level interface to convert raw sequence input to a -data frame of unique sequences (\code{\link{analyze_seqs}}); used by -\code{\link{analyze_dataset}}. -\item analyze_sample.R: Low-level interface to extract per-locus details from -a data frame of unique sequences (\code{\link{analyze_sample}}); used by -\code{\link{analyze_dataset}}. -\item summarize_sample.R: Low-level interface to condense each sample data -frame into a a concise list of consistent attributes, suitable for +\item \code{analyze_seqs.R}: Low-level interface to convert raw sequence input +to a data frame of unique sequences (\code{\link{analyze_seqs}}); used +by \code{\link{analyze_dataset}}. +\item \code{analyze_sample.R}: Low-level interface to extract per-locus +details from a data frame of unique sequences +(\code{\link{analyze_sample}}); used by \code{\link{analyze_dataset}}. +\item \code{summarize_sample.R}: Low-level interface to condense each sample +data frame into a a concise list of consistent attributes, suitable for binding together across samples for a dataset (\code{\link{summarize_sample}}); used by \code{\link{analyze_dataset}}. -\item categorize.R: Low-level helper functions used by +\item \code{categorize.R}: Low-level helper functions used by \code{\link{summarize_dataset}} for samples with known identity. } \item Plotting and reporting: \itemize{ -\item report.R: Various plotting and summarizing functions used when rendering -a report in \code{\link{full_analysis}}. -\item histogram.R: Sequence histogram plotting tools (\code{\link{histogram}}) -as used during \code{\link{full_analysis}}. -\item markdown.R: Various helper functions for adding tables and plots to an R -Markdown report as used in \code{\link{full_analysis}}. +\item \code{report.R}: Various plotting and summarizing functions used when +rendering a report in \code{\link{full_analysis}}. +\item \code{histogram.R}: Sequence histogram plotting tools +(\code{\link{histogram}}) as used during \code{\link{full_analysis}}. +\item \code{markdown.R}: Various helper functions for adding tables and plots +to an R Markdown report as used in \code{\link{full_analysis}}. } \item Utility Functions and Configuration: \itemize{ -\item configuration.R: The default configuration options +\item \code{configuration.R}: The default configuration options (\code{\link{config.defaults}}) used by \code{\link{full_analysis}}. 
-\item io.R: various helper input/output functions used loading and saving -sequence data files, spreadsheets, and plots used in multiple parts of the +\item \code{io.R}: various helper input/output functions used loading and +saving sequence data files, spreadsheets, and plots used in multiple +parts of the package. +\item \code{util.R}: Various helper functions used in multiple parts of the package. -\item util.R: Various helper functions used in multiple parts of the package. } } } diff --git a/man/config.defaults.Rd b/man/config.defaults.Rd index 18c3a0c..ee7df56 100644 --- a/man/config.defaults.Rd +++ b/man/config.defaults.Rd @@ -19,22 +19,22 @@ itself to see all of the build-time defaults. \details{ Notable Options: \itemize{ -\item dataset_opts: +\item \code{dataset_opts}: \itemize{ -\item dp: directory path to input sequence files -\item pattern: regular expression for the input filename pattern -\item ord: order of fields Replicate, Sample, and Locus in in the input -filename pattern. For example, if Locus is the first field followed by -Replicate and Sample, set \code{ord=c(3, 1, 2)}. +\item \code{dp}: directory path to input sequence files +\item \code{pattern}: regular expression for the input filename pattern +\item \code{ord}: order of fields Replicate, Sample, and Locus in in the +input filename pattern. For example, if Locus is the first field +followed by Replicate and Sample, set \code{ord=c(3, 1, 2)}. } -\item output: +\item \code{output}: \itemize{ -\item dp: directory path for saving output data +\item \code{dp}: directory path for saving output data } -\item fp_dataset: file path to table of sample attributes to use, rather than -detecting via dataset_opts -\item fp_locus_attrs: file path to locus attributes CSV file -\item fp_genotypes_known: file path to known genotypes CSV file +\item \code{fp_dataset}: file path to table of sample attributes to use, rather +than detecting via dataset_opts +\item \code{fp_locus_attrs}: file path to locus attributes CSV file +\item \code{fp_genotypes_known}: file path to known genotypes CSV file } } \keyword{datasets} diff --git a/man/find_artifact.Rd b/man/find_artifact.Rd index 4f524ad..48e4b40 100644 --- a/man/find_artifact.Rd +++ b/man/find_artifact.Rd @@ -23,8 +23,8 @@ integer vector specifying, for each entry, the row index for another Searches a processed STR sample for entries that may be PCR artifacts, other than stutter, from another entry in the sample. Potential artifacts are sequences with counts lower than another sequence by a given ratio and -sequence length within 1 bp of the other sequence. This only considers -STR-labeled rows and requires a given entry to have counts at most +sequence length within 1 nucleotide of the other sequence. This only +considers STR-labeled rows and requires a given entry to have counts at most \code{count.ratio_max} compared to the candidate "source" entry to be considered an artifact. Sequence content is not currently considered, just relative sequence lengths and counts. 
diff --git a/man/full_analysis.Rd b/man/full_analysis.Rd index 2f82d44..5bc9774 100644 --- a/man/full_analysis.Rd +++ b/man/full_analysis.Rd @@ -37,5 +37,4 @@ config_path <- system.file("example_config.yml", package = "chiimp") config <- load_config(config_path) results <- full_analysis(config) - } diff --git a/man/load_locus_attrs.Rd b/man/load_locus_attrs.Rd index d985235..a981c85 100644 --- a/man/load_locus_attrs.Rd +++ b/man/load_locus_attrs.Rd @@ -23,14 +23,14 @@ frame then used by \code{\link{analyze_seqs}} within \details{ Columns Required: \itemize{ -\item Locus: Unique identifier for a given locus -\item LengthMin: Minimum known allele sequence length for this locus -\item LengthMax: Minimum known allele sequence length for this locus -\item LengthBuffer: Additional sequence length below LengthMin and above -LengthMax to accept for a candidate allele -\item Primer: The forward PCR primer sequence for a given locus, used when -matching sequences to loci -\item ReversePrimer: The reverse PCR primer sequence +\item \code{Locus}: Unique identifier for a given locus +\item \code{LengthMin}: Minimum known allele sequence length for this locus +\item \code{LengthMax}: Minimum known allele sequence length for this locus +\item \code{LengthBuffer}: Additional sequence length below \code{LengthMin} +and above \code{LengthMax} to accept for a candidate allele +\item \code{Primer}: The forward PCR primer sequence for a given locus, used +when matching sequences to loci +\item \code{ReversePrimer}: The reverse PCR primer sequence } } \examples{ diff --git a/man/load_seqs.Rd b/man/load_seqs.Rd index 19d527f..b749013 100644 --- a/man/load_seqs.Rd +++ b/man/load_seqs.Rd @@ -14,6 +14,6 @@ vector of sequences } \description{ Load a vector of character sequences from the given path. This is just a -wrapper around dnar to choose the parser based on filename. Only the -sequences are returned, not IDs or quality scores. +wrapper around \code{\link[dnar:read.fa]{dnar}} to choose the parser based on +filename. Only the sequences are returned, not IDs or quality scores. } diff --git a/man/match_known_genotypes.Rd b/man/match_known_genotypes.Rd index aadd7d7..7022d40 100644 --- a/man/match_known_genotypes.Rd +++ b/man/match_known_genotypes.Rd @@ -21,8 +21,8 @@ data frame with two columns for the two correct alleles, and rows \description{ Using the Name column of the given results summary data frame, pair each called genotype with the known alleles. A data frame with two columns, -CorrectAllele1Seq and CorrectAllele2Seq, is returned. If matching entries are -found in Allele1Seq and/or Allele2Seq the order will be preserved, and at -this point the two allele entries should match up directly for genotypes that -were called correctly. +\code{CorrectAllele1Seq} and \code{CorrectAllele2Seq}, is returned. If +matching entries are found in \code{Allele1Seq} and/or \code{Allele2Seq} the +order will be preserved, and at this point the two allele entries should +match up directly for genotypes that were called correctly. 
} diff --git a/man/name_alleles_in_table.Rd b/man/name_alleles_in_table.Rd index b888895..02fee4f 100644 --- a/man/name_alleles_in_table.Rd +++ b/man/name_alleles_in_table.Rd @@ -7,7 +7,7 @@ name_alleles_in_table(data, known_alleles = NULL, name_args = list()) } \arguments{ -\item{data}{data frame containing Allele1Seq and Allele2Seq colums such as +\item{data}{data frame containing Allele1Seq and Allele2Seq columns such as the first list item produced by \code{\link{analyze_dataset}}.} \item{known_alleles}{data frame of custom allele names as defined for diff --git a/man/name_known_sequences.Rd b/man/name_known_sequences.Rd index 7be5abf..e6ba414 100644 --- a/man/name_known_sequences.Rd +++ b/man/name_known_sequences.Rd @@ -19,16 +19,17 @@ generated for the summary will be used.} } \value{ list of results, with \code{summary} set to the single summary data - frame and \code{data} the per-sample data frames. A "SeqName" column in - sample data frames and "Allele1Name" and "Allele2Name" columns in the - summary data frame will associate any sequence matching a known allele (for - either the given table or the current dataset) with a text name. + frame and \code{data} the per-sample data frames. A \code{SeqName} column + in sample data frames and \code{Allele1Name} and \code{Allele2Name} columns + in the summary data frame will associate any sequence matching a known + allele (for either the given table or the current dataset) with a text + name. } \description{ For the given results list (pair of summary data frame and list of per-sample data frames as produced by \code{\link{tidy_analyzed_dataset}}), add columns to all data frames defining names for recognized sequences. For the summary -data frame this will be Allele1Name and Allele2Name. For each sample data -frame this will be SeqName, defined for any sequences represented in the -summary or in a given known alleles set. +data frame this will be \code{Allele1Name} and \code{Allele2Name}. For each +sample data frame this will be \code{SeqName}, defined for any sequences +represented in the summary or in a given known alleles set. } diff --git a/man/plot_dist_mat.Rd b/man/plot_dist_mat.Rd index 7cd1342..258b00a 100644 --- a/man/plot_dist_mat.Rd +++ b/man/plot_dist_mat.Rd @@ -11,7 +11,7 @@ plot_dist_mat(dist_mat, num.alleles = max(dist_mat), \item{dist_mat}{distance matrix as produced by \code{\link{summarize_dataset}} via \code{\link{make_dist_mat}}.} -\item{num.alleles}{the maximum number of matching/mis-matching alleles. Used +\item{num.alleles}{the maximum number of matching/mismatching alleles. Used to determine color scaling. Defaults to the highest observed distance in the matrix.} diff --git a/man/plot_heatmap.Rd b/man/plot_heatmap.Rd index 26861e8..9abb8c1 100644 --- a/man/plot_heatmap.Rd +++ b/man/plot_heatmap.Rd @@ -25,8 +25,8 @@ lengths.} } \description{ Given a cross-sample summary data frame as produced by -\code{\link{analyze_dataset}} and the name of a column (e.g., Stutter, -Homozygous, ProminentSequences), plot a heatmap of the values for that -attribute, with sample identifiers on rows and loci on columns. The +\code{\link{analyze_dataset}} and the name of a column (e.g., \code{Stutter}, +\code{Homozygous}, \code{ProminentSequences}), plot a heatmap of the values +for that attribute, with sample identifiers on rows and loci on columns. The attribute will be coerced to numeric. 
} diff --git a/man/report_idents.Rd b/man/report_idents.Rd index 168dbbc..bbba4cd 100644 --- a/man/report_idents.Rd +++ b/man/report_idents.Rd @@ -16,7 +16,7 @@ report_idents(results, closest, na.replicates = "") column.} } \value{ -data frame showing summary of sample genotypes with interleved +data frame showing summary of sample genotypes with interleaved genotypes for similar known individuals. } \description{ diff --git a/man/save_alignments.Rd b/man/save_alignments.Rd index 8356c93..45e938f 100644 --- a/man/save_alignments.Rd +++ b/man/save_alignments.Rd @@ -14,7 +14,7 @@ of each alignment will be used for its filename.} \item{dp}{output directory path.} } \description{ -Take a list of alignments, one per locus, and save each to a separate fasta +Take a list of alignments, one per locus, and save each to a separate FASTA file in a specified directory. If any of the per-locus alignment objects is NA it will be skipped. These are produced by \code{\link{summarize_dataset}} via \code{\link{align_alleles}}. diff --git a/man/save_seqfile_data.Rd b/man/save_seqfile_data.Rd index c0f8d12..5031456 100644 --- a/man/save_seqfile_data.Rd +++ b/man/save_seqfile_data.Rd @@ -17,7 +17,7 @@ Save each per-file data frame produced by \code{\link{analyze_dataset}} (via \code{\link{analyze_seqs}}) to a separate file in the specified directory path, in CSV format. The directory structure will start at the first shared directory of the input file paths. -For example, if the inputs were /data/run1/file.fastq and -/data/run2/file.fastq there will be run1 and run2 directories inside the -given `dp` directory. +For example, if the inputs were \code{/data/run1/file.fastq} and +\code{/data/run2/file.fastq} there will be run1 and run2 directories inside +the given \code{dp} directory. } diff --git a/man/summarize_attribute.Rd b/man/summarize_attribute.Rd index e996ea4..7e60363 100644 --- a/man/summarize_attribute.Rd +++ b/man/summarize_attribute.Rd @@ -21,6 +21,6 @@ data frame of attribute across samples and loci. \description{ Tabulate a single arbitrary attribute across loci, assuming repeats by two for the alleles. This is used for color-coding summary heatmaps (see -\code{\link{plot_heatmap}}) on top of the attribute values, like Homozgyous -or ProminentSeqs. +\code{\link{plot_heatmap}}) on top of the attribute values, like +\code{Homozygous} or \code{ProminentSeqs}. } diff --git a/man/summarize_dataset.Rd b/man/summarize_dataset.Rd index a1dad0b..8b9c03e 100644 --- a/man/summarize_dataset.Rd +++ b/man/summarize_dataset.Rd @@ -25,11 +25,12 @@ additional entries for inter-sample and inter-locus analyses. \details{ Additional entries in the returned list: \itemize{ -\item alignments: inter-allele alignments for each locus, from +\item \code{alignments}: inter-allele alignments for each locus, from \code{\link{align_alleles}}. -\item dist_mat: inter-sample distance matrix, from \code{\link{make_dist_mat}}. -\item dist_mat_known: if genotypes.known is given, this distance matrix of -sample-to-individual values will be present, from +\item \code{dist_mat}: inter-sample distance matrix, from +\code{\link{make_dist_mat}}. +\item \code{dist_mat_known}: if genotypes.known is given, this distance matrix +of sample-to-individual values will be present, from \code{\link{make_dist_mat_known}}. 
} @@ -38,12 +39,13 @@ If genotypes.known is given \emph{and} a Name column is present in genotypes.known and additional columns will be present in the summary data frame: \itemize{ -\item CorrectAllele1Seq: One correct allele sequence for the individual. The -order of this and \code{CorrectAllele2Seq} will be matched to +\item \code{CorrectAllele1Seq}: One correct allele sequence for the individual. +The order of this and \code{CorrectAllele2Seq} will be matched to \code{Allele1Seq} and \code{Allele2Seq} if possible. See \code{\link{match_known_genotypes}}. -\item CorrectAllele2Seq: A second correct allele sequence, as above. -\item GenotypeResult: Categorization for each entry as Correct, Incorrect, -Blank, or Dropped Allele. See \code{\link{categorize_genotype_results}}. +\item \code{CorrectAllele2Seq}: A second correct allele sequence, as above. +\item \code{GenotypeResult}: Categorization for each entry as Correct, +Incorrect, Blank, or Dropped Allele. See +\code{\link{categorize_genotype_results}}. } } diff --git a/man/summarize_sample.Rd b/man/summarize_sample.Rd index 09afe31..057686b 100644 --- a/man/summarize_sample.Rd +++ b/man/summarize_sample.Rd @@ -38,23 +38,25 @@ Entries in the returned list: \itemize{ \item For Allele1 and Allele2: \itemize{ -\item Seq: sequence text for each allele. -\item Count: integer count of occrrences of this exact sequence. -\item Length: integer sequence length. +\item \code{Seq}: sequence text for each allele. +\item \code{Count}: integer count of occurrences of this exact sequence. +\item \code{Length}: integer sequence length. } -\item Homozygous: If the sample appears homozygous (if so, the Allele2 entries -will be NA). -\item Ambiguous: If a potential allele was ignored due to ambiguous bases in -sequence content (such as "N"). -\item Stutter: If a potential allele was ignored due to apparent PCR stutter. -\item Artifact: If a potential allele was ignored due to apparent PCR artifact -(other than stutter). -\item CountTotal: The total number of sequences in the original sample data. -\item CountLocus: The number of sequences matching all criteria for the +\item \code{Homozygous}: If the sample appears homozygous (if so, the Allele2 +entries will be NA). +\item \code{Ambiguous}: If a potential allele was ignored due to ambiguous bases +in sequence content (such as "N"). +\item \code{Stutter}: If a potential allele was ignored due to apparent PCR +stutter. +\item \code{Artifact}: If a potential allele was ignored due to apparent PCR +artifact (other than stutter). +\item \code{CountTotal}: The total number of sequences in the original sample +data. +\item \code{CountLocus}: The number of sequences matching all criteria for the specified locus in the original sample data. -\item ProminentSeqs: The number of entries above the specified threshold after -all filtering. This should be either one (for a homozygous sample) or two -(for a heterozygous sample) but conditions such as cross-sample +\item \code{ProminentSeqs}: The number of entries above the specified threshold +after all filtering. This should be either one (for a homozygous sample) or +two (for a heterozygous sample) but conditions such as cross-sample contamination or excessive PCR stutter can lead to more than two. } } @@ -63,8 +65,8 @@ contamination or excessive PCR stutter can lead to more than two. \item \code{summarize_sample}: Default version of sample summary. \item \code{summarize_sample_guided}: Summarize a processed STR sample Using known -lengths. 
If ExpectedLength1 and optionally ExpectedLength2 are given in -\code{sample.attrs}, the \code{counts.min} threshold is ignored. See also -\code{\link{analyze_sample_guided}}. +lengths. If \code{ExpectedLength1} and optionally \code{ExpectedLength2} +are given in \code{sample.attrs}, the \code{counts.min} threshold is +ignored. See also \code{\link{analyze_sample_guided}}. }} diff --git a/man/tabulate_allele_names.Rd b/man/tabulate_allele_names.Rd index bafc3b6..b716861 100644 --- a/man/tabulate_allele_names.Rd +++ b/man/tabulate_allele_names.Rd @@ -7,7 +7,7 @@ tabulate_allele_names(data, extra_cols = NULL) } \arguments{ -\item{data}{data frame containing Allele1Name and Allele2Name colums such as +\item{data}{data frame containing Allele1Name and Allele2Name columns such as the first list item produced by \code{\link{analyze_dataset}}. If allele names are not yet present call \code{\link{name_alleles_in_table}}.} From 19bed3b989f74bcc2c39f61be9ed8a424de60e41 Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Fri, 14 Dec 2018 09:27:01 -0500 Subject: [PATCH 03/19] update NEWS for spell-checking --- NEWS.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/NEWS.md b/NEWS.md index 82cea8e..f6b67e8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# chiimp dev + + * Added documentation corrections and improvements. + # chiimp 0.2.2 * Fixed heatmap plotting via updated `plot_heatmap` for cases with blank From a1223744eb30f376ffddf2da954f46a138a84feb Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Wed, 13 Mar 2019 13:05:34 -0400 Subject: [PATCH 04/19] clean up lingering hardcoded path --- R/zz_helper_data.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/zz_helper_data.R b/R/zz_helper_data.R index 828fe91..bad484c 100644 --- a/R/zz_helper_data.R +++ b/R/zz_helper_data.R @@ -29,6 +29,7 @@ test_data <- within(list(), { header = TRUE, stringsAsFactors = FALSE, sep = ",") + rm(f.locus_attrs) rownames(locus_attrs) <- locus_attrs$Locus sample.data.cols <- c("Seq", "Count", "Length", "MatchingLocus", "MotifMatch", From 1e02e78620fb82250b856e27312bc04538582926 Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Wed, 13 Mar 2019 16:26:30 -0400 Subject: [PATCH 05/19] Set RNG behavior to match R 3.5.3 I'm hoping this will resolve inconsistencies in RNG behavior between 3.5.3 and the ongoing development that looks to be heading to 3.6.0. --- R/zz_helper_data.R | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/R/zz_helper_data.R b/R/zz_helper_data.R index bad484c..2b5c353 100644 --- a/R/zz_helper_data.R +++ b/R/zz_helper_data.R @@ -16,6 +16,11 @@ #' This list is a bundle of shared data and functions for running unit tests. #' @export test_data <- within(list(), { + # This is a particularly awkward approach now that in the development branch + # for version 3.6.0 the random number generator has changed its behavior. + # The below is a stopgap measure but this should really be reorganized to not + # need to generate the test data at build-time. + RNGversion("3.5.3") # Careful! When running via a package check we might be in temporary # installed copy in /tmp or elsewhere, and probably won't have the "inst" # directory anymore. 
Alternatively when running with devtools::test() we From 684c70c48af5812ac94716a7429c749e5c8d00a1 Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Wed, 13 Mar 2019 16:37:42 -0400 Subject: [PATCH 06/19] update man for test_data --- man/test_data.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/test_data.Rd b/man/test_data.Rd index 41a73b6..de1bd8e 100644 --- a/man/test_data.Rd +++ b/man/test_data.Rd @@ -4,7 +4,7 @@ \name{test_data} \alias{test_data} \title{Helper Data for Tests} -\format{An object of class \code{list} of length 18.} +\format{An object of class \code{list} of length 17.} \usage{ test_data } From 57cb5dea652ac83726a78f66fc8bd8f59a48ecea Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Wed, 13 Mar 2019 16:53:19 -0400 Subject: [PATCH 07/19] For #16: safely unlink all temp testing dirs Use unlink instead of file.remove for directories, as only the former works on Windows. Also make sure to clean up the test files and directories for each test. --- tests/testthat/test_analyze_dataset.R | 11 +++++------ tests/testthat/test_io.R | 21 ++++++++++++++++++--- tests/testthat/test_report.R | 5 +++++ 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/tests/testthat/test_analyze_dataset.R b/tests/testthat/test_analyze_dataset.R index 1f72c82..e3f95fb 100644 --- a/tests/testthat/test_analyze_dataset.R +++ b/tests/testthat/test_analyze_dataset.R @@ -28,8 +28,7 @@ with(test_data, { summary_opts = list(counts.min = 500), nrepeats = 3, ncores = 1) - lapply(dataset$Filename, file.remove) - file.remove(data.dir) + unlink(x = data.dir, recursive = TRUE) # Check the overall structure expect_equal(sapply(results, class), c(summary = "data.frame", @@ -47,8 +46,7 @@ with(test_data, { summary_opts = list(counts.min = 500), nrepeats = 3, ncores = 1) - lapply(dataset$Filename, file.remove) - file.remove(data.dir) + unlink(x = data.dir, recursive = TRUE) # Check the summary data frame with(results$summary, { # First update ordering of dataset's rows. 
The existing order should be @@ -123,8 +121,7 @@ with(test_data, { nrepeats = 3, ncores = 1, known_alleles = known_alleles) - lapply(dataset$Filename, file.remove) - file.remove(data.dir) + unlink(x = data.dir, recursive = TRUE) # Check that the resulting allele names match all the expected values with(results$summary, { @@ -182,6 +179,7 @@ with(test_data, { nrepeats = 3, ncores = 1) }, "ERROR: Locus names in dataset not in attributes table: a, b") + unlink(x = data.dir, recursive = TRUE) }) test_that("analyze_dataset warns of empty input files", { @@ -202,6 +200,7 @@ with(test_data, { }, type = "message") msg_exp <- "WARNING: Zero reads for 1 of 12 data files" expect_true(length(grep(msg_exp, msg)) == 1) + unlink(x = data.dir, recursive = TRUE) }) }) diff --git a/tests/testthat/test_io.R b/tests/testthat/test_io.R index 62499c8..a8fe23a 100644 --- a/tests/testthat/test_io.R +++ b/tests/testthat/test_io.R @@ -119,11 +119,12 @@ with(test_data, { setwd(data.dir) touch(dataset_known$Filename) # Write dataset CSV - fp <- tempfile() + fp <- tempfile(tmpdir = data.dir) write.csv(dataset_known, file = fp, na = "", row.names = FALSE) expect_silent({ dataset <- load_dataset(fp) }) + unlink(x = data.dir, recursive = TRUE) expect_identical(dataset, dataset_known) }) @@ -134,13 +135,14 @@ with(test_data, { data.dir <- tempfile() dir.create(data.dir) setwd(data.dir) - fp <- tempfile() + fp <- tempfile(tmpdir = data.dir) write.csv(dataset_known, file = fp, na = "", row.names = FALSE) # expect_message and capture_messages both do NOT catch text send to stderr, # though capture.output(..., type = "message") does. msg <- capture.output({ dataset <- load_dataset(fp) }, type = "message") + unlink(x = data.dir, recursive = TRUE) expect_true(length(grep("WARNING: Missing 60 of 60 data files", msg)) == 1) expect_identical(dataset, dataset_known) }) @@ -157,9 +159,10 @@ with(test_data, { setwd(data.dir) dataset_known <- setup_dataset() touch(dataset_known$Filename) - fp <- tempfile() + fp <- tempfile(tmpdir = data.dir) save_dataset(dataset_known, fp) dataset <- load_dataset(fp) + unlink(x = data.dir, recursive = TRUE) expect_identical(dataset, dataset_known) }) @@ -174,6 +177,7 @@ with(test_data, { # by default the field ordering is assumed to be replicate, sample, locus data <- setup_data_dir(replicates, samples, loci) dataset <- prepare_dataset(data$dp, data$pattern) + unlink(x = data$dp, recursive = TRUE) expect_equal(colnames(dataset), c("Filename", "Replicate", "Sample", "Locus")) expect_equal(sort(dataset$Filename), sort(data$fps)) @@ -199,6 +203,7 @@ with(test_data, { ord <- c(3, 1, 2) data <- setup_data_dir(replicates, samples, loci, ord) dataset <- prepare_dataset(data$dp, "([A-Za-z0-9]+)-(\\d+)-(\\d+)", ord) + unlink(x = data$dp, recursive = TRUE) expect_equal(colnames(dataset), c("Filename", "Locus", "Replicate", "Sample")) expect_equal(sort(dataset$Filename), sort(data$fps)) @@ -220,6 +225,7 @@ with(test_data, { # warning. 
data <- setup_data_dir(replicates, samples, loci) expect_warning(dataset <- prepare_dataset(data$dp, "(\\d+)-(\\d+)")) + unlink(x = data$dp, recursive = TRUE) }) test_that("prepare_dataset warns of repeated identifier rows", { @@ -234,6 +240,7 @@ with(test_data, { dataset <- prepare_dataset(data$dp, data$pattern) }, "Some replicate/sample/locus combinations match multiple files") + unlink(x = data$dp, recursive = TRUE) }) test_that("prepare_dataset can autolabel replicates", { @@ -247,6 +254,7 @@ with(test_data, { dataset <- prepare_dataset(data$dp, pattern = "()1-(\\d+)-([A-Za-z0-9]+)", autorep = TRUE) + unlink(x = data$dp, recursive = TRUE) extras <- paste0(data$fps[3], c(".2", ".3")) expect_equal(sort(dataset$Filename), sort(c(data$fps, extras))) expect_equal(as.character(dataset$Locus), @@ -278,6 +286,7 @@ with(test_data, { c("Filename", "Replicate", "Sample", "Locus")) expect_equal(sort(dataset$Filename), sort(list.files(dp, recursive = TRUE, full.names = TRUE))) + unlink(x = c(data1, data2, dp), recursive = TRUE) }) test_that("prepare_dataset can separate multiplexed samples", { @@ -302,6 +311,7 @@ with(test_data, { dataset_known$Replicate <- as.integer(dataset_known$Replicate) # Read dataset from disk using the mapping of locus names dataset <- prepare_dataset(data$dp, data$pattern, locusmap = locusmap) + unlink(x = data$dp, recursive = TRUE) # Aside from the different filenames, does everything match up? dataset_known$Filename <- dataset$Filename expect_equal(dataset, dataset_known) @@ -312,6 +322,7 @@ with(test_data, { expect_error({ prepare_dataset(dp, "(\\d+)-(\\d+)-([A-Za-z0-9]+)") }, paste("ERROR: directory path for data files does not exist:", dp)) + unlink(x = dp, recursive = TRUE) }) test_that("prepare_dataset handles no-samples case", { @@ -320,6 +331,7 @@ with(test_data, { expect_error({ prepare_dataset(dp, "(\\d+)-(\\d+)-([A-Za-z0-9]+)") }, paste("ERROR: no data files found:", dp)) + unlink(x = dp, recursive = TRUE) }) @@ -344,6 +356,7 @@ with(test_data, { fps_observed <- sort(file.path(data.dir, list.files(dp_out, recursive = TRUE))) expect_equal(fps_observed, fps_expected) + unlink(x = data.dir, recursive = TRUE) }) test_that("save_seqfile_data works with directory trees", { @@ -367,6 +380,7 @@ with(test_data, { fps_observed <- sort(file.path(data.dir, list.files(dp_out, recursive = TRUE))) expect_equal(fps_observed, fps_expected) + unlink(x = data.dir, recursive = TRUE) }) test_that("save_seqfile_data works with Windows-style paths", { @@ -393,6 +407,7 @@ with(test_data, { fps_observed <- sort(file.path(data.dir, list.files(dp_out, recursive = TRUE))) expect_equal(fps_observed, fps_expected) + unlink(x = data.dir, recursive = TRUE) }) }) diff --git a/tests/testthat/test_report.R b/tests/testthat/test_report.R index 39d9d8c..64133ee 100644 --- a/tests/testthat/test_report.R +++ b/tests/testthat/test_report.R @@ -11,6 +11,7 @@ with(test_data, { png(fp_img) plot_data <- plot_alignment(alignments[["A"]]) dev.off() + unlink(x = fp_img) groups <- c(" 162 bp", " 178 bp", " 182 bp", " 194 bp") groups <- factor(groups) labels <- c("2", "1", "2", "1") @@ -88,6 +89,7 @@ with(test_data, { png(fp_img) plot_data <- plot_heatmap(results, "Stutter") dev.off() + unlink(x = fp_img) expect_equal(class(plot_data), "pheatmap") }) }) @@ -104,6 +106,7 @@ with(test_data, { png(fp_img) plot_data <- plot_heatmap(results, "Stutter") dev.off() + unlink(x = fp_img) expect_equal(class(plot_data), "pheatmap") }) }) @@ -118,6 +121,7 @@ with(test_data, { png(fp_img) plot_data <- plot_heatmap(results, 
"Stutter") dev.off() + unlink(x = fp_img) expect_equal(class(plot_data), "pheatmap") }) }) @@ -133,6 +137,7 @@ with(test_data, { png(fp_img) plot_data <- plot_heatmap(results, "Stutter") dev.off() + unlink(x = fp_img) expect_equal(class(plot_data), "pheatmap") }) }) From 52669186639b7ca7bd19a93ea6de26a7a3fb51f7 Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Wed, 13 Mar 2019 17:29:34 -0400 Subject: [PATCH 08/19] For #16: update save_seqfile_data tests Update the save_seqfile_data tests to compare the file paths actually used, rather than a fictional combination of two sets. normalizePath also should fix the tests when running on Windows. --- tests/testthat/test_io.R | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/tests/testthat/test_io.R b/tests/testthat/test_io.R index a8fe23a..764925b 100644 --- a/tests/testthat/test_io.R +++ b/tests/testthat/test_io.R @@ -352,9 +352,12 @@ with(test_data, { ncores = 1) dp_out <- file.path(data.dir, "results", "processed_files") save_seqfile_data(results$files, dp_out) - fps_expected <- sort(paste0(names(results$files), ".csv")) - fps_observed <- sort(file.path(data.dir, - list.files(dp_out, recursive = TRUE))) + fps_expected <- sort(file.path(dp_out, + paste0(basename(names(results$files)), + ".csv"))) + fps_observed <- sort(list.files(dp_out, + recursive = TRUE, + full.names = TRUE)) expect_equal(fps_observed, fps_expected) unlink(x = data.dir, recursive = TRUE) }) @@ -376,9 +379,13 @@ with(test_data, { ncores = 1) dp_out <- file.path(data.dir, "results", "processed_files") save_seqfile_data(results$files, dp_out) - fps_expected <- sort(paste0(names(results$files), ".csv")) - fps_observed <- sort(file.path(data.dir, - list.files(dp_out, recursive = TRUE))) + fps_expected <- sort(file.path(dp_out, + basename(dirname(names(results$files))), + paste0(basename(names(results$files)), + ".csv"))) + fps_observed <- sort(list.files(dp_out, + recursive = TRUE, + full.names = TRUE)) expect_equal(fps_observed, fps_expected) unlink(x = data.dir, recursive = TRUE) }) @@ -403,9 +410,17 @@ with(test_data, { names(results$files) <- gsub("/", "\\\\", names(results$files)) save_seqfile_data(results$files, dp_out) names(results$files) <- gsub("\\\\", "/", names(results$files)) - fps_expected <- sort(paste0(names(results$files), ".csv")) - fps_observed <- sort(file.path(data.dir, - list.files(dp_out, recursive = TRUE))) + fps_expected <- sort(file.path(dp_out, + basename(dirname(names(results$files))), + paste0(basename(names(results$files)), + ".csv"))) + fps_observed <- sort(list.files(dp_out, + recursive = TRUE, + full.names = TRUE)) + # Normalize any lingering \ or / inconsistencies, so this test should also + # pass on Windows itself. + fps_expected <- normalizePath(fps_expected) + fps_observed <- normalizePath(fps_observed) expect_equal(fps_observed, fps_expected) unlink(x = data.dir, recursive = TRUE) }) From 0ed94f0d089b33fc64571a2954c9591da9cd45c8 Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Thu, 14 Mar 2019 08:10:58 -0400 Subject: [PATCH 09/19] remove unneeded unlink() in i/o test That test never actually creates its temp file, so it doesn't need to be removed. 
--- tests/testthat/test_io.R | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/testthat/test_io.R b/tests/testthat/test_io.R index 764925b..d4e8bea 100644 --- a/tests/testthat/test_io.R +++ b/tests/testthat/test_io.R @@ -322,7 +322,6 @@ with(test_data, { expect_error({ prepare_dataset(dp, "(\\d+)-(\\d+)-([A-Za-z0-9]+)") }, paste("ERROR: directory path for data files does not exist:", dp)) - unlink(x = dp, recursive = TRUE) }) test_that("prepare_dataset handles no-samples case", { From 9fab38a5073a1168ea64f7a3324995e28bf70ab2 Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Thu, 14 Mar 2019 08:17:05 -0400 Subject: [PATCH 10/19] For #16: also safely unlink during test_data setup --- R/zz_helper_data.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/zz_helper_data.R b/R/zz_helper_data.R index 2b5c353..8d1aaa0 100644 --- a/R/zz_helper_data.R +++ b/R/zz_helper_data.R @@ -159,8 +159,7 @@ test_data <- within(list(), { results <- analyze_dataset(dataset, locus_attrs, nrepeats = 3, ncores = 1, analysis_opts = list(fraction.min = 0.05), summary_opts = list(counts.min = 500)) - lapply(dataset$Filename, file.remove) - file.remove(data.dir) + unlink(data.dir, recursive = TRUE) return(list(dataset = dataset, results = results)) } From 5566b8dc6d71bdfbf9e2ba3d8e5af09bce7ed2a3 Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Thu, 14 Mar 2019 08:21:34 -0400 Subject: [PATCH 11/19] Compare fixed strings when testing error messages testthat::expect_error takes a regex by default, and it just happens to work as expected on Linux but fails on Windows due to \ in file paths. adding grep's fixed = TRUE argument to settle this. --- tests/testthat/test_io.R | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test_io.R b/tests/testthat/test_io.R index d4e8bea..25e4341 100644 --- a/tests/testthat/test_io.R +++ b/tests/testthat/test_io.R @@ -320,16 +320,20 @@ with(test_data, { test_that("prepare_dataset handles missing data directory", { dp <- tempfile() expect_error({ - prepare_dataset(dp, "(\\d+)-(\\d+)-([A-Za-z0-9]+)") - }, paste("ERROR: directory path for data files does not exist:", dp)) + prepare_dataset(dp, "(\\d+)-(\\d+)-([A-Za-z0-9]+)") + }, + paste("ERROR: directory path for data files does not exist:", dp), + fixed = TRUE) }) test_that("prepare_dataset handles no-samples case", { dp <- tempfile() dir.create(dp) expect_error({ - prepare_dataset(dp, "(\\d+)-(\\d+)-([A-Za-z0-9]+)") - }, paste("ERROR: no data files found:", dp)) + prepare_dataset(dp, "(\\d+)-(\\d+)-([A-Za-z0-9]+)") + }, + paste("ERROR: no data files found:", dp), + fixed = TRUE) unlink(x = dp, recursive = TRUE) }) From 9eafffba4795d3c9ceb3765d46784109d2280941 Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Thu, 14 Mar 2019 09:09:54 -0400 Subject: [PATCH 12/19] Update NEWS.md for latest changes --- NEWS.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/NEWS.md b/NEWS.md index f6b67e8..7a70f53 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,7 +1,12 @@ # chiimp dev + * Fixed package checks and testing on latest R development releases ([#27]). + * Fixed test behavior on Windows and improved test organization ([#16]). * Added documentation corrections and improvements. 
+[#27]: https://github.com/ShawHahnLab/chiimp/issues/27 +[#16]: https://github.com/ShawHahnLab/chiimp/issues/16 + # chiimp 0.2.2 * Fixed heatmap plotting via updated `plot_heatmap` for cases with blank From c0d2afd192143808140c7960809e5d6d98b4695b Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Thu, 14 Mar 2019 09:14:33 -0400 Subject: [PATCH 13/19] Add more reminders to release prep script --- .utils/prep_release.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.utils/prep_release.sh b/.utils/prep_release.sh index ad84f07..7dcf9d7 100755 --- a/.utils/prep_release.sh +++ b/.utils/prep_release.sh @@ -52,3 +52,8 @@ echo " * Update NEWS.md with all updates under a heading matching this version" echo " * Check README.md for link to this version" echo " * Make sure GUIDE.Rmd is up-to-date and rendered GUIDE.pdf is correct" echo +echo "ALSO:" +echo " * Draft release from tag on github including archive files with bundled" +echo " GUIDE.pdf" +echo " * Merge release-### into master, dev, and gh-pages" +echo From 89c0278865b2c8f5257d95a2926b9a30a127896e Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Thu, 14 Mar 2019 09:22:21 -0400 Subject: [PATCH 14/19] bump version to 0.2.3 --- DESCRIPTION | 2 +- NEWS.md | 2 +- README.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index e955ea3..533db08 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: chiimp Title: Computational, High-throughput Individual Identification through Microsatellite Profiling -Version: 0.2.2 +Version: 0.2.3 Authors@R: person("Jesse", "Connell", email = "ancon@upenn.edu", role = c("aut", "cre")) Description: An R package to analyze microsatellites in high-throughput sequencing datasets. Depends: R (>= 3.2.3) diff --git a/NEWS.md b/NEWS.md index 7a70f53..410440e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -# chiimp dev +# chiimp 0.2.3 * Fixed package checks and testing on latest R development releases ([#27]). * Fixed test behavior on Windows and improved test organization ([#16]). diff --git a/README.md b/README.md index b5dce79..b323ea9 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ high-throughput sequencing datasets. For automated installation and program usage see GUIDE.pdf in a [released version](https://github.com/ShawHahnLab/chiimp/releases). -The most recent released version is [0.2.2](https://github.com/ShawHahnLab/chiimp/releases/tag/0.2.2). +The most recent released version is [0.2.3](https://github.com/ShawHahnLab/chiimp/releases/tag/0.2.3). For usage as an R package also see the built-in package documentation. The package-level page (`?chiimp`) provides an overview with links to specific functions. From 44927eb4e4bc445fa33417613686b7ea6a1291db Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Thu, 14 Mar 2019 09:28:02 -0400 Subject: [PATCH 15/19] Add more documentation links to main README --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b323ea9..8f53aba 100644 --- a/README.md +++ b/README.md @@ -6,9 +6,12 @@ Computational, High-throughput Individual Identification through Microsatellite An R package and standalone program to analyze microsatellites in high-throughput sequencing datasets. -For automated installation and program usage see GUIDE.pdf in a -[released version](https://github.com/ShawHahnLab/chiimp/releases). 
+For automated installation and program usage see [GUIDE.pdf] here or in a +[released version](https://github.com/ShawHahnLab/chiimp/releases), and the [worked examples]. The most recent released version is [0.2.3](https://github.com/ShawHahnLab/chiimp/releases/tag/0.2.3). For usage as an R package also see the built-in package documentation. The package-level page (`?chiimp`) provides an overview with links to specific functions. + +[GUIDE.pdf]: https://shawhahnlab.github.io/chiimp/GUIDE.pdf +[worked examples]: https://shawhahnlab.github.io/chiimp/docs From bca40d8b58ea01ecebb5dcbddcf572cd25c5ce8e Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Thu, 14 Mar 2019 09:46:17 -0400 Subject: [PATCH 16/19] Add "pause" statement to Windows installer This makes it easier to see the installation outcome. (Otherwise the cmd.exe window closes as soon as the installer finishes.) --- install_windows.cmd | 1 + 1 file changed, 1 insertion(+) diff --git a/install_windows.cmd b/install_windows.cmd index 3441b26..4517fd3 100755 --- a/install_windows.cmd +++ b/install_windows.cmd @@ -17,3 +17,4 @@ set pkgdir=%~dp0 REM Run bulk of the install within R. "%rscript%" --vanilla "%pkgdir%\install_windows.R" +pause From cb3f371495452ebdbc2eac5e349ad562264f2b20 Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Thu, 14 Mar 2019 09:47:28 -0400 Subject: [PATCH 17/19] Simplify Windows install; do not auto-update deps Only install devtools and msa if not already present; do not run separate dependency install and package test steps as they're already handled in devtools::install(); Disable devtools' automatic package upgrades during install. --- install_windows.R | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/install_windows.R b/install_windows.R index f9fef14..07cc54f 100755 --- a/install_windows.R +++ b/install_windows.R @@ -21,38 +21,25 @@ if (! any(file.access(.libPaths(), 2) == 0)) { .libPaths(dp) } -cat("\n") -cat("### Installing devtools\n") -cat("\n") -install.packages("devtools", repos = "https://cloud.r-project.org") - -cat("\n") -cat("### Installing Bioconductor and MSA\n") -cat("\n") -source("https://bioconductor.org/biocLite.R") -biocLite("msa") - -cat("\n") -cat("### Installing dependencies\n") -cat("\n") -devtools::install_deps(path, dependencies = TRUE) - -cat("\n") -cat("### Testing CHIIMP\n") -cat("\n") -status <- sum(as.data.frame(devtools::test(path))$failed) -if (status == 1) { +if (! require("devtools", character.only = TRUE, quietly = TRUE)) { cat("\n") + cat("### Installing devtools\n") cat("\n") - cat(" Warning: Tests indicated failures.\n") + install.packages("devtools", repos = "https://cloud.r-project.org") +} + +if (! 
suppressMessages(require("msa", character.only = TRUE, quietly = TRUE))) { cat("\n") + cat("### Installing Bioconductor and MSA\n") cat("\n") + source("https://bioconductor.org/biocLite.R") + biocLite("msa") } cat("\n") cat("### Installing CHIIMP\n") cat("\n") -devtools::install(path) +devtools::install(path, upgrade = "never") shortcut_path <- file.path(UPROF, "Desktop", "CHIIMP.lnk") chiimp_path <- system.file("bin", "chiimp.cmd", package = "chiimp") From 349222b116a32719c80b3cc96bca45c0c363f25c Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Thu, 14 Mar 2019 10:19:34 -0400 Subject: [PATCH 18/19] quieter package checks for windows installer --- install_windows.R | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/install_windows.R b/install_windows.R index 07cc54f..b0e9103 100755 --- a/install_windows.R +++ b/install_windows.R @@ -21,14 +21,20 @@ if (! any(file.access(.libPaths(), 2) == 0)) { .libPaths(dp) } -if (! require("devtools", character.only = TRUE, quietly = TRUE)) { +haspkg <- function(pkgname) { + suppressMessages(suppressWarnings( + require(pkgname, character.only = TRUE, quietly = TRUE) + )) +} + +if (! haspkg("devtools")) { cat("\n") cat("### Installing devtools\n") cat("\n") install.packages("devtools", repos = "https://cloud.r-project.org") } -if (! suppressMessages(require("msa", character.only = TRUE, quietly = TRUE))) { +if (! haspkg("msa")) { cat("\n") cat("### Installing Bioconductor and MSA\n") cat("\n") From f4a904a6c85714ca129038843476a68b0bddeee0 Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Thu, 14 Mar 2019 10:27:25 -0400 Subject: [PATCH 19/19] Also do not auto-update deps for msa package --- install_windows.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install_windows.R b/install_windows.R index b0e9103..1d22178 100755 --- a/install_windows.R +++ b/install_windows.R @@ -39,7 +39,7 @@ if (! haspkg("msa")) { cat("### Installing Bioconductor and MSA\n") cat("\n") source("https://bioconductor.org/biocLite.R") - biocLite("msa") + biocLite("msa", suppressUpdates = TRUE) } cat("\n")
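
Patches 17 through 19 touch overlapping parts of install_windows.R, so the net effect is easier to see reassembled than read hunk by hunk. The section below shows how the script reads once all three patches are applied, with the cat() progress banners omitted for brevity; path is the package source directory defined earlier in the script and is not shown in these hunks. Treat this as a reading aid rather than a substitute for the diffs.

    # install_windows.R after patches 17-19, reassembled (banners omitted;
    # `path` is defined earlier in the script).

    haspkg <- function(pkgname) {
      # require() returns TRUE/FALSE; the wrappers silence startup messages
      # and warnings so a missing package does not clutter the install log.
      suppressMessages(suppressWarnings(
        require(pkgname, character.only = TRUE, quietly = TRUE)
      ))
    }

    if (! haspkg("devtools")) {
      install.packages("devtools", repos = "https://cloud.r-project.org")
    }

    if (! haspkg("msa")) {
      source("https://bioconductor.org/biocLite.R")
      # suppressUpdates = TRUE keeps biocLite from upgrading packages the
      # user already has installed.
      biocLite("msa", suppressUpdates = TRUE)
    }

    # upgrade = "never" likewise stops devtools from auto-upgrading existing
    # dependencies while installing CHIIMP itself.
    devtools::install(path, upgrade = "never")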