diff --git a/.utils/prep_release.sh b/.utils/prep_release.sh
index 11e21b6..7dcf9d7 100755
--- a/.utils/prep_release.sh
+++ b/.utils/prep_release.sh
@@ -5,37 +5,55 @@
 set -e
 
 VERSION=$1
+SEP="==="
 
 chiimp_check='x<-devtools::check();quit(save="no",status=length(c(x$errors,x$warnings)))'
 
+echo "$SEP Running spell check"
+./.utils/spellcheck.R
+
 # Run lint script
-echo "Running lint check"
+echo "$SEP Running lint check"
 ./.utils/lint.R
 
-# Update version in download link in README
-VER_MSG="The most recent released version is"
-TAG_URL="https\\://github.com/ShawHahnLab/chiimp/releases/tag"
-SED_README="s:$VER_MSG \\[[0-9.]+\\]\\($TAG_URL/[0-9.]+\\)\\.:$VER_MSG [$VERSION]($TAG_URL/$VERSION).:"
-sed -i -r "$SED_README" README.md
+if [[ $VERSION != "" ]]; then
+    # Update version in download link in README
+    VER_MSG="The most recent released version is"
+    TAG_URL="https\\://github.com/ShawHahnLab/chiimp/releases/tag"
+    SED_README="s:$VER_MSG \\[[0-9.]+\\]\\($TAG_URL/[0-9.]+\\)\\.:$VER_MSG [$VERSION]($TAG_URL/$VERSION).:"
+    sed -i -r "$SED_README" README.md
 
-# Update version in DESCRIPTION and NEWS.md
-sed -i "s/Version: .*$/Version: $VERSION/" DESCRIPTION
-sed -i "s/# chiimp dev/# chiimp $VERSION/" NEWS.md
+    # Update version in DESCRIPTION and NEWS.md
+    sed -i "s/Version: .*$/Version: $VERSION/" DESCRIPTION
+    sed -i "s/# chiimp dev/# chiimp $VERSION/" NEWS.md
+fi
 
+echo "$SEP Running devtools::check()"
 R --slave --vanilla -e "$chiimp_check"
+
+echo "$SEP Rendering user guide"
 R --slave --vanilla -e "rmarkdown::render('GUIDE.Rmd', output_file = 'GUIDE.pdf', quiet = TRUE)"
 
 # Create bundled ZIP and TGZ versions without hidden top level files (such as
 # the git and travis stuff) and with the GUIDE.pdf.
-pushd ..
-zip -r chiimp-v${VERISON}.zip chiimp/*
-tar czvf chiimp-v${VERSION}.tgz chiimp/*
-popd
+if [[ $VERSION != "" ]]; then
+    echo "$SEP Creating release archives"
+    pushd ..
+    zip -r chiimp-v${VERSION}.zip chiimp/*
+    tar czvf chiimp-v${VERSION}.tgz chiimp/*
+    popd
+fi
 
 echo
 echo "REMINDER BEFORE TAGGING RELEASE $VERSION:"
 echo
 echo " * Run full test on Mac OS, Windows, and Linux"
 echo " * Update NEWS.md with all updates under a heading matching this version"
+echo " * Check README.md for link to this version"
 echo " * Make sure GUIDE.Rmd is up-to-date and rendered GUIDE.pdf is correct"
 echo
+echo "ALSO:"
+echo " * Draft release from tag on github including archive files with bundled"
+echo "   GUIDE.pdf"
+echo " * Merge release-### into master, dev, and gh-pages"
+echo
diff --git a/.utils/spellcheck.R b/.utils/spellcheck.R
new file mode 100755
index 0000000..a18399d
--- /dev/null
+++ b/.utils/spellcheck.R
@@ -0,0 +1,12 @@
+#!/usr/bin/env Rscript
+
+# Spell-check the documentation files. Note they'll have to be updated e.g.
+# with devtools::document() first.
+ +ignore <- read.table(".utils/wordlist.txt", + header = FALSE, + stringsAsFactors = FALSE)[, 1] +results <- devtools::spell_check(ignore = ignore) +if (length(results) > 0) { + results +} diff --git a/.utils/wordlist.txt b/.utils/wordlist.txt new file mode 100644 index 0000000..a40be6a --- /dev/null +++ b/.utils/wordlist.txt @@ -0,0 +1,26 @@ +ABCD +ACTG +artifactual +autocalculated +Autogenerate +CHIIMP +CHIIMP's +config +Connell +dereplicated +Dereplicates +FASTA +FASTQ +genotype +Genotype +heterozygous +Heterozygous +homozygous +Homozygous +MSA +pandoc +Pandoc +PPI +seqs +STR +YAML diff --git a/DESCRIPTION b/DESCRIPTION index e955ea3..533db08 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: chiimp Title: Computational, High-throughput Individual Identification through Microsatellite Profiling -Version: 0.2.2 +Version: 0.2.3 Authors@R: person("Jesse", "Connell", email = "ancon@upenn.edu", role = c("aut", "cre")) Description: An R package to analyze microsatellites in high-throughput sequencing datasets. Depends: R (>= 3.2.3) diff --git a/NEWS.md b/NEWS.md index 82cea8e..410440e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,12 @@ +# chiimp 0.2.3 + + * Fixed package checks and testing on latest R development releases ([#27]). + * Fixed test behavior on Windows and improved test organization ([#16]). + * Added documentation corrections and improvements. + +[#27]: https://github.com/ShawHahnLab/chiimp/issues/27 +[#16]: https://github.com/ShawHahnLab/chiimp/issues/16 + # chiimp 0.2.2 * Fixed heatmap plotting via updated `plot_heatmap` for cases with blank diff --git a/R/analyze_dataset.R b/R/analyze_dataset.R index 45e2eaf..7849d75 100644 --- a/R/analyze_dataset.R +++ b/R/analyze_dataset.R @@ -189,9 +189,9 @@ tidy_analyzed_dataset <- function(dataset, raw.results) { #' For the given results list (pair of summary data frame and list of per-sample #' data frames as produced by \code{\link{tidy_analyzed_dataset}}), add columns #' to all data frames defining names for recognized sequences. For the summary -#' data frame this will be Allele1Name and Allele2Name. For each sample data -#' frame this will be SeqName, defined for any sequences represented in the -#' summary or in a given known alleles set. +#' data frame this will be \code{Allele1Name} and \code{Allele2Name}. For each +#' sample data frame this will be \code{SeqName}, defined for any sequences +#' represented in the summary or in a given known alleles set. #' #' @param results results list as produced by #' \code{\link{tidy_analyzed_dataset}}. @@ -202,10 +202,11 @@ tidy_analyzed_dataset <- function(dataset, raw.results) { #' \code{\link{make_allele_name}}. #' #' @return list of results, with \code{summary} set to the single summary data -#' frame and \code{data} the per-sample data frames. A "SeqName" column in -#' sample data frames and "Allele1Name" and "Allele2Name" columns in the -#' summary data frame will associate any sequence matching a known allele (for -#' either the given table or the current dataset) with a text name. +#' frame and \code{data} the per-sample data frames. A \code{SeqName} column +#' in sample data frames and \code{Allele1Name} and \code{Allele2Name} columns +#' in the summary data frame will associate any sequence matching a known +#' allele (for either the given table or the current dataset) with a text +#' name. 
name_known_sequences <- function(results, known_alleles, name_args) { # Name all of the called alleles across samples results$summary <- name_alleles_in_table(results$summary, known_alleles, diff --git a/R/analyze_sample.R b/R/analyze_sample.R index 7b615bf..ada9b70 100644 --- a/R/analyze_sample.R +++ b/R/analyze_sample.R @@ -76,12 +76,13 @@ analyze_sample <- function(seq_data, sample.attrs, fraction.min) { } #' @describeIn analyze_sample version of sample analysis guided by expected -#' sequence length values. Additional items ExpectedLength1 and optionally -#' ExpectedLength2 can be supplied in the \code{sample.attrs} list. If NA or -#' missing the behavior will match \code{analyze_sample}. If two expected -#' lengths are given, the fraction.min argument is ignored. If at least one -#' expected length is given, the stutter/artifact filtering is disabled. From -#' here use \code{\link{summarize_sample_guided}}. +#' sequence length values. Additional items \code{ExpectedLength1} and +#' optionally \code{ExpectedLength2} can be supplied in the +#' \code{sample.attrs} list. If NA or missing the behavior will match +#' \code{analyze_sample}. If two expected lengths are given, the fraction.min +#' argument is ignored. If at least one expected length is given, the +#' stutter/artifact filtering is disabled. From here use +#' \code{\link{summarize_sample_guided}}. #' #' @export analyze_sample_guided <- function(seq_data, sample.attrs, fraction.min) { diff --git a/R/analyze_seqs.R b/R/analyze_seqs.R index 1586877..ad27310 100644 --- a/R/analyze_seqs.R +++ b/R/analyze_seqs.R @@ -9,27 +9,27 @@ #' #' @details #' Columns in the returned data frame: -#' * Seq: sequence text for each unique sequence -#' * Count: integer count of occurrences of this exact sequence -#' * Length: integer sequence length -#' * MatchingLocus: factor for the name of the locus matching each sequence, -#' by checking the primer -#' * MotifMatch: logical: are there are least \code{nrepeats} perfect +#' * \code{Seq}: sequence text for each unique sequence +#' * \code{Count}: integer count of occurrences of this exact sequence +#' * \code{Length}: integer sequence length +#' * \code{MatchingLocus}: factor for the name of the locus matching each +#' sequence, by checking the primer +#' * \code{MotifMatch}: logical: are there are least \code{nrepeats} perfect #' adjacent repeats of the STR motif for the matching locus? -#' * LengthMatch: logical: is the sequence length within the expected range -#' for the matching locus? -#' * Ambiguous: logical: are there unexpected characters in the sequence +#' * \code{LengthMatch}: logical: is the sequence length within the expected +#' range for the matching locus? +#' * \code{Ambiguous}: logical: are there unexpected characters in the sequence #' content? -#' * Stutter: integer: for any sequence that looks like potential PCR stutter, -#' the index of the row that may be the source of the stutter band. -#' * Artifact: integer: for any sequence that looks like potential PCR artifact -#' (other than stutter), the index of the row that may be the source of the -#' stutter band. -#' * FractionOfTotal: numeric fraction of the number of sequences +#' * \code{Stutter}: integer: for any sequence that looks like potential PCR +#' stutter, the index of the row that may be the source of the stutter band. +#' * \code{Artifact}: integer: for any sequence that looks like potential PCR +#' artifact (other than stutter), the index of the row that may be the source +#' of the stutter band. 
+#' * \code{FractionOfTotal}: numeric fraction of the number of sequences #' represented by each unique sequence compared to the total. -#' * FractionOfLocus: numeric fraction of the number of sequences represented -#' by each unique sequence compared to the total for that particular -#' matching locus. +#' * \code{FractionOfLocus}: numeric fraction of the number of sequences +#' represented by each unique sequence compared to the total for that +#' particular matching locus. #' @md #' #' @param seqs character vector containing sequences. @@ -214,8 +214,8 @@ find_stutter <- function(sample.data, locus_attrs, #' Searches a processed STR sample for entries that may be PCR artifacts, other #' than stutter, from another entry in the sample. Potential artifacts are #' sequences with counts lower than another sequence by a given ratio and -#' sequence length within 1 bp of the other sequence. This only considers -#' STR-labeled rows and requires a given entry to have counts at most +#' sequence length within 1 nucleotide of the other sequence. This only +#' considers STR-labeled rows and requires a given entry to have counts at most #' \code{count.ratio_max} compared to the candidate "source" entry to be #' considered an artifact. Sequence content is not currently considered, just #' relative sequence lengths and counts. diff --git a/R/categorize.R b/R/categorize.R index 00d9c2c..96155cb 100644 --- a/R/categorize.R +++ b/R/categorize.R @@ -4,10 +4,10 @@ #' #' Using the Name column of the given results summary data frame, pair each #' called genotype with the known alleles. A data frame with two columns, -#' CorrectAllele1Seq and CorrectAllele2Seq, is returned. If matching entries are -#' found in Allele1Seq and/or Allele2Seq the order will be preserved, and at -#' this point the two allele entries should match up directly for genotypes that -#' were called correctly. +#' \code{CorrectAllele1Seq} and \code{CorrectAllele2Seq}, is returned. If +#' matching entries are found in \code{Allele1Seq} and/or \code{Allele2Seq} the +#' order will be preserved, and at this point the two allele entries should +#' match up directly for genotypes that were called correctly. #' #' @param results_summary cross-sample summary data frame as produced by #' \code{\link{analyze_dataset}}. @@ -41,10 +41,10 @@ match_known_genotypes <- function(results_summary, genotypes.known) { #' Categorize genotyping results #' -#' For a given results summary data frame that has CorrectAllele1Seq and Correct -#' Allele2Seq columns (such as produced by \code{\link{match_known_genotypes}}) -#' added, create a factor labeling every row of the input data frame by its -#' genotyping outcome. +#' For a given results summary data frame that has \code{CorrectAllele1Seq} and +#' \code{CorrectAllele2Seq} columns (such as produced by +#' \code{\link{match_known_genotypes}}) added, create a factor labeling every +#' row of the input data frame by its genotyping outcome. #' #' @details #' Levels in the returned factor, in order: @@ -56,8 +56,8 @@ match_known_genotypes <- function(results_summary, genotypes.known) { #' * Dropped Allele: One called allele is correct for a heterozygous individual, #' but no second allele was called. #' -#' Cases that should not occur, such as CorrectAllele1Seq and CorrectAllele2Seq -#' both set to NA, map to NA in the returned factor. +#' Cases that should not occur, such as \code{CorrectAllele1Seq} and +#' \code{CorrectAllele2Seq} both set to NA, map to NA in the returned factor. 
#' @md #' #' @param results_summary cross-sample summary data frame as produced by diff --git a/R/chiimp.R b/R/chiimp.R index 03c8a97..6a7df4a 100644 --- a/R/chiimp.R +++ b/R/chiimp.R @@ -67,48 +67,49 @@ #' The workflow above outlines CHIIMP's behavior when called as a standalone #' program, where \code{\link{main}} loads a configuration file into a nested #' list of options and calls \code{\link{full_analysis}}. The public functions -#' linked above can also be used idependently; see the documentation and code +#' linked above can also be used independently; see the documentation and code #' examples for the individual functions for more information. #' #' #' **The Package structure of the source files, grouped by topic:** #' * Main Interface: -#' * chiimp.R: Main entry point for command-line usage (\code{\link{main}}) -#' and R usage (\code{\link{full_analysis}}). +#' * \code{chiimp.R}: Main entry point for command-line usage +#' (\code{\link{main}}) and R usage (\code{\link{full_analysis}}). #' * Data Analysis: -#' * analyze_dataset.R: High-level interface to analyze all samples across a -#' given dataset (\code{\link{analyze_dataset}}); used by +#' * \code{analyze_dataset.R}: High-level interface to analyze all samples +#' across a given dataset (\code{\link{analyze_dataset}}); used by #' \code{\link{full_analysis}} to manage the main part of the processing. -#' * summarize_dataset.R: High-level interface to provide inter-sample and -#' inter-locus analyses (\code{\link{summarize_dataset}}); used by +#' * \code{summarize_dataset.R}: High-level interface to provide inter-sample +#' and inter-locus analyses (\code{\link{summarize_dataset}}); used by #' \code{\link{full_analysis}} to manage the second stage of the #' processing. -#' * analyze_seqs.R: Low-level interface to convert raw sequence input to a -#' data frame of unique sequences (\code{\link{analyze_seqs}}); used by -#' \code{\link{analyze_dataset}}. -#' * analyze_sample.R: Low-level interface to extract per-locus details from -#' a data frame of unique sequences (\code{\link{analyze_sample}}); used by -#' \code{\link{analyze_dataset}}. -#' * summarize_sample.R: Low-level interface to condense each sample data -#' frame into a a concise list of consistent attributes, suitable for +#' * \code{analyze_seqs.R}: Low-level interface to convert raw sequence input +#' to a data frame of unique sequences (\code{\link{analyze_seqs}}); used +#' by \code{\link{analyze_dataset}}. +#' * \code{analyze_sample.R}: Low-level interface to extract per-locus +#' details from a data frame of unique sequences +#' (\code{\link{analyze_sample}}); used by \code{\link{analyze_dataset}}. +#' * \code{summarize_sample.R}: Low-level interface to condense each sample +#' data frame into a a concise list of consistent attributes, suitable for #' binding together across samples for a dataset #' (\code{\link{summarize_sample}}); used by \code{\link{analyze_dataset}}. -#' * categorize.R: Low-level helper functions used by +#' * \code{categorize.R}: Low-level helper functions used by #' \code{\link{summarize_dataset}} for samples with known identity. #' * Plotting and reporting: -#' * report.R: Various plotting and summarizing functions used when rendering -#' a report in \code{\link{full_analysis}}. -#' * histogram.R: Sequence histogram plotting tools (\code{\link{histogram}}) -#' as used during \code{\link{full_analysis}}. -#' * markdown.R: Various helper functions for adding tables and plots to an R -#' Markdown report as used in \code{\link{full_analysis}}. 
+#' * \code{report.R}: Various plotting and summarizing functions used when +#' rendering a report in \code{\link{full_analysis}}. +#' * \code{histogram.R}: Sequence histogram plotting tools +#' (\code{\link{histogram}}) as used during \code{\link{full_analysis}}. +#' * \code{markdown.R}: Various helper functions for adding tables and plots +#' to an R Markdown report as used in \code{\link{full_analysis}}. #' * Utility Functions and Configuration: -#' * configuration.R: The default configuration options -#' (\code{\link{config.defaults}}) used by \code{\link{full_analysis}}. -#' * io.R: various helper input/output functions used loading and saving -#' sequence data files, spreadsheets, and plots used in multiple parts of the -#' package. -#' * util.R: Various helper functions used in multiple parts of the package. +#' * \code{configuration.R}: The default configuration options +#' (\code{\link{config.defaults}}) used by \code{\link{full_analysis}}. +#' * \code{io.R}: various helper input/output functions used loading and +#' saving sequence data files, spreadsheets, and plots used in multiple +#' parts of the package. +#' * \code{util.R}: Various helper functions used in multiple parts of the +#' package. #' #' @md #' diff --git a/R/configuration.R b/R/configuration.R index e0947d3..e4d22f8 100644 --- a/R/configuration.R +++ b/R/configuration.R @@ -8,18 +8,18 @@ #' itself to see all of the build-time defaults. #' #' Notable Options: -#' * dataset_opts: -#' * dp: directory path to input sequence files -#' * pattern: regular expression for the input filename pattern -#' * ord: order of fields Replicate, Sample, and Locus in in the input -#' filename pattern. For example, if Locus is the first field followed by -#' Replicate and Sample, set \code{ord=c(3, 1, 2)}. -#' * output: -#' * dp: directory path for saving output data -#' * fp_dataset: file path to table of sample attributes to use, rather than -#' detecting via dataset_opts -#' * fp_locus_attrs: file path to locus attributes CSV file -#' * fp_genotypes_known: file path to known genotypes CSV file +#' * \code{dataset_opts}: +#' * \code{dp}: directory path to input sequence files +#' * \code{pattern}: regular expression for the input filename pattern +#' * \code{ord}: order of fields Replicate, Sample, and Locus in in the +#' input filename pattern. For example, if Locus is the first field +#' followed by Replicate and Sample, set \code{ord=c(3, 1, 2)}. 
+#' * \code{output}:
+#'     * \code{dp}: directory path for saving output data
+#' * \code{fp_dataset}: file path to table of sample attributes to use, rather
+#'   than detecting via dataset_opts
+#' * \code{fp_locus_attrs}: file path to locus attributes CSV file
+#' * \code{fp_genotypes_known}: file path to known genotypes CSV file
 #' @md
 #'
 #' @export
diff --git a/R/io.R b/R/io.R
index 02ac8e1..31fce4b 100644
--- a/R/io.R
+++ b/R/io.R
@@ -48,14 +48,14 @@ load_config <- function(fp) {
 #'
 #' @details
 #' Columns Required:
-#' * Locus: Unique identifier for a given locus
-#' * LengthMin: Minimum known allele sequence length for this locus
-#' * LengthMax: Minimum known allele sequence length for this locus
-#' * LengthBuffer: Additional sequence length below LengthMin and above
-#'   LengthMax to accept for a candidate allele
-#' * Primer: The forward PCR primer sequence for a given locus, used when
-#'   matching sequences to loci
-#' * ReversePrimer: The reverse PCR primer sequence
+#' * \code{Locus}: Unique identifier for a given locus
+#' * \code{LengthMin}: Minimum known allele sequence length for this locus
+#' * \code{LengthMax}: Maximum known allele sequence length for this locus
+#' * \code{LengthBuffer}: Additional sequence length below \code{LengthMin}
+#'   and above \code{LengthMax} to accept for a candidate allele
+#' * \code{Primer}: The forward PCR primer sequence for a given locus, used
+#'   when matching sequences to loci
+#' * \code{ReversePrimer}: The reverse PCR primer sequence
 #' @md
 #'
 #' @param fp.locus_attrs path to text file.
@@ -318,8 +318,8 @@ prepare_dataset <- function(dp, pattern, ord = c(1, 2, 3), autorep=FALSE,
 #' Load vector of sequences from FASTA/FASTQ file
 #'
 #' Load a vector of character sequences from the given path. This is just a
-#' wrapper around dnar to choose the parser based on filename. Only the
-#' sequences are returned, not IDs or quality scores.
+#' wrapper around \code{\link[dnar:read.fa]{dnar}} to choose the parser based on
+#' filename. Only the sequences are returned, not IDs or quality scores.
 #'
 #' @param fp path to sequence file
 #'
@@ -399,9 +399,9 @@ save_allele_seqs <- function(results_summary, dp) {
 #' \code{\link{analyze_seqs}}) to a separate file in the specified directory
 #' path, in CSV format. The directory structure will start at the first shared
 #' directory of the input file paths.
-#' For example, if the inputs were /data/run1/file.fastq and
-#' /data/run2/file.fastq there will be run1 and run2 directories inside the
-#' given `dp` directory.
+#' For example, if the inputs were \code{/data/run1/file.fastq} and
+#' \code{/data/run2/file.fastq} there will be run1 and run2 directories inside
+#' the given \code{dp} directory.
 #'
 #' @param results_file_data list of per-file data frames as produced by
 #'   \code{\link{analyze_dataset}}.
@@ -445,7 +445,7 @@ save_sample_data <- function(results_data, dp) {
 
 #' Save alignments to FASTA files
 #'
-#' Take a list of alignments, one per locus, and save each to a separate fasta
+#' Take a list of alignments, one per locus, and save each to a separate FASTA
 #' file in a specified directory. If any of the per-locus alignment objects is
 #' NA it will be skipped. These are produced by \code{\link{summarize_dataset}}
 #' via \code{\link{align_alleles}}.
diff --git a/R/report.R b/R/report.R
index 6534b4a..75fa38b 100644
--- a/R/report.R
+++ b/R/report.R
@@ -23,7 +23,7 @@ normalize_alleles <- function(data) {
 #' Allele pairs are shown in a standardized order with homozygous entries shown
 #' twice.
#' -#' @param data data frame containing Allele1Name and Allele2Name colums such as +#' @param data data frame containing Allele1Name and Allele2Name columns such as #' the first list item produced by \code{\link{analyze_dataset}}. If allele #' names are not yet present call \code{\link{name_alleles_in_table}}. #' @param extra_cols names or index values of additional columns from input data @@ -126,7 +126,7 @@ report_genotypes <- function(results, #' @param na.replicates text to replace NA entries with for the Replicates #' column. #' -#' @return data frame showing summary of sample genotypes with interleved +#' @return data frame showing summary of sample genotypes with interleaved #' genotypes for similar known individuals. #' #' @export @@ -268,7 +268,7 @@ make.dist_scale <- function(n) { #' #' @param dist_mat distance matrix as produced by #' \code{\link{summarize_dataset}} via \code{\link{make_dist_mat}}. -#' @param num.alleles the maximum number of matching/mis-matching alleles. Used +#' @param num.alleles the maximum number of matching/mismatching alleles. Used #' to determine color scaling. Defaults to the highest observed distance in #' the matrix. #' @param dist.display_thresh distance value at or below which distances will be @@ -319,9 +319,9 @@ plot_dist_mat <- function(dist_mat, num.alleles=max(dist_mat), #' Render heatmap of STR attribute across samples and loci #' #' Given a cross-sample summary data frame as produced by -#' \code{\link{analyze_dataset}} and the name of a column (e.g., Stutter, -#' Homozygous, ProminentSequences), plot a heatmap of the values for that -#' attribute, with sample identifiers on rows and loci on columns. The +#' \code{\link{analyze_dataset}} and the name of a column (e.g., \code{Stutter}, +#' \code{Homozygous}, \code{ProminentSequences}), plot a heatmap of the values +#' for that attribute, with sample identifiers on rows and loci on columns. The #' attribute will be coerced to numeric. #' #' @param results combined results list diff --git a/R/summarize_dataset.R b/R/summarize_dataset.R index 93bb141..df07ba6 100644 --- a/R/summarize_dataset.R +++ b/R/summarize_dataset.R @@ -9,24 +9,26 @@ #' #' @details #' Additional entries in the returned list: -#' * alignments: inter-allele alignments for each locus, from +#' * \code{alignments}: inter-allele alignments for each locus, from #' \code{\link{align_alleles}}. -#' * dist_mat: inter-sample distance matrix, from \code{\link{make_dist_mat}}. -#' * dist_mat_known: if genotypes.known is given, this distance matrix of -#' sample-to-individual values will be present, from -#' \code{\link{make_dist_mat_known}}. +#' * \code{dist_mat}: inter-sample distance matrix, from +#' \code{\link{make_dist_mat}}. +#' * \code{dist_mat_known}: if genotypes.known is given, this distance matrix +#' of sample-to-individual values will be present, from +#' \code{\link{make_dist_mat_known}}. #' #' If genotypes.known is given *and* a Name column is present in #' \code{results$summary}, samples will be matched with the genotypes in #' genotypes.known and additional columns will be present in the summary data #' frame: -#' * CorrectAllele1Seq: One correct allele sequence for the individual. The -#' order of this and \code{CorrectAllele2Seq} will be matched to +#' * \code{CorrectAllele1Seq}: One correct allele sequence for the individual. +#' The order of this and \code{CorrectAllele2Seq} will be matched to #' \code{Allele1Seq} and \code{Allele2Seq} if possible. See #' \code{\link{match_known_genotypes}}. 
-#' * CorrectAllele2Seq: A second correct allele sequence, as above. -#' * GenotypeResult: Categorization for each entry as Correct, Incorrect, -#' Blank, or Dropped Allele. See \code{\link{categorize_genotype_results}}. +#' * \code{CorrectAllele2Seq}: A second correct allele sequence, as above. +#' * \code{GenotypeResult}: Categorization for each entry as Correct, +#' Incorrect, Blank, or Dropped Allele. See +#' \code{\link{categorize_genotype_results}}. #' #' @md #' @@ -374,8 +376,8 @@ summarize_genotypes_known <- function(genotypes_known, tbl_genotypes=NULL) { #' #' Tabulate a single arbitrary attribute across loci, assuming repeats by two #' for the alleles. This is used for color-coding summary heatmaps (see -#' \code{\link{plot_heatmap}}) on top of the attribute values, like Homozgyous -#' or ProminentSeqs. +#' \code{\link{plot_heatmap}}) on top of the attribute values, like +#' \code{Homozygous} or \code{ProminentSeqs}. #' #' @param results_summary cross-sample summary data frame as produced by #' \code{\link{analyze_dataset}}. diff --git a/R/summarize_sample.R b/R/summarize_sample.R index 09c317f..2f76af7 100644 --- a/R/summarize_sample.R +++ b/R/summarize_sample.R @@ -22,22 +22,24 @@ sample_summary_funcs <- c("summarize_sample", #' @details #' Entries in the returned list: #' * For Allele1 and Allele2: -#' * Seq: sequence text for each allele. -#' * Count: integer count of occrrences of this exact sequence. -#' * Length: integer sequence length. -#' * Homozygous: If the sample appears homozygous (if so, the Allele2 entries -#' will be NA). -#' * Ambiguous: If a potential allele was ignored due to ambiguous bases in -#' sequence content (such as "N"). -#' * Stutter: If a potential allele was ignored due to apparent PCR stutter. -#' * Artifact: If a potential allele was ignored due to apparent PCR artifact -#' (other than stutter). -#' * CountTotal: The total number of sequences in the original sample data. -#' * CountLocus: The number of sequences matching all criteria for the +#' * \code{Seq}: sequence text for each allele. +#' * \code{Count}: integer count of occurrences of this exact sequence. +#' * \code{Length}: integer sequence length. +#' * \code{Homozygous}: If the sample appears homozygous (if so, the Allele2 +#' entries will be NA). +#' * \code{Ambiguous}: If a potential allele was ignored due to ambiguous bases +#' in sequence content (such as "N"). +#' * \code{Stutter}: If a potential allele was ignored due to apparent PCR +#' stutter. +#' * \code{Artifact}: If a potential allele was ignored due to apparent PCR +#' artifact (other than stutter). +#' * \code{CountTotal}: The total number of sequences in the original sample +#' data. +#' * \code{CountLocus}: The number of sequences matching all criteria for the #' specified locus in the original sample data. -#' * ProminentSeqs: The number of entries above the specified threshold after -#' all filtering. This should be either one (for a homozygous sample) or two -#' (for a heterozygous sample) but conditions such as cross-sample +#' * \code{ProminentSeqs}: The number of entries above the specified threshold +#' after all filtering. This should be either one (for a homozygous sample) or +#' two (for a heterozygous sample) but conditions such as cross-sample #' contamination or excessive PCR stutter can lead to more than two. #' @md #' @@ -88,9 +90,9 @@ summarize_sample <- function(sample_data, sample.attrs, counts.min) { } #' @describeIn summarize_sample Summarize a processed STR sample Using known -#' lengths. 
If ExpectedLength1 and optionally ExpectedLength2 are given in -#' \code{sample.attrs}, the \code{counts.min} threshold is ignored. See also -#' \code{\link{analyze_sample_guided}}. +#' lengths. If \code{ExpectedLength1} and optionally \code{ExpectedLength2} +#' are given in \code{sample.attrs}, the \code{counts.min} threshold is +#' ignored. See also \code{\link{analyze_sample_guided}}. #' #' @export summarize_sample_guided <- function(sample_data, sample.attrs, counts.min) { diff --git a/R/util.R b/R/util.R index 6decc8b..9fb13e9 100644 --- a/R/util.R +++ b/R/util.R @@ -124,7 +124,7 @@ order_alleles <- function(nms) { #' the given data frame. Names from the given known_alleles data frame will be #' used for recognized sequences. #' -#' @param data data frame containing Allele1Seq and Allele2Seq colums such as +#' @param data data frame containing Allele1Seq and Allele2Seq columns such as #' the first list item produced by \code{\link{analyze_dataset}}. #' @param known_alleles data frame of custom allele names as defined for #' \code{\link{load_allele_names}}. if NULL only automatically generated diff --git a/R/zz_helper_data.R b/R/zz_helper_data.R index 828fe91..8d1aaa0 100644 --- a/R/zz_helper_data.R +++ b/R/zz_helper_data.R @@ -16,6 +16,11 @@ #' This list is a bundle of shared data and functions for running unit tests. #' @export test_data <- within(list(), { + # This is a particularly awkward approach now that in the development branch + # for version 3.6.0 the random number generator has changed its behavior. + # The below is a stopgap measure but this should really be reorganized to not + # need to generate the test data at build-time. + RNGversion("3.5.3") # Careful! When running via a package check we might be in temporary # installed copy in /tmp or elsewhere, and probably won't have the "inst" # directory anymore. Alternatively when running with devtools::test() we @@ -29,6 +34,7 @@ test_data <- within(list(), { header = TRUE, stringsAsFactors = FALSE, sep = ",") + rm(f.locus_attrs) rownames(locus_attrs) <- locus_attrs$Locus sample.data.cols <- c("Seq", "Count", "Length", "MatchingLocus", "MotifMatch", @@ -153,8 +159,7 @@ test_data <- within(list(), { results <- analyze_dataset(dataset, locus_attrs, nrepeats = 3, ncores = 1, analysis_opts = list(fraction.min = 0.05), summary_opts = list(counts.min = 500)) - lapply(dataset$Filename, file.remove) - file.remove(data.dir) + unlink(data.dir, recursive = TRUE) return(list(dataset = dataset, results = results)) } diff --git a/README.md b/README.md index b5dce79..8f53aba 100644 --- a/README.md +++ b/README.md @@ -6,9 +6,12 @@ Computational, High-throughput Individual Identification through Microsatellite An R package and standalone program to analyze microsatellites in high-throughput sequencing datasets. -For automated installation and program usage see GUIDE.pdf in a -[released version](https://github.com/ShawHahnLab/chiimp/releases). -The most recent released version is [0.2.2](https://github.com/ShawHahnLab/chiimp/releases/tag/0.2.2). +For automated installation and program usage see [GUIDE.pdf] here or in a +[released version](https://github.com/ShawHahnLab/chiimp/releases), and the [worked examples]. +The most recent released version is [0.2.3](https://github.com/ShawHahnLab/chiimp/releases/tag/0.2.3). For usage as an R package also see the built-in package documentation. The package-level page (`?chiimp`) provides an overview with links to specific functions. 
+ +[GUIDE.pdf]: https://shawhahnlab.github.io/chiimp/GUIDE.pdf +[worked examples]: https://shawhahnlab.github.io/chiimp/docs diff --git a/install_windows.R b/install_windows.R index f9fef14..1d22178 100755 --- a/install_windows.R +++ b/install_windows.R @@ -21,38 +21,31 @@ if (! any(file.access(.libPaths(), 2) == 0)) { .libPaths(dp) } -cat("\n") -cat("### Installing devtools\n") -cat("\n") -install.packages("devtools", repos = "https://cloud.r-project.org") - -cat("\n") -cat("### Installing Bioconductor and MSA\n") -cat("\n") -source("https://bioconductor.org/biocLite.R") -biocLite("msa") - -cat("\n") -cat("### Installing dependencies\n") -cat("\n") -devtools::install_deps(path, dependencies = TRUE) +haspkg <- function(pkgname) { + suppressMessages(suppressWarnings( + require(pkgname, character.only = TRUE, quietly = TRUE) + )) +} -cat("\n") -cat("### Testing CHIIMP\n") -cat("\n") -status <- sum(as.data.frame(devtools::test(path))$failed) -if (status == 1) { +if (! haspkg("devtools")) { cat("\n") + cat("### Installing devtools\n") cat("\n") - cat(" Warning: Tests indicated failures.\n") + install.packages("devtools", repos = "https://cloud.r-project.org") +} + +if (! haspkg("msa")) { cat("\n") + cat("### Installing Bioconductor and MSA\n") cat("\n") + source("https://bioconductor.org/biocLite.R") + biocLite("msa", suppressUpdates = TRUE) } cat("\n") cat("### Installing CHIIMP\n") cat("\n") -devtools::install(path) +devtools::install(path, upgrade = "never") shortcut_path <- file.path(UPROF, "Desktop", "CHIIMP.lnk") chiimp_path <- system.file("bin", "chiimp.cmd", package = "chiimp") diff --git a/install_windows.cmd b/install_windows.cmd index 3441b26..4517fd3 100755 --- a/install_windows.cmd +++ b/install_windows.cmd @@ -17,3 +17,4 @@ set pkgdir=%~dp0 REM Run bulk of the install within R. "%rscript%" --vanilla "%pkgdir%\install_windows.R" +pause diff --git a/man/analyze_sample.Rd b/man/analyze_sample.Rd index 1ad7f38..6ca52c8 100644 --- a/man/analyze_sample.Rd +++ b/man/analyze_sample.Rd @@ -62,12 +62,13 @@ non-stutter artifact sequence criteria as defined by the Artifact column of \code{\link{summarize_sample}}. \item \code{analyze_sample_guided}: version of sample analysis guided by expected -sequence length values. Additional items ExpectedLength1 and optionally -ExpectedLength2 can be supplied in the \code{sample.attrs} list. If NA or -missing the behavior will match \code{analyze_sample}. If two expected -lengths are given, the fraction.min argument is ignored. If at least one -expected length is given, the stutter/artifact filtering is disabled. From -here use \code{\link{summarize_sample_guided}}. +sequence length values. Additional items \code{ExpectedLength1} and +optionally \code{ExpectedLength2} can be supplied in the +\code{sample.attrs} list. If NA or missing the behavior will match +\code{analyze_sample}. If two expected lengths are given, the fraction.min +argument is ignored. If at least one expected length is given, the +stutter/artifact filtering is disabled. From here use +\code{\link{summarize_sample_guided}}. \item \code{analyze_sample_naive}: version of sample analysis without stutter/artifact filtering. From here use \code{\link{summarize_sample}} diff --git a/man/analyze_seqs.Rd b/man/analyze_seqs.Rd index 62116cf..b19096b 100644 --- a/man/analyze_seqs.Rd +++ b/man/analyze_seqs.Rd @@ -26,27 +26,27 @@ all loci are treated equally. 
\details{ Columns in the returned data frame: \itemize{ -\item Seq: sequence text for each unique sequence -\item Count: integer count of occurrences of this exact sequence -\item Length: integer sequence length -\item MatchingLocus: factor for the name of the locus matching each sequence, -by checking the primer -\item MotifMatch: logical: are there are least \code{nrepeats} perfect +\item \code{Seq}: sequence text for each unique sequence +\item \code{Count}: integer count of occurrences of this exact sequence +\item \code{Length}: integer sequence length +\item \code{MatchingLocus}: factor for the name of the locus matching each +sequence, by checking the primer +\item \code{MotifMatch}: logical: are there are least \code{nrepeats} perfect adjacent repeats of the STR motif for the matching locus? -\item LengthMatch: logical: is the sequence length within the expected range -for the matching locus? -\item Ambiguous: logical: are there unexpected characters in the sequence +\item \code{LengthMatch}: logical: is the sequence length within the expected +range for the matching locus? +\item \code{Ambiguous}: logical: are there unexpected characters in the sequence content? -\item Stutter: integer: for any sequence that looks like potential PCR stutter, -the index of the row that may be the source of the stutter band. -\item Artifact: integer: for any sequence that looks like potential PCR artifact -(other than stutter), the index of the row that may be the source of the -stutter band. -\item FractionOfTotal: numeric fraction of the number of sequences +\item \code{Stutter}: integer: for any sequence that looks like potential PCR +stutter, the index of the row that may be the source of the stutter band. +\item \code{Artifact}: integer: for any sequence that looks like potential PCR +artifact (other than stutter), the index of the row that may be the source +of the stutter band. +\item \code{FractionOfTotal}: numeric fraction of the number of sequences represented by each unique sequence compared to the total. -\item FractionOfLocus: numeric fraction of the number of sequences represented -by each unique sequence compared to the total for that particular -matching locus. +\item \code{FractionOfLocus}: numeric fraction of the number of sequences +represented by each unique sequence compared to the total for that +particular matching locus. } } \examples{ diff --git a/man/categorize_genotype_results.Rd b/man/categorize_genotype_results.Rd index 11066fa..7abbefb 100644 --- a/man/categorize_genotype_results.Rd +++ b/man/categorize_genotype_results.Rd @@ -16,10 +16,10 @@ factor defining genotyping result category for every row of the input data frame. } \description{ -For a given results summary data frame that has CorrectAllele1Seq and Correct -Allele2Seq columns (such as produced by \code{\link{match_known_genotypes}}) -added, create a factor labeling every row of the input data frame by its -genotyping outcome. +For a given results summary data frame that has \code{CorrectAllele1Seq} and +\code{CorrectAllele2Seq} columns (such as produced by +\code{\link{match_known_genotypes}}) added, create a factor labeling every +row of the input data frame by its genotyping outcome. } \details{ Levels in the returned factor, in order: @@ -32,6 +32,6 @@ were supplied. but no second allele was called. } -Cases that should not occur, such as CorrectAllele1Seq and CorrectAllele2Seq -both set to NA, map to NA in the returned factor. 
+Cases that should not occur, such as \code{CorrectAllele1Seq} and +\code{CorrectAllele2Seq} both set to NA, map to NA in the returned factor. } diff --git a/man/chiimp-package.Rd b/man/chiimp-package.Rd index 2ba8a89..3860e32 100644 --- a/man/chiimp-package.Rd +++ b/man/chiimp-package.Rd @@ -76,55 +76,56 @@ For defaults used in the configuration, see \code{\link{config.defaults}}. The workflow above outlines CHIIMP's behavior when called as a standalone program, where \code{\link{main}} loads a configuration file into a nested list of options and calls \code{\link{full_analysis}}. The public functions -linked above can also be used idependently; see the documentation and code +linked above can also be used independently; see the documentation and code examples for the individual functions for more information. \strong{The Package structure of the source files, grouped by topic:} \itemize{ \item Main Interface: \itemize{ -\item chiimp.R: Main entry point for command-line usage (\code{\link{main}}) -and R usage (\code{\link{full_analysis}}). +\item \code{chiimp.R}: Main entry point for command-line usage +(\code{\link{main}}) and R usage (\code{\link{full_analysis}}). } \item Data Analysis: \itemize{ -\item analyze_dataset.R: High-level interface to analyze all samples across a -given dataset (\code{\link{analyze_dataset}}); used by +\item \code{analyze_dataset.R}: High-level interface to analyze all samples +across a given dataset (\code{\link{analyze_dataset}}); used by \code{\link{full_analysis}} to manage the main part of the processing. -\item summarize_dataset.R: High-level interface to provide inter-sample and -inter-locus analyses (\code{\link{summarize_dataset}}); used by +\item \code{summarize_dataset.R}: High-level interface to provide inter-sample +and inter-locus analyses (\code{\link{summarize_dataset}}); used by \code{\link{full_analysis}} to manage the second stage of the processing. -\item analyze_seqs.R: Low-level interface to convert raw sequence input to a -data frame of unique sequences (\code{\link{analyze_seqs}}); used by -\code{\link{analyze_dataset}}. -\item analyze_sample.R: Low-level interface to extract per-locus details from -a data frame of unique sequences (\code{\link{analyze_sample}}); used by -\code{\link{analyze_dataset}}. -\item summarize_sample.R: Low-level interface to condense each sample data -frame into a a concise list of consistent attributes, suitable for +\item \code{analyze_seqs.R}: Low-level interface to convert raw sequence input +to a data frame of unique sequences (\code{\link{analyze_seqs}}); used +by \code{\link{analyze_dataset}}. +\item \code{analyze_sample.R}: Low-level interface to extract per-locus +details from a data frame of unique sequences +(\code{\link{analyze_sample}}); used by \code{\link{analyze_dataset}}. +\item \code{summarize_sample.R}: Low-level interface to condense each sample +data frame into a a concise list of consistent attributes, suitable for binding together across samples for a dataset (\code{\link{summarize_sample}}); used by \code{\link{analyze_dataset}}. -\item categorize.R: Low-level helper functions used by +\item \code{categorize.R}: Low-level helper functions used by \code{\link{summarize_dataset}} for samples with known identity. } \item Plotting and reporting: \itemize{ -\item report.R: Various plotting and summarizing functions used when rendering -a report in \code{\link{full_analysis}}. 
-\item histogram.R: Sequence histogram plotting tools (\code{\link{histogram}}) -as used during \code{\link{full_analysis}}. -\item markdown.R: Various helper functions for adding tables and plots to an R -Markdown report as used in \code{\link{full_analysis}}. +\item \code{report.R}: Various plotting and summarizing functions used when +rendering a report in \code{\link{full_analysis}}. +\item \code{histogram.R}: Sequence histogram plotting tools +(\code{\link{histogram}}) as used during \code{\link{full_analysis}}. +\item \code{markdown.R}: Various helper functions for adding tables and plots +to an R Markdown report as used in \code{\link{full_analysis}}. } \item Utility Functions and Configuration: \itemize{ -\item configuration.R: The default configuration options +\item \code{configuration.R}: The default configuration options (\code{\link{config.defaults}}) used by \code{\link{full_analysis}}. -\item io.R: various helper input/output functions used loading and saving -sequence data files, spreadsheets, and plots used in multiple parts of the +\item \code{io.R}: various helper input/output functions used loading and +saving sequence data files, spreadsheets, and plots used in multiple +parts of the package. +\item \code{util.R}: Various helper functions used in multiple parts of the package. -\item util.R: Various helper functions used in multiple parts of the package. } } } diff --git a/man/config.defaults.Rd b/man/config.defaults.Rd index 18c3a0c..ee7df56 100644 --- a/man/config.defaults.Rd +++ b/man/config.defaults.Rd @@ -19,22 +19,22 @@ itself to see all of the build-time defaults. \details{ Notable Options: \itemize{ -\item dataset_opts: +\item \code{dataset_opts}: \itemize{ -\item dp: directory path to input sequence files -\item pattern: regular expression for the input filename pattern -\item ord: order of fields Replicate, Sample, and Locus in in the input -filename pattern. For example, if Locus is the first field followed by -Replicate and Sample, set \code{ord=c(3, 1, 2)}. +\item \code{dp}: directory path to input sequence files +\item \code{pattern}: regular expression for the input filename pattern +\item \code{ord}: order of fields Replicate, Sample, and Locus in in the +input filename pattern. For example, if Locus is the first field +followed by Replicate and Sample, set \code{ord=c(3, 1, 2)}. } -\item output: +\item \code{output}: \itemize{ -\item dp: directory path for saving output data +\item \code{dp}: directory path for saving output data } -\item fp_dataset: file path to table of sample attributes to use, rather than -detecting via dataset_opts -\item fp_locus_attrs: file path to locus attributes CSV file -\item fp_genotypes_known: file path to known genotypes CSV file +\item \code{fp_dataset}: file path to table of sample attributes to use, rather +than detecting via dataset_opts +\item \code{fp_locus_attrs}: file path to locus attributes CSV file +\item \code{fp_genotypes_known}: file path to known genotypes CSV file } } \keyword{datasets} diff --git a/man/find_artifact.Rd b/man/find_artifact.Rd index 4f524ad..48e4b40 100644 --- a/man/find_artifact.Rd +++ b/man/find_artifact.Rd @@ -23,8 +23,8 @@ integer vector specifying, for each entry, the row index for another Searches a processed STR sample for entries that may be PCR artifacts, other than stutter, from another entry in the sample. Potential artifacts are sequences with counts lower than another sequence by a given ratio and -sequence length within 1 bp of the other sequence. 
This only considers -STR-labeled rows and requires a given entry to have counts at most +sequence length within 1 nucleotide of the other sequence. This only +considers STR-labeled rows and requires a given entry to have counts at most \code{count.ratio_max} compared to the candidate "source" entry to be considered an artifact. Sequence content is not currently considered, just relative sequence lengths and counts. diff --git a/man/full_analysis.Rd b/man/full_analysis.Rd index 2f82d44..5bc9774 100644 --- a/man/full_analysis.Rd +++ b/man/full_analysis.Rd @@ -37,5 +37,4 @@ config_path <- system.file("example_config.yml", package = "chiimp") config <- load_config(config_path) results <- full_analysis(config) - } diff --git a/man/load_locus_attrs.Rd b/man/load_locus_attrs.Rd index d985235..a981c85 100644 --- a/man/load_locus_attrs.Rd +++ b/man/load_locus_attrs.Rd @@ -23,14 +23,14 @@ frame then used by \code{\link{analyze_seqs}} within \details{ Columns Required: \itemize{ -\item Locus: Unique identifier for a given locus -\item LengthMin: Minimum known allele sequence length for this locus -\item LengthMax: Minimum known allele sequence length for this locus -\item LengthBuffer: Additional sequence length below LengthMin and above -LengthMax to accept for a candidate allele -\item Primer: The forward PCR primer sequence for a given locus, used when -matching sequences to loci -\item ReversePrimer: The reverse PCR primer sequence +\item \code{Locus}: Unique identifier for a given locus +\item \code{LengthMin}: Minimum known allele sequence length for this locus +\item \code{LengthMax}: Minimum known allele sequence length for this locus +\item \code{LengthBuffer}: Additional sequence length below \code{LengthMin} +and above \code{LengthMax} to accept for a candidate allele +\item \code{Primer}: The forward PCR primer sequence for a given locus, used +when matching sequences to loci +\item \code{ReversePrimer}: The reverse PCR primer sequence } } \examples{ diff --git a/man/load_seqs.Rd b/man/load_seqs.Rd index 19d527f..b749013 100644 --- a/man/load_seqs.Rd +++ b/man/load_seqs.Rd @@ -14,6 +14,6 @@ vector of sequences } \description{ Load a vector of character sequences from the given path. This is just a -wrapper around dnar to choose the parser based on filename. Only the -sequences are returned, not IDs or quality scores. +wrapper around \code{\link[dnar:read.fa]{dnar}} to choose the parser based on +filename. Only the sequences are returned, not IDs or quality scores. } diff --git a/man/match_known_genotypes.Rd b/man/match_known_genotypes.Rd index aadd7d7..7022d40 100644 --- a/man/match_known_genotypes.Rd +++ b/man/match_known_genotypes.Rd @@ -21,8 +21,8 @@ data frame with two columns for the two correct alleles, and rows \description{ Using the Name column of the given results summary data frame, pair each called genotype with the known alleles. A data frame with two columns, -CorrectAllele1Seq and CorrectAllele2Seq, is returned. If matching entries are -found in Allele1Seq and/or Allele2Seq the order will be preserved, and at -this point the two allele entries should match up directly for genotypes that -were called correctly. +\code{CorrectAllele1Seq} and \code{CorrectAllele2Seq}, is returned. If +matching entries are found in \code{Allele1Seq} and/or \code{Allele2Seq} the +order will be preserved, and at this point the two allele entries should +match up directly for genotypes that were called correctly. 
 }
diff --git a/man/name_alleles_in_table.Rd b/man/name_alleles_in_table.Rd
index b888895..02fee4f 100644
--- a/man/name_alleles_in_table.Rd
+++ b/man/name_alleles_in_table.Rd
@@ -7,7 +7,7 @@
 name_alleles_in_table(data, known_alleles = NULL, name_args = list())
 }
 \arguments{
-\item{data}{data frame containing Allele1Seq and Allele2Seq colums such as
+\item{data}{data frame containing Allele1Seq and Allele2Seq columns such as
 the first list item produced by \code{\link{analyze_dataset}}.}
 
 \item{known_alleles}{data frame of custom allele names as defined for
diff --git a/man/name_known_sequences.Rd b/man/name_known_sequences.Rd
index 7be5abf..e6ba414 100644
--- a/man/name_known_sequences.Rd
+++ b/man/name_known_sequences.Rd
@@ -19,16 +19,17 @@ generated for the summary will be used.}
 }
 \value{
 list of results, with \code{summary} set to the single summary data
-  frame and \code{data} the per-sample data frames. A "SeqName" column in
-  sample data frames and "Allele1Name" and "Allele2Name" columns in the
-  summary data frame will associate any sequence matching a known allele (for
-  either the given table or the current dataset) with a text name.
+  frame and \code{data} the per-sample data frames. A \code{SeqName} column
+  in sample data frames and \code{Allele1Name} and \code{Allele2Name} columns
+  in the summary data frame will associate any sequence matching a known
+  allele (for either the given table or the current dataset) with a text
+  name.
 }
 \description{
 For the given results list (pair of summary data frame and list of per-sample
 data frames as produced by \code{\link{tidy_analyzed_dataset}}), add columns
 to all data frames defining names for recognized sequences. For the summary
-data frame this will be Allele1Name and Allele2Name. For each sample data
-frame this will be SeqName, defined for any sequences represented in the
-summary or in a given known alleles set.
+data frame this will be \code{Allele1Name} and \code{Allele2Name}. For each
+sample data frame this will be \code{SeqName}, defined for any sequences
+represented in the summary or in a given known alleles set.
 }
diff --git a/man/plot_dist_mat.Rd b/man/plot_dist_mat.Rd
index 7cd1342..258b00a 100644
--- a/man/plot_dist_mat.Rd
+++ b/man/plot_dist_mat.Rd
@@ -11,7 +11,7 @@ plot_dist_mat(dist_mat, num.alleles = max(dist_mat),
 \item{dist_mat}{distance matrix as produced by
 \code{\link{summarize_dataset}} via \code{\link{make_dist_mat}}.}
 
-\item{num.alleles}{the maximum number of matching/mis-matching alleles. Used
+\item{num.alleles}{the maximum number of matching/mismatching alleles. Used
 to determine color scaling. Defaults to the highest observed distance in
 the matrix.}
diff --git a/man/plot_heatmap.Rd b/man/plot_heatmap.Rd
index 26861e8..9abb8c1 100644
--- a/man/plot_heatmap.Rd
+++ b/man/plot_heatmap.Rd
@@ -25,8 +25,8 @@ lengths.}
 }
 \description{
 Given a cross-sample summary data frame as produced by
-\code{\link{analyze_dataset}} and the name of a column (e.g., Stutter,
-Homozygous, ProminentSequences), plot a heatmap of the values for that
-attribute, with sample identifiers on rows and loci on columns. The
+\code{\link{analyze_dataset}} and the name of a column (e.g., \code{Stutter},
+\code{Homozygous}, \code{ProminentSequences}), plot a heatmap of the values
+for that attribute, with sample identifiers on rows and loci on columns. The
 attribute will be coerced to numeric.
 }
diff --git a/man/report_idents.Rd b/man/report_idents.Rd
index 168dbbc..bbba4cd 100644
--- a/man/report_idents.Rd
+++ b/man/report_idents.Rd
@@ -16,7 +16,7 @@ report_idents(results, closest, na.replicates = "")
 column.}
 }
 \value{
-data frame showing summary of sample genotypes with interleved
+data frame showing summary of sample genotypes with interleaved
 genotypes for similar known individuals.
 }
 \description{
diff --git a/man/save_alignments.Rd b/man/save_alignments.Rd
index 8356c93..45e938f 100644
--- a/man/save_alignments.Rd
+++ b/man/save_alignments.Rd
@@ -14,7 +14,7 @@ of each alignment will be used for its filename.}
 \item{dp}{output directory path.}
 }
 \description{
-Take a list of alignments, one per locus, and save each to a separate fasta
+Take a list of alignments, one per locus, and save each to a separate FASTA
 file in a specified directory. If any of the per-locus alignment objects is
 NA it will be skipped. These are produced by \code{\link{summarize_dataset}}
 via \code{\link{align_alleles}}.
diff --git a/man/save_seqfile_data.Rd b/man/save_seqfile_data.Rd
index c0f8d12..5031456 100644
--- a/man/save_seqfile_data.Rd
+++ b/man/save_seqfile_data.Rd
@@ -17,7 +17,7 @@ Save each per-file data frame produced by \code{\link{analyze_dataset}} (via
 \code{\link{analyze_seqs}}) to a separate file in the specified directory
 path, in CSV format. The directory structure will start at the first shared
 directory of the input file paths.
-For example, if the inputs were /data/run1/file.fastq and
-/data/run2/file.fastq there will be run1 and run2 directories inside the
-given `dp` directory.
+For example, if the inputs were \code{/data/run1/file.fastq} and
+\code{/data/run2/file.fastq} there will be run1 and run2 directories inside
+the given \code{dp} directory.
 }
diff --git a/man/summarize_attribute.Rd b/man/summarize_attribute.Rd
index e996ea4..7e60363 100644
--- a/man/summarize_attribute.Rd
+++ b/man/summarize_attribute.Rd
@@ -21,6 +21,6 @@ data frame of attribute across samples and loci.
 \description{
 Tabulate a single arbitrary attribute across loci, assuming repeats by two
 for the alleles. This is used for color-coding summary heatmaps (see
-\code{\link{plot_heatmap}}) on top of the attribute values, like Homozgyous
-or ProminentSeqs.
+\code{\link{plot_heatmap}}) on top of the attribute values, like
+\code{Homozygous} or \code{ProminentSeqs}.
 }
diff --git a/man/summarize_dataset.Rd b/man/summarize_dataset.Rd
index a1dad0b..8b9c03e 100644
--- a/man/summarize_dataset.Rd
+++ b/man/summarize_dataset.Rd
@@ -25,11 +25,12 @@ additional entries for inter-sample and inter-locus analyses.
 \details{
 Additional entries in the returned list:
 \itemize{
-\item alignments: inter-allele alignments for each locus, from
+\item \code{alignments}: inter-allele alignments for each locus, from
 \code{\link{align_alleles}}.
-\item dist_mat: inter-sample distance matrix, from \code{\link{make_dist_mat}}.
-\item dist_mat_known: if genotypes.known is given, this distance matrix of
-sample-to-individual values will be present, from
+\item \code{dist_mat}: inter-sample distance matrix, from
+\code{\link{make_dist_mat}}.
+\item \code{dist_mat_known}: if genotypes.known is given, this distance matrix
+of sample-to-individual values will be present, from
 \code{\link{make_dist_mat_known}}.
 }
 
@@ -38,12 +39,13 @@ If genotypes.known is given \emph{and} a Name column is present in
 genotypes.known and additional columns will be present in the summary data
 frame:
 \itemize{
-\item CorrectAllele1Seq: One correct allele sequence for the individual. The
-order of this and \code{CorrectAllele2Seq} will be matched to
+\item \code{CorrectAllele1Seq}: One correct allele sequence for the individual.
+The order of this and \code{CorrectAllele2Seq} will be matched to
 \code{Allele1Seq} and \code{Allele2Seq} if possible. See
 \code{\link{match_known_genotypes}}.
-\item CorrectAllele2Seq: A second correct allele sequence, as above.
-\item GenotypeResult: Categorization for each entry as Correct, Incorrect,
-Blank, or Dropped Allele. See \code{\link{categorize_genotype_results}}.
+\item \code{CorrectAllele2Seq}: A second correct allele sequence, as above.
+\item \code{GenotypeResult}: Categorization for each entry as Correct,
+Incorrect, Blank, or Dropped Allele. See
+\code{\link{categorize_genotype_results}}.
 }
 }
diff --git a/man/summarize_sample.Rd b/man/summarize_sample.Rd
index 09afe31..057686b 100644
--- a/man/summarize_sample.Rd
+++ b/man/summarize_sample.Rd
@@ -38,23 +38,25 @@ Entries in the returned list:
 \itemize{
 \item For Allele1 and Allele2:
 \itemize{
-\item Seq: sequence text for each allele.
-\item Count: integer count of occrrences of this exact sequence.
-\item Length: integer sequence length.
+\item \code{Seq}: sequence text for each allele.
+\item \code{Count}: integer count of occurrences of this exact sequence.
+\item \code{Length}: integer sequence length.
 }
-\item Homozygous: If the sample appears homozygous (if so, the Allele2 entries
-will be NA).
-\item Ambiguous: If a potential allele was ignored due to ambiguous bases in
-sequence content (such as "N").
-\item Stutter: If a potential allele was ignored due to apparent PCR stutter.
-\item Artifact: If a potential allele was ignored due to apparent PCR artifact
-(other than stutter).
-\item CountTotal: The total number of sequences in the original sample data.
-\item CountLocus: The number of sequences matching all criteria for the
+\item \code{Homozygous}: If the sample appears homozygous (if so, the Allele2
+entries will be NA).
+\item \code{Ambiguous}: If a potential allele was ignored due to ambiguous bases
+in sequence content (such as "N").
+\item \code{Stutter}: If a potential allele was ignored due to apparent PCR
+stutter.
+\item \code{Artifact}: If a potential allele was ignored due to apparent PCR
+artifact (other than stutter).
+\item \code{CountTotal}: The total number of sequences in the original sample
+data.
+\item \code{CountLocus}: The number of sequences matching all criteria for the
 specified locus in the original sample data.
-\item ProminentSeqs: The number of entries above the specified threshold after
-all filtering. This should be either one (for a homozygous sample) or two
-(for a heterozygous sample) but conditions such as cross-sample
+\item \code{ProminentSeqs}: The number of entries above the specified threshold
+after all filtering. This should be either one (for a homozygous sample) or
+two (for a heterozygous sample) but conditions such as cross-sample
 contamination or excessive PCR stutter can lead to more than two.
 }
 }
@@ -63,8 +65,8 @@ contamination or excessive PCR stutter can lead to more than two.
 \item \code{summarize_sample}: Default version of sample summary.
 
 \item \code{summarize_sample_guided}: Summarize a processed STR sample Using known
-lengths. If ExpectedLength1 and optionally ExpectedLength2 are given in
-\code{sample.attrs}, the \code{counts.min} threshold is ignored. See also
-\code{\link{analyze_sample_guided}}.
+lengths. If \code{ExpectedLength1} and optionally \code{ExpectedLength2}
+are given in \code{sample.attrs}, the \code{counts.min} threshold is
+ignored. See also \code{\link{analyze_sample_guided}}.
 }}
diff --git a/man/tabulate_allele_names.Rd b/man/tabulate_allele_names.Rd
index bafc3b6..b716861 100644
--- a/man/tabulate_allele_names.Rd
+++ b/man/tabulate_allele_names.Rd
@@ -7,7 +7,7 @@
 tabulate_allele_names(data, extra_cols = NULL)
 }
 \arguments{
-\item{data}{data frame containing Allele1Name and Allele2Name colums such as
+\item{data}{data frame containing Allele1Name and Allele2Name columns such as
 the first list item produced by \code{\link{analyze_dataset}}. If allele
 names are not yet present call \code{\link{name_alleles_in_table}}.}
diff --git a/man/test_data.Rd b/man/test_data.Rd
index 41a73b6..de1bd8e 100644
--- a/man/test_data.Rd
+++ b/man/test_data.Rd
@@ -4,7 +4,7 @@
 \name{test_data}
 \alias{test_data}
 \title{Helper Data for Tests}
-\format{An object of class \code{list} of length 18.}
+\format{An object of class \code{list} of length 17.}
 \usage{
 test_data
 }
diff --git a/tests/testthat/test_analyze_dataset.R b/tests/testthat/test_analyze_dataset.R
index 1f72c82..e3f95fb 100644
--- a/tests/testthat/test_analyze_dataset.R
+++ b/tests/testthat/test_analyze_dataset.R
@@ -28,8 +28,7 @@ with(test_data, {
                               summary_opts = list(counts.min = 500),
                               nrepeats = 3,
                               ncores = 1)
-    lapply(dataset$Filename, file.remove)
-    file.remove(data.dir)
+    unlink(x = data.dir, recursive = TRUE)
    # Check the overall structure
    expect_equal(sapply(results, class),
                 c(summary = "data.frame",
@@ -47,8 +46,7 @@ with(test_data, {
                               summary_opts = list(counts.min = 500),
                               nrepeats = 3,
                               ncores = 1)
-    lapply(dataset$Filename, file.remove)
-    file.remove(data.dir)
+    unlink(x = data.dir, recursive = TRUE)
    # Check the summary data frame
    with(results$summary, {
      # First update ordering of dataset's rows. The existing order should be
@@ -123,8 +121,7 @@ with(test_data, {
                               nrepeats = 3,
                               ncores = 1,
                               known_alleles = known_alleles)
-    lapply(dataset$Filename, file.remove)
-    file.remove(data.dir)
+    unlink(x = data.dir, recursive = TRUE)
    # Check that the resulting allele names match all the expected values
    with(results$summary, {
@@ -182,6 +179,7 @@ with(test_data, {
                                 nrepeats = 3,
                                 ncores = 1)
    }, "ERROR: Locus names in dataset not in attributes table: a, b")
+    unlink(x = data.dir, recursive = TRUE)
  })
 
  test_that("analyze_dataset warns of empty input files", {
@@ -202,6 +200,7 @@ with(test_data, {
    }, type = "message")
    msg_exp <- "WARNING: Zero reads for 1 of 12 data files"
    expect_true(length(grep(msg_exp, msg)) == 1)
+    unlink(x = data.dir, recursive = TRUE)
  })
 })
diff --git a/tests/testthat/test_io.R b/tests/testthat/test_io.R
index 62499c8..25e4341 100644
--- a/tests/testthat/test_io.R
+++ b/tests/testthat/test_io.R
@@ -119,11 +119,12 @@ with(test_data, {
   setwd(data.dir)
   touch(dataset_known$Filename)
   # Write dataset CSV
-  fp <- tempfile()
+  fp <- tempfile(tmpdir = data.dir)
   write.csv(dataset_known, file = fp, na = "", row.names = FALSE)
   expect_silent({
     dataset <- load_dataset(fp)
   })
+  unlink(x = data.dir, recursive = TRUE)
   expect_identical(dataset, dataset_known)
 })
@@ -134,13 +135,14 @@ with(test_data, {
   data.dir <- tempfile()
   dir.create(data.dir)
   setwd(data.dir)
-  fp <- tempfile()
+  fp <- tempfile(tmpdir = data.dir)
   write.csv(dataset_known, file = fp, na = "", row.names = FALSE)
   # expect_message and capture_messages both do NOT catch text send to stderr,
   # though capture.output(..., type = "message") does.
   msg <- capture.output({
     dataset <- load_dataset(fp)
   }, type = "message")
+  unlink(x = data.dir, recursive = TRUE)
   expect_true(length(grep("WARNING: Missing 60 of 60 data files", msg)) == 1)
   expect_identical(dataset, dataset_known)
 })
@@ -157,9 +159,10 @@ with(test_data, {
   setwd(data.dir)
   dataset_known <- setup_dataset()
   touch(dataset_known$Filename)
-  fp <- tempfile()
+  fp <- tempfile(tmpdir = data.dir)
   save_dataset(dataset_known, fp)
   dataset <- load_dataset(fp)
+  unlink(x = data.dir, recursive = TRUE)
   expect_identical(dataset, dataset_known)
 })
@@ -174,6 +177,7 @@ with(test_data, {
   # by default the field ordering is assumed to be replicate, sample, locus
   data <- setup_data_dir(replicates, samples, loci)
   dataset <- prepare_dataset(data$dp, data$pattern)
+  unlink(x = data$dp, recursive = TRUE)
   expect_equal(colnames(dataset),
               c("Filename", "Replicate", "Sample", "Locus"))
   expect_equal(sort(dataset$Filename), sort(data$fps))
@@ -199,6 +203,7 @@ with(test_data, {
   ord <- c(3, 1, 2)
   data <- setup_data_dir(replicates, samples, loci, ord)
   dataset <- prepare_dataset(data$dp, "([A-Za-z0-9]+)-(\\d+)-(\\d+)", ord)
+  unlink(x = data$dp, recursive = TRUE)
   expect_equal(colnames(dataset),
               c("Filename", "Locus", "Replicate", "Sample"))
   expect_equal(sort(dataset$Filename), sort(data$fps))
@@ -220,6 +225,7 @@ with(test_data, {
   # warning.
   data <- setup_data_dir(replicates, samples, loci)
   expect_warning(dataset <- prepare_dataset(data$dp, "(\\d+)-(\\d+)"))
+  unlink(x = data$dp, recursive = TRUE)
 })
 
 test_that("prepare_dataset warns of repeated identifier rows", {
@@ -234,6 +240,7 @@ with(test_data, {
     dataset <- prepare_dataset(data$dp, data$pattern)
   }, "Some replicate/sample/locus combinations match multiple files")
+  unlink(x = data$dp, recursive = TRUE)
 })
 
 test_that("prepare_dataset can autolabel replicates", {
@@ -247,6 +254,7 @@ with(test_data, {
   dataset <- prepare_dataset(data$dp,
                              pattern = "()1-(\\d+)-([A-Za-z0-9]+)",
                              autorep = TRUE)
+  unlink(x = data$dp, recursive = TRUE)
   extras <- paste0(data$fps[3], c(".2", ".3"))
   expect_equal(sort(dataset$Filename), sort(c(data$fps, extras)))
   expect_equal(as.character(dataset$Locus),
@@ -278,6 +286,7 @@ with(test_data, {
                c("Filename", "Replicate", "Sample", "Locus"))
   expect_equal(sort(dataset$Filename),
                sort(list.files(dp, recursive = TRUE, full.names = TRUE)))
+  unlink(x = c(data1, data2, dp), recursive = TRUE)
 })
 
 test_that("prepare_dataset can separate multiplexed samples", {
@@ -302,6 +311,7 @@ with(test_data, {
   dataset_known$Replicate <- as.integer(dataset_known$Replicate)
   # Read dataset from disk using the mapping of locus names
   dataset <- prepare_dataset(data$dp, data$pattern, locusmap = locusmap)
+  unlink(x = data$dp, recursive = TRUE)
   # Aside from the different filenames, does everything match up?
   dataset_known$Filename <- dataset$Filename
   expect_equal(dataset, dataset_known)
@@ -310,16 +320,21 @@ test_that("prepare_dataset handles missing data directory", {
   dp <- tempfile()
   expect_error({
-    prepare_dataset(dp, "(\\d+)-(\\d+)-([A-Za-z0-9]+)")
-  }, paste("ERROR: directory path for data files does not exist:", dp))
+    prepare_dataset(dp, "(\\d+)-(\\d+)-([A-Za-z0-9]+)")
+  },
+  paste("ERROR: directory path for data files does not exist:", dp),
+  fixed = TRUE)
 })
 
 test_that("prepare_dataset handles no-samples case", {
   dp <- tempfile()
   dir.create(dp)
   expect_error({
-    prepare_dataset(dp, "(\\d+)-(\\d+)-([A-Za-z0-9]+)")
-  }, paste("ERROR: no data files found:", dp))
+    prepare_dataset(dp, "(\\d+)-(\\d+)-([A-Za-z0-9]+)")
+  },
+  paste("ERROR: no data files found:", dp),
+  fixed = TRUE)
+  unlink(x = dp, recursive = TRUE)
 })
@@ -340,10 +355,14 @@ with(test_data, {
                              ncores = 1)
   dp_out <- file.path(data.dir, "results", "processed_files")
   save_seqfile_data(results$files, dp_out)
-  fps_expected <- sort(paste0(names(results$files), ".csv"))
-  fps_observed <- sort(file.path(data.dir,
-                                 list.files(dp_out, recursive = TRUE)))
+  fps_expected <- sort(file.path(dp_out,
+                                 paste0(basename(names(results$files)),
+                                        ".csv")))
+  fps_observed <- sort(list.files(dp_out,
+                                  recursive = TRUE,
+                                  full.names = TRUE))
   expect_equal(fps_observed, fps_expected)
+  unlink(x = data.dir, recursive = TRUE)
 })
 
 test_that("save_seqfile_data works with directory trees", {
@@ -363,10 +382,15 @@ with(test_data, {
                              ncores = 1)
   dp_out <- file.path(data.dir, "results", "processed_files")
   save_seqfile_data(results$files, dp_out)
-  fps_expected <- sort(paste0(names(results$files), ".csv"))
-  fps_observed <- sort(file.path(data.dir,
-                                 list.files(dp_out, recursive = TRUE)))
+  fps_expected <- sort(file.path(dp_out,
+                                 basename(dirname(names(results$files))),
+                                 paste0(basename(names(results$files)),
+                                        ".csv")))
+  fps_observed <- sort(list.files(dp_out,
+                                  recursive = TRUE,
+                                  full.names = TRUE))
   expect_equal(fps_observed, fps_expected)
+  unlink(x = data.dir, recursive = TRUE)
 })
 
 test_that("save_seqfile_data works with Windows-style paths", {
@@ -389,10 +413,19 @@ with(test_data, {
   names(results$files) <- gsub("/", "\\\\", names(results$files))
   save_seqfile_data(results$files, dp_out)
   names(results$files) <- gsub("\\\\", "/", names(results$files))
-  fps_expected <- sort(paste0(names(results$files), ".csv"))
-  fps_observed <- sort(file.path(data.dir,
-                                 list.files(dp_out, recursive = TRUE)))
+  fps_expected <- sort(file.path(dp_out,
+                                 basename(dirname(names(results$files))),
+                                 paste0(basename(names(results$files)),
+                                        ".csv")))
+  fps_observed <- sort(list.files(dp_out,
+                                  recursive = TRUE,
+                                  full.names = TRUE))
+  # Normalize any lingering \ or / inconsistencies, so this test should also
+  # pass on Windows itself.
+  fps_expected <- normalizePath(fps_expected)
+  fps_observed <- normalizePath(fps_observed)
   expect_equal(fps_observed, fps_expected)
+  unlink(x = data.dir, recursive = TRUE)
 })
 
 })
diff --git a/tests/testthat/test_report.R b/tests/testthat/test_report.R
index 39d9d8c..64133ee 100644
--- a/tests/testthat/test_report.R
+++ b/tests/testthat/test_report.R
@@ -11,6 +11,7 @@ with(test_data, {
   png(fp_img)
   plot_data <- plot_alignment(alignments[["A"]])
   dev.off()
+  unlink(x = fp_img)
   groups <- c(" 162 bp", " 178 bp", " 182 bp", " 194 bp")
   groups <- factor(groups)
   labels <- c("2", "1", "2", "1")
@@ -88,6 +89,7 @@ with(test_data, {
     png(fp_img)
     plot_data <- plot_heatmap(results, "Stutter")
     dev.off()
+    unlink(x = fp_img)
     expect_equal(class(plot_data), "pheatmap")
   })
 })
@@ -104,6 +106,7 @@ with(test_data, {
     png(fp_img)
     plot_data <- plot_heatmap(results, "Stutter")
     dev.off()
+    unlink(x = fp_img)
     expect_equal(class(plot_data), "pheatmap")
   })
 })
@@ -118,6 +121,7 @@ with(test_data, {
     png(fp_img)
     plot_data <- plot_heatmap(results, "Stutter")
     dev.off()
+    unlink(x = fp_img)
     expect_equal(class(plot_data), "pheatmap")
   })
 })
@@ -133,6 +137,7 @@ with(test_data, {
     png(fp_img)
     plot_data <- plot_heatmap(results, "Stutter")
     dev.off()
+    unlink(x = fp_img)
     expect_equal(class(plot_data), "pheatmap")
   })
 })
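
A note on the recurring cleanup change in the test diffs above: each test now removes its scratch directory with a single recursive unlink() call rather than deleting files one by one and then removing the directory. The lines below are a minimal, self-contained sketch of that base-R pattern, not taken from the package; the scratch and example.txt names are purely illustrative.

# Create a scratch directory under the session's temporary directory.
scratch <- tempfile()
dir.create(scratch)

# Write any number of files (or nested subdirectories) beneath it.
writeLines("example", file.path(scratch, "example.txt"))

# Remove the directory and everything inside it in one call, instead of
# calling file.remove() per file and then removing the directory itself.
unlink(scratch, recursive = TRUE)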