Merge pull request #41 from awasyn/docssummarize

Add parameter definitions to summarize.R
JRaviLab · Oct 11, 2024 · 4d58e59 · 4d58e59
2 parents b96198e + 4cbf875
commit 4d58e59
Show file tree

Hide file tree

Showing 15 changed files with 369 additions and 248 deletions.
diff --git a/R/summarize.R b/R/summarize.R
@@ -10,7 +10,7 @@
 # suppressPackageStartupMessages(library(rlang))
 # conflicted::conflict_prefer("filter", "dplyr")
 
-#' Filter by Domains
+#' filterByDomains
 #'
 #' @author Samuel Chen, Janani Ravi
 #' @description filterByDomains filters a data frame by identifying exact domain matches
@@ -88,21 +88,35 @@ filterByDomains <- function(prot, column = "DomArch", doms_keep = c(), doms_remo
 ## COUNTS of DAs and GCs ##
 ## Before/after break up ##
 ###########################
-## Function to obtain element counts (DA, GC)
-#' Count By Column
-#'
-#' @param prot
-#' @param column
-#' @param min.freq
+
+#' countByColumn
+#' @description
+#' Function to obtain element counts (DA, GC)
+#' 
+#' @param prot A data frame containing the dataset to analyze, typically with 
+#' multiple columns including the one specified by the `column` parameter.
+#' @param column A character string specifying the name of the column to analyze. 
+#' The default is "DomArch".
+#' @param min.freq An integer specifying the minimum frequency an element must 
+#' have to be included in the output. Default is 1.
 #'
 #' @importFrom dplyr arrange as_tibble filter select
 #'
-#' @return Describe return, in detail
+#' @return A tibble with two columns:
+#' \describe{
+#'   \item{`column`}{The unique elements from the specified column 
+#'   (e.g., "DomArch").}
+#'   \item{`freq`}{The frequency of each element, i.e., the number of times 
+#'   each element appears in the specified column.}
+#' }
+#' The tibble is filtered to only include elements that have a frequency 
+#' greater than or equal to `min.freq` and does not include elements with `NA` 
+#' values or those starting with a hyphen ("-").
 #' @export
 #'
 #' @examples
 #' \dontrun{
-#' countByColumn()
+#' countByColumn(prot = my_data, column = "DomArch", min.freq = 10)
 #' }
 countByColumn <- function(prot = prot, column = "DomArch", min.freq = 1) {
     counts <- prot %>%
@@ -117,25 +131,36 @@ countByColumn <- function(prot = prot, column = "DomArch", min.freq = 1) {
     return(counts)
 }
 
-#' Elements 2 Words
+#' elements2Words
 #'
 #' @description
 #' Break string ELEMENTS into WORDS for domain architecture (DA) and genomic
 #' context (GC)
 #'
-#' @param prot [dataframe]
-#' @param column [string] column name
-#' @param conversion_type [string] type of conversion: 'da2doms': domain architectures to
-#' domains. 'gc2da' genomic context to domain architectures
+#' @param prot A dataframe containing the dataset to analyze. The specified 
+#' `column` contains the string elements to be processed.
+#' @param column A character string specifying the name of the column to analyze. 
+#' Default is "DomArch".
+#' @param conversion_type A character string specifying the type of conversion. 
+#' Two options are available:
+#' \describe{
+#'   \item{`da2doms`}{Convert domain architectures into individual domains by 
+#'   replacing `+` symbols with spaces.}
+#'   \item{`gc2da`}{Convert genomic context into domain architectures by
+#'    replacing directional symbols (`<-`, `->`, and `|`) with spaces.}
+#' }
 #'
 #' @importFrom dplyr pull
 #' @importFrom stringr str_replace_all
 #'
-#' @return [string] with words delimited by spaces
+#' @return A single string where elements are delimited by spaces. The function 
+#' performs necessary substitutions based on the `conversion_type` and cleans up 
+#' extraneous characters like newlines, tabs, and multiple spaces.
 #'
 #' @examples
 #' \dontrun{
-#' tibble::tibble(DomArch = c("aaa+bbb", "a+b", "b+c", "b-c")) |> elements2Words()
+#' tibble::tibble(DomArch = c("aaa+bbb", 
+#' "a+b", "b+c", "b-c")) |> elements2Words()
 #' }
 #'
 elements2Words <- function(prot, column = "DomArch", conversion_type = "da2doms") {
@@ -170,16 +195,25 @@ elements2Words <- function(prot, column = "DomArch", conversion_type = "da2doms"
     return(z3)
 }
 
-#' Words 2 Word Counts
+#' words2WordCounts
 #'
 #' @description
 #' Get word counts (wc) [DOMAINS (DA) or DOMAIN ARCHITECTURES (GC)]
 #'
-#' @param string
+#' @param string A character string containing the elements (words) to count. 
+#' This would typically be a space-delimited string representing domain 
+#' architectures or genomic contexts.
 #'
-#' @importFrom dplyr as_tibble filter
+#' @importFrom dplyr as_tibble filter arrange
+#' @importFrom stringr str_replace_all
 #'
-#' @return [tbl_df] table with 2 columns: 1) words & 2) counts/frequency
+#' @return A tibble (tbl_df) with two columns: 
+#' \describe{
+#'   \item{`words`}{A column containing the individual words 
+#'   (domains or domain architectures).}
+#'   \item{`freq`}{A column containing the frequency counts for each word.}
+#' }
+#' 
 #'
 #' @examples
 #' \dontrun{
@@ -216,13 +250,20 @@ words2WordCounts <- function(string) {
         arrange(-freq)
     return(df_word_count)
 }
-## Function to filter based on frequencies
-#' Filter Frequency
+
+#' filterByFrequency
+#' @description
+#' Function to filter based on frequencies
+#' 
+#' @param x A tibble (tbl_df) containing at least two columns: one for 
+#' elements (e.g., `words`) and one for their frequency (e.g., `freq`).
+#' @param min.freq A numeric value specifying the minimum frequency threshold. 
+#' Only elements with frequencies greater than or equal to this value will be 
+#' retained.
 #'
-#' @param x
-#' @param min.freq
+#' @return A tibble with the same structure as `x`, but filtered to include 
+#' only rows where the frequency is greater than or equal to `min.freq`.
 #'
-#' @return Describe return, in detail
 #' @export
 #'
 #' @examples
@@ -237,17 +278,30 @@ filterByFrequency <- function(x, min.freq) {
 #########################
 ## SUMMARY FUNCTIONS ####
 #########################
-#' Summarize by Lineage
+#' MolEvolvR Summary
+#' @name MolEvolvR_summary
+#' @description
+#' A collection of summary functions for the MolEvolvR package.
+#' 
+NULL
+
+#' summarizeByLineage
 #'
-#' @param prot
-#' @param column
-#' @param by
-#' @param query
+#' @param prot A dataframe or tibble containing the data.
+#' @param column A string representing the column to be summarized 
+#' (e.g., `DomArch`). Default is "DomArch".
+#' @param by A string representing the grouping column (e.g., `Lineage`). 
+#' Default is "Lineage".
+#' @param query A string specifying the query pattern for filtering the target 
+#' column. Use "all" to skip filtering and include all rows.
 #'
 #' @importFrom dplyr arrange filter group_by summarise
 #' @importFrom rlang sym
 #'
-#' @return Describe return, in detail
+#' @return A tibble summarizing the counts of occurrences of elements in 
+#' the `column`, grouped by the `by` column. The result includes the number 
+#' of occurrences (`count`) and is arranged in descending order of count.
+#' @rdname MolEvolvR_summary
 #' @export
 #'
 #' @examples
@@ -283,11 +337,18 @@ summarizeByLineage <- function(prot = "prot", column = "DomArch", by = "Lineage"
 #' Function to summarize and retrieve counts by Domains & Domains+Lineage
 #'
 #'
-#' @param x
+#' @param x A dataframe or tibble containing the data. It must have columns 
+#' named `DomArch` and `Lineage`.
 #'
 #' @importFrom dplyr arrange count desc filter group_by summarise
 #'
-#' @return Describe return, in detail
+#' @return A tibble summarizing the counts of unique domain architectures 
+#' (`DomArch`) per lineage (`Lineage`). The resulting table contains three 
+#' columns: `DomArch`, `Lineage`, and `count`, which indicates the frequency 
+#' of each domain architecture for each lineage. The results are arranged in 
+#' descending order of `count`.
+#' @rdname MolEvolvR_summary
+#'
 #' @export
 #'
 #' @examples
@@ -302,17 +363,25 @@ summarizeDomArch_ByLineage <- function(x) {
         arrange(desc(count))
 }
 
-## Function to retrieve counts of how many lineages a DomArch appears in
+
 #' summarizeDomArch
 #'
 #' @description
 #' Function to retrieve counts of how many lineages a DomArch appears in
 #'
-#' @param x
+#' @param x A dataframe or tibble containing the data. It must have a column 
+#' named `DomArch` and a count column, such as `count`, which represents the 
+#' occurrences of each architecture in various lineages.
 #'
 #' @importFrom dplyr arrange group_by filter summarise
 #'
-#' @return Describe return, in detail
+#' @return A tibble summarizing each unique `DomArch`, along with the following 
+#' columns:
+#' - `totalcount`: The total occurrences of each `DomArch` across all lineages.
+#' - `totallin`: The number of unique lineages in which each `DomArch` appears.
+#'
+#' The results are arranged in descending order of `totallin` and `totalcount`.
+#' @rdname MolEvolvR_summary
 #' @export
 #'
 #' @examples
@@ -330,11 +399,21 @@ summarizeDomArch <- function(x) {
 
 #' summarizeGenContext_ByDomArchLineage
 #'
-#' @param x
+#' @param x A dataframe or tibble containing the data. It must have columns 
+#' named `GenContext`, `DomArch`, and `Lineage`.
 #'
 #' @importFrom dplyr arrange desc filter group_by n summarise
 #'
-#' @return Define return, in detail
+#' @return A tibble summarizing each unique combination of `GenContext`, 
+#' `DomArch`, and `Lineage`, along with the following columns:
+#' - `GenContext`: The genomic context for each entry.
+#' - `DomArch`: The domain architecture for each entry.
+#' - `Lineage`: The lineage associated with each entry.
+#' - `count`: The total number of occurrences for each combination of 
+#' `GenContext`, `DomArch`, and `Lineage`.
+#'
+#' The results are arranged in descending order of `count`.
+#' @rdname MolEvolvR_summary
 #' @export
 #'
 #' @examples
@@ -354,11 +433,12 @@ summarizeGenContext_ByDomArchLineage <- function(x) {
 
 #' summarizeGenContext_ByLineage
 #'
-#' @param x
+#' @param x A dataframe or tibble containing the data.
 #'
 #' @importFrom dplyr arrange desc filter group_by n summarise
 #'
 #' @return Describe return, in detail
+#' @rdname MolEvolvR_summary
 #' @export
 #'
 #' @examples
@@ -378,11 +458,20 @@ summarizeGenContext_ByLineage <- function(x) {
 
 #' summarizeGenContext
 #'
-#' @param x
+#' @param x A dataframe or tibble containing the data. It must have columns 
+#' named `GenContext`, `DomArch`, and `Lineage`.
 #'
-#' @importFrom dplyr arrange desc filter group_by n_distinct summarise
+#' @importFrom dplyr arrange desc filter group_by n n_distinct summarise
 #'
-#' @return Describe return, in detail
+#' @return A tibble summarizing each unique combination of `GenContext` and 
+#' `Lineage`, along with the following columns:
+#' - `GenContext`: The genomic context for each entry.
+#' - `Lineage`: The lineage associated with each entry.
+#' - `count`: The total number of occurrences for each combination of
+#'  `GenContext` and `Lineage`.
+#'
+#' The results are arranged in descending order of `count`.
+#' @rdname MolEvolvR_summary
 #' @export
 #'
 #' @examples
@@ -404,7 +493,7 @@ summarizeGenContext <- function(x) {
 
 
 ##################
-#' Total Counts
+#' totalGenContextOrDomArchCounts
 #'
 #' @description
 #' Creates a data frame with a totalcount column
@@ -414,16 +503,28 @@ summarizeGenContext <- function(x) {
 #'
 #' @param prot  A data frame that must contain columns:
 #' \itemize{\item Either 'GenContext' or 'DomArch.norep' \item count}
-#' @param column Character. The column to summarize
-#' @param lineage_col
-#' @param cutoff Numeric. Cutoff for total count. Counts below cutoff value will not be shown. Default is 0.
-#' @param RowsCutoff
-#' @param digits
+#' @param column Character. The column to summarize, default is "DomArch".
+#' @param lineage_col Character. The name of the lineage column, default is 
+#' "Lineage".
+#' @param cutoff Numeric. Cutoff for total count. Counts below this cutoff value 
+#' will not be shown. Default is 0.
+#' @param RowsCutoff Logical. If TRUE, filters based on cumulative percentage 
+#' cutoff. Default is FALSE.
+#' @param digits Numeric. Number of decimal places for percentage columns. 
+#' Default is 2.
+#'
 #'
 #' @importFrom dplyr arrange distinct filter group_by left_join mutate select summarise ungroup
 #' @importFrom rlang as_string sym
 #'
-#' @return Define return, in detail
+#' @return A data frame with the following columns:
+#' - `{{ column }}`: Unique values from the specified column.
+#' - `totalcount`: The total count of occurrences for each unique value in 
+#' the specified column.
+#' - `IndividualCountPercent`: The percentage of each `totalcount` relative to 
+#' the overall count.
+#' - `CumulativePercent`: The cumulative percentage of total counts.
+#' @rdname MolEvolvR_summary
 #' @export
 #'
 #' @note Please refer to the source code if you have alternate file formats and/or
@@ -575,7 +676,7 @@ totalGenContextOrDomArchCounts <- function(prot, column = "DomArch", lineage_col
 
 
 
-#' Find Paralogs
+#' findParalogs
 #'
 #' @description
 #' Creates a data frame of paralogs.