From 2d9acb8b335a9dc281b7a4b460706e90e1a3d218 Mon Sep 17 00:00:00 2001 From: Awa Synthia Date: Sat, 5 Oct 2024 08:58:46 +0300 Subject: [PATCH 1/3] Add parameter definitions to summarize.R Signed-off-by: Awa Synthia --- NAMESPACE | 1 - R/summarize.R | 157 +++++++++++++++++++++++++++++++---------- man/count_bycol.Rd | 22 +++++- man/elements2words.Rd | 23 ++++-- man/filter_freq.Rd | 10 ++- man/summ.DA.Rd | 13 +++- man/summ.DA.byLin.Rd | 9 ++- man/summ.GC.Rd | 14 +++- man/summ.GC.byDALin.Rd | 15 +++- man/summarize_bylin.Rd | 15 +++- man/total_counts.Rd | 24 +++++-- man/words2wc.Rd | 11 ++- 12 files changed, 249 insertions(+), 65 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 16cf0813..9d73120a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -131,7 +131,6 @@ importFrom(dplyr,if_else) importFrom(dplyr,left_join) importFrom(dplyr,mutate) importFrom(dplyr,n) -importFrom(dplyr,n_distinct) importFrom(dplyr,pull) importFrom(dplyr,relocate) importFrom(dplyr,right_join) diff --git a/R/summarize.R b/R/summarize.R index a9b13e43..e03ca463 100644 --- a/R/summarize.R +++ b/R/summarize.R @@ -91,18 +91,31 @@ filter_by_doms <- function(prot, column = "DomArch", doms_keep = c(), doms_remov ## Function to obtain element counts (DA, GC) #' Count Bycol #' -#' @param prot -#' @param column -#' @param min.freq +#' @param prot A data frame containing the dataset to analyze, typically with +#' multiple columns including the one specified by the `column` parameter. +#' @param column A character string specifying the name of the column to analyze. +#' The default is "DomArch". +#' @param min.freq An integer specifying the minimum frequency an element must +#' have to be included in the output. Default is 1. #' #' @importFrom dplyr arrange as_tibble filter select #' -#' @return Describe return, in detail +#' @return A tibble with two columns: +#' \describe{ +#' \item{`column`}{The unique elements from the specified column +#' (e.g., "DomArch").} +#' \item{`freq`}{The frequency of each element, i.e., the number of times +#' each element appears in the specified column.} +#' } +#' The tibble is filtered to only include elements that have a frequency +#' greater than or equal to `min.freq` and does not include elements with `NA` +#' values or those starting with a hyphen ("-"). +#' #' @export #' #' @examples #' \dontrun{ -#' count_bycol() +#' count_bycol(prot = my_data, column = "DomArch", min.freq = 10) #' } count_bycol <- function(prot = prot, column = "DomArch", min.freq = 1) { counts <- prot %>% @@ -123,19 +136,30 @@ count_bycol <- function(prot = prot, column = "DomArch", min.freq = 1) { #' Break string ELEMENTS into WORDS for domain architecture (DA) and genomic #' context (GC) #' -#' @param prot [dataframe] -#' @param column [string] column name -#' @param conversion_type [string] type of conversion: 'da2doms': domain architectures to -#' domains. 'gc2da' genomic context to domain architectures +#' @param prot A dataframe containing the dataset to analyze. The specified +#' `column` contains the string elements to be processed. +#' @param column A character string specifying the name of the column to analyze. +#' Default is "DomArch". +#' @param conversion_type A character string specifying the type of conversion. +#' Two options are available: +#' \describe{ +#' \item{`da2doms`}{Convert domain architectures into individual domains by +#' replacing `+` symbols with spaces.} +#' \item{`gc2da`}{Convert genomic context into domain architectures by +#' replacing directional symbols (`<-`, `->`, and `|`) with spaces.} +#' } #' #' @importFrom dplyr pull #' @importFrom stringr str_replace_all #' -#' @return [string] with words delimited by spaces +#' @return A single string where elements are delimited by spaces. The function +#' performs necessary substitutions based on the `conversion_type` and cleans up +#' extraneous characters like newlines, tabs, and multiple spaces. #' #' @examples #' \dontrun{ -#' tibble::tibble(DomArch = c("aaa+bbb", "a+b", "b+c", "b-c")) |> elements2words() +#' tibble::tibble(DomArch = c("aaa+bbb", +#' "a+b", "b+c", "b-c")) |> elements2words() #' } #' elements2words <- function(prot, column = "DomArch", conversion_type = "da2doms") { @@ -175,11 +199,19 @@ elements2words <- function(prot, column = "DomArch", conversion_type = "da2doms" #' @description #' Get word counts (wc) [DOMAINS (DA) or DOMAIN ARCHITECTURES (GC)] #' -#' @param string +#' @param string A character string containing the elements (words) to count. +#' This would typically be a space-delimited string representing domain +#' architectures or genomic contexts. #' -#' @importFrom dplyr as_tibble filter +#' @importFrom dplyr as_tibble filter arrange +#' @importFrom stringr str_replace_all #' -#' @return [tbl_df] table with 2 columns: 1) words & 2) counts/frequency +#' @return A tibble (tbl_df) with two columns: +#' \describe{ +#' \item{`words`}{A column containing the individual words +#' (domains or domain architectures).} +#' \item{`freq`}{A column containing the frequency counts for each word.} +#' } #' #' @examples #' \dontrun{ @@ -219,10 +251,15 @@ words2wc <- function(string) { ## Function to filter based on frequencies #' Filter Frequency #' -#' @param x -#' @param min.freq +#' @param x A tibble (tbl_df) containing at least two columns: one for +#' elements (e.g., `words`) and one for their frequency (e.g., `freq`). +#' @param min.freq A numeric value specifying the minimum frequency threshold. +#' Only elements with frequencies greater than or equal to this value will be +#' retained. +#' +#' @return A tibble with the same structure as `x`, but filtered to include +#' only rows where the frequency is greater than or equal to `min.freq`. #' -#' @return Describe return, in detail #' @export #' #' @examples @@ -239,15 +276,20 @@ filter_freq <- function(x, min.freq) { ######################### #' Summarize by Lineage #' -#' @param prot -#' @param column -#' @param by -#' @param query +#' @param prot A dataframe or tibble containing the data. +#' @param column A string representing the column to be summarized +#' (e.g., `DomArch`). Default is "DomArch". +#' @param by A string representing the grouping column (e.g., `Lineage`). +#' Default is "Lineage". +#' @param query A string specifying the query pattern for filtering the target +#' column. Use "all" to skip filtering and include all rows. #' #' @importFrom dplyr arrange filter group_by summarise #' @importFrom rlang sym #' -#' @return Describe return, in detail +#' @return A tibble summarizing the counts of occurrences of elements in +#' the `column`, grouped by the `by` column. The result includes the number +#' of occurrences (`count`) and is arranged in descending order of count. #' @export #' #' @examples @@ -283,11 +325,17 @@ summarize_bylin <- function(prot = "prot", column = "DomArch", by = "Lineage", #' Function to summarize and retrieve counts by Domains & Domains+Lineage #' #' -#' @param x +#' @param x A dataframe or tibble containing the data. It must have columns +#' named `DomArch` and `Lineage`. #' #' @importFrom dplyr arrange count desc filter group_by summarise #' -#' @return Describe return, in detail +#' @return A tibble summarizing the counts of unique domain architectures +#' (`DomArch`) per lineage (`Lineage`). The resulting table contains three +#' columns: `DomArch`, `Lineage`, and `count`, which indicates the frequency +#' of each domain architecture for each lineage. The results are arranged in +#' descending order of `count`. +#' #' @export #' #' @examples @@ -309,11 +357,18 @@ summ.DA.byLin <- function(x) { #' @description #' Function to retrieve counts of how many lineages a DomArch appears in #' -#' @param x +#' @param x A dataframe or tibble containing the data. It must have a column +#' named `DomArch` and a count column, such as `count`, which represents the +#' occurrences of each architecture in various lineages. #' #' @importFrom dplyr arrange group_by filter summarise #' -#' @return Describe return, in detail +#' @return A tibble summarizing each unique `DomArch`, along with the following +#' columns: +#' - `totalcount`: The total occurrences of each `DomArch` across all lineages. +#' - `totallin`: The total number of unique lineages in which each `DomArch` +#' appears. +#' The results are arranged in descending order of `totallin` and `totalcount`. #' @export #' #' @examples @@ -332,11 +387,20 @@ summ.DA <- function(x) { #' summ.GC.byDALin #' -#' @param x +#' @param x A dataframe or tibble containing the data. It must have columns +#' named `GenContext`, `DomArch`, and `Lineage`. #' #' @importFrom dplyr arrange desc filter group_by n summarise #' -#' @return Define return, in detail +#' @return A tibble summarizing each unique combination of `GenContext`, +#' `DomArch`, and `Lineage`, along with the following columns: +#' - `GenContext`: The genomic context for each entry. +#' - `DomArch`: The domain architecture for each entry. +#' - `Lineage`: The lineage associated with each entry. +#' - `count`: The total number of occurrences for each combination of +#' `GenContext`, `DomArch`, and `Lineage`. +#' +#' The results are arranged in descending order of `count`. #' @export #' #' @examples @@ -382,11 +446,19 @@ summ.GC.byLin <- function(x) { #' summ.GC #' -#' @param x +#' @param x A dataframe or tibble containing the data. It must have columns +#' named `GenContext`, `DomArch`, and `Lineage`. #' -#' @importFrom dplyr arrange desc filter group_by n_distinct summarise +#' @importFrom dplyr arrange desc filter group_by n summarise #' -#' @return Describe return, in detail +#' @return A tibble summarizing each unique combination of `GenContext` and +#' `Lineage`, along with the following columns: +#' - `GenContext`: The genomic context for each entry. +#' - `Lineage`: The lineage associated with each entry. +#' - `count`: The total number of occurrences for each combination of +#' `GenContext` and `Lineage`. +#' +#' The results are arranged in descending order of `count`. #' @export #' #' @examples @@ -419,16 +491,27 @@ summ.GC <- function(x) { #' #' @param prot A data frame that must contain columns: #' \itemize{\item Either 'GenContext' or 'DomArch.norep' \item count} -#' @param column Character. The column to summarize -#' @param lineage_col -#' @param cutoff Numeric. Cutoff for total count. Counts below cutoff value will not be shown. Default is 0. -#' @param RowsCutoff -#' @param digits +#' @param column Character. The column to summarize, default is "DomArch". +#' @param lineage_col Character. The name of the lineage column, default is +#' "Lineage". +#' @param cutoff Numeric. Cutoff for total count. Counts below this cutoff value +#' will not be shown. Default is 0. +#' @param RowsCutoff Logical. If TRUE, filters based on cumulative percentage +#' cutoff. Default is FALSE. +#' @param digits Numeric. Number of decimal places for percentage columns. +#' Default is 2. +#' #' #' @importFrom dplyr arrange distinct filter group_by left_join mutate select summarise ungroup #' @importFrom rlang as_string sym #' -#' @return Define return, in detail +#' @return A data frame with the following columns: +#' - `{{ column }}`: Unique values from the specified column. +#' - `totalcount`: The total count of occurrences for each unique value in +#' the specified column. +#' - `IndividualCountPercent`: The percentage of each `totalcount` relative to +#' the overall count. +#' - `CumulativePercent`: The cumulative percentage of total counts. #' @export #' #' @note Please refer to the source code if you have alternate file formats and/or diff --git a/man/count_bycol.Rd b/man/count_bycol.Rd index 884c0f0f..946a7ea2 100644 --- a/man/count_bycol.Rd +++ b/man/count_bycol.Rd @@ -7,16 +7,32 @@ count_bycol(prot = prot, column = "DomArch", min.freq = 1) } \arguments{ -\item{min.freq}{} +\item{prot}{A data frame containing the dataset to analyze, typically with +multiple columns including the one specified by the \code{column} parameter.} + +\item{column}{A character string specifying the name of the column to analyze. +The default is "DomArch".} + +\item{min.freq}{An integer specifying the minimum frequency an element must +have to be included in the output. Default is 1.} } \value{ -Describe return, in detail +A tibble with two columns: +\describe{ +\item{\code{column}}{The unique elements from the specified column +(e.g., "DomArch").} +\item{\code{freq}}{The frequency of each element, i.e., the number of times +each element appears in the specified column.} +} +The tibble is filtered to only include elements that have a frequency +greater than or equal to \code{min.freq} and does not include elements with \code{NA} +values or those starting with a hyphen ("-"). } \description{ Count Bycol } \examples{ \dontrun{ -count_bycol() +count_bycol(prot = my_data, column = "DomArch", min.freq = 10) } } diff --git a/man/elements2words.Rd b/man/elements2words.Rd index 80fcbafb..bda447db 100644 --- a/man/elements2words.Rd +++ b/man/elements2words.Rd @@ -7,15 +7,25 @@ elements2words(prot, column = "DomArch", conversion_type = "da2doms") } \arguments{ -\item{prot}{\link{dataframe}} +\item{prot}{A dataframe containing the dataset to analyze. The specified +\code{column} contains the string elements to be processed.} -\item{column}{\link{string} column name} +\item{column}{A character string specifying the name of the column to analyze. +Default is "DomArch".} -\item{conversion_type}{\link{string} type of conversion: 'da2doms': domain architectures to -domains. 'gc2da' genomic context to domain architectures} +\item{conversion_type}{A character string specifying the type of conversion. +Two options are available: +\describe{ +\item{\code{da2doms}}{Convert domain architectures into individual domains by +replacing \code{+} symbols with spaces.} +\item{\code{gc2da}}{Convert genomic context into domain architectures by +replacing directional symbols (\verb{<-}, \verb{->}, and \code{|}) with spaces.} +}} } \value{ -\link{string} with words delimited by spaces +A single string where elements are delimited by spaces. The function +performs necessary substitutions based on the \code{conversion_type} and cleans up +extraneous characters like newlines, tabs, and multiple spaces. } \description{ Break string ELEMENTS into WORDS for domain architecture (DA) and genomic @@ -23,7 +33,8 @@ context (GC) } \examples{ \dontrun{ -tibble::tibble(DomArch = c("aaa+bbb", "a+b", "b+c", "b-c")) |> elements2words() +tibble::tibble(DomArch = c("aaa+bbb", +"a+b", "b+c", "b-c")) |> elements2words() } } diff --git a/man/filter_freq.Rd b/man/filter_freq.Rd index ce4db5ac..9dfba73b 100644 --- a/man/filter_freq.Rd +++ b/man/filter_freq.Rd @@ -7,10 +7,16 @@ filter_freq(x, min.freq) } \arguments{ -\item{min.freq}{} +\item{x}{A tibble (tbl_df) containing at least two columns: one for +elements (e.g., \code{words}) and one for their frequency (e.g., \code{freq}).} + +\item{min.freq}{A numeric value specifying the minimum frequency threshold. +Only elements with frequencies greater than or equal to this value will be +retained.} } \value{ -Describe return, in detail +A tibble with the same structure as \code{x}, but filtered to include +only rows where the frequency is greater than or equal to \code{min.freq}. } \description{ Filter Frequency diff --git a/man/summ.DA.Rd b/man/summ.DA.Rd index 13717140..01d15b3c 100644 --- a/man/summ.DA.Rd +++ b/man/summ.DA.Rd @@ -7,10 +7,19 @@ summ.DA(x) } \arguments{ -\item{x}{} +\item{x}{A dataframe or tibble containing the data. It must have a column +named \code{DomArch} and a count column, such as \code{count}, which represents the +occurrences of each architecture in various lineages.} } \value{ -Describe return, in detail +A tibble summarizing each unique \code{DomArch}, along with the following +columns: +\itemize{ +\item \code{totalcount}: The total occurrences of each \code{DomArch} across all lineages. +\item \code{totallin}: The total number of unique lineages in which each \code{DomArch} +appears. +The results are arranged in descending order of \code{totallin} and \code{totalcount}. +} } \description{ Function to retrieve counts of how many lineages a DomArch appears in diff --git a/man/summ.DA.byLin.Rd b/man/summ.DA.byLin.Rd index 66555fd5..d88e5d37 100644 --- a/man/summ.DA.byLin.Rd +++ b/man/summ.DA.byLin.Rd @@ -7,10 +7,15 @@ summ.DA.byLin(x) } \arguments{ -\item{x}{} +\item{x}{A dataframe or tibble containing the data. It must have columns +named \code{DomArch} and \code{Lineage}.} } \value{ -Describe return, in detail +A tibble summarizing the counts of unique domain architectures +(\code{DomArch}) per lineage (\code{Lineage}). The resulting table contains three +columns: \code{DomArch}, \code{Lineage}, and \code{count}, which indicates the frequency +of each domain architecture for each lineage. The results are arranged in +descending order of \code{count}. } \description{ Function to summarize and retrieve counts by Domains & Domains+Lineage diff --git a/man/summ.GC.Rd b/man/summ.GC.Rd index fa52a6bf..2ec4d651 100644 --- a/man/summ.GC.Rd +++ b/man/summ.GC.Rd @@ -7,10 +7,20 @@ summ.GC(x) } \arguments{ -\item{x}{} +\item{x}{A dataframe or tibble containing the data. It must have columns +named \code{GenContext}, \code{DomArch}, and \code{Lineage}.} } \value{ -Describe return, in detail +A tibble summarizing each unique combination of \code{GenContext} and +\code{Lineage}, along with the following columns: +\itemize{ +\item \code{GenContext}: The genomic context for each entry. +\item \code{Lineage}: The lineage associated with each entry. +\item \code{count}: The total number of occurrences for each combination of +\code{GenContext} and \code{Lineage}. +} + +The results are arranged in descending order of \code{count}. } \description{ summ.GC diff --git a/man/summ.GC.byDALin.Rd b/man/summ.GC.byDALin.Rd index 34c9f84d..7fc8d443 100644 --- a/man/summ.GC.byDALin.Rd +++ b/man/summ.GC.byDALin.Rd @@ -7,10 +7,21 @@ summ.GC.byDALin(x) } \arguments{ -\item{x}{} +\item{x}{A dataframe or tibble containing the data. It must have columns +named \code{GenContext}, \code{DomArch}, and \code{Lineage}.} } \value{ -Define return, in detail +A tibble summarizing each unique combination of \code{GenContext}, +\code{DomArch}, and \code{Lineage}, along with the following columns: +\itemize{ +\item \code{GenContext}: The genomic context for each entry. +\item \code{DomArch}: The domain architecture for each entry. +\item \code{Lineage}: The lineage associated with each entry. +\item \code{count}: The total number of occurrences for each combination of +\code{GenContext}, \code{DomArch}, and \code{Lineage}. +} + +The results are arranged in descending order of \code{count}. } \description{ summ.GC.byDALin diff --git a/man/summarize_bylin.Rd b/man/summarize_bylin.Rd index a94c54c1..92b93652 100644 --- a/man/summarize_bylin.Rd +++ b/man/summarize_bylin.Rd @@ -7,10 +7,21 @@ summarize_bylin(prot = "prot", column = "DomArch", by = "Lineage", query) } \arguments{ -\item{query}{} +\item{prot}{A dataframe or tibble containing the data.} + +\item{column}{A string representing the column to be summarized +(e.g., \code{DomArch}). Default is "DomArch".} + +\item{by}{A string representing the grouping column (e.g., \code{Lineage}). +Default is "Lineage".} + +\item{query}{A string specifying the query pattern for filtering the target +column. Use "all" to skip filtering and include all rows.} } \value{ -Describe return, in detail +A tibble summarizing the counts of occurrences of elements in +the \code{column}, grouped by the \code{by} column. The result includes the number +of occurrences (\code{count}) and is arranged in descending order of count. } \description{ Summarize by Lineage diff --git a/man/total_counts.Rd b/man/total_counts.Rd index 49db8822..53d70096 100644 --- a/man/total_counts.Rd +++ b/man/total_counts.Rd @@ -17,14 +17,30 @@ total_counts( \item{prot}{A data frame that must contain columns: \itemize{\item Either 'GenContext' or 'DomArch.norep' \item count}} -\item{column}{Character. The column to summarize} +\item{column}{Character. The column to summarize, default is "DomArch".} -\item{cutoff}{Numeric. Cutoff for total count. Counts below cutoff value will not be shown. Default is 0.} +\item{lineage_col}{Character. The name of the lineage column, default is +"Lineage".} -\item{digits}{} +\item{cutoff}{Numeric. Cutoff for total count. Counts below this cutoff value +will not be shown. Default is 0.} + +\item{RowsCutoff}{Logical. If TRUE, filters based on cumulative percentage +cutoff. Default is FALSE.} + +\item{digits}{Numeric. Number of decimal places for percentage columns. +Default is 2.} } \value{ -Define return, in detail +A data frame with the following columns: +\itemize{ +\item \code{{{ column }}}: Unique values from the specified column. +\item \code{totalcount}: The total count of occurrences for each unique value in +the specified column. +\item \code{IndividualCountPercent}: The percentage of each \code{totalcount} relative to +the overall count. +\item \code{CumulativePercent}: The cumulative percentage of total counts. +} } \description{ Creates a data frame with a totalcount column diff --git a/man/words2wc.Rd b/man/words2wc.Rd index 1eba5dc4..69d006d5 100644 --- a/man/words2wc.Rd +++ b/man/words2wc.Rd @@ -7,10 +7,17 @@ words2wc(string) } \arguments{ -\item{string}{} +\item{string}{A character string containing the elements (words) to count. +This would typically be a space-delimited string representing domain +architectures or genomic contexts.} } \value{ -\link{tbl_df} table with 2 columns: 1) words & 2) counts/frequency +A tibble (tbl_df) with two columns: +\describe{ +\item{\code{words}}{A column containing the individual words +(domains or domain architectures).} +\item{\code{freq}}{A column containing the frequency counts for each word.} +} } \description{ Get word counts (wc) \link{DOMAINS (DA) or DOMAIN ARCHITECTURES (GC)} From d9fa04bc729586ab336275083d67fb75420ac138 Mon Sep 17 00:00:00 2001 From: Awa Synthia Date: Mon, 7 Oct 2024 07:42:08 +0300 Subject: [PATCH 2/3] use one documentation file Signed-off-by: Awa Synthia --- R/summarize.R | 16 ++- man/count_bycol.Rd | 38 ------ man/elements2words.Rd | 40 ------- man/filter_by_doms.Rd | 44 ------- man/filter_freq.Rd | 28 ----- man/summ.DA.Rd | 31 ----- man/summ.DA.byLin.Rd | 27 ----- man/summ.GC.Rd | 32 ----- man/summ.GC.byDALin.Rd | 33 ------ man/summ.GC.byLin.Rd | 22 ---- man/summarize.Rd | 260 +++++++++++++++++++++++++++++++++++++++++ man/summarize_bylin.Rd | 36 ------ man/total_counts.Rd | 58 --------- man/words2wc.Rd | 32 ----- 14 files changed, 274 insertions(+), 423 deletions(-) delete mode 100644 man/count_bycol.Rd delete mode 100644 man/elements2words.Rd delete mode 100644 man/filter_by_doms.Rd delete mode 100644 man/filter_freq.Rd delete mode 100644 man/summ.DA.Rd delete mode 100644 man/summ.DA.byLin.Rd delete mode 100644 man/summ.GC.Rd delete mode 100644 man/summ.GC.byDALin.Rd delete mode 100644 man/summ.GC.byLin.Rd create mode 100644 man/summarize.Rd delete mode 100644 man/summarize_bylin.Rd delete mode 100644 man/total_counts.Rd delete mode 100644 man/words2wc.Rd diff --git a/R/summarize.R b/R/summarize.R index e03ca463..0580c15d 100644 --- a/R/summarize.R +++ b/R/summarize.R @@ -29,6 +29,7 @@ #' #' @return Filtered data frame #' @note There is no need to make the domains 'regex safe', that will be handled by this function +#' @name summarize #' @export #' #' @examples @@ -110,7 +111,7 @@ filter_by_doms <- function(prot, column = "DomArch", doms_keep = c(), doms_remov #' The tibble is filtered to only include elements that have a frequency #' greater than or equal to `min.freq` and does not include elements with `NA` #' values or those starting with a hyphen ("-"). -#' +#' @name summarize #' @export #' #' @examples @@ -155,6 +156,7 @@ count_bycol <- function(prot = prot, column = "DomArch", min.freq = 1) { #' @return A single string where elements are delimited by spaces. The function #' performs necessary substitutions based on the `conversion_type` and cleans up #' extraneous characters like newlines, tabs, and multiple spaces. +#' @name summarize #' #' @examples #' \dontrun{ @@ -212,6 +214,8 @@ elements2words <- function(prot, column = "DomArch", conversion_type = "da2doms" #' (domains or domain architectures).} #' \item{`freq`}{A column containing the frequency counts for each word.} #' } +#' +#' @name summarize #' #' @examples #' \dontrun{ @@ -259,6 +263,7 @@ words2wc <- function(string) { #' #' @return A tibble with the same structure as `x`, but filtered to include #' only rows where the frequency is greater than or equal to `min.freq`. +#' @name summarize #' #' @export #' @@ -290,6 +295,7 @@ filter_freq <- function(x, min.freq) { #' @return A tibble summarizing the counts of occurrences of elements in #' the `column`, grouped by the `by` column. The result includes the number #' of occurrences (`count`) and is arranged in descending order of count. +#' @name summarize #' @export #' #' @examples @@ -335,6 +341,7 @@ summarize_bylin <- function(prot = "prot", column = "DomArch", by = "Lineage", #' columns: `DomArch`, `Lineage`, and `count`, which indicates the frequency #' of each domain architecture for each lineage. The results are arranged in #' descending order of `count`. +#' @name summarize #' #' @export #' @@ -369,6 +376,7 @@ summ.DA.byLin <- function(x) { #' - `totallin`: The total number of unique lineages in which each `DomArch` #' appears. #' The results are arranged in descending order of `totallin` and `totalcount`. +#' @name summarize #' @export #' #' @examples @@ -401,6 +409,7 @@ summ.DA <- function(x) { #' `GenContext`, `DomArch`, and `Lineage`. #' #' The results are arranged in descending order of `count`. +#' @name summarize #' @export #' #' @examples @@ -421,11 +430,12 @@ summ.GC.byDALin <- function(x) { #' summ.GC.byLin #' -#' @param x +#' @param x A dataframe or tibble containing the data. #' #' @importFrom dplyr arrange desc filter group_by n summarise #' #' @return Describe return, in detail +#' @name summarize #' @export #' #' @examples @@ -459,6 +469,7 @@ summ.GC.byLin <- function(x) { #' `GenContext` and `Lineage`. #' #' The results are arranged in descending order of `count`. +#' @name summarize #' @export #' #' @examples @@ -512,6 +523,7 @@ summ.GC <- function(x) { #' - `IndividualCountPercent`: The percentage of each `totalcount` relative to #' the overall count. #' - `CumulativePercent`: The cumulative percentage of total counts. +#' @name summarize #' @export #' #' @note Please refer to the source code if you have alternate file formats and/or diff --git a/man/count_bycol.Rd b/man/count_bycol.Rd deleted file mode 100644 index 946a7ea2..00000000 --- a/man/count_bycol.Rd +++ /dev/null @@ -1,38 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{count_bycol} -\alias{count_bycol} -\title{Count Bycol} -\usage{ -count_bycol(prot = prot, column = "DomArch", min.freq = 1) -} -\arguments{ -\item{prot}{A data frame containing the dataset to analyze, typically with -multiple columns including the one specified by the \code{column} parameter.} - -\item{column}{A character string specifying the name of the column to analyze. -The default is "DomArch".} - -\item{min.freq}{An integer specifying the minimum frequency an element must -have to be included in the output. Default is 1.} -} -\value{ -A tibble with two columns: -\describe{ -\item{\code{column}}{The unique elements from the specified column -(e.g., "DomArch").} -\item{\code{freq}}{The frequency of each element, i.e., the number of times -each element appears in the specified column.} -} -The tibble is filtered to only include elements that have a frequency -greater than or equal to \code{min.freq} and does not include elements with \code{NA} -values or those starting with a hyphen ("-"). -} -\description{ -Count Bycol -} -\examples{ -\dontrun{ -count_bycol(prot = my_data, column = "DomArch", min.freq = 10) -} -} diff --git a/man/elements2words.Rd b/man/elements2words.Rd deleted file mode 100644 index bda447db..00000000 --- a/man/elements2words.Rd +++ /dev/null @@ -1,40 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{elements2words} -\alias{elements2words} -\title{Elements 2 Words} -\usage{ -elements2words(prot, column = "DomArch", conversion_type = "da2doms") -} -\arguments{ -\item{prot}{A dataframe containing the dataset to analyze. The specified -\code{column} contains the string elements to be processed.} - -\item{column}{A character string specifying the name of the column to analyze. -Default is "DomArch".} - -\item{conversion_type}{A character string specifying the type of conversion. -Two options are available: -\describe{ -\item{\code{da2doms}}{Convert domain architectures into individual domains by -replacing \code{+} symbols with spaces.} -\item{\code{gc2da}}{Convert genomic context into domain architectures by -replacing directional symbols (\verb{<-}, \verb{->}, and \code{|}) with spaces.} -}} -} -\value{ -A single string where elements are delimited by spaces. The function -performs necessary substitutions based on the \code{conversion_type} and cleans up -extraneous characters like newlines, tabs, and multiple spaces. -} -\description{ -Break string ELEMENTS into WORDS for domain architecture (DA) and genomic -context (GC) -} -\examples{ -\dontrun{ -tibble::tibble(DomArch = c("aaa+bbb", -"a+b", "b+c", "b-c")) |> elements2words() -} - -} diff --git a/man/filter_by_doms.Rd b/man/filter_by_doms.Rd deleted file mode 100644 index cfe255ca..00000000 --- a/man/filter_by_doms.Rd +++ /dev/null @@ -1,44 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{filter_by_doms} -\alias{filter_by_doms} -\title{Filter by Domains} -\usage{ -filter_by_doms( - prot, - column = "DomArch", - doms_keep = c(), - doms_remove = c(), - ignore.case = FALSE -) -} -\arguments{ -\item{prot}{Dataframe to filter} - -\item{column}{Column to search for domains in (DomArch column)} - -\item{doms_keep}{Vector of domains that must be identified within column in order for -observation to be kept} - -\item{doms_remove}{Vector of domains that, if found within an observation, will be removed} - -\item{ignore.case}{Should the matching be non case sensitive} -} -\value{ -Filtered data frame -} -\description{ -filter_by_doms filters a data frame by identifying exact domain matches -and either keeping or removing rows with the identified domain -} -\note{ -There is no need to make the domains 'regex safe', that will be handled by this function -} -\examples{ -\dontrun{ -filter_by_doms() -} -} -\author{ -Samuel Chen, Janani Ravi -} diff --git a/man/filter_freq.Rd b/man/filter_freq.Rd deleted file mode 100644 index 9dfba73b..00000000 --- a/man/filter_freq.Rd +++ /dev/null @@ -1,28 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{filter_freq} -\alias{filter_freq} -\title{Filter Frequency} -\usage{ -filter_freq(x, min.freq) -} -\arguments{ -\item{x}{A tibble (tbl_df) containing at least two columns: one for -elements (e.g., \code{words}) and one for their frequency (e.g., \code{freq}).} - -\item{min.freq}{A numeric value specifying the minimum frequency threshold. -Only elements with frequencies greater than or equal to this value will be -retained.} -} -\value{ -A tibble with the same structure as \code{x}, but filtered to include -only rows where the frequency is greater than or equal to \code{min.freq}. -} -\description{ -Filter Frequency -} -\examples{ -\dontrun{ -filter_freq() -} -} diff --git a/man/summ.DA.Rd b/man/summ.DA.Rd deleted file mode 100644 index 01d15b3c..00000000 --- a/man/summ.DA.Rd +++ /dev/null @@ -1,31 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{summ.DA} -\alias{summ.DA} -\title{summ.DA} -\usage{ -summ.DA(x) -} -\arguments{ -\item{x}{A dataframe or tibble containing the data. It must have a column -named \code{DomArch} and a count column, such as \code{count}, which represents the -occurrences of each architecture in various lineages.} -} -\value{ -A tibble summarizing each unique \code{DomArch}, along with the following -columns: -\itemize{ -\item \code{totalcount}: The total occurrences of each \code{DomArch} across all lineages. -\item \code{totallin}: The total number of unique lineages in which each \code{DomArch} -appears. -The results are arranged in descending order of \code{totallin} and \code{totalcount}. -} -} -\description{ -Function to retrieve counts of how many lineages a DomArch appears in -} -\examples{ -\dontrun{ -summ.DA() -} -} diff --git a/man/summ.DA.byLin.Rd b/man/summ.DA.byLin.Rd deleted file mode 100644 index d88e5d37..00000000 --- a/man/summ.DA.byLin.Rd +++ /dev/null @@ -1,27 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{summ.DA.byLin} -\alias{summ.DA.byLin} -\title{summ.DA.byLin} -\usage{ -summ.DA.byLin(x) -} -\arguments{ -\item{x}{A dataframe or tibble containing the data. It must have columns -named \code{DomArch} and \code{Lineage}.} -} -\value{ -A tibble summarizing the counts of unique domain architectures -(\code{DomArch}) per lineage (\code{Lineage}). The resulting table contains three -columns: \code{DomArch}, \code{Lineage}, and \code{count}, which indicates the frequency -of each domain architecture for each lineage. The results are arranged in -descending order of \code{count}. -} -\description{ -Function to summarize and retrieve counts by Domains & Domains+Lineage -} -\examples{ -\dontrun{ -summ.DA.byLin() -} -} diff --git a/man/summ.GC.Rd b/man/summ.GC.Rd deleted file mode 100644 index 2ec4d651..00000000 --- a/man/summ.GC.Rd +++ /dev/null @@ -1,32 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{summ.GC} -\alias{summ.GC} -\title{summ.GC} -\usage{ -summ.GC(x) -} -\arguments{ -\item{x}{A dataframe or tibble containing the data. It must have columns -named \code{GenContext}, \code{DomArch}, and \code{Lineage}.} -} -\value{ -A tibble summarizing each unique combination of \code{GenContext} and -\code{Lineage}, along with the following columns: -\itemize{ -\item \code{GenContext}: The genomic context for each entry. -\item \code{Lineage}: The lineage associated with each entry. -\item \code{count}: The total number of occurrences for each combination of -\code{GenContext} and \code{Lineage}. -} - -The results are arranged in descending order of \code{count}. -} -\description{ -summ.GC -} -\examples{ -\dontrun{ -summ.GC() -} -} diff --git a/man/summ.GC.byDALin.Rd b/man/summ.GC.byDALin.Rd deleted file mode 100644 index 7fc8d443..00000000 --- a/man/summ.GC.byDALin.Rd +++ /dev/null @@ -1,33 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{summ.GC.byDALin} -\alias{summ.GC.byDALin} -\title{summ.GC.byDALin} -\usage{ -summ.GC.byDALin(x) -} -\arguments{ -\item{x}{A dataframe or tibble containing the data. It must have columns -named \code{GenContext}, \code{DomArch}, and \code{Lineage}.} -} -\value{ -A tibble summarizing each unique combination of \code{GenContext}, -\code{DomArch}, and \code{Lineage}, along with the following columns: -\itemize{ -\item \code{GenContext}: The genomic context for each entry. -\item \code{DomArch}: The domain architecture for each entry. -\item \code{Lineage}: The lineage associated with each entry. -\item \code{count}: The total number of occurrences for each combination of -\code{GenContext}, \code{DomArch}, and \code{Lineage}. -} - -The results are arranged in descending order of \code{count}. -} -\description{ -summ.GC.byDALin -} -\examples{ -\dontrun{ -summ.GC.byDALin -} -} diff --git a/man/summ.GC.byLin.Rd b/man/summ.GC.byLin.Rd deleted file mode 100644 index df2a8fb8..00000000 --- a/man/summ.GC.byLin.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{summ.GC.byLin} -\alias{summ.GC.byLin} -\title{summ.GC.byLin} -\usage{ -summ.GC.byLin(x) -} -\arguments{ -\item{x}{} -} -\value{ -Describe return, in detail -} -\description{ -summ.GC.byLin -} -\examples{ -\dontrun{ -summ.GC.byLin() -} -} diff --git a/man/summarize.Rd b/man/summarize.Rd new file mode 100644 index 00000000..f149f686 --- /dev/null +++ b/man/summarize.Rd @@ -0,0 +1,260 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/summarize.R +\name{summarize} +\alias{summarize} +\alias{filter_by_doms} +\alias{count_bycol} +\alias{elements2words} +\alias{words2wc} +\alias{filter_freq} +\alias{summarize_bylin} +\alias{summ.DA.byLin} +\alias{summ.DA} +\alias{summ.GC.byDALin} +\alias{summ.GC.byLin} +\alias{summ.GC} +\alias{total_counts} +\title{Filter by Domains} +\usage{ +filter_by_doms( + prot, + column = "DomArch", + doms_keep = c(), + doms_remove = c(), + ignore.case = FALSE +) + +count_bycol(prot = prot, column = "DomArch", min.freq = 1) + +elements2words(prot, column = "DomArch", conversion_type = "da2doms") + +words2wc(string) + +filter_freq(x, min.freq) + +summarize_bylin(prot = "prot", column = "DomArch", by = "Lineage", query) + +summ.DA.byLin(x) + +summ.DA(x) + +summ.GC.byDALin(x) + +summ.GC.byLin(x) + +summ.GC(x) + +total_counts( + prot, + column = "DomArch", + lineage_col = "Lineage", + cutoff = 90, + RowsCutoff = FALSE, + digits = 2 +) +} +\arguments{ +\item{prot}{A data frame that must contain columns: +\itemize{\item Either 'GenContext' or 'DomArch.norep' \item count}} + +\item{column}{Character. The column to summarize, default is "DomArch".} + +\item{doms_keep}{Vector of domains that must be identified within column in order for +observation to be kept} + +\item{doms_remove}{Vector of domains that, if found within an observation, will be removed} + +\item{ignore.case}{Should the matching be non case sensitive} + +\item{min.freq}{A numeric value specifying the minimum frequency threshold. +Only elements with frequencies greater than or equal to this value will be +retained.} + +\item{conversion_type}{A character string specifying the type of conversion. +Two options are available: +\describe{ +\item{\code{da2doms}}{Convert domain architectures into individual domains by +replacing \code{+} symbols with spaces.} +\item{\code{gc2da}}{Convert genomic context into domain architectures by +replacing directional symbols (\verb{<-}, \verb{->}, and \code{|}) with spaces.} +}} + +\item{string}{A character string containing the elements (words) to count. +This would typically be a space-delimited string representing domain +architectures or genomic contexts.} + +\item{x}{A dataframe or tibble containing the data. It must have columns +named \code{GenContext}, \code{DomArch}, and \code{Lineage}.} + +\item{by}{A string representing the grouping column (e.g., \code{Lineage}). +Default is "Lineage".} + +\item{query}{A string specifying the query pattern for filtering the target +column. Use "all" to skip filtering and include all rows.} + +\item{lineage_col}{Character. The name of the lineage column, default is +"Lineage".} + +\item{cutoff}{Numeric. Cutoff for total count. Counts below this cutoff value +will not be shown. Default is 0.} + +\item{RowsCutoff}{Logical. If TRUE, filters based on cumulative percentage +cutoff. Default is FALSE.} + +\item{digits}{Numeric. Number of decimal places for percentage columns. +Default is 2.} +} +\value{ +Filtered data frame + +A tibble with two columns: +\describe{ +\item{\code{column}}{The unique elements from the specified column +(e.g., "DomArch").} +\item{\code{freq}}{The frequency of each element, i.e., the number of times +each element appears in the specified column.} +} +The tibble is filtered to only include elements that have a frequency +greater than or equal to \code{min.freq} and does not include elements with \code{NA} +values or those starting with a hyphen ("-"). + +A single string where elements are delimited by spaces. The function +performs necessary substitutions based on the \code{conversion_type} and cleans up +extraneous characters like newlines, tabs, and multiple spaces. + +A tibble (tbl_df) with two columns: +\describe{ +\item{\code{words}}{A column containing the individual words +(domains or domain architectures).} +\item{\code{freq}}{A column containing the frequency counts for each word.} +} + +A tibble with the same structure as \code{x}, but filtered to include +only rows where the frequency is greater than or equal to \code{min.freq}. + +A tibble summarizing the counts of occurrences of elements in +the \code{column}, grouped by the \code{by} column. The result includes the number +of occurrences (\code{count}) and is arranged in descending order of count. + +A tibble summarizing the counts of unique domain architectures +(\code{DomArch}) per lineage (\code{Lineage}). The resulting table contains three +columns: \code{DomArch}, \code{Lineage}, and \code{count}, which indicates the frequency +of each domain architecture for each lineage. The results are arranged in +descending order of \code{count}. + +A tibble summarizing each unique \code{DomArch}, along with the following +columns: +\itemize{ +\item \code{totalcount}: The total occurrences of each \code{DomArch} across all lineages. +\item \code{totallin}: The total number of unique lineages in which each \code{DomArch} +appears. +The results are arranged in descending order of \code{totallin} and \code{totalcount}. +} + +A tibble summarizing each unique combination of \code{GenContext}, +\code{DomArch}, and \code{Lineage}, along with the following columns: +\itemize{ +\item \code{GenContext}: The genomic context for each entry. +\item \code{DomArch}: The domain architecture for each entry. +\item \code{Lineage}: The lineage associated with each entry. +\item \code{count}: The total number of occurrences for each combination of +\code{GenContext}, \code{DomArch}, and \code{Lineage}. +} + +The results are arranged in descending order of \code{count}. + +Describe return, in detail + +A tibble summarizing each unique combination of \code{GenContext} and +\code{Lineage}, along with the following columns: +\itemize{ +\item \code{GenContext}: The genomic context for each entry. +\item \code{Lineage}: The lineage associated with each entry. +\item \code{count}: The total number of occurrences for each combination of +\code{GenContext} and \code{Lineage}. +} + +The results are arranged in descending order of \code{count}. + +A data frame with the following columns: +\itemize{ +\item \code{{{ column }}}: Unique values from the specified column. +\item \code{totalcount}: The total count of occurrences for each unique value in +the specified column. +\item \code{IndividualCountPercent}: The percentage of each \code{totalcount} relative to +the overall count. +\item \code{CumulativePercent}: The cumulative percentage of total counts. +} +} +\description{ +filter_by_doms filters a data frame by identifying exact domain matches +and either keeping or removing rows with the identified domain + +Break string ELEMENTS into WORDS for domain architecture (DA) and genomic +context (GC) + +Get word counts (wc) \link{DOMAINS (DA) or DOMAIN ARCHITECTURES (GC)} + +Function to summarize and retrieve counts by Domains & Domains+Lineage + +Function to retrieve counts of how many lineages a DomArch appears in + +Creates a data frame with a totalcount column + +This function is designed to sum the counts column by either Genomic Context or Domain Architecture and creates a totalcount column from those sums. +} +\note{ +There is no need to make the domains 'regex safe', that will be handled by this function + +Please refer to the source code if you have alternate file formats and/or +column names. +} +\examples{ +\dontrun{ +filter_by_doms() +} +\dontrun{ +count_bycol(prot = my_data, column = "DomArch", min.freq = 10) +} +\dontrun{ +tibble::tibble(DomArch = c("aaa+bbb", +"a+b", "b+c", "b-c")) |> elements2words() +} + +\dontrun{ +tibble::tibble(DomArch = c("aaa+bbb", "a+b", "b+c", "b-c")) |> + elements2words() |> + words2wc() +} + +\dontrun{ +filter_freq() +} +\dontrun{ +library(tidyverse) +tibble(DomArch = c("a+b", "a+b", "b+c", "a+b"), Lineage = c("l1", "l1", "l1", "l2")) |> + summarize_bylin(query = "all") +} + +\dontrun{ +summ.DA.byLin() +} +\dontrun{ +summ.DA() +} +\dontrun{ +summ.GC.byDALin +} +\dontrun{ +summ.GC.byLin() +} +\dontrun{ +summ.GC() +} +\dontrun{ +total_counts(pspa - gc_lin_counts, 0, "GC") +} +} +\author{ +Samuel Chen, Janani Ravi +} diff --git a/man/summarize_bylin.Rd b/man/summarize_bylin.Rd deleted file mode 100644 index 92b93652..00000000 --- a/man/summarize_bylin.Rd +++ /dev/null @@ -1,36 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{summarize_bylin} -\alias{summarize_bylin} -\title{Summarize by Lineage} -\usage{ -summarize_bylin(prot = "prot", column = "DomArch", by = "Lineage", query) -} -\arguments{ -\item{prot}{A dataframe or tibble containing the data.} - -\item{column}{A string representing the column to be summarized -(e.g., \code{DomArch}). Default is "DomArch".} - -\item{by}{A string representing the grouping column (e.g., \code{Lineage}). -Default is "Lineage".} - -\item{query}{A string specifying the query pattern for filtering the target -column. Use "all" to skip filtering and include all rows.} -} -\value{ -A tibble summarizing the counts of occurrences of elements in -the \code{column}, grouped by the \code{by} column. The result includes the number -of occurrences (\code{count}) and is arranged in descending order of count. -} -\description{ -Summarize by Lineage -} -\examples{ -\dontrun{ -library(tidyverse) -tibble(DomArch = c("a+b", "a+b", "b+c", "a+b"), Lineage = c("l1", "l1", "l1", "l2")) |> - summarize_bylin(query = "all") -} - -} diff --git a/man/total_counts.Rd b/man/total_counts.Rd deleted file mode 100644 index 53d70096..00000000 --- a/man/total_counts.Rd +++ /dev/null @@ -1,58 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{total_counts} -\alias{total_counts} -\title{Total Counts} -\usage{ -total_counts( - prot, - column = "DomArch", - lineage_col = "Lineage", - cutoff = 90, - RowsCutoff = FALSE, - digits = 2 -) -} -\arguments{ -\item{prot}{A data frame that must contain columns: -\itemize{\item Either 'GenContext' or 'DomArch.norep' \item count}} - -\item{column}{Character. The column to summarize, default is "DomArch".} - -\item{lineage_col}{Character. The name of the lineage column, default is -"Lineage".} - -\item{cutoff}{Numeric. Cutoff for total count. Counts below this cutoff value -will not be shown. Default is 0.} - -\item{RowsCutoff}{Logical. If TRUE, filters based on cumulative percentage -cutoff. Default is FALSE.} - -\item{digits}{Numeric. Number of decimal places for percentage columns. -Default is 2.} -} -\value{ -A data frame with the following columns: -\itemize{ -\item \code{{{ column }}}: Unique values from the specified column. -\item \code{totalcount}: The total count of occurrences for each unique value in -the specified column. -\item \code{IndividualCountPercent}: The percentage of each \code{totalcount} relative to -the overall count. -\item \code{CumulativePercent}: The cumulative percentage of total counts. -} -} -\description{ -Creates a data frame with a totalcount column - -This function is designed to sum the counts column by either Genomic Context or Domain Architecture and creates a totalcount column from those sums. -} -\note{ -Please refer to the source code if you have alternate file formats and/or -column names. -} -\examples{ -\dontrun{ -total_counts(pspa - gc_lin_counts, 0, "GC") -} -} diff --git a/man/words2wc.Rd b/man/words2wc.Rd deleted file mode 100644 index 69d006d5..00000000 --- a/man/words2wc.Rd +++ /dev/null @@ -1,32 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{words2wc} -\alias{words2wc} -\title{Words 2 Word Counts} -\usage{ -words2wc(string) -} -\arguments{ -\item{string}{A character string containing the elements (words) to count. -This would typically be a space-delimited string representing domain -architectures or genomic contexts.} -} -\value{ -A tibble (tbl_df) with two columns: -\describe{ -\item{\code{words}}{A column containing the individual words -(domains or domain architectures).} -\item{\code{freq}}{A column containing the frequency counts for each word.} -} -} -\description{ -Get word counts (wc) \link{DOMAINS (DA) or DOMAIN ARCHITECTURES (GC)} -} -\examples{ -\dontrun{ -tibble::tibble(DomArch = c("aaa+bbb", "a+b", "b+c", "b-c")) |> - elements2words() |> - words2wc() -} - -} From 2da3d1a1eadb1c3d6f140700444e15db46c341d2 Mon Sep 17 00:00:00 2001 From: David Mayer Date: Fri, 11 Oct 2024 08:40:17 -0600 Subject: [PATCH 3/3] summarize.R adjustments - add back importFrom n_distinct() as it appears to be used by summarizeGenContext() - use function call as title -- may specify this in MolEvolvR style guide for consistency - adjust Rd grouping with MolEvolvR_summary @rdname tag for functions that had a clear summary element. This will hopefully avoid confusion with the rather ubiquitous dplyr::summarize - converted some code comments to placeholder descriptions --- NAMESPACE | 1 + R/summarize.R | 58 +++---- man/{summarize.Rd => MolEvolvR_summary.Rd} | 159 ++++---------------- man/countbycolumn.Rd | 26 +++- man/elements2Words.Rd | 25 ++- man/filterbydomains.Rd | 2 +- man/filterbyfrequency.Rd | 14 +- man/findparalogs.Rd | 2 +- man/summarizeDomArch.Rd | 22 --- man/summarizeDomArch_ByLineage.Rd | 22 --- man/summarizeGenContext.Rd | 22 --- man/summarizeGenContext_ByDomArchLineage.Rd | 22 --- man/summarizeGenContext_ByLineage.Rd | 22 --- man/summarizebylineage.Rd | 25 --- man/totalgencontextordomarchcounts.Rd | 42 ------ man/words2wordcounts.Rd | 13 +- 16 files changed, 122 insertions(+), 355 deletions(-) rename man/{summarize.Rd => MolEvolvR_summary.Rd} (52%) delete mode 100644 man/summarizeDomArch.Rd delete mode 100644 man/summarizeDomArch_ByLineage.Rd delete mode 100644 man/summarizeGenContext.Rd delete mode 100644 man/summarizeGenContext_ByDomArchLineage.Rd delete mode 100644 man/summarizeGenContext_ByLineage.Rd delete mode 100644 man/summarizebylineage.Rd delete mode 100644 man/totalgencontextordomarchcounts.Rd diff --git a/NAMESPACE b/NAMESPACE index 2326fc1f..53332439 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -139,6 +139,7 @@ importFrom(dplyr,if_else) importFrom(dplyr,left_join) importFrom(dplyr,mutate) importFrom(dplyr,n) +importFrom(dplyr,n_distinct) importFrom(dplyr,pull) importFrom(dplyr,relocate) importFrom(dplyr,right_join) diff --git a/R/summarize.R b/R/summarize.R index 321a0488..2816f174 100644 --- a/R/summarize.R +++ b/R/summarize.R @@ -10,7 +10,7 @@ # suppressPackageStartupMessages(library(rlang)) # conflicted::conflict_prefer("filter", "dplyr") -#' Filter by Domains +#' filterByDomains #' #' @author Samuel Chen, Janani Ravi #' @description filterByDomains filters a data frame by identifying exact domain matches @@ -29,7 +29,6 @@ #' #' @return Filtered data frame #' @note There is no need to make the domains 'regex safe', that will be handled by this function -#' @name summarize #' @export #' #' @examples @@ -89,9 +88,11 @@ filterByDomains <- function(prot, column = "DomArch", doms_keep = c(), doms_remo ## COUNTS of DAs and GCs ## ## Before/after break up ## ########################### -## Function to obtain element counts (DA, GC) -#' Count By Column -#' + +#' countByColumn +#' @description +#' Function to obtain element counts (DA, GC) +#' #' @param prot A data frame containing the dataset to analyze, typically with #' multiple columns including the one specified by the `column` parameter. #' @param column A character string specifying the name of the column to analyze. @@ -111,7 +112,6 @@ filterByDomains <- function(prot, column = "DomArch", doms_keep = c(), doms_remo #' The tibble is filtered to only include elements that have a frequency #' greater than or equal to `min.freq` and does not include elements with `NA` #' values or those starting with a hyphen ("-"). -#' @name summarize #' @export #' #' @examples @@ -131,7 +131,7 @@ countByColumn <- function(prot = prot, column = "DomArch", min.freq = 1) { return(counts) } -#' Elements 2 Words +#' elements2Words #' #' @description #' Break string ELEMENTS into WORDS for domain architecture (DA) and genomic @@ -156,7 +156,6 @@ countByColumn <- function(prot = prot, column = "DomArch", min.freq = 1) { #' @return A single string where elements are delimited by spaces. The function #' performs necessary substitutions based on the `conversion_type` and cleans up #' extraneous characters like newlines, tabs, and multiple spaces. -#' @name summarize #' #' @examples #' \dontrun{ @@ -196,7 +195,7 @@ elements2Words <- function(prot, column = "DomArch", conversion_type = "da2doms" return(z3) } -#' Words 2 Word Counts +#' words2WordCounts #' #' @description #' Get word counts (wc) [DOMAINS (DA) or DOMAIN ARCHITECTURES (GC)] @@ -215,7 +214,6 @@ elements2Words <- function(prot, column = "DomArch", conversion_type = "da2doms" #' \item{`freq`}{A column containing the frequency counts for each word.} #' } #' -#' @name summarize #' #' @examples #' \dontrun{ @@ -252,9 +250,11 @@ words2WordCounts <- function(string) { arrange(-freq) return(df_word_count) } -## Function to filter based on frequencies -#' Filter Frequency -#' + +#' filterByFrequency +#' @description +#' Function to filter based on frequencies +#' #' @param x A tibble (tbl_df) containing at least two columns: one for #' elements (e.g., `words`) and one for their frequency (e.g., `freq`). #' @param min.freq A numeric value specifying the minimum frequency threshold. @@ -263,7 +263,6 @@ words2WordCounts <- function(string) { #' #' @return A tibble with the same structure as `x`, but filtered to include #' only rows where the frequency is greater than or equal to `min.freq`. -#' @name summarize #' #' @export #' @@ -279,7 +278,14 @@ filterByFrequency <- function(x, min.freq) { ######################### ## SUMMARY FUNCTIONS #### ######################### -#' Summarize by Lineage +#' MolEvolvR Summary +#' @name MolEvolvR_summary +#' @description +#' A collection of summary functions for the MolEvolvR package. +#' +NULL + +#' summarizeByLineage #' #' @param prot A dataframe or tibble containing the data. #' @param column A string representing the column to be summarized @@ -295,7 +301,7 @@ filterByFrequency <- function(x, min.freq) { #' @return A tibble summarizing the counts of occurrences of elements in #' the `column`, grouped by the `by` column. The result includes the number #' of occurrences (`count`) and is arranged in descending order of count. -#' @name summarize +#' @rdname MolEvolvR_summary #' @export #' #' @examples @@ -341,7 +347,7 @@ summarizeByLineage <- function(prot = "prot", column = "DomArch", by = "Lineage" #' columns: `DomArch`, `Lineage`, and `count`, which indicates the frequency #' of each domain architecture for each lineage. The results are arranged in #' descending order of `count`. -#' @name summarize +#' @rdname MolEvolvR_summary #' #' @export #' @@ -357,7 +363,7 @@ summarizeDomArch_ByLineage <- function(x) { arrange(desc(count)) } -## Function to retrieve counts of how many lineages a DomArch appears in + #' summarizeDomArch #' #' @description @@ -375,7 +381,7 @@ summarizeDomArch_ByLineage <- function(x) { #' - `totallin`: The total number of unique lineages in which each `DomArch` #' appears. #' The results are arranged in descending order of `totallin` and `totalcount`. -#' @name summarize +#' @rdname MolEvolvR_summary #' @export #' #' @examples @@ -407,7 +413,7 @@ summarizeDomArch <- function(x) { #' `GenContext`, `DomArch`, and `Lineage`. #' #' The results are arranged in descending order of `count`. -#' @name summarize +#' @rdname MolEvolvR_summary #' @export #' #' @examples @@ -432,7 +438,7 @@ summarizeGenContext_ByDomArchLineage <- function(x) { #' @importFrom dplyr arrange desc filter group_by n summarise #' #' @return Describe return, in detail -#' @name summarize +#' @rdname MolEvolvR_summary #' @export #' #' @examples @@ -455,7 +461,7 @@ summarizeGenContext_ByLineage <- function(x) { #' @param x A dataframe or tibble containing the data. It must have columns #' named `GenContext`, `DomArch`, and `Lineage`. #' -#' @importFrom dplyr arrange desc filter group_by n summarise +#' @importFrom dplyr arrange desc filter group_by n n_distinct summarise #' #' @return A tibble summarizing each unique combination of `GenContext` and #' `Lineage`, along with the following columns: @@ -465,7 +471,7 @@ summarizeGenContext_ByLineage <- function(x) { #' `GenContext` and `Lineage`. #' #' The results are arranged in descending order of `count`. -#' @name summarize +#' @rdname MolEvolvR_summary #' @export #' #' @examples @@ -487,7 +493,7 @@ summarizeGenContext <- function(x) { ################## -#' Total Counts +#' totalGenContextOrDomArchCounts #' #' @description #' Creates a data frame with a totalcount column @@ -518,7 +524,7 @@ summarizeGenContext <- function(x) { #' - `IndividualCountPercent`: The percentage of each `totalcount` relative to #' the overall count. #' - `CumulativePercent`: The cumulative percentage of total counts. -#' @name summarize +#' @rdname MolEvolvR_summary #' @export #' #' @note Please refer to the source code if you have alternate file formats and/or @@ -670,7 +676,7 @@ totalGenContextOrDomArchCounts <- function(prot, column = "DomArch", lineage_col -#' Find Paralogs +#' findParalogs #' #' @description #' Creates a data frame of paralogs. diff --git a/man/summarize.Rd b/man/MolEvolvR_summary.Rd similarity index 52% rename from man/summarize.Rd rename to man/MolEvolvR_summary.Rd index f149f686..262c4719 100644 --- a/man/summarize.Rd +++ b/man/MolEvolvR_summary.Rd @@ -1,50 +1,29 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/summarize.R -\name{summarize} -\alias{summarize} -\alias{filter_by_doms} -\alias{count_bycol} -\alias{elements2words} -\alias{words2wc} -\alias{filter_freq} -\alias{summarize_bylin} -\alias{summ.DA.byLin} -\alias{summ.DA} -\alias{summ.GC.byDALin} -\alias{summ.GC.byLin} -\alias{summ.GC} -\alias{total_counts} -\title{Filter by Domains} +\name{MolEvolvR_summary} +\alias{MolEvolvR_summary} +\alias{summarizeByLineage} +\alias{summarizeDomArch_ByLineage} +\alias{summarizeDomArch} +\alias{summarizeGenContext_ByDomArchLineage} +\alias{summarizeGenContext_ByLineage} +\alias{summarizeGenContext} +\alias{totalGenContextOrDomArchCounts} +\title{MolEvolvR Summary} \usage{ -filter_by_doms( - prot, - column = "DomArch", - doms_keep = c(), - doms_remove = c(), - ignore.case = FALSE -) - -count_bycol(prot = prot, column = "DomArch", min.freq = 1) - -elements2words(prot, column = "DomArch", conversion_type = "da2doms") - -words2wc(string) +summarizeByLineage(prot = "prot", column = "DomArch", by = "Lineage", query) -filter_freq(x, min.freq) +summarizeDomArch_ByLineage(x) -summarize_bylin(prot = "prot", column = "DomArch", by = "Lineage", query) +summarizeDomArch(x) -summ.DA.byLin(x) +summarizeGenContext_ByDomArchLineage(x) -summ.DA(x) +summarizeGenContext_ByLineage(x) -summ.GC.byDALin(x) +summarizeGenContext(x) -summ.GC.byLin(x) - -summ.GC(x) - -total_counts( +totalGenContextOrDomArchCounts( prot, column = "DomArch", lineage_col = "Lineage", @@ -59,39 +38,15 @@ total_counts( \item{column}{Character. The column to summarize, default is "DomArch".} -\item{doms_keep}{Vector of domains that must be identified within column in order for -observation to be kept} - -\item{doms_remove}{Vector of domains that, if found within an observation, will be removed} - -\item{ignore.case}{Should the matching be non case sensitive} - -\item{min.freq}{A numeric value specifying the minimum frequency threshold. -Only elements with frequencies greater than or equal to this value will be -retained.} - -\item{conversion_type}{A character string specifying the type of conversion. -Two options are available: -\describe{ -\item{\code{da2doms}}{Convert domain architectures into individual domains by -replacing \code{+} symbols with spaces.} -\item{\code{gc2da}}{Convert genomic context into domain architectures by -replacing directional symbols (\verb{<-}, \verb{->}, and \code{|}) with spaces.} -}} - -\item{string}{A character string containing the elements (words) to count. -This would typically be a space-delimited string representing domain -architectures or genomic contexts.} - -\item{x}{A dataframe or tibble containing the data. It must have columns -named \code{GenContext}, \code{DomArch}, and \code{Lineage}.} - \item{by}{A string representing the grouping column (e.g., \code{Lineage}). Default is "Lineage".} \item{query}{A string specifying the query pattern for filtering the target column. Use "all" to skip filtering and include all rows.} +\item{x}{A dataframe or tibble containing the data. It must have columns +named \code{GenContext}, \code{DomArch}, and \code{Lineage}.} + \item{lineage_col}{Character. The name of the lineage column, default is "Lineage".} @@ -105,33 +60,6 @@ cutoff. Default is FALSE.} Default is 2.} } \value{ -Filtered data frame - -A tibble with two columns: -\describe{ -\item{\code{column}}{The unique elements from the specified column -(e.g., "DomArch").} -\item{\code{freq}}{The frequency of each element, i.e., the number of times -each element appears in the specified column.} -} -The tibble is filtered to only include elements that have a frequency -greater than or equal to \code{min.freq} and does not include elements with \code{NA} -values or those starting with a hyphen ("-"). - -A single string where elements are delimited by spaces. The function -performs necessary substitutions based on the \code{conversion_type} and cleans up -extraneous characters like newlines, tabs, and multiple spaces. - -A tibble (tbl_df) with two columns: -\describe{ -\item{\code{words}}{A column containing the individual words -(domains or domain architectures).} -\item{\code{freq}}{A column containing the frequency counts for each word.} -} - -A tibble with the same structure as \code{x}, but filtered to include -only rows where the frequency is greater than or equal to \code{min.freq}. - A tibble summarizing the counts of occurrences of elements in the \code{column}, grouped by the \code{by} column. The result includes the number of occurrences (\code{count}) and is arranged in descending order of count. @@ -187,13 +115,7 @@ the overall count. } } \description{ -filter_by_doms filters a data frame by identifying exact domain matches -and either keeping or removing rows with the identified domain - -Break string ELEMENTS into WORDS for domain architecture (DA) and genomic -context (GC) - -Get word counts (wc) \link{DOMAINS (DA) or DOMAIN ARCHITECTURES (GC)} +A collection of summary functions for the MolEvolvR package. Function to summarize and retrieve counts by Domains & Domains+Lineage @@ -204,57 +126,32 @@ Creates a data frame with a totalcount column This function is designed to sum the counts column by either Genomic Context or Domain Architecture and creates a totalcount column from those sums. } \note{ -There is no need to make the domains 'regex safe', that will be handled by this function - Please refer to the source code if you have alternate file formats and/or column names. } \examples{ \dontrun{ -filter_by_doms() -} -\dontrun{ -count_bycol(prot = my_data, column = "DomArch", min.freq = 10) -} -\dontrun{ -tibble::tibble(DomArch = c("aaa+bbb", -"a+b", "b+c", "b-c")) |> elements2words() -} - -\dontrun{ -tibble::tibble(DomArch = c("aaa+bbb", "a+b", "b+c", "b-c")) |> - elements2words() |> - words2wc() -} - -\dontrun{ -filter_freq() -} -\dontrun{ library(tidyverse) tibble(DomArch = c("a+b", "a+b", "b+c", "a+b"), Lineage = c("l1", "l1", "l1", "l2")) |> - summarize_bylin(query = "all") + summarizeByLineage(query = "all") } \dontrun{ -summ.DA.byLin() +summarizeDomArch_ByLineage() } \dontrun{ -summ.DA() +summarizeDomArch() } \dontrun{ -summ.GC.byDALin +summarizeGenContext_ByDomArchLineage } \dontrun{ -summ.GC.byLin() +summarizeGenContext_ByLineage() } \dontrun{ -summ.GC() +summarizeGenContext() } \dontrun{ -total_counts(pspa - gc_lin_counts, 0, "GC") -} +totalGenContextOrDomArchCounts(pspa - gc_lin_counts, 0, "GC") } -\author{ -Samuel Chen, Janani Ravi } diff --git a/man/countbycolumn.Rd b/man/countbycolumn.Rd index 34fcc3e0..57ff9ac4 100644 --- a/man/countbycolumn.Rd +++ b/man/countbycolumn.Rd @@ -2,21 +2,37 @@ % Please edit documentation in R/summarize.R \name{countByColumn} \alias{countByColumn} -\title{Count By Column} +\title{countByColumn} \usage{ countByColumn(prot = prot, column = "DomArch", min.freq = 1) } \arguments{ -\item{min.freq}{} +\item{prot}{A data frame containing the dataset to analyze, typically with +multiple columns including the one specified by the \code{column} parameter.} + +\item{column}{A character string specifying the name of the column to analyze. +The default is "DomArch".} + +\item{min.freq}{An integer specifying the minimum frequency an element must +have to be included in the output. Default is 1.} } \value{ -Describe return, in detail +A tibble with two columns: +\describe{ +\item{\code{column}}{The unique elements from the specified column +(e.g., "DomArch").} +\item{\code{freq}}{The frequency of each element, i.e., the number of times +each element appears in the specified column.} +} +The tibble is filtered to only include elements that have a frequency +greater than or equal to \code{min.freq} and does not include elements with \code{NA} +values or those starting with a hyphen ("-"). } \description{ -Count By Column +Function to obtain element counts (DA, GC) } \examples{ \dontrun{ -countByColumn() +countByColumn(prot = my_data, column = "DomArch", min.freq = 10) } } diff --git a/man/elements2Words.Rd b/man/elements2Words.Rd index 1094d363..bfd3071b 100644 --- a/man/elements2Words.Rd +++ b/man/elements2Words.Rd @@ -2,20 +2,30 @@ % Please edit documentation in R/summarize.R \name{elements2Words} \alias{elements2Words} -\title{Elements 2 Words} +\title{elements2Words} \usage{ elements2Words(prot, column = "DomArch", conversion_type = "da2doms") } \arguments{ -\item{prot}{\link{dataframe}} +\item{prot}{A dataframe containing the dataset to analyze. The specified +\code{column} contains the string elements to be processed.} -\item{column}{\link{string} column name} +\item{column}{A character string specifying the name of the column to analyze. +Default is "DomArch".} -\item{conversion_type}{\link{string} type of conversion: 'da2doms': domain architectures to -domains. 'gc2da' genomic context to domain architectures} +\item{conversion_type}{A character string specifying the type of conversion. +Two options are available: +\describe{ +\item{\code{da2doms}}{Convert domain architectures into individual domains by +replacing \code{+} symbols with spaces.} +\item{\code{gc2da}}{Convert genomic context into domain architectures by +replacing directional symbols (\verb{<-}, \verb{->}, and \code{|}) with spaces.} +}} } \value{ -\link{string} with words delimited by spaces +A single string where elements are delimited by spaces. The function +performs necessary substitutions based on the \code{conversion_type} and cleans up +extraneous characters like newlines, tabs, and multiple spaces. } \description{ Break string ELEMENTS into WORDS for domain architecture (DA) and genomic @@ -23,7 +33,8 @@ context (GC) } \examples{ \dontrun{ -tibble::tibble(DomArch = c("aaa+bbb", "a+b", "b+c", "b-c")) |> elements2Words() +tibble::tibble(DomArch = c("aaa+bbb", +"a+b", "b+c", "b-c")) |> elements2Words() } } diff --git a/man/filterbydomains.Rd b/man/filterbydomains.Rd index 8c885363..afb3e5cb 100644 --- a/man/filterbydomains.Rd +++ b/man/filterbydomains.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/summarize.R \name{filterByDomains} \alias{filterByDomains} -\title{Filter by Domains} +\title{filterByDomains} \usage{ filterByDomains( prot, diff --git a/man/filterbyfrequency.Rd b/man/filterbyfrequency.Rd index d2c5f9cd..15d06d67 100644 --- a/man/filterbyfrequency.Rd +++ b/man/filterbyfrequency.Rd @@ -2,18 +2,24 @@ % Please edit documentation in R/summarize.R \name{filterByFrequency} \alias{filterByFrequency} -\title{Filter Frequency} +\title{filterByFrequency} \usage{ filterByFrequency(x, min.freq) } \arguments{ -\item{min.freq}{} +\item{x}{A tibble (tbl_df) containing at least two columns: one for +elements (e.g., \code{words}) and one for their frequency (e.g., \code{freq}).} + +\item{min.freq}{A numeric value specifying the minimum frequency threshold. +Only elements with frequencies greater than or equal to this value will be +retained.} } \value{ -Describe return, in detail +A tibble with the same structure as \code{x}, but filtered to include +only rows where the frequency is greater than or equal to \code{min.freq}. } \description{ -Filter Frequency +Function to filter based on frequencies } \examples{ \dontrun{ diff --git a/man/findparalogs.Rd b/man/findparalogs.Rd index 4b5edbcf..d92edf71 100644 --- a/man/findparalogs.Rd +++ b/man/findparalogs.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/summarize.R \name{findParalogs} \alias{findParalogs} -\title{Find Paralogs} +\title{findParalogs} \usage{ findParalogs(prot) } diff --git a/man/summarizeDomArch.Rd b/man/summarizeDomArch.Rd deleted file mode 100644 index 11db1afa..00000000 --- a/man/summarizeDomArch.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{summarizeDomArch} -\alias{summarizeDomArch} -\title{summarizeDomArch} -\usage{ -summarizeDomArch(x) -} -\arguments{ -\item{x}{} -} -\value{ -Describe return, in detail -} -\description{ -Function to retrieve counts of how many lineages a DomArch appears in -} -\examples{ -\dontrun{ -summarizeDomArch() -} -} diff --git a/man/summarizeDomArch_ByLineage.Rd b/man/summarizeDomArch_ByLineage.Rd deleted file mode 100644 index cf5fac22..00000000 --- a/man/summarizeDomArch_ByLineage.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{summarizeDomArch_ByLineage} -\alias{summarizeDomArch_ByLineage} -\title{summarizeDomArch_ByLineage} -\usage{ -summarizeDomArch_ByLineage(x) -} -\arguments{ -\item{x}{} -} -\value{ -Describe return, in detail -} -\description{ -Function to summarize and retrieve counts by Domains & Domains+Lineage -} -\examples{ -\dontrun{ -summarizeDomArch_ByLineage() -} -} diff --git a/man/summarizeGenContext.Rd b/man/summarizeGenContext.Rd deleted file mode 100644 index 5a40811b..00000000 --- a/man/summarizeGenContext.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{summarizeGenContext} -\alias{summarizeGenContext} -\title{summarizeGenContext} -\usage{ -summarizeGenContext(x) -} -\arguments{ -\item{x}{} -} -\value{ -Describe return, in detail -} -\description{ -summarizeGenContext -} -\examples{ -\dontrun{ -summarizeGenContext() -} -} diff --git a/man/summarizeGenContext_ByDomArchLineage.Rd b/man/summarizeGenContext_ByDomArchLineage.Rd deleted file mode 100644 index 59e0376e..00000000 --- a/man/summarizeGenContext_ByDomArchLineage.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{summarizeGenContext_ByDomArchLineage} -\alias{summarizeGenContext_ByDomArchLineage} -\title{summarizeGenContext_ByDomArchLineage} -\usage{ -summarizeGenContext_ByDomArchLineage(x) -} -\arguments{ -\item{x}{} -} -\value{ -Define return, in detail -} -\description{ -summarizeGenContext_ByDomArchLineage -} -\examples{ -\dontrun{ -summarizeGenContext_ByDomArchLineage -} -} diff --git a/man/summarizeGenContext_ByLineage.Rd b/man/summarizeGenContext_ByLineage.Rd deleted file mode 100644 index 932fe6a7..00000000 --- a/man/summarizeGenContext_ByLineage.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{summarizeGenContext_ByLineage} -\alias{summarizeGenContext_ByLineage} -\title{summarizeGenContext_ByLineage} -\usage{ -summarizeGenContext_ByLineage(x) -} -\arguments{ -\item{x}{} -} -\value{ -Describe return, in detail -} -\description{ -summarizeGenContext_ByLineage -} -\examples{ -\dontrun{ -summarizeGenContext_ByLineage() -} -} diff --git a/man/summarizebylineage.Rd b/man/summarizebylineage.Rd deleted file mode 100644 index 2e445913..00000000 --- a/man/summarizebylineage.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{summarizeByLineage} -\alias{summarizeByLineage} -\title{Summarize by Lineage} -\usage{ -summarizeByLineage(prot = "prot", column = "DomArch", by = "Lineage", query) -} -\arguments{ -\item{query}{} -} -\value{ -Describe return, in detail -} -\description{ -Summarize by Lineage -} -\examples{ -\dontrun{ -library(tidyverse) -tibble(DomArch = c("a+b", "a+b", "b+c", "a+b"), Lineage = c("l1", "l1", "l1", "l2")) |> - summarizeByLineage(query = "all") -} - -} diff --git a/man/totalgencontextordomarchcounts.Rd b/man/totalgencontextordomarchcounts.Rd deleted file mode 100644 index f457cb6a..00000000 --- a/man/totalgencontextordomarchcounts.Rd +++ /dev/null @@ -1,42 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarize.R -\name{totalGenContextOrDomArchCounts} -\alias{totalGenContextOrDomArchCounts} -\title{Total Counts} -\usage{ -totalGenContextOrDomArchCounts( - prot, - column = "DomArch", - lineage_col = "Lineage", - cutoff = 90, - RowsCutoff = FALSE, - digits = 2 -) -} -\arguments{ -\item{prot}{A data frame that must contain columns: -\itemize{\item Either 'GenContext' or 'DomArch.norep' \item count}} - -\item{column}{Character. The column to summarize} - -\item{cutoff}{Numeric. Cutoff for total count. Counts below cutoff value will not be shown. Default is 0.} - -\item{digits}{} -} -\value{ -Define return, in detail -} -\description{ -Creates a data frame with a totalcount column - -This function is designed to sum the counts column by either Genomic Context or Domain Architecture and creates a totalcount column from those sums. -} -\note{ -Please refer to the source code if you have alternate file formats and/or -column names. -} -\examples{ -\dontrun{ -totalGenContextOrDomArchCounts(pspa - gc_lin_counts, 0, "GC") -} -} diff --git a/man/words2wordcounts.Rd b/man/words2wordcounts.Rd index 7f60f226..370dec7f 100644 --- a/man/words2wordcounts.Rd +++ b/man/words2wordcounts.Rd @@ -2,15 +2,22 @@ % Please edit documentation in R/summarize.R \name{words2WordCounts} \alias{words2WordCounts} -\title{Words 2 Word Counts} +\title{words2WordCounts} \usage{ words2WordCounts(string) } \arguments{ -\item{string}{} +\item{string}{A character string containing the elements (words) to count. +This would typically be a space-delimited string representing domain +architectures or genomic contexts.} } \value{ -\link{tbl_df} table with 2 columns: 1) words & 2) counts/frequency +A tibble (tbl_df) with two columns: +\describe{ +\item{\code{words}}{A column containing the individual words +(domains or domain architectures).} +\item{\code{freq}}{A column containing the frequency counts for each word.} +} } \description{ Get word counts (wc) \link{DOMAINS (DA) or DOMAIN ARCHITECTURES (GC)}