Skip to content

Commit

Permalink
Merge pull request #41 from awasyn/docssummarize
Browse files Browse the repository at this point in the history
Add parameter definitions to summarize.R
  • Loading branch information
the-mayer authored Oct 11, 2024
2 parents b96198e + 4cbf875 commit 4d58e59
Show file tree
Hide file tree
Showing 15 changed files with 369 additions and 248 deletions.
201 changes: 151 additions & 50 deletions R/summarize.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
# suppressPackageStartupMessages(library(rlang))
# conflicted::conflict_prefer("filter", "dplyr")

#' Filter by Domains
#' filterByDomains
#'
#' @author Samuel Chen, Janani Ravi
#' @description filterByDomains filters a data frame by identifying exact domain matches
Expand Down Expand Up @@ -88,21 +88,35 @@ filterByDomains <- function(prot, column = "DomArch", doms_keep = c(), doms_remo
## COUNTS of DAs and GCs ##
## Before/after break up ##
###########################
## Function to obtain element counts (DA, GC)
#' Count By Column
#'
#' @param prot
#' @param column
#' @param min.freq

#' countByColumn
#' @description
#' Counts how often each element (e.g., a domain architecture, DA, or a genomic
#' context, GC) occurs in the specified column.
#'
#' @param prot A data frame containing the dataset to analyze, typically with
#' multiple columns including the one specified by the `column` parameter.
#' @param column A character string specifying the name of the column to analyze.
#' The default is "DomArch".
#' @param min.freq An integer specifying the minimum frequency an element must
#' have to be included in the output. Default is 1.
#'
#' @importFrom dplyr arrange as_tibble filter select
#'
#' @return Describe return, in detail
#' @return A tibble with two columns:
#' \describe{
#' \item{`column`}{The unique elements from the specified column
#' (e.g., "DomArch").}
#' \item{`freq`}{The frequency of each element, i.e., the number of times
#' each element appears in the specified column.}
#' }
#' The tibble is filtered to only include elements that have a frequency
#' greater than or equal to `min.freq` and does not include elements with `NA`
#' values or those starting with a hyphen ("-").
#' @export
#'
#' @examples
#' \dontrun{
#' countByColumn()
#' countByColumn(prot = my_data, column = "DomArch", min.freq = 10)
#' }
countByColumn <- function(prot = prot, column = "DomArch", min.freq = 1) {
counts <- prot %>%
Expand All @@ -117,25 +131,36 @@ countByColumn <- function(prot = prot, column = "DomArch", min.freq = 1) {
return(counts)
}

#' Elements 2 Words
#' elements2Words
#'
#' @description
#' Break string ELEMENTS into WORDS for domain architecture (DA) and genomic
#' context (GC)
#'
#' @param prot [dataframe]
#' @param column [string] column name
#' @param conversion_type [string] type of conversion: 'da2doms': domain architectures to
#' domains. 'gc2da' genomic context to domain architectures
#' @param prot A dataframe containing the dataset to analyze. The specified
#' `column` contains the string elements to be processed.
#' @param column A character string specifying the name of the column to analyze.
#' Default is "DomArch".
#' @param conversion_type A character string specifying the type of conversion.
#' Two options are available:
#' \describe{
#' \item{`da2doms`}{Convert domain architectures into individual domains by
#' replacing `+` symbols with spaces.}
#' \item{`gc2da`}{Convert genomic context into domain architectures by
#' replacing directional symbols (`<-`, `->`, and `|`) with spaces.}
#' }
#'
#' @importFrom dplyr pull
#' @importFrom stringr str_replace_all
#'
#' @return [string] with words delimited by spaces
#' @return A single string where elements are delimited by spaces. The function
#' performs necessary substitutions based on the `conversion_type` and cleans up
#' extraneous characters like newlines, tabs, and multiple spaces.
#'
#' @examples
#' \dontrun{
#' tibble::tibble(DomArch = c("aaa+bbb", "a+b", "b+c", "b-c")) |> elements2Words()
#' tibble::tibble(DomArch = c("aaa+bbb",
#' "a+b", "b+c", "b-c")) |> elements2Words()
#' }
#'
elements2Words <- function(prot, column = "DomArch", conversion_type = "da2doms") {
Expand Down Expand Up @@ -170,16 +195,25 @@ elements2Words <- function(prot, column = "DomArch", conversion_type = "da2doms"
return(z3)
}

#' Words 2 Word Counts
#' words2WordCounts
#'
#' @description
#' Get word counts (wc) [DOMAINS (DA) or DOMAIN ARCHITECTURES (GC)]
#'
#' @param string
#' @param string A character string containing the elements (words) to count.
#' This would typically be a space-delimited string representing domain
#' architectures or genomic contexts.
#'
#' @importFrom dplyr as_tibble filter
#' @importFrom dplyr as_tibble filter arrange
#' @importFrom stringr str_replace_all
#'
#' @return [tbl_df] table with 2 columns: 1) words & 2) counts/frequency
#' @return A tibble (tbl_df) with two columns:
#' \describe{
#' \item{`words`}{A column containing the individual words
#' (domains or domain architectures).}
#' \item{`freq`}{A column containing the frequency counts for each word.}
#' }
#'
#'
#' @examples
#' \dontrun{
Expand Down Expand Up @@ -216,13 +250,20 @@ words2WordCounts <- function(string) {
arrange(-freq)
return(df_word_count)
}
## Function to filter based on frequencies
#' Filter Frequency

#' filterByFrequency
#' @description
#' Filters a frequency table, keeping only the rows whose frequency is greater
#' than or equal to `min.freq`.
#'
#' @param x A tibble (tbl_df) containing at least two columns: one for
#' elements (e.g., `words`) and one for their frequency (e.g., `freq`).
#' @param min.freq A numeric value specifying the minimum frequency threshold.
#' Only elements with frequencies greater than or equal to this value will be
#' retained.
#'
#' @param x
#' @param min.freq
#' @return A tibble with the same structure as `x`, but filtered to include
#' only rows where the frequency is greater than or equal to `min.freq`.
#'
#' @return Describe return, in detail
#' @export
#'
#' @examples
Expand All @@ -237,17 +278,30 @@ filterByFrequency <- function(x, min.freq) {
#########################
## SUMMARY FUNCTIONS ####
#########################
#' Summarize by Lineage
#' MolEvolvR Summary
#' @name MolEvolvR_summary
#' @description
#' A collection of summary functions for the MolEvolvR package.
#'
NULL

#' summarizeByLineage
#'
#' @param prot
#' @param column
#' @param by
#' @param query
#' @param prot A dataframe or tibble containing the data.
#' @param column A string representing the column to be summarized
#' (e.g., `DomArch`). Default is "DomArch".
#' @param by A string representing the grouping column (e.g., `Lineage`).
#' Default is "Lineage".
#' @param query A string specifying the query pattern for filtering the target
#' column. Use "all" to skip filtering and include all rows.
#'
#' @importFrom dplyr arrange filter group_by summarise
#' @importFrom rlang sym
#'
#' @return Describe return, in detail
#' @return A tibble summarizing the counts of occurrences of elements in
#' the `column`, grouped by the `by` column. The result includes the number
#' of occurrences (`count`) and is arranged in descending order of count.
#' @rdname MolEvolvR_summary
#' @export
#'
#' @examples
Expand Down Expand Up @@ -283,11 +337,18 @@ summarizeByLineage <- function(prot = "prot", column = "DomArch", by = "Lineage"
#' Function to summarize and retrieve counts by Domains & Domains+Lineage
#'
#'
#' @param x
#' @param x A dataframe or tibble containing the data. It must have columns
#' named `DomArch` and `Lineage`.
#'
#' @importFrom dplyr arrange count desc filter group_by summarise
#'
#' @return Describe return, in detail
#' @return A tibble summarizing the counts of unique domain architectures
#' (`DomArch`) per lineage (`Lineage`). The resulting table contains three
#' columns: `DomArch`, `Lineage`, and `count`, which indicates the frequency
#' of each domain architecture for each lineage. The results are arranged in
#' descending order of `count`.
#' @rdname MolEvolvR_summary
#'
#' @export
#'
#' @examples
Expand All @@ -302,17 +363,25 @@ summarizeDomArch_ByLineage <- function(x) {
arrange(desc(count))
}

## Function to retrieve counts of how many lineages a DomArch appears in

#' summarizeDomArch
#'
#' @description
#' Function to retrieve counts of how many lineages a DomArch appears in
#'
#' @param x
#' @param x A dataframe or tibble containing the data. It must have a column
#' named `DomArch` and a count column, such as `count`, which represents the
#' occurrences of each architecture in various lineages.
#'
#' @importFrom dplyr arrange group_by filter summarise
#'
#' @return Describe return, in detail
#' @return A tibble summarizing each unique `DomArch`, along with the following
#' columns:
#' - `totalcount`: The total occurrences of each `DomArch` across all lineages.
#' - `totallin`: The total number of unique lineages in which each `DomArch`
#'   appears.
#'
#' The results are arranged in descending order of `totallin` and `totalcount`.
#' @rdname MolEvolvR_summary
#' @export
#'
#' @examples
Expand All @@ -330,11 +399,21 @@ summarizeDomArch <- function(x) {

#' summarizeGenContext_ByDomArchLineage
#'
#' @param x
#' @param x A dataframe or tibble containing the data. It must have columns
#' named `GenContext`, `DomArch`, and `Lineage`.
#'
#' @importFrom dplyr arrange desc filter group_by n summarise
#'
#' @return Define return, in detail
#' @return A tibble summarizing each unique combination of `GenContext`,
#' `DomArch`, and `Lineage`, along with the following columns:
#' - `GenContext`: The genomic context for each entry.
#' - `DomArch`: The domain architecture for each entry.
#' - `Lineage`: The lineage associated with each entry.
#' - `count`: The total number of occurrences for each combination of
#' `GenContext`, `DomArch`, and `Lineage`.
#'
#' The results are arranged in descending order of `count`.
#' @rdname MolEvolvR_summary
#' @export
#'
#' @examples
Expand All @@ -354,11 +433,12 @@ summarizeGenContext_ByDomArchLineage <- function(x) {

#' summarizeGenContext_ByLineage
#'
#' @param x
#' @param x A dataframe or tibble containing the data.
#'
#' @importFrom dplyr arrange desc filter group_by n summarise
#'
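#' @return A tibble of genomic-context counts summarized by lineage.
#' TODO(review): list the exact output columns and sort order after confirming
#' against the function body.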
#' @rdname MolEvolvR_summary
#' @export
#'
#' @examples
Expand All @@ -378,11 +458,20 @@ summarizeGenContext_ByLineage <- function(x) {

#' summarizeGenContext
#'
#' @param x
#' @param x A dataframe or tibble containing the data. It must have columns
#' named `GenContext`, `DomArch`, and `Lineage`.
#'
#' @importFrom dplyr arrange desc filter group_by n_distinct summarise
#' @importFrom dplyr arrange desc filter group_by n n_distinct summarise
#'
#' @return Describe return, in detail
#' @return A tibble summarizing each unique combination of `GenContext` and
#' `Lineage`, along with the following columns:
#' - `GenContext`: The genomic context for each entry.
#' - `Lineage`: The lineage associated with each entry.
#' - `count`: The total number of occurrences for each combination of
#' `GenContext` and `Lineage`.
#'
#' The results are arranged in descending order of `count`.
#' @rdname MolEvolvR_summary
#' @export
#'
#' @examples
Expand All @@ -404,7 +493,7 @@ summarizeGenContext <- function(x) {


##################
#' Total Counts
#' totalGenContextOrDomArchCounts
#'
#' @description
#' Creates a data frame with a totalcount column
Expand All @@ -414,16 +503,28 @@ summarizeGenContext <- function(x) {
#'
#' @param prot A data frame that must contain columns:
#' \itemize{\item Either 'GenContext' or 'DomArch.norep' \item count}
#' @param column Character. The column to summarize
#' @param lineage_col
#' @param cutoff Numeric. Cutoff for total count. Counts below cutoff value will not be shown. Default is 0.
#' @param RowsCutoff
#' @param digits
#' @param column Character. The column to summarize, default is "DomArch".
#' @param lineage_col Character. The name of the lineage column, default is
#' "Lineage".
#' @param cutoff Numeric. Cutoff for total count. Counts below this cutoff value
#' will not be shown. Default is 0.
#' @param RowsCutoff Logical. If TRUE, filters based on cumulative percentage
#' cutoff. Default is FALSE.
#' @param digits Numeric. Number of decimal places for percentage columns.
#' Default is 2.
#'
#'
#' @importFrom dplyr arrange distinct filter group_by left_join mutate select summarise ungroup
#' @importFrom rlang as_string sym
#'
#' @return Define return, in detail
#' @return A data frame with the following columns:
#' - `{{ column }}`: Unique values from the specified column.
#' - `totalcount`: The total count of occurrences for each unique value in
#' the specified column.
#' - `IndividualCountPercent`: The percentage of each `totalcount` relative to
#' the overall count.
#' - `CumulativePercent`: The cumulative percentage of total counts.
#' @rdname MolEvolvR_summary
#' @export
#'
#' @note Please refer to the source code if you have alternate file formats and/or
Expand Down Expand Up @@ -575,7 +676,7 @@ totalGenContextOrDomArchCounts <- function(prot, column = "DomArch", lineage_col



#' Find Paralogs
#' findParalogs
#'
#' @description
#' Creates a data frame of paralogs.
Expand Down
Loading

0 comments on commit 4d58e59

Please sign in to comment.