#' Check if gene names in set 1 are present in set 2
#'
#' @param ref_ids Character vector of reference gene set.
#' @param test_ids Character vector of test gene set.
#' @param setnames Character vector of length 2 with set names, used only
#' to build the error message (first element describes \code{ref_ids},
#' second element describes \code{test_ids}).
#' Default: \code{c("gene pairs", "CDS")}
#'
#' @return TRUE if every ID in \code{ref_ids} is present in \code{test_ids}
#' (including the case where \code{ref_ids} is empty), otherwise an error
#' is raised.
#' @importFrom utils head
#' @details
#' This internal function can be used, for instance, to check if CDS names
#' match gene IDs in the gene pair list.
#' @noRd
check_geneid_match <- function(
    ref_ids, test_ids, setnames = c("gene pairs", "CDS")
) {

    mismatch_ids <- ref_ids[!ref_ids %in% test_ids]
    n_mismatch <- length(mismatch_ids)

    # Decide on the mismatch COUNT, not on the rounded percentage: with large
    # gene sets, a handful of missing IDs rounds to 0% and would otherwise be
    # silently accepted. This also avoids 0/0 (NaN) when `ref_ids` is empty.
    if (n_mismatch > 0) {
        mismatch_perc <- round(n_mismatch / length(ref_ids) * 100, 2)

        # Avoid reporting a misleading "0%" for very small fractions
        perc_label <- if (mismatch_perc < 0.01) "<0.01" else mismatch_perc

        stop(
            perc_label, "%", " (N=", n_mismatch, ") of the IDs in ",
            setnames[1], " were not found in ", setnames[2], ".\n",
            "All gene IDs in ", setnames[1], " must be in ", setnames[2],
            ". Did you check if gene IDs match?",
            "\n\nHere are some examples of nonmatching IDs (from ",
            setnames[1], ") :\n",
            paste0(utils::head(mismatch_ids, n = 5), collapse = "\n"),
            "\n\nAnd here are some examples of IDs in ", setnames[2], ":\n",
            paste0(utils::head(test_ids, n = 5), collapse = "\n"),
            call. = FALSE
        )
    }

    return(TRUE)
}
c2cef92..0000000 --- a/dev/01_create_pkg.R +++ /dev/null @@ -1,72 +0,0 @@ -## ******************** -## Create the R package -## ******************** - -## To get started, install R from https://cran.r-project.org/ -## and RStudio Desktop https://rstudio.com/products/rstudio/download/#download -## You can install both of them for free. - -## Next, open RStudio as the code that will run benefits from running inside -## RStudio for interactivity purposes. - -## Next, you might need to install several R packages that you can install with -## the following code: -if (!requireNamespace("remotes", quietly = TRUE)) { - install.packages("remotes") -} -remotes::install_cran( - c( - "available", - "BiocManager", - "biocthis", - "devtools", - "knitr", - "pkgdown", - "RefManageR", - "rmarkdown", - "rstudioapi", - "sessioninfo", - "styler", - "usethis" - ) -) -if (!requireNamespace("BiocStyle", quietly = TRUE)) { - BiocManager::install("BiocStyle") -} -## In case you want the development version of biocthis from GitHub -# BiocManager::install("lcolladotor/biocthis") - -## Here's a very quick summary of why these packages are useful: -## * available: to check the name of your package -## * BiocManager: to install Bioconductor packages -## * BiocStyle: for styling your vignette and linking to other packages -## * devtools: to develop R packages -## * knitr: for making your vignette -## * pkgdown: for creating beautiful documentation websites -## * RefManageR: for citing utilities in your package vignette -## * rmarkdown: for making the README.md and processing your vignette -## * remotes: for installing R packages from several locations -## * rstudioapi: for navigating across files in RStudio -## * sessioninfo: for detailed R session information useful to you and your users -## * usethis: for creating templates that will jump start your R package work - - -## Package names have some properties. 
You can also use: -available::available("doubletrouble") -## to check that your package name is not taken and that it doesn't have -## a meaning that you might not be aware of. - -usethis::create_package("doubletrouble") -## This opens a new window in RStudio - -## Note: If you create packages frequently, check the help file for -## usethis::use_description() for more information on how to set some R author -## defaults. - -## Add package development files from biocthis -biocthis::use_bioc_pkg_templates() - -## Move to the next step: setting up Git and GitHub for your package -rstudioapi::navigateToFile(usethis::proj_path("dev", "02_git_github_setup.R")) - -## This template was made using https://lcolladotor.github.io/biocthis/ diff --git a/dev/02_git_github_setup.R b/dev/02_git_github_setup.R deleted file mode 100644 index 9260dcc..0000000 --- a/dev/02_git_github_setup.R +++ /dev/null @@ -1,50 +0,0 @@ -## Did you miss the previous step? The one about creating your package -rstudioapi::navigateToFile(usethis::proj_path("dev", "01_create_pkg.R")) - -## ******************** -## Setup Git and GitHub -## ******************** - -## Note that Bioconductor doesn't allow *.Rproj files -## So we have to ignore it before anything else -usethis::use_git_ignore("*.Rproj") -usethis::use_git() ## Choose the option to make the commit, then to restart RStudio - -## After the restart, continue by connecting your local git repository to -## GitHub. You might want to use the `organisation` and `private` arguments -args(usethis::use_github) - -## If this is your first time running use_github(), you might have to also run: -usethis::gh_token_help() -## The above command will suggest that you read more at -## https://usethis.r-lib.org/articles/articles/git-credentials.html -## which contains the latest recommendations by the usethis authors for -## configuring your R to GitHub connection. 
-usethis::create_github_token() -gitcreds::gitcreds_set() -## Type your GitHub token, not your password! Otherwise you might run into this -## problem: https://github.com/r-lib/usethis/issues/1347 - -## In some situations, gitcreds::gitcreds_set() might not work. For example, -## gitcreds::gitcreds_set() is not supported on Linux as discussed at -## https://github.com/r-lib/gitcreds/issues/29. In these situations, -## you have to rely on the old workflow of editing your -## .Renviron file with contents like (note the empty line at the end!): -# GITHUB_PAT=YOUR_40_CHARACTERS_TOKEN -# -usethis::edit_r_environ() -## Then re-start your R session. -rstudioapi::restartSession() -## Editing the .Renviron is strongly discouraged now since it stores as -## simple text your GitHub personal access token (PAT) instead of the -## more secure approach provided by gitcreds. - -## Now run use_github() -usethis::use_github() -## Follow any prompts, such as running on the terminal: -## git push --set-upstream origin master - -## Move to the next step: setting up your package core files -rstudioapi::navigateToFile(usethis::proj_path("dev", "03_core_files.R")) - -## This template was made using https://lcolladotor.github.io/biocthis/ diff --git a/dev/03_core_files.R b/dev/03_core_files.R deleted file mode 100644 index db8f20a..0000000 --- a/dev/03_core_files.R +++ /dev/null @@ -1,108 +0,0 @@ -## Did you miss the previous step? 
The one about setting up Git and GitHub -rstudioapi::navigateToFile(usethis::proj_path("dev", "02_git_github_setup.R")) - -## *********************************************************** -## Setup the core files for your Bioconductor-friendly package -## *********************************************************** - -## Edit your package DESCRIPTION file -## Check http://r-pkgs.had.co.nz/description.html for details -## as well as http://bioconductor.org/developers/package-guidelines/#description - -## Check https://github.com/lcolladotor/biocthis/blob/master/DESCRIPTION -## for an example. - -## You'll at least want to edit the version to 0.99.0 (or lower) since that's -## the version number you will need to use with Bioconductor. - -## You will also want to add the biocViews field, for example: -## biocViews: Software - -## Many Bioconductor packages use the following license: -## license: Artistic-2.0 - -## You might want to add the Date field as well, which is used when creating -## the package citation information. Use the YYYY-MM-DD format. For example: -## Date: 2020-04-29 - -## This function sets all these defaults for you -biocthis::use_bioc_description() -## However, you still need to edit parts of it manually -rstudioapi::navigateToFile(usethis::proj_path("DESCRIPTION")) - -## Create your README.Rmd file -biocthis::use_bioc_readme_rmd() -devtools::build_readme() - -## Edit accordingly. You might want to install your package also using -## devtools::build() or the RStudio keyboard shortcut: -## macoS: command + shift + B -## Windows: control + shift + B - -## Click on the `knit` button on your README.Rmd file to create the README.md -## file. - -## Add a NEWS.md file -## See http://bioconductor.org/developers/package-guidelines/#news for more -## details about Bioconductor news files. 
-biocthis::use_bioc_news_md() - -## Add information for users and contributors -biocthis::use_bioc_coc() -usethis::use_tidy_contributing() -biocthis::use_bioc_support() -biocthis::use_bioc_issue_template() -biocthis::use_bioc_citation() - -## Add badges to the README.Rmd file -usethis::use_lifecycle_badge("Experimental") -usethis::use_bioc_badge() -## NOTE: If your Bioconductor package is an experiment, annotation or workflow -## package, you will need to edit the resulting badge! -usethis::use_github_actions_badge("R-CMD-check-bioc") - -## Enable using tests -usethis::use_testthat() -usethis::use_test("duplication_classification") -usethis::use_test("ka_ks_analyses") -usethis::use_coverage() - -## Re-knit your README.Rmd file to update your README.md file -devtools::build_readme() - -## Add a vignette template -pkg <- basename(usethis::proj_get()) -biocthis::use_bioc_vignette(pkg, paste("Introduction to", pkg)) - -## Add a Bioconductor-friendly GitHub actions workflow to check your package -biocthis::use_bioc_github_action() -## If: -## * your package doesn't have testthat tests, change to: has_testthat = 'false' -## * you don't want to run the covr step, change to: run_covr = 'false' -## * you don't want to use pkgdown, change to: run_pkgdown = 'false -rstudioapi::navigateToFile(usethis::proj_path(".github", "workflows", "check-bioc.yml")) - -## Setup up your global git config -usethis::edit_git_config() -## Use the information that matches your GitHub account -## Example contents (4 space indentation): -# [user] -# name = Your Full Name -# email = your.email@somewhere.com -# - -## ************************* WARNING START ********************************* -## WARNING: git commit before running this next function! -## Otherwise you can lose your work!!! -## ************************* WARNING END *********************************** -## -## Deploy with pkgdown at least once locally such that the automatic updates -## from GitHub actions will work. 
This creates the gh-pages branch in your -## GitHub repository in such a way that pkgdown will recognize it and be -## able to use it later. -pkgdown::deploy_to_branch() ## Check the WARNING above before running this! - -## Move to the next step: updating your package code before a "git commit" -rstudioapi::navigateToFile(usethis::proj_path("dev", "04_update.R")) - -## This template was made using https://lcolladotor.github.io/biocthis/ diff --git a/dev/04_update.R b/dev/04_update.R deleted file mode 100644 index 2dbaf25..0000000 --- a/dev/04_update.R +++ /dev/null @@ -1,31 +0,0 @@ -## Did you miss the previous step? The one about setting up the R package core -## files. -rstudioapi::navigateToFile(usethis::proj_path("dev", "03_core_files.R")) - -## ******************************************** -## Update your package code before a git commit -## ******************************************** - -## Automatically re-style the code in your package to a Bioconductor-friendly -## format -## Note that you can pair this function with the RStudio "Reformat code" -## button on the magic wand drop down menu. The keyboard shortcut is -## macOS: shift + command + A -## Windows: shift + control + A -styler::style_pkg(transformers = biocthis::bioc_style()) -styler::style_dir(usethis::proj_path("dev"), transformers = biocthis::bioc_style()) -styler::style_dir( - usethis::proj_path("vignettes"), - transformers = biocthis::bioc_style(), - filetype = "Rmd" -) -styler::style_file(usethis::proj_path("README.Rmd"), transformers = biocthis::bioc_style()) - -## Re-make the documentation files -devtools::document() - -## You might also need to update the README.md by re-rendering the README.Rmd -## file. 
-devtools::build_readme() - -## This template was made using https://lcolladotor.github.io/biocthis/ diff --git a/inst/script/data_acquisition.md b/inst/script/data_acquisition.md index 0e7d89e..c0002ef 100644 --- a/inst/script/data_acquisition.md +++ b/inst/script/data_acquisition.md @@ -180,16 +180,25 @@ in the S. cerevisiae genome. ``` r library(Biostrings) -# Get duplicated genes -data(scerevisiae_kaks) -c_full <- scerevisiae_kaks[, c("dup1", "dup2", "type")] +# Load and process data +data("yeast_seq") +data("yeast_annot") +pdata <- syntenet::process_input(yeast_seq, yeast_annot) + +data(diamond_intra) -dup_genes <- unique(c(c_full$dup1, c_full$dup2)) -dup_genes <- gsub(".*_", "", dup_genes) +# Classify gene pairs +c_standard <- classify_gene_pairs( + annotation = pdata$annotation, + blast_list = diamond_intra, + scheme = "standard" +) -dup_sd <- c_full[c_full$type == "SD", ] -dup_sd <- unique(c(dup_sd$dup1, dup_sd$dup2)) -dup_sd <- gsub(".*_", "", dup_sd) +# Get TD-derived pairs +td_pairs <- c_standard$Scerevisiae |> + dplyr::filter(type == "TD") +td_pairs <- unique(c(td_pairs$dup1, td_pairs$dup2)) +td_pairs <- gsub(".*_", "", td_pairs) # Get CDS and keep only longest isoform cds_scerevisiae_full <- readDNAStringSet( @@ -197,7 +206,14 @@ cds_scerevisiae_full <- readDNAStringSet( ) |> ensembl_longest_isoform() # Keep only duplicated genes -cds_scerevisiae <- cds_scerevisiae_full[names(cds_scerevisiae_full) %in% dup_wgd] +cds_scerevisiae <- cds_scerevisiae_full[names(cds_scerevisiae_full) %in% + td_pairs] + +# Write, read, and export file +out <- tempfile(fileext = ".fa") +writeXStringSet(cds_scerevisiae, filepath = out) + +cds_scerevisiae <- Biostrings::readDNAStringSet(out) usethis::use_data(cds_scerevisiae, compress = "xz", overwrite = TRUE) ``` @@ -231,6 +247,16 @@ cds <- list(Scerevisiae = cds_scerevisiae_all) scerevisiae_kaks_list <- pairs2kaks(c_extended, cds) scerevisiae_kaks <- scerevisiae_kaks_list$Scerevisiae +fungi_kaks2 <- fungi_kaks +fungi_kaks2 <- 
lapply(fungi_kaks2, function(x) { + + x$Ka <- signif(x$Ka, 3) + x$Ks <- signif(x$Ks, 3) + x$Ka_Ks <- signif(x$Ka_Ks, 3) + + return(x) +}) + usethis::use_data(scerevisiae_kaks, compress = "xz", overwrite = TRUE) ``` @@ -278,10 +304,12 @@ cds <- list(Gmax = cds) # Calculate Ks values gmax_kaks_list <- pairs2kaks(c_binary, cds) gmax_ks <- gmax_kaks_list$Gmax -gmax_ks <- gmax_ks[, c("dup1", "dup2", "Ks")] +gmax_ks <- gmax_ks[, c("dup1", "dup2", "Ks", "type")] gmax_ks <- gmax_ks[gmax_ks$Ks <= 2, ] gmax_ks <- gmax_ks[!is.na(gmax_ks$Ks), ] +gmax_ks$Ks <- signif(gmax_ks$Ks, 3) # to reduce object size + usethis::use_data(gmax_ks, compress = "xz", overwrite = TRUE) ``` diff --git a/man/gmax_ks.Rd b/man/gmax_ks.Rd index 7f02492..96009d6 100644 --- a/man/gmax_ks.Rd +++ b/man/gmax_ks.Rd @@ -10,6 +10,7 @@ A data frame with the following variables: \item{dup1}{Character, duplicated gene 1.} \item{dup2}{Character, duplicated gene 2.} \item{Ks}{Numeric, Ks values.} +\item{type}{Factor, duplication mode.} } } \usage{ diff --git a/man/pairs2kaks.Rd b/man/pairs2kaks.Rd index d9e66ef..c2b8d47 100644 --- a/man/pairs2kaks.Rd +++ b/man/pairs2kaks.Rd @@ -4,7 +4,7 @@ \alias{pairs2kaks} \title{Calculate Ka, Ks, and Ka/Ks from duplicate gene pairs} \usage{ -pairs2kaks(gene_pairs_list, cds, model = "MYN", threads = 1) +pairs2kaks(gene_pairs_list, cds, model = "MYN", threads = 1, verbose = FALSE) } \arguments{ \item{gene_pairs_list}{List of data frames containing duplicated gene pairs @@ -19,6 +19,9 @@ Possible values are "Li", "NG86", "NG", "LWL", "LPB", "MLWL", "MLPB", "GY", and "GMYN". Default: "MYN".} \item{threads}{Numeric indicating the number of threads to use. Default: 1.} + +\item{verbose}{Logical indicating whether progress messages should be +printed on screen. 
Default: FALSE.} } \value{ A list of data frames containing gene pairs and their Ka, Ks, diff --git a/tests/testthat/test-data_validation.R b/tests/testthat/test-data_validation.R new file mode 100644 index 0000000..4f4f2e2 --- /dev/null +++ b/tests/testthat/test-data_validation.R @@ -0,0 +1,9 @@ + +# Start tests ---- +test_that("check_geneid_match() flags mismatches between gene sets", { + + set1 <- c("gene1", "gene2A", "gene3", "gene4A") + set2 <- c("gene1", "gene2", "gene3", "gene4") + + expect_error(check_geneid_match(set1, set2)) +}) diff --git a/tests/testthat/test-ka_ks_analyses.R b/tests/testthat/test-ka_ks_analyses.R index 10a2ea6..b458c5b 100644 --- a/tests/testthat/test-ka_ks_analyses.R +++ b/tests/testthat/test-ka_ks_analyses.R @@ -33,7 +33,7 @@ cds2$Scerevisiae$Q0055 <- Biostrings::subseq( #----Start tests---------------------------------------------------------------- test_that("pairs2kaks() returns a data frame with Ka, Ks, and Ka/Ks", { - kaks <- pairs2kaks(gene_pairs_list, cds) + kaks <- pairs2kaks(gene_pairs_list, cds, verbose = TRUE) kaks2 <- pairs2kaks(gene_pairs_list, cds2) expect_equal(class(kaks), "list") diff --git a/vignettes/doubletrouble_vignette.Rmd b/vignettes/doubletrouble_vignette.Rmd index 7bf4004..8189641 100644 --- a/vignettes/doubletrouble_vignette.Rmd +++ b/vignettes/doubletrouble_vignette.Rmd @@ -352,14 +352,14 @@ package (see `?get_intron_counts()` for a summary of all functions). Here, we will create a list of `TxDb` objects from a list of `GRanges` objects -using the function `makeTxDbFromGRanges` +using the function `makeTxDbFromGRanges()` from `r BiocStyle::Biocpkg("txdbmaker")`. Importantly, to create a `TxDb` from a `GRanges`, the `GRanges` object must contain genomic coordinates for all features, including transcripts, exons, etc. Because of that, we will use annotation from the example data set `yeast_annot`, which was not processed with `syntenet::process_input()`. 
-```{r} +```{r message=FALSE} library(txdbmaker) # Create a list of `TxDb` objects from a list of `GRanges` objects txdb_list <- lapply(yeast_annot, txdbmaker::makeTxDbFromGRanges) @@ -469,6 +469,10 @@ kaks <- pairs2kaks(gene_pairs, cds_list) head(kaks) ``` +Importantly, `pairs2kaks()` expects all genes in the gene pairs to be present +in the CDS, with matching names. Species abbreviations in gene pairs (added +by `r BiocStyle::Biocpkg("syntenet")`) are automatically removed, so you should +not add them to the sequence names of your CDS. # Identifying and visualizing $K_s$ peaks @@ -587,7 +591,7 @@ age boundaries highlighted in the histogram of $K_s$ values. head(gmax_ks) # Classify gene pairs by age group -pairs_age_group <- split_pairs_by_peak(gmax_ks, peaks) +pairs_age_group <- split_pairs_by_peak(gmax_ks[, c(1,2,3)], peaks) # Inspecting the output names(pairs_age_group) @@ -599,6 +603,31 @@ head(pairs_age_group$pairs) pairs_age_group$plot ``` +Age groups can also be used to identify SD gene pairs that likely originated +from whole-genome duplications. The rationale here is that segmental duplicates +with $K_s$ values near $K_s$ peaks (indicating WGD events) were likely +created by such WGDs. In a similar logic, SD pairs with $K_s$ values that +are too distant from $K_s$ peaks (e.g., >2 standard deviations away from +the mean) were likely created by duplications of large genomic segments, but +not duplications of the entire genome. 
+ +As an example, to find gene pairs in the soybean genome that likely originated +from the WGD event shared by all legumes (at ~58 million years ago), +you'd need to extract SD pairs in age group 2 using the following code: + +```{r} +# Get all pairs in age group 2 +pairs_ag2 <- pairs_age_group$pairs[pairs_age_group$pairs$peak == 2, c(1,2)] + +# Get all SD pairs +sd_pairs <- gmax_ks[gmax_ks$type == "SD", c(1,2)] + +# Merge tables +pairs_wgd_legumes <- merge(pairs_ag2, sd_pairs) + +head(pairs_wgd_legumes) +``` + # Data visualization Last but not least, `r BiocStyle::Biocpkg("doubletrouble")` provides users