From 6fc02c47b9f94bb5a93f9b0d0ad31b2375916e3f Mon Sep 17 00:00:00 2001 From: Johannes Rainer Date: Wed, 21 Aug 2024 09:56:58 +0200 Subject: [PATCH] refactor: improve documentation of proteinToGenome issue #157 - Improve documentation of `proteinToGenome()` describing the input parameter better. - Throw error if provided `IRanges` does not contain protein identifiers. --- .github/workflows/check-bioc.yml | 6 +-- DESCRIPTION | 4 +- R/proteinToX.R | 78 +++++++++++++++++---------- inst/NEWS | 5 ++ man/proteinToGenome.Rd | 15 ++++-- man/proteinToTranscript.Rd | 8 ++- tests/testthat/test_proteinToGenome.R | 15 +++++- 7 files changed, 90 insertions(+), 41 deletions(-) diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml index 3c8878c..d0a93d9 100644 --- a/.github/workflows/check-bioc.yml +++ b/.github/workflows/check-bioc.yml @@ -52,9 +52,9 @@ jobs: fail-fast: false matrix: config: - - { os: ubuntu-latest, r: 'devel', bioc: '3.19', cont: "bioconductor/bioconductor_docker:devel", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" } - - { os: macOS-latest, r: 'devel', bioc: '3.19'} - - { os: windows-latest, r: 'devel', bioc: '3.19'} + - { os: ubuntu-latest, r: 'devel', bioc: 'devel', cont: "bioconductor/bioconductor_docker:devel", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" } + - { os: macOS-latest, r: '4.4', bioc: 'devel'} + - { os: windows-latest, r: '4.4', bioc: 'devel'} env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true RSPM: ${{ matrix.config.rspm }} diff --git a/DESCRIPTION b/DESCRIPTION index 5ca30af..60d8ded 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: ensembldb Type: Package Title: Utilities to create and use Ensembl-based annotation databases -Version: 2.29.0 +Version: 2.29.1 Authors@R: c(person(given = "Johannes", family = "Rainer", email = "johannes.rainer@eurac.edu", role = c("aut", "cre"), @@ -97,4 +97,4 @@ Collate: 'zzz.R' biocViews: Genetics, AnnotationData, Sequencing, Coverage License: LGPL -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.2 diff --git a/R/proteinToX.R b/R/proteinToX.R index 153db5c..c47865b 100644 --- a/R/proteinToX.R +++ b/R/proteinToX.R @@ -5,13 +5,19 @@ #' @name proteinToTranscript #' #' @aliases proteinToTranscript proteinToTranscript,EnsDb-method -#' +#' #' @description #' #' `proteinToTranscript` maps protein-relative coordinates to positions within #' the encoding transcript. Note that the returned positions are relative to #' the complete transcript length, which includes the 5' UTR. #' +#' The regions within the protein sequence need to be provided as a named +#' `IRanges` object with the names being protein identifiers and the start and +#' end coordinates (within these proteins) defined by the `IRanges` object. As +#' an alternative to the `IRanges`' names, protein identifiers can also be +#' provided through a metadata column (see details below). +#' #' Similar to the [proteinToGenome()] function, `proteinToTranscript` compares #' for each protein whether the length of its sequence matches the length of #' the encoding CDS and throws a warning if that is not the case. Incomplete @@ -38,9 +44,9 @@ #' use these as input for the mapping function. #' #' @inheritParams proteinToGenome -#' +#' #' @param ... Further arguments to be passed on. -#' +#' #' @return #' #' `IRangesList`, each element being the mapping results for one of the input @@ -158,7 +164,7 @@ setMethod("proteinToTranscript", "EnsDb", else mc$protein_id <- names(prt) ir <- IRanges(start = -1, width = 1) mcols(ir) <- mc - if(!is.null(mcols(prt))) + if(!is.null(mcols(prt))) mcols(ir) <- cbind(mcols(ir),mcols(prt)) ir } else { @@ -178,7 +184,7 @@ setMethod("proteinToTranscript", "EnsDb", if (idType == "uniprot_id") mc$uniprot_id <- names(prt) mcols(res) <- mc - if(!is.null(mcols(prt))) + if(!is.null(mcols(prt))) mcols(res) <- cbind(mcols(res),mcols(prt)) res } @@ -190,8 +196,8 @@ setMethod("proteinToTranscript", "EnsDb", #' @aliases proteinToTranscript proteinToTranscript,Preloaded-method #' #' @inheritParams proteinToGenome -#' -#' @param fiveUTR A `CompressedGRangesList` object generated by +#' +#' @param fiveUTR A `CompressedGRangesList` object generated by #' [fiveUTRsByTranscript()]. #' #' @family coordinate mapping functions @@ -204,13 +210,13 @@ setMethod("proteinToTranscript", "EnsDb", #' #' ## Meanwhile, this function can be called in parallel processes if you preload #' ## the CDS data with desired data columns and fiveUTR data -#' +#' #' cds <- cdsBy(edbx,columns = c(listColumns(edbx,'tx'),'protein_id','uniprot_id','protein_sequence')) #' # cds <- cdsBy(edbx,columns = c(listColumns(edbx,'tx'),'protein_id','protein_sequence')) #' # cds <- cdsBy(edbx,columns = c('tx_id','protein_id','protein_sequence')) #' #' fiveUTR <- fiveUTRsByTranscript(edbx) -#' +#' #' ## Define an IRange with protein-relative coordinates within a protein for #' ## the gene SYP #' syp <- IRanges(start = 4, end = 17) @@ -278,7 +284,7 @@ setMethod("proteinToTranscript", "CompressedGRangesList", else mc$protein_id <- names(prt) ir <- IRanges(start = -1, width = 1) mcols(ir) <- mc - if(!is.null(mcols(prt))) + if(!is.null(mcols(prt))) mcols(ir) <- cbind(mcols(ir),mcols(prt)) ir } else { @@ -298,7 +304,7 @@ setMethod("proteinToTranscript", "CompressedGRangesList", if (idType == "uniprot_id") mc$uniprot_id <- names(prt) mcols(res) <- mc - if(!is.null(mcols(prt))) + if(!is.null(mcols(prt))) mcols(res) <- cbind(mcols(res),mcols(prt)) res } @@ -319,9 +325,16 @@ setMethod("proteinToTranscript", "CompressedGRangesList", #' (and eventually Uniprot to Ensembl protein identifier mappings) from the #' submitted `EnsDb` object (and thus based on annotations from Ensembl). #' -#' Not all coding regions for protein coding transcripts are complete, and the -#' function thus checks also if the length of the coding region matches the -#' length of the protein sequence and throws a warning if that is not the case. +#' The regions within the protein sequence need to be provided as a named +#' `IRanges` object with the names being protein identifiers and the start and +#' end coordinates (within these proteins) defined by the `IRanges` object. As +#' an alternative to the `IRanges`' names, protein identifiers can also be +#' provided through a metadata column (see details below). +#' +#' Note that not all coding regions for protein coding transcripts are +#' complete, and the function thus checks also if the length of the coding +#' region matches the length of the protein sequence and throws a warning if +#' that is not the case. #' #' The genomic coordinates for the within-protein coordinates, the Ensembl #' protein ID, the ID of the encoding transcript and the within protein start @@ -364,7 +377,7 @@ setMethod("proteinToTranscript", "CompressedGRangesList", #' encoding transcripts. #' #' @param id `character(1)` specifying where the protein identifier can be -#' found. Has to be either `"name"` or one of `colnames(mcols(prng))`. +#' found. Has to be either `"name"` or one of `colnames(mcols(x))`. #' #' @param idType `character(1)` defining what type of IDs are provided. Has to #' be one of `"protein_id"` (default), `"uniprot_id"` or `"tx_id"`. @@ -489,7 +502,7 @@ setMethod("proteinToGenome", "EnsDb", mcols(maps) <- cbind(mcols(maps), protein_start = start(prt), protein_end = end(prt)) - if(!is.null(mcols(prt))) + if(!is.null(mcols(prt))) mcols(maps) <- cbind(mcols(maps),mcols(prt)) maps[order(maps$exon_rank)] } @@ -509,7 +522,7 @@ setMethod("proteinToGenome", "EnsDb", #' #' @param db For the method for `EnsDb` objects: An `EnsDb` object to be used to #' retrieve genomic coordinates of encoding transcripts.

-#' For the method for `CompressedGRangesList` objects: A `CompressedGRangesList` object +#' For the method for `CompressedGRangesList` objects: A `CompressedGRangesList` object #' generated by [cdsBy()] where by = 'tx' and columns = c('tx_id' #' ,'protein_id','uniprot_id','protein_sequence'). #' @@ -582,7 +595,7 @@ setMethod("proteinToGenome", "CompressedGRangesList", mcols(maps) <- cbind(mcols(maps), protein_start = start(prt), protein_end = end(prt)) - if(!is.null(mcols(prt))) + if(!is.null(mcols(prt))) mcols(maps) <- cbind(mcols(maps),mcols(prt)) maps[order(maps$exon_rank)] } @@ -719,17 +732,17 @@ setMethod("proteinToGenome", "CompressedGRangesList", cds } -#' @description Fetch the CDS for all transcripts encoding the specified protein using +#' @description Fetch the CDS for all transcripts encoding the specified protein using #' preloaded data. #' #' @note #' -#' Use one query to fetch CDS for all (unique) input IDs using preloaded data. If input +#' Use one query to fetch CDS for all (unique) input IDs using preloaded data. If input #' IDs are Uniprot identifiers we have to perform additional checks and data #' re-organizations because one transcript (and thus CDS) can be associated #' with multiple Uniprot identifiers. -#' -#' @param x `CompressedGRangesList` object generated by [cdsBy()] where +#' +#' @param x `CompressedGRangesList` object generated by [cdsBy()] where #' by = 'tx' and columns = c(listColumns(edb,'tx'),'protein_id','uniprot_id','protein_sequence'). #' #' @param id `character` with the protein ID(s). @@ -749,16 +762,16 @@ setMethod("proteinToGenome", "CompressedGRangesList", setMethod(".cds_for_id2", "CompressedGRangesList", function(x, id, idType = "protein_id") { ## No need for get the transcripts first, just use the metadata of CDS - ## if you find this modification well, you may want to change the + ## if you find this modification well, you may want to change the ## generic .cds_for_id2 function above if (!all(c("tx_id", "protein_id",idType,"protein_sequence") %in% colnames(mcols(x[[1]])))) stop( paste( c( - "CDS must have", - unique(c("'tx_id'","'protein_sequence'", "'protein_id'",paste("'",idType,"'",sep=''))), + "CDS must have", + unique(c("'tx_id'","'protein_sequence'", "'protein_id'",paste("'",idType,"'",sep=''))), "in the metadata." - ), + ), collapse = ' ') ) map <- x@unlistData[as.data.frame(x@partitioning)[,1]]@elementMetadata[,unique(c('tx_id', 'protein_id', idType))] @@ -820,10 +833,17 @@ setMethod(".cds_for_id2", "CompressedGRangesList", #' @noRd .cds_for_id_range <- function(x, range, id = "name", idType = "protein_id") { idType <- match.arg(idType, c("protein_id", "uniprot_id", "tx_id")) - id <- match.arg(id, c("name", colnames(mcols(range)))) + if (!(id %in% c("name", colnames(mcols(range))))) + stop("'id' should be either \"name\" (if identifiers are provided as ", + "the IRanges' names) or one of the column names of the IRanges ", + "metadata (mcols()).", call. = FALSE) if (id == "name") ids <- names(range) else ids <- mcols(range)[, id] + if (length(ids) != length(range) | all(is.na(ids))) + stop("No protein/transcript identifiers found. Please provide the ", + "IDs through the IRanges' names or one of its metadata columns.", + call. = FALSE) cds <- .cds_for_id2(x, ids, idType = idType) ## check returned CDS if their width matches the provided protein ranges mapply(cds, end(range) * 3, @@ -851,7 +871,7 @@ setMethod(".cds_for_id2", "CompressedGRangesList", #' @param x `EnsDb` object. #' #' @param cds `GRangesList` -#' +#' #' @param ... Further arguments to be passed on.. #' #' @return `list` of `GRangesList`s with one list element per input protein, @@ -935,7 +955,7 @@ setMethod(".cds_matching_protein", "EnsDb", #' @author Johannes Rainer, Boyu Yu #' #' @md -#' +#' #' @noRd setMethod(".cds_matching_protein", "list", function(x) { diff --git a/inst/NEWS b/inst/NEWS index 869800c..b150840 100644 --- a/inst/NEWS +++ b/inst/NEWS @@ -1,3 +1,8 @@ +Changes in version 2.29.1 + +- Improve documentation of `proteinToGenome()` and require named `IRanges` as + input to `proteinToGenome()`. + Changes in version 2.27.1 - Functions for coordinate mapping gain support for pre-loaded genomic ranges diff --git a/man/proteinToGenome.Rd b/man/proteinToGenome.Rd index e2d19c3..714a1f2 100644 --- a/man/proteinToGenome.Rd +++ b/man/proteinToGenome.Rd @@ -23,7 +23,7 @@ generated by \code{\link[=cdsBy]{cdsBy()}} where by = 'tx' and columns = c('tx_i ,'protein_id','uniprot_id','protein_sequence').} \item{id}{\code{character(1)} specifying where the protein identifier can be -found. Has to be either \code{"name"} or one of \code{colnames(mcols(prng))}.} +found. Has to be either \code{"name"} or one of \code{colnames(mcols(x))}.} \item{idType}{\code{character(1)} defining what type of IDs are provided. Has to be one of \code{"protein_id"} (default), \code{"uniprot_id"} or \code{"tx_id"}.} @@ -67,9 +67,16 @@ encoding transcript is identified using protein-to-transcript annotations (and eventually Uniprot to Ensembl protein identifier mappings) from the submitted \code{EnsDb} object (and thus based on annotations from Ensembl). -Not all coding regions for protein coding transcripts are complete, and the -function thus checks also if the length of the coding region matches the -length of the protein sequence and throws a warning if that is not the case. +The regions within the protein sequence need to be provided as a named +\code{IRanges} object with the names being protein identifiers and the start and +end coordinates (within these proteins) defined by the \code{IRanges} object. As +an alternative to the \code{IRanges}' names, protein identifiers can also be +provided through a metadata column (see details below). + +Note that not all coding regions for protein coding transcripts are +complete, and the function thus checks also if the length of the coding +region matches the length of the protein sequence and throws a warning if +that is not the case. The genomic coordinates for the within-protein coordinates, the Ensembl protein ID, the ID of the encoding transcript and the within protein start diff --git a/man/proteinToTranscript.Rd b/man/proteinToTranscript.Rd index 03e0827..115dcc1 100644 --- a/man/proteinToTranscript.Rd +++ b/man/proteinToTranscript.Rd @@ -25,7 +25,7 @@ generated by \code{\link[=cdsBy]{cdsBy()}} where by = 'tx' and columns = c('tx_i \item{...}{Further arguments to be passed on.} \item{id}{\code{character(1)} specifying where the protein identifier can be -found. Has to be either \code{"name"} or one of \code{colnames(mcols(prng))}.} +found. Has to be either \code{"name"} or one of \code{colnames(mcols(x))}.} \item{idType}{\code{character(1)} defining what type of IDs are provided. Has to be one of \code{"protein_id"} (default), \code{"uniprot_id"} or \code{"tx_id"}.} @@ -61,6 +61,12 @@ mapping. the encoding transcript. Note that the returned positions are relative to the complete transcript length, which includes the 5' UTR. +The regions within the protein sequence need to be provided as a named +\code{IRanges} object with the names being protein identifiers and the start and +end coordinates (within these proteins) defined by the \code{IRanges} object. As +an alternative to the \code{IRanges}' names, protein identifiers can also be +provided through a metadata column (see details below). + Similar to the \code{\link[=proteinToGenome]{proteinToGenome()}} function, \code{proteinToTranscript} compares for each protein whether the length of its sequence matches the length of the encoding CDS and throws a warning if that is not the case. Incomplete diff --git a/tests/testthat/test_proteinToGenome.R b/tests/testthat/test_proteinToGenome.R index 49fe9f6..255fa6d 100644 --- a/tests/testthat/test_proteinToGenome.R +++ b/tests/testthat/test_proteinToGenome.R @@ -1,3 +1,14 @@ +test_that("proteinToGenome fails on IRanges without IDs", { + rng <- IRanges(start = c(123, 322), end = c(234, 444)) + expect_error(proteinToGenome(rng, edb), "No protein/transcript") + names(rng) <- c(NA_character_, NA_character_) + expect_error(proteinToGenome(rng, edb), "No protein/transcript") + expect_error(proteinToGenome(rng, edb, id = "ann"), "should be either") + names(rng) <- NULL + mcols(rng)$ann <- c(NA_character_, NA_character_) + expect_error(proteinToGenome(rng, edb), "No protein/transcript") +}) + test_that(".proteinCoordsToTx works", { prts <- IRanges(start = c(1, 2), end = c(1, 2)) res <- .proteinCoordsToTx(prts) @@ -277,7 +288,7 @@ test_that("proteinToTranscript works", { ## Preloaded data test expect_warning(tx_rel_preload <- proteinToTranscript(prngs, cds, - idType = "uniprot_id", + idType = "uniprot_id", fiveUTR = fiveUTR)) expect_equal(tx_rel_preload,tx_rel) @@ -291,7 +302,7 @@ test_that("proteinToTranscript works", { expect_equal(unlist(unname(start(res))), c(-1, -1)) ## Preloaded data test - expect_warning(res_preload <- proteinToTranscript(prngs, cds, + expect_warning(res_preload <- proteinToTranscript(prngs, cds, fiveUTR = fiveUTR)) expect_equal(res_preload,res) })