From 6fc02c47b9f94bb5a93f9b0d0ad31b2375916e3f Mon Sep 17 00:00:00 2001
From: Johannes Rainer <johannes.rainer@gmail.com>
Date: Wed, 21 Aug 2024 09:56:58 +0200
Subject: [PATCH] refactor: improve documentation of proteinToGenome issue #157

- Improve documentation of `proteinToGenome()` describing the input parameter
  better.
- Throw error if provided `IRanges` does not contain protein identifiers.
---
 .github/workflows/check-bioc.yml      |  6 +--
 DESCRIPTION                           |  4 +-
 R/proteinToX.R                        | 78 +++++++++++++++++----------
 inst/NEWS                             |  5 ++
 man/proteinToGenome.Rd                | 15 ++++--
 man/proteinToTranscript.Rd            |  8 ++-
 tests/testthat/test_proteinToGenome.R | 15 +++++-
 7 files changed, 90 insertions(+), 41 deletions(-)

diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml
index 3c8878c..d0a93d9 100644
--- a/.github/workflows/check-bioc.yml
+++ b/.github/workflows/check-bioc.yml
@@ -52,9 +52,9 @@ jobs:
       fail-fast: false
       matrix:
         config:
-          - { os: ubuntu-latest, r: 'devel', bioc: '3.19', cont: "bioconductor/bioconductor_docker:devel", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
-          - { os: macOS-latest, r: 'devel', bioc: '3.19'}
-          - { os: windows-latest, r: 'devel', bioc: '3.19'}
+          - { os: ubuntu-latest, r: 'devel', bioc: 'devel', cont: "bioconductor/bioconductor_docker:devel", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
+          - { os: macOS-latest, r: '4.4', bioc: 'devel'}
+          - { os: windows-latest, r: '4.4', bioc: 'devel'}
     env:
       R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
       RSPM: ${{ matrix.config.rspm }}
diff --git a/DESCRIPTION b/DESCRIPTION
index 5ca30af..60d8ded 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: ensembldb
 Type: Package
 Title: Utilities to create and use Ensembl-based annotation databases
-Version: 2.29.0
+Version: 2.29.1
 Authors@R: c(person(given = "Johannes", family = "Rainer",
 	   email = "johannes.rainer@eurac.edu",
 	   role = c("aut", "cre"),
@@ -97,4 +97,4 @@ Collate:
     'zzz.R'
 biocViews: Genetics, AnnotationData, Sequencing, Coverage
 License: LGPL
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.2
diff --git a/R/proteinToX.R b/R/proteinToX.R
index 153db5c..c47865b 100644
--- a/R/proteinToX.R
+++ b/R/proteinToX.R
@@ -5,13 +5,19 @@
 #' @name proteinToTranscript
 #'
 #' @aliases proteinToTranscript proteinToTranscript,EnsDb-method
-#' 
+#'
 #' @description
 #'
 #' `proteinToTranscript` maps protein-relative coordinates to positions within
 #' the encoding transcript. Note that the returned positions are relative to
 #' the complete transcript length, which includes the 5' UTR.
 #'
+#' The regions within the protein sequence need to be provided as a named
+#' `IRanges` object with the names being protein identifiers and the start and
+#' end coordinates (within these proteins) defined by the `IRanges` object. As
+#' an alternative to the `IRanges`' names, protein identifiers can also be
+#' provided through a metadata column (see details below).
+#'
 #' Similar to the [proteinToGenome()] function, `proteinToTranscript` compares
 #' for each protein whether the length of its sequence matches the length of
 #' the encoding CDS and throws a warning if that is not the case. Incomplete
@@ -38,9 +44,9 @@
 #' use these as input for the mapping function.
 #'
 #' @inheritParams proteinToGenome
-#' 
+#'
 #' @param ... Further arguments to be passed on.
-#' 
+#'
 #' @return
 #'
 #' `IRangesList`, each element being the mapping results for one of the input
@@ -158,7 +164,7 @@ setMethod("proteinToTranscript", "EnsDb",
                 else mc$protein_id <- names(prt)
                 ir <- IRanges(start = -1, width = 1)
                 mcols(ir) <- mc
-                if(!is.null(mcols(prt))) 
+                if(!is.null(mcols(prt)))
                     mcols(ir) <- cbind(mcols(ir),mcols(prt))
                 ir
             } else {
@@ -178,7 +184,7 @@ setMethod("proteinToTranscript", "EnsDb",
                 if (idType == "uniprot_id")
                     mc$uniprot_id <- names(prt)
                 mcols(res) <- mc
-                if(!is.null(mcols(prt))) 
+                if(!is.null(mcols(prt)))
                     mcols(res) <- cbind(mcols(res),mcols(prt))
                 res
             }
@@ -190,8 +196,8 @@ setMethod("proteinToTranscript", "EnsDb",
 #' @aliases proteinToTranscript proteinToTranscript,Preloaded-method
 #'
 #' @inheritParams proteinToGenome
-#' 
-#' @param fiveUTR A `CompressedGRangesList` object generated by 
+#'
+#' @param fiveUTR A `CompressedGRangesList` object generated by
 #'     [fiveUTRsByTranscript()].
 #'
 #' @family coordinate mapping functions
@@ -204,13 +210,13 @@ setMethod("proteinToTranscript", "EnsDb",
 #'
 #' ## Meanwhile, this function can be called in parallel processes if you preload
 #' ## the CDS data with desired data columns and fiveUTR data
-#' 
+#'
 #' cds <- cdsBy(edbx,columns = c(listColumns(edbx,'tx'),'protein_id','uniprot_id','protein_sequence'))
 #' # cds <- cdsBy(edbx,columns = c(listColumns(edbx,'tx'),'protein_id','protein_sequence'))
 #' # cds <- cdsBy(edbx,columns = c('tx_id','protein_id','protein_sequence'))
 #'
 #' fiveUTR <- fiveUTRsByTranscript(edbx)
-#' 
+#'
 #' ## Define an IRange with protein-relative coordinates within a protein for
 #' ## the gene SYP
 #' syp <- IRanges(start = 4, end = 17)
@@ -278,7 +284,7 @@ setMethod("proteinToTranscript", "CompressedGRangesList",
                 else mc$protein_id <- names(prt)
                 ir <- IRanges(start = -1, width = 1)
                 mcols(ir) <- mc
-                if(!is.null(mcols(prt))) 
+                if(!is.null(mcols(prt)))
                     mcols(ir) <- cbind(mcols(ir),mcols(prt))
                 ir
             } else {
@@ -298,7 +304,7 @@ setMethod("proteinToTranscript", "CompressedGRangesList",
                 if (idType == "uniprot_id")
                     mc$uniprot_id <- names(prt)
                 mcols(res) <- mc
-                if(!is.null(mcols(prt))) 
+                if(!is.null(mcols(prt)))
                     mcols(res) <- cbind(mcols(res),mcols(prt))
                 res
             }
@@ -319,9 +325,16 @@ setMethod("proteinToTranscript", "CompressedGRangesList",
 #' (and eventually Uniprot to Ensembl protein identifier mappings) from the
 #' submitted `EnsDb` object (and thus based on annotations from Ensembl).
 #'
-#' Not all coding regions for protein coding transcripts are complete, and the
-#' function thus checks also if the length of the coding region matches the
-#' length of the protein sequence and throws a warning if that is not the case.
+#' The regions within the protein sequence need to be provided as a named
+#' `IRanges` object with the names being protein identifiers and the start and
+#' end coordinates (within these proteins) defined by the `IRanges` object. As
+#' an alternative to the `IRanges`' names, protein identifiers can also be
+#' provided through a metadata column (see details below).
+#'
+#' Note that not all coding regions for protein coding transcripts are
+#' complete, and the function thus checks also if the length of the coding
+#' region matches the length of the protein sequence and throws a warning if
+#' that is not the case.
 #'
 #' The genomic coordinates for the within-protein coordinates, the Ensembl
 #' protein ID, the ID of the encoding transcript and the within protein start
@@ -364,7 +377,7 @@ setMethod("proteinToTranscript", "CompressedGRangesList",
 #'     encoding transcripts.
 #'
 #' @param id `character(1)` specifying where the protein identifier can be
-#'     found. Has to be either `"name"` or one of `colnames(mcols(prng))`.
+#'     found. Has to be either `"name"` or one of `colnames(mcols(x))`.
 #'
 #' @param idType `character(1)` defining what type of IDs are provided. Has to
 #'     be one of `"protein_id"` (default), `"uniprot_id"` or `"tx_id"`.
@@ -489,7 +502,7 @@ setMethod("proteinToGenome", "EnsDb",
                 mcols(maps) <- cbind(mcols(maps),
                                      protein_start = start(prt),
                                      protein_end = end(prt))
-                if(!is.null(mcols(prt))) 
+                if(!is.null(mcols(prt)))
                     mcols(maps) <- cbind(mcols(maps),mcols(prt))
                 maps[order(maps$exon_rank)]
             }
@@ -509,7 +522,7 @@ setMethod("proteinToGenome", "EnsDb",
 #'
 #' @param db For the method for `EnsDb` objects: An `EnsDb` object to be used to
 #'     retrieve genomic coordinates of encoding transcripts.<br><br>
-#'     For the method for `CompressedGRangesList` objects: A `CompressedGRangesList` object 
+#'     For the method for `CompressedGRangesList` objects: A `CompressedGRangesList` object
 #'     generated by [cdsBy()] where by = 'tx' and columns = c('tx_id'
 #'     ,'protein_id','uniprot_id','protein_sequence').
 #'
@@ -582,7 +595,7 @@ setMethod("proteinToGenome", "CompressedGRangesList",
                 mcols(maps) <- cbind(mcols(maps),
                                      protein_start = start(prt),
                                      protein_end = end(prt))
-                if(!is.null(mcols(prt))) 
+                if(!is.null(mcols(prt)))
                     mcols(maps) <- cbind(mcols(maps),mcols(prt))
                 maps[order(maps$exon_rank)]
             }
@@ -719,17 +732,17 @@ setMethod("proteinToGenome", "CompressedGRangesList",
     cds
 }
 
-#' @description Fetch the CDS for all transcripts encoding the specified protein using 
+#' @description Fetch the CDS for all transcripts encoding the specified protein using
 #'     preloaded data.
 #'
 #' @note
 #'
-#' Use one query to fetch CDS for all (unique) input IDs using preloaded data. If input 
+#' Use one query to fetch CDS for all (unique) input IDs using preloaded data. If input
 #' IDs are Uniprot identifiers we have to perform additional checks and data
 #' re-organizations because one transcript (and thus CDS) can be associated
 #' with multiple Uniprot identifiers.
-#' 
-#' @param x `CompressedGRangesList` object generated by [cdsBy()] where 
+#'
+#' @param x `CompressedGRangesList` object generated by [cdsBy()] where
 #'     by = 'tx' and columns = c(listColumns(edb,'tx'),'protein_id','uniprot_id','protein_sequence').
 #'
 #' @param id `character` with the protein ID(s).
@@ -749,16 +762,16 @@ setMethod("proteinToGenome", "CompressedGRangesList",
 setMethod(".cds_for_id2", "CompressedGRangesList",
   function(x, id, idType = "protein_id") {
     ## No need for get the transcripts first, just use the metadata of CDS
-    ## if you find this modification well, you may want to change the 
+    ## if you find this modification well, you may want to change the
     ## generic .cds_for_id2 function above
     if (!all(c("tx_id", "protein_id",idType,"protein_sequence") %in% colnames(mcols(x[[1]]))))
         stop(
             paste(
                 c(
-                    "CDS must have", 
-                    unique(c("'tx_id'","'protein_sequence'", "'protein_id'",paste("'",idType,"'",sep=''))), 
+                    "CDS must have",
+                    unique(c("'tx_id'","'protein_sequence'", "'protein_id'",paste("'",idType,"'",sep=''))),
                     "in the metadata."
-                ), 
+                ),
             collapse = ' ')
         )
     map <- x@unlistData[as.data.frame(x@partitioning)[,1]]@elementMetadata[,unique(c('tx_id', 'protein_id', idType))]
@@ -820,10 +833,17 @@ setMethod(".cds_for_id2", "CompressedGRangesList",
 #' @noRd
 .cds_for_id_range <- function(x, range, id = "name", idType = "protein_id") {
     idType <- match.arg(idType, c("protein_id", "uniprot_id", "tx_id"))
-    id <- match.arg(id, c("name", colnames(mcols(range))))
+    if (!(id %in% c("name", colnames(mcols(range)))))
+        stop("'id' should be either \"name\" (if identifiers are provided as ",
+             "the IRanges' names) or one of the column names of the IRanges ",
+             "metadata (mcols()).", call. = FALSE)
     if (id == "name")
         ids <- names(range)
     else ids <- mcols(range)[, id]
+    if (length(ids) != length(range) | all(is.na(ids)))
+        stop("No protein/transcript identifiers found. Please provide the ",
+             "IDs through the IRanges' names or one of its metadata columns.",
+             call. = FALSE)
     cds <- .cds_for_id2(x, ids, idType = idType)
     ## check returned CDS if their width matches the provided protein ranges
     mapply(cds, end(range) * 3,
@@ -851,7 +871,7 @@ setMethod(".cds_for_id2", "CompressedGRangesList",
 #' @param x `EnsDb` object.
 #'
 #' @param cds `GRangesList`
-#' 
+#'
 #' @param ... Further arguments to be passed on..
 #'
 #' @return `list` of `GRangesList`s with one list element per input protein,
@@ -935,7 +955,7 @@ setMethod(".cds_matching_protein", "EnsDb",
 #' @author Johannes Rainer, Boyu Yu
 #'
 #' @md
-#' 
+#'
 #' @noRd
 setMethod(".cds_matching_protein", "list",
   function(x) {
diff --git a/inst/NEWS b/inst/NEWS
index 869800c..b150840 100644
--- a/inst/NEWS
+++ b/inst/NEWS
@@ -1,3 +1,8 @@
+Changes in version 2.29.1
+
+- Improve documentation of `proteinToGenome()` and require named `IRanges` as
+  input to `proteinToGenome()`.
+
 Changes in version 2.27.1
 
 - Functions for coordinate mapping gain support for pre-loaded genomic ranges
diff --git a/man/proteinToGenome.Rd b/man/proteinToGenome.Rd
index e2d19c3..714a1f2 100644
--- a/man/proteinToGenome.Rd
+++ b/man/proteinToGenome.Rd
@@ -23,7 +23,7 @@ generated by \code{\link[=cdsBy]{cdsBy()}} where by = 'tx' and columns = c('tx_i
 ,'protein_id','uniprot_id','protein_sequence').}
 
 \item{id}{\code{character(1)} specifying where the protein identifier can be
-found. Has to be either \code{"name"} or one of \code{colnames(mcols(prng))}.}
+found. Has to be either \code{"name"} or one of \code{colnames(mcols(x))}.}
 
 \item{idType}{\code{character(1)} defining what type of IDs are provided. Has to
 be one of \code{"protein_id"} (default), \code{"uniprot_id"} or \code{"tx_id"}.}
@@ -67,9 +67,16 @@ encoding transcript is identified using protein-to-transcript annotations
 (and eventually Uniprot to Ensembl protein identifier mappings) from the
 submitted \code{EnsDb} object (and thus based on annotations from Ensembl).
 
-Not all coding regions for protein coding transcripts are complete, and the
-function thus checks also if the length of the coding region matches the
-length of the protein sequence and throws a warning if that is not the case.
+The regions within the protein sequence need to be provided as a named
+\code{IRanges} object with the names being protein identifiers and the start and
+end coordinates (within these proteins) defined by the \code{IRanges} object. As
+an alternative to the \code{IRanges}' names, protein identifiers can also be
+provided through a metadata column (see details below).
+
+Note that not all coding regions for protein coding transcripts are
+complete, and the function thus checks also if the length of the coding
+region matches the length of the protein sequence and throws a warning if
+that is not the case.
 
 The genomic coordinates for the within-protein coordinates, the Ensembl
 protein ID, the ID of the encoding transcript and the within protein start
diff --git a/man/proteinToTranscript.Rd b/man/proteinToTranscript.Rd
index 03e0827..115dcc1 100644
--- a/man/proteinToTranscript.Rd
+++ b/man/proteinToTranscript.Rd
@@ -25,7 +25,7 @@ generated by \code{\link[=cdsBy]{cdsBy()}} where by = 'tx' and columns = c('tx_i
 \item{...}{Further arguments to be passed on.}
 
 \item{id}{\code{character(1)} specifying where the protein identifier can be
-found. Has to be either \code{"name"} or one of \code{colnames(mcols(prng))}.}
+found. Has to be either \code{"name"} or one of \code{colnames(mcols(x))}.}
 
 \item{idType}{\code{character(1)} defining what type of IDs are provided. Has to
 be one of \code{"protein_id"} (default), \code{"uniprot_id"} or \code{"tx_id"}.}
@@ -61,6 +61,12 @@ mapping.
 the encoding transcript. Note that the returned positions are relative to
 the complete transcript length, which includes the 5' UTR.
 
+The regions within the protein sequence need to be provided as a named
+\code{IRanges} object with the names being protein identifiers and the start and
+end coordinates (within these proteins) defined by the \code{IRanges} object. As
+an alternative to the \code{IRanges}' names, protein identifiers can also be
+provided through a metadata column (see details below).
+
 Similar to the \code{\link[=proteinToGenome]{proteinToGenome()}} function, \code{proteinToTranscript} compares
 for each protein whether the length of its sequence matches the length of
 the encoding CDS and throws a warning if that is not the case. Incomplete
diff --git a/tests/testthat/test_proteinToGenome.R b/tests/testthat/test_proteinToGenome.R
index 49fe9f6..255fa6d 100644
--- a/tests/testthat/test_proteinToGenome.R
+++ b/tests/testthat/test_proteinToGenome.R
@@ -1,3 +1,14 @@
+test_that("proteinToGenome fails on IRanges without IDs", {
+    rng <- IRanges(start = c(123, 322), end = c(234, 444))
+    expect_error(proteinToGenome(rng, edb), "No protein/transcript")
+    names(rng) <- c(NA_character_, NA_character_)
+    expect_error(proteinToGenome(rng, edb), "No protein/transcript")
+    expect_error(proteinToGenome(rng, edb, id = "ann"), "should be either")
+    names(rng) <- NULL
+    mcols(rng)$ann <- c(NA_character_, NA_character_)
+    expect_error(proteinToGenome(rng, edb), "No protein/transcript")
+})
+
 test_that(".proteinCoordsToTx works", {
     prts <- IRanges(start = c(1, 2), end = c(1, 2))
     res <- .proteinCoordsToTx(prts)
@@ -277,7 +288,7 @@ test_that("proteinToTranscript works", {
 
     ## Preloaded data test
     expect_warning(tx_rel_preload <- proteinToTranscript(prngs, cds,
-        idType = "uniprot_id", 
+        idType = "uniprot_id",
         fiveUTR = fiveUTR))
     expect_equal(tx_rel_preload,tx_rel)
 
@@ -291,7 +302,7 @@ test_that("proteinToTranscript works", {
     expect_equal(unlist(unname(start(res))), c(-1, -1))
 
     ## Preloaded data test
-    expect_warning(res_preload <- proteinToTranscript(prngs, cds, 
+    expect_warning(res_preload <- proteinToTranscript(prngs, cds,
         fiveUTR = fiveUTR))
     expect_equal(res_preload,res)
 })