Skip to content

Commit

Permalink
Added the closure isHGNCsymbol(), its factory function .fastCheckFact…
Browse files Browse the repository at this point in the history
…ory() (in R/aaa.R), the script generateHGNCtable.R (in inst/scripts), the HGNC table it uses (in inst/extdata/HGNCsymbols.RDS) and its tests (in tests/testthat/testHGNCsymbol.R). The closure is created when R/isHGNCsymbol.R is being sourced.
  • Loading branch information
hyginn committed Mar 29, 2017
1 parent c100392 commit 200dba2
Show file tree
Hide file tree
Showing 8 changed files with 218 additions and 59 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ export(getProvenance)
export(getUUID)
export(importM.COSMIC)
export(importNet.STRING)
export(isHGNCsymbol)
export(logEvent)
export(logFileName)
export(logMessage)
Expand Down
30 changes: 30 additions & 0 deletions R/aaa.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# fastCheckFactory.R
#
# Utility function to create closures for ID validation

.fastCheckFactory <- function(symbols) {
  # Creates a closure to be used to validate whether elements of a
  # character vector are present in the "symbol" column of "symbols".
  #
  # Parameters:
  #   symbols: a data frame with a column named "symbol" holding the valid
  #            identifiers. The closure upper-cases its input before
  #            matching, so the entries are expected to be upper case.
  # Value:
  #   fCheck: a closure that takes one argument x and returns a vector of
  #           logicals of the same length as x.
  # Errors:
  #   Stops if "symbols" has no "symbol" column. Without this check the
  #   factory would return a validator that reports FALSE for every input.

  # Reading the column also forces the lazily evaluated "symbols" argument,
  # so the table is bound in the closure's environment at creation time.
  if (is.null(symbols$symbol)) {
    stop("\"symbols\" must contain a column named \"symbol\"")
  }

  fCheck <- function(x) {
    # Checks whether elements of x are present in the "symbols" data
    # frame which was loaded into this closure's environment.
    #
    # Returns a vector of logicals of the same length as x. A missing,
    # NULL or zero-length x yields logical().
    if (missing(x) || length(x) == 0) {
      return(logical())
    }

    # Case-insensitive lookup: the query is upper-cased before matching.
    # fmatch() returns NA for elements that are not found in the table.
    query <- toupper(as.character(x))
    !is.na(fastmatch::fmatch(query, symbols$symbol))
  }

  fCheck
}

# [END]
54 changes: 54 additions & 0 deletions R/isHGNCsymbol.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#' isHGNCsymbol.R
#'
#' Check whether gene symbols given are valid HGNC gene symbols.
#'
#' \code{isHGNCsymbol} Checks whether the elements of the input vector are
#' valid HGNC gene symbols (case insensitive) by comparing to a subset
#' of existing gene symbols.
#'
#' The subset of symbols used here contains only approved gene
#' symbols of the following locus types:
#' * gene with protein product
#' * immunoglobulin gene
#' * protocadherin
#' * T-cell receptor gene
#' * RNA: long non-coding, micro, ribosomal, transfer, small nuclear
#' and nucleolar, Y and vault
#' * endogenous retrovirus
#'
#' This function is a closure that contains the HGNC symbol table in its
#' environment. It is created when R/isHGNCsymbol.R is sourced. The supporting
#' table is stored in extdata/HGNCsymbols.RDS. The script that was used to
#' generate this table is in scripts/generateHGNCtable.R.
#'
#' Checking is done in a case-insensitive manner.
#'
#' @param x A character vector
#' @return A vector of logicals of the same length as x that contains TRUE
#'   for every element that is present in the HGNC symbol table and FALSE
#'   for all others.
#'
#' @examples
#' isHGNCsymbol() # logical()
#' isHGNCsymbol(NULL) # logical()
#' isHGNCsymbol(0) # FALSE
#' isHGNCsymbol("A2M") # TRUE
#' isHGNCsymbol(c("123", "234")) # vectorized
#' isHGNCsymbol(c("A1BG", "a1bg", "a1Bg", "A1bG")) # case insensitive
#' x <- c(NA, "A1CF", NULL, "a1bg") # length preserving:
#' length(x) # 3
#' isHGNCsymbol(x) # FALSE, TRUE, TRUE
#'
#' @export

isHGNCsymbol <- function(x) {
  # Placeholder definition: calling it always raises an error. A working
  # validator is meant to be assigned over this name.
  placeholderMsg <-
    "This function must be overwritten by a closure factory in .onLoad()"
  stop(placeholderMsg)
}

# Build the validator closure while this file is being sourced.
# mustWork = TRUE makes system.file() stop with an explicit error when the
# symbol table cannot be located; without it system.file() returns "" and
# readRDS() fails with an unhelpful message about an empty file name.
tmp <- readRDS(system.file("extdata",
                           "HGNCsymbols.RDS",
                           package = "rete",
                           mustWork = TRUE))

# Replace the placeholder with the closure that carries the symbol table.
isHGNCsymbol <- .fastCheckFactory(tmp)

# The closure keeps its own reference to the table; drop the temporary name.
rm(tmp)


# [END]
Binary file added inst/extdata/HGNCsymbols.RDS
Binary file not shown.
59 changes: 0 additions & 59 deletions inst/scripts/generateHGNCHash.R

This file was deleted.

64 changes: 64 additions & 0 deletions inst/scripts/generateHGNCtable.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# generateHGNCtable.R

# Generates a data frame containing all valid HGNC gene symbols, calls fmatch
# on the data frame so it will create a hash of the data frame to be used by
# .fastCheckFactory() to build a validator closure for HGNC symbols.
#

# Request for the HGNC custom download CGI interface. Only gene symbols and
# locus types are requested. The query string is assembled from its
# individual parameters, joined with "&".
HGNC_URL <- paste0(
  "http://www.genenames.org/cgi-bin/download?",
  paste(
    c("col=gd_app_sym",
      "col=gd_locus_type",
      "status=Approved",
      "status_opt=2",
      "where=",
      "order_by=gd_app_sym_sort",
      "format=text",
      "limit=",
      "submit=submit"),
    collapse = "&"
  )
)

# Fetch the tab-delimited response of the request into a table.
HGNCtable <- readr::read_delim(file = HGNC_URL, delim = "\t")

# Whitelist for Locus Type to be included. We restrict the locus types
# to those which we consider interpretable in the context of cancer gene
# network analysis.
#
# cat(sprintf("\"%s\",\n", unique(HGNCtable$`Locus Type`)))

# Locus types that are kept.
whitelistLT <- c(
  "gene with protein product",
  "RNA, long non-coding",
  "endogenous retrovirus",
  "immunoglobulin gene",
  "RNA, micro",
  "RNA, ribosomal",
  "RNA, transfer",
  "protocadherin",
  "RNA, small nuclear",
  "RNA, small nucleolar",
  "RNA, Y",
  "T-cell receptor gene",
  "RNA, vault"
)

# Locus types that are deliberately left out. Move an entry into the vector
# above to include it:
#   "pseudogene"
#   "virus integration site"
#   "readthrough"
#   "phenotype only"
#   "unknown"
#   "region"
#   "fragile site"
#   "immunoglobulin pseudogene"
#   "transposable element"
#   "complex locus constituent"
#   "RNA, cluster"
#   "RNA, misc"
#   "RNA, small cytoplasmic"
#   "T-cell receptor pseudogene"

# Keep the approved symbols whose locus type is on the whitelist and store
# them as the single column "symbol" of a data frame.
HGNCsymbols <- data.frame(
  symbol = HGNCtable[["Approved Symbol"]][
    HGNCtable[["Locus Type"]] %in% whitelistLT
  ],
  stringsAsFactors = FALSE
)

# fmatch() is called here for its side effect only: the intent is that it
# attaches its hash to the symbol column. The query value "1" is arbitrary.
fastmatch::fmatch("1", HGNCsymbols$symbol)

# save as RDS
# saveRDS(HGNCsymbols, file = "inst/extdata/HGNCsymbols.RDS")


# [END]
50 changes: 50 additions & 0 deletions man/isHGNCsymbol.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 19 additions & 0 deletions tests/testthat/testIsHGNCsymbol.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# testIsHGNCsymbol.R

context("test the closure isHGNCsymbol()")

test_that("expected input is correctly handled", {
  # Missing and NULL input both give an empty logical vector.
  expect_equal(isHGNCsymbol(), logical())
  expect_equal(isHGNCsymbol(NULL), logical())

  # A number is not a gene symbol.
  expect_false(isHGNCsymbol(0))

  # Known symbols, including a lower-case spelling.
  expect_true(isHGNCsymbol("A1BG"))  # First in table
  expect_true(isHGNCsymbol("a1bg"))  # Case insensitive
  expect_true(isHGNCsymbol("ZZZ3"))  # Last in table

  # c() drops the NULL, so three elements remain; NA is reported as FALSE.
  mixedInput <- c(NA, "A1CF", NULL, "a2m")
  expect_equal(isHGNCsymbol(mixedInput), c(FALSE, TRUE, TRUE))
})

test_that("unexpected input does not lead to output", {
  # A function cannot be converted to a character vector, so the call must
  # fail with a coercion error instead of returning a result.
  expect_error(isHGNCsymbol(mean), regexp = "cannot coerce")
})

# [END]

0 comments on commit 200dba2

Please sign in to comment.