diff --git a/NAMESPACE b/NAMESPACE index 0dcca9b..591810e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,6 +12,7 @@ export(getProvenance) export(getUUID) export(importM.COSMIC) export(importNet.STRING) +export(isHGNCsymbol) export(logEvent) export(logFileName) export(logMessage) diff --git a/R/aaa.R b/R/aaa.R new file mode 100644 index 0000000..00344a5 --- /dev/null +++ b/R/aaa.R @@ -0,0 +1,30 @@ +# fastCheckFactory.R +# +# Utility function to create closures for ID validation + +.fastCheckFactory <- function(symbols) { + # Creates a closure to be used to validate whether elements of a + # character vector are present in the "symbols" data frame. + # Parameters: + # symbols: a one column dataframe + # Value: + # fCheck: a closure + + symbols$symbol[1] # force access + + fCheck <- function(x) { + # Checks whether elements of x are present in the "symbols" data + # frame which was loaded into this closure's environment. + # + # Returns a vector of logicals of the same length as x + if (missing(x) || length(x) == 0) { return(logical()) } + + return(!is.na(fastmatch::fmatch(toupper(as.character(x)), + symbols$symbol))) + + } + + return(fCheck) +} + +# [END] diff --git a/R/isHGNCsymbol.R b/R/isHGNCsymbol.R new file mode 100644 index 0000000..cc787d5 --- /dev/null +++ b/R/isHGNCsymbol.R @@ -0,0 +1,54 @@ +#' isHGNCsymbol.R +#' +#' Check whether gene symbols given are valid HGNC gene symbols. +#' +#' \code{isHGNCsymbol} Checks whether the elements of the input vector are +#' valid HGNC gene symbols (case insensitive) by comparing to a subset +#' of existing gene symbols. 
+#' +#' The subset of symbols used here contains only approved gene +#' symbols of the following locus types: +#' * gene with protein product +#' * immunoglobulin gene +#' * protocadherin +#' * T-cell receptor gene +#' * RNA: long non-coding, micro, ribosomal, transfer, small nuclear +#' and nucleolar, Y and vault +#' * endogenous retrovirus +#' +#' This function is a closure that contains the HGNC symbol table in its environment. It is produced as part of the .onLoad() tasks. The supporting table is stored in extdata/HGNCsymbols.RDS. The script that was used to generate this table is in scripts/generateHGNCtable.R. +#' +#' Checking is done in a case-insensitive manner. +#' +#' @param x A character vector +#' @return A vector of logicals of length x that contains TRUE for every +#' element that is present in the HGNC symbol table and FALSE +#' for all others. +#' +#' @examples +#' isHGNCsymbol() # logical() +#' isHGNCsymbol(NULL) # logical() +#' isHGNCsymbol(0) # FALSE +#' isHGNCsymbol("A2M") # TRUE +#' isHGNCsymbol(c("123", "234")) # vectorized +#' isHGNCsymbol(c("A1BG", "a1bg", "a1Bg", "A1bG")) # case insensitive +#' x <- c(NA, "A1CF", NULL, "a1bg") # length preserving: +#' length(x) # 3 +#' isHGNCsymbol(x) # FALSE, TRUE, TRUE +#' +#' @export + +isHGNCsymbol <- function(x) { + stop("This function must be overwritten by a closure factory in .onLoad()") +} + +tmp <- readRDS(system.file("extdata", + "HGNCsymbols.RDS", + package="rete")) + +isHGNCsymbol <- .fastCheckFactory(tmp) + +rm(tmp) + + +# [END] diff --git a/inst/extdata/HGNCsymbols.RDS b/inst/extdata/HGNCsymbols.RDS new file mode 100644 index 0000000..c775440 Binary files /dev/null and b/inst/extdata/HGNCsymbols.RDS differ diff --git a/inst/scripts/generateHGNCHash.R b/inst/scripts/generateHGNCHash.R deleted file mode 100644 index aa00c12..0000000 --- a/inst/scripts/generateHGNCHash.R +++ /dev/null @@ -1,59 +0,0 @@ -# generateHGNCHash.R - -#' Generates a data frame containing all valid HGNC gene symbols, 
calls fmatch -#' on the data frame so it will create a hash of the data frame to be used by -#' checkGeneSymbols.R. -#' -#' @param fURL The URL to the file containing valid HGNC gene symbols. -#' -#' @seealso \code{\link{isGeneSymbol}} is used to check if a gene symbol is a -#' valid HGNC gene symbol that is contained in inst/extdata/HGNCSymbols.rds. - -library(fastmatch) - -# Open the connection/file. -## Change fURL to be the URL of the file containing HGNC gene symbols. -## The file is assumed to contain the symbol, status and locus group of the gene. -fURL <- "http://www.genenames.org/cgi-bin/download?col=gd_app_sym&col=gd_status&col=gd_locus_type&col=gd_locus_group&status=Approved&status=Entry+Withdrawn&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit" -HGNCFile <- url(fURL, open = "r") - -# Read the header and vertorize. -header <- readLines(HGNCFile, n = 1) -headerVector <- unlist(strsplit(header, "\t")) - -# Find the index of the columns that contain the gene symbol, status, and locus -# group of the genes. -symbolCol <- which(headerVector == "Approved Symbol") -statusCol <- which(headerVector == "Status") -locusCol <- which(headerVector == "Locus Group") - -# vectors containing the (important) statuses and locus groups of interest. -impStatuses <- c("Approved") -impLocusGroups <- c("protein-coding gene", "other", "phenotype") - -# Read line by line. -geneSymbols <- c() -while (length(entry <- readLines(HGNCFile, n = 1)) > 0) { - entryVector <- strsplit(entry, "\t") - - # Check the status and locus group of the gene. - status <- entryVector[[1]][statusCol] - locusGroup <- entryVector[[1]][locusCol] - if ((status %in% impStatuses) && (locusGroup %in% impLocusGroups)){ - # append the current gene symbol to the list of valid gene symbols. - geneSymbols <- append(geneSymbols, entryVector[[1]][symbolCol]) - } - -} - -close(HGNCFile) - -# Create a data frame with a single column containing the gene symbols. 
-geneNames <- data.frame( - Gene_Symbol = geneSymbols, - stringsAsFactors = FALSE) - -# Call fmatch, so that it will create a hash of the data frame. -fmatch("1", geneNames) - -# [END] diff --git a/inst/scripts/generateHGNCtable.R b/inst/scripts/generateHGNCtable.R new file mode 100644 index 0000000..409aefb --- /dev/null +++ b/inst/scripts/generateHGNCtable.R @@ -0,0 +1,64 @@ +# generateHGNCtable.R + +# Generates a data frame containing all valid HGNC gene symbols, calls fmatch +# on the data frame so it will create a hash of the data frame to be used by +# .fastCheckFactory() to build a validator closure for HGNC symbols. +# + +# Define request for the HGNC custom download CGI interface. Download gene +# symbols and locus types only. +HGNC_URL <- paste0("http://www.genenames.org/cgi-bin/download?", + "col=gd_app_sym&col=gd_locus_type", + "&status=Approved&status_opt=2", + "&where=&order_by=gd_app_sym_sort", + "&format=text&limit=&submit=submit") + +HGNCtable <- readr::read_delim(HGNC_URL, delim = "\t") + +# Whitelist for Locus Type to be included. We restrict the locus types +# to those which we consider interpretable in the context of cancer gene +# network analysis. +# +# cat(sprintf("\"%s\",\n", unique(HGNCtable$`Locus Type`))) + +whitelistLT <- c("gene with protein product", + "RNA, long non-coding", + # "pseudogene", + # "virus integration site", + # "readthrough", + # "phenotype only", + # "unknown", + # "region", + "endogenous retrovirus", + # "fragile site", + "immunoglobulin gene", + # "immunoglobulin pseudogene", + # "transposable element", + "RNA, micro", + "RNA, ribosomal", + "RNA, transfer", + # "complex locus constituent", + "protocadherin", + # "RNA, cluster", + # "RNA, misc", + "RNA, small nuclear", + # "RNA, small cytoplasmic", + "RNA, small nucleolar", + "RNA, Y", + "T-cell receptor gene", + # "T-cell receptor pseudogene", + "RNA, vault") + +# Create a data frame with a single column containing the gene symbols. 
+HGNCsymbols <- data.frame(symbol = + HGNCtable$`Approved Symbol`[HGNCtable$`Locus Type` %in% whitelistLT], + stringsAsFactors = FALSE) + +# Call fmatch, so that it will attach a hash to the data frame. +fastmatch::fmatch("1", HGNCsymbols$symbol) + +# save as RDS +# saveRDS(HGNCsymbols, file = "inst/extdata/HGNCsymbols.RDS") + + +# [END] diff --git a/man/isHGNCsymbol.Rd b/man/isHGNCsymbol.Rd new file mode 100644 index 0000000..76e9497 --- /dev/null +++ b/man/isHGNCsymbol.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/isHGNCsymbol.R +\name{isHGNCsymbol} +\alias{isHGNCsymbol} +\title{isHGNCsymbol.R} +\usage{ +isHGNCsymbol(x) +} +\arguments{ +\item{x}{A character vector} +} +\value{ +A vector of logicals of length x that contains TRUE for every + element that is present in the HGNC symbol table and FALSE + for all others. +} +\description{ +Check whether gene symbols given are valid HGNC gene symbols. +} +\details{ +\code{isHGNCsymbol} Checks whether the elements of the input vector are + valid HGNC gene symbols (case insensitive) by comparing to a subset + of existing gene symbols. + +The subset of symbols used here contains only approved gene + symbols of the following locus types: + * gene with protein product + * immunoglobulin gene + * protocadherin + * T-cell receptor gene + * RNA: long non-coding, micro, ribosomal, transfer, small nuclear + and nucleolar, Y and vault + * endogenous retrovirus + +This function is a closure that contains the HGNC symbol table in its environment. It is produced as part of the .onLoad() tasks. The supporting table is stored in extdata/HGNCsymbols.RDS. The script that was used to generate this table is in scripts/generateHGNCtable.R. + +Checking is done in a case-insensitive manner. 
+} +\examples{ +isHGNCsymbol() # logical() +isHGNCsymbol(NULL) # logical() +isHGNCsymbol(0) # FALSE +isHGNCsymbol("A2M") # TRUE +isHGNCsymbol(c("123", "234")) # vectorized +isHGNCsymbol(c("A1BG", "a1bg", "a1Bg", "A1bG")) # case insensitive +x <- c(NA, "A1CF", NULL, "a1bg") # length preserving: +length(x) # 3 +isHGNCsymbol(x) # FALSE, TRUE, TRUE + +} diff --git a/tests/testthat/testIsHGNCsymbol.R b/tests/testthat/testIsHGNCsymbol.R new file mode 100644 index 0000000..2d7768c --- /dev/null +++ b/tests/testthat/testIsHGNCsymbol.R @@ -0,0 +1,19 @@ +# testIsHGNCsymbol.R + +context("test the closure isHGNCsymbol()") + +test_that("expected input is correctly handled", { + expect_equal(isHGNCsymbol(), logical()) + expect_equal(isHGNCsymbol(NULL), logical()) + expect_false(isHGNCsymbol(0)) + expect_true(isHGNCsymbol("A1BG")) # First in table + expect_true(isHGNCsymbol("a1bg")) # Case insensitive + expect_true(isHGNCsymbol("ZZZ3")) # Last in table + expect_equal(isHGNCsymbol(c(NA, "A1CF", NULL, "a2m")), c(FALSE, TRUE, TRUE)) +}) + +test_that("unexpected input does not lead to output", { + expect_error(isHGNCsymbol(mean), "cannot coerce") +}) + +# [END]