forked from hyginn/rete
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added the closure isHGNCsymbol(), its factory function .fastCheckFact…
…ory() (in R/aaa.R), the script generateHGNCtable.R (in inst/scripts), the HGNC table it uses (in inst/extdata/HGNCsymbols.RDS) and its tests (in tests/testthat/testHGNCsymbol.R). The closure is created when R/isHGNCsymbol.R is being sourced.
- Loading branch information
Showing
8 changed files
with
218 additions
and
59 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
# fastCheckFactory.R | ||
# | ||
# Utility function to create closures for ID validation | ||
|
||
.fastCheckFactory <- function(symbols) {
  # Creates a closure to be used to validate whether elements of a
  # character vector are present in the "symbols" data frame. Matching
  # is case-insensitive.
  # Parameters:
  #   symbols: a data frame with a column named "symbol" that holds the
  #            valid identifiers
  # Value:
  #   fCheck: a closure

  # Upper-case the reference table once, here, so that the comparison in
  # fCheck() is case-insensitive on BOTH sides. Upper-casing only the
  # query would make every table entry that contains a lower-case letter
  # unmatchable. Reading symbols$symbol here also forces the argument, so
  # the table is captured when the closure is built, not on first use.
  # NA entries are dropped so that an NA query can never be reported as
  # a valid identifier.
  refSymbols <- as.character(symbols$symbol)
  refSymbols <- toupper(refSymbols[!is.na(refSymbols)])

  fCheck <- function(x) {
    # Checks whether elements of x are present in the reference table
    # that was stored in this closure's environment by the factory.
    #
    # Returns a vector of logicals of the same length as x
    if (missing(x) || length(x) == 0) { return(logical()) }

    # fmatch() yields NA for every element without a match in the table.
    return(!is.na(fastmatch::fmatch(toupper(as.character(x)),
                                    refSymbols)))
  }

  return(fCheck)
}
|
||
# [END] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
#' isHGNCsymbol.R | ||
#' | ||
#' Check whether gene symbols given are valid HGNC gene symbols. | ||
#' | ||
#' \code{isHGNCsymbol} Checks whether the elements of the input vector are | ||
#' valid HGNC gene symbols (case insensitive) by comparing to a subset | ||
#' of existing gene symbols. | ||
#' | ||
#' The subset of symbols used here contains only approved gene | ||
#' symbols of the following locus types: | ||
#' * gene with protein product | ||
#' * immunoglobulin gene | ||
#' * protocadherin | ||
#' * T-cell receptor gene | ||
#' * RNA: long non-coding, micro, ribosomal, transfer, small nuclear | ||
#' and nucleolar, Y and vault | ||
#' * endogenous retrovirus | ||
#' | ||
#' This function is a closure that contains the HGNC symbol table in its | ||
#' environment. It is created when R/isHGNCsymbol.R is sourced. The | ||
#' supporting table is stored in inst/extdata/HGNCsymbols.RDS. The script | ||
#' that was used to generate this table is in | ||
#' inst/scripts/generateHGNCtable.R. | ||
#' | ||
#' Checking is done in a case-insensitive manner. | ||
#' | ||
#' @param x A character vector | ||
#' @return A logical vector of the same length as x that contains TRUE | ||
#'   for every element that is present in the HGNC symbol table and | ||
#'   FALSE for all others. | ||
#' | ||
#' @examples | ||
#' isHGNCsymbol() # logical() | ||
#' isHGNCsymbol(NULL) # logical() | ||
#' isHGNCsymbol(0) # FALSE | ||
#' isHGNCsymbol("A2M") # TRUE | ||
#' isHGNCsymbol(c("123", "234")) # vectorized | ||
#' isHGNCsymbol(c("A1BG", "a1bg", "a1Bg", "A1bG")) # case insensitive | ||
#' x <- c(NA, "A1CF", NULL, "a1bg") # length preserving: | ||
#' length(x) # 3 | ||
#' isHGNCsymbol(x) # FALSE, TRUE, TRUE | ||
#' | ||
#' @export | ||
|
||
# Placeholder definition; it is replaced by the validator closure
# immediately below. NOTE(review): presumably kept so that the roxygen
# block has a function definition to attach to - confirm.
isHGNCsymbol <- function(x) {
  stop("This function must be overwritten by a closure factory in .onLoad()")
}

# Replace the placeholder with the closure built by .fastCheckFactory().
# The symbol table is read inside local() so that no temporary variable
# is left behind in the enclosing environment. mustWork = TRUE makes a
# missing data file fail right here with an explicit error from
# system.file(), instead of handing an empty path on to readRDS().
isHGNCsymbol <- local({
  hgncTable <- readRDS(system.file("extdata",
                                   "HGNCsymbols.RDS",
                                   package = "rete",
                                   mustWork = TRUE))
  .fastCheckFactory(hgncTable)
})
|
||
|
||
# [END] |
Binary file not shown.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
# generateHGNCtable.R | ||
|
||
# Generates a data frame containing all valid HGNC gene symbols, calls fmatch | ||
# on the data frame so it will create a hash of the data frame to be used by | ||
# .fastCheckFactory() to build a validator closure for HGNC symbols. | ||
# | ||
|
||
# Request for the HGNC custom download CGI interface. The query asks for
# two columns only (approved symbol and locus type), restricted to entries
# with status "Approved", as tab-separated text. The parameters are listed
# one per line and joined with "&".
HGNC_URL <- paste0("http://www.genenames.org/cgi-bin/download?",
                   paste(c("col=gd_app_sym",
                           "col=gd_locus_type",
                           "status=Approved",
                           "status_opt=2",
                           "where=",
                           "order_by=gd_app_sym_sort",
                           "format=text",
                           "limit=",
                           "submit=submit"),
                         collapse = "&"))

# Download and parse the tab-delimited table.
HGNCtable <- readr::read_delim(HGNC_URL, delim = "\t")

# Whitelist of locus types to keep. We restrict the locus types to those
# which we consider interpretable in the context of cancer gene network
# analysis.
#
# To list the locus types that are present in the download:
# cat(sprintf("\"%s\",\n", unique(HGNCtable$`Locus Type`)))
#
# Locus types that are deliberately NOT whitelisted:
#   "pseudogene", "virus integration site", "readthrough",
#   "phenotype only", "unknown", "region", "fragile site",
#   "immunoglobulin pseudogene", "transposable element",
#   "complex locus constituent", "RNA, cluster", "RNA, misc",
#   "RNA, small cytoplasmic", "T-cell receptor pseudogene"
whitelistLT <- c("gene with protein product",
                 "RNA, long non-coding",
                 "endogenous retrovirus",
                 "immunoglobulin gene",
                 "RNA, micro",
                 "RNA, ribosomal",
                 "RNA, transfer",
                 "protocadherin",
                 "RNA, small nuclear",
                 "RNA, small nucleolar",
                 "RNA, Y",
                 "T-cell receptor gene",
                 "RNA, vault")

# Single-column data frame holding the approved symbols of all rows whose
# locus type is whitelisted.
HGNCsymbols <- data.frame(
  symbol = HGNCtable[["Approved Symbol"]][
    HGNCtable[["Locus Type"]] %in% whitelistLT
  ],
  stringsAsFactors = FALSE
)

# Call fmatch once on the symbol column, so that it will attach a hash to it.
fastmatch::fmatch("1", HGNCsymbols$symbol)
|
||
# save as RDS | ||
# saveRDS(HGNCsymbols, file = "inst/extdata/HGNCsymbols.RDS") | ||
|
||
|
||
# [END] |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# testIsHGNCsymbol.R | ||
|
||
context("test the closure isHGNCsymbol()")

test_that("expected input is correctly handled", {
  # A missing argument and NULL both give a zero-length logical vector.
  expect_equal(isHGNCsymbol(), logical())
  expect_equal(isHGNCsymbol(NULL), logical())

  # A number is not a gene symbol.
  expect_false(isHGNCsymbol(0))

  # Symbols at both ends of the table are found; case is ignored.
  expect_true(isHGNCsymbol("A1BG"))   # First in table
  expect_true(isHGNCsymbol("a1bg"))   # Case insensitive
  expect_true(isHGNCsymbol("ZZZ3"))   # Last in table

  # c() drops the NULL, so this input has three elements; the NA is
  # reported as FALSE.
  mixedInput <- c(NA, "A1CF", NULL, "a2m")
  expect_equal(isHGNCsymbol(mixedInput), c(FALSE, TRUE, TRUE))
})

test_that("unexpected input does not lead to output", {
  # A function cannot be coerced to character, so the call must fail.
  expect_error(isHGNCsymbol(mean), "cannot coerce")
})
|
||
# [END] |