From 34638d4904babb40994a7786bb93d162a10f0d5c Mon Sep 17 00:00:00 2001 From: FelixErnst Date: Sun, 24 Mar 2024 01:23:07 +0100 Subject: [PATCH] Fixed RMBase loading and update RMBase data dictionary --- DESCRIPTION | 1 + NAMESPACE | 1 + R/makeEpiTxDbFromRMBase.R | 82 ++++++++++++++++++++++++--------- data/rmbase_data.rda | Bin 433 -> 1015 bytes man/makeEpiTxDbFromRMBase.Rd | 10 ++-- vignettes/EpiTxDb-creation.Rmd | 2 +- 6 files changed, 70 insertions(+), 26 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index c0fc7c2..3cac48c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -30,6 +30,7 @@ Imports: httr, xml2, curl, + rex, GenomicFeatures, txdbmaker, GenomicRanges, diff --git a/NAMESPACE b/NAMESPACE index 06fa31d..3132ab6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -40,6 +40,7 @@ import(S4Vectors) import(methods) import(GenomicFeatures) import(txdbmaker) +import(rex) importClassesFrom(IRanges,PartitioningByEnd) importClassesFrom(IRanges,PartitioningByWidth) importFrom(BiocFileCache,BiocFileCache) diff --git a/R/makeEpiTxDbFromRMBase.R b/R/makeEpiTxDbFromRMBase.R index b357583..2641100 100644 --- a/R/makeEpiTxDbFromRMBase.R +++ b/R/makeEpiTxDbFromRMBase.R @@ -31,6 +31,7 @@ NULL #' the original base. This uses #' \code{\link[Modstrings:separate]{removeIncompatibleModifications()}} #' function from the \code{Modstrings} package. +#' @param verbose \code{TRUE} or \code{FALSE}: Should verbose messages be printed? #' @param metadata,reassign.ids See \code{\link[=makeEpiTxDb]{makeEpiTxDb}} #' #' @return a \code{EpiTxDb} object. 
@@ -40,12 +41,15 @@ NULL #' @rdname makeEpiTxDbFromRMBase #' @export -EPITXDB_RMBASE_URL <- "http://rna.sysu.edu.cn/rmbase/download/" +EPITXDB_RMBASE_URL <- "https://rna.sysu.edu.cn/rmbase/download/" +EPITXDB_RMBASE_URL_JSON <- paste0(EPITXDB_RMBASE_URL,"ajax/download.json") # makeEpiTxDbFromRMBase -------------------------------------------------------- .get_RMBase_rnames <- function(organism, genome, modtype){ - paste0("RMBase_",organism,"_",genome,"_",modtype) + rmbase_data <- NULL + utils::data("rmbase_data", envir = environment(), package = "EpiTxDb") + rmbase_data[rmbase_data$organism == organism & rmbase_data$genome == genome & rmbase_data$mod == modtype,]$dataSet } .check_RMBase_files_available <- function(bfc, organism, genome, modtype){ @@ -143,11 +147,20 @@ downloadRMBaseFiles <- function(organism, genome, modtype){ #' @export makeEpiTxDbFromRMBase <- function(organism, genome, modtype, tx = NULL, sequences = NULL, metadata = NULL, - reassign.ids = FALSE){ + reassign.ids = FALSE, verbose = FALSE){ + # Input check + if(!(is.logical(reassign.ids) && length(reassign.ids) == 1L)){ + stop("'reassign.ids' must be TRUE or FALSE.", call. = FALSE) + } + if(!(is.logical(verbose) && length(verbose) == 1L)){ + stop("'verbose' must be TRUE or FALSE.", call. 
= FALSE) + } + # message("Loading RMBase files ...") files <- downloadRMBaseFiles(organism, genome, modtype) makeEpiTxDbFromRMBaseFiles(files, tx = tx, sequences = sequences, - metadata = metadata, reassign.ids = reassign.ids) + metadata = metadata, reassign.ids = reassign.ids, + verbose = verbose) } # makeEpiTxDbFromRMBaseFiles --------------------------------------------------- @@ -246,7 +259,7 @@ EPITXDB_RMBASE_REQ_COLUMS <- c("chromosome", "modStart", "modEnd", "modId", } #' @importFrom Biostrings DNAStringSet subseq -.extract_GRanges_from_RMBase <- function(rmb, seqtype = "RNA"){ +.extract_GRanges_from_RMBase <- function(rmb, seqtype = "RNA", verbose = FALSE){ ############################################################################ ### check modification information on correct base seq <- Biostrings::DNAStringSet(rmb$sequence) @@ -267,15 +280,20 @@ EPITXDB_RMBASE_REQ_COLUMS <- c("chromosome", "modStart", "modEnd", "modId", nc_type) f <- Modstrings:::values(codec)[match(modValues, Modstrings:::values(codec))] - # check if reported base matches original base + # check if reported base matches original base, 21 is the middle position of + # 41 nucleotides always returned from RMBase subseq <- Biostrings::subseq(seq,21L,21L) base_mm <- Modstrings:::originatingBase(codec)[f] != subseq # if not delete the modifications with a warning if(any(base_mm)){ - warning("Detected mismatch of modification and originating base in ", + warning("Detected ",sum(base_mm), + " mismatch(es) of modification and originating base in ", "RMBase data for '", paste(unique(mod_type), collapse = "', '"), "'. Removing them ... ", call. 
= FALSE) + if(verbose){ + print(rmb[base_mm,]) + } rmb <- rmb[!base_mm,] } ############################################################################ @@ -337,11 +355,11 @@ EPITXDB_RMBASE_REQ_COLUMS <- c("chromosome", "modStart", "modEnd", "modId", ans } -.get_RMBase_data <- function(files){ +.get_RMBase_data <- function(files, verbose = FALSE){ grl <- lapply(files, function(file){ rmb <- .read_RMBase_file(file) - .extract_GRanges_from_RMBase(rmb) + .extract_GRanges_from_RMBase(rmb, verbose = verbose) }) grl } @@ -376,10 +394,15 @@ EPITXDB_RMBASE_REQ_COLUMS <- c("chromosome", "modStart", "modEnd", "modId", #' @rdname makeEpiTxDbFromRMBase #' @export -getRMBaseDataAsGRanges <- function(files){ +getRMBaseDataAsGRanges <- function(files, verbose = FALSE){ + # Input check + if(!(is.logical(verbose) && length(verbose) == 1L)){ + stop("'verbose' must be TRUE or FALSE.", call. = FALSE) + } + # message("Assembling RMBase data ...") # getting raw data from RMBase files - grl <- .get_RMBase_data(files) + grl <- .get_RMBase_data(files, verbose = verbose) gr <- unlist(GenomicRanges::GRangesList(grl)) gr } @@ -415,8 +438,17 @@ getRMBaseDataAsGRanges <- function(files){ #' @rdname makeEpiTxDbFromRMBase #' @export makeEpiTxDbFromRMBaseFiles <- function(files, tx = NULL, sequences = NULL, - metadata = NULL, reassign.ids = FALSE){ - gr <- getRMBaseDataAsGRanges(files) + metadata = NULL, reassign.ids = FALSE, + verbose = FALSE){ + # Input check + if(!(is.logical(reassign.ids) && length(reassign.ids) == 1L)){ + stop("'reassign.ids' must be TRUE or FALSE.", call. = FALSE) + } + if(!(is.logical(verbose) && length(verbose) == 1L)){ + stop("'verbose' must be TRUE or FALSE.", call. 
= FALSE) + } + # + gr <- getRMBaseDataAsGRanges(files, verbose = verbose) if(!is.null(tx)){ sl <- GenomeInfoDb::seqlevels(tx) } else if(!is.null(sequences)) { @@ -455,22 +487,28 @@ makeEpiTxDbFromRMBaseFiles <- function(files, tx = NULL, sequences = NULL, #' @importFrom curl curl #' @export listAvailableOrganismsFromRMBase <- function(){ - # con <- curl::curl(EPITXDB_RMBASE_URL) - # page <- xml2::read_html(con) - # organisms <- xml2::xml_attr(xml2::xml_find_all(page,'//img[@alt="[DIR]"]//../following::a'),"href") - # organisms <- gsub("/","",organisms) - # organisms[!(organisms %in% c("ajax","otherspecies"))] + # con <- curl::curl(EPITXDB_RMBASE_URL_JSON) + # downloadData <- jsonlite::fromJSON(con) + # rmbase_data <- downloadData$data[,1:3] + # colnames(rmbase_data) <- c("species","mod","dataSet") + # re <- rex("download/", + # capture(alphas, name="organism"), + # "/RMBase_", + # capture(zero_or_more(any,type="lazy"), name="genome"), + # "_",anything,"/zip/", + # capture(zero_or_more(any,type="lazy"), name="file"),"'") + # rmbase_data <- cbind(rmbase_data,re_matches(downloadData$data$Download,re)) rmbase_data <- NULL utils::data("rmbase_data", envir = environment(), package = "EpiTxDb") as.character(unique(rmbase_data$organism)) } #' @importFrom curl curl +#' @import rex .get_RMBase_files <- function(organism){ - con <- curl::curl(paste0(EPITXDB_RMBASE_URL,organism,"/zip/")) - page <- xml2::read_html(con) - files <- xml2::xml_attr(xml2::xml_find_all(page,'//img[@alt="[ ]"]//../following::a'),"href") - files[!grepl("^old",files)] + rmbase_data <- NULL + utils::data("rmbase_data", envir = environment(), package = "EpiTxDb") + as.character(rmbase_data[rmbase_data$organism == organism,]$file) } .get_RMBase_genomes <- function(files){ diff --git a/data/rmbase_data.rda b/data/rmbase_data.rda index 8541d84849da6bddf4af7ca00f4759903de88989..eaca56573e2be5e5623db3f638fa2c096116507a 100644 GIT binary patch literal 1015 zcmVH00B@0UL6W< 
zxxfO9GE-7vG--h{4H{qqJp>IJXvn|=OpOpQ=jxhzdJ+Ht000000000Q01VWjN9uu~ zXaE2J00000000gQgGPWdX^3f{$T18cXbl*cjQ{`$q*DaM2-5_oiK1xzQy>~L8e)1z zo>7tNY@^a`Ow{+y%NJnBdZloR8LE;a)eIcbi`C)zsZk1s0NdGk7-3Sr$dH1!z>o23 z%>f0H0~fdOvX2wiFBy z2--j;Fb-F7=V}{TF$b)DAy(+2>Gx3An-!9pBta5_Nx&tr3GV6R0+<3(DN0Z}L`Vk% zpNW&|=wOjjCtJ5`2HKn`NXa%}gpPN*{J$FpWZl|BKnb-vLlQ@;uzm^7=L94TfIvhC zlf~>bh+_o{khnv|!7wmY@s1!RvneZKAaE)M87)F-=WIUdF37_Kt8-+mZ zJYirF0w6>Pg4PWBTW{I9d75F~c2uSsL_{GJC)G3%07Vm>R&tcaNh6d10E9WrZpnZM zh?QW700bxMRsSu!&+g zs@cF!0Vcps0Cj|sLsJIw`>YeH?UmVFYnF@s;&C0 z5h&mK^@SpZnQqBjG)slX3zD7zb_IhPunD8zmq2VZ1)55IW81(2=4`xCI4{+xHL4|d zfcS$r5Wv7N1NM%F zV7XyVxW_IQ@Vsz?DS%k26p;MrNq8v%Wd^F?3Q&~ES~kaEp*ElvaOk-cK%r6?L$F_j lTG~K7FWfZ@?|Z^P>X=MZL=p#N58tu8UC9*TLO`fA{@BV*y`KO8 literal 433 zcmV;i0Z#rxT4*^jL0KkKS#>X(Q2+tD|H1$M$OHfbe_%eNUO>O+-=IJM1ONa5umO0} zFf~l3Pf_U@k5kkJntDJ0G-zOr41zQ&pQfj(dQIg`>S{fzc{KwjYG@i788pHGkx10^ znwmUF$N&HU(?Ox8f#o1T3(T@tqE?vNMC4~E!Gfvj8dS)Di}qzGmq zz*pKRYNoI{nV!lfxSh7#D_MS3*5d?>BM`E*K`|_m$DVl8r^|Fi0t}OhNC2h)ER>zI zWuG5g@b68mL8#7)W80dlZVbzK#E^=RlVWHRXlP9j696%=X`u+#XhQXLG^R}e371G2F@WrCJ}+MSS4leP0L|gI#pi& zpCX>Ua bvSxh~60Z-