diff --git a/.gitignore b/.gitignore index 16e98ef..f514206 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,5 @@ inst/doc .Rhistory .Rproj* .Rproj.user +vignettes/*.html +vignettes/png/* diff --git a/DESCRIPTION b/DESCRIPTION index 0b29213..5d4ee74 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,68 +1,60 @@ -Package: ggcoverage Type: Package +Package: ggcoverage Title: Visualize Genome/Protein Coverage with Various Annotations -Version: 1.3.0 +Version: 1.4.0 Authors@R: c( - person(given = "Yabing", - family = "Song", - role = c("aut", "cre"), - email = "songyb0519@gmail.com"), - person(given = "Michael", - family = "Jahn", - email = "jahn@mpusp.mpg.de", - role = "aut", + person("Yabing", "Song", , "songyb0519@gmail.com", role = c("aut", "cre", "cph")), + person("Michael", "Jahn", , "jahn@mpusp.mpg.de", role = c("aut", "cph"), comment = c(ORCID = "0000-0002-3913-153X")) ) -Maintainer: Yabing Song -Description: The goal of 'ggcoverage' is to simplify the process of visualizing genome/protein coverage. It contains functions to - load data from BAM, BigWig, BedGraph or txt/xlsx files, create genome/protein coverage plot, add various annotations to - the coverage plot, including base and amino acid annotation, GC annotation, gene annotation, transcript annotation, ideogram annotation, - peak annotation, contact map annotation, link annotation and peotein feature annotation. +Maintainer: Michael Jahn +Description: The goal of 'ggcoverage' is to simplify the process of + visualizing genome/protein coverage. It contains functions to load + data from BAM, BigWig, BedGraph or txt/xlsx files, create + genome/protein coverage plots, add various annotations to the coverage + plot, including base and amino acid annotation, GC annotation, gene + annotation, transcript annotation, ideogram annotation, peak + annotation, contact map annotation, link annotation and protein + feature annotation. 
License: MIT + file LICENSE -Encoding: UTF-8 -RoxygenNote: 7.3.1 -URL: https://showteeth.github.io/ggcoverage/, https://github.com/showteeth/ggcoverage +URL: https://showteeth.github.io/ggcoverage/, + https://github.com/showteeth/ggcoverage BugReports: https://github.com/showteeth/ggcoverage/issues -biocViews: Imports: + BiocParallel, + Biostrings, dplyr, + GenomeInfoDb, + GenomicAlignments, GenomicRanges, - ggbio, ggh4x, + ggpattern, ggplot2, ggrepel, grDevices, + gridExtra, IRanges, - magrittr, + methods, patchwork, - RColorBrewer, rlang, Rsamtools, rtracklayer, scales, stats, - utils, - methods, - GenomeInfoDb, - S4Vectors, - Biostrings, - BSgenome, - GenomicAlignments, - reshape2, - seqinr, - ggforce, - HiCBricks, - ggpattern, - BiocParallel, - openxlsx, - stringr, - gridExtra + utils Suggests: - rmarkdown, - knitr, BiocStyle, - htmltools, BSgenome.Hsapiens.UCSC.hg19, + ggbio, + ggforce, graphics, - HiCDataHumanIMR90 -VignetteBuilder: knitr + HiCBricks, + HiCDataHumanIMR90, + htmltools, + knitr, + rmarkdown +VignetteBuilder: + knitr +biocViews: +Encoding: UTF-8 +RoxygenNote: 7.3.1 diff --git a/NAMESPACE b/NAMESPACE index 9a7fb32..ea95618 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -14,6 +14,7 @@ export(FormatTrack) export(GetConsensusPeak) export(GetPlotData) export(LoadTrackFile) +export(geom_arrows) export(geom_base) export(geom_cnv) export(geom_coverage) @@ -45,14 +46,14 @@ export(theme_protein2) export(theme_tad) export(theme_tad2) export(theme_transcript) -import(HiCBricks) -importFrom(BSgenome,getSeq) importFrom(BiocParallel,MulticoreParam) importFrom(BiocParallel,bplapply) importFrom(BiocParallel,register) +importFrom(Biostrings,getSeq) importFrom(Biostrings,letterFrequency) importFrom(Biostrings,readAAStringSet) importFrom(Biostrings,readDNAStringSet) +importFrom(Biostrings,translate) importFrom(GenomeInfoDb,"seqlengths<-") importFrom(GenomeInfoDb,seqlengths) importFrom(GenomeInfoDb,seqnames) @@ -69,10 +70,9 @@ importFrom(GenomicRanges,trim) 
importFrom(IRanges,IRanges) importFrom(IRanges,findOverlaps) importFrom(IRanges,subsetByOverlaps) -importFrom(RColorBrewer,brewer.pal) importFrom(Rsamtools,ScanBamParam) importFrom(Rsamtools,indexBam) -importFrom(S4Vectors,"values<-") +importFrom(dplyr,"%>%") importFrom(dplyr,all_of) importFrom(dplyr,arrange) importFrom(dplyr,filter) @@ -81,8 +81,6 @@ importFrom(dplyr,mutate) importFrom(dplyr,select) importFrom(dplyr,summarise) importFrom(dplyr,summarize) -importFrom(ggbio,layout_karyogram) -importFrom(ggforce,geom_bezier) importFrom(ggh4x,elem_list_rect) importFrom(ggh4x,facet_wrap2) importFrom(ggh4x,strip_themed) @@ -99,6 +97,7 @@ importFrom(ggplot2,element_rect) importFrom(ggplot2,element_text) importFrom(ggplot2,expansion) importFrom(ggplot2,geom_bar) +importFrom(ggplot2,geom_curve) importFrom(ggplot2,geom_hline) importFrom(ggplot2,geom_label) importFrom(ggplot2,geom_line) @@ -114,7 +113,7 @@ importFrom(ggplot2,ggplot_add) importFrom(ggplot2,labs) importFrom(ggplot2,margin) importFrom(ggplot2,rel) -importFrom(ggplot2,scale_color_gradient2) +importFrom(ggplot2,scale_color_gradientn) importFrom(ggplot2,scale_color_manual) importFrom(ggplot2,scale_fill_manual) importFrom(ggplot2,scale_x_continuous) @@ -125,22 +124,25 @@ importFrom(ggplot2,unit) importFrom(ggrepel,geom_text_repel) importFrom(grDevices,col2rgb) importFrom(grDevices,colorRampPalette) +importFrom(grDevices,grey) importFrom(gridExtra,tableGrob) importFrom(gridExtra,ttheme_default) -importFrom(magrittr,"%>%") importFrom(methods,extends) -importFrom(openxlsx,read.xlsx) importFrom(patchwork,wrap_plots) -importFrom(reshape2,melt) importFrom(rlang,.data) importFrom(rlang,as_label) +importFrom(rtracklayer,GRangesForUCSCGenome) +importFrom(rtracklayer,browserSession) +importFrom(rtracklayer,getTable) importFrom(rtracklayer,import) +importFrom(rtracklayer,tableName) +importFrom(rtracklayer,ucscGenomes) +importFrom(rtracklayer,ucscTableQuery) importFrom(scales,comma) importFrom(scales,rescale) 
importFrom(scales,scientific) -importFrom(seqinr,translate) importFrom(stats,as.formula) -importFrom(stringr,str_locate) +importFrom(stats,reshape) importFrom(utils,menu) importFrom(utils,read.csv) importFrom(utils,read.table) diff --git a/NEWS.md b/NEWS.md index c80ec28..2513161 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,15 @@ +# ggcoverage 1.4.0 +## Major changes +* removed 11 strict dependencies, refactored many functions to work with basic solutions +* formatted all R code with 'styler' +* improved arrow style for genomic features + +## Minor changes +* fixed overlapping gene arrows issue +* dont run failing ideogram example check +* fixed unavailable URLS +* use tidy description + # ggcoverage 1.3.0 ## Major changes * Refactored `LoadTrack` function. diff --git a/R/ConsensusPeak.R b/R/ConsensusPeak.R index 9707bcb..9df6099 100644 --- a/R/ConsensusPeak.R +++ b/R/ConsensusPeak.R @@ -26,7 +26,6 @@ GetConsensusPeak <- function(peak.file, peak.folder = NULL, mspc.path = NULL, rep.type = c("bio", "tec"), stringency.threshold = 1e-8, weak.threshold = 1e-4, gamma = 1e-8, alpha = 0.05, min.overlap.num = 1, multiple.intersections = c("Lowest", "Highest"), parallelism.degree = 1) { - # check parameters rep.type <- match.arg(arg = rep.type) multiple.intersections <- match.arg(arg = multiple.intersections) diff --git a/R/FormatInput.R b/R/FormatInput.R index c7f05b1..f3fe35d 100644 --- a/R/FormatInput.R +++ b/R/FormatInput.R @@ -41,7 +41,7 @@ GetRegion <- function(df, chr, start, end = NULL) { #' @param extend Extend length of \code{region}. Default: 2000. #' #' @return A dataframe. 
-#' @importFrom magrittr %>% +#' @importFrom dplyr %>% #' @importFrom dplyr filter arrange #' #' @export diff --git a/R/LoadTrack.R b/R/LoadTrack.R index 1e63f33..f8d4756 100644 --- a/R/LoadTrack.R +++ b/R/LoadTrack.R @@ -31,7 +31,7 @@ #' @importFrom GenomicAlignments alphabetFrequencyFromBam readGAlignments coverage #' @importFrom GenomicRanges GRanges #' @importFrom IRanges IRanges subsetByOverlaps -#' @importFrom magrittr %>% +#' @importFrom dplyr %>% #' @importFrom dplyr select filter mutate all_of group_by summarize #' @importFrom BiocParallel register MulticoreParam bplapply #' @importFrom ggplot2 cut_width @@ -56,17 +56,16 @@ #' meta.info = sample.meta #' ) LoadTrackFile <- function( - track.file, track.folder = NULL, - format = c("bam", "wig", "bw", "bedgraph", "txt"), - region = NULL, extend = 2000, - gtf.gr = NULL, gene.name = "HNRNPC", - gene.name.type = c("gene_name", "gene_id"), - meta.info = NULL, meta.file = "", - bamcoverage.path = NULL, - norm.method = c("RPKM", "CPM", "BPM", "RPGC", "None"), - single.nuc = FALSE, single.nuc.region = NULL, - bin.size = 10, bc.extra.para = NULL, n.cores = 1 -) { + track.file, track.folder = NULL, + format = c("bam", "wig", "bw", "bedgraph", "txt"), + region = NULL, extend = 2000, + gtf.gr = NULL, gene.name = "HNRNPC", + gene.name.type = c("gene_name", "gene_id"), + meta.info = NULL, meta.file = "", + bamcoverage.path = NULL, + norm.method = c("RPKM", "CPM", "BPM", "RPGC", "None"), + single.nuc = FALSE, single.nuc.region = NULL, + bin.size = 10, bc.extra.para = NULL, n.cores = 1) { # check parameters format <- match.arg(arg = format) gene.name.type <- match.arg(arg = gene.name.type) @@ -83,8 +82,8 @@ LoadTrackFile <- function( if (format == "bam") { seqnames <- Rsamtools::scanBamHeader(track.file[1]) %>% lapply(function(x) x$targets) %>% - unname %>% - unlist + unname() %>% + unlist() gr <- GenomicRanges::GRanges( seqnames = names(seqnames[1]), IRanges(start = 1, end = min(100000, seqnames[1])) @@ -128,20 +127,20 @@ 
LoadTrackFile <- function( BiocParallel::bplapply(track.file, BPPARAM = BiocParallel::MulticoreParam(), FUN = index_bam) } if (single.nuc) { - if (is.null(n.cores) || n.cores == 1) { - track.list <- lapply( - track.file, - single_nuc_cov, - single.nuc.region - ) - } else { - track.list <- BiocParallel::bplapply( - track.file, - BPPARAM = BiocParallel::MulticoreParam(), - FUN = single_nuc_cov, - single.nuc.region - ) - } + if (is.null(n.cores) || n.cores == 1) { + track.list <- lapply( + track.file, + single_nuc_cov, + single.nuc.region + ) + } else { + track.list <- BiocParallel::bplapply( + track.file, + BPPARAM = BiocParallel::MulticoreParam(), + FUN = single_nuc_cov, + single.nuc.region + ) + } } else { if (norm.method == "None") { message("Calculating coverage with GenomicAlignments when 'norm.method = None'") @@ -310,8 +309,7 @@ single_nuc_cov <- function(x, single.nuc.region) { } bam_coverage <- function( - x, bamcoverage.path, bin.size, norm.method, bc.extra.para, gr -) { + x, bamcoverage.path, bin.size, norm.method, bc.extra.para, gr) { # bigwig file out.bw.file <- tempfile(fileext = c(".bw")) # prepare bamCoverage cmd diff --git a/R/geom_base.R b/R/geom_base.R index c044913..4bc56f1 100644 --- a/R/geom_base.R +++ b/R/geom_base.R @@ -34,14 +34,12 @@ #' @param plot.height The relative height of base and amino acid annotation to coverage plot. Default: 0.5. #' #' @return Plot. 
+#' @importFrom stats reshape #' @importFrom GenomicRanges GRanges #' @importFrom IRanges IRanges #' @importFrom GenomicAlignments alphabetFrequencyFromBam -#' @importFrom magrittr %>% -#' @importFrom Biostrings readDNAStringSet -#' @importFrom BSgenome getSeq -#' @importFrom reshape2 melt -#' @importFrom seqinr translate +#' @importFrom dplyr %>% +#' @importFrom Biostrings readDNAStringSet getSeq translate #' @importFrom ggplot2 ggplot_add ggplot geom_bar geom_label unit aes_string geom_hline labs #' geom_tile geom_text theme_classic theme element_blank scale_fill_manual #' element_text element_rect margin scale_x_continuous scale_y_continuous coord_cartesian @@ -49,29 +47,40 @@ #' @export #' #' @examples -#' # library(ggcoverage) -#' # library("BSgenome.Hsapiens.UCSC.hg19") +#' library("BSgenome.Hsapiens.UCSC.hg19") +#' #' # get sample metadata -#' # sample.meta <- data.frame( -#' # SampleName = c("tumorA.chr4.selected"), -#' # Type = c("tumorA"), Group = c("tumorA") -#' # ) +#' sample.meta <- data.frame( +#' SampleName = c("tumorA.chr4.selected"), +#' Type = c("tumorA"), +#' Group = c("tumorA") +#' ) +#' #' # get bam file -#' # bam.file <- system.file("extdata", "DNA-seq", "tumorA.chr4.selected.bam", package = "ggcoverage") +#' bam.file <- +#' system.file("extdata", "DNA-seq", "tumorA.chr4.selected.bam", package = "ggcoverage") +#' #' # load bam file -#' # track.df <- LoadTrackFile( -#' # track.file = bam.file, -#' # meta.info = sample.meta, single.nuc = TRUE, -#' # single.nuc.region = "chr4:62474235-62474295" -#' # ) -#' # ggcoverage( -#' # data = track.df, color = "grey", range.position = "out", -#' # single.nuc = TRUE, rect.color = "white" -#' # ) + -#' # geom_base( -#' # bam.file = bam.file, -#' # bs.fa.seq = BSgenome.Hsapiens.UCSC.hg19 -#' # ) +#' track.df <- LoadTrackFile( +#' track.file = bam.file, +#' meta.info = sample.meta, +#' single.nuc = TRUE, +#' single.nuc.region = "chr4:62474235-62474295" +#' ) +#' +#' # plot +#' ggcoverage( +#' data = track.df, 
+#' color = "grey", +#' range.position = "out", +#' single.nuc = TRUE, +#' rect.color = "white" +#' ) + +#' geom_base( +#' bam.file = bam.file, +#' bs.fa.seq = BSgenome.Hsapiens.UCSC.hg19 +#' ) +#' geom_base <- function(bam.file, fa.file = NULL, bs.fa.seq = NULL, chr.split = "[[:space:]]", nuc.offset = -0.1, nuc.size = 4, nuc.padding = 0.05, nuc.padding.r = 0, nuc.color = c("A" = "#ff2b08", "C" = "#009aff", "G" = "#ffb507", "T" = "#00bc0d"), @@ -86,16 +95,17 @@ geom_base <- function(bam.file, fa.file = NULL, bs.fa.seq = NULL, chr.split = "[ "*" = "#FFC0CB" ), aa.border.color = "white", aa.size = 4, aa.margin = 2, aa.height = 0.4, plot.space = 2.5, plot.height = 0.5) { - structure(list( - bam.file = bam.file, fa.file = fa.file, bs.fa.seq = bs.fa.seq, chr.split = chr.split, - nuc.offset = nuc.offset, nuc.size = nuc.size, nuc.padding = nuc.padding, nuc.padding.r = nuc.padding.r, - nuc.color = nuc.color, guide.line = guide.line, guide.line.color = guide.line.color, guide.line.type = guide.line.type, - mark.type = mark.type, star.size = star.size, - show.aa = show.aa, sens = sens, numcode = numcode, NAstring = NAstring, ambiguous = ambiguous, - aa.color = aa.color, aa.border.color = aa.border.color, aa.size = aa.size, aa.margin = aa.margin, aa.height = aa.height, - plot.space = plot.space, plot.height = plot.height - ), - class = "base" + structure( + list( + bam.file = bam.file, fa.file = fa.file, bs.fa.seq = bs.fa.seq, chr.split = chr.split, + nuc.offset = nuc.offset, nuc.size = nuc.size, nuc.padding = nuc.padding, nuc.padding.r = nuc.padding.r, + nuc.color = nuc.color, guide.line = guide.line, guide.line.color = guide.line.color, guide.line.type = guide.line.type, + mark.type = mark.type, star.size = star.size, + show.aa = show.aa, sens = sens, numcode = numcode, NAstring = NAstring, ambiguous = ambiguous, + aa.color = aa.color, aa.border.color = aa.border.color, aa.size = aa.size, aa.margin = aa.margin, aa.height = aa.height, + plot.space = plot.space, plot.height = 
plot.height + ), + class = "base" ) } @@ -114,7 +124,11 @@ ggplot_add.base <- function(object, plot, object_name) { # notice that this is start, because the use of geom_bar instead of geom_rect # plot.region.end <- plot.data[nrow(plot.data), "start"] plot.region.end <- max(plot.data[, "start"]) - region <- GenomicRanges::GRanges(plot.chr, IRanges::IRanges(plot.region.start, plot.region.end)) + region <- + GenomicRanges::GRanges( + plot.chr, + IRanges::IRanges(plot.region.start, plot.region.end) + ) # get parameters bam.file <- object$bam.file @@ -145,12 +159,15 @@ ggplot_add.base <- function(object, plot, object_name) { plot.height <- object$plot.height # get position AGCT frequency - pos.nuc.freq <- GenomicAlignments::alphabetFrequencyFromBam(bam.file, - param = region, - baseOnly = TRUE - ) + pos.nuc.freq <- + GenomicAlignments::alphabetFrequencyFromBam( + bam.file, + param = region, + baseOnly = TRUE + ) # filter out others - pos.nuc.freq <- pos.nuc.freq[, c("A", "G", "C", "T")] %>% as.data.frame() + pos.nuc.freq <- + pos.nuc.freq[, c("A", "G", "C", "T")] %>% as.data.frame() # add chromosome and position pos.nuc.freq$Chr <- plot.chr pos.nuc.freq$Pos <- plot.region.start:plot.region.end @@ -162,37 +179,49 @@ ggplot_add.base <- function(object, plot, object_name) { } else { fa.seq <- Biostrings::readDNAStringSet(fa.file) # change fasta name - names(fa.seq) <- sapply(strsplit(x = names(fa.seq), split = chr.split), "[", 1) + names(fa.seq) <- + sapply(strsplit(x = names(fa.seq), split = chr.split), "[", 1) } } else { fa.seq <- bs.fa.seq } - region.seq <- BSgenome::getSeq(fa.seq, region) %>% as.character() - region.seq.sc <- unlist(strsplit(region.seq, split = "")) + region.seq <- Biostrings::getSeq(fa.seq, region) # get reference nuc - pos.nuc.freq$Ref <- region.seq.sc + pos.nuc.freq$Ref <- as.character(region.seq) %>% + strsplit("") %>% + unlist() # add fill column - pos.nuc.freq$Total <- rowSums(pos.nuc.freq[, c("A", "G", "C", "T")]) + pos.nuc.freq$Total <- + 
rowSums(pos.nuc.freq[, c("A", "G", "C", "T")]) pos.nuc.freq$Fill <- ifelse(pos.nuc.freq$Total == 0, 1, 0) - # pos.nuc.freq$Total <- NULL - # convert wide to long dataframe - # pos.nuc.freq.long <- reshape2::melt(pos.nuc.freq, id.vars = c("Chr", "Pos", "Ref")) - # colnames(pos.nuc.freq.long) <- c("Chr", "Pos", "Ref", "Base", "Freq") - pos.nuc.freq.long <- reshape2::melt(pos.nuc.freq, id.vars = c("Chr", "Pos", "Ref", "Total")) - colnames(pos.nuc.freq.long) <- c("Chr", "Pos", "Ref", "Total", "Base", "Freq") + pos.nuc.freq.long <- reshape( + pos.nuc.freq, + varying = c("A", "G", "C", "T"), + times = c("A", "G", "C", "T"), + timevar = "Base", + idvar = "id", + v.names = "Freq", + direction = "long", + sep = "", + new.row.names = 1:(nrow(pos.nuc.freq) * 4) + ) # get position with alt alt.pos <- pos.nuc.freq.long %>% - dplyr::filter(.data$Ref == .data$Base & .data$Total != .data$Freq) %>% + dplyr::filter(.data$Ref == .data$Base & + .data$Total != .data$Freq) %>% dplyr::pull(.data$Pos) %>% unique() - alt.pos.nuc.freq.long <- pos.nuc.freq.long %>% dplyr::filter(.data$Pos %in% c(alt.pos)) + alt.pos.nuc.freq.long <- + pos.nuc.freq.long %>% dplyr::filter(.data$Pos %in% c(alt.pos)) # get position without alt ref.pos <- pos.nuc.freq.long %>% - dplyr::filter(.data$Ref == .data$Base & .data$Total == .data$Freq) %>% + dplyr::filter(.data$Ref == .data$Base & + .data$Total == .data$Freq) %>% dplyr::pull(.data$Pos) %>% unique() - ref.pos.nuc.freq.long <- pos.nuc.freq.long %>% dplyr::filter(.data$Pos %in% c(ref.pos)) + ref.pos.nuc.freq.long <- + pos.nuc.freq.long %>% dplyr::filter(.data$Pos %in% c(ref.pos)) # create label offset pos.nuc.freq$Offset <- nuc.offset # add guide line @@ -207,151 +236,181 @@ ggplot_add.base <- function(object, plot, object_name) { # create plot base.plot <- ggplot() + geom_bar( - data = pos.nuc.freq.long, aes_string(x = "Pos", y = "Freq", fill = "Base"), - position = "fill", stat = "identity", color = "white" + data = pos.nuc.freq.long, + aes_string(x = 
"Pos", y = "Freq", fill = "Base"), + position = "fill", + stat = "identity", + color = "white" ) if (mark.type == "twill") { base.plot <- base.plot + ggpattern::geom_col_pattern( data = alt.pos.nuc.freq.long, aes_string( - x = "Pos", y = "Freq", pattern = "Base", fill = "Base", - pattern_fill = "Base", pattern_angle = "Base" + x = "Pos", + y = "Freq", + pattern = "Base", + fill = "Base", + pattern_fill = "Base", + pattern_angle = "Base" ), - position = "fill", colour = "black", - pattern_density = 0.35, pattern_key_scale_factor = 1.3 + position = "fill", + colour = "black", + pattern_density = 0.35, + pattern_key_scale_factor = 1.3 ) + ggpattern::scale_pattern_fill_manual(values = c(nuc.color, "white")) } else if (mark.type == "star") { base.plot <- base.plot + geom_point( - data = alt.pos.nuc.freq.long, aes_string(x = "Pos", y = "1.01"), - shape = 8, show.legend = FALSE, size = star.size + data = alt.pos.nuc.freq.long, + aes_string(x = "Pos", y = "1.01"), + shape = 8, + show.legend = FALSE, + size = star.size ) } } else if (mark.type == "highlight") { base.plot <- ggplot() + geom_bar( - data = ref.pos.nuc.freq.long, aes_string(x = "Pos", y = "Freq"), - position = "fill", stat = "identity", color = "white", fill = "grey" + data = ref.pos.nuc.freq.long, + aes_string(x = "Pos", y = "Freq"), + position = "fill", + stat = "identity", + color = "white", + fill = "grey" ) + geom_bar( - data = alt.pos.nuc.freq.long, aes_string(x = "Pos", y = "Freq", fill = "Base"), - position = "fill", stat = "identity", color = "white" + data = alt.pos.nuc.freq.long, + aes_string(x = "Pos", y = "Freq", fill = "Base"), + position = "fill", + stat = "identity", + color = "white" ) } else { - stop("The mark.type you provided is not valid, please choose from twill, star, highlight.") + stop( + "The mark.type you provided is not valid, please choose from twill, star, highlight." 
+ ) } } else { message("No SNV detected, do not add mark!") # create plot base.plot <- ggplot() + geom_bar( - data = pos.nuc.freq.long, aes_string(x = "Pos", y = "Freq", fill = "Base"), - position = "fill", stat = "identity", color = "white" + data = pos.nuc.freq.long, + aes_string(x = "Pos", y = "Freq", fill = "Base"), + position = "fill", + stat = "identity", + color = "white" ) } if (show.aa) { - # translate - region.aa.0 <- seqinr::translate( - seq = region.seq.sc, frame = 0, sens = sens, - numcode = numcode, NAstring = NAstring, ambiguous = ambiguous - ) - region.aa.1 <- seqinr::translate( - seq = region.seq.sc, frame = 1, sens = sens, - numcode = numcode, NAstring = NAstring, ambiguous = ambiguous - ) - region.aa.2 <- seqinr::translate( - seq = region.seq.sc, frame = 2, sens = sens, - numcode = numcode, NAstring = NAstring, ambiguous = ambiguous - ) - region.aa.0.df <- AAPadding(len = length(region.seq.sc), offset = 0, aa.seq = region.aa.0) - region.aa.0.df$Pos <- pos.nuc.freq$Pos - region.aa.0.df$y <- -0.2 - region.aa.1.df <- AAPadding(len = length(region.seq.sc), offset = 1, aa.seq = region.aa.1) - region.aa.1.df$Pos <- pos.nuc.freq$Pos - region.aa.1.df$y <- -0.4 - region.aa.2.df <- AAPadding(len = length(region.seq.sc), offset = 2, aa.seq = region.aa.2) - region.aa.2.df$Pos <- pos.nuc.freq$Pos - region.aa.2.df$y <- -0.6 + # translate the three different reading frames + list_translated <- lapply(c(0, 1, 2), function(reading_frame) { + region_remain <- (width(region.seq) - reading_frame) %% 3 + region_framed <- Biostrings::subseq( + x = region.seq, + start = 1 + reading_frame, + end = width(region.seq) - region_remain + ) + region_aa <- + Biostrings::translate(x = region_framed, no.init.codon = TRUE) + region_aa_df <- AAPadding( + len = width(region.seq), + offset = reading_frame, + aa.seq = strsplit(as.character(region_aa), "")[[1]] + ) + region_aa_df$Pos <- pos.nuc.freq$Pos + region_aa_df$y <- -0.2 * (1 + reading_frame) + return(region_aa_df) + }) + # get 
base plot base.plot <- base.plot + theme_base2(fill.color = nuc.color) + geom_label( - data = pos.nuc.freq, aes_string( - x = "Pos", y = "Offset", - label = "Ref", fill = "Ref" + data = pos.nuc.freq, + aes_string( + x = "Pos", + y = "Offset", + label = "Ref", + fill = "Ref" ), - show.legend = FALSE, size = nuc.size, + show.legend = FALSE, + size = nuc.size, label.padding = unit(nuc.padding, "lines"), label.r = unit(nuc.padding.r, "lines") ) + labs(y = "Base") + - geom_hline(yintercept = guide.line, color = guide.line.color, linetype = guide.line.type) - # create aa plot - options(digits = nchar(max(region.aa.0.df$Pos)) + 1) - # get rect dataframe used to plot - region.aa.0.rect <- PrepareRect(df = region.aa.0.df, y.center = -0.2) - region.aa.1.rect <- PrepareRect(df = region.aa.1.df, y.center = -0.4) - region.aa.2.rect <- PrepareRect(df = region.aa.2.df, y.center = -0.6) - # aa.plot <- ggplot() + - # geom_tile( - # data = region.aa.0.df, aes_string(x = "Pos", y = "y", fill = "aa"), - # colour = NA, height = 0.2 - # ) + - # geom_tile( - # data = region.aa.1.df, aes_string(x = "Pos", y = "y", fill = "aa"), - # colour = NA, height = 0.2 - # ) + - # geom_tile( - # data = region.aa.2.df, aes_string(x = "Pos", y = "y", fill = "aa"), - # colour = NA, height = 0.2 - # ) + - # labs(y = "AA") + - # geom_text(data = region.aa.0.df, aes_string(x = "Pos", y = "y", label = "anno"), size = aa.size) + - # geom_text(data = region.aa.1.df, aes_string(x = "Pos", y = "y", label = "anno"), size = aa.size) + - # geom_text(data = region.aa.2.df, aes_string(x = "Pos", y = "y", label = "anno"), size = aa.size) + - # theme_aa(margin.len = aa.margin, fill.color = aa.color) - aa.plot <- ggplot() + - geom_rect(data = region.aa.0.rect, aes_string( - xmin = "xmin", xmax = "xmax", - ymin = "ymin", ymax = "ymax", fill = "aa" - ), color = aa.border.color) + - geom_rect(data = region.aa.1.rect, aes_string( - xmin = "xmin", xmax = "xmax", - ymin = "ymin", ymax = "ymax", fill = "aa" - ), color = 
aa.border.color) + - geom_rect(data = region.aa.2.rect, aes_string( - xmin = "xmin", xmax = "xmax", - ymin = "ymin", ymax = "ymax", fill = "aa" - ), color = aa.border.color) + + geom_hline( + yintercept = guide.line, + color = guide.line.color, + linetype = guide.line.type + ) + + # make final AA plot + options(digits = nchar(max(list_translated[[1]]$Pos)) + 1) + aa_plot <- ggplot() + for (reading_frame in seq_along(list_translated)) { + region_aa_rect <- + PrepareRect(df = list_translated[[reading_frame]], y.center = -0.2 * reading_frame) + aa_plot <- aa_plot + + geom_rect( + data = region_aa_rect, + aes_string( + xmin = "xmin", + xmax = "xmax", + ymin = "ymin", + ymax = "ymax", + fill = "aa" + ), + color = aa.border.color + ) + + geom_text( + data = list_translated[[reading_frame]], + aes_string(x = "Pos", y = "y", label = "anno") + ) + } + aa_plot <- aa_plot + labs(y = "AA") + - geom_text(data = region.aa.0.df, aes_string(x = "Pos", y = "y", label = "anno")) + - geom_text(data = region.aa.1.df, aes_string(x = "Pos", y = "y", label = "anno")) + - geom_text(data = region.aa.2.df, aes_string(x = "Pos", y = "y", label = "anno")) + theme_aa(margin.len = aa.margin, fill.color = aa.color) - final.plot <- patchwork::wrap_plots(base.plot, aa.plot, ncol = 1, heights = c(1, aa.height)) + final.plot <- + patchwork::wrap_plots(base.plot, + aa_plot, + ncol = 1, + heights = c(1, aa.height) + ) } else { # create plot without amino acid final.plot <- base.plot + theme_base(margin.len = plot.space, fill.color = nuc.color) + geom_label( - data = pos.nuc.freq, aes_string( - x = "Pos", y = "Offset", - label = "Ref", fill = "Ref" + data = pos.nuc.freq, + aes_string( + x = "Pos", + y = "Offset", + label = "Ref", + fill = "Ref" ), - show.legend = FALSE, size = nuc.size, + show.legend = FALSE, + size = nuc.size, label.padding = unit(nuc.padding, "lines"), label.r = unit(nuc.padding.r, "lines") ) + labs(y = "Base") + - geom_hline(yintercept = guide.line, color = guide.line.color, 
linetype = guide.line.type) + geom_hline( + yintercept = guide.line, + color = guide.line.color, + linetype = guide.line.type + ) } # assemble plot - patchwork::wrap_plots(plot + theme(plot.margin = margin(t = plot.space, b = plot.space)), + patchwork::wrap_plots( + plot + theme(plot.margin = margin(t = plot.space, b = plot.space)), final.plot, - ncol = 1, heights = c(1, plot.height) + ncol = 1, + heights = c(1, plot.height) ) } diff --git a/R/geom_cnv.R b/R/geom_cnv.R index 293e6ef..1c2ff82 100644 --- a/R/geom_cnv.R +++ b/R/geom_cnv.R @@ -13,7 +13,7 @@ #' @param plot.height The relative height of contact map to coverage plot. Default: 0.2. #' #' @return Plot. -#' @importFrom magrittr %>% +#' @importFrom dplyr %>% #' @importFrom GenomicRanges GRanges makeGRangesFromDataFrame start end #' @importFrom IRanges IRanges subsetByOverlaps #' @importFrom ggplot2 ggplot_add ggplot geom_point geom_line geom_hline aes_string labs theme_classic theme element_blank element_rect @@ -42,12 +42,13 @@ geom_cnv <- function(cnv.df, bin.col = 3, cn.col = 4, ref.cn = 2, bin.point.color = "grey", bin.point.alpha = 0.6, cn.line.color = "red", ref.line.color = "black", plot.space = 0.1, plot.height = 0.2) { - structure(list( - cnv.df = cnv.df, bin.col = bin.col, cn.col = cn.col, ref.cn = ref.cn, - bin.point.color = bin.point.color, bin.point.alpha = bin.point.alpha, cn.line.color = cn.line.color, - ref.line.color = ref.line.color, plot.space = plot.space, plot.height = plot.height - ), - class = "cnv" + structure( + list( + cnv.df = cnv.df, bin.col = bin.col, cn.col = cn.col, ref.cn = ref.cn, + bin.point.color = bin.point.color, bin.point.alpha = bin.point.alpha, cn.line.color = cn.line.color, + ref.line.color = ref.line.color, plot.space = plot.space, plot.height = plot.height + ), + class = "cnv" ) } diff --git a/R/geom_coverage.R b/R/geom_coverage.R index fedcf3c..6bc5d96 100644 --- a/R/geom_coverage.R +++ b/R/geom_coverage.R @@ -27,12 +27,11 @@ #' @importFrom ggplot2 aes_string 
scale_fill_manual geom_rect geom_text aes geom_step #' @importFrom rlang .data #' @importFrom grDevices colorRampPalette col2rgb -#' @importFrom RColorBrewer brewer.pal #' @importFrom rlang as_label #' @importFrom stats as.formula #' @importFrom ggh4x facet_wrap2 strip_themed elem_list_rect #' @importFrom dplyr group_by summarise -#' @importFrom magrittr %>% +#' @importFrom dplyr %>% #' @importFrom ggrepel geom_text_repel #' @importFrom utils tail #' @@ -89,9 +88,10 @@ #' geom_coverage( #' data = track.df, facet.key = "Type", #' mark.region = data.frame( -#' start = c(21678900,21732001,21737590), -#' end = c(21679900,21732400,21737650), -#' label=c("M1", "M2", "M3")), +#' start = c(21678900, 21732001, 21737590), +#' end = c(21679900, 21732400, 21737650), +#' label = c("M1", "M2", "M3") +#' ), #' mark.color = grey(0.4) #' ) #' @@ -113,7 +113,8 @@ geom_coverage <- function(data, mapping = NULL, color = NULL, rect.color = NA, if (!is.null(color)) { testcolors <- sapply(color, function(x) { tryCatch(is.matrix(col2rgb(x)), - error = function(e) FALSE) + error = function(e) FALSE + ) }) if (length(color) < length(unique(data[, group.key]))) { warning("Fewer colors provided than there are groups in ", group.key, " variable, falling back to default colors") @@ -260,9 +261,10 @@ geom_coverage <- function(data, mapping = NULL, color = NULL, rect.color = NA, if (range.position == "in") { data.range <- data.range %>% - dplyr::summarise(.groups = "drop_last", - min_score = pretty(.data[[ymax.str]])[1], - max_score = tail(pretty(.data[[ymax.str]]), 1) + dplyr::summarise( + .groups = "drop_last", + min_score = pretty(.data[[ymax.str]])[1], + max_score = tail(pretty(.data[[ymax.str]]), 1) ) data.range$label <- paste0("[", data.range$min_score, ", ", data.range$max_score, "]") region.range <- geom_text( diff --git a/R/geom_feature.R b/R/geom_feature.R index 2f4db9f..7565fbe 100644 --- a/R/geom_feature.R +++ b/R/geom_feature.R @@ -10,7 +10,7 @@ #' @return Plot. 
#' @importFrom utils read.table #' @importFrom dplyr arrange -#' @importFrom magrittr %>% +#' @importFrom dplyr %>% #' @importFrom ggplot2 ggplot_add ggplot geom_segment aes_string theme_classic theme element_blank element_text #' element_rect margin scale_x_continuous scale_y_continuous coord_cartesian scale_color_manual #' @export @@ -42,11 +42,12 @@ #' # ) geom_feature <- function(feature.file = NULL, feature.df = NULL, feature.color = "black", feature.size = 5, plot.space = 0.1, plot.height = 0.1) { - structure(list( - feature.file = feature.file, feature.df = feature.df, feature.color = feature.color, feature.size = feature.size, - plot.space = plot.space, plot.height = plot.height - ), - class = "feature" + structure( + list( + feature.file = feature.file, feature.df = feature.df, feature.color = feature.color, feature.size = feature.size, + plot.space = plot.space, plot.height = plot.height + ), + class = "feature" ) } diff --git a/R/geom_gc.R b/R/geom_gc.R index 0707c58..d16a29d 100644 --- a/R/geom_gc.R +++ b/R/geom_gc.R @@ -12,10 +12,9 @@ #' #' @return Plot. 
#' @importFrom dplyr filter -#' @importFrom magrittr %>% -#' @importFrom Biostrings readDNAStringSet letterFrequency +#' @importFrom dplyr %>% +#' @importFrom Biostrings readDNAStringSet letterFrequency getSeq #' @importFrom GenomicRanges makeGRangesFromDataFrame -#' @importFrom BSgenome getSeq #' @importFrom ggplot2 ggplot_add ggplot geom_line aes_string geom_hline labs theme_classic theme element_blank #' element_text element_rect margin scale_x_continuous scale_y_continuous coord_cartesian #' @export @@ -38,12 +37,13 @@ geom_gc <- function(fa.file = NULL, bs.fa.seq = NULL, chr.split = "[[:space:]]", guide.line = NULL, line.color = "black", guide.line.color = "red", guide.line.type = "dashed", plot.space = 0.1, plot.height = 0.2) { - structure(list( - fa.file = fa.file, bs.fa.seq = bs.fa.seq, chr.split = chr.split, guide.line = guide.line, - line.color = line.color, guide.line.color = guide.line.color, guide.line.type = guide.line.type, - plot.space = plot.space, plot.height = plot.height - ), - class = "gc" + structure( + list( + fa.file = fa.file, bs.fa.seq = bs.fa.seq, chr.split = chr.split, guide.line = guide.line, + line.color = line.color, guide.line.color = guide.line.color, guide.line.type = guide.line.type, + plot.space = plot.space, plot.height = plot.height + ), + class = "gc" ) } @@ -102,7 +102,7 @@ ggplot_add.gc <- function(object, plot, object_name) { end.field = "end" ) # get GRanges' sequence - range.seqs <- BSgenome::getSeq(fa.seq.selected, plot.data.gr) + range.seqs <- Biostrings::getSeq(fa.seq.selected, plot.data.gr) # calculate GC content plot.data$GC <- as.numeric(Biostrings::letterFrequency(x = range.seqs, letters = "GC", as.prob = TRUE)) diff --git a/R/geom_gene.R b/R/geom_gene.R index 7849a32..8424399 100644 --- a/R/geom_gene.R +++ b/R/geom_gene.R @@ -6,22 +6,23 @@ #' and tight (place non-overlap genes in one line). Default: loose. #' @param gene.size The line size of gene. Default: 1. #' @param utr.size The line size of UTR. Default: 2. 
-#' @param exon.size The line size of exon. Default: 4. -#' @param arrow.size The line size of arrow. Default: 1. +#' @param exon.size The line size of exon. Default: 3. +#' @param arrow.size The line size of arrow. Default: 1.5. +#' @param arrow.gap The gap distance between intermittent arrows. Default: NULL. +#' Set arrow.num and arrow.gap to NULL to suppress intermittent arrows. +#' @param arrow.num Total number of intermittent arrows over whole region. Default: 50. +#' Set arrow.num and arrow.gap to NULL to suppress intermittent arrows. #' @param color.by Color the line by. Default: strand. #' @param fill.color Color used for \code{color.by}. -#' Default: darkblue for - (minus strand), darkgreen for + (plus strand). +#' Default: cornflowerblue for - (minus strand), darkolivegreen3 for + (plus strand). -#' @param show.utr Logical value, whether to show UTR. Default: TRUE. -#' @param arrow.gap The gap distance between arrow. Default: NULL. -#' @param arrow.num Total arrow num of whole region. Default: 50. -#' @param arrow.length The length of arrow. Default: 0.06. +#' @param show.utr Logical value, whether to show UTR. Default: FALSE. #' @param label.size The size of gene label. Default: 3. #' @param label.vjust The vjust of gene label. Default: 2. #' @param plot.space Top and bottom margin. Default: 0.1. #' @param plot.height The relative height of gene annotation to coverage plot. Default: 0.2. #' #' @return Plot. 
-#' @importFrom magrittr %>% +#' @importFrom dplyr %>% #' @importFrom rlang .data #' @importFrom GenomicRanges GRanges makeGRangesFromDataFrame setdiff #' @importFrom IRanges IRanges subsetByOverlaps findOverlaps @@ -32,44 +33,79 @@ #' @export #' #' @examples -#' # library(ggcoverage) -#' # library(utils) -#' # library(rtracklayer) -#' # meta.file <- system.file("extdata", "RNA-seq", "meta_info.csv", package = "ggcoverage") -#' # sample.meta <- utils::read.csv(meta.file) +#' library(ggcoverage) +#' library(utils) +#' library(rtracklayer) +#' +#' # load metadata +#' meta_file <- system.file("extdata", "RNA-seq", "meta_info.csv", package = "ggcoverage") +#' sample_meta <- read.csv(meta_file) +#' #' # track folder -#' # track.folder <- system.file("extdata", "RNA-seq", package = "ggcoverage") +#' track_folder <- system.file("extdata", "RNA-seq", package = "ggcoverage") +#' #' # load bigwig file -#' # track.df <- LoadTrackFile( -#' # track.folder = track.folder, format = "bw", -#' # meta.info = sample.meta -#' # ) -#' # gtf.file <- system.file("extdata", "used_hg19.gtf", package = "ggcoverage") -#' # gtf.gr <- rtracklayer::import.gff(con = gtf.file, format = "gtf") -#' # basic.coverage <- ggcoverage(data = track.df, color = "auto", range.position = "out") -#' # basic.coverage + geom_gene(gtf.gr = gtf.gr) -geom_gene <- function(gtf.gr, overlap.gene.gap = 0.1, overlap.style = "loose", gene.size = 1, - utr.size = 2, exon.size = 4, arrow.size = 1, color.by = "strand", - fill.color = c("-" = "darkblue", "+" = "darkgreen"), show.utr = TRUE, - arrow.gap = NULL, arrow.num = 50, arrow.length = 0.06, - label.size = 3, label.vjust = 2, plot.space = 0.1, plot.height = 0.2) { - structure(list( - gtf.gr = gtf.gr, - overlap.gene.gap = overlap.gene.gap, overlap.style = overlap.style, gene.size = gene.size, - utr.size = utr.size, exon.size = exon.size, arrow.size = arrow.size, color.by = color.by, - fill.color = fill.color, show.utr = show.utr, arrow.gap = arrow.gap, arrow.num = 
arrow.num, - arrow.length = arrow.length, label.size = label.size, label.vjust = label.vjust, - plot.space = plot.space, plot.height = plot.height - ), - class = "gene" +#' track_df <- LoadTrackFile( +#' track.folder = track_folder, +#' format = "bw", +#' region = "chr14:21,677,306-21,737,601", +#' extend = 2000, +#' meta.info = sample_meta +#' ) +#' +#' # load GTF file +#' gtf_file <- system.file("extdata", "used_hg19.gtf", package = "ggcoverage") +#' gtf_gr <- rtracklayer::import.gff(con = gtf_file, format = "gtf") +#' +#' # plot coverage and gene annotation +#' basic.coverage <- ggcoverage(data = track_df, range.position = "out") +#' basic.coverage + +#' geom_gene(gtf.gr = gtf_gr) +geom_gene <- function(gtf.gr, + overlap.gene.gap = 0.1, + overlap.style = "loose", + gene.size = 1, + utr.size = 2, + exon.size = 3, + arrow.size = 1.5, + arrow.gap = NULL, + arrow.num = 50, + color.by = "strand", + fill.color = c( + "-" = "cornflowerblue", + "+" = "darkolivegreen3" + ), + show.utr = FALSE, + label.size = 3, + label.vjust = 2, + plot.space = 0.1, + plot.height = 0.2) { + structure( + list( + gtf.gr = gtf.gr, + overlap.gene.gap = overlap.gene.gap, + overlap.style = overlap.style, + gene.size = gene.size, + utr.size = utr.size, + exon.size = exon.size, + arrow.size = arrow.size, + arrow.gap = arrow.gap, + arrow.num = arrow.num, + color.by = color.by, + fill.color = fill.color, + show.utr = show.utr, + label.size = label.size, + label.vjust = label.vjust, + plot.space = plot.space, + plot.height = plot.height + ), + class = "gene" ) } #' @export ggplot_add.gene <- function(object, plot, object_name) { # get plot data - # track.data <- plot$layers[[1]]$data - # get plot data, plot data should contain bins if ("patchwork" %in% class(plot)) { track.data <- plot[[1]]$layers[[1]]$data } else { @@ -78,9 +114,7 @@ ggplot_add.gene <- function(object, plot, object_name) { # prepare plot range # the plot region are not normal, so start is minimum value plot.range.chr <- 
track.data[1, "seqnames"] - # plot.range.start <- track.data[1, "start"] plot.range.start <- min(track.data[, "start"]) - # plot.range.end <- track.data[nrow(track.data), "end"] plot.range.end <- max(track.data[, "end"]) plot.range.gr <- GenomicRanges::GRanges( seqnames = plot.range.chr, @@ -99,15 +133,12 @@ ggplot_add.gene <- function(object, plot, object_name) { show.utr <- object$show.utr arrow.gap <- object$arrow.gap arrow.num <- object$arrow.num - arrow.length <- object$arrow.length label.size <- object$label.size label.vjust <- object$label.vjust plot.space <- object$plot.space plot.height <- object$plot.height - # process # get gene in region - # gtf.gr <- rtracklayer::import.gff(gtf.file,format = 'gtf') gtf.df.used <- IRanges::subsetByOverlaps(x = gtf.gr, ranges = plot.range.gr) %>% as.data.frame() # check information used.gtf.columns <- c("seqnames", "start", "end", "strand", "type", "gene_name") @@ -160,144 +191,69 @@ ggplot_add.gene <- function(object, plot, object_name) { gene.info.used.utr$start <- as.numeric(gene.info.used.utr$start) gene.info.used.utr$end <- as.numeric(gene.info.used.utr$end) # change UTR - if (nrow(gene.info.used.utr) == 0) { - warning("No UTR detected in provided GTF!") + if (show.utr & nrow(gene.info.used.utr) == 0) { + warning("No UTR detected in provided GTF, omitting plotting UTRs.") show.utr <- FALSE } - # create plot without arrow + # plot genomic features with arrow at the end if (show.utr) { # substract UTR from exon gene.exon.utr <- SplitExonUTR(exon.df = gene.info.used.exon, utr.df = gene.info.used.utr) gene.info.used.exon <- gene.exon.utr$exon gene.info.used.utr <- gene.exon.utr$utr - gene.plot <- ggplot() + - geom_segment( - data = gene.info.used.gene, - mapping = aes_string( - x = "start", - y = "group", - xend = "end", - yend = "group", - color = color.by - ), - show.legend = FALSE, - size = gene.size - ) + - geom_segment( - data = gene.info.used.utr, - mapping = aes_string( - x = "start", - y = "group", - xend = 
"end", - yend = "group", - color = color.by - ), - show.legend = FALSE, - size = utr.size - ) + - geom_segment( - data = gene.info.used.exon, - mapping = aes_string( - x = "start", - y = "group", - xend = "end", - yend = "group", - color = color.by - ), - show.legend = FALSE, - size = exon.size - ) - } else { - gene.plot <- ggplot() + - geom_segment( - data = gene.info.used.gene, - mapping = aes_string( - x = "start", - y = "group", - xend = "end", - yend = "group", - color = color.by - ), - show.legend = FALSE, - size = gene.size - ) + - geom_segment( - data = gene.info.used.exon, - mapping = aes_string( - x = "start", - y = "group", - xend = "end", - yend = "group", - color = color.by - ), - show.legend = FALSE, - size = exon.size - ) + } + gene.plot <- ggplot() + + geom_arrows(gene.info.used.gene, color.by, gene.size, arrow.size) + + geom_arrows(gene.info.used.exon, color.by, exon.size, arrow.size) + if (show.utr) { + gene.plot <- gene.plot + + geom_arrows(gene.info.used.utr, color.by, utr.size, arrow.size) } - if (is.null(arrow.gap)) { - if (is.null(arrow.num)) { - stop("Please provide either arrow.num or arrow.gap!") - } else { + if (!is.null(arrow.gap) || !is.null(arrow.num)) { + if (!is.null(arrow.num)) { arrow.gap <- (plot.range.end - plot.range.start) / arrow.num } - } - arrow.list <- list() - # create arrow based on gene - for (i in 1:nrow(gene.info.used.gene)) { - gene.seq <- as.character(gene.info.used.gene[i, "seqnames"]) - gene.start <- as.numeric(gene.info.used.gene[i, "start"]) - gene.end <- as.numeric(gene.info.used.gene[i, "end"]) - gene.strand <- as.character(gene.info.used.gene[i, "strand"]) - gene.type <- as.character(gene.info.used.gene[i, "type"]) - gene.gene_type <- as.character(gene.info.used.gene[i, "gene_type"]) - gene.name <- as.character(gene.info.used.gene[i, "gene_name"]) - gene.group <- as.numeric(gene.info.used.gene[i, "group"]) - gene.gap <- gene.end - gene.start - if (gene.gap <= arrow.gap) { - # create only one arrow - arrow.pos 
<- floor((gene.end + gene.start) / 2) - arrow.list[[gene.name]] <- c( - gene.seq, arrow.pos, arrow.pos + 1, gene.strand, - gene.type, gene.gene_type, gene.name, gene.group - ) - } else { - gene.arrow.num <- floor(gene.gap / arrow.gap) - gene.arrow.start <- (arrow.gap * 0:gene.arrow.num) + gene.start - gene.arrow.end <- gene.arrow.start + 1 - for (grn in 1:length(gene.arrow.start)) { - arrow.list[[paste(gene.name, grn, sep = "_")]] <- - c( - gene.seq, gene.arrow.start[grn], gene.arrow.end[grn], gene.strand, - gene.type, gene.gene_type, gene.name, gene.group - ) + arrow.list <- list() + # create arrow based on gene + for (i in 1:nrow(gene.info.used.gene)) { + gene.seq <- as.character(gene.info.used.gene[i, "seqnames"]) + gene.start <- as.numeric(gene.info.used.gene[i, "start"]) + gene.end <- as.numeric(gene.info.used.gene[i, "end"]) + gene.strand <- as.character(gene.info.used.gene[i, "strand"]) + gene.type <- as.character(gene.info.used.gene[i, "type"]) + gene.gene_type <- as.character(gene.info.used.gene[i, "gene_type"]) + gene.name <- as.character(gene.info.used.gene[i, "gene_name"]) + gene.group <- as.numeric(gene.info.used.gene[i, "group"]) + gene.gap <- gene.end - gene.start + if (gene.gap <= arrow.gap) { + # create only one arrow + arrow.pos <- floor((gene.end + gene.start) / 2) + arrow.list[[gene.name]] <- c( + gene.seq, arrow.pos, arrow.pos + 1, gene.strand, + gene.type, gene.gene_type, gene.name, gene.group + ) + } else { + gene.arrow.num <- floor(gene.gap / arrow.gap) + gene.arrow.start <- (arrow.gap * 0:gene.arrow.num) + gene.start + gene.arrow.end <- gene.arrow.start + 1 + for (grn in 1:length(gene.arrow.start)) { + arrow.list[[paste(gene.name, grn, sep = "_")]] <- + c( + gene.seq, gene.arrow.start[grn], gene.arrow.end[grn], gene.strand, + gene.type, gene.gene_type, gene.name, gene.group + ) + } } } + arrow.df <- do.call(rbind, arrow.list) %>% as.data.frame() + colnames(arrow.df) <- c("seqnames", "start", "end", "strand", "type", "gene_type", 
"gene_name", "group") + arrow.df$start <- as.numeric(arrow.df$start) + arrow.df$end <- as.numeric(arrow.df$end) + arrow.df$group <- as.numeric(arrow.df$group) + gene.plot <- gene.plot + + geom_arrows(arrow.df, color.by, gene.size / 2, arrow.size, 35, TRUE) } - arrow.df <- do.call(rbind, arrow.list) %>% as.data.frame() - colnames(arrow.df) <- c("seqnames", "start", "end", "strand", "type", "gene_type", "gene_name", "group") - arrow.df$start <- as.numeric(arrow.df$start) - arrow.df$end <- as.numeric(arrow.df$end) - arrow.df$group <- as.numeric(arrow.df$group) - - gene.arrow.plot <- gene.plot + geom_segment( - data = arrow.df, - mapping = aes_string( - x = "start", - y = "group", - xend = "end", - yend = "group", - color = color.by - ), - arrow = arrow( - ends = ifelse(arrow.df$strand == "-", "first", "last"), - type = "open", - angle = 45, - length = unit(x = arrow.length, units = "inches") - ), - show.legend = FALSE, - size = arrow.size - ) label.df <- data.frame( pos = (gene.info.used.gene$start + gene.info.used.gene$end) / 2, @@ -305,7 +261,7 @@ ggplot_add.gene <- function(object, plot, object_name) { gene = gene.info.used.gene$gene_name ) - gene.final.plot <- gene.arrow.plot + + gene.final.plot <- gene.plot + geom_text( data = label.df, mapping = aes_string(x = "pos", y = "group", label = "gene"), @@ -317,6 +273,7 @@ ggplot_add.gene <- function(object, plot, object_name) { fill.color = fill.color, x.range = c(plot.range.start, plot.range.end), margin.len = plot.space ) + # assemble plot patchwork::wrap_plots(plot + theme(plot.margin = margin(t = plot.space, b = plot.space)), gene.final.plot, diff --git a/R/geom_ideogram.R b/R/geom_ideogram.R index 21d2807..a621241 100644 --- a/R/geom_ideogram.R +++ b/R/geom_ideogram.R @@ -20,33 +20,76 @@ #' @param plot.height The relative height of ideogram annotation to coverage plot. Default: 0.2. #' #' @return Plot. 
-#' @importFrom magrittr %>% -#' @importFrom ggbio layout_karyogram -#' @importFrom GenomicRanges GRanges makeGRangesFromDataFrame +#' @importFrom dplyr %>% +#' @importFrom GenomicRanges GRanges makeGRangesFromDataFrame trim #' @importFrom IRanges IRanges subsetByOverlaps -#' @importFrom ggplot2 ggplot_add ggplot geom_rect aes_string geom_polygon theme_classic theme element_blank -#' scale_x_continuous scale_y_continuous margin +#' @importFrom ggplot2 ggplot_add ggplot geom_rect aes_string geom_polygon +#' theme_classic theme element_blank scale_x_continuous scale_y_continuous +#' margin #' @importFrom patchwork wrap_plots #' @importFrom methods extends #' @importFrom utils menu +#' @importFrom rtracklayer ucscGenomes ucscTableQuery tableName getTable +#' GRangesForUCSCGenome browserSession #' @importFrom GenomeInfoDb seqlengths seqlengths<- seqnames -#' @importFrom GenomicRanges trim GRanges -#' @importFrom S4Vectors values<- #' @export #' +#' @examples +#' \dontrun{ +#' library(ggbio) +#' +#' # load metadata +#' meta_file <- +#' system.file("extdata", "RNA-seq", "meta_info.csv", package = "ggcoverage") +#' sample_meta <- read.csv(meta_file) +#' +#' # track folder +#' track_folder <- +#' system.file("extdata", "RNA-seq", package = "ggcoverage") +#' # load bigwig file +#' track_df <- LoadTrackFile( +#' track.folder = track_folder, +#' format = "bw", +#' region = "chr14:21,677,306-21,737,601", +#' extend = 2000, +#' meta.info = sample_meta +#' ) +#' +#' # gene annotation +#' gtf_file <- +#' system.file("extdata", "used_hg19.gtf", package = "ggcoverage") +#' gtf_gr <- rtracklayer::import.gff(con = gtf_file, format = "gtf") +#' +#' # coverage plot + ideogram +#' basic_coverage <- ggcoverage( +#' data = track_df, +#' plot.type = "facet", +#' range.position = "in", +#' facet.y.scale = "fixed" +#' ) +#' +#' basic_coverage + +#' geom_gene(gtf.gr = gtf_gr) + +#' geom_ideogram(genome = "hg19", plot.space = 0) +#' } +#' geom_ideogram <- function(genome = "hg19", mark.color = 
"red", mark.alpha = 0.7, mark.line.size = 1, add.shadow = TRUE, shadow.color = "grey", shadow.alpha = 0.7, shadow.line.size = 1, highlight.centromere = FALSE, highlight.color = "green", highlight.alpha = 0.7, highlight.line.size = 1, highlight.shadow.color = "black", highlight.shadow.alpha = 0.7, highlight.shadow.line.size = 1, plot.space = 0.1, plot.height = 0.1) { - structure(list( - genome = genome, mark.color = mark.color, mark.alpha = mark.alpha, mark.line.size = mark.line.size, - add.shadow = add.shadow, shadow.color = shadow.color, shadow.alpha = shadow.alpha, shadow.line.size = shadow.line.size, - highlight.centromere = highlight.centromere, highlight.color = highlight.color, highlight.alpha = highlight.alpha, - highlight.line.size = highlight.line.size, highlight.shadow.color = highlight.shadow.color, highlight.shadow.alpha = highlight.shadow.alpha, - highlight.shadow.line.size = highlight.shadow.line.size, plot.space = plot.space, plot.height = plot.height - ), - class = "ideogram" + # test if suggested package is installed + requireNamespace("ggbio", quietly = TRUE) + + structure( + list( + genome = genome, mark.color = mark.color, mark.alpha = mark.alpha, mark.line.size = mark.line.size, + add.shadow = add.shadow, shadow.color = shadow.color, shadow.alpha = shadow.alpha, shadow.line.size = shadow.line.size, + highlight.centromere = highlight.centromere, highlight.color = highlight.color, highlight.alpha = highlight.alpha, + highlight.line.size = highlight.line.size, highlight.shadow.color = highlight.shadow.color, highlight.shadow.alpha = highlight.shadow.alpha, + highlight.shadow.line.size = highlight.shadow.line.size, plot.space = plot.space, plot.height = plot.height + ), + class = "ideogram" ) } diff --git a/R/geom_link.R b/R/geom_link.R index 1913c15..8be57db 100644 --- a/R/geom_link.R +++ b/R/geom_link.R @@ -1,41 +1,48 @@ -#' Add Links to Coverage Plot. +#' Add Genome Links to Coverage Plot. 
#' #' @param link.file File contains region link information. #' @param file.type The type of \code{link.file}, choose from bedpe, pairs. Default: bedpe. -#' @param score.col Column index contains score information, used when \code{file.type} is bedpe. Default: NULL. +#' @param score.col Column index that contains score information, used when \code{file.type} is bedpe. Default: NULL. #' @param score.threshold The score threshold, used when \code{score.col} is not NULL. Default: NULL. -#' @param score.color The score color vector. The length should be three, the first represents the lowest score, the second represents the -#' middle score, the third represents the highest score. Default: c("blue", "grey", "red"). -#' @param scale.range Scale the height of link according to width, should be greater than or equal to 1 (not scale). Default: 10. +#' @param score.color The score color vector, used as the colors of the score gradient. Default: c("grey70", "#56B1F7", "#132B43"). +#' @param scale.range Scale the height of links according to width, should be greater than or equal to 1 (1 means no scaling). Default: 10. +#' @param plot.curve One of 'curve' or 'bezier'; 'bezier' requires the package \code{ggforce} to be installed. Default: 'curve'. #' @param plot.space Top and bottom margin. Default: 0.1. #' @param plot.height The relative height of link to coverage plot. Default: 0.2. #' @param show.rect Logical value, whether to add rect border to the plot. Default: FALSE. #' #' @return Plot. 
-#' @importFrom magrittr %>% +#' @importFrom dplyr %>% #' @importFrom GenomicRanges GRanges makeGRangesFromDataFrame start end #' @importFrom IRanges IRanges subsetByOverlaps #' @importFrom utils read.table #' @importFrom scales rescale -#' @importFrom ggforce geom_bezier -#' @importFrom ggplot2 ggplot_add ggplot aes_string scale_color_gradient2 labs theme_classic theme element_blank element_rect -#' element_text margin scale_y_continuous scale_x_continuous expansion coord_cartesian +#' @importFrom ggplot2 ggplot_add ggplot aes_string scale_color_gradientn +#' labs theme_classic theme element_blank element_rect +#' element_text margin scale_y_continuous scale_x_continuous expansion +#' coord_cartesian geom_curve #' @importFrom patchwork wrap_plots #' @references \url{https://stuartlab.org/signac/articles/cicero.html} #' @export #' #' @examples #' library(ggcoverage) -#' # create test dataframe -#' # (random, but use seed to obtain same result every time) +#' # create random test data +#' # use seed to obtain same result every time #' set.seed(123) +#' #' df <- data.frame( -#' seqnames = "chr2L", start = seq(from = 8000000, to = 8300000, by = 1000), -#' end = seq(from = 8001000, to = 8301000, by = 1000), score = sample(1:100, 301, replace = TRUE), +#' seqnames = "chr2L", +#' start = seq(from = 8000000, to = 8300000, by = 1000), +#' end = seq(from = 8001000, to = 8301000, by = 1000), +#' score = sample(1:100, 301, replace = TRUE), #' Type = "Example", Group = "Example" #' ) #' # get links -#' link.file = system.file("extdata", "HiC", "HiC_link.bedpe", package = "ggcoverage") +#' link.file <- system.file( +#' "extdata", "HiC", "HiC_link.bedpe", +#' package = "ggcoverage" +#' ) #' #' # create plot #' ggcoverage( @@ -44,23 +51,38 @@ #' ) + #' geom_link(link.file = link.file, file.type = "bedpe", show.rect = TRUE) #' -geom_link <- function(link.file, file.type = "bedpe", score.col = NULL, score.threshold = NULL, - score.color = c("blue", "grey", "red"), scale.range = 10, 
- plot.space = 0.1, plot.height = 0.2, show.rect = FALSE) { - structure(list( - link.file = link.file, file.type = file.type, score.col = score.col, score.threshold = score.threshold, - score.color = score.color, scale.range = scale.range, plot.space = plot.space, plot.height = plot.height, show.rect = show.rect - ), - class = "link" - ) -} +geom_link <- + function(link.file, + file.type = "bedpe", + score.col = NULL, + score.threshold = NULL, + score.color = c("grey70", "#56B1F7", "#132B43"), + scale.range = 10, + plot.curve = "curve", + plot.space = 0.1, + plot.height = 0.2, + show.rect = FALSE) { + structure( + list( + link.file = link.file, + file.type = file.type, + score.col = score.col, + score.threshold = score.threshold, + score.color = score.color, + scale.range = scale.range, + plot.curve = plot.curve, + plot.space = plot.space, + plot.height = plot.height, + show.rect = show.rect + ), + class = "link" + ) + } #' @export ggplot_add.link <- function(object, plot, object_name) { # get plot data - # track.data <- plot$layers[[1]]$data - # get plot data, plot data should contain bins if ("patchwork" %in% class(plot)) { track.data <- plot[[1]]$layers[[1]]$data } else { @@ -84,33 +106,29 @@ ggplot_add.link <- function(object, plot, object_name) { score.threshold <- object$score.threshold score.color <- object$score.color scale.range <- object$scale.range + plot.curve <- object$plot.curve plot.space <- object$plot.space plot.height <- object$plot.height show.rect <- object$show.rect - # check parameter - if (length(score.color) >= 3) { - score.color <- score.color[1:3] - } else { - warning("The score.color you provided is smaller than 3, please check! 
Use default (blue, grey, red) first.") - score.color <- c("blue", "grey", "red") - } - # prepare dataframe if (file.type == "bedpe") { # bedpe: https://bedtools.readthedocs.io/en/latest/content/general-usage.html # read bedpe file link.df <- utils::read.table(file = link.file, sep = "\t") + col_names <- c("chr1", "start1", "end1", "chr2", "start2", "end2") if (!is.null(score.col)) { if (score.col > ncol(link.df)) { - stop("The score column index is bigger than whole dataframe columns, please provide valid score column!") + stop( + "The score column index does not match one of the data columns. Provide column index as integer in the range 1 to ncol(df)" + ) } else { - link.df <- link.df[, c(1, 2, 3, 4, 5, 6, score.col)] - colnames(link.df) <- c("chr1", "start1", "end1", "chr2", "start2", "end2", "score") + link.df <- link.df[c(1, 2, 3, 4, 5, 6, score.col)] + colnames(link.df) <- c(col_names, "score") } } else { - link.df <- link.df[, c(1, 2, 3, 4, 5, 6)] - colnames(link.df) <- c("chr1", "start1", "end1", "chr2", "start2", "end2") + link.df <- link.df[c(1, 2, 3, 4, 5, 6)] + colnames(link.df) <- col_names } # filter link dataframe link.df <- link.df[link.df$chr1 == link.df$chr2, ] @@ -129,8 +147,10 @@ ggplot_add.link <- function(object, plot, object_name) { r1.center <- (link.df$start1 + link.df$end1) / 2 r2.center <- (link.df$start2 + link.df$end2) / 2 # change position - point.start.vec <- ifelse(r1.center < r2.center, r1.center, r2.center) - point.end.vec <- ifelse(r1.center < r2.center, r2.center, r1.center) + point.start.vec <- + ifelse(r1.center < r2.center, r1.center, r2.center) + point.end.vec <- + ifelse(r1.center < r2.center, r2.center, r1.center) # create link point dataframe link.point.df <- data.frame( chr = unique(link.df$chr1), @@ -156,8 +176,16 @@ ggplot_add.link <- function(object, plot, object_name) { link.df <- link.df[link.df$chr1 == link.df$chr1[1], ] } # change position - point.start.vec <- ifelse(link.df$start1 < link.df$start2, link.df$start1, 
link.df$start2) - point.end.vec <- ifelse(link.df$start1 < link.df$start2, link.df$start2, link.df$start1) + point.start.vec <- + ifelse(link.df$start1 < link.df$start2, + link.df$start1, + link.df$start2 + ) + point.end.vec <- + ifelse(link.df$start1 < link.df$start2, + link.df$start2, + link.df$start1 + ) # create link point dataframe link.point.df <- data.frame( chr = unique(link.df$chr1), @@ -166,12 +194,15 @@ ggplot_add.link <- function(object, plot, object_name) { ) } # convert link.point.df to genomic ranges - link.point.gr <- GenomicRanges::makeGRangesFromDataFrame(df = link.point.df, keep.extra.columns = TRUE) + link.point.gr <- + GenomicRanges::makeGRangesFromDataFrame(df = link.point.df, keep.extra.columns = TRUE) # filter link gr - link.point.df <- as.data.frame(IRanges::subsetByOverlaps(x = link.point.gr, ranges = plot.range.gr)) + link.point.df <- + as.data.frame(IRanges::subsetByOverlaps(x = link.point.gr, ranges = plot.range.gr)) # remove links outside region - link.point.df <- link.point.df[link.point.df$start >= GenomicRanges::start(x = plot.range.gr) & - link.point.df$end <= GenomicRanges::end(x = plot.range.gr), ] + link.point.df <- + link.point.df[link.point.df$start >= GenomicRanges::start(x = plot.range.gr) & + link.point.df$end <= GenomicRanges::end(x = plot.range.gr), ] rownames(link.point.df) <- 1:nrow(link.point.df) # check dataframe if (nrow(link.point.df) < 1) { @@ -181,12 +212,13 @@ ggplot_add.link <- function(object, plot, object_name) { ggplot(data = link.plot.df) } else { # prepare plot dataframe - link.point.df$group <- seq_len(length.out = nrow(x = link.point.df)) + link.point.df$group <- + seq_len(length.out = nrow(x = link.point.df)) link.point.plot <- link.point.df link.point.plot$width <- link.point.df$end - link.point.df$start # scale width to range - link.point.plot$rw <- scales::rescale(link.point.plot$width, to = c(1, scale.range)) - + link.point.plot$rw <- + scales::rescale(link.point.plot$width, to = c(1, scale.range)) # 
prepare plot dataframe link.plot.df <- data.frame( x = c( @@ -203,37 +235,63 @@ ggplot_add.link <- function(object, plot, object_name) { ), group = rep(x = link.point.plot$group, 4) ) - # add score and create basic plot if ("score" %in% colnames(link.point.plot)) { # add score link.plot.df$score <- rep(link.point.plot$score, 4) + group_color <- "score" + scale_color <- scale_color_gradientn( + colors = score.color, + limits = range(link.plot.df$score) + ) + } else { + group_color <- NULL + scale_color <- scale_color_manual() + } + if (plot.curve == "bezier") { + # test if suggested package is installed + requireNamespace("ggforce", quietly = TRUE) # create plot link.basic.plot <- ggplot(data = link.plot.df) + - ggforce::geom_bezier( - mapping = aes_string(x = "x", y = "y", group = "group", color = "score") - ) + - scale_color_gradient2( - low = score.color[1], mid = score.color[2], high = score.color[3], - limits = c(min(link.plot.df$score), max(link.plot.df$score)), - n.breaks = 3 - ) - } else { + ggforce::geom_bezier(mapping = aes_string( + x = "x", + y = "y", + group = "group", + color = group_color + )) + + scale_color + } else if (plot.curve == "curve") { link.basic.plot <- - ggplot(data = link.plot.df) + - ggforce::geom_bezier( - mapping = aes_string(x = "x", y = "y", group = "group") - ) + ggplot(data = link.point.df) + + ggplot2::geom_curve( + aes_string( + x = "start", + xend = "end", + y = 0, + yend = 0, + color = group_color + ), + curvature = 1.1, + angle = 90, + ncp = 15 + ) + + scale_color } } # create plot link.plot <- link.basic.plot + labs(y = "Links") + - theme_link(x.range = c(plot.range.start, plot.range.end), margin.len = plot.space, show.rect = show.rect) + theme_link( + x.range = c(plot.range.start, plot.range.end), + margin.len = plot.space, + show.rect = show.rect + ) # assemble plot - patchwork::wrap_plots(plot + theme(plot.margin = margin(t = plot.space, b = plot.space)), + patchwork::wrap_plots( + plot + theme(plot.margin = margin(t = 
plot.space, b = plot.space)), link.plot, - ncol = 1, heights = c(1, plot.height) + ncol = 1, + heights = c(1, plot.height) ) } diff --git a/R/geom_peak.R b/R/geom_peak.R index a018e6c..2a33ca5 100644 --- a/R/geom_peak.R +++ b/R/geom_peak.R @@ -10,7 +10,7 @@ #' @return Plot. #' @importFrom utils read.table #' @importFrom dplyr arrange -#' @importFrom magrittr %>% +#' @importFrom dplyr %>% #' @importFrom ggplot2 ggplot_add ggplot geom_segment aes_string theme_classic theme element_blank element_text #' element_rect margin scale_x_continuous scale_y_continuous coord_cartesian #' @export @@ -43,11 +43,12 @@ #' # basic.coverage + geom_gene(gtf.gr = gtf.gr) + geom_peak(bed.file = peak.file) geom_peak <- function(bed.file = NULL, peak.df = NULL, peak.color = "black", peak.size = 5, plot.space = 0.1, plot.height = 0.1) { - structure(list( - bed.file = bed.file, peak.df = peak.df, peak.color = peak.color, peak.size = peak.size, - plot.space = plot.space, plot.height = plot.height - ), - class = "peak" + structure( + list( + bed.file = bed.file, peak.df = peak.df, peak.color = peak.color, peak.size = peak.size, + plot.space = plot.space, plot.height = plot.height + ), + class = "peak" ) } diff --git a/R/geom_protein.R b/R/geom_protein.R index 3613769..d5630e0 100644 --- a/R/geom_protein.R +++ b/R/geom_protein.R @@ -1,6 +1,6 @@ #' Layer for Protein Coverage Plot. #' -#' @param coverage.file Exported protein coverage file, should be in excel. +#' @param coverage.df Protein coverage, for example output from Proteome Discoverer. #' @param fasta.file Input reference protein fasta file. #' @param protein.id The protein ID of exported coverage file. This should be unique and in \code{fasta.file}. #' @param XCorr.threshold The cross-correlation threshold. Default: 2. @@ -21,12 +21,10 @@ #' out (normal y axis). Default: in. #' #' @return A ggplot2 object. 
-#' @importFrom openxlsx read.xlsx -#' @importFrom magrittr %>% +#' @importFrom dplyr %>% #' @importFrom dplyr filter group_by summarise arrange #' @importFrom rlang .data #' @importFrom Biostrings readAAStringSet -#' @importFrom stringr str_locate #' @importFrom GenomicRanges reduce GRanges setdiff #' @importFrom IRanges IRanges #' @importFrom ggplot2 ggplot geom_rect geom_text aes aes_string @@ -35,18 +33,33 @@ #' @export #' #' @examples +#' \dontrun{ #' library(ggplot2) -#' library(ggcoverage) +#' library(openxlsx) +#' +#' # import coverage dataframe with function from openxlsx #' coverage.file <- system.file( -#' "extdata", "Proteomics", "MS_BSA_coverage.xlsx", package = "ggcoverage" +#' "extdata", "Proteomics", "MS_BSA_coverage.xlsx", +#' package = "ggcoverage" #' ) +#' coverage.df <- read.xlsx(coverage.file) +#' head(coverage.df) +#' +#' # get fasta file #' fasta.file <- system.file( -#' "extdata", "Proteomics", "MS_BSA_coverage.fasta", package = "ggcoverage" +#' "extdata", "Proteomics", "MS_BSA_coverage.fasta", +#' package = "ggcoverage" #' ) -#' protein.id = "sp|P02769|ALBU_BOVIN" +#' +#' protein.id <- "sp|P02769|ALBU_BOVIN" #' ggplot() + -#' geom_protein(coverage.file = coverage.file, fasta.file = fasta.file, protein.id = protein.id) -geom_protein <- function(coverage.file, fasta.file, protein.id, XCorr.threshold = 2, +#' geom_protein( +#' coverage.df = coverage.df, +#' fasta.file = fasta.file, +#' protein.id = protein.id +#' ) +#' } +geom_protein <- function(coverage.df, fasta.file, protein.id, XCorr.threshold = 2, confidence = "High", contaminant = NULL, remove.na = TRUE, color = "grey", mark.bare = TRUE, mark.color = "red", mark.alpha = 0.5, show.table = TRUE, table.position = c("top_right", "top_left", "bottom_right", "bottom_left"), @@ -55,8 +68,6 @@ geom_protein <- function(coverage.file, fasta.file, protein.id, XCorr.threshold table.position <- match.arg(arg = table.position) range.position <- match.arg(arg = range.position) - # load coverage 
dataframe - coverage.df <- openxlsx::read.xlsx(coverage.file) # remove suffix and prefix string coverage.df$Annotated.Sequence <- gsub(pattern = ".*\\.(.*)\\..*", replacement = "\\1", x = coverage.df$Annotated.Sequence) # filter converge according to confidence @@ -103,7 +114,8 @@ geom_protein <- function(coverage.file, fasta.file, protein.id, XCorr.threshold # get the region aa.anno.region <- sapply(coverage.df$peptide, function(x) { - stringr::str_locate(pattern = x, aa.seq.used) + re_result <- regexpr(x, aa.seq.used) + c(re_result, re_result + attr(re_result, which = "match.length")) - 1 }) %>% t() %>% as.data.frame() @@ -197,7 +209,8 @@ geom_protein <- function(coverage.file, fasta.file, protein.id, XCorr.threshold summary.table <- ggplot2::annotation_custom( grob = gridExtra::tableGrob( d = coverage.summary, - theme = table_theme), + theme = table_theme + ), xmin = table_xmin, xmax = table_xmax, ymin = table_ymin, ymax = table_ymax ) diff --git a/R/geom_tad.R b/R/geom_tad.R index dc9367b..2b6e1a4 100644 --- a/R/geom_tad.R +++ b/R/geom_tad.R @@ -3,21 +3,20 @@ #' @param matrix Matrix (n x n) contains contact map information. #' @param granges The rownames and colnames information of matrix. #' @param color.palette One of the RColorbrewer or viridis colour palettes. -#' Parameter of \code{\link{Brick_vizart_plot_heatmap}}. Default: NULL. +#' Parameter of \code{HiCBricks::Brick_vizart_plot_heatmap}. Default: NULL. #' @param value.cut If present, values beyond a certain quantile will be capped to that quantile. -#' Parameter of \code{\link{Brick_vizart_plot_heatmap}}. Default: NULL. +#' Parameter of \code{HiCBricks::Brick_vizart_plot_heatmap}. Default: NULL. #' @param transform.fun If any sort of transformations should be applied to the data before plotting. -#' Parameter of \code{\link{Brick_vizart_plot_heatmap}}. Default: NULL. +#' Parameter of \code{HiCBricks::Brick_vizart_plot_heatmap}. Default: NULL. #' @param plot.space Top and bottom margin. Default: 0.1. 
#' @param plot.height The relative height of contact map to coverage plot. Default: 1. #' @param top Logical value, whether to place this plot on the coverage plot. Default: TRUE. #' @param show.rect Logical value, whether to add rect border to the plot. Default: FALSE. #' #' @return Plot. -#' @importFrom magrittr %>% +#' @importFrom dplyr %>% #' @importFrom GenomicRanges GRanges #' @importFrom IRanges IRanges findOverlaps subsetByOverlaps -#' @import HiCBricks #' @importFrom ggplot2 ggplot_add ggplot labs theme_classic theme element_blank element_rect #' element_text margin scale_y_continuous scale_x_continuous #' @importFrom patchwork wrap_plots @@ -25,59 +24,67 @@ #' #' @examples #' library(ggcoverage) -#' library(GenomicRanges) +#' library(HiCBricks) #' #' # prepare track dataframe -#' track.file = system.file("extdata", "HiC", "H3K36me3.bw", package = "ggcoverage") -#' track.df = LoadTrackFile(track.file = track.file, format = "bw", -#' region = "chr2L:8050000-8300000", extend = 0) -#' track.df$score = ifelse(track.df$score <0, 0, track.df$score) +#' track.file <- system.file("extdata", "HiC", "H3K36me3.bw", package = "ggcoverage") +#' track.df <- LoadTrackFile( +#' track.file = track.file, format = "bw", +#' region = "chr2L:8050000-8300000", extend = 0 +#' ) +#' track.df$score <- ifelse(track.df$score < 0, 0, track.df$score) #' # check the data #' head(track.df) #' #' # Load Hi-C data -#' hic.mat.file = system.file("extdata", "HiC", "HiC_mat.txt", package = "ggcoverage") -#' hic.mat = read.table(file = hic.mat.file, sep = "\t") -#' hic.mat = as.matrix(hic.mat) +#' hic.mat.file <- system.file("extdata", "HiC", "HiC_mat.txt", package = "ggcoverage") +#' hic.mat <- read.table(file = hic.mat.file, sep = "\t") +#' hic.mat <- as.matrix(hic.mat) #' #' # bin data -#' hic.bin.file = system.file("extdata", "HiC", "HiC_bin.txt", package = "ggcoverage") -#' hic.bin = read.table(file = hic.bin.file, sep = "\t") -#' colnames(hic.bin) = c("chr", "start", "end") -#' 
hic.bin.gr = GenomicRanges::makeGRangesFromDataFrame(df = hic.bin) +#' hic.bin.file <- system.file("extdata", "HiC", "HiC_bin.txt", package = "ggcoverage") +#' hic.bin <- read.table(file = hic.bin.file, sep = "\t") +#' colnames(hic.bin) <- c("chr", "start", "end") +#' hic.bin.gr <- GenomicRanges::makeGRangesFromDataFrame(df = hic.bin) #' #' # transfrom function -#' FailSafe_log10 <- function(x){ +#' failsafe_log10 <- function(x) { #' x[is.na(x) | is.nan(x) | is.infinite(x)] <- 0 -#' return(log10(x+1)) +#' return(log10(x + 1)) #' } #' #' # load link data: prepare arcs -#' link.file = system.file("extdata", "HiC", "HiC_link.bedpe", package = "ggcoverage") +#' link.file <- system.file("extdata", "HiC", "HiC_link.bedpe", package = "ggcoverage") #' #' # basic coverage -#' basic.coverage = ggcoverage( +#' basic.coverage <- ggcoverage( #' data = track.df, color = "grey", #' mark.region = NULL, range.position = "out" #' ) #' #' # add annotations #' basic.coverage + -#' geom_tad(matrix = hic.mat, granges = hic.bin.gr, value.cut = 0.99, -#' color.palette = "viridis", transform.fun = FailSafe_log10, -#' top = FALSE, show.rect = TRUE) + +#' geom_tad( +#' matrix = hic.mat, granges = hic.bin.gr, value.cut = 0.99, +#' color.palette = "viridis", transform.fun = failsafe_log10, +#' top = FALSE, show.rect = TRUE +#' ) + #' geom_link(link.file = link.file, file.type = "bedpe", show.rect = TRUE) #' #' @export geom_tad <- function(matrix, granges, color.palette = NULL, value.cut = NULL, transform.fun = NULL, plot.space = 0.1, plot.height = 1, top = TRUE, show.rect = FALSE) { - structure(list( - matrix = matrix, granges = granges, color.palette = color.palette, - value.cut = value.cut, transform.fun = transform.fun, - plot.space = plot.space, plot.height = plot.height, top = top, show.rect = show.rect - ), - class = "tad" + # test if suggested package is installed + requireNamespace("HiCBricks", quietly = TRUE) + + structure( + list( + matrix = matrix, granges = granges, color.palette = 
color.palette, + value.cut = value.cut, transform.fun = transform.fun, + plot.space = plot.space, plot.height = plot.height, top = top, show.rect = show.rect + ), + class = "tad" ) } #' @export diff --git a/R/geom_transcript.R b/R/geom_transcript.R index c06e43a..ac4a103 100644 --- a/R/geom_transcript.R +++ b/R/geom_transcript.R @@ -7,21 +7,22 @@ #' and tight (place non-overlap transcripts in one line). Default: loose. #' @param tx.size The line size of transcript. Default: 1. #' @param utr.size The line size of UTR. Default: 2. -#' @param exon.size The line size of exon. Default: 4. -#' @param arrow.size The line size of arrow. Default: 1. +#' @param exon.size The line size of exon. Default: 3. +#' @param arrow.size The line size of arrow. Default: 3. +#' @param arrow.gap The gap distance between intermittent arrows. Default: NULL. +#' Set arrow.num and arrow.gap to NULL to suppress intermittent arrows. +#' @param arrow.num Total number of intermittent arrows over whole region. Default: 50. +#' Set arrow.num and arrow.gap to NULL to suppress intermittent arrows. #' @param color.by Color the line by. Default: strand. #' @param fill.color Color used for \code{color.by}. -#' Default: darkblue for - (minus strand), darkgreen for + (plus strand). -#' @param arrow.gap The gap distance between arrow. Default: NULL. -#' @param arrow.num Total arrow num of whole region. Default: 50. -#' @param arrow.length The length of arrow. Default: 0.06. +#' Default: cornflowerblue for - (minus strand), darkolivegreen3 for + (plus strand). #' @param label.size The size of transcript label. Default: 3. #' @param label.vjust The vjust of transcript label. Default: 2. #' @param plot.space Top and bottom margin. Default: 0.1. #' @param plot.height The relative height of transcript annotation to coverage plot. Default: 0.2. #' #' @return Plot.
-#' @importFrom magrittr %>% +#' @importFrom dplyr %>% #' @importFrom rlang .data #' @importFrom GenomicRanges GRanges makeGRangesFromDataFrame setdiff #' @importFrom IRanges IRanges subsetByOverlaps findOverlaps @@ -32,37 +33,76 @@ #' @export #' #' @examples -#' # library(ggcoverage) -#' # library(utils) -#' # library(rtracklayer) -#' # meta.file <- system.file("extdata", "RNA-seq", "meta_info.csv", package = "ggcoverage") -#' # sample.meta <- utils::read.csv(meta.file) +#' library(ggcoverage) +#' library(utils) +#' library(rtracklayer) +#' +#' # load metadata +#' meta_file <- system.file("extdata", "RNA-seq", "meta_info.csv", package = "ggcoverage") +#' sample_meta <- read.csv(meta_file) +#' #' # track folder -#' # track.folder <- system.file("extdata", "RNA-seq", package = "ggcoverage") +#' track_folder <- system.file("extdata", "RNA-seq", package = "ggcoverage") +#' #' # load bigwig file -#' # track.df <- LoadTrackFile( -#' # track.folder = track.folder, format = "bw", -#' # meta.info = sample.meta -#' # ) -#' # gtf.file <- system.file("extdata", "used_hg19.gtf", package = "ggcoverage") -#' # gtf.gr <- rtracklayer::import.gff(con = gtf.file, format = "gtf") -#' # basic.coverage <- ggcoverage(data = track.df, color = "auto", range.position = "out") -#' # basic.coverage + geom_transcript(gtf.gr = gtf.gr, label.vjust = 1.5) -geom_transcript <- function(gtf.gr, gene.name = "HNRNPC", overlap.tx.gap = 0.1, overlap.style = "loose", - tx.size = 1, utr.size = 2, exon.size = 4, arrow.size = 1, color.by = "strand", - fill.color = c("-" = "darkblue", "+" = "darkgreen"), - arrow.gap = NULL, arrow.num = 50, arrow.length = 0.06, - label.size = 3, label.vjust = 2, plot.space = 0.1, plot.height = 1) { - structure(list( - gtf.gr = gtf.gr, gene.name = gene.name, overlap.tx.gap = overlap.tx.gap, overlap.style = overlap.style, - tx.size = tx.size, utr.size = utr.size, exon.size = exon.size, arrow.size = arrow.size, color.by = color.by, - fill.color = fill.color, arrow.gap = 
arrow.gap, arrow.num = arrow.num, - arrow.length = arrow.length, label.size = label.size, label.vjust = label.vjust, - plot.space = plot.space, plot.height = plot.height - ), - class = "transcript" - ) -} +#' track_df <- LoadTrackFile( +#' track.folder = track_folder, +#' format = "bw", +#' region = "chr14:21,677,306-21,737,601", +#' extend = 2000, +#' meta.info = sample_meta +#' ) +#' +#' # load GTF file +#' gtf_file <- system.file("extdata", "used_hg19.gtf", package = "ggcoverage") +#' gtf_gr <- rtracklayer::import.gff(con = gtf_file, format = "gtf") +#' +#' # plot coverage and gene annotation +#' basic.coverage <- ggcoverage(data = track_df, range.position = "out") +#' basic.coverage + +#' geom_transcript(gtf.gr = gtf_gr, label.vjust = 1.5) +geom_transcript <- + function(gtf.gr, + gene.name = "HNRNPC", + overlap.tx.gap = 0.1, + overlap.style = "loose", + tx.size = 1, + utr.size = 2, + exon.size = 3, + arrow.size = 3, + arrow.gap = NULL, + arrow.num = 50, + color.by = "strand", + fill.color = c( + "-" = "cornflowerblue", + "+" = "darkolivegreen3" + ), + label.size = 3, + label.vjust = 2, + plot.space = 0.1, + plot.height = 1) { + structure( + list( + gtf.gr = gtf.gr, + gene.name = gene.name, + overlap.tx.gap = overlap.tx.gap, + overlap.style = overlap.style, + tx.size = tx.size, + utr.size = utr.size, + exon.size = exon.size, + arrow.size = arrow.size, + arrow.gap = arrow.gap, + arrow.num = arrow.num, + color.by = color.by, + fill.color = fill.color, + label.size = label.size, + label.vjust = label.vjust, + plot.space = plot.space, + plot.height = plot.height + ), + class = "transcript" + ) + } #' @export ggplot_add.transcript <- function(object, plot, object_name) { @@ -100,7 +140,6 @@ ggplot_add.transcript <- function(object, plot, object_name) { fill.color <- object$fill.color arrow.gap <- object$arrow.gap arrow.num <- object$arrow.num - arrow.length <- object$arrow.length label.size <- object$label.size label.vjust <- object$label.vjust plot.space <- 
object$plot.space @@ -163,50 +202,17 @@ ggplot_add.transcript <- function(object, plot, object_name) { # create basic plot tx.plot <- ggplot() + - geom_segment( - data = gene.tx.df.tx, - mapping = aes_string( - x = "start", - y = "group", - xend = "end", - yend = "group", - color = "strand" - ), - show.legend = FALSE, - size = tx.size - ) + geom_arrows(gene.tx.df.tx, color.by, tx.size, arrow.size) # deal with missing UTR if (is.null(gene.tx.df.utr)) { warning("No UTR detected in provided GTF!") } else { tx.plot <- tx.plot + - geom_segment( - data = gene.tx.df.utr, - mapping = aes_string( - x = "start", - y = "group", - xend = "end", - yend = "group", - color = "strand" - ), - show.legend = FALSE, - size = utr.size - ) + geom_arrows(gene.tx.df.utr, color.by, utr.size, arrow.size) } tx.plot <- tx.plot + - geom_segment( - data = gene.tx.df.exon, - mapping = aes_string( - x = "start", - y = "group", - xend = "end", - yend = "group", - color = "strand" - ), - show.legend = FALSE, - size = exon.size - ) + + geom_arrows(gene.tx.df.exon, color.by, exon.size, arrow.size) + theme_classic() if (is.null(arrow.gap)) { @@ -253,24 +259,8 @@ ggplot_add.transcript <- function(object, plot, object_name) { arrow.df$end <- as.numeric(arrow.df$end) arrow.df$group <- as.numeric(arrow.df$group) # add arrow - tx.arrow.plot <- tx.plot + geom_segment( - data = arrow.df, - mapping = aes_string( - x = "start", - y = "group", - xend = "end", - yend = "group", - color = color.by - ), - arrow = arrow( - ends = ifelse(arrow.df$strand == "-", "first", "last"), - type = "open", - angle = 45, - length = unit(x = arrow.length, units = "inches") - ), - show.legend = FALSE, - size = arrow.size - ) + tx.arrow.plot <- tx.plot + + geom_arrows(arrow.df, color.by, tx.size / 2, arrow.size, 35, TRUE) # prepare label dataframe label.df <- data.frame( diff --git a/R/ggcoverage.R b/R/ggcoverage.R index 3edd15a..230428c 100644 --- a/R/ggcoverage.R +++ b/R/ggcoverage.R @@ -25,17 +25,15 @@ #' @param mark.label.size 
The label size of mark label. Default: 4. #' #' @return A ggplot2 object. -#' @importFrom magrittr %>% +#' @importFrom dplyr %>% #' @importFrom ggplot2 ggplot aes_string scale_fill_manual geom_rect geom_text aes theme_classic theme unit #' element_blank annotate rel scale_y_continuous expansion scale_x_continuous coord_cartesian geom_step #' @importFrom scales comma #' @importFrom grDevices colorRampPalette -#' @importFrom RColorBrewer brewer.pal #' @importFrom rlang as_label .data #' @importFrom stats as.formula #' @importFrom ggh4x facet_wrap2 strip_themed #' @importFrom dplyr group_by summarise -#' @importFrom magrittr %>% #' @importFrom ggrepel geom_text_repel #' @export #' @@ -89,9 +87,10 @@ #' ggcoverage( #' data = track.df, facet.key = "Type", #' mark.region = data.frame( -#' start = c(21678900,21732001,21737590), -#' end = c(21679900,21732400,21737650), -#' label=c("M1", "M2", "M3")), +#' start = c(21678900, 21732001, 21737590), +#' end = c(21679900, 21732400, 21737650), +#' label = c("M1", "M2", "M3") +#' ), #' mark.color = grey(0.4) #' ) #' diff --git a/R/ggprotein.R b/R/ggprotein.R index f0388d2..a7b8a23 100644 --- a/R/ggprotein.R +++ b/R/ggprotein.R @@ -1,6 +1,6 @@ #' Create Mass Spectrometry Protein Coverage Plot. #' -#' @param coverage.file Exported protein coverage file, should be in excel. +#' @param coverage.df Protein coverage, for example output from Proteome Discoverer. #' @param fasta.file Input reference protein fasta file. #' @param protein.id The protein ID of exported coverage file. This should be unique and in \code{fasta.file}. #' @param XCorr.threshold The cross-correlation threshold. Default: 2. 
@@ -24,21 +24,32 @@ #' @export #' #' @examples -#' library(ggcoverage) +#' \dontrun{ +#' library(ggplot2) +#' library(openxlsx) +#' +#' # import coverage dataframe with function from openxlsx #' coverage.file <- system.file( -#' "extdata", "Proteomics", "MS_BSA_coverage.xlsx", package = "ggcoverage" +#' "extdata", "Proteomics", "MS_BSA_coverage.xlsx", +#' package = "ggcoverage" #' ) +#' coverage.df <- read.xlsx(coverage.file) +#' head(coverage.df) +#' +#' # get fasta file #' fasta.file <- system.file( -#' "extdata", "Proteomics", "MS_BSA_coverage.fasta", package = "ggcoverage" +#' "extdata", "Proteomics", "MS_BSA_coverage.fasta", +#' package = "ggcoverage" #' ) -#' protein.id = "sp|P02769|ALBU_BOVIN" #' +#' protein.id <- "sp|P02769|ALBU_BOVIN" #' ggprotein( -#' coverage.file = coverage.file, +#' coverage.df = coverage.df, #' fasta.file = fasta.file, #' protein.id = protein.id #' ) -ggprotein <- function(coverage.file, fasta.file, protein.id, XCorr.threshold = 2, +#' } +ggprotein <- function(coverage.df, fasta.file, protein.id, XCorr.threshold = 2, confidence = "High", contaminant = NULL, remove.na = TRUE, color = "grey", mark.bare = TRUE, mark.color = "red", mark.alpha = 0.5, show.table = TRUE, table.position = c("top_right", "top_left", "bottom_left", "bottom_right"), @@ -50,7 +61,7 @@ ggprotein <- function(coverage.file, fasta.file, protein.id, XCorr.threshold = 2 # ms protein plot protein.plot <- ggplot() + geom_protein( - coverage.file = coverage.file, fasta.file = fasta.file, protein.id = protein.id, + coverage.df = coverage.df, fasta.file = fasta.file, protein.id = protein.id, XCorr.threshold = XCorr.threshold, confidence = confidence, contaminant = contaminant, remove.na = remove.na, color = color, mark.bare = mark.bare, mark.color = mark.color, mark.alpha = mark.alpha, show.table = show.table, table.position = table.position, diff --git a/R/utils.R b/R/utils.R index 148c73c..696a618 100644 --- a/R/utils.R +++ b/R/utils.R @@ -56,7 +56,12 @@ PrepareRegion <- 
function(region = NULL, # select color automatically AutoColor <- function(data, n, name, key) { - getPalette <- grDevices::colorRampPalette(RColorBrewer::brewer.pal(n, name)) + palettes <- list( + Set1 = c("#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00", "#FFFF33", "#A65628", "#F781BF", "#999999"), + Set2 = c("#66C2A5", "#FC8D62", "#8DA0CB", "#E78AC3", "#A6D854", "#FFD92F", "#E5C494", "#B3B3B3"), + Set3 = c("#8DD3C7", "#FFFFB3", "#BEBADA", "#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "#FCCDE5", "#D9D9D9") + ) + getPalette <- grDevices::colorRampPalette(palettes[[name]]) # sample group with same color group.info <- unique(data[, key]) fill.color <- getPalette(length(group.info)) @@ -77,27 +82,6 @@ AAPadding <- function(len, offset = 0, aa.seq) { return(final.aa.df) } -# get gene and transcript group -# divide genes to non-overlap groups (V1) -# GetGeneGroup = function(gene.gr, fc = "queryHits",sc= "subjectHits", overlap.gene.gap=1){ -# overlap.df = IRanges::findOverlaps(gene.gr,gene.gr, ignore.strand=TRUE) %>% as.data.frame() -# overlap.list = lapply(split(overlap.df,overlap.df[,fc]), function(x){ -# x[, sc] -# }) -# group.idx = rep(1, length(unique(overlap.df[,fc]))) -# for (i in seq_along(overlap.list)) { -# ovrlap.vec <- overlap.list[[i]] -# curr.group = group.idx[i] -# remain.vec = setdiff(ovrlap.vec, i) -# for (j in remain.vec) { -# if(group.idx[j] == curr.group){ -# group.idx[j] = curr.group + overlap.gene.gap -# } -# } -# } -# return(group.idx) -# } - # divide genes to non-overlap groups (V2) GetGeneGroup <- function(gene.gr, fc = "queryHits", sc = "subjectHits", overlap.gene.gap = 1) { overlap.df <- IRanges::findOverlaps(gene.gr, gene.gr, ignore.strand = TRUE) %>% @@ -106,13 +90,13 @@ GetGeneGroup <- function(gene.gr, fc = "queryHits", sc = "subjectHits", overlap. 
overlap.list <- lapply(split(overlap.df, overlap.df[, fc]), function(x) { x[, sc] }) - overlap.list <- overlap.list[order(sapply(overlap.list, length))] + overlap.list <- overlap.list[order(sapply(overlap.list, length), decreasing = TRUE)] group.idx <- rep(1, length(unique(overlap.df[, fc]))) for (i in names(overlap.list)) { + overlap.vec <- overlap.list[[i]] i <- as.numeric(i) - ovrlap.vec <- overlap.list[[i]] curr.group <- group.idx[i] - remain.vec <- setdiff(ovrlap.vec, i) + remain.vec <- setdiff(overlap.vec, i) for (j in remain.vec) { if (group.idx[j] == curr.group) { group.idx[j] <- curr.group + overlap.gene.gap @@ -122,7 +106,7 @@ GetGeneGroup <- function(gene.gr, fc = "queryHits", sc = "subjectHits", overlap. return(group.idx) } -# V3: place non-overlap transcripts together +# place non-overlapping transcripts together GetGeneGroupTight <- function(gene.gr, overlap.gene.gap = 1) { # convert to dataframe gene.gr.df <- as.data.frame(gene.gr) @@ -157,7 +141,6 @@ GetGeneGroupTight <- function(gene.gr, overlap.gene.gap = 1) { return(group.index) } -# SplitExonUTR(exon.df = gene.info.used.exon, utr.df = gene.info.used.utr) # substract UTR from exon SplitExonUTR <- function(exon.df, utr.df) { # get metadata @@ -257,13 +240,11 @@ SplitTxExonUTR <- function(exon.df, utr.df) { } # From: https://github.com/jorainer/biovizBase/blob/master/R/ideogram.R -# Fix bug: the names on the supplied 'seqlengths' vector must be identical to the seqnames +# Fix bug: the names on the supplied 'seqlengths' vector must be +# identical to the seqnames getIdeogram <- function(genome, subchr = NULL, cytobands = TRUE) { .gnm <- genome lst <- lapply(.gnm, function(genome) { - # print(genome) - ## to remove the "heavy dependency" we put require here. 
- # require(rtracklayer) if (!(exists("session") && extends(class(session), "BrowserSession"))) { session <- rtracklayer::browserSession() } @@ -283,7 +264,8 @@ getIdeogram <- function(genome, subchr = NULL, cytobands = TRUE) { seqnames = df$chrom, IRanges(start = df$chromStart, end = df$chromEnd) ) - S4Vectors::values(gr) <- df[, c("name", "gieStain")] + gr@elementMetadata$name <- df$name + gr@elementMetadata$gieStain <- df$gieStain message("Loading ranges...") gr.r <- rtracklayer::GRangesForUCSCGenome(genome) @@ -293,7 +275,7 @@ getIdeogram <- function(genome, subchr = NULL, cytobands = TRUE) { suppressWarnings(GenomeInfoDb::seqlengths(gr) <- new.seqlength) gr <- GenomicRanges::trim(gr) } else { - message("cytoBand informatin is not available, only get ranges.") + message("cytoBand information is not available, only getting ranges.") message("Loading ranges...") gr <- rtracklayer::GRangesForUCSCGenome(genome) message("Done") @@ -362,3 +344,76 @@ GetPlotData <- function(plot, layer.num = 1) { plot.data <- eval(parse(text = paste0(plot.layer.str, "$layers[[1]]$data"))) return(plot.data) } + +#' Plot genomic features as arrows. +#' @description +#' This function is a variation of geom_segment to plot (gene) features +#' as arrows. Mainly meant for internal use, not to be called directly. +#' +#' @param data data frame describing arrow position, with columns +#' start, end, group, and a custom 'color' column +#' @param color name of the color column in the data frame +#' @param line_width line_width of the (arrow) segment +#' @param arrow_size size of the arrow +#' @param arrow_angle angle of the arrow. Default: 35° +#' @param intermittent If TRUE, arrows are only drawn intermittently in +#' half-transparent white color. Default: FALSE. +#' @importFrom grDevices grey +#' @return A geom layer for ggplot2 objects. 
+#' @export +geom_arrows <- + function(data, + color, + line_width, + arrow_size, + arrow_angle = 35, + intermittent = FALSE) { + if (nrow(data)) { + if (!"strand" %in% colnames(data)) { + data$strand <- "+" + } + if (!intermittent) { + geom_segment( + data = data, + mapping = aes_string( + x = "start", + y = "group", + xend = "end", + yend = "group", + color = color + ), + arrow = arrow( + ends = ifelse(data$strand == "+", "last", "first"), + angle = arrow_angle, + length = unit(arrow_size, "points"), + type = "open" + ), + lineend = "butt", + linejoin = "mitre", + show.legend = FALSE, + linewidth = line_width + ) + } else { + geom_segment( + data = data, + mapping = aes_string( + x = "start", + y = "group", + xend = "end", + yend = "group" + ), + arrow = arrow( + ends = ifelse(data$strand == "+", "last", "first"), + angle = arrow_angle, + length = unit(arrow_size, "points"), + type = "closed" + ), + lineend = "butt", + linejoin = "mitre", + show.legend = FALSE, + linewidth = line_width, + color = grDevices::grey(1, alpha = 0.5) + ) + } + } + } diff --git a/README.Rmd b/README.Rmd index 7c2049b..239808c 100644 --- a/README.Rmd +++ b/README.Rmd @@ -52,7 +52,7 @@ The goal of `ggcoverage` is simplify the process of visualizing omics coverage. `ggcoverage` is an R package distributed as part of the [CRAN](https://cran.r-project.org/). To install the package, start R and enter one of the following commands: -```{r install, eval=FALSE} +```{r install, eval = FALSE} # install via CRAN (not yet available) install.packages("ggcoverage") @@ -63,12 +63,10 @@ remotes::install_github("showteeth/ggcoverage") In general, it is **recommended** to install from [Github repository](https://github.com/showteeth/ggcoverage) (update more timely). 
-Once `ggcoverage` is installed, it can be loaded (together with other libraries) like this: +Once `ggcoverage` is installed, it can be loaded like any other package: -```{r library, message=FALSE, warning=FALSE} -library("rtracklayer") +```{r library, message = FALSE, warning = FALSE} library("ggcoverage") -library("ggpattern") ``` ## Manual @@ -146,7 +144,7 @@ The basic coverage plot has **two types**: Create line plot for **every sample** (`facet.key = "Type"`) and color by **every sample** (`group.key = "Type"`): -```{r basic_coverage_joint, warning=FALSE, fig.height = 4, fig.width = 12, fig.align = "center"} +```{r basic_coverage_joint, warning = FALSE, fig.height = 4, fig.width = 12, fig.align = "center"} basic_coverage <- ggcoverage( data = track_df, plot.type = "joint", @@ -161,7 +159,7 @@ basic_coverage Create **group average line plot** (sample is indicated by `facet.key = "Type"`, group is indicated by `group.key = "Group"`): -```{r basic_coverage_joint_avg, warning=FALSE, fig.height = 4, fig.width = 12, fig.align = "center"} +```{r basic_coverage_joint_avg, warning = FALSE, fig.height = 4, fig.width = 12, fig.align = "center"} basic_coverage <- ggcoverage( data = track_df, plot.type = "joint", @@ -177,7 +175,7 @@ basic_coverage #### Facet view -```{r basic_coverage, warning=FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} +```{r basic_coverage, warning = FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} basic_coverage <- ggcoverage( data = track_df, plot.type = "facet", @@ -192,7 +190,7 @@ basic_coverage **Change the Y-axis scale label in/out of plot region with `range.position`**: -```{r basic_coverage_2, warning=FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} +```{r basic_coverage_2, warning = FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} basic_coverage <- ggcoverage( data = track_df, plot.type = "facet", @@ -205,7 +203,7 @@ basic_coverage **Shared/Free Y-axis scale with `facet.y.scale`**: -```{r
basic_coverage_3, warning=FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} +```{r basic_coverage_3, warning = FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} basic_coverage <- ggcoverage( data = track_df, plot.type = "facet", @@ -219,7 +217,12 @@ basic_coverage ### Add gene annotation -```{r gene_coverage, warning=FALSE, fig.height = 8, fig.width = 12, fig.align = "center"} +- default behavior is to draw genes (transcripts), exons and UTRs with different line width +- can be adjusted using `gene.size`, `exon.size` and `utr.size` parameters +- frequency of intermittent arrows (light color) can be adjusted using the `arrow.num` and `arrow.gap` parameters +- genomic features are colored by `strand` by default, which can be changed using the `color.by` parameter + +```{r gene_coverage, warning = FALSE, fig.height = 8, fig.width = 12, fig.align = "center"} basic_coverage + geom_gene(gtf.gr = gtf_gr) ``` @@ -229,29 +232,37 @@ basic_coverage + **In "loose" stype (default style; each transcript occupies one line)**: -```{r transcript_coverage, warning=FALSE, fig.height = 12, fig.width = 12, fig.align = "center"} +```{r transcript_coverage, warning = FALSE, fig.height = 12, fig.width = 12, fig.align = "center"} basic_coverage + geom_transcript(gtf.gr = gtf_gr, label.vjust = 1.5) ``` **In "tight" style (place non-overlap transcripts in one line)**: -```{r transcript_coverage_tight, warning=FALSE, fig.height = 12, fig.width = 12, fig.align = "center"} +```{r transcript_coverage_tight, warning = FALSE, fig.height = 12, fig.width = 12, fig.align = "center"} basic_coverage + - geom_transcript(gtf.gr = gtf_gr, - overlap.style = "tight", - label.vjust = 1.5) + geom_transcript( + gtf.gr = gtf_gr, + overlap.style = "tight", + label.vjust = 1.5 + ) ``` ### Add ideogram -```{r ideogram_coverage_1, warning=FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} +The ideogram is an overview plot about the respective position on a chromosome.
+The plotting of the ideogram is implemented by the `ggbio` package. +This package needs to be installed separately (it is only 'Suggested' by `ggcoverage`). + +```{r ideogram_coverage_1, warning = FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} +library(ggbio) + basic_coverage + geom_gene(gtf.gr = gtf_gr) + geom_ideogram(genome = "hg19", plot.space = 0) ``` -```{r ideogram_coverage_2, warning=FALSE, fig.height = 14, fig.width = 12, fig.align = "center"} +```{r ideogram_coverage_2, warning = FALSE, fig.height = 14, fig.width = 12, fig.align = "center"} basic_coverage + geom_transcript(gtf.gr = gtf_gr, label.vjust = 1.5) + geom_ideogram(genome = "hg19", plot.space = 0) @@ -265,7 +276,7 @@ basic_coverage + ##### Load the data -The DNA-seq data used here are from [Copy number work flow](http://bioconductor.org/help/course-materials/2014/SeattleOct2014/B02.2.3_CopyNumber.html), we select tumor sample, and get bin counts with `cn.mops::getReadCountsFromBAM` with `WL` 1000. +The DNA-seq data used here are from [Copy number work flow](https://bioconductor.org/help/course-materials/2014/SeattleOct2014/B02.2.3_CopyNumber.html), we select tumor sample, and get bin counts with `cn.mops::getReadCountsFromBAM` with `WL` 1000. ```{r load_bin_counts} # prepare metafile @@ -277,7 +288,9 @@ cnv_meta_info <- data.frame( # track file track_file <- system.file("extdata", - "DNA-seq", "CNV_example.txt", package = "ggcoverage") + "DNA-seq", "CNV_example.txt", + package = "ggcoverage" +) # load txt file track_df <- LoadTrackFile( @@ -293,7 +306,7 @@ head(track_df) ##### Basic coverage -```{r basic_coverage_dna, warning=FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} +```{r basic_coverage_dna, warning = FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} basic_coverage <- ggcoverage( data = track_df, color = "grey", @@ -307,7 +320,7 @@ basic_coverage Add **GC**, **ideogram** and **gene** annotaions. 
-```{r gc_coverage, warning=FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} +```{r gc_coverage, warning = FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} # load genome data library("BSgenome.Hsapiens.UCSC.hg19") @@ -330,9 +343,11 @@ track_file <- system.file("extdata", "DNA-seq", "SRR054616.bw", package = "ggcoverage") # load track -track_df <- LoadTrackFile(track.file = track_file, - format = "bw", - region = "4:1-160000000") +track_df <- LoadTrackFile( + track.file = track_file, + format = "bw", + region = "4:1-160000000" +) # add chr prefix track_df$seqnames <- paste0("chr", track_df$seqnames) @@ -360,7 +375,8 @@ basic_coverage # prepare files cnv_file <- system.file("extdata", "DNA-seq", "SRR054616_copynumber.txt", - package = "ggcoverage") + package = "ggcoverage" + ) # read CNV cnv_df <- read.table(file = cnv_file, sep = "\t", header = TRUE) @@ -377,9 +393,11 @@ Add **GC**, **ideogram** and **CNV** annotations. # create plot basic_coverage + geom_gc(bs.fa.seq = BSgenome.Hsapiens.UCSC.hg19) + - geom_cnv(cnv.df = cnv_df, - bin.col = 3, - cn.col = 4) + + geom_cnv( + cnv.df = cnv_df, + bin.col = 3, + cn.col = 4 + ) + geom_ideogram( genome = "hg19", plot.space = 0, @@ -402,8 +420,9 @@ sample_meta <- data.frame( # load bam file bam_file <- system.file("extdata", - "DNA-seq", "tumorA.chr4.selected.bam", - package = "ggcoverage") + "DNA-seq", "tumorA.chr4.selected.bam", + package = "ggcoverage" +) track_df <- LoadTrackFile( track.file = bam_file, @@ -421,7 +440,7 @@ For base and amino acid annotation, we have following default color schemes, you Default color scheme for base annotation is `Clustal-style`, more popular color schemes is available [here](https://www.biostars.org/p/171056/). 
-```{r base_color_scheme, warning=FALSE, fig.height = 2, fig.width = 6, fig.align = "center"} +```{r base_color_scheme, warning = FALSE, fig.height = 2, fig.width = 6, fig.align = "center"} # color scheme nuc_color <- c( "A" = "#ff2b08", "C" = "#009aff", "G" = "#ffb507", "T" = "#00bc0d" @@ -455,7 +474,7 @@ graphics::par(opar) Default color scheme for amino acid annotation is from [Residual colours: a proposal for aminochromography](https://academic.oup.com/peds/article/10/7/743/1593029?login=false): -```{r aa_color_scheme, warning=FALSE, fig.height = 9, fig.width = 10, fig.align = "center"} +```{r aa_color_scheme, warning = FALSE, fig.height = 9, fig.width = 10, fig.align = "center"} aa_color <- c( "D" = "#FF0000", "S" = "#FF2400", "T" = "#E34234", "G" = "#FF8000", "P" = "#F28500", "C" = "#FFFF00", "A" = "#FDFF00", "V" = "#E3FF00", @@ -501,7 +520,7 @@ Sys.sleep(60) ``` -```{r base_aa_coverage, warning=FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} +```{r base_aa_coverage, warning = FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} # create plot with twill mark ggcoverage( data = track_df, @@ -510,9 +529,11 @@ ggcoverage( single.nuc = TRUE, rect.color = "white" ) + - geom_base(bam.file = bam_file, - bs.fa.seq = BSgenome.Hsapiens.UCSC.hg19, - mark.type = "twill") + + geom_base( + bam.file = bam_file, + bs.fa.seq = BSgenome.Hsapiens.UCSC.hg19, + mark.type = "twill" + ) + geom_ideogram(genome = "hg19", plot.space = 0) ``` @@ -523,7 +544,7 @@ ggcoverage( Sys.sleep(60) ``` -```{r base_aa_coverage_star, warning=FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} +```{r base_aa_coverage_star, warning = FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} # create plot with star mark ggcoverage( data = track_df, @@ -532,9 +553,11 @@ ggcoverage( single.nuc = TRUE, rect.color = "white" ) + - geom_base(bam.file = bam_file, - bs.fa.seq = BSgenome.Hsapiens.UCSC.hg19, - mark.type = "star") + + geom_base( + bam.file = bam_file, + 
bs.fa.seq = BSgenome.Hsapiens.UCSC.hg19, + mark.type = "star" + ) + geom_ideogram(genome = "hg19", plot.space = 0) ``` @@ -545,7 +568,7 @@ ggcoverage( Sys.sleep(60) ``` -```{r base_aa_coverage_highlight, warning=FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} +```{r base_aa_coverage_highlight, warning = FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} # highlight one base ggcoverage( data = track_df, @@ -554,9 +577,11 @@ ggcoverage( single.nuc = TRUE, rect.color = "white" ) + - geom_base(bam.file = bam_file, - bs.fa.seq = BSgenome.Hsapiens.UCSC.hg19, - mark.type = "highlight") + + geom_base( + bam.file = bam_file, + bs.fa.seq = BSgenome.Hsapiens.UCSC.hg19, + mark.type = "highlight" + ) + geom_ideogram(genome = "hg19", plot.space = 0) ``` @@ -616,10 +641,12 @@ mark_region ### Basic coverage -```{r basic_coverage_chip, warning=FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} -basic_coverage <- ggcoverage(data = track_df, - mark.region = mark_region, - show.mark.label = FALSE) +```{r basic_coverage_chip, warning = FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} +basic_coverage <- ggcoverage( + data = track_df, + mark.region = mark_region, + show.mark.label = FALSE +) basic_coverage ``` @@ -632,12 +659,13 @@ Add **gene**, **ideogram** and **peak** annotations. To create peak annotation, Sys.sleep(60) ``` -```{r peak_coverage, warning=FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} +```{r peak_coverage, warning = FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} # get consensus peak file peak_file <- system.file("extdata", - "ChIP-seq", - "consensus.peak", - package = "ggcoverage") + "ChIP-seq", + "consensus.peak", + package = "ggcoverage" +) basic_coverage + geom_gene(gtf.gr = gtf_gr) + @@ -647,9 +675,14 @@ basic_coverage + ## Hi-C data +The Hi-C method maps chromosome contacts in eukaryotic cells. +For this purpose, DNA and protein complexes are cross-linked and DNA fragments then purified. 
+As a result, even distant chromatin fragments can be found to interact due to the spatial organization of the DNA and histones in the cell. Hi-C data shows these interactions for example as a contact map. + The Hi-C data are from [pyGenomeTracks: reproducible plots for multivariate genomic datasets](https://academic.oup.com/bioinformatics/article/37/3/422/5879987?login=false). -The Hi-C matrix visualization is implemented by [HiCBricks](https://github.com/koustav-pal/HiCBricks). +The Hi-C matrix visualization is implemented by [`HiCBricks`](https://github.com/koustav-pal/HiCBricks). +This package needs to be installed separately (it is only 'Suggested' by `ggcoverage`). ### Load track data @@ -678,7 +711,9 @@ Matrix: ```{r hic_load_hic_matrix} ## matrix hic_mat_file <- system.file("extdata", - "HiC", "HiC_mat.txt", package = "ggcoverage") + "HiC", "HiC_mat.txt", + package = "ggcoverage" +) hic_mat <- read.table(file = hic_mat_file, sep = "\t") hic_mat <- as.matrix(hic_mat) ``` @@ -712,7 +747,7 @@ link_file <- ### Basic coverage -```{r basic_coverage_hic, warning=FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} +```{r basic_coverage_hic, warning = FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} basic_coverage <- ggcoverage( data = track_df, @@ -728,7 +763,9 @@ basic_coverage Add **link**, **contact map**annotations: -```{r hic_coverage, warning=FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} +```{r hic_coverage, warning = FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} +library(HiCBricks) + basic_coverage + geom_tad( matrix = hic_mat, @@ -739,9 +776,11 @@ basic_coverage + top = FALSE, show.rect = TRUE ) + - geom_link(link.file = link_file, - file.type = "bedpe", - show.rect = TRUE) + geom_link( + link.file = link_file, + file.type = "bedpe", + show.rect = TRUE + ) ``` ## Mass spectrometry protein coverage @@ -750,14 +789,16 @@ basic_coverage + ### Load coverage -The exported coverage from [Proteome 
Discoverer](https://www.thermofisher.cn/cn/zh/home/industrial/mass-spectrometry/liquid-chromatography-mass-spectrometry-lc-ms/lc-ms-software/multi-omics-data-analysis/proteome-discoverer-software.html?adobe_mc=MCMID%7C90228073352279367993013412919222863692%7CMCAID%3D3208C32C269355DE-4000028116B65FEB%7CMCORGID%3D5B135A0C5370E6B40A490D44%40AdobeOrg%7CTS=1614293705): +The exported coverage from [Proteome Discoverer](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8006021/): ```{r ms_coverage_data} library(openxlsx) # prepare coverage dataframe coverage_file <- system.file("extdata", - "Proteomics", "MS_BSA_coverage.xlsx", package = "ggcoverage") + "Proteomics", "MS_BSA_coverage.xlsx", + package = "ggcoverage" + ) coverage_df <- openxlsx::read.xlsx(coverage_file, sheet = "Sheet1") # check the data head(coverage_df) @@ -768,7 +809,9 @@ The input protein fasta: ```{r ms_coverage_fasta} fasta_file <- system.file("extdata", - "Proteomics", "MS_BSA_coverage.fasta", package = "ggcoverage") + "Proteomics", "MS_BSA_coverage.fasta", + package = "ggcoverage" + ) # prepare track dataframe protein_set <- Biostrings::readAAStringSet(fasta_file) @@ -779,9 +822,9 @@ protein_set ### Protein coverage -```{r basic_coverage_protein, warning=FALSE, fig.height = 6, fig.width = 10, fig.align = "center"} +```{r basic_coverage_protein, warning = FALSE, fig.height = 6, fig.width = 10, fig.align = "center"} protein_coverage <- ggprotein( - coverage.file = coverage_file, + coverage.df = coverage_df, fasta.file = fasta_file, protein.id = "sp|P02769|ALBU_BOVIN", range.position = "out" @@ -794,7 +837,7 @@ protein_coverage We can obtain features of the protein from [UniProt](https://www.uniprot.org/). For example, the above protein coverage plot shows that there is empty region in 1-24, and this empty region in [UniProt](https://www.uniprot.org/uniprotkb/P02769/entry) is annotated as Signal peptide and Propeptide peptide. 
When the protein is mature and released extracellular, these peptides will be cleaved. This is the reason why there is empty region in 1-24. -```{r basic_coverage_protein_feature, warning=FALSE, fig.height = 6, fig.width = 10, fig.align = "center"} +```{r basic_coverage_protein_feature, warning = FALSE, fig.height = 6, fig.width = 10, fig.align = "center"} # protein feature obtained from UniProt protein_feature_df <- data.frame( ProteinID = "sp|P02769|ALBU_BOVIN", @@ -805,8 +848,10 @@ protein_feature_df <- data.frame( # add annotation protein_coverage + - geom_feature(feature.df = protein_feature_df, - feature.color = c("#4d81be", "#173b5e", "#6a521d")) + geom_feature( + feature.df = protein_feature_df, + feature.color = c("#4d81be", "#173b5e", "#6a521d") + ) ``` ## Code of Conduct diff --git a/README.md b/README.md index c38abed..3323a9e 100644 --- a/README.md +++ b/README.md @@ -63,13 +63,10 @@ In general, it is **recommended** to install from [Github repository](https://github.com/showteeth/ggcoverage) (update more timely). -Once `ggcoverage` is installed, it can be loaded (together with other -libraries) like this: +Once `ggcoverage` is installed, it can be loaded like any other package: ``` r -library("rtracklayer") library("ggcoverage") -library("ggpattern") ``` ## Manual @@ -259,6 +256,15 @@ basic_coverage ### Add gene annotation +- default behavior is to draw genes (transcripts), exons and UTRs with + different line width +- can be adjusted using `gene.size`, `exon.size` and `utr.size` + parameters +- frequency of intermittent arrows (light color) can be adjusted using + the `arrow.num` and `arrow.gap` parameters +- genomic features are colored by `strand` by default, which can be + changed using the `color.by` parameter + ``` r basic_coverage + geom_gene(gtf.gr = gtf_gr) @@ -290,7 +296,40 @@ basic_coverage + ### Add ideogram +The ideogram is an overview plot about the respective position on a +chromosome.
The plotting of the ideogram is implemented by the `ggbio` +package. This package needs to be installed separately (it is only +‘Suggested’ by `ggcoverage`). + ``` r +library(ggbio) +#> Loading required package: BiocGenerics +#> +#> Attaching package: 'BiocGenerics' +#> The following objects are masked from 'package:stats': +#> +#> IQR, mad, sd, var, xtabs +#> The following objects are masked from 'package:base': +#> +#> anyDuplicated, aperm, append, as.data.frame, basename, cbind, +#> colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find, +#> get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply, +#> match, mget, order, paste, pmax, pmax.int, pmin, pmin.int, +#> Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort, +#> table, tapply, union, unique, unsplit, which.max, which.min +#> Loading required package: ggplot2 +#> Registered S3 method overwritten by 'GGally': +#> method from +#> +.gg ggplot2 +#> Need specific help about ggbio? try mailing +#> the maintainer or visit https://lawremi.github.io/ggbio/ +#> +#> Attaching package: 'ggbio' +#> The following objects are masked from 'package:ggplot2': +#> +#> geom_bar, geom_rect, geom_segment, ggsave, stat_bin, stat_identity, +#> xlim + basic_coverage + geom_gene(gtf.gr = gtf_gr) + geom_ideogram(genome = "hg19", plot.space = 0) @@ -323,7 +362,7 @@ basic_coverage + ##### Load the data The DNA-seq data used here are from [Copy number work -flow](http://bioconductor.org/help/course-materials/2014/SeattleOct2014/B02.2.3_CopyNumber.html), +flow](https://bioconductor.org/help/course-materials/2014/SeattleOct2014/B02.2.3_CopyNumber.html), we select tumor sample, and get bin counts with `cn.mops::getReadCountsFromBAM` with `WL` 1000. @@ -380,6 +419,19 @@ Add **GC**, **ideogram** and **gene** annotaions. 
# load genome data library("BSgenome.Hsapiens.UCSC.hg19") #> Loading required package: BSgenome +#> Loading required package: S4Vectors +#> Loading required package: stats4 +#> +#> Attaching package: 'S4Vectors' +#> The following object is masked from 'package:utils': +#> +#> findMatches +#> The following objects are masked from 'package:base': +#> +#> expand.grid, I, unname +#> Loading required package: IRanges +#> Loading required package: GenomeInfoDb +#> Loading required package: GenomicRanges #> Loading required package: Biostrings #> Loading required package: XVector #> @@ -387,6 +439,7 @@ library("BSgenome.Hsapiens.UCSC.hg19") #> The following object is masked from 'package:base': #> #> strsplit +#> Loading required package: rtracklayer # create plot basic_coverage + @@ -647,8 +700,6 @@ ggcoverage( #> Adding another scale for x, which will replace the existing scale. ``` - - **Use star to mark position with SNV**: ``` r @@ -806,12 +857,21 @@ basic_coverage + ## Hi-C data +The Hi-C method maps chromosome contacts in eukaryotic cells. For this +purpose, DNA and protein complexes are cross-linked and DNA fragments +then purified. As a result, even distant chromatin fragments can be +found to interact due to the spatial organization of the DNA and +histones in the cell. Hi-C data shows these interactions for example as +a contact map. + The Hi-C data are from [pyGenomeTracks: reproducible plots for multivariate genomic datasets](https://academic.oup.com/bioinformatics/article/37/3/422/5879987?login=false). The Hi-C matrix visualization is implemented by -[HiCBricks](https://github.com/koustav-pal/HiCBricks). +[`HiCBricks`](https://github.com/koustav-pal/HiCBricks). This package +needs to be installed separately (it is only ‘Suggested’ by +`ggcoverage`). 
### Load track data @@ -901,6 +961,18 @@ basic_coverage Add **link**, **contact map**annotations: ``` r +library(HiCBricks) +#> Loading required package: curl +#> Using libcurl 7.81.0 with OpenSSL/3.0.2 +#> Loading required package: rhdf5 +#> Loading required package: R6 +#> Loading required package: grid +#> +#> Attaching package: 'grid' +#> The following object is masked from 'package:Biostrings': +#> +#> pattern + basic_coverage + geom_tad( matrix = hic_mat, @@ -940,7 +1012,7 @@ experiment. ### Load coverage The exported coverage from [Proteome -Discoverer](https://www.thermofisher.cn/cn/zh/home/industrial/mass-spectrometry/liquid-chromatography-mass-spectrometry-lc-ms/lc-ms-software/multi-omics-data-analysis/proteome-discoverer-software.html?adobe_mc=MCMID%7C90228073352279367993013412919222863692%7CMCAID%3D3208C32C269355DE-4000028116B65FEB%7CMCORGID%3D5B135A0C5370E6B40A490D44%40AdobeOrg%7CTS=1614293705): +Discoverer](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8006021/): ``` r library(openxlsx) @@ -1024,7 +1096,7 @@ protein_set ``` r protein_coverage <- ggprotein( - coverage.file = coverage_file, + coverage.df = coverage_df, fasta.file = fasta_file, protein.id = "sp|P02769|ALBU_BOVIN", range.position = "out" diff --git a/cran-comments.md b/cran-comments.md new file mode 100644 index 0000000..fe4bec6 --- /dev/null +++ b/cran-comments.md @@ -0,0 +1,37 @@ +# Resubmission + +This package has been submitted previously (2023, v0.7.1) and was removed from +CRAN due to several issues. In the meantime, many functions were refactored, +more than 10 dependencies were removed to make the package lighter, and other +problems regarding documentation and style were fixed. The current version 1.4.0 +now builds fine on the tested platforms.
+ +## Test environments + +### Local + +- Ubuntu 22.04 + +### with Github Actions + +- windows-latest (release) +- macOS-latest (release) +- ubuntu-latest (release) + +## R CMD check results + +0 errors | 0 warnings | 1 note + +There was 1 NOTE: + +``` +❯ checking installed package size ... NOTE + installed size is 31.9Mb + sub-directories of 1Mb or more: + doc 2.2Mb + extdata 28.6Mb +``` + +## Downstream dependencies + +- There are currently no downstream dependencies for this package diff --git a/man/figures/README-base_aa_coverage_highlight-1.png b/man/figures/README-base_aa_coverage_highlight-1.png index 2cc380a..9015d7d 100644 Binary files a/man/figures/README-base_aa_coverage_highlight-1.png and b/man/figures/README-base_aa_coverage_highlight-1.png differ diff --git a/man/figures/README-base_aa_coverage_star-1.png b/man/figures/README-base_aa_coverage_star-1.png index c85ae54..552806c 100644 Binary files a/man/figures/README-base_aa_coverage_star-1.png and b/man/figures/README-base_aa_coverage_star-1.png differ diff --git a/man/figures/README-basic_coverage-1.png b/man/figures/README-basic_coverage-1.png index 3170da2..d18f4c7 100644 Binary files a/man/figures/README-basic_coverage-1.png and b/man/figures/README-basic_coverage-1.png differ diff --git a/man/figures/README-basic_coverage_2-1.png b/man/figures/README-basic_coverage_2-1.png index c140810..e7c6bcc 100644 Binary files a/man/figures/README-basic_coverage_2-1.png and b/man/figures/README-basic_coverage_2-1.png differ diff --git a/man/figures/README-basic_coverage_3-1.png b/man/figures/README-basic_coverage_3-1.png index 13ea576..defc78b 100644 Binary files a/man/figures/README-basic_coverage_3-1.png and b/man/figures/README-basic_coverage_3-1.png differ diff --git a/man/figures/README-basic_coverage_chip-1.png b/man/figures/README-basic_coverage_chip-1.png index 0f4fd0b..f9d6200 100644 Binary files a/man/figures/README-basic_coverage_chip-1.png and b/man/figures/README-basic_coverage_chip-1.png 
differ diff --git a/man/figures/README-basic_coverage_protein-1.png b/man/figures/README-basic_coverage_protein-1.png index d4e0c66..7e6fde7 100644 Binary files a/man/figures/README-basic_coverage_protein-1.png and b/man/figures/README-basic_coverage_protein-1.png differ diff --git a/man/figures/README-basic_coverage_protein_feature-1.png b/man/figures/README-basic_coverage_protein_feature-1.png index caaf3ac..f5df098 100644 Binary files a/man/figures/README-basic_coverage_protein_feature-1.png and b/man/figures/README-basic_coverage_protein_feature-1.png differ diff --git a/man/figures/README-gc_coverage-1.png b/man/figures/README-gc_coverage-1.png index e7532ca..d83be30 100644 Binary files a/man/figures/README-gc_coverage-1.png and b/man/figures/README-gc_coverage-1.png differ diff --git a/man/figures/README-gene_coverage-1.png b/man/figures/README-gene_coverage-1.png index ae4c87f..49898c2 100644 Binary files a/man/figures/README-gene_coverage-1.png and b/man/figures/README-gene_coverage-1.png differ diff --git a/man/figures/README-hic_coverage-1.png b/man/figures/README-hic_coverage-1.png index d290975..c73a081 100644 Binary files a/man/figures/README-hic_coverage-1.png and b/man/figures/README-hic_coverage-1.png differ diff --git a/man/figures/README-ideogram_coverage_1-1.png b/man/figures/README-ideogram_coverage_1-1.png index 8adde79..665b726 100644 Binary files a/man/figures/README-ideogram_coverage_1-1.png and b/man/figures/README-ideogram_coverage_1-1.png differ diff --git a/man/figures/README-ideogram_coverage_2-1.png b/man/figures/README-ideogram_coverage_2-1.png index 12cbb1b..bc4157c 100644 Binary files a/man/figures/README-ideogram_coverage_2-1.png and b/man/figures/README-ideogram_coverage_2-1.png differ diff --git a/man/figures/README-peak_coverage-1.png b/man/figures/README-peak_coverage-1.png index d180408..cdec413 100644 Binary files a/man/figures/README-peak_coverage-1.png and b/man/figures/README-peak_coverage-1.png differ diff --git 
a/man/figures/README-transcript_coverage-1.png b/man/figures/README-transcript_coverage-1.png index af74162..7e6704b 100644 Binary files a/man/figures/README-transcript_coverage-1.png and b/man/figures/README-transcript_coverage-1.png differ diff --git a/man/figures/README-transcript_coverage_tight-1.png b/man/figures/README-transcript_coverage_tight-1.png index 81e72bc..55536fa 100644 Binary files a/man/figures/README-transcript_coverage_tight-1.png and b/man/figures/README-transcript_coverage_tight-1.png differ diff --git a/man/geom_arrows.Rd b/man/geom_arrows.Rd new file mode 100644 index 0000000..bb1db5d --- /dev/null +++ b/man/geom_arrows.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{geom_arrows} +\alias{geom_arrows} +\title{Plot genomic features as arrows.} +\usage{ +geom_arrows( + data, + color, + line_width, + arrow_size, + arrow_angle = 35, + intermittent = FALSE +) +} +\arguments{ +\item{data}{data frame describing arrow position, with columns +start, end, group, and a custom 'color' column} + +\item{color}{name of the color column in the data frame} + +\item{line_width}{line_width of the (arrow) segment} + +\item{arrow_size}{size of the arrow} + +\item{arrow_angle}{angle of the arrow. Default: 35°} + +\item{intermittent}{If TRUE, arrows are only drawn intermittently in +half-transparent white color. Default: FALSE.} +} +\value{ +A geom layer for ggplot2 objects. +} +\description{ +This function is a variation of geom_segment to plot (gene) features +as arrows. Mainly meant for internal use, not to be called directly. +} diff --git a/man/geom_base.Rd b/man/geom_base.Rd index 7dfa77a..6e71552 100644 --- a/man/geom_base.Rd +++ b/man/geom_base.Rd @@ -102,27 +102,38 @@ Plot. Add Base and Amino Acid Annotation to Coverage Plot. 
} \examples{ -# library(ggcoverage) -# library("BSgenome.Hsapiens.UCSC.hg19") +library("BSgenome.Hsapiens.UCSC.hg19") + # get sample metadata -# sample.meta <- data.frame( -# SampleName = c("tumorA.chr4.selected"), -# Type = c("tumorA"), Group = c("tumorA") -# ) +sample.meta <- data.frame( + SampleName = c("tumorA.chr4.selected"), + Type = c("tumorA"), + Group = c("tumorA") +) + # get bam file -# bam.file <- system.file("extdata", "DNA-seq", "tumorA.chr4.selected.bam", package = "ggcoverage") +bam.file <- + system.file("extdata", "DNA-seq", "tumorA.chr4.selected.bam", package = "ggcoverage") + # load bam file -# track.df <- LoadTrackFile( -# track.file = bam.file, -# meta.info = sample.meta, single.nuc = TRUE, -# single.nuc.region = "chr4:62474235-62474295" -# ) -# ggcoverage( -# data = track.df, color = "grey", range.position = "out", -# single.nuc = TRUE, rect.color = "white" -# ) + -# geom_base( -# bam.file = bam.file, -# bs.fa.seq = BSgenome.Hsapiens.UCSC.hg19 -# ) +track.df <- LoadTrackFile( + track.file = bam.file, + meta.info = sample.meta, + single.nuc = TRUE, + single.nuc.region = "chr4:62474235-62474295" +) + +# plot +ggcoverage( + data = track.df, + color = "grey", + range.position = "out", + single.nuc = TRUE, + rect.color = "white" +) + + geom_base( + bam.file = bam.file, + bs.fa.seq = BSgenome.Hsapiens.UCSC.hg19 + ) + } diff --git a/man/geom_coverage.Rd b/man/geom_coverage.Rd index ec7759d..fdb00cf 100644 --- a/man/geom_coverage.Rd +++ b/man/geom_coverage.Rd @@ -126,9 +126,10 @@ ggplot() + geom_coverage( data = track.df, facet.key = "Type", mark.region = data.frame( - start = c(21678900,21732001,21737590), - end = c(21679900,21732400,21737650), - label=c("M1", "M2", "M3")), + start = c(21678900, 21732001, 21737590), + end = c(21679900, 21732400, 21737650), + label = c("M1", "M2", "M3") + ), mark.color = grey(0.4) ) diff --git a/man/geom_gene.Rd b/man/geom_gene.Rd index 9b181bc..6664129 100644 --- a/man/geom_gene.Rd +++ b/man/geom_gene.Rd @@ -10,14 
+10,13 @@ geom_gene( overlap.style = "loose", gene.size = 1, utr.size = 2, - exon.size = 4, - arrow.size = 1, - color.by = "strand", - fill.color = c(`-` = "darkblue", `+` = "darkgreen"), - show.utr = TRUE, + exon.size = 3, + arrow.size = 1.5, arrow.gap = NULL, arrow.num = 50, - arrow.length = 0.06, + color.by = "strand", + fill.color = c(`-` = "cornflowerblue", `+` = "darkolivegreen3"), + show.utr = FALSE, label.size = 3, label.vjust = 2, plot.space = 0.1, @@ -36,23 +35,23 @@ and tight (place non-overlap genes in one line). Default: loose.} \item{utr.size}{The line size of UTR. Default: 2.} -\item{exon.size}{The line size of exon. Default: 4.} +\item{exon.size}{The line size of exon. Default: 3.} + +\item{arrow.size}{The line size of arrow. Default: 1.5.} + +\item{arrow.gap}{The gap distance between intermittent arrows. Default: NULL. +Set arrow.num and arrow.gap to NULL to suppress intermittent arrows.} -\item{arrow.size}{The line size of arrow. Default: 1.} +\item{arrow.num}{Total number of intermittent arrows over whole region. Default: 50. +Set arrow.num and arrow.gap to NULL to suppress intermittent arrows.} \item{color.by}{Color the line by. Default: strand.} \item{fill.color}{Color used for \code{color.by}. -Default: darkblue for - (minus strand), darkgreen for + (plus strand).} +Default: blue for - (minus strand), green for + (plus strand).} \item{show.utr}{Logical value, whether to show UTR. Default: TRUE.} -\item{arrow.gap}{The gap distance between arrow. Default: NULL.} - -\item{arrow.num}{Total arrow num of whole region. Default: 50.} - -\item{arrow.length}{The length of arrow. Default: 0.06.} - \item{label.size}{The size of gene label. Default: 3.} \item{label.vjust}{The vjust of gene label. Default: 2.} @@ -68,20 +67,32 @@ Plot. Add Gene Annotation to Coverage Plot. 
} \examples{ -# library(ggcoverage) -# library(utils) -# library(rtracklayer) -# meta.file <- system.file("extdata", "RNA-seq", "meta_info.csv", package = "ggcoverage") -# sample.meta <- utils::read.csv(meta.file) +library(ggcoverage) +library(utils) +library(rtracklayer) + +# load metadata +meta_file <- system.file("extdata", "RNA-seq", "meta_info.csv", package = "ggcoverage") +sample_meta <- read.csv(meta_file) + # track folder -# track.folder <- system.file("extdata", "RNA-seq", package = "ggcoverage") +track_folder <- system.file("extdata", "RNA-seq", package = "ggcoverage") + # load bigwig file -# track.df <- LoadTrackFile( -# track.folder = track.folder, format = "bw", -# meta.info = sample.meta -# ) -# gtf.file <- system.file("extdata", "used_hg19.gtf", package = "ggcoverage") -# gtf.gr <- rtracklayer::import.gff(con = gtf.file, format = "gtf") -# basic.coverage <- ggcoverage(data = track.df, color = "auto", range.position = "out") -# basic.coverage + geom_gene(gtf.gr = gtf.gr) +track_df <- LoadTrackFile( + track.folder = track_folder, + format = "bw", + region = "chr14:21,677,306-21,737,601", + extend = 2000, + meta.info = sample_meta +) + +# load GTF file +gtf_file <- system.file("extdata", "used_hg19.gtf", package = "ggcoverage") +gtf_gr <- rtracklayer::import.gff(con = gtf_file, format = "gtf") + +# plot coverage and gene annotation +basic.coverage <- ggcoverage(data = track_df, range.position = "out") +basic.coverage + + geom_gene(gtf.gr = gtf_gr) } diff --git a/man/geom_ideogram.Rd b/man/geom_ideogram.Rd index 5406e7c..b176919 100644 --- a/man/geom_ideogram.Rd +++ b/man/geom_ideogram.Rd @@ -66,3 +66,43 @@ Plot. \description{ Add Ideogram Annotation to Coverage Plot. 
} +\examples{ +\dontrun{ +library(ggbio) + +# load metadata +meta_file <- + system.file("extdata", "RNA-seq", "meta_info.csv", package = "ggcoverage") +sample_meta <- read.csv(meta_file) + +# track folder +track_folder <- + system.file("extdata", "RNA-seq", package = "ggcoverage") +# load bigwig file +track_df <- LoadTrackFile( + track.folder = track_folder, + format = "bw", + region = "chr14:21,677,306-21,737,601", + extend = 2000, + meta.info = sample_meta +) + +# gene annotation +gtf_file <- + system.file("extdata", "used_hg19.gtf", package = "ggcoverage") +gtf_gr <- rtracklayer::import.gff(con = gtf_file, format = "gtf") + +# coverage plot + ideogram +basic_coverage <- ggcoverage( + data = track_df, + plot.type = "facet", + range.position = "in", + facet.y.scale = "fixed" +) + +basic_coverage + + geom_gene(gtf.gr = gtf_gr) + + geom_ideogram(genome = "hg19", plot.space = 0) +} + +} diff --git a/man/geom_link.Rd b/man/geom_link.Rd index cb612b4..c3017db 100644 --- a/man/geom_link.Rd +++ b/man/geom_link.Rd @@ -2,15 +2,16 @@ % Please edit documentation in R/geom_link.R \name{geom_link} \alias{geom_link} -\title{Add Links to Coverage Plot.} +\title{Add Genome Links to Coverage Plot.} \usage{ geom_link( link.file, file.type = "bedpe", score.col = NULL, score.threshold = NULL, - score.color = c("blue", "grey", "red"), + score.color = c("grey70", "#56B1F7", "#132B43"), scale.range = 10, + plot.curve = "curve", plot.space = 0.1, plot.height = 0.2, show.rect = FALSE @@ -21,14 +22,15 @@ geom_link( \item{file.type}{The type of \code{link.file}, choose from bedpe, pairs. Default: bedpe.} -\item{score.col}{Column index contains score information, used when \code{file.type} is bedpe. Default: NULL.} +\item{score.col}{Column index that contains score information, used when \code{file.type} is bedpe. Default: NULL.} \item{score.threshold}{The score threshold, used when \code{score.col} is not NULL. Default: NULL.} -\item{score.color}{The score color vector. 
The length should be three, the first represents the lowest score, the second represents the -middle score, the third represents the highest score. Default: c("blue", "grey", "red").} +\item{score.color}{The score color vector. Default: c("grey70", "#56B1F7", "#132B43").} -\item{scale.range}{Scale the height of link according to width, should be greater than or equal to 1 (not scale). Default: 10.} +\item{scale.range}{Scale the height of links according to width, should be greater than or equal to 1 (not scale). Default: 10.} + +\item{plot.curve}{One of 'curve' or 'bezier', for the latter it is required to install package \code{ggforce}. Default: 'curve'.} \item{plot.space}{Top and bottom margin. Default: 0.1.} @@ -40,20 +42,26 @@ middle score, the third represents the highest score. Default: c("blue", "grey", Plot. } \description{ -Add Links to Coverage Plot. +Add Genome Links to Coverage Plot. } \examples{ library(ggcoverage) -# create test dataframe -# (random, but use seed to obtain same result every time) +# create random test data +# use seed to obtain same result every time set.seed(123) + df <- data.frame( - seqnames = "chr2L", start = seq(from = 8000000, to = 8300000, by = 1000), - end = seq(from = 8001000, to = 8301000, by = 1000), score = sample(1:100, 301, replace = TRUE), + seqnames = "chr2L", + start = seq(from = 8000000, to = 8300000, by = 1000), + end = seq(from = 8001000, to = 8301000, by = 1000), + score = sample(1:100, 301, replace = TRUE), Type = "Example", Group = "Example" ) # get links -link.file = system.file("extdata", "HiC", "HiC_link.bedpe", package = "ggcoverage") +link.file <- system.file( + "extdata", "HiC", "HiC_link.bedpe", + package = "ggcoverage" +) # create plot ggcoverage( diff --git a/man/geom_protein.Rd b/man/geom_protein.Rd index 3f8e370..8255e2d 100644 --- a/man/geom_protein.Rd +++ b/man/geom_protein.Rd @@ -5,7 +5,7 @@ \title{Layer for Protein Coverage Plot.} \usage{ geom_protein( - coverage.file, + coverage.df, fasta.file, 
protein.id, XCorr.threshold = 2, @@ -25,7 +25,7 @@ geom_protein( ) } \arguments{ -\item{coverage.file}{Exported protein coverage file, should be in excel.} +\item{coverage.df}{Protein coverage, for example output from Proteome Discoverer.} \item{fasta.file}{Input reference protein fasta file.} @@ -68,15 +68,30 @@ A ggplot2 object. Layer for Protein Coverage Plot. } \examples{ +\dontrun{ library(ggplot2) -library(ggcoverage) +library(openxlsx) + +# import coverage dataframe with function from openxlsx coverage.file <- system.file( - "extdata", "Proteomics", "MS_BSA_coverage.xlsx", package = "ggcoverage" + "extdata", "Proteomics", "MS_BSA_coverage.xlsx", + package = "ggcoverage" ) +coverage.df <- read.xlsx(coverage.file) +head(coverage.df) + +# get fasta file fasta.file <- system.file( - "extdata", "Proteomics", "MS_BSA_coverage.fasta", package = "ggcoverage" + "extdata", "Proteomics", "MS_BSA_coverage.fasta", + package = "ggcoverage" ) -protein.id = "sp|P02769|ALBU_BOVIN" + +protein.id <- "sp|P02769|ALBU_BOVIN" ggplot() + - geom_protein(coverage.file = coverage.file, fasta.file = fasta.file, protein.id = protein.id) + geom_protein( + coverage.df = coverage.df, + fasta.file = fasta.file, + protein.id = protein.id + ) +} } diff --git a/man/geom_tad.Rd b/man/geom_tad.Rd index 044ba9e..950a99e 100644 --- a/man/geom_tad.Rd +++ b/man/geom_tad.Rd @@ -22,13 +22,13 @@ geom_tad( \item{granges}{The rownames and colnames information of matrix.} \item{color.palette}{One of the RColorbrewer or viridis colour palettes. -Parameter of \code{\link{Brick_vizart_plot_heatmap}}. Default: NULL.} +Parameter of \code{HiCBricks::Brick_vizart_plot_heatmap}. Default: NULL.} \item{value.cut}{If present, values beyond a certain quantile will be capped to that quantile. -Parameter of \code{\link{Brick_vizart_plot_heatmap}}. Default: NULL.} +Parameter of \code{HiCBricks::Brick_vizart_plot_heatmap}. 
Default: NULL.} \item{transform.fun}{If any sort of transformations should be applied to the data before plotting. -Parameter of \code{\link{Brick_vizart_plot_heatmap}}. Default: NULL.} +Parameter of \code{HiCBricks::Brick_vizart_plot_heatmap}. Default: NULL.} \item{plot.space}{Top and bottom margin. Default: 0.1.} @@ -46,47 +46,51 @@ Add Contact Map to Coverage Plot. } \examples{ library(ggcoverage) -library(GenomicRanges) +library(HiCBricks) # prepare track dataframe -track.file = system.file("extdata", "HiC", "H3K36me3.bw", package = "ggcoverage") -track.df = LoadTrackFile(track.file = track.file, format = "bw", - region = "chr2L:8050000-8300000", extend = 0) -track.df$score = ifelse(track.df$score <0, 0, track.df$score) +track.file <- system.file("extdata", "HiC", "H3K36me3.bw", package = "ggcoverage") +track.df <- LoadTrackFile( + track.file = track.file, format = "bw", + region = "chr2L:8050000-8300000", extend = 0 +) +track.df$score <- ifelse(track.df$score < 0, 0, track.df$score) # check the data head(track.df) # Load Hi-C data -hic.mat.file = system.file("extdata", "HiC", "HiC_mat.txt", package = "ggcoverage") -hic.mat = read.table(file = hic.mat.file, sep = "\t") -hic.mat = as.matrix(hic.mat) +hic.mat.file <- system.file("extdata", "HiC", "HiC_mat.txt", package = "ggcoverage") +hic.mat <- read.table(file = hic.mat.file, sep = "\t") +hic.mat <- as.matrix(hic.mat) # bin data -hic.bin.file = system.file("extdata", "HiC", "HiC_bin.txt", package = "ggcoverage") -hic.bin = read.table(file = hic.bin.file, sep = "\t") -colnames(hic.bin) = c("chr", "start", "end") -hic.bin.gr = GenomicRanges::makeGRangesFromDataFrame(df = hic.bin) +hic.bin.file <- system.file("extdata", "HiC", "HiC_bin.txt", package = "ggcoverage") +hic.bin <- read.table(file = hic.bin.file, sep = "\t") +colnames(hic.bin) <- c("chr", "start", "end") +hic.bin.gr <- GenomicRanges::makeGRangesFromDataFrame(df = hic.bin) # transfrom function -FailSafe_log10 <- function(x){ +failsafe_log10 <- 
function(x) { x[is.na(x) | is.nan(x) | is.infinite(x)] <- 0 - return(log10(x+1)) + return(log10(x + 1)) } # load link data: prepare arcs -link.file = system.file("extdata", "HiC", "HiC_link.bedpe", package = "ggcoverage") +link.file <- system.file("extdata", "HiC", "HiC_link.bedpe", package = "ggcoverage") # basic coverage -basic.coverage = ggcoverage( +basic.coverage <- ggcoverage( data = track.df, color = "grey", mark.region = NULL, range.position = "out" ) # add annotations basic.coverage + - geom_tad(matrix = hic.mat, granges = hic.bin.gr, value.cut = 0.99, - color.palette = "viridis", transform.fun = FailSafe_log10, - top = FALSE, show.rect = TRUE) + + geom_tad( + matrix = hic.mat, granges = hic.bin.gr, value.cut = 0.99, + color.palette = "viridis", transform.fun = failsafe_log10, + top = FALSE, show.rect = TRUE + ) + geom_link(link.file = link.file, file.type = "bedpe", show.rect = TRUE) } diff --git a/man/geom_transcript.Rd b/man/geom_transcript.Rd index daa9b05..a2fb109 100644 --- a/man/geom_transcript.Rd +++ b/man/geom_transcript.Rd @@ -11,13 +11,12 @@ geom_transcript( overlap.style = "loose", tx.size = 1, utr.size = 2, - exon.size = 4, - arrow.size = 1, - color.by = "strand", - fill.color = c(`-` = "darkblue", `+` = "darkgreen"), + exon.size = 3, + arrow.size = 3, arrow.gap = NULL, arrow.num = 50, - arrow.length = 0.06, + color.by = "strand", + fill.color = c(`-` = "cornflowerblue", `+` = "darkolivegreen3"), label.size = 3, label.vjust = 2, plot.space = 0.1, @@ -38,20 +37,20 @@ and tight (place non-overlap transcripts in one line). Default: loose.} \item{utr.size}{The line size of UTR. Default: 2.} -\item{exon.size}{The line size of exon. Default: 4.} +\item{exon.size}{The line size of exon. Default: 3.} -\item{arrow.size}{The line size of arrow. Default: 1.} +\item{arrow.size}{The line size of arrow. Default: 3.} -\item{color.by}{Color the line by. Default: strand.} +\item{arrow.gap}{The gap distance between intermittent arrows. Default: NULL. 
+Set arrow.num and arrow.gap to NULL to suppress intermittent arrows.} -\item{fill.color}{Color used for \code{color.by}. -Default: darkblue for - (minus strand), darkgreen for + (plus strand).} +\item{arrow.num}{Total number of intermittent arrows over whole region. Default: 50. +Set arrow.num and arrow.gap to NULL to suppress intermittent arrows.} -\item{arrow.gap}{The gap distance between arrow. Default: NULL.} - -\item{arrow.num}{Total arrow num of whole region. Default: 50.} +\item{color.by}{Color the line by. Default: strand.} -\item{arrow.length}{The length of arrow. Default: 0.06.} +\item{fill.color}{Color used for \code{color.by}. +Default: blue for - (minus strand), green for + (plus strand).} \item{label.size}{The size of transcript label. Default: 3.} @@ -68,20 +67,32 @@ Plot. Add Transcript Annotation to Coverage Plot. } \examples{ -# library(ggcoverage) -# library(utils) -# library(rtracklayer) -# meta.file <- system.file("extdata", "RNA-seq", "meta_info.csv", package = "ggcoverage") -# sample.meta <- utils::read.csv(meta.file) +library(ggcoverage) +library(utils) +library(rtracklayer) + +# load metadata +meta_file <- system.file("extdata", "RNA-seq", "meta_info.csv", package = "ggcoverage") +sample_meta <- read.csv(meta_file) + # track folder -# track.folder <- system.file("extdata", "RNA-seq", package = "ggcoverage") +track_folder <- system.file("extdata", "RNA-seq", package = "ggcoverage") + # load bigwig file -# track.df <- LoadTrackFile( -# track.folder = track.folder, format = "bw", -# meta.info = sample.meta -# ) -# gtf.file <- system.file("extdata", "used_hg19.gtf", package = "ggcoverage") -# gtf.gr <- rtracklayer::import.gff(con = gtf.file, format = "gtf") -# basic.coverage <- ggcoverage(data = track.df, color = "auto", range.position = "out") -# basic.coverage + geom_transcript(gtf.gr = gtf.gr, label.vjust = 1.5) +track_df <- LoadTrackFile( + track.folder = track_folder, + format = "bw", + region = "chr14:21,677,306-21,737,601", + extend = 
2000, + meta.info = sample_meta +) + +# load GTF file +gtf_file <- system.file("extdata", "used_hg19.gtf", package = "ggcoverage") +gtf_gr <- rtracklayer::import.gff(con = gtf_file, format = "gtf") + +# plot coverage and gene annotation +basic.coverage <- ggcoverage(data = track_df, range.position = "out") +basic.coverage + + geom_transcript(gtf.gr = gtf_gr, label.vjust = 1.5) } diff --git a/man/ggcoverage.Rd b/man/ggcoverage.Rd index 2035c7c..3562b89 100644 --- a/man/ggcoverage.Rd +++ b/man/ggcoverage.Rd @@ -127,9 +127,10 @@ ggcoverage(data = track.df, facet.key = "Type") + ggcoverage( data = track.df, facet.key = "Type", mark.region = data.frame( - start = c(21678900,21732001,21737590), - end = c(21679900,21732400,21737650), - label=c("M1", "M2", "M3")), + start = c(21678900, 21732001, 21737590), + end = c(21679900, 21732400, 21737650), + label = c("M1", "M2", "M3") + ), mark.color = grey(0.4) ) diff --git a/man/ggprotein.Rd b/man/ggprotein.Rd index 62022f8..5e247ef 100644 --- a/man/ggprotein.Rd +++ b/man/ggprotein.Rd @@ -5,7 +5,7 @@ \title{Create Mass Spectrometry Protein Coverage Plot.} \usage{ ggprotein( - coverage.file, + coverage.df, fasta.file, protein.id, XCorr.threshold = 2, @@ -25,7 +25,7 @@ ggprotein( ) } \arguments{ -\item{coverage.file}{Exported protein coverage file, should be in excel.} +\item{coverage.df}{Protein coverage, for example output from Proteome Discoverer.} \item{fasta.file}{Input reference protein fasta file.} @@ -68,18 +68,29 @@ A ggplot2 object. Create Mass Spectrometry Protein Coverage Plot. 
} \examples{ -library(ggcoverage) +\dontrun{ +library(ggplot2) +library(openxlsx) + +# import coverage dataframe with function from openxlsx coverage.file <- system.file( - "extdata", "Proteomics", "MS_BSA_coverage.xlsx", package = "ggcoverage" + "extdata", "Proteomics", "MS_BSA_coverage.xlsx", + package = "ggcoverage" ) +coverage.df <- read.xlsx(coverage.file) +head(coverage.df) + +# get fasta file fasta.file <- system.file( - "extdata", "Proteomics", "MS_BSA_coverage.fasta", package = "ggcoverage" + "extdata", "Proteomics", "MS_BSA_coverage.fasta", + package = "ggcoverage" ) -protein.id = "sp|P02769|ALBU_BOVIN" +protein.id <- "sp|P02769|ALBU_BOVIN" ggprotein( - coverage.file = coverage.file, + coverage.df = coverage.df, fasta.file = fasta.file, protein.id = protein.id ) } +} diff --git a/vignettes/CustomizeThePlot.Rmd b/vignettes/CustomizeThePlot.Rmd index 51e9fde..4266c8e 100644 --- a/vignettes/CustomizeThePlot.Rmd +++ b/vignettes/CustomizeThePlot.Rmd @@ -53,31 +53,39 @@ library(rtracklayer) ### Prepare data ```{r prepare_data, warning=FALSE, fig.height = 6, fig.width = 8, fig.align = "center"} # prepare gtf -gtf.file = system.file("extdata", "used_hg19.gtf", package = "ggcoverage") -gtf.gr = rtracklayer::import.gff(con = gtf.file, format = 'gtf') +gtf.file <- system.file("extdata", "used_hg19.gtf", package = "ggcoverage") +gtf.gr <- rtracklayer::import.gff(con = gtf.file, format = "gtf") # sample metadata -sample.meta = data.frame(SampleName=c('Chr18_MCF7_ER_1','Chr18_MCF7_ER_2','Chr18_MCF7_ER_3','Chr18_MCF7_input'), - Type = c("MCF7_ER_1","MCF7_ER_2","MCF7_ER_3","MCF7_input"), - Group = c("IP", "IP", "IP", "Input")) +sample.meta <- data.frame( + SampleName = c("Chr18_MCF7_ER_1", "Chr18_MCF7_ER_2", "Chr18_MCF7_ER_3", "Chr18_MCF7_input"), + Type = c("MCF7_ER_1", "MCF7_ER_2", "MCF7_ER_3", "MCF7_input"), + Group = c("IP", "IP", "IP", "Input") +) sample.meta # track folder -track.folder = system.file("extdata", "ChIP-seq", package = "ggcoverage") +track.folder <- 
system.file("extdata", "ChIP-seq", package = "ggcoverage") # load bigwig file -track.df = LoadTrackFile(track.folder = track.folder, format = "bw", region = "chr18:76822285-76900000", - meta.info = sample.meta) +track.df <- LoadTrackFile( + track.folder = track.folder, format = "bw", region = "chr18:76822285-76900000", + meta.info = sample.meta +) # check data head(track.df) -track.df = track.df %>% dplyr::filter(Type %in% c("MCF7_ER_1", "MCF7_input")) +track.df <- track.df %>% dplyr::filter(Type %in% c("MCF7_ER_1", "MCF7_input")) # create mark region -mark.region=data.frame(start=c(76822533), - end=c(76823743), - label=c("Promoter")) +mark.region <- data.frame( + start = c(76822533), + end = c(76823743), + label = c("Promoter") +) # check data mark.region # create basic coverage plot -basic.coverage = ggcoverage(data = track.df, range.position = "out", - mark.region=mark.region, show.mark.label = FALSE) +basic.coverage <- ggcoverage( + data = track.df, range.position = "out", + mark.region = mark.region, show.mark.label = FALSE +) basic.coverage ``` @@ -86,18 +94,18 @@ basic.coverage ### Customize ggplot2 object Customize the plot generated by `ggcoverage`: ```{r customize_ggplot2, warning=FALSE, fig.height = 6, fig.width = 8, fig.align = "center"} -basic.coverage + +basic.coverage + # add title labs(title = "chr18:76,822,285-76,900,000") + - theme(plot.title=element_text(hjust=0.5)) + + theme(plot.title = element_text(hjust = 0.5)) + # change color - scale_fill_manual(values = c("MCF7_ER_1"="green", "MCF7_input"="yellow")) + + scale_fill_manual(values = c("MCF7_ER_1" = "green", "MCF7_input" = "yellow")) + # add rect geom_rect( - data = data.frame(start = 76840533, end = 76842533), - aes_string(xmin = "start", xmax = "end", ymin = "0", ymax = "Inf"), - fill = "red", alpha = 0.6 - ) + data = data.frame(start = 76840533, end = 76842533), + aes_string(xmin = "start", xmax = "end", ymin = "0", ymax = "Inf"), + fill = "red", alpha = 0.6 + ) ```
@@ -108,16 +116,18 @@ basic.coverage + The example plot: ```{r customize_patchwork_example, warning=FALSE,fig.height = 7, fig.width = 8, fig.align = "center"} # get consensus peak file -peak.file = system.file("extdata", "ChIP-seq", "consensus.peak", package = "ggcoverage") +peak.file <- system.file("extdata", "ChIP-seq", "consensus.peak", package = "ggcoverage") # example plot -chip.coverage = basic.coverage + labs(title = "chr18:76,822,285-76,900,000") + - theme(plot.title=element_text(hjust=0.5)) + - geom_gene(gtf.gr=gtf.gr, arrow.length = 0.04,arrow.size=0.25, - gene.size = 0.75, - utr.size = 1.5, - exon.size = 2.5,label.size = 2.5, plot.height = 0.3) + +chip.coverage <- basic.coverage + labs(title = "chr18:76,822,285-76,900,000") + + theme(plot.title = element_text(hjust = 0.5)) + + geom_gene( + gtf.gr = gtf.gr, arrow.length = 0.04, arrow.size = 0.25, + gene.size = 0.75, + utr.size = 1.5, + exon.size = 2.5, label.size = 2.5, plot.height = 0.3 + ) + geom_peak(bed.file = peak.file, plot.height = 0.1) + - geom_ideogram(genome = "hg19",plot.space = 0, plot.height = 0.15) + geom_ideogram(genome = "hg19", plot.space = 0, plot.height = 0.15) # get the class of chip.coverage class(chip.coverage) # output the plot @@ -156,13 +166,13 @@ class(chip.coverage[[1]][[2]]) **Add another peak info**: ```{r customize_patchwork_add_peak, warning=FALSE,fig.height = 2, fig.width = 8, fig.align = "center"} -chip.coverage[[1]][[2]] + +chip.coverage[[1]][[2]] + # the size is the height of segment, controlled by peak.size in geom_peak geom_segment( - data = data.frame(start = 76840533, end = 76842533), - aes_string(x = "start", xend = "end", y = "1", yend = "1"), - color = "red", size = 5 - ) + data = data.frame(start = 76840533, end = 76842533), + aes_string(x = "start", xend = "end", y = "1", yend = "1"), + color = "red", size = 5 + ) ```
@@ -172,7 +182,7 @@ chip.coverage[[1]][[2]] + To add additional layer, we need to obtain the raw coverage data (for consistency). `ggcoverage` provides `GetPlotData` to obtain the data used to plot. ```{r get_raw_data} # get coverage data, the layer number is four -coverage.data = GetPlotData(plot = chip.coverage, layer.num = 4) +coverage.data <- GetPlotData(plot = chip.coverage, layer.num = 4) # inspect data head(coverage.data) str(coverage.data) @@ -185,19 +195,19 @@ Here, I will create a new peak layer as an example (this is a sample example tha ```{r add_peak_data, warning=FALSE,fig.height = 2, fig.width = 8, fig.align = "center"} # create pseudo-peak, you can load your peak file instead (be aware of 0-based/1-based) -pseudo_peak = data.frame(chr="chr18", start = 76840533, end = 76842533) +pseudo_peak <- data.frame(chr = "chr18", start = 76840533, end = 76842533) # get region constraint plot.region.start <- coverage.data[1, "start"] plot.region.end <- coverage.data[nrow(coverage.data), "end"] # create plot -peak.plot = ggplot() + +peak.plot <- ggplot() + geom_segment( - data = pseudo_peak, - aes_string(x = "start", xend = "end", y = "1", yend = "1"), - color = "red", size = 5 - ) + - labs(y="Peak") + + data = pseudo_peak, + aes_string(x = "start", xend = "end", y = "1", yend = "1"), + color = "red", size = 5 + ) + + labs(y = "Peak") + theme_peak(margin.len = 0.1, x.range = c(plot.region.start, plot.region.end)) peak.plot ``` @@ -205,16 +215,16 @@ peak.plot Combine the plot: ```{r combine_layers, warning=FALSE,fig.height = 7, fig.width = 8, fig.align = "center"} # add peak layer -add.peak = patchwork::wrap_plots(chip.coverage[[1]] + theme(plot.margin = margin(t = 0.1, b = 0.1)), - peak.plot, - ncol = 1, heights = c(1, 0.1) - ) +add.peak <- patchwork::wrap_plots(chip.coverage[[1]] + theme(plot.margin = margin(t = 0.1, b = 0.1)), + peak.plot, + ncol = 1, heights = c(1, 0.1) +) add.peak # add ideogram layers -final.plot = patchwork::wrap_plots(add.peak + 
theme(plot.margin = margin(t = 0.1, b = 0.1)), - chip.coverage[[2]], - ncol = 1, heights = c(1, 0.1) - ) +final.plot <- patchwork::wrap_plots(add.peak + theme(plot.margin = margin(t = 0.1, b = 0.1)), + chip.coverage[[2]], + ncol = 1, heights = c(1, 0.1) +) final.plot ``` diff --git a/vignettes/TimeAndMemory.Rmd b/vignettes/TimeAndMemory.Rmd index 5e91c86..437d8fb 100644 --- a/vignettes/TimeAndMemory.Rmd +++ b/vignettes/TimeAndMemory.Rmd @@ -31,12 +31,14 @@ ls -lh possorted_genome_bam.bam library(ggcoverage) # prepare metadata -sample.meta = data.frame(SampleName=c('possorted_genome_bam'), - Type = c("possorted_genome_bam"), - Group = c("10x")) +sample.meta <- data.frame( + SampleName = c("possorted_genome_bam"), + Type = c("possorted_genome_bam"), + Group = c("10x") +) sample.meta # prepare track folder -track.folder = '~/projects/ggcoverage' +track.folder <- "~/projects/ggcoverage" # load the track # region length: 3631 system.time(track.df <- LoadTrackFile( @@ -68,8 +70,10 @@ With the track dataframe, we can generate coverage plot. 
```{r big_bam_coverage_time} # create basic coverage plot # the running time is very small -system.time(basic.coverage <- ggcoverage(data = track.df, color = "red", - range.position = "out", show.mark.label = FALSE)) +system.time(basic.coverage <- ggcoverage( + data = track.df, color = "red", + range.position = "out", show.mark.label = FALSE +)) ``` The coverage plot (the bars are relatively dense, and can be viewed more clearly when saved as a PDF): @@ -117,20 +121,22 @@ ls -lh ./test2 ### Sequential normalization ```{r sequential_norm, eval=FALSE} # prepare sample metadata -sample.meta = data.frame(SampleName=c('SRR054616_rep1','SRR054616_rep2','SRR054616_rep3'), - Type = c("SRR054616_rep1","SRR054616_rep2","SRR054616_rep3"), - Group = c("rep1", "rep2", "rep3")) +sample.meta <- data.frame( + SampleName = c("SRR054616_rep1", "SRR054616_rep2", "SRR054616_rep3"), + Type = c("SRR054616_rep1", "SRR054616_rep2", "SRR054616_rep3"), + Group = c("rep1", "rep2", "rep3") +) sample.meta # track folder -track.folder = "./test" +track.folder <- "./test" # run system.time(track.df <- LoadTrackFile( - track.folder = track.folder, format = "bam", norm.method = "RPKM", - region = "14:21,677,306-21,737,601", bamcoverage.path = "~/anaconda3/bin/bamCoverage", - extend = 2000, meta.info = sample.meta - )) + track.folder = track.folder, format = "bam", norm.method = "RPKM", + region = "14:21,677,306-21,737,601", bamcoverage.path = "~/anaconda3/bin/bamCoverage", + extend = 2000, meta.info = sample.meta +)) #> user system elapsed #> 1169.767 35.005 1208.704 @@ -143,13 +149,15 @@ The **elapsed** time (unit: seconds) is the time spent from the start to the end ### Parallel normalization ```{r parallel_norm, eval=FALSE} # prepare sample metadata -sample.meta = data.frame(SampleName=c('SRR054616_rep1','SRR054616_rep2','SRR054616_rep3'), - Type = c("SRR054616_rep1","SRR054616_rep2","SRR054616_rep3"), - Group = c("rep1", "rep2", "rep3")) +sample.meta <- data.frame( + SampleName = 
c("SRR054616_rep1", "SRR054616_rep2", "SRR054616_rep3"), + Type = c("SRR054616_rep1", "SRR054616_rep2", "SRR054616_rep3"), + Group = c("rep1", "rep2", "rep3") +) sample.meta # track folder -track.folder = "./test2" +track.folder <- "./test2" # run with three cores system.time(track.df <- LoadTrackFile( @@ -170,12 +178,16 @@ system.time(track.df <- LoadTrackFile( In general, the loading step is the most time-consuming step. With the track dataframe, we can generate coverage plot. ```{r parallel_coverage_time} # read the track dataframe -track.df = utils::read.table(file = "~/projects/ggcoverage/ggcoverage_parallel_track.txt", - sep = "\t", header = T) +track.df <- utils::read.table( + file = "~/projects/ggcoverage/ggcoverage_parallel_track.txt", + sep = "\t", header = T +) # create basic coverage plot # the running time is very small -system.time(basic.coverage <- ggcoverage(data = track.df, color = "auto", - range.position = "out", show.mark.label = FALSE)) +system.time(basic.coverage <- ggcoverage( + data = track.df, color = "auto", + range.position = "out", show.mark.label = FALSE +)) ``` The coverage plot: diff --git a/vignettes/ggcoverage.Rmd b/vignettes/ggcoverage.Rmd index 6ffa104..c51819f 100644 --- a/vignettes/ggcoverage.Rmd +++ b/vignettes/ggcoverage.Rmd @@ -3,9 +3,11 @@ title: > ggcoverage User Guide author: - name: Yabing Song - affiliation: - - &id1 School of Life Sciences, Tsinghua University + affiliation: School of Life Sciences, Tsinghua University email: songyb18@mails.tsinghua.edu.cn +- name: Michael Jahn + affiliation: Max Planck Unit for the Science of Pathogens, Berlin + email: jahn@mpusp.mpg.de date: "`r BiocStyle::doc_date()`" package: "`r BiocStyle::pkg_ver('ggcoverage')`" abstract: > @@ -25,7 +27,7 @@ vignette: > BiocStyle::markdown() ``` -```{r setup, echo=FALSE, warning=FALSE} +```{r setup, echo=FALSE, warning = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", @@ -61,7 +63,7 @@ The goal of `ggcoverage` is simplify the 
process of visualizing omics coverage. `ggcoverage` is an R package distributed as part of the [CRAN](https://cran.r-project.org/). To install the package, start R and enter one of the following commands: -```{r install, eval=FALSE} +```{r install, eval = FALSE} # install via CRAN (not yet available) install.packages("ggcoverage") @@ -72,12 +74,10 @@ remotes::install_github("showteeth/ggcoverage") In general, it is **recommended** to install from [Github repository](https://github.com/showteeth/ggcoverage) (update more timely). -Once `ggcoverage` is installed, it can be loaded (together with other libraries) like this: +Once `ggcoverage` is installed, it can be loaded like any other package: -```{r library, message=FALSE, warning=FALSE} -library("rtracklayer") +```{r library, message = FALSE, warning = FALSE} library("ggcoverage") -library("ggpattern") ``` ## Manual @@ -155,7 +155,7 @@ The basic coverage plot has **two types**: Create line plot for **every sample** (`facet.key = "Type"`) and color by **every sample** (`group.key = "Type"`): -```{r basic_coverage_joint, warning=FALSE, fig.height = 4, fig.width = 12, fig.align = "center"} +```{r basic_coverage_joint, warning = FALSE, fig.height = 4, fig.width = 12, fig.align = "center"} basic_coverage <- ggcoverage( data = track_df, plot.type = "joint", @@ -170,7 +170,7 @@ basic_coverage Create **group average line plot** (sample is indicated by `facet.key = "Type"`, group is indicated by `group.key = "Group"`): -```{r basic_coverage_joint_avg, warning=FALSE, fig.height = 4, fig.width = 12, fig.align = "center"} +```{r basic_coverage_joint_avg, warning = FALSE, fig.height = 4, fig.width = 12, fig.align = "center"} basic_coverage <- ggcoverage( data = track_df, plot.type = "joint", @@ -186,7 +186,7 @@ basic_coverage #### Facet view -```{r basic_coverage, warning=FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} +```{r basic_coverage, warning = FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} 
basic_coverage <- ggcoverage( data = track_df, plot.type = "facet", @@ -201,7 +201,7 @@ basic_coverage **Change the Y-axis scale label in/out of plot region with `range.position`**: -```{r basic_coverage_2, warning=FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} +```{r basic_coverage_2, warning = FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} basic_coverage <- ggcoverage( data = track_df, plot.type = "facet", @@ -214,7 +214,7 @@ basic_coverage **Shared/Free Y-axis scale with `facet.y.scale`**: -```{r basic_coverage_3, warning=FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} +```{r basic_coverage_3, warning = FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} basic_coverage <- ggcoverage( data = track_df, plot.type = "facet", @@ -228,7 +228,12 @@ basic_coverage ### Add gene annotation -```{r gene_coverage, warning=FALSE, fig.height = 8, fig.width = 12, fig.align = "center"} +- default behavior is to draw genes (transcripts), exons and UTRs with different line width +- can be adjusted using `gene.size`, `exon.size` and `utr.size` parameters +- frequency of intermittent arrows (light color) can be adjusted using the `arrow.num` and `arrow.gap` parameters +- genomic features are colored by `strand` by default, which can be changed using the `color.by` parameter + +```{r gene_coverage, warning = FALSE, fig.height = 8, fig.width = 12, fig.align = "center"} basic_coverage + geom_gene(gtf.gr = gtf_gr) ``` @@ -238,23 +243,31 @@ basic_coverage + **In "loose" stype (default style; each transcript occupies one line)**: -```{r transcript_coverage, warning=FALSE, fig.height = 12, fig.width = 12, fig.align = "center"} +```{r transcript_coverage, warning = FALSE, fig.height = 12, fig.width = 12, fig.align = "center"} basic_coverage + geom_transcript(gtf.gr = gtf_gr, label.vjust = 1.5) ``` **In "tight" style (place non-overlap transcripts in one line)**: -```{r transcript_coverage_tight, warning=FALSE, fig.height = 12, fig.width = 
12, fig.align = "center"} +```{r transcript_coverage_tight, warning = FALSE, fig.height = 12, fig.width = 12, fig.align = "center"} basic_coverage + - geom_transcript(gtf.gr = gtf_gr, - overlap.style = "tight", - label.vjust = 1.5) + geom_transcript( + gtf.gr = gtf_gr, + overlap.style = "tight", + label.vjust = 1.5 + ) ``` ### Add ideogram +The ideogram is an overview plot about the respective position on a chromosome. +The plotting of the ideogram is implemented by the `ggbio` package. +This package needs to be installed separately (it is only 'Suggested' by `ggcoverage`). + ```{r ideogram_coverage_1, eval = FALSE} +library(ggbio) + basic_coverage + geom_gene(gtf.gr = gtf_gr) + geom_ideogram(genome = "hg19", plot.space = 0) @@ -282,7 +295,7 @@ knitr::include_graphics("../man/figures/README-ideogram_coverage_2-1.png") ##### Load the data -The DNA-seq data used here are from [Copy number work flow](http://bioconductor.org/help/course-materials/2014/SeattleOct2014/B02.2.3_CopyNumber.html), we select tumor sample, and get bin counts with `cn.mops::getReadCountsFromBAM` with `WL` 1000. +The DNA-seq data used here are from [Copy number work flow](https://bioconductor.org/help/course-materials/2014/SeattleOct2014/B02.2.3_CopyNumber.html), we select tumor sample, and get bin counts with `cn.mops::getReadCountsFromBAM` with `WL` 1000. 
```{r load_bin_counts} # prepare metafile @@ -294,7 +307,9 @@ cnv_meta_info <- data.frame( # track file track_file <- system.file("extdata", - "DNA-seq", "CNV_example.txt", package = "ggcoverage") + "DNA-seq", "CNV_example.txt", + package = "ggcoverage" +) # load txt file track_df <- LoadTrackFile( @@ -310,7 +325,7 @@ head(track_df) ##### Basic coverage -```{r basic_coverage_dna, warning=FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} +```{r basic_coverage_dna, warning = FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} basic_coverage <- ggcoverage( data = track_df, color = "grey", @@ -353,9 +368,11 @@ track_file <- system.file("extdata", "DNA-seq", "SRR054616.bw", package = "ggcoverage") # load track -track_df <- LoadTrackFile(track.file = track_file, - format = "bw", - region = "4:1-160000000") +track_df <- LoadTrackFile( + track.file = track_file, + format = "bw", + region = "4:1-160000000" +) # add chr prefix track_df$seqnames <- paste0("chr", track_df$seqnames) @@ -383,7 +400,8 @@ basic_coverage # prepare files cnv_file <- system.file("extdata", "DNA-seq", "SRR054616_copynumber.txt", - package = "ggcoverage") + package = "ggcoverage" + ) # read CNV cnv_df <- read.table(file = cnv_file, sep = "\t", header = TRUE) @@ -396,13 +414,15 @@ head(cnv_df) Add **GC**, **ideogram** and **CNV** annotations. 
-```{r cnv_gc_coverage, eval=FALSE} +```{r cnv_gc_coverage, eval = FALSE} # create plot basic_coverage + geom_gc(bs.fa.seq = BSgenome.Hsapiens.UCSC.hg19) + - geom_cnv(cnv.df = cnv_df, - bin.col = 3, - cn.col = 4) + + geom_cnv( + cnv.df = cnv_df, + bin.col = 3, + cn.col = 4 + ) + geom_ideogram( genome = "hg19", plot.space = 0, @@ -429,8 +449,9 @@ sample_meta <- data.frame( # load bam file bam_file <- system.file("extdata", - "DNA-seq", "tumorA.chr4.selected.bam", - package = "ggcoverage") + "DNA-seq", "tumorA.chr4.selected.bam", + package = "ggcoverage" +) track_df <- LoadTrackFile( track.file = bam_file, @@ -448,7 +469,7 @@ For base and amino acid annotation, we have following default color schemes, you Default color scheme for base annotation is `Clustal-style`, more popular color schemes is available [here](https://www.biostars.org/p/171056/). -```{r base_color_scheme, warning=FALSE, fig.height = 2, fig.width = 6, fig.align = "center"} +```{r base_color_scheme, warning = FALSE, fig.height = 2, fig.width = 6, fig.align = "center"} # color scheme nuc_color <- c( "A" = "#ff2b08", "C" = "#009aff", "G" = "#ffb507", "T" = "#00bc0d" @@ -482,7 +503,7 @@ graphics::par(opar) Default color scheme for amino acid annotation is from [Residual colours: a proposal for aminochromography](https://academic.oup.com/peds/article/10/7/743/1593029?login=false): -```{r aa_color_scheme, warning=FALSE, fig.height = 9, fig.width = 10, fig.align = "center"} +```{r aa_color_scheme, warning = FALSE, fig.height = 9, fig.width = 10, fig.align = "center"} aa_color <- c( "D" = "#FF0000", "S" = "#FF2400", "T" = "#E34234", "G" = "#FF8000", "P" = "#F28500", "C" = "#FFFF00", "A" = "#FDFF00", "V" = "#E3FF00", @@ -531,9 +552,11 @@ ggcoverage( single.nuc = TRUE, rect.color = "white" ) + - geom_base(bam.file = bam_file, - bs.fa.seq = BSgenome.Hsapiens.UCSC.hg19, - mark.type = "twill") + + geom_base( + bam.file = bam_file, + bs.fa.seq = BSgenome.Hsapiens.UCSC.hg19, + mark.type = "twill" + ) + 
geom_ideogram(genome = "hg19", plot.space = 0) ``` @@ -553,9 +576,11 @@ ggcoverage( single.nuc = TRUE, rect.color = "white" ) + - geom_base(bam.file = bam_file, - bs.fa.seq = BSgenome.Hsapiens.UCSC.hg19, - mark.type = "star") + + geom_base( + bam.file = bam_file, + bs.fa.seq = BSgenome.Hsapiens.UCSC.hg19, + mark.type = "star" + ) + geom_ideogram(genome = "hg19", plot.space = 0) ``` @@ -575,9 +600,11 @@ ggcoverage( single.nuc = TRUE, rect.color = "white" ) + - geom_base(bam.file = bam_file, - bs.fa.seq = BSgenome.Hsapiens.UCSC.hg19, - mark.type = "highlight") + + geom_base( + bam.file = bam_file, + bs.fa.seq = BSgenome.Hsapiens.UCSC.hg19, + mark.type = "highlight" + ) + geom_ideogram(genome = "hg19", plot.space = 0) ``` @@ -642,10 +669,12 @@ mark_region ### Basic coverage -```{r basic_coverage_chip, warning=FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} -basic_coverage <- ggcoverage(data = track_df, - mark.region = mark_region, - show.mark.label = FALSE) +```{r basic_coverage_chip, warning = FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} +basic_coverage <- ggcoverage( + data = track_df, + mark.region = mark_region, + show.mark.label = FALSE +) basic_coverage ``` @@ -656,9 +685,10 @@ Add **gene**, **ideogram** and **peak** annotations. To create peak annotation, ```{r peak_coverage, eval = FALSE} # get consensus peak file peak_file <- system.file("extdata", - "ChIP-seq", - "consensus.peak", - package = "ggcoverage") + "ChIP-seq", + "consensus.peak", + package = "ggcoverage" +) basic_coverage + geom_gene(gtf.gr = gtf_gr) + @@ -672,9 +702,14 @@ knitr::include_graphics("../man/figures/README-peak_coverage-1.png") ## Hi-C data +The Hi-C method maps chromosome contacts in eukaryotic cells. +For this purpose, DNA and protein complexes are cross-linked and DNA fragments then purified. +As a result, even distant chromatin fragments can be found to interact due to the spatial organization of the DNA and histones in the cell. 
Hi-C data shows these interactions for example as a contact map. + The Hi-C data are from [pyGenomeTracks: reproducible plots for multivariate genomic datasets](https://academic.oup.com/bioinformatics/article/37/3/422/5879987?login=false). -The Hi-C matrix visualization is implemented by [HiCBricks](https://github.com/koustav-pal/HiCBricks). +The Hi-C matrix visualization is implemented by [`HiCBricks`](https://github.com/koustav-pal/HiCBricks). +This package needs to be installed separately (it is only 'Suggested' by `ggcoverage`). ### Load track data @@ -703,7 +738,9 @@ Matrix: ```{r hic_load_hic_matrix} ## matrix hic_mat_file <- system.file("extdata", - "HiC", "HiC_mat.txt", package = "ggcoverage") + "HiC", "HiC_mat.txt", + package = "ggcoverage" +) hic_mat <- read.table(file = hic_mat_file, sep = "\t") hic_mat <- as.matrix(hic_mat) ``` @@ -737,7 +774,7 @@ link_file <- ### Basic coverage -```{r basic_coverage_hic, warning=FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} +```{r basic_coverage_hic, warning = FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} basic_coverage <- ggcoverage( data = track_df, @@ -754,6 +791,8 @@ basic_coverage Add **link**, **contact map**annotations: ```{r hic_coverage, eval = FALSE} +library(HiCBricks) + basic_coverage + geom_tad( matrix = hic_mat, @@ -764,9 +803,11 @@ basic_coverage + top = FALSE, show.rect = TRUE ) + - geom_link(link.file = link_file, - file.type = "bedpe", - show.rect = TRUE) + geom_link( + link.file = link_file, + file.type = "bedpe", + show.rect = TRUE + ) ``` ```{r hic_coverage_plot, echo = FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} @@ -779,14 +820,16 @@ knitr::include_graphics("../man/figures/README-hic_coverage-1.png") ### Load coverage -The exported coverage from [Proteome 
Discoverer](https://www.thermofisher.cn/cn/zh/home/industrial/mass-spectrometry/liquid-chromatography-mass-spectrometry-lc-ms/lc-ms-software/multi-omics-data-analysis/proteome-discoverer-software.html?adobe_mc=MCMID%7C90228073352279367993013412919222863692%7CMCAID%3D3208C32C269355DE-4000028116B65FEB%7CMCORGID%3D5B135A0C5370E6B40A490D44%40AdobeOrg%7CTS=1614293705): +The exported coverage from [Proteome Discoverer](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8006021/): -```{r ms_coverage_data} +```{r ms_coverage_data, eval = FALSE} library(openxlsx) # prepare coverage dataframe coverage_file <- system.file("extdata", - "Proteomics", "MS_BSA_coverage.xlsx", package = "ggcoverage") + "Proteomics", "MS_BSA_coverage.xlsx", + package = "ggcoverage" + ) coverage_df <- openxlsx::read.xlsx(coverage_file, sheet = "Sheet1") # check the data head(coverage_df) @@ -794,10 +837,12 @@ head(coverage_df) The input protein fasta: -```{r ms_coverage_fasta} +```{r ms_coverage_fasta, eval = FALSE} fasta_file <- system.file("extdata", - "Proteomics", "MS_BSA_coverage.fasta", package = "ggcoverage") + "Proteomics", "MS_BSA_coverage.fasta", + package = "ggcoverage" + ) # prepare track dataframe protein_set <- Biostrings::readAAStringSet(fasta_file) @@ -838,8 +883,10 @@ protein_feature_df <- data.frame( # add annotation protein_coverage + - geom_feature(feature.df = protein_feature_df, - feature.color = c("#4d81be", "#173b5e", "#6a521d")) + geom_feature( + feature.df = protein_feature_df, + feature.color = c("#4d81be", "#173b5e", "#6a521d") + ) ``` ```{r basic_coverage_protein_feature_plot, echo = FALSE, fig.height = 6, fig.width = 10, fig.align = "center"}