diff --git a/.Rbuildignore b/.Rbuildignore new file mode 100644 index 0000000..d19bff8 --- /dev/null +++ b/.Rbuildignore @@ -0,0 +1,13 @@ +^.*\.Rproj$ +^\.Rproj\.user$ +^\.github$ +^LICENSE\.md$ +^CODE_OF_CONDUCT\.md$ +^CONTRIBUTING\.md$ +^README\.Rmd$ +^cran-comments\.md$ +^CRAN-SUBMISSION$ +^_pkgdown\.yml$ +^docs$ +^vignettes/CustomizeThePlot\.Rmd$ +^vignettes/TimeAndMemory\.Rmd$ diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml new file mode 100644 index 0000000..ae0c779 --- /dev/null +++ b/.github/workflows/R-CMD-check.yaml @@ -0,0 +1,48 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help +on: + push: + branches: [main, dev] + pull_request: + branches: [main, dev] + workflow_dispatch: + +name: R-CMD-check + +jobs: + R-CMD-check: + runs-on: ${{ matrix.config.os }} + + name: ${{ matrix.config.os }} (${{ matrix.config.r }}) + + strategy: + fail-fast: false + matrix: + config: + - {os: macos-latest, r: 'release'} + - {os: windows-latest, r: 'release'} + - {os: ubuntu-latest, r: 'release'} + + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + R_KEEP_PKG_SOURCE: yes + + steps: + - uses: actions/checkout@v3 + + - uses: r-lib/actions/setup-pandoc@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + r-version: ${{ matrix.config.r }} + http-user-agent: ${{ matrix.config.http-user-agent }} + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::rcmdcheck + needs: check + + - uses: r-lib/actions/check-r-package@v2 + with: + upload-snapshots: true diff --git a/.gitignore b/.gitignore index 5cb39df..16e98ef 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,4 @@ inst/doc .Rhistory -.Rbuildignore .Rproj* .Rproj.user diff --git a/DESCRIPTION b/DESCRIPTION index 42d38d4..0b29213 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,12 +1,18 @@ Package: ggcoverage Type: Package 
Title: Visualize Genome/Protein Coverage with Various Annotations -Version: 1.2.0 -Authors@R: +Version: 1.3.0 +Authors@R: c( person(given = "Yabing", family = "Song", role = c("aut", "cre"), - email = "songyb0519@gmail.com") + email = "songyb0519@gmail.com"), + person(given = "Michael", + family = "Jahn", + email = "jahn@mpusp.mpg.de", + role = "aut", + comment = c(ORCID = "0000-0002-3913-153X")) + ) Maintainer: Yabing Song Description: The goal of 'ggcoverage' is to simplify the process of visualizing genome/protein coverage. It contains functions to load data from BAM, BigWig, BedGraph or txt/xlsx files, create genome/protein coverage plot, add various annotations to @@ -14,7 +20,7 @@ Description: The goal of 'ggcoverage' is to simplify the process of visualizing peak annotation, contact map annotation, link annotation and peotein feature annotation. License: MIT + file LICENSE Encoding: UTF-8 -RoxygenNote: 7.3.0 +RoxygenNote: 7.3.1 URL: https://showteeth.github.io/ggcoverage/, https://github.com/showteeth/ggcoverage BugReports: https://github.com/showteeth/ggcoverage/issues biocViews: @@ -50,7 +56,7 @@ Imports: BiocParallel, openxlsx, stringr, - ggpp + gridExtra Suggests: rmarkdown, knitr, diff --git a/LICENSE b/LICENSE index ea8d657..afa75e4 100644 --- a/LICENSE +++ b/LICENSE @@ -1,2 +1,2 @@ -YEAR: 2022 -COPYRIGHT HOLDER: ggtrack authors +YEAR: 2022-2024 +COPYRIGHT HOLDER: ggcoverage authors diff --git a/LICENSE.md b/LICENSE.md index 71fd20e..9c73e33 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,6 +1,6 @@ # MIT License -Copyright (c) 2022 ggcoverage authors +Copyright (c) 2022-2024 ggcoverage authors Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/NAMESPACE b/NAMESPACE index 43633a7..9a7fb32 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -73,11 +73,14 @@ importFrom(RColorBrewer,brewer.pal) importFrom(Rsamtools,ScanBamParam) 
importFrom(Rsamtools,indexBam) importFrom(S4Vectors,"values<-") +importFrom(dplyr,all_of) importFrom(dplyr,arrange) importFrom(dplyr,filter) importFrom(dplyr,group_by) +importFrom(dplyr,mutate) importFrom(dplyr,select) importFrom(dplyr,summarise) +importFrom(dplyr,summarize) importFrom(ggbio,layout_karyogram) importFrom(ggforce,geom_bezier) importFrom(ggh4x,elem_list_rect) @@ -90,6 +93,7 @@ importFrom(ggplot2,aes_string) importFrom(ggplot2,annotate) importFrom(ggplot2,arrow) importFrom(ggplot2,coord_cartesian) +importFrom(ggplot2,cut_width) importFrom(ggplot2,element_blank) importFrom(ggplot2,element_rect) importFrom(ggplot2,element_text) @@ -118,10 +122,11 @@ importFrom(ggplot2,scale_y_continuous) importFrom(ggplot2,theme) importFrom(ggplot2,theme_classic) importFrom(ggplot2,unit) -importFrom(ggpp,annotate) importFrom(ggrepel,geom_text_repel) importFrom(grDevices,col2rgb) importFrom(grDevices,colorRampPalette) +importFrom(gridExtra,tableGrob) +importFrom(gridExtra,ttheme_default) importFrom(magrittr,"%>%") importFrom(methods,extends) importFrom(openxlsx,read.xlsx) diff --git a/NEWS.md b/NEWS.md index cee56c5..c80ec28 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,15 @@ +# ggcoverage 1.3.0 +## Major changes +* Refactored `LoadTrack` function. +* Added GH actions workflow to run automatic R CMD CHECK + +## Minor changes +* Handle binning for bam files regardless if done via `bamCoverage` or `GenomicAlignments`. +* Various small linting fixes. +* Refactored `ggcoverage.Rmd` and `README.Rmd` in order to remove all lint errors, adhere to R code style + +------------- + # ggcoverage 1.2.0 ## Major changes * Support protein coverage and annotation plot (`ggprotein`, `geom_protein`, `geom_feature`). diff --git a/R/LoadTrack.R b/R/LoadTrack.R index 69b4031..1e63f33 100644 --- a/R/LoadTrack.R +++ b/R/LoadTrack.R @@ -3,25 +3,26 @@ #' @param track.file Track file, when \code{track.folder} is not NULL, determined by \code{track.folder}. #' @param track.folder Track file folder. 
Default: NULL. #' @param format Track file format, chosen from bam, wig, bw(bigwig), bedgraph(bedGraph) and txt. -#' @param region Region used to create coverage plot, eg: chr14:21,677,306-21,737,601 or chr14:21,677,306. -#' Default: "chr14:21,677,306-21,737,601" +#' @param region Region to extract coverage for, eg: chr14:21,677,306-21,737,601 or chr14:21,677,306. +#' Default: NULL, coverage is extracted from the first annotated chromosome/sequence. #' @param extend Extend length of \code{region}. Default: 2000. #' @param gtf.gr Granges object of GTF, created with \code{\link{import.gff}}. Default: NULL. #' @param gene.name The name of gene. Default: HNRNPC. #' @param gene.name.type Gene name type (filed of \code{gtf.gr}), chosen from gene_name and gene_id. -#' Default: gene_name. +#' Default: gene_name. #' @param meta.info Track file metadata. The columns should be: SampleName (\code{track.file} without suffix), -#' Type (sample with replicates information), Group (sample group). when \code{meta.file} is not NULL, -#' determined by \code{meta.file}.Default: NULL. +#' Type (sample with replicates information), Group (sample group). when \code{meta.file} is not NULL, +#' determined by \code{meta.file}.Default: NULL. #' @param meta.file File contains track file metadata. Default: "". #' @param bamcoverage.path The path to \code{bamCoverage}, used when \code{format} is bam. Default: NULL (auto-detect). #' @param norm.method Methods to normalize the number of reads per bin, chosen from "RPKM", "CPM", "BPM", "RPGC", "None". -#' Default: RPKM. +#' Default: RPKM. #' @param single.nuc Logical value, whether to visualize at single nucleotide level. Default: FALSE. #' @param single.nuc.region Region for \code{single.nuc}. Default: NULL -#' @param bin.size Size of the bins, in bases. Default: 50. +#' @param bin.size Size of the bins, in bases. Default: 10. Only used for BAM files, ignored for Wig, Bigwig, etc. +#' Set to NULL to turn binning off. 
#' @param bc.extra.para Extra parameters for \code{bamCoverage}, eg: "--effectiveGenomeSize 2700000000 --ignoreForNormalization chrX" -#' @param n.cores The number of cores to be used for this job. Default:1. +#' @param n.cores The number of cores to be used for this job. Default: 1. #' #' @return A dataframe. #' @importFrom rtracklayer import @@ -31,28 +32,41 @@ #' @importFrom GenomicRanges GRanges #' @importFrom IRanges IRanges subsetByOverlaps #' @importFrom magrittr %>% -#' @importFrom dplyr select filter +#' @importFrom dplyr select filter mutate all_of group_by summarize #' @importFrom BiocParallel register MulticoreParam bplapply +#' @importFrom ggplot2 cut_width #' @export #' #' @examples #' library(ggcoverage) #' library(utils) +#' #' meta.file <- system.file("extdata", "RNA-seq", "meta_info.csv", package = "ggcoverage") #' sample.meta <- utils::read.csv(meta.file) +#' #' # track folder #' track.folder <- system.file("extdata", "RNA-seq", package = "ggcoverage") +#' #' # load bigwig file #' track.df <- LoadTrackFile( -#' track.folder = track.folder, format = "bw", region = "chr14:21,677,306-21,737,601", -#' extend = 2000, meta.info = sample.meta +#' track.folder = track.folder, +#' format = "bw", +#' region = "chr14:21,677,306-21,737,601", +#' extend = 2000, +#' meta.info = sample.meta #' ) -LoadTrackFile <- function(track.file, track.folder = NULL, format = c("bam", "wig", "bw", "bedgraph", "txt"), - region = "chr14:21,677,306-21,737,601", extend = 2000, - gtf.gr = NULL, gene.name = "HNRNPC", gene.name.type = c("gene_name", "gene_id"), - meta.info = NULL, meta.file = "", - bamcoverage.path = NULL, norm.method = c("RPKM", "CPM", "BPM", "RPGC", "None"), - single.nuc = FALSE, single.nuc.region = NULL, bin.size = 10, bc.extra.para = NULL, n.cores = 1) { +LoadTrackFile <- function( + track.file, track.folder = NULL, + format = c("bam", "wig", "bw", "bedgraph", "txt"), + region = NULL, extend = 2000, + gtf.gr = NULL, gene.name = "HNRNPC", + gene.name.type = 
c("gene_name", "gene_id"), + meta.info = NULL, meta.file = "", + bamcoverage.path = NULL, + norm.method = c("RPKM", "CPM", "BPM", "RPGC", "None"), + single.nuc = FALSE, single.nuc.region = NULL, + bin.size = 10, bc.extra.para = NULL, n.cores = 1 +) { # check parameters format <- match.arg(arg = format) gene.name.type <- match.arg(arg = gene.name.type) @@ -63,216 +77,115 @@ LoadTrackFile <- function(track.file, track.folder = NULL, format = c("bam", "wi track.file <- list.files(path = track.folder, full.names = TRUE, pattern = paste0(format, "$")) } + # get genomic region if supplied, else it is guessed from input + if (is.null(region)) { + message("No 'region' specified; extracting coverage for an example range\n(<=100,000 bases, first annotated sequence)") + if (format == "bam") { + seqnames <- Rsamtools::scanBamHeader(track.file[1]) %>% + lapply(function(x) x$targets) %>% + unname %>% + unlist + gr <- GenomicRanges::GRanges( + seqnames = names(seqnames[1]), + IRanges(start = 1, end = min(100000, seqnames[1])) + ) + } else if (format %in% c("wig", "bw", "bedgraph")) { + gr <- range(rtracklayer::import(track.file[1])) + seqnames <- as.character(seqnames(gr)) + if (GenomicRanges::width(gr) <= 100000) { + gr <- GenomicRanges::resize(gr, width = 100000) + } + } + message(paste0("Coverage extracted from sequence/chromosome: ", names(seqnames[1]))) + } else { + gr <- PrepareRegion( + region = region, + gtf.gr = gtf.gr, + gene.name = gene.name, + gene.name.type = gene.name.type, + extend = extend + ) + } + # get track dataframe if (format %in% c("wig", "bw", "bedgraph")) { if (single.nuc) { - stop("To visualize single nucleotide, please use bam file!") + stop("To visualize single nucleotide resolution, please use bam file!") } else { - # get used gr - gr <- PrepareRegion(region = region, gtf.gr = gtf.gr, gene.name = gene.name, gene.name.type = gene.name.type, extend = extend) if (is.null(n.cores) || n.cores == 1) { - # read track file - track.list <- lapply(track.file, 
function(x) { - # get basename - track.file.base <- basename(x) - # import wig, bigwig and bedgraph file - single.track.df <- as.data.frame(rtracklayer::import(x, which = gr)) - single.track.df$TrackFile <- track.file.base - return(single.track.df) - }) + track.list <- lapply(track.file, import_bw, gr) } else { - # register BiocParallel::register(BiocParallel::MulticoreParam(workers = n.cores), default = TRUE) - # read track file - track.list <- BiocParallel::bplapply(track.file, BPPARAM = BiocParallel::MulticoreParam(), FUN = function(x) { - # get basename - track.file.base <- basename(x) - # import wig, bigwig and bedgraph file - single.track.df <- as.data.frame(rtracklayer::import(x, which = gr)) - single.track.df$TrackFile <- track.file.base - return(single.track.df) - }) + track.list <- BiocParallel::bplapply(track.file, BPPARAM = BiocParallel::MulticoreParam(), FUN = import_bw, gr) } } } else if (format == "bam") { # create index if (is.null(n.cores) || n.cores == 1) { - for (bam in track.file) { - bam.index.file <- paste(bam, "bai", sep = ".") - if (!file.exists(bam.index.file)) { - message("Create index file for: ", basename(bam)) - Rsamtools::indexBam(bam) - } - } + lapply(track.file, index_bam) } else { - # register BiocParallel::register(BiocParallel::MulticoreParam(workers = n.cores), default = TRUE) - index.flag <- BiocParallel::bplapply(track.file, BPPARAM = BiocParallel::MulticoreParam(), FUN = function(x) { - bam.index.file <- paste(x, "bai", sep = ".") - if (!file.exists(bam.index.file)) { - message("Create index file for: ", basename(x)) - Rsamtools::indexBam(x) - } - }) + BiocParallel::bplapply(track.file, BPPARAM = BiocParallel::MulticoreParam(), FUN = index_bam) } if (single.nuc) { - if (!is.null(single.nuc.region)) { - single.nuc.region <- gsub(pattern = ",", replacement = "", x = single.nuc.region) - single.nuc.region.chr <- unlist(strsplit(x = single.nuc.region, split = ":"))[1] - single.nuc.region.se <- unlist(strsplit(x = 
single.nuc.region, split = ":"))[2] - single.nuc.region.start <- unlist(strsplit(x = single.nuc.region.se, split = "-"))[1] - single.nuc.region.end <- unlist(strsplit(x = single.nuc.region.se, split = "-"))[2] - # load if (is.null(n.cores) || n.cores == 1) { - track.list <- lapply(track.file, function(x) { - single.track.df <- GenomicAlignments::alphabetFrequencyFromBam(x, param = single.nuc.region, baseOnly = TRUE) %>% as.data.frame() - single.track.df <- single.track.df[, c("A", "G", "C", "T")] - single.track.df$score <- rowSums(single.track.df) - single.track.df$seqnames <- single.nuc.region.chr - single.track.df$start <- single.nuc.region.start:single.nuc.region.end - single.track.df$end <- single.track.df$start + 1 - single.track.df$width <- 1 - single.track.df$strand <- "*" - single.track.df <- single.track.df %>% dplyr::select(-c("A", "G", "C", "T")) - # get basename - track.file.base <- basename(x) - single.track.df$TrackFile <- track.file.base - single.track.df <- single.track.df[c( - "seqnames", "start", "end", "width", - "strand", "score", "TrackFile" - )] - return(single.track.df) - }) + track.list <- lapply( + track.file, + single_nuc_cov, + single.nuc.region + ) } else { - # register - BiocParallel::register(BiocParallel::MulticoreParam(workers = n.cores), default = TRUE) - track.list <- BiocParallel::bplapply(track.file, BPPARAM = BiocParallel::MulticoreParam(), FUN = function(x) { - single.track.df <- GenomicAlignments::alphabetFrequencyFromBam(x, param = single.nuc.region, baseOnly = TRUE) %>% as.data.frame() - single.track.df <- single.track.df[, c("A", "G", "C", "T")] - single.track.df$score <- rowSums(single.track.df) - single.track.df$seqnames <- single.nuc.region.chr - single.track.df$start <- single.nuc.region.start:single.nuc.region.end - single.track.df$end <- single.track.df$start + 1 - single.track.df$width <- 1 - single.track.df$strand <- "*" - single.track.df <- single.track.df %>% dplyr::select(-c("A", "G", "C", "T")) - # get basename 
- track.file.base <- basename(x) - single.track.df$TrackFile <- track.file.base - single.track.df <- single.track.df[c( - "seqnames", "start", "end", "width", - "strand", "score", "TrackFile" - )] - return(single.track.df) - }) + track.list <- BiocParallel::bplapply( + track.file, + BPPARAM = BiocParallel::MulticoreParam(), + FUN = single_nuc_cov, + single.nuc.region + ) } - } else { - stop("Please provide region for visualizing single nucleotide!") - } } else { - # get used gr - gr <- PrepareRegion(region = region, gtf.gr = gtf.gr, gene.name = gene.name, gene.name.type = gene.name.type, extend = extend) if (norm.method == "None") { - message("Calculate coverage with GenomicAlignments when norm.method is None!") + message("Calculating coverage with GenomicAlignments when 'norm.method = None'") if (is.null(n.cores) || n.cores == 1) { - track.list <- lapply(track.file, function(x) { - # get basename - track.file.base <- basename(x) - # load track - param <- Rsamtools::ScanBamParam(which = gr) - ga <- GenomicAlignments::readGAlignments(x, param = param) - ga.cov <- GenomicAlignments::coverage(ga) - ga.cov.gr <- GenomicRanges::GRanges(ga.cov) - ga.cov.df <- IRanges::subsetByOverlaps(ga.cov.gr, gr) %>% as.data.frame() - # valid the region - gr.df <- as.data.frame(gr) - ga.cov.df[1, "start"] <- gr.df[1, "start"] - ga.cov.df[nrow(ga.cov.df), "end"] <- gr.df[1, "end"] - # add track file - ga.cov.df$TrackFile <- track.file.base - return(ga.cov.df) - }) + track.list <- lapply( + track.file, import_bam_ga, gr, bin.size + ) } else { - # register - BiocParallel::register(BiocParallel::MulticoreParam(workers = n.cores), default = TRUE) - track.list <- BiocParallel::bplapply(track.file, BPPARAM = BiocParallel::MulticoreParam(), FUN = function(x) { - # get basename - track.file.base <- basename(x) - # load track - param <- Rsamtools::ScanBamParam(which = gr) - ga <- GenomicAlignments::readGAlignments(x, param = param) - ga.cov <- GenomicAlignments::coverage(ga) - ga.cov.gr <- 
GenomicRanges::GRanges(ga.cov) - ga.cov.df <- IRanges::subsetByOverlaps(ga.cov.gr, gr) %>% as.data.frame() - # valid the region - gr.df <- as.data.frame(gr) - ga.cov.df[1, "start"] <- gr.df[1, "start"] - ga.cov.df[nrow(ga.cov.df), "end"] <- gr.df[1, "end"] - # add track file - ga.cov.df$TrackFile <- track.file.base - return(ga.cov.df) - }) + track.list <- BiocParallel::bplapply( + track.file, + BPPARAM = BiocParallel::MulticoreParam(), + FUN = import_bam_ga, gr, bin.size + ) } } else { - message("Calculate coverage with bamCoverage when norm.method is not None!") + message("Calculate coverage with bamCoverage when 'norm.method != None'") # require deeptools if (is.null(bamcoverage.path)) { bamcoverage.path <- Sys.which("bamCoverage") if (bamcoverage.path == "") { - stop("Can not find bamCoverage automatically, please specify the path!") + stop("Can not find bamCoverage automatically, please specify 'bamcoverage.path'") } } else { bamcoverage.path <- bamcoverage.path } - # read track file if (is.null(n.cores) || n.cores == 1) { - track.list <- lapply(track.file, function(x) { - # get basename - track.file.base <- basename(x) - # bigwig file - out.bw.file <- tempfile(fileext = c(".bw")) - # prepare bamCoverage cmd - bamcoverage.cmd <- paste( - bamcoverage.path, "-b", x, "-o", out.bw.file, - "--binSize", bin.size, "--normalizeUsing", norm.method, bc.extra.para - ) - # run command - message(paste("Calling bamCoverage: ", bamcoverage.cmd)) - bamcoverage.status <- system(bamcoverage.cmd, intern = TRUE) - bamcoverage.status.code <- attr(bamcoverage.status, "status") - if (!is.null(bamcoverage.status.code)) { - stop("Run bamCoverage error!") - } - # import wig, bigwig and bedgraph file - single.track.df <- as.data.frame(rtracklayer::import(out.bw.file, which = gr)) - single.track.df$TrackFile <- track.file.base - return(single.track.df) - }) + track.list <- lapply( + track.file, + bam_coverage, + bamcoverage.path, + bin.size, + norm.method, + bc.extra.para, gr + ) } else { 
- # register - BiocParallel::register(BiocParallel::MulticoreParam(workers = n.cores), default = TRUE) - track.list <- BiocParallel::bplapply(track.file, BPPARAM = BiocParallel::MulticoreParam(), FUN = function(x) { - # get basename - track.file.base <- basename(x) - # bigwig file - out.bw.file <- tempfile(fileext = c(".bw")) - # prepare bamCoverage cmd - bamcoverage.cmd <- paste( - bamcoverage.path, "-b", x, "-o", out.bw.file, - "--binSize", bin.size, "--normalizeUsing", norm.method, bc.extra.para - ) - # run command - message(paste("Calling bamCoverage: ", bamcoverage.cmd)) - bamcoverage.status <- system(bamcoverage.cmd, intern = TRUE) - bamcoverage.status.code <- attr(bamcoverage.status, "status") - if (!is.null(bamcoverage.status.code)) { - stop("Run bamCoverage error!") - } - # import wig, bigwig and bedgraph file - single.track.df <- as.data.frame(rtracklayer::import(out.bw.file, which = gr)) - single.track.df$TrackFile <- track.file.base - return(single.track.df) - }) + track.list <- BiocParallel::bplapply( + track.file, + BPPARAM = BiocParallel::MulticoreParam(), + FUN = bam_coverage, + bamcoverage.path, + bin.size, + norm.method, + bc.extra.para, gr + ) } } } @@ -282,34 +195,15 @@ LoadTrackFile <- function(track.file, track.folder = NULL, format = c("bam", "wi } else { # read track file if (is.null(n.cores) || n.cores == 1) { - track.list <- lapply(track.file, function(x) { - # get basename - track.file.base <- basename(x) - # import wig, bigwig and bedgraph file - single.track.df <- utils::read.table(track.file, header = TRUE) - single.track.df$TrackFile <- track.file.base - return(single.track.df) - }) + track.list <- lapply(track.file, import_txt) } else { - # register BiocParallel::register(BiocParallel::MulticoreParam(workers = n.cores), default = TRUE) - track.list <- BiocParallel::bplapply(track.file, BPPARAM = BiocParallel::MulticoreParam(), FUN = function(x) { - # get basename - track.file.base <- basename(x) - # import wig, bigwig and bedgraph 
file - single.track.df <- utils::read.table(track.file, header = TRUE) - single.track.df$TrackFile <- track.file.base - return(single.track.df) - }) + track.list <- BiocParallel::bplapply(track.file, BPPARAM = BiocParallel::MulticoreParam(), FUN = import_txt) } } } # get track dataframe track.df <- do.call(rbind, track.list) - # remove width and strand - if (all(c("width", "strand") %in% colnames(track.df))) { - track.df <- track.df %>% dplyr::select(-c(width, strand)) - } # get metadata if (file.exists(meta.file)) { @@ -317,7 +211,7 @@ LoadTrackFile <- function(track.file, track.folder = NULL, format = c("bam", "wi } else if (!is.null(meta.info)) { meta.info.used <- meta.info } else { - message("Sample without metadata!") + message("No metadata provided, returning coverage as is.") meta.info.used <- NULL } @@ -340,6 +234,117 @@ LoadTrackFile <- function(track.file, track.folder = NULL, format = c("bam", "wi gene.name = gene.name, gene.name.type = gene.name.type ) } - + # return final df return(track.df) } + +import_bw <- function(x, gr) { + single.track.df <- as.data.frame(rtracklayer::import(x, which = gr)) + single.track.df$TrackFile <- basename(x) + return(single.track.df) +} + +import_txt <- function(x) { + single.track.df <- utils::read.table(x, header = TRUE) + single.track.df$TrackFile <- basename(x) + return(single.track.df) +} + +import_bam_ga <- function(x, gr, bin.size) { + # get basename + track.file.base <- basename(x) + # load track + param <- Rsamtools::ScanBamParam(which = gr) + ga <- GenomicAlignments::readGAlignments(x, param = param) + ga.cov <- GenomicAlignments::coverage(ga) + ga.cov.gr <- GenomicRanges::GRanges(ga.cov) + ga.cov.df <- IRanges::subsetByOverlaps(ga.cov.gr, gr) %>% + as.data.frame() + # valid the region + gr.df <- as.data.frame(gr) + ga.cov.df[1, "start"] <- gr.df[1, "start"] + ga.cov.df[nrow(ga.cov.df), "end"] <- gr.df[1, "end"] + # optional binning + ga.cov.df <- bin_coverage(ga.cov.df, bin.size) + # add track file + 
ga.cov.df$TrackFile <- track.file.base + return(ga.cov.df) +} + +index_bam <- function(x) { + bam.index.file <- paste(x, "bai", sep = ".") + if (!file.exists(bam.index.file)) { + message("Create index file for: ", basename(x)) + Rsamtools::indexBam(x) + } +} + +single_nuc_cov <- function(x, single.nuc.region) { + if (!is.null(single.nuc.region)) { + single.nuc.region <- gsub(pattern = ",", replacement = "", x = single.nuc.region) + single.nuc.region.chr <- unlist(strsplit(x = single.nuc.region, split = ":"))[1] + single.nuc.region.se <- unlist(strsplit(x = single.nuc.region, split = ":"))[2] + single.nuc.region.start <- unlist(strsplit(x = single.nuc.region.se, split = "-"))[1] + single.nuc.region.end <- unlist(strsplit(x = single.nuc.region.se, split = "-"))[2] + } else { + stop("Please provide region for visualizing single nucleotide!") + } + single.track.df <- GenomicAlignments::alphabetFrequencyFromBam(x, param = single.nuc.region, baseOnly = TRUE) %>% + as.data.frame() + single.track.df <- single.track.df[, c("A", "G", "C", "T")] + single.track.df$score <- rowSums(single.track.df) + single.track.df$seqnames <- single.nuc.region.chr + single.track.df$start <- single.nuc.region.start:single.nuc.region.end + single.track.df$end <- single.track.df$start + 1 + single.track.df$width <- 1 + single.track.df$strand <- "*" + single.track.df <- single.track.df %>% dplyr::select(-c("A", "G", "C", "T")) + # get basename + track.file.base <- basename(x) + single.track.df$TrackFile <- track.file.base + single.track.df <- single.track.df[c( + "seqnames", "start", "end", "width", + "strand", "score", "TrackFile" + )] + return(single.track.df) +} + +bam_coverage <- function( + x, bamcoverage.path, bin.size, norm.method, bc.extra.para, gr +) { + # bigwig file + out.bw.file <- tempfile(fileext = c(".bw")) + # prepare bamCoverage cmd + bamcoverage.cmd <- paste( + bamcoverage.path, "-b", x, "-o", out.bw.file, + "--binSize", bin.size, "--normalizeUsing", norm.method, bc.extra.para + 
) + # run command + message(paste("Calling bamCoverage: ", bamcoverage.cmd)) + bamcoverage.status <- system(bamcoverage.cmd, intern = TRUE) + bamcoverage.status.code <- attr(bamcoverage.status, "status") + if (!is.null(bamcoverage.status.code)) { + stop("Run bamCoverage error.") + } + # import wig, bigwig and bedgraph file + single.track.df <- as.data.frame(rtracklayer::import(out.bw.file, which = gr)) + single.track.df$TrackFile <- basename(x) + return(single.track.df) +} + +bin_coverage <- function(df, bin.size = 10) { + if (!is.null(bin.size) && is.numeric(bin.size)) { + binned_df <- df %>% + dplyr::mutate( + bin = ggplot2::cut_width(start, width = bin.size, center = bin.size / 2, labels = FALSE) * bin.size + ) %>% + dplyr::group_by(.data$seqnames, .data$bin, .data$strand) %>% + dplyr::summarize(score = mean(.data$score, na.rm = TRUE), .groups = "drop") %>% + dplyr::mutate(start = .data$bin - (min(.data$bin) - 1), end = .data$bin, width = bin.size) %>% + dplyr::select(dplyr::all_of(c("seqnames", "start", "end", "width", "strand", "score"))) %>% + as.data.frame() + return(binned_df) + } else { + return(df) + } +} diff --git a/R/geom_base.R b/R/geom_base.R index ea83762..c044913 100644 --- a/R/geom_base.R +++ b/R/geom_base.R @@ -183,16 +183,16 @@ ggplot_add.base <- function(object, plot, object_name) { colnames(pos.nuc.freq.long) <- c("Chr", "Pos", "Ref", "Total", "Base", "Freq") # get position with alt alt.pos <- pos.nuc.freq.long %>% - dplyr::filter(Ref == Base & Total != Freq) %>% - dplyr::pull(Pos) %>% + dplyr::filter(.data$Ref == .data$Base & .data$Total != .data$Freq) %>% + dplyr::pull(.data$Pos) %>% unique() - alt.pos.nuc.freq.long <- pos.nuc.freq.long %>% dplyr::filter(Pos %in% c(alt.pos)) + alt.pos.nuc.freq.long <- pos.nuc.freq.long %>% dplyr::filter(.data$Pos %in% c(alt.pos)) # get position without alt ref.pos <- pos.nuc.freq.long %>% - dplyr::filter(Ref == Base & Total == Freq) %>% - dplyr::pull(Pos) %>% + dplyr::filter(.data$Ref == .data$Base & 
.data$Total == .data$Freq) %>% + dplyr::pull(.data$Pos) %>% unique() - ref.pos.nuc.freq.long <- pos.nuc.freq.long %>% dplyr::filter(Pos %in% c(ref.pos)) + ref.pos.nuc.freq.long <- pos.nuc.freq.long %>% dplyr::filter(.data$Pos %in% c(ref.pos)) # create label offset pos.nuc.freq$Offset <- nuc.offset # add guide line diff --git a/R/geom_feature.R b/R/geom_feature.R index 85fd801..2f4db9f 100644 --- a/R/geom_feature.R +++ b/R/geom_feature.R @@ -17,14 +17,29 @@ #' #' @examples #' # library(ggcoverage) -#' # coverage.file <- system.file("extdata", "Proteomics", "MS_BSA_coverage.xlsx", package = "ggcoverage") -#' # fasta.file <- system.file("extdata", "Proteomics", "MS_BSA_coverage.fasta", package = "ggcoverage") +#' # coverage.file <- system.file( +#' # "extdata", "Proteomics", "MS_BSA_coverage.xlsx", package = "ggcoverage" +#' # ) +#' # fasta.file <- system.file( +#' # "extdata", "Proteomics", "MS_BSA_coverage.fasta", package = "ggcoverage" +#' # ) #' # protein.id = "sp|P02769|ALBU_BOVIN" -#' # protein.coverage = ggprotein(coverage.file = coverage.file, fasta.file = fasta.file, protein.id = protein.id) -#' # feature.df = data.frame(ProteinID = protein.id, start = c(1, 19, 25), end = c(18, 24, 607), -#' # Type = c("Signal", "Propeptide", "Chain")) +#' # protein.coverage = ggprotein( +#' # coverage.file = coverage.file, +#' # fasta.file = fasta.file, +#' # protein.id = protein.id +#' # ) +#' # feature.df = data.frame( +#' # ProteinID = protein.id, +#' # start = c(1, 19, 25), +#' # end = c(18, 24, 607), +#' # Type = c("Signal", "Propeptide", "Chain") +#' # ) #' # protein.coverage + -#' # geom_feature(feature.df = feature.df, feature.color = c("#4d81be","#173b5e","#6a521d")) +#' # geom_feature( +#' # feature.df = feature.df, +#' # feature.color = c("#4d81be","#173b5e","#6a521d") +#' # ) geom_feature <- function(feature.file = NULL, feature.df = NULL, feature.color = "black", feature.size = 5, plot.space = 0.1, plot.height = 0.1) { structure(list( diff --git 
a/R/geom_protein.R b/R/geom_protein.R index 6d8afb0..3613769 100644 --- a/R/geom_protein.R +++ b/R/geom_protein.R @@ -14,7 +14,7 @@ #' @param show.table Logical value, whether to show coverage summary table. Default: TRUE. #' @param table.position The position of the coverage summary table, choose from right_top, left_top, left_bottom, right_bottom. #' Default: right_top. -#' @param table.size The font size of coverage summary table. Default: 4. +#' @param table.size The font size of coverage summary table. Default: 12. #' @param table.color The font color of coverage summary table. Default: black. #' @param range.size The label size of range text, used when \code{range.position} is in. Default: 3. #' @param range.position The position of y axis range, chosen from in (move y axis in the plot) and @@ -30,22 +30,26 @@ #' @importFrom GenomicRanges reduce GRanges setdiff #' @importFrom IRanges IRanges #' @importFrom ggplot2 ggplot geom_rect geom_text aes aes_string -#' @importFrom ggpp annotate #' @importFrom scales scientific +#' @importFrom gridExtra ttheme_default tableGrob #' @export #' #' @examples #' library(ggplot2) #' library(ggcoverage) -#' coverage.file <- system.file("extdata", "Proteomics", "MS_BSA_coverage.xlsx", package = "ggcoverage") -#' fasta.file <- system.file("extdata", "Proteomics", "MS_BSA_coverage.fasta", package = "ggcoverage") +#' coverage.file <- system.file( +#' "extdata", "Proteomics", "MS_BSA_coverage.xlsx", package = "ggcoverage" +#' ) +#' fasta.file <- system.file( +#' "extdata", "Proteomics", "MS_BSA_coverage.fasta", package = "ggcoverage" +#' ) #' protein.id = "sp|P02769|ALBU_BOVIN" #' ggplot() + #' geom_protein(coverage.file = coverage.file, fasta.file = fasta.file, protein.id = protein.id) geom_protein <- function(coverage.file, fasta.file, protein.id, XCorr.threshold = 2, confidence = "High", contaminant = NULL, remove.na = TRUE, color = "grey", mark.bare = TRUE, mark.color = "red", mark.alpha = 0.5, - show.table = TRUE, 
table.position = c("right_top", "left_top", "left_bottom", "right_bottom"), + show.table = TRUE, table.position = c("top_right", "top_left", "bottom_right", "bottom_left"), table.size = 4, table.color = "black", range.size = 3, range.position = c("in", "out")) { # check parameters table.position <- match.arg(arg = table.position) @@ -165,22 +169,37 @@ geom_protein <- function(coverage.file, fasta.file, protein.id, XCorr.threshold # summary table if (show.table) { # table position - if (table.position == "left_top") { - table.x <- 0 - table.y <- max(coverage.final[, "abundance"]) - } else if (table.position == "right_top") { - table.x <- nchar(aa.seq.used) - table.y <- max(coverage.final[, "abundance"]) - } else if (table.position == "left_bottom") { - table.x <- 0 - table.y <- 0 - } else if (table.position == "right_bottom") { - table.x <- nchar(aa.seq.used) - table.y <- 0 + + if (table.position == "top_left") { + table_xmin <- 0 + table_xmax <- nchar(aa.seq.used) * 0.5 + table_ymin <- max(coverage.final[, "abundance"]) * 0.5 + table_ymax <- max(coverage.final[, "abundance"]) * 1.0 + } else if (table.position == "top_right") { + table_xmin <- nchar(aa.seq.used) * 0.5 + table_xmax <- nchar(aa.seq.used) * 1.0 + table_ymin <- max(coverage.final[, "abundance"]) * 0.5 + table_ymax <- max(coverage.final[, "abundance"]) * 1.0 + } else if (table.position == "bottom_left") { + table_xmin <- 0 + table_xmax <- nchar(aa.seq.used) * 0.5 + table_ymin <- 0 + table_ymax <- max(coverage.final[, "abundance"]) * 0.5 + } else if (table.position == "bottom_right") { + table_xmin <- nchar(aa.seq.used) * 0.5 + table_xmax <- nchar(aa.seq.used) * 1.0 + table_ymin <- 0 + table_ymax <- max(coverage.final[, "abundance"]) * 0.5 } - summary.table <- ggpp::annotate( - geom = "table", label = list(coverage.summary), x = table.x, y = table.y, - color = table.color, size = table.size + table_theme <- gridExtra::ttheme_default( + base_size = table.size, base_colour = table.color + ) + summary.table 
<- ggplot2::annotation_custom( + grob = gridExtra::tableGrob( + d = coverage.summary, + theme = table_theme), + xmin = table_xmin, xmax = table_xmax, + ymin = table_ymin, ymax = table_ymax ) plot.ele <- append(plot.ele, summary.table) } diff --git a/R/ggprotein.R b/R/ggprotein.R index d0fc2e9..f0388d2 100644 --- a/R/ggprotein.R +++ b/R/ggprotein.R @@ -21,31 +21,28 @@ #' out (normal y axis). Default: in. #' #' @return A ggplot2 object. -#' @importFrom openxlsx read.xlsx -#' @importFrom magrittr %>% -#' @importFrom dplyr filter group_by summarise arrange -#' @importFrom rlang .data -#' @importFrom Biostrings readAAStringSet -#' @importFrom stringr str_locate -#' @importFrom GenomicRanges reduce GRanges setdiff -#' @importFrom IRanges IRanges -#' @importFrom ggplot2 ggplot geom_rect geom_text aes aes_string scale_x_continuous theme_classic theme -#' element_blank annotate rel scale_y_continuous expansion -#' @importFrom ggpp annotate -#' @importFrom scales scientific #' @export #' #' @examples -#' # library(ggcoverage) -#' # coverage.file <- system.file("extdata", "Proteomics", "MS_BSA_coverage.xlsx", package = "ggcoverage") -#' # fasta.file <- system.file("extdata", "Proteomics", "MS_BSA_coverage.fasta", package = "ggcoverage") -#' # protein.id = "sp|P02769|ALBU_BOVIN" -#' # ggprotein(coverage.file = coverage.file, fasta.file = fasta.file, protein.id = protein.id) +#' library(ggcoverage) +#' coverage.file <- system.file( +#' "extdata", "Proteomics", "MS_BSA_coverage.xlsx", package = "ggcoverage" +#' ) +#' fasta.file <- system.file( +#' "extdata", "Proteomics", "MS_BSA_coverage.fasta", package = "ggcoverage" +#' ) +#' protein.id = "sp|P02769|ALBU_BOVIN" +#' +#' ggprotein( +#' coverage.file = coverage.file, +#' fasta.file = fasta.file, +#' protein.id = protein.id +#' ) ggprotein <- function(coverage.file, fasta.file, protein.id, XCorr.threshold = 2, confidence = "High", contaminant = NULL, remove.na = TRUE, color = "grey", mark.bare = TRUE, mark.color = "red", 
mark.alpha = 0.5, - show.table = TRUE, table.position = c("right_top", "left_top", "left_bottom", "right_bottom"), - table.size = 4, table.color = "black", range.size = 3, range.position = c("in", "out"), plot.space = 0.2) { + show.table = TRUE, table.position = c("top_right", "top_left", "bottom_left", "bottom_right"), + table.size = 10, table.color = "black", range.size = 3, range.position = c("in", "out")) { # check parameters table.position <- match.arg(arg = table.position) range.position <- match.arg(arg = range.position) diff --git a/R/utils.R b/R/utils.R index 1920618..148c73c 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,5 +1,5 @@ # prepare GR -PrepareRegion <- function(region = "chr14:21,677,306-21,737,601", +PrepareRegion <- function(region = NULL, gtf.gr = NULL, gene.name = "HNRNPC", gene.name.type = c("gene_name", "gene_id"), extend = 2000) { # check parameters @@ -126,7 +126,7 @@ GetGeneGroup <- function(gene.gr, fc = "queryHits", sc = "subjectHits", overlap. GetGeneGroupTight <- function(gene.gr, overlap.gene.gap = 1) { # convert to dataframe gene.gr.df <- as.data.frame(gene.gr) - gene.gr.df$ID <- 1:nrow(gene.gr.df) + gene.gr.df$ID <- seq_len(nrow(gene.gr.df)) # split to group group.flag <- 1 group.list <- list() @@ -142,12 +142,12 @@ GetGeneGroupTight <- function(gene.gr, overlap.gene.gap = 1) { } } group.list[[paste0("G", group.flag)]] <- vec - gene.gr.df <- gene.gr.df %>% dplyr::filter(!ID %in% vec) + gene.gr.df <- gene.gr.df %>% dplyr::filter(!.data$ID %in% vec) group.flag <- group.flag + 1 } # get group index group.index <- c() - for (g in 1:length(group.list)) { + for (g in seq_along(group.list)) { g.index <- 1 + overlap.gene.gap * (g - 1) g.index.vec <- rep(g.index, length(group.list[[g]])) names(g.index.vec) <- group.list[[g]] @@ -322,7 +322,7 @@ getIdeogram <- function(genome, subchr = NULL, cytobands = TRUE) { # used in geom_base PrepareRect <- function(df, y.center = -0.2) { valid.df <- df[df$aa == "B" | df$anno != "", ] - rect.li <- 
lapply(1:nrow(valid.df), function(x) { + rect.li <- lapply(seq_len(nrow(valid.df)), function(x) { row.info <- valid.df[x, ] if (row.info$aa == "B") { c(row.info$Pos - 0.5, row.info$Pos + 0.5, row.info$aa) diff --git a/README.Rmd b/README.Rmd index fa790d4..7c2049b 100644 --- a/README.Rmd +++ b/README.Rmd @@ -10,7 +10,8 @@ knitr::opts_chunk$set( comment = "#>", fig.path = "man/figures/README-", out.width = "100%", - dpi=60 + dpi = 60, + crop = NULL ) ``` @@ -19,11 +20,15 @@ knitr::opts_chunk$set( [![CRAN](https://www.r-pkg.org/badges/version/ggcoverage?color=orange)](https://cran.r-project.org/package=ggcoverage) +[![R-CMD-check](https://github.com/showteeth/ggcoverage/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/showteeth/ggcoverage/actions/workflows/R-CMD-check.yaml) +![GitHub issues](https://img.shields.io/github/issues/showteeth/ggcoverage) +![GitHub last commit](https://img.shields.io/github/last-commit/showteeth/ggcoverage) ![License](https://img.shields.io/badge/license-MIT-green) [![CODE_SIZE](https://img.shields.io/github/languages/code-size/showteeth/ggcoverage.svg)](https://github.com/showteeth/ggcoverage) ## Introduction - The goal of `ggcoverage` is simplify the process of visualizing omics coverage. It contains three main parts: + +The goal of `ggcoverage` is to simplify the process of visualizing omics coverage. It contains three main parts: * **Load the data**: `ggcoverage` can load BAM, BigWig (.bw), BedGraph, txt/xlsx files from various omics data, including WGS, RNA-seq, ChIP-seq, ATAC-seq, proteomics, et al. * **Create omics coverage plot** @@ -43,21 +48,22 @@ knitr::opts_chunk$set( ## Installation + `ggcoverage` is an R package distributed as part of the [CRAN](https://cran.r-project.org/). 
-To install the package, start R and enter: +To install the package, start R and enter one of the following commands: ```{r install, eval=FALSE} -# install via CRAN +# install via CRAN (not yet available) install.packages("ggcoverage") -# install via Github -# install.package("remotes") #In case you have not installed it. +# OR install via Github +install.packages("remotes") remotes::install_github("showteeth/ggcoverage") ``` In general, it is **recommended** to install from [Github repository](https://github.com/showteeth/ggcoverage) (update more timely). -Once `ggcoverage` is installed, it can be loaded by the following command. +Once `ggcoverage` is installed, it can be loaded (together with other libraries) like this: ```{r library, message=FALSE, warning=FALSE} library("rtracklayer") @@ -66,6 +72,7 @@ library("ggpattern") ``` ## Manual + `ggcoverage` provides two [vignettes](https://showteeth.github.io/ggcoverage/): * **detailed manual**: step-by-step usage @@ -73,251 +80,372 @@ library("ggpattern") ## RNA-seq data + ### Load the data + The RNA-seq data used here are from [Transcription profiling by high throughput sequencing of HNRNPC knockdown and control HeLa cells](https://bioconductor.org/packages/release/data/experiment/html/RNAseqData.HNRNPC.bam.chr14.html), we select four sample to use as example: ERR127307_chr14, ERR127306_chr14, ERR127303_chr14, ERR127302_chr14, and all bam files are converted to bigwig file with [deeptools](https://deeptools.readthedocs.io/en/develop/). 
Load metadata: + ```{r load_metadata} # load metadata -meta.file <- system.file("extdata", "RNA-seq", "meta_info.csv", package = "ggcoverage") -sample.meta = read.csv(meta.file) -sample.meta +meta_file <- + system.file("extdata", "RNA-seq", "meta_info.csv", package = "ggcoverage") +sample_meta <- read.csv(meta_file) +sample_meta ``` Load track files: + ```{r load_track} # track folder -track.folder = system.file("extdata", "RNA-seq", package = "ggcoverage") +track_folder <- system.file("extdata", "RNA-seq", package = "ggcoverage") # load bigwig file -track.df = LoadTrackFile(track.folder = track.folder, format = "bw", - region = "chr14:21,677,306-21,737,601", extend = 2000, - meta.info = sample.meta) +track_df <- LoadTrackFile( + track.folder = track_folder, + format = "bw", + region = "chr14:21,677,306-21,737,601", + extend = 2000, + meta.info = sample_meta +) # check data -head(track.df) +head(track_df) ``` Prepare mark region: + ```{r prepare_mark} # create mark region -mark.region=data.frame(start=c(21678900,21732001,21737590), - end=c(21679900,21732400,21737650), - label=c("M1", "M2", "M3")) +mark_region <- data.frame( + start = c(21678900, 21732001, 21737590), + end = c(21679900, 21732400, 21737650), + label = c("M1", "M2", "M3") +) # check data -mark.region +mark_region ``` ### Load GTF + To add **gene annotation**, the gtf file should contain **gene_type** and **gene_name** attributes in **column 9**; to add **transcript annotation**, the gtf file should contain **transcript_name** attribute in **column 9**. + ```{r load_gtf} -gtf.file = system.file("extdata", "used_hg19.gtf", package = "ggcoverage") -gtf.gr = rtracklayer::import.gff(con = gtf.file, format = 'gtf') +gtf_file <- + system.file("extdata", "used_hg19.gtf", package = "ggcoverage") +gtf_gr <- rtracklayer::import.gff(con = gtf_file, format = "gtf") ``` ### Basic coverage + The basic coverage plot has **two types**: * **facet**: Create subplot for every track (specified by `facet.key`). 
This is default. * **joint**: Visualize all tracks in a single plot. #### joint view + Create line plot for **every sample** (`facet.key = "Type"`) and color by **every sample** (`group.key = "Type"`): + ```{r basic_coverage_joint, warning=FALSE, fig.height = 4, fig.width = 12, fig.align = "center"} -basic.coverage = ggcoverage(data = track.df, - plot.type = "joint", facet.key = "Type", group.key = "Type", - mark.region = mark.region, range.position = "out") -basic.coverage +basic_coverage <- ggcoverage( + data = track_df, + plot.type = "joint", + facet.key = "Type", + group.key = "Type", + mark.region = mark_region, + range.position = "out" +) + +basic_coverage ``` Create **group average line plot** (sample is indicated by `facet.key = "Type"`, group is indicated by `group.key = "Group"`): + ```{r basic_coverage_joint_avg, warning=FALSE, fig.height = 4, fig.width = 12, fig.align = "center"} -basic.coverage = ggcoverage(data = track.df, - plot.type = "joint", facet.key = "Type", group.key = "Group", - joint.avg = TRUE, - mark.region = mark.region, range.position = "out") -basic.coverage +basic_coverage <- ggcoverage( + data = track_df, + plot.type = "joint", + facet.key = "Type", + group.key = "Group", + joint.avg = TRUE, + mark.region = mark_region, + range.position = "out" +) + +basic_coverage ``` -#### facet view +#### Facet view + ```{r basic_coverage, warning=FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} -basic.coverage = ggcoverage(data = track.df, plot.type = "facet", - mark.region = mark.region, range.position = "out") -basic.coverage +basic_coverage <- ggcoverage( + data = track_df, + plot.type = "facet", + mark.region = mark_region, + range.position = "out" +) + +basic_coverage ``` #### Custom Y-axis style + **Change the Y-axis scale label in/out of plot region with `range.position`**: + ```{r basic_coverage_2, warning=FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} -basic.coverage = ggcoverage(data = track.df, plot.type = 
"facet", - mark.region = mark.region, range.position = "in") -basic.coverage +basic_coverage <- ggcoverage( + data = track_df, + plot.type = "facet", + mark.region = mark_region, + range.position = "in" +) + +basic_coverage ``` **Shared/Free Y-axis scale with `facet.y.scale`**: + ```{r basic_coverage_3, warning=FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} -basic.coverage = ggcoverage(data = track.df, plot.type = "facet", - mark.region = mark.region, range.position = "in", - facet.y.scale = "fixed") -basic.coverage +basic_coverage <- ggcoverage( + data = track_df, + plot.type = "facet", + mark.region = mark_region, + range.position = "in", + facet.y.scale = "fixed" +) + +basic_coverage ``` ### Add gene annotation + ```{r gene_coverage, warning=FALSE, fig.height = 8, fig.width = 12, fig.align = "center"} -basic.coverage + - geom_gene(gtf.gr=gtf.gr) +basic_coverage + + geom_gene(gtf.gr = gtf_gr) ``` ### Add transcript annotation + **In "loose" stype (default style; each transcript occupies one line)**: + ```{r transcript_coverage, warning=FALSE, fig.height = 12, fig.width = 12, fig.align = "center"} -basic.coverage + - geom_transcript(gtf.gr=gtf.gr,label.vjust = 1.5) +basic_coverage + + geom_transcript(gtf.gr = gtf_gr, label.vjust = 1.5) ``` **In "tight" style (place non-overlap transcripts in one line)**: + ```{r transcript_coverage_tight, warning=FALSE, fig.height = 12, fig.width = 12, fig.align = "center"} -basic.coverage + - geom_transcript(gtf.gr=gtf.gr, overlap.style = "tight", label.vjust = 1.5) +basic_coverage + + geom_transcript(gtf.gr = gtf_gr, + overlap.style = "tight", + label.vjust = 1.5) ``` ### Add ideogram + ```{r ideogram_coverage_1, warning=FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} -basic.coverage + - geom_gene(gtf.gr=gtf.gr) + - geom_ideogram(genome = "hg19",plot.space = 0) +basic_coverage + + geom_gene(gtf.gr = gtf_gr) + + geom_ideogram(genome = "hg19", plot.space = 0) ``` ```{r ideogram_coverage_2, 
warning=FALSE, fig.height = 14, fig.width = 12, fig.align = "center"} -basic.coverage + - geom_transcript(gtf.gr=gtf.gr,label.vjust = 1.5) + - geom_ideogram(genome = "hg19",plot.space = 0) +basic_coverage + + geom_transcript(gtf.gr = gtf_gr, label.vjust = 1.5) + + geom_ideogram(genome = "hg19", plot.space = 0) ``` ## DNA-seq data + ### CNV -#### Example 1 + +#### Example 1 + ##### Load the data + The DNA-seq data used here are from [Copy number work flow](http://bioconductor.org/help/course-materials/2014/SeattleOct2014/B02.2.3_CopyNumber.html), we select tumor sample, and get bin counts with `cn.mops::getReadCountsFromBAM` with `WL` 1000. ```{r load_bin_counts} # prepare metafile -cnv.meta.info = data.frame( +cnv_meta_info <- data.frame( SampleName = c("CNV_example"), Type = c("tumor"), Group = c("tumor") ) + # track file -track.file = system.file("extdata", "DNA-seq", "CNV_example.txt", package = "ggcoverage") +track_file <- system.file("extdata", + "DNA-seq", "CNV_example.txt", package = "ggcoverage") + # load txt file -track.df = LoadTrackFile(track.file = track.file, format = "txt", region = "chr4:61750000-62,700,000", - meta.info = cnv.meta.info) +track_df <- LoadTrackFile( + track.file = track_file, + format = "txt", + region = "chr4:61750000-62,700,000", + meta.info = cnv_meta_info +) + # check data -head(track.df) +head(track_df) ``` ##### Basic coverage + ```{r basic_coverage_dna, warning=FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} -basic.coverage = ggcoverage(data = track.df,color = "grey", mark.region = NULL, - range.position = "out") -basic.coverage +basic_coverage <- ggcoverage( + data = track_df, + color = "grey", + mark.region = NULL, + range.position = "out" +) +basic_coverage ``` ##### Add GC annotations + Add **GC**, **ideogram** and **gene** annotaions. 
```{r gc_coverage, warning=FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} # load genome data library("BSgenome.Hsapiens.UCSC.hg19") + # create plot -basic.coverage + - geom_gc(bs.fa.seq=BSgenome.Hsapiens.UCSC.hg19) + - geom_gene(gtf.gr=gtf.gr) + +basic_coverage + + geom_gc(bs.fa.seq = BSgenome.Hsapiens.UCSC.hg19) + + geom_gene(gtf.gr = gtf_gr) + geom_ideogram(genome = "hg19") ``` #### Example 2 + ##### Load the data + The DNA-seq data used here are from [Genome-wide copy number analysis of single cells](https://www.nature.com/articles/nprot.2012.039), and the accession number is [SRR054616](https://trace.ncbi.nlm.nih.gov/Traces/index.html?run=SRR054616). ```{r cnv_load_track_file} # track file -track.file <- system.file("extdata", "DNA-seq", "SRR054616.bw", package = "ggcoverage") +track_file <- + system.file("extdata", "DNA-seq", "SRR054616.bw", package = "ggcoverage") + # load track -track.df = LoadTrackFile(track.file = track.file, format = "bw", region = "4:1-160000000") +track_df <- LoadTrackFile(track.file = track_file, + format = "bw", + region = "4:1-160000000") + # add chr prefix -track.df$seqnames = paste0("chr", track.df$seqnames) +track_df$seqnames <- paste0("chr", track_df$seqnames) + # check data -head(track.df) +head(track_df) ``` ##### Basic coverage + ```{r cnv_basic_coverage_dna} -basic.coverage = ggcoverage(data = track.df, color = "grey", - mark.region = NULL, range.position = "out") -basic.coverage +basic_coverage <- ggcoverage( + data = track_df, + color = "grey", + mark.region = NULL, + range.position = "out" +) + +basic_coverage ``` ##### Load CNV file + ```{r cnv_load_cnv} # prepare files -cnv.file <- system.file("extdata", "DNA-seq", "SRR054616_copynumber.txt", package = "ggcoverage") +cnv_file <- + system.file("extdata", "DNA-seq", "SRR054616_copynumber.txt", + package = "ggcoverage") + # read CNV -cnv.df = read.table(file = cnv.file, sep = "\t", header = TRUE) +cnv_df <- read.table(file = cnv_file, sep = "\t", header = 
TRUE) + # check data -head(cnv.df) +head(cnv_df) ``` ##### Add annotations + Add **GC**, **ideogram** and **CNV** annotations. ```{r cnv_gc_coverage} -# load genome data -library("BSgenome.Hsapiens.UCSC.hg19") # create plot -basic.coverage + - geom_gc(bs.fa.seq=BSgenome.Hsapiens.UCSC.hg19) + - geom_cnv(cnv.df = cnv.df, bin.col = 3, cn.col = 4) + - geom_ideogram(genome = "hg19",plot.space = 0, highlight.centromere = TRUE) +basic_coverage + + geom_gc(bs.fa.seq = BSgenome.Hsapiens.UCSC.hg19) + + geom_cnv(cnv.df = cnv_df, + bin.col = 3, + cn.col = 4) + + geom_ideogram( + genome = "hg19", + plot.space = 0, + highlight.centromere = TRUE + ) ``` ### Single-nucleotide level + #### Load the data + ```{r load_single_nuc} # prepare sample metadata -sample.meta <- data.frame( +sample_meta <- data.frame( SampleName = c("tumorA.chr4.selected"), Type = c("tumorA"), Group = c("tumorA") ) + # load bam file -bam.file = system.file("extdata", "DNA-seq", "tumorA.chr4.selected.bam", package = "ggcoverage") -track.df <- LoadTrackFile( - track.file = bam.file, - meta.info = sample.meta, - single.nuc=TRUE, single.nuc.region="chr4:62474235-62474295" +bam_file <- system.file("extdata", + "DNA-seq", "tumorA.chr4.selected.bam", + package = "ggcoverage") + +track_df <- LoadTrackFile( + track.file = bam_file, + meta.info = sample_meta, + single.nuc = TRUE, + single.nuc.region = "chr4:62474235-62474295" ) -head(track.df) + +head(track_df) ``` #### Default color scheme + For base and amino acid annotation, we have following default color schemes, you can change with `nuc.color` and `aa.color` parameters. Default color scheme for base annotation is `Clustal-style`, more popular color schemes is available [here](https://www.biostars.org/p/171056/). 
+ ```{r base_color_scheme, warning=FALSE, fig.height = 2, fig.width = 6, fig.align = "center"} # color scheme -nuc.color = c("A" = "#ff2b08", "C" = "#009aff", "G" = "#ffb507", "T" = "#00bc0d") -opar <- graphics::par() +nuc_color <- c( + "A" = "#ff2b08", "C" = "#009aff", "G" = "#ffb507", "T" = "#00bc0d" +) +opar <- graphics::par() + # create plot graphics::par(mar = c(1, 5, 1, 1)) graphics::image( - 1:length(nuc.color), 1, as.matrix(1:length(nuc.color)), - col = nuc.color, - xlab = "", ylab = "", xaxt = "n", yaxt = "n", bty = "n" + seq_along(nuc_color), + 1, + as.matrix(seq_along(nuc_color)), + col = nuc_color, + xlab = "", + ylab = "", + xaxt = "n", + yaxt = "n", + bty = "n" ) -graphics::text(1:length(nuc.color), 1, names(nuc.color)) +graphics::text(seq_along(nuc_color), 1, names(nuc_color)) graphics::mtext( - text = "Base", adj = 1, las = 1, + text = "Base", + adj = 1, + las = 1, side = 2 ) @@ -326,24 +454,36 @@ graphics::par(opar) ``` Default color scheme for amino acid annotation is from [Residual colours: a proposal for aminochromography](https://academic.oup.com/peds/article/10/7/743/1593029?login=false): + ```{r aa_color_scheme, warning=FALSE, fig.height = 9, fig.width = 10, fig.align = "center"} -aa.color = c( - "D" = "#FF0000", "S" = "#FF2400", "T" = "#E34234", "G" = "#FF8000", "P" = "#F28500", - "C" = "#FFFF00", "A" = "#FDFF00", "V" = "#E3FF00", "I" = "#C0FF00", "L" = "#89318C", - "M" = "#00FF00", "F" = "#50C878", "Y" = "#30D5C8", "W" = "#00FFFF", "H" = "#0F2CB3", - "R" = "#0000FF", "K" = "#4b0082", "N" = "#800080", "Q" = "#FF00FF", "E" = "#8F00FF", - "*" = "#FFC0CB", " " = "#FFFFFF", " " = "#FFFFFF", " " = "#FFFFFF", " " = "#FFFFFF" +aa_color <- c( + "D" = "#FF0000", "S" = "#FF2400", "T" = "#E34234", "G" = "#FF8000", + "P" = "#F28500", "C" = "#FFFF00", "A" = "#FDFF00", "V" = "#E3FF00", + "I" = "#C0FF00", "L" = "#89318C", "M" = "#00FF00", "F" = "#50C878", + "Y" = "#30D5C8", "W" = "#00FFFF", "H" = "#0F2CB3", "R" = "#0000FF", + "K" = "#4b0082", "N" = 
"#800080", "Q" = "#FF00FF", "E" = "#8F00FF", + "*" = "#FFC0CB", " " = "#FFFFFF", " " = "#FFFFFF", " " = "#FFFFFF", + " " = "#FFFFFF" ) graphics::par(mar = c(1, 5, 1, 1)) graphics::image( - 1:5, 1:5, matrix(1:length(aa.color),nrow=5), - col = rev(aa.color), - xlab = "", ylab = "", xaxt = "n", yaxt = "n", bty = "n" + 1:5, + 1:5, + matrix(seq_along(aa_color), nrow = 5), + col = rev(aa_color), + xlab = "", + ylab = "", + xaxt = "n", + yaxt = "n", + bty = "n" ) -graphics::text(expand.grid(1:5,1:5), names(rev(aa.color))) + +graphics::text(expand.grid(1:5, 1:5), names(rev(aa_color))) graphics::mtext( - text = "Amino acids", adj = 1, las = 1, + text = "Amino acids", + adj = 1, + las = 1, side = 2 ) @@ -354,211 +494,319 @@ graphics::par(opar) #### Add base and amino acid annotation **Use twill to mark position with SNV**: + +```{r, echo = FALSE} +# wait some time to avoid 'Too Many Requests' error +Sys.sleep(60) +``` + + ```{r base_aa_coverage, warning=FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} -library(ggpattern) # create plot with twill mark -ggcoverage(data = track.df, color = "grey", range.position = "out", - single.nuc=T, rect.color = "white") + - geom_base(bam.file = bam.file, +ggcoverage( + data = track_df, + color = "grey", + range.position = "out", + single.nuc = TRUE, + rect.color = "white" +) + + geom_base(bam.file = bam_file, bs.fa.seq = BSgenome.Hsapiens.UCSC.hg19, mark.type = "twill") + - geom_ideogram(genome = "hg19",plot.space = 0) + geom_ideogram(genome = "hg19", plot.space = 0) ``` **Use star to mark position with SNV**: + +```{r, echo = FALSE} +# wait some time to avoid 'Too Many Requests' error +Sys.sleep(60) +``` + ```{r base_aa_coverage_star, warning=FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} # create plot with star mark -ggcoverage(data = track.df, color = "grey", range.position = "out", - single.nuc=T, rect.color = "white") + - geom_base(bam.file = bam.file, +ggcoverage( + data = track_df, + color = "grey", + 
range.position = "out", + single.nuc = TRUE, + rect.color = "white" +) + + geom_base(bam.file = bam_file, bs.fa.seq = BSgenome.Hsapiens.UCSC.hg19, mark.type = "star") + - geom_ideogram(genome = "hg19",plot.space = 0) + geom_ideogram(genome = "hg19", plot.space = 0) ``` **Highlight position with SNV**: + +```{r, echo = FALSE} +# wait some time to avoid 'Too Many Requests' error +Sys.sleep(60) +``` + ```{r base_aa_coverage_highlight, warning=FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} -# highlight -ggcoverage(data = track.df, color = "grey", range.position = "out", - single.nuc=T, rect.color = "white") + - geom_base(bam.file = bam.file, +# highlight one base +ggcoverage( + data = track_df, + color = "grey", + range.position = "out", + single.nuc = TRUE, + rect.color = "white" +) + + geom_base(bam.file = bam_file, bs.fa.seq = BSgenome.Hsapiens.UCSC.hg19, mark.type = "highlight") + - geom_ideogram(genome = "hg19",plot.space = 0) + geom_ideogram(genome = "hg19", plot.space = 0) ``` ## ChIP-seq data + The ChIP-seq data used here are from [DiffBind](https://bioconductor.org/packages/release/bioc/html/DiffBind.html), I select four sample to use as example: Chr18_MCF7_input, Chr18_MCF7_ER_1, Chr18_MCF7_ER_3, Chr18_MCF7_ER_2, and all bam files are converted to bigwig file with [deeptools](https://deeptools.readthedocs.io/en/develop/). 
Create metadata: + ```{r load_metadata_chip} # load metadata -sample.meta = data.frame(SampleName=c('Chr18_MCF7_ER_1','Chr18_MCF7_ER_2','Chr18_MCF7_ER_3','Chr18_MCF7_input'), - Type = c("MCF7_ER_1","MCF7_ER_2","MCF7_ER_3","MCF7_input"), - Group = c("IP", "IP", "IP", "Input")) -sample.meta +sample_meta <- data.frame( + SampleName = c( + "Chr18_MCF7_ER_1", + "Chr18_MCF7_ER_2", + "Chr18_MCF7_ER_3", + "Chr18_MCF7_input" + ), + Type = c("MCF7_ER_1", "MCF7_ER_2", "MCF7_ER_3", "MCF7_input"), + Group = c("IP", "IP", "IP", "Input") +) + +sample_meta ``` Load track files: + ```{r load_track_chip} # track folder -track.folder = system.file("extdata", "ChIP-seq", package = "ggcoverage") +track_folder <- system.file("extdata", "ChIP-seq", package = "ggcoverage") + # load bigwig file -track.df = LoadTrackFile(track.folder = track.folder, format = "bw", region = "chr18:76822285-76900000", - meta.info = sample.meta) +track_df <- LoadTrackFile( + track.folder = track_folder, + format = "bw", + region = "chr18:76822285-76900000", + meta.info = sample_meta +) + # check data -head(track.df) +head(track_df) ``` Prepare mark region: + ```{r prepare_mark_chip} # create mark region -mark.region=data.frame(start=c(76822533), - end=c(76823743), - label=c("Promoter")) +mark_region <- data.frame( + start = c(76822533), + end = c(76823743), + label = c("Promoter") +) + # check data -mark.region +mark_region ``` ### Basic coverage + ```{r basic_coverage_chip, warning=FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} -basic.coverage = ggcoverage(data = track.df, - mark.region=mark.region, show.mark.label = FALSE) -basic.coverage +basic_coverage <- ggcoverage(data = track_df, + mark.region = mark_region, + show.mark.label = FALSE) +basic_coverage ``` ### Add annotations + Add **gene**, **ideogram** and **peak** annotations. To create peak annotation, we first **get consensus peaks** with [MSPC](https://github.com/Genometric/MSPC). 
+```{r, echo = FALSE} +# wait some time to avoid 'Too Many Requests' error +Sys.sleep(60) +``` + ```{r peak_coverage, warning=FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} # get consensus peak file -peak.file = system.file("extdata", "ChIP-seq", "consensus.peak", package = "ggcoverage") +peak_file <- system.file("extdata", + "ChIP-seq", + "consensus.peak", + package = "ggcoverage") -basic.coverage + - geom_gene(gtf.gr=gtf.gr) + - geom_peak(bed.file = peak.file) + - geom_ideogram(genome = "hg19",plot.space = 0) +basic_coverage + + geom_gene(gtf.gr = gtf_gr) + + geom_peak(bed.file = peak_file) + + geom_ideogram(genome = "hg19", plot.space = 0) ``` ## Hi-C data + The Hi-C data are from [pyGenomeTracks: reproducible plots for multivariate genomic datasets](https://academic.oup.com/bioinformatics/article/37/3/422/5879987?login=false). The Hi-C matrix visualization is implemented by [HiCBricks](https://github.com/koustav-pal/HiCBricks). ### Load track data + ```{r hic_track} -library(ggcoverage) -library(GenomicRanges) # prepare track dataframe -track.file = system.file("extdata", "HiC", "H3K36me3.bw", package = "ggcoverage") -track.df = LoadTrackFile(track.file = track.file, format = "bw", - region = "chr2L:8050000-8300000", extend = 0) -track.df$score = ifelse(track.df$score <0, 0, track.df$score) +track_file <- + system.file("extdata", "HiC", "H3K36me3.bw", package = "ggcoverage") + +track_df <- LoadTrackFile( + track.file = track_file, + format = "bw", + region = "chr2L:8050000-8300000", + extend = 0 +) + +track_df$score <- ifelse(track_df$score < 0, 0, track_df$score) + # check the data -head(track.df) +head(track_df) ``` ### Load Hi-C data + Matrix: ```{r hic_load_hic_matrix} ## matrix -hic.mat.file = system.file("extdata", "HiC", "HiC_mat.txt", package = "ggcoverage") -hic.mat = read.table(file = hic.mat.file, sep = "\t") -hic.mat = as.matrix(hic.mat) +hic_mat_file <- system.file("extdata", + "HiC", "HiC_mat.txt", package = "ggcoverage") +hic_mat 
<- read.table(file = hic_mat_file, sep = "\t") +hic_mat <- as.matrix(hic_mat) ``` Bin table: ```{r hic_load_hic_bin} ## bin -hic.bin.file = system.file("extdata", "HiC", "HiC_bin.txt", package = "ggcoverage") -hic.bin = read.table(file = hic.bin.file, sep = "\t") -colnames(hic.bin) = c("chr", "start", "end") -hic.bin.gr = GenomicRanges::makeGRangesFromDataFrame(df = hic.bin) +hic_bin_file <- + system.file("extdata", "HiC", "HiC_bin.txt", package = "ggcoverage") +hic_bin <- read.table(file = hic_bin_file, sep = "\t") +colnames(hic_bin) <- c("chr", "start", "end") +hic_bin_gr <- GenomicRanges::makeGRangesFromDataFrame(df = hic_bin) + ## transfrom func -FailSafe_log10 <- function(x){ +failsafe_log10 <- function(x) { x[is.na(x) | is.nan(x) | is.infinite(x)] <- 0 - return(log10(x+1)) + return(log10(x + 1)) } ``` Data transfromation method: -```{r hic_load_hic_transformation} -## transfrom func -FailSafe_log10 <- function(x){ - x[is.na(x) | is.nan(x) | is.infinite(x)] <- 0 - return(log10(x+1)) -} -``` - ### Load link + ```{r hic_load_link} # prepare arcs -link.file = system.file("extdata", "HiC", "HiC_link.bedpe", package = "ggcoverage") +link_file <- + system.file("extdata", "HiC", "HiC_link.bedpe", package = "ggcoverage") ``` ### Basic coverage + ```{r basic_coverage_hic, warning=FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} -basic.coverage = ggcoverage(data = track.df, color = "grey", - mark.region = NULL, range.position = "out") -basic.coverage +basic_coverage <- + ggcoverage( + data = track_df, + color = "grey", + mark.region = NULL, + range.position = "out" + ) + +basic_coverage ``` ### Add annotations + Add **link**, **contact map**annotations: ```{r hic_coverage, warning=FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} -basic.coverage + - geom_tad(matrix = hic.mat, granges = hic.bin.gr, value.cut = 0.99, - color.palette = "viridis", transform.fun = FailSafe_log10, - top = FALSE, show.rect = TRUE) + - geom_link(link.file = link.file, 
file.type = "bedpe", show.rect = TRUE) +basic_coverage + + geom_tad( + matrix = hic_mat, + granges = hic_bin_gr, + value.cut = 0.99, + color.palette = "viridis", + transform.fun = failsafe_log10, + top = FALSE, + show.rect = TRUE + ) + + geom_link(link.file = link_file, + file.type = "bedpe", + show.rect = TRUE) ``` ## Mass spectrometry protein coverage + [Mass spectrometry (MS) is an important method for the accurate mass determination and characterization of proteins, and a variety of methods and instrumentations have been developed for its many uses](https://en.wikipedia.org/wiki/Protein_mass_spectrometry). After MS, we can check the coverage of protein to check the quality of the data and find the reason why the segment did not appear and improve the experiment. ### Load coverage + The exported coverage from [Proteome Discoverer](https://www.thermofisher.cn/cn/zh/home/industrial/mass-spectrometry/liquid-chromatography-mass-spectrometry-lc-ms/lc-ms-software/multi-omics-data-analysis/proteome-discoverer-software.html?adobe_mc=MCMID%7C90228073352279367993013412919222863692%7CMCAID%3D3208C32C269355DE-4000028116B65FEB%7CMCORGID%3D5B135A0C5370E6B40A490D44%40AdobeOrg%7CTS=1614293705): + ```{r ms_coverage_data} library(openxlsx) # prepare coverage dataframe -coverage.file <- system.file("extdata", "Proteomics", "MS_BSA_coverage.xlsx", package = "ggcoverage") -coverage.df <- openxlsx::read.xlsx(coverage.file) +coverage_file <- + system.file("extdata", + "Proteomics", "MS_BSA_coverage.xlsx", package = "ggcoverage") +coverage_df <- openxlsx::read.xlsx(coverage_file, sheet = "Sheet1") # check the data -head(coverage.df) +head(coverage_df) ``` The input protein fasta: + ```{r ms_coverage_fasta} -library(Biostrings) -fasta.file <- system.file("extdata", "Proteomics", "MS_BSA_coverage.fasta", package = "ggcoverage") +fasta_file <- + system.file("extdata", + "Proteomics", "MS_BSA_coverage.fasta", package = "ggcoverage") + # prepare track dataframe -protein.set <- 
Biostrings::readAAStringSet(fasta.file) +protein_set <- Biostrings::readAAStringSet(fasta_file) + # check the data -protein.set +protein_set ``` ### Protein coverage -```{r basic_coverage_protein, warning=FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} -protein.coverage = ggprotein(coverage.file = coverage.file, fasta.file = fasta.file, - protein.id = "sp|P02769|ALBU_BOVIN", range.position = "out") -protein.coverage + +```{r basic_coverage_protein, warning=FALSE, fig.height = 6, fig.width = 10, fig.align = "center"} +protein_coverage <- ggprotein( + coverage.file = coverage_file, + fasta.file = fasta_file, + protein.id = "sp|P02769|ALBU_BOVIN", + range.position = "out" +) + +protein_coverage ``` ### Add annotation + We can obtain features of the protein from [UniProt](https://www.uniprot.org/). For example, the above protein coverage plot shows that there is empty region in 1-24, and this empty region in [UniProt](https://www.uniprot.org/uniprotkb/P02769/entry) is annotated as Signal peptide and Propeptide peptide. When the protein is mature and released extracellular, these peptides will be cleaved. This is the reason why there is empty region in 1-24. 
-```{r basic_coverage_protein_feature, warning=FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} +```{r basic_coverage_protein_feature, warning=FALSE, fig.height = 6, fig.width = 10, fig.align = "center"} # protein feature obtained from UniProt -protein.feature.df = data.frame(ProteinID = "sp|P02769|ALBU_BOVIN", start = c(1, 19, 25), - end = c(18, 24, 607), - Type = c("Signal", "Propeptide", "Chain")) +protein_feature_df <- data.frame( + ProteinID = "sp|P02769|ALBU_BOVIN", + start = c(1, 19, 25), + end = c(18, 24, 607), + Type = c("Signal", "Propeptide", "Chain") +) + # add annotation -protein.coverage + - geom_feature(feature.df = protein.feature.df, feature.color = c("#4d81be","#173b5e","#6a521d")) +protein_coverage + + geom_feature(feature.df = protein_feature_df, + feature.color = c("#4d81be", "#173b5e", "#6a521d")) ``` ## Code of Conduct diff --git a/README.md b/README.md index d6cf7cd..c38abed 100644 --- a/README.md +++ b/README.md @@ -6,37 +6,40 @@ [![CRAN](https://www.r-pkg.org/badges/version/ggcoverage?color=orange)](https://cran.r-project.org/package=ggcoverage) +[![R-CMD-check](https://github.com/showteeth/ggcoverage/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/showteeth/ggcoverage/actions/workflows/R-CMD-check.yaml) +![GitHub +issues](https://img.shields.io/github/issues/showteeth/ggcoverage) +![GitHub last +commit](https://img.shields.io/github/last-commit/showteeth/ggcoverage) ![License](https://img.shields.io/badge/license-MIT-green) -[![CODE\_SIZE](https://img.shields.io/github/languages/code-size/showteeth/ggcoverage.svg)](https://github.com/showteeth/ggcoverage) +[![CODE_SIZE](https://img.shields.io/github/languages/code-size/showteeth/ggcoverage.svg)](https://github.com/showteeth/ggcoverage) ## Introduction The goal of `ggcoverage` is simplify the process of visualizing omics coverage. 
It contains three main parts: -- **Load the data**: `ggcoverage` can load BAM, BigWig (.bw), - BedGraph, txt/xlsx files from various omics data, including WGS, - RNA-seq, ChIP-seq, ATAC-seq, proteomics, et al. -- **Create omics coverage plot** -- **Add annotations**: `ggcoverage` supports six different - annotations: - - **base and amino acid annotation**: Visualize genome coverage at - single-nucleotide level with bases and amino acids. - - **GC annotation**: Visualize genome coverage with GC content - - **CNV annotation**: Visualize genome coverage with copy number - variation (CNV) - - **gene annotation**: Visualize genome coverage across genes - - **transcription annotation**: Visualize genome coverage across - different transcripts - - **ideogram annotation**: Visualize the region showing on whole - chromosome - - **peak annotation**: Visualize genome coverage and peak - identified - - **contact map annotation**: Visualize genome coverage with Hi-C - contact map - - **link annotation**: Visualize genome coverage with contacts - - **peotein feature annotation**: Visualize protein coverage with - features +- **Load the data**: `ggcoverage` can load BAM, BigWig (.bw), BedGraph, + txt/xlsx files from various omics data, including WGS, RNA-seq, + ChIP-seq, ATAC-seq, proteomics, et al. +- **Create omics coverage plot** +- **Add annotations**: `ggcoverage` supports ten different annotations: + - **base and amino acid annotation**: Visualize genome coverage at + single-nucleotide level with bases and amino acids.
+ - **GC annotation**: Visualize genome coverage with GC content + - **CNV annotation**: Visualize genome coverage with copy number + variation (CNV) + - **gene annotation**: Visualize genome coverage across genes + - **transcription annotation**: Visualize genome coverage across + different transcripts + - **ideogram annotation**: Visualize the region showing on whole + chromosome + - **peak annotation**: Visualize genome coverage and peak identified + - **contact map annotation**: Visualize genome coverage with Hi-C + contact map + - **link annotation**: Visualize genome coverage with contacts + - **protein feature annotation**: Visualize protein coverage with + features `ggcoverage` utilizes `ggplot2` plotting system, so its usage is **ggplot2-style**! @@ -45,14 +48,14 @@ coverage. It contains three main parts: `ggcoverage` is an R package distributed as part of the [CRAN](https://cran.r-project.org/). To install the package, start R and -enter: +enter one of the following commands: ``` r -# install via CRAN +# install via CRAN (not yet available) install.packages("ggcoverage") -# install via Github -# install.package("remotes") #In case you have not installed it. +# OR install via Github +install.packages("remotes") remotes::install_github("showteeth/ggcoverage") ``` @@ -60,8 +63,8 @@ In general, it is **recommended** to install from [Github repository](https://github.com/showteeth/ggcoverage) (update more timely). -Once `ggcoverage` is installed, it can be loaded by the following -command.
+Once `ggcoverage` is installed, it can be loaded (together with other +libraries) like this: ``` r library("rtracklayer") @@ -74,8 +77,8 @@ library("ggpattern") `ggcoverage` provides two [vignettes](https://showteeth.github.io/ggcoverage/): -- **detailed manual**: step-by-step usage -- **customize the plot**: customize the plot and add additional layer +- **detailed manual**: step-by-step usage +- **customize the plot**: customize the plot and add additional layer ## RNA-seq data @@ -84,18 +87,19 @@ library("ggpattern") The RNA-seq data used here are from [Transcription profiling by high throughput sequencing of HNRNPC knockdown and control HeLa cells](https://bioconductor.org/packages/release/data/experiment/html/RNAseqData.HNRNPC.bam.chr14.html), -we select four sample to use as example: ERR127307\_chr14, -ERR127306\_chr14, ERR127303\_chr14, ERR127302\_chr14, and all bam files -are converted to bigwig file with +we select four sample to use as example: ERR127307_chr14, +ERR127306_chr14, ERR127303_chr14, ERR127302_chr14, and all bam files are +converted to bigwig file with [deeptools](https://deeptools.readthedocs.io/en/develop/). 
Load metadata: ``` r # load metadata -meta.file <- system.file("extdata", "RNA-seq", "meta_info.csv", package = "ggcoverage") -sample.meta = read.csv(meta.file) -sample.meta +meta_file <- + system.file("extdata", "RNA-seq", "meta_info.csv", package = "ggcoverage") +sample_meta <- read.csv(meta_file) +sample_meta #> SampleName Type Group #> 1 ERR127302_chr14 KO_rep1 KO #> 2 ERR127303_chr14 KO_rep2 KO @@ -107,31 +111,37 @@ Load track files: ``` r # track folder -track.folder = system.file("extdata", "RNA-seq", package = "ggcoverage") +track_folder <- system.file("extdata", "RNA-seq", package = "ggcoverage") # load bigwig file -track.df = LoadTrackFile(track.folder = track.folder, format = "bw", - region = "chr14:21,677,306-21,737,601", extend = 2000, - meta.info = sample.meta) +track_df <- LoadTrackFile( + track.folder = track_folder, + format = "bw", + region = "chr14:21,677,306-21,737,601", + extend = 2000, + meta.info = sample_meta +) # check data -head(track.df) -#> seqnames start end score Type Group -#> 1 chr14 21675306 21675950 0 KO_rep1 KO -#> 2 chr14 21675951 21676000 1 KO_rep1 KO -#> 3 chr14 21676001 21676100 2 KO_rep1 KO -#> 4 chr14 21676101 21676150 1 KO_rep1 KO -#> 5 chr14 21676151 21677100 0 KO_rep1 KO -#> 6 chr14 21677101 21677200 2 KO_rep1 KO +head(track_df) +#> seqnames start end width strand score Type Group +#> 1 chr14 21675306 21675950 645 * 0 KO_rep1 KO +#> 2 chr14 21675951 21676000 50 * 1 KO_rep1 KO +#> 3 chr14 21676001 21676100 100 * 2 KO_rep1 KO +#> 4 chr14 21676101 21676150 50 * 1 KO_rep1 KO +#> 5 chr14 21676151 21677100 950 * 0 KO_rep1 KO +#> 6 chr14 21677101 21677200 100 * 2 KO_rep1 KO ``` Prepare mark region: ``` r # create mark region -mark.region=data.frame(start=c(21678900,21732001,21737590), - end=c(21679900,21732400,21737650), - label=c("M1", "M2", "M3")) +mark_region <- data.frame( + start = c(21678900, 21732001, 21737590), + end = c(21679900, 21732400, 21737650), + label = c("M1", "M2", "M3") +) # check data -mark.region 
+mark_region #> start end label #> 1 21678900 21679900 M1 #> 2 21732001 21732400 M2 @@ -140,23 +150,24 @@ mark.region ### Load GTF -To add **gene annotation**, the gtf file should contain **gene\_type** -and **gene\_name** attributes in **column 9**; to add **transcript -annotation**, the gtf file should contain **transcript\_name** attribute +To add **gene annotation**, the gtf file should contain **gene_type** +and **gene_name** attributes in **column 9**; to add **transcript +annotation**, the gtf file should contain **transcript_name** attribute in **column 9**. ``` r -gtf.file = system.file("extdata", "used_hg19.gtf", package = "ggcoverage") -gtf.gr = rtracklayer::import.gff(con = gtf.file, format = 'gtf') +gtf_file <- + system.file("extdata", "used_hg19.gtf", package = "ggcoverage") +gtf_gr <- rtracklayer::import.gff(con = gtf_file, format = "gtf") ``` ### Basic coverage The basic coverage plot has **two types**: -- **facet**: Create subplot for every track (specified by - `facet.key`). This is default. -- **joint**: Visualize all tracks in a single plot. +- **facet**: Create subplot for every track (specified by `facet.key`). + This is default. +- **joint**: Visualize all tracks in a single plot. 
#### joint view @@ -164,10 +175,16 @@ Create line plot for **every sample** (`facet.key = "Type"`) and color by **every sample** (`group.key = "Type"`): ``` r -basic.coverage = ggcoverage(data = track.df, - plot.type = "joint", facet.key = "Type", group.key = "Type", - mark.region = mark.region, range.position = "out") -basic.coverage +basic_coverage <- ggcoverage( + data = track_df, + plot.type = "joint", + facet.key = "Type", + group.key = "Type", + mark.region = mark_region, + range.position = "out" +) + +basic_coverage ``` @@ -176,21 +193,32 @@ Create **group average line plot** (sample is indicated by `facet.key = "Type"`, group is indicated by `group.key = "Group"`): ``` r -basic.coverage = ggcoverage(data = track.df, - plot.type = "joint", facet.key = "Type", group.key = "Group", - joint.avg = TRUE, - mark.region = mark.region, range.position = "out") -basic.coverage +basic_coverage <- ggcoverage( + data = track_df, + plot.type = "joint", + facet.key = "Type", + group.key = "Group", + joint.avg = TRUE, + mark.region = mark_region, + range.position = "out" +) + +basic_coverage ``` -#### facet view +#### Facet view ``` r -basic.coverage = ggcoverage(data = track.df, plot.type = "facet", - mark.region = mark.region, range.position = "out") -basic.coverage +basic_coverage <- ggcoverage( + data = track_df, + plot.type = "facet", + mark.region = mark_region, + range.position = "out" +) + +basic_coverage ``` @@ -201,9 +229,14 @@ basic.coverage `range.position`**: ``` r -basic.coverage = ggcoverage(data = track.df, plot.type = "facet", - mark.region = mark.region, range.position = "in") -basic.coverage +basic_coverage <- ggcoverage( + data = track_df, + plot.type = "facet", + mark.region = mark_region, + range.position = "in" +) + +basic_coverage ``` @@ -211,10 +244,15 @@ basic.coverage **Shared/Free Y-axis scale with `facet.y.scale`**: ``` r -basic.coverage = ggcoverage(data = track.df, plot.type = "facet", - mark.region = mark.region, range.position = "in", - 
facet.y.scale = "fixed") -basic.coverage +basic_coverage <- ggcoverage( + data = track_df, + plot.type = "facet", + mark.region = mark_region, + range.position = "in", + facet.y.scale = "fixed" +) + +basic_coverage ``` @@ -222,8 +260,8 @@ basic.coverage ### Add gene annotation ``` r -basic.coverage + - geom_gene(gtf.gr=gtf.gr) +basic_coverage + + geom_gene(gtf.gr = gtf_gr) ``` @@ -233,8 +271,8 @@ basic.coverage + **In “loose” stype (default style; each transcript occupies one line)**: ``` r -basic.coverage + - geom_transcript(gtf.gr=gtf.gr,label.vjust = 1.5) +basic_coverage + + geom_transcript(gtf.gr = gtf_gr, label.vjust = 1.5) ``` @@ -242,8 +280,10 @@ basic.coverage + **In “tight” style (place non-overlap transcripts in one line)**: ``` r -basic.coverage + - geom_transcript(gtf.gr=gtf.gr, overlap.style = "tight", label.vjust = 1.5) +basic_coverage + + geom_transcript(gtf.gr = gtf_gr, + overlap.style = "tight", + label.vjust = 1.5) ``` @@ -251,9 +291,9 @@ basic.coverage + ### Add ideogram ``` r -basic.coverage + - geom_gene(gtf.gr=gtf.gr) + - geom_ideogram(genome = "hg19",plot.space = 0) +basic_coverage + + geom_gene(gtf.gr = gtf_gr) + + geom_ideogram(genome = "hg19", plot.space = 0) #> Loading ideogram... #> Loading ranges... #> Scale for x is already present. @@ -263,9 +303,9 @@ basic.coverage + ``` r -basic.coverage + - geom_transcript(gtf.gr=gtf.gr,label.vjust = 1.5) + - geom_ideogram(genome = "hg19",plot.space = 0) +basic_coverage + + geom_transcript(gtf.gr = gtf_gr, label.vjust = 1.5) + + geom_ideogram(genome = "hg19", plot.space = 0) #> Loading ideogram... #> Loading ranges... #> Scale for x is already present. 
@@ -289,18 +329,26 @@ we select tumor sample, and get bin counts with ``` r # prepare metafile -cnv.meta.info = data.frame( +cnv_meta_info <- data.frame( SampleName = c("CNV_example"), Type = c("tumor"), Group = c("tumor") ) + # track file -track.file = system.file("extdata", "DNA-seq", "CNV_example.txt", package = "ggcoverage") +track_file <- system.file("extdata", + "DNA-seq", "CNV_example.txt", package = "ggcoverage") + # load txt file -track.df = LoadTrackFile(track.file = track.file, format = "txt", region = "chr4:61750000-62,700,000", - meta.info = cnv.meta.info) +track_df <- LoadTrackFile( + track.file = track_file, + format = "txt", + region = "chr4:61750000-62,700,000", + meta.info = cnv_meta_info +) + # check data -head(track.df) +head(track_df) #> seqnames start end score Type Group #> 1 chr4 61748000 61748000 25 tumor tumor #> 2 chr4 61748001 61749000 24 tumor tumor @@ -313,9 +361,13 @@ head(track.df) ##### Basic coverage ``` r -basic.coverage = ggcoverage(data = track.df,color = "grey", mark.region = NULL, - range.position = "out") -basic.coverage +basic_coverage <- ggcoverage( + data = track_df, + color = "grey", + mark.region = NULL, + range.position = "out" +) +basic_coverage ``` @@ -335,10 +387,11 @@ library("BSgenome.Hsapiens.UCSC.hg19") #> The following object is masked from 'package:base': #> #> strsplit + # create plot -basic.coverage + - geom_gc(bs.fa.seq=BSgenome.Hsapiens.UCSC.hg19) + - geom_gene(gtf.gr=gtf.gr) + +basic_coverage + + geom_gc(bs.fa.seq = BSgenome.Hsapiens.UCSC.hg19) + + geom_gene(gtf.gr = gtf_gr) + geom_ideogram(genome = "hg19") #> Loading ideogram... #> Loading ranges... 
@@ -359,29 +412,40 @@ accession number is ``` r # track file -track.file <- system.file("extdata", "DNA-seq", "SRR054616.bw", package = "ggcoverage") +track_file <- + system.file("extdata", "DNA-seq", "SRR054616.bw", package = "ggcoverage") + # load track -track.df = LoadTrackFile(track.file = track.file, format = "bw", region = "4:1-160000000") -#> Sample without metadata! +track_df <- LoadTrackFile(track.file = track_file, + format = "bw", + region = "4:1-160000000") +#> No metadata provided, returning coverage as is. + # add chr prefix -track.df$seqnames = paste0("chr", track.df$seqnames) +track_df$seqnames <- paste0("chr", track_df$seqnames) + # check data -head(track.df) -#> seqnames start end score Type Group -#> 1 chr4 1 50000 197 SRR054616.bw SRR054616.bw -#> 2 chr4 50001 100000 598 SRR054616.bw SRR054616.bw -#> 3 chr4 100001 150000 287 SRR054616.bw SRR054616.bw -#> 4 chr4 150001 200000 179 SRR054616.bw SRR054616.bw -#> 5 chr4 200001 250000 282 SRR054616.bw SRR054616.bw -#> 6 chr4 250001 300000 212 SRR054616.bw SRR054616.bw +head(track_df) +#> seqnames start end width strand score Type Group +#> 1 chr4 1 50000 50000 * 197 SRR054616.bw SRR054616.bw +#> 2 chr4 50001 100000 50000 * 598 SRR054616.bw SRR054616.bw +#> 3 chr4 100001 150000 50000 * 287 SRR054616.bw SRR054616.bw +#> 4 chr4 150001 200000 50000 * 179 SRR054616.bw SRR054616.bw +#> 5 chr4 200001 250000 50000 * 282 SRR054616.bw SRR054616.bw +#> 6 chr4 250001 300000 50000 * 212 SRR054616.bw SRR054616.bw ``` ##### Basic coverage ``` r -basic.coverage = ggcoverage(data = track.df, color = "grey", - mark.region = NULL, range.position = "out") -basic.coverage +basic_coverage <- ggcoverage( + data = track_df, + color = "grey", + mark.region = NULL, + range.position = "out" +) + +basic_coverage ``` @@ -390,11 +454,15 @@ basic.coverage ``` r # prepare files -cnv.file <- system.file("extdata", "DNA-seq", "SRR054616_copynumber.txt", package = "ggcoverage") +cnv_file <- + system.file("extdata", "DNA-seq", 
"SRR054616_copynumber.txt", + package = "ggcoverage") + # read CNV -cnv.df = read.table(file = cnv.file, sep = "\t", header = TRUE) +cnv_df <- read.table(file = cnv_file, sep = "\t", header = TRUE) + # check data -head(cnv.df) +head(cnv_df) #> chrom chrompos cn.ratio copy.number #> 1 chr4 1 11.518554 5 #> 2 chr4 90501 5.648878 5 @@ -409,13 +477,17 @@ head(cnv.df) Add **GC**, **ideogram** and **CNV** annotations. ``` r -# load genome data -library("BSgenome.Hsapiens.UCSC.hg19") # create plot -basic.coverage + - geom_gc(bs.fa.seq=BSgenome.Hsapiens.UCSC.hg19) + - geom_cnv(cnv.df = cnv.df, bin.col = 3, cn.col = 4) + - geom_ideogram(genome = "hg19",plot.space = 0, highlight.centromere = TRUE) +basic_coverage + + geom_gc(bs.fa.seq = BSgenome.Hsapiens.UCSC.hg19) + + geom_cnv(cnv.df = cnv_df, + bin.col = 3, + cn.col = 4) + + geom_ideogram( + genome = "hg19", + plot.space = 0, + highlight.centromere = TRUE + ) #> Loading ideogram... #> Loading ranges... #> Scale for x is already present. @@ -430,26 +502,35 @@ basic.coverage + ``` r # prepare sample metadata -sample.meta <- data.frame( +sample_meta <- data.frame( SampleName = c("tumorA.chr4.selected"), Type = c("tumorA"), Group = c("tumorA") ) + # load bam file -bam.file = system.file("extdata", "DNA-seq", "tumorA.chr4.selected.bam", package = "ggcoverage") -track.df <- LoadTrackFile( - track.file = bam.file, - meta.info = sample.meta, - single.nuc=TRUE, single.nuc.region="chr4:62474235-62474295" +bam_file <- system.file("extdata", + "DNA-seq", "tumorA.chr4.selected.bam", + package = "ggcoverage") + +track_df <- LoadTrackFile( + track.file = bam_file, + meta.info = sample_meta, + single.nuc = TRUE, + single.nuc.region = "chr4:62474235-62474295" ) -head(track.df) -#> seqnames start end score Type Group -#> 1 chr4 62474235 62474236 5 tumorA tumorA -#> 2 chr4 62474236 62474237 5 tumorA tumorA -#> 3 chr4 62474237 62474238 5 tumorA tumorA -#> 4 chr4 62474238 62474239 6 tumorA tumorA -#> 5 chr4 62474239 62474240 6 tumorA tumorA 
-#> 6 chr4 62474240 62474241 6 tumorA tumorA +#> No 'region' specified; extracting coverage for an example range +#> (<=100,000 bases, first annotated sequence) +#> Coverage extracted from sequence/chromosome: chr10 + +head(track_df) +#> seqnames start end width strand score Type Group +#> 1 chr4 62474235 62474236 1 * 5 tumorA tumorA +#> 2 chr4 62474236 62474237 1 * 5 tumorA tumorA +#> 3 chr4 62474237 62474238 1 * 5 tumorA tumorA +#> 4 chr4 62474238 62474239 1 * 6 tumorA tumorA +#> 5 chr4 62474239 62474240 1 * 6 tumorA tumorA +#> 6 chr4 62474240 62474241 1 * 6 tumorA tumorA ``` #### Default color scheme @@ -463,18 +544,29 @@ popular color schemes is available ``` r # color scheme -nuc.color = c("A" = "#ff2b08", "C" = "#009aff", "G" = "#ffb507", "T" = "#00bc0d") -opar <- graphics::par() +nuc_color <- c( + "A" = "#ff2b08", "C" = "#009aff", "G" = "#ffb507", "T" = "#00bc0d" +) +opar <- graphics::par() + # create plot graphics::par(mar = c(1, 5, 1, 1)) graphics::image( - 1:length(nuc.color), 1, as.matrix(1:length(nuc.color)), - col = nuc.color, - xlab = "", ylab = "", xaxt = "n", yaxt = "n", bty = "n" + seq_along(nuc_color), + 1, + as.matrix(seq_along(nuc_color)), + col = nuc_color, + xlab = "", + ylab = "", + xaxt = "n", + yaxt = "n", + bty = "n" ) -graphics::text(1:length(nuc.color), 1, names(nuc.color)) +graphics::text(seq_along(nuc_color), 1, names(nuc_color)) graphics::mtext( - text = "Base", adj = 1, las = 1, + text = "Base", + adj = 1, + las = 1, side = 2 ) ``` @@ -482,6 +574,7 @@ graphics::mtext( ``` r + # reset par default graphics::par(opar) ``` @@ -491,23 +584,34 @@ colours: a proposal for aminochromography](https://academic.oup.com/peds/article/10/7/743/1593029?login=false): ``` r -aa.color = c( - "D" = "#FF0000", "S" = "#FF2400", "T" = "#E34234", "G" = "#FF8000", "P" = "#F28500", - "C" = "#FFFF00", "A" = "#FDFF00", "V" = "#E3FF00", "I" = "#C0FF00", "L" = "#89318C", - "M" = "#00FF00", "F" = "#50C878", "Y" = "#30D5C8", "W" = "#00FFFF", "H" = "#0F2CB3", - "R" 
= "#0000FF", "K" = "#4b0082", "N" = "#800080", "Q" = "#FF00FF", "E" = "#8F00FF", - "*" = "#FFC0CB", " " = "#FFFFFF", " " = "#FFFFFF", " " = "#FFFFFF", " " = "#FFFFFF" +aa_color <- c( + "D" = "#FF0000", "S" = "#FF2400", "T" = "#E34234", "G" = "#FF8000", + "P" = "#F28500", "C" = "#FFFF00", "A" = "#FDFF00", "V" = "#E3FF00", + "I" = "#C0FF00", "L" = "#89318C", "M" = "#00FF00", "F" = "#50C878", + "Y" = "#30D5C8", "W" = "#00FFFF", "H" = "#0F2CB3", "R" = "#0000FF", + "K" = "#4b0082", "N" = "#800080", "Q" = "#FF00FF", "E" = "#8F00FF", + "*" = "#FFC0CB", " " = "#FFFFFF", " " = "#FFFFFF", " " = "#FFFFFF", + " " = "#FFFFFF" ) graphics::par(mar = c(1, 5, 1, 1)) graphics::image( - 1:5, 1:5, matrix(1:length(aa.color),nrow=5), - col = rev(aa.color), - xlab = "", ylab = "", xaxt = "n", yaxt = "n", bty = "n" + 1:5, + 1:5, + matrix(seq_along(aa_color), nrow = 5), + col = rev(aa_color), + xlab = "", + ylab = "", + xaxt = "n", + yaxt = "n", + bty = "n" ) -graphics::text(expand.grid(1:5,1:5), names(rev(aa.color))) + +graphics::text(expand.grid(1:5, 1:5), names(rev(aa_color))) graphics::mtext( - text = "Amino acids", adj = 1, las = 1, + text = "Amino acids", + adj = 1, + las = 1, side = 2 ) ``` @@ -515,6 +619,7 @@ graphics::mtext( ``` r + # reset par default graphics::par(opar) ``` @@ -524,14 +629,18 @@ graphics::par(opar) **Use twill to mark position with SNV**: ``` r -library(ggpattern) # create plot with twill mark -ggcoverage(data = track.df, color = "grey", range.position = "out", - single.nuc=T, rect.color = "white") + - geom_base(bam.file = bam.file, +ggcoverage( + data = track_df, + color = "grey", + range.position = "out", + single.nuc = TRUE, + rect.color = "white" +) + + geom_base(bam.file = bam_file, bs.fa.seq = BSgenome.Hsapiens.UCSC.hg19, mark.type = "twill") + - geom_ideogram(genome = "hg19",plot.space = 0) + geom_ideogram(genome = "hg19", plot.space = 0) #> Loading ideogram... #> Loading ranges... #> Scale for x is already present. 
@@ -544,12 +653,17 @@ ggcoverage(data = track.df, color = "grey", range.position = "out", ``` r # create plot with star mark -ggcoverage(data = track.df, color = "grey", range.position = "out", - single.nuc=T, rect.color = "white") + - geom_base(bam.file = bam.file, +ggcoverage( + data = track_df, + color = "grey", + range.position = "out", + single.nuc = TRUE, + rect.color = "white" +) + + geom_base(bam.file = bam_file, bs.fa.seq = BSgenome.Hsapiens.UCSC.hg19, mark.type = "star") + - geom_ideogram(genome = "hg19",plot.space = 0) + geom_ideogram(genome = "hg19", plot.space = 0) #> Loading ideogram... #> Loading ranges... #> Scale for x is already present. @@ -561,13 +675,18 @@ ggcoverage(data = track.df, color = "grey", range.position = "out", **Highlight position with SNV**: ``` r -# highlight -ggcoverage(data = track.df, color = "grey", range.position = "out", - single.nuc=T, rect.color = "white") + - geom_base(bam.file = bam.file, +# highlight one base +ggcoverage( + data = track_df, + color = "grey", + range.position = "out", + single.nuc = TRUE, + rect.color = "white" +) + + geom_base(bam.file = bam_file, bs.fa.seq = BSgenome.Hsapiens.UCSC.hg19, mark.type = "highlight") + - geom_ideogram(genome = "hg19",plot.space = 0) + geom_ideogram(genome = "hg19", plot.space = 0) #> Loading ideogram... #> Loading ranges... #> Scale for x is already present. @@ -580,19 +699,27 @@ ggcoverage(data = track.df, color = "grey", range.position = "out", The ChIP-seq data used here are from [DiffBind](https://bioconductor.org/packages/release/bioc/html/DiffBind.html), -I select four sample to use as example: Chr18\_MCF7\_input, -Chr18\_MCF7\_ER\_1, Chr18\_MCF7\_ER\_3, Chr18\_MCF7\_ER\_2, and all bam -files are converted to bigwig file with +I select four sample to use as example: Chr18_MCF7_input, +Chr18_MCF7_ER_1, Chr18_MCF7_ER_3, Chr18_MCF7_ER_2, and all bam files are +converted to bigwig file with [deeptools](https://deeptools.readthedocs.io/en/develop/). 
Create metadata: ``` r # load metadata -sample.meta = data.frame(SampleName=c('Chr18_MCF7_ER_1','Chr18_MCF7_ER_2','Chr18_MCF7_ER_3','Chr18_MCF7_input'), - Type = c("MCF7_ER_1","MCF7_ER_2","MCF7_ER_3","MCF7_input"), - Group = c("IP", "IP", "IP", "Input")) -sample.meta +sample_meta <- data.frame( + SampleName = c( + "Chr18_MCF7_ER_1", + "Chr18_MCF7_ER_2", + "Chr18_MCF7_ER_3", + "Chr18_MCF7_input" + ), + Type = c("MCF7_ER_1", "MCF7_ER_2", "MCF7_ER_3", "MCF7_input"), + Group = c("IP", "IP", "IP", "Input") +) + +sample_meta #> SampleName Type Group #> 1 Chr18_MCF7_ER_1 MCF7_ER_1 IP #> 2 Chr18_MCF7_ER_2 MCF7_ER_2 IP @@ -604,30 +731,39 @@ Load track files: ``` r # track folder -track.folder = system.file("extdata", "ChIP-seq", package = "ggcoverage") +track_folder <- system.file("extdata", "ChIP-seq", package = "ggcoverage") + # load bigwig file -track.df = LoadTrackFile(track.folder = track.folder, format = "bw", region = "chr18:76822285-76900000", - meta.info = sample.meta) +track_df <- LoadTrackFile( + track.folder = track_folder, + format = "bw", + region = "chr18:76822285-76900000", + meta.info = sample_meta +) + # check data -head(track.df) -#> seqnames start end score Type Group -#> 1 chr18 76820285 76820400 219.658005 MCF7_ER_1 IP -#> 2 chr18 76820401 76820700 0.000000 MCF7_ER_1 IP -#> 3 chr18 76820701 76821000 439.316010 MCF7_ER_1 IP -#> 4 chr18 76821001 76821300 219.658005 MCF7_ER_1 IP -#> 5 chr18 76821301 76821600 0.000000 MCF7_ER_1 IP -#> 6 chr18 76821601 76821900 219.658005 MCF7_ER_1 IP +head(track_df) +#> seqnames start end width strand score Type Group +#> 1 chr18 76820285 76820400 116 * 219.658005 MCF7_ER_1 IP +#> 2 chr18 76820401 76820700 300 * 0.000000 MCF7_ER_1 IP +#> 3 chr18 76820701 76821000 300 * 439.316010 MCF7_ER_1 IP +#> 4 chr18 76821001 76821300 300 * 219.658005 MCF7_ER_1 IP +#> 5 chr18 76821301 76821600 300 * 0.000000 MCF7_ER_1 IP +#> 6 chr18 76821601 76821900 300 * 219.658005 MCF7_ER_1 IP ``` Prepare mark region: ``` r # create mark region 
-mark.region=data.frame(start=c(76822533), - end=c(76823743), - label=c("Promoter")) +mark_region <- data.frame( + start = c(76822533), + end = c(76823743), + label = c("Promoter") +) + # check data -mark.region +mark_region #> start end label #> 1 76822533 76823743 Promoter ``` @@ -635,9 +771,10 @@ mark.region ### Basic coverage ``` r -basic.coverage = ggcoverage(data = track.df, - mark.region=mark.region, show.mark.label = FALSE) -basic.coverage +basic_coverage <- ggcoverage(data = track_df, + mark.region = mark_region, + show.mark.label = FALSE) +basic_coverage ``` @@ -650,12 +787,15 @@ annotation, we first **get consensus peaks** with ``` r # get consensus peak file -peak.file = system.file("extdata", "ChIP-seq", "consensus.peak", package = "ggcoverage") - -basic.coverage + - geom_gene(gtf.gr=gtf.gr) + - geom_peak(bed.file = peak.file) + - geom_ideogram(genome = "hg19",plot.space = 0) +peak_file <- system.file("extdata", + "ChIP-seq", + "consensus.peak", + package = "ggcoverage") + +basic_coverage + + geom_gene(gtf.gr = gtf_gr) + + geom_peak(bed.file = peak_file) + + geom_ideogram(genome = "hg19", plot.space = 0) #> Loading ideogram... #> Loading ranges... #> Scale for x is already present. @@ -676,23 +816,29 @@ The Hi-C matrix visualization is implemented by ### Load track data ``` r -library(ggcoverage) -library(GenomicRanges) # prepare track dataframe -track.file = system.file("extdata", "HiC", "H3K36me3.bw", package = "ggcoverage") -track.df = LoadTrackFile(track.file = track.file, format = "bw", - region = "chr2L:8050000-8300000", extend = 0) -#> Sample without metadata! -track.df$score = ifelse(track.df$score <0, 0, track.df$score) +track_file <- + system.file("extdata", "HiC", "H3K36me3.bw", package = "ggcoverage") + +track_df <- LoadTrackFile( + track.file = track_file, + format = "bw", + region = "chr2L:8050000-8300000", + extend = 0 +) +#> No metadata provided, returning coverage as is. 
+ +track_df$score <- ifelse(track_df$score < 0, 0, track_df$score) + # check the data -head(track.df) -#> seqnames start end score Type Group -#> 1 chr2L 8050000 8050009 1.66490245 H3K36me3.bw H3K36me3.bw -#> 2 chr2L 8050015 8050049 1.59976900 H3K36me3.bw H3K36me3.bw -#> 3 chr2L 8050057 8050091 1.60730922 H3K36me3.bw H3K36me3.bw -#> 4 chr2L 8050097 8050131 1.65555012 H3K36me3.bw H3K36me3.bw -#> 5 chr2L 8050137 8050171 1.71025538 H3K36me3.bw H3K36me3.bw -#> 6 chr2L 8050176 8050210 1.75198197 H3K36me3.bw H3K36me3.bw +head(track_df) +#> seqnames start end width strand score Type Group +#> 1 chr2L 8050000 8050009 10 * 1.66490245 H3K36me3.bw H3K36me3.bw +#> 2 chr2L 8050015 8050049 35 * 1.59976900 H3K36me3.bw H3K36me3.bw +#> 3 chr2L 8050057 8050091 35 * 1.60730922 H3K36me3.bw H3K36me3.bw +#> 4 chr2L 8050097 8050131 35 * 1.65555012 H3K36me3.bw H3K36me3.bw +#> 5 chr2L 8050137 8050171 35 * 1.71025538 H3K36me3.bw H3K36me3.bw +#> 6 chr2L 8050176 8050210 35 * 1.75198197 H3K36me3.bw H3K36me3.bw ``` ### Load Hi-C data @@ -701,49 +847,51 @@ Matrix: ``` r ## matrix -hic.mat.file = system.file("extdata", "HiC", "HiC_mat.txt", package = "ggcoverage") -hic.mat = read.table(file = hic.mat.file, sep = "\t") -hic.mat = as.matrix(hic.mat) +hic_mat_file <- system.file("extdata", + "HiC", "HiC_mat.txt", package = "ggcoverage") +hic_mat <- read.table(file = hic_mat_file, sep = "\t") +hic_mat <- as.matrix(hic_mat) ``` Bin table: ``` r ## bin -hic.bin.file = system.file("extdata", "HiC", "HiC_bin.txt", package = "ggcoverage") -hic.bin = read.table(file = hic.bin.file, sep = "\t") -colnames(hic.bin) = c("chr", "start", "end") -hic.bin.gr = GenomicRanges::makeGRangesFromDataFrame(df = hic.bin) +hic_bin_file <- + system.file("extdata", "HiC", "HiC_bin.txt", package = "ggcoverage") +hic_bin <- read.table(file = hic_bin_file, sep = "\t") +colnames(hic_bin) <- c("chr", "start", "end") +hic_bin_gr <- GenomicRanges::makeGRangesFromDataFrame(df = hic_bin) + ## transfrom func -FailSafe_log10 <- 
function(x){ +failsafe_log10 <- function(x) { x[is.na(x) | is.nan(x) | is.infinite(x)] <- 0 - return(log10(x+1)) + return(log10(x + 1)) } ``` Data transfromation method: -``` r -## transfrom func -FailSafe_log10 <- function(x){ - x[is.na(x) | is.nan(x) | is.infinite(x)] <- 0 - return(log10(x+1)) -} -``` - ### Load link ``` r # prepare arcs -link.file = system.file("extdata", "HiC", "HiC_link.bedpe", package = "ggcoverage") +link_file <- + system.file("extdata", "HiC", "HiC_link.bedpe", package = "ggcoverage") ``` ### Basic coverage ``` r -basic.coverage = ggcoverage(data = track.df, color = "grey", - mark.region = NULL, range.position = "out") -basic.coverage +basic_coverage <- + ggcoverage( + data = track_df, + color = "grey", + mark.region = NULL, + range.position = "out" + ) + +basic_coverage ``` @@ -753,11 +901,19 @@ basic.coverage Add **link**, **contact map**annotations: ``` r -basic.coverage + - geom_tad(matrix = hic.mat, granges = hic.bin.gr, value.cut = 0.99, - color.palette = "viridis", transform.fun = FailSafe_log10, - top = FALSE, show.rect = TRUE) + - geom_link(link.file = link.file, file.type = "bedpe", show.rect = TRUE) +basic_coverage + + geom_tad( + matrix = hic_mat, + granges = hic_bin_gr, + value.cut = 0.99, + color.palette = "viridis", + transform.fun = failsafe_log10, + top = FALSE, + show.rect = TRUE + ) + + geom_link(link.file = link_file, + file.type = "bedpe", + show.rect = TRUE) #> Read 534 lines after Skipping 0 lines #> Inserting Data at location: 1 #> Data length: 534 @@ -789,10 +945,12 @@ Discoverer](https://www.thermofisher.cn/cn/zh/home/industrial/mass-spectrometry/ ``` r library(openxlsx) # prepare coverage dataframe -coverage.file <- system.file("extdata", "Proteomics", "MS_BSA_coverage.xlsx", package = "ggcoverage") -coverage.df <- openxlsx::read.xlsx(coverage.file) +coverage_file <- + system.file("extdata", + "Proteomics", "MS_BSA_coverage.xlsx", package = "ggcoverage") +coverage_df <- openxlsx::read.xlsx(coverage_file, sheet = 
"Sheet1") # check the data -head(coverage.df) +head(coverage_df) #> Confidence Annotated.Sequence #> 1 High [K].ATEEQLKTVMENFVAFVDKCCAADDKEACFAVEGPK.[L] #> 2 High [K].ATEEQLKTVMENFVAFVDKCCAADDKEACFAVEGPK.[L] @@ -847,12 +1005,15 @@ head(coverage.df) The input protein fasta: ``` r -library(Biostrings) -fasta.file <- system.file("extdata", "Proteomics", "MS_BSA_coverage.fasta", package = "ggcoverage") +fasta_file <- + system.file("extdata", + "Proteomics", "MS_BSA_coverage.fasta", package = "ggcoverage") + # prepare track dataframe -protein.set <- Biostrings::readAAStringSet(fasta.file) +protein_set <- Biostrings::readAAStringSet(fasta_file) + # check the data -protein.set +protein_set #> AAStringSet object of length 2: #> width seq names #> [1] 607 MKWVTFISLLLLFSSAYSRGVFR...DDKEACFAVEGPKLVVSTQTALA sp|P02769|ALBU_BOVIN @@ -862,9 +1023,14 @@ protein.set ### Protein coverage ``` r -protein.coverage = ggprotein(coverage.file = coverage.file, fasta.file = fasta.file, - protein.id = "sp|P02769|ALBU_BOVIN", range.position = "out") -protein.coverage +protein_coverage <- ggprotein( + coverage.file = coverage_file, + fasta.file = fasta_file, + protein.id = "sp|P02769|ALBU_BOVIN", + range.position = "out" +) + +protein_coverage ``` @@ -881,12 +1047,17 @@ is the reason why there is empty region in 1-24. 
``` r # protein feature obtained from UniProt -protein.feature.df = data.frame(ProteinID = "sp|P02769|ALBU_BOVIN", start = c(1, 19, 25), - end = c(18, 24, 607), - Type = c("Signal", "Propeptide", "Chain")) +protein_feature_df <- data.frame( + ProteinID = "sp|P02769|ALBU_BOVIN", + start = c(1, 19, 25), + end = c(18, 24, 607), + Type = c("Signal", "Propeptide", "Chain") +) + # add annotation -protein.coverage + - geom_feature(feature.df = protein.feature.df, feature.color = c("#4d81be","#173b5e","#6a521d")) +protein_coverage + + geom_feature(feature.df = protein_feature_df, + feature.color = c("#4d81be", "#173b5e", "#6a521d")) ``` diff --git a/docs/man/figures/README-base_aa_coverage-1.png b/docs/man/figures/README-base_aa_coverage-1.png deleted file mode 100644 index ca4a142..0000000 Binary files a/docs/man/figures/README-base_aa_coverage-1.png and /dev/null differ diff --git a/docs/man/figures/README-base_aa_coverage_highlight-1.png b/docs/man/figures/README-base_aa_coverage_highlight-1.png deleted file mode 100644 index f041b1c..0000000 Binary files a/docs/man/figures/README-base_aa_coverage_highlight-1.png and /dev/null differ diff --git a/docs/man/figures/README-base_aa_coverage_star-1.png b/docs/man/figures/README-base_aa_coverage_star-1.png deleted file mode 100644 index 9fdaa90..0000000 Binary files a/docs/man/figures/README-base_aa_coverage_star-1.png and /dev/null differ diff --git a/docs/man/figures/README-basic_coverage-1.png b/docs/man/figures/README-basic_coverage-1.png deleted file mode 100644 index 96fd179..0000000 Binary files a/docs/man/figures/README-basic_coverage-1.png and /dev/null differ diff --git a/docs/man/figures/README-basic_coverage_2-1.png b/docs/man/figures/README-basic_coverage_2-1.png deleted file mode 100644 index 235c9c7..0000000 Binary files a/docs/man/figures/README-basic_coverage_2-1.png and /dev/null differ diff --git a/docs/man/figures/README-basic_coverage_3-1.png b/docs/man/figures/README-basic_coverage_3-1.png deleted file 
mode 100644 index 4414e30..0000000 Binary files a/docs/man/figures/README-basic_coverage_3-1.png and /dev/null differ diff --git a/docs/man/figures/README-basic_coverage_chip-1.png b/docs/man/figures/README-basic_coverage_chip-1.png deleted file mode 100644 index f2dacd3..0000000 Binary files a/docs/man/figures/README-basic_coverage_chip-1.png and /dev/null differ diff --git a/docs/man/figures/README-basic_coverage_dna-1.png b/docs/man/figures/README-basic_coverage_dna-1.png deleted file mode 100644 index c5d0ea3..0000000 Binary files a/docs/man/figures/README-basic_coverage_dna-1.png and /dev/null differ diff --git a/docs/man/figures/README-basic_coverage_hic-1.png b/docs/man/figures/README-basic_coverage_hic-1.png deleted file mode 100644 index 0d153d1..0000000 Binary files a/docs/man/figures/README-basic_coverage_hic-1.png and /dev/null differ diff --git a/docs/man/figures/README-basic_coverage_joint-1.png b/docs/man/figures/README-basic_coverage_joint-1.png deleted file mode 100644 index a899407..0000000 Binary files a/docs/man/figures/README-basic_coverage_joint-1.png and /dev/null differ diff --git a/docs/man/figures/README-basic_coverage_joint_avg-1.png b/docs/man/figures/README-basic_coverage_joint_avg-1.png deleted file mode 100644 index e224675..0000000 Binary files a/docs/man/figures/README-basic_coverage_joint_avg-1.png and /dev/null differ diff --git a/docs/man/figures/README-basic_coverage_protein-1.png b/docs/man/figures/README-basic_coverage_protein-1.png deleted file mode 100644 index e83e8b2..0000000 Binary files a/docs/man/figures/README-basic_coverage_protein-1.png and /dev/null differ diff --git a/docs/man/figures/README-basic_coverage_protein_feature-1.png b/docs/man/figures/README-basic_coverage_protein_feature-1.png deleted file mode 100644 index 6f4265e..0000000 Binary files a/docs/man/figures/README-basic_coverage_protein_feature-1.png and /dev/null differ diff --git a/docs/man/figures/README-cnv_basic_coverage_dna-1.png 
b/docs/man/figures/README-cnv_basic_coverage_dna-1.png deleted file mode 100644 index db54526..0000000 Binary files a/docs/man/figures/README-cnv_basic_coverage_dna-1.png and /dev/null differ diff --git a/docs/man/figures/README-cnv_gc_coverage-1.png b/docs/man/figures/README-cnv_gc_coverage-1.png deleted file mode 100644 index 1778280..0000000 Binary files a/docs/man/figures/README-cnv_gc_coverage-1.png and /dev/null differ diff --git a/docs/man/figures/README-gc_coverage-1.png b/docs/man/figures/README-gc_coverage-1.png deleted file mode 100644 index ed8330f..0000000 Binary files a/docs/man/figures/README-gc_coverage-1.png and /dev/null differ diff --git a/docs/man/figures/README-gene_coverage-1.png b/docs/man/figures/README-gene_coverage-1.png deleted file mode 100644 index 77e1aae..0000000 Binary files a/docs/man/figures/README-gene_coverage-1.png and /dev/null differ diff --git a/docs/man/figures/README-hic_coverage-1.png b/docs/man/figures/README-hic_coverage-1.png deleted file mode 100644 index 5d6f5eb..0000000 Binary files a/docs/man/figures/README-hic_coverage-1.png and /dev/null differ diff --git a/docs/man/figures/README-ideogram_coverage_1-1.png b/docs/man/figures/README-ideogram_coverage_1-1.png deleted file mode 100644 index 78f1b3b..0000000 Binary files a/docs/man/figures/README-ideogram_coverage_1-1.png and /dev/null differ diff --git a/docs/man/figures/README-ideogram_coverage_2-1.png b/docs/man/figures/README-ideogram_coverage_2-1.png deleted file mode 100644 index a0a3aea..0000000 Binary files a/docs/man/figures/README-ideogram_coverage_2-1.png and /dev/null differ diff --git a/docs/man/figures/README-peak_coverage-1.png b/docs/man/figures/README-peak_coverage-1.png deleted file mode 100644 index 418bd0e..0000000 Binary files a/docs/man/figures/README-peak_coverage-1.png and /dev/null differ diff --git a/docs/man/figures/README-transcript_coverage-1.png b/docs/man/figures/README-transcript_coverage-1.png deleted file mode 100644 index 
61ce28e..0000000 Binary files a/docs/man/figures/README-transcript_coverage-1.png and /dev/null differ diff --git a/docs/man/figures/README-transcript_coverage_tight-1.png b/docs/man/figures/README-transcript_coverage_tight-1.png deleted file mode 100644 index 2189a8d..0000000 Binary files a/docs/man/figures/README-transcript_coverage_tight-1.png and /dev/null differ diff --git a/docs/man/figures/ggcoverage.png b/docs/man/figures/ggcoverage.png deleted file mode 100644 index 9846c3c..0000000 Binary files a/docs/man/figures/ggcoverage.png and /dev/null differ diff --git a/docs/man/figures/time_memory_large.png b/docs/man/figures/time_memory_large.png deleted file mode 100644 index 3f89265..0000000 Binary files a/docs/man/figures/time_memory_large.png and /dev/null differ diff --git a/man/LoadTrackFile.Rd b/man/LoadTrackFile.Rd index b1d0dff..0032b8d 100644 --- a/man/LoadTrackFile.Rd +++ b/man/LoadTrackFile.Rd @@ -8,7 +8,7 @@ LoadTrackFile( track.file, track.folder = NULL, format = c("bam", "wig", "bw", "bedgraph", "txt"), - region = "chr14:21,677,306-21,737,601", + region = NULL, extend = 2000, gtf.gr = NULL, gene.name = "HNRNPC", @@ -31,8 +31,8 @@ LoadTrackFile( \item{format}{Track file format, chosen from bam, wig, bw(bigwig), bedgraph(bedGraph) and txt.} -\item{region}{Region used to create coverage plot, eg: chr14:21,677,306-21,737,601 or chr14:21,677,306. -Default: "chr14:21,677,306-21,737,601"} +\item{region}{Region to extract coverage for, eg: chr14:21,677,306-21,737,601 or chr14:21,677,306. +Default: NULL, coverage is extracted from the first annotated chromosome/sequence.} \item{extend}{Extend length of \code{region}. Default: 2000.} @@ -58,11 +58,12 @@ Default: RPKM.} \item{single.nuc.region}{Region for \code{single.nuc}. Default: NULL} -\item{bin.size}{Size of the bins, in bases. Default: 50.} +\item{bin.size}{Size of the bins, in bases. Default: 10. Only used for BAM files, ignored for Wig, Bigwig, etc. 
+Set to NULL to turn binning off.} \item{bc.extra.para}{Extra parameters for \code{bamCoverage}, eg: "--effectiveGenomeSize 2700000000 --ignoreForNormalization chrX"} -\item{n.cores}{The number of cores to be used for this job. Default:1.} +\item{n.cores}{The number of cores to be used for this job. Default: 1.} } \value{ A dataframe. @@ -73,13 +74,19 @@ Load Track File to Dataframe. \examples{ library(ggcoverage) library(utils) + meta.file <- system.file("extdata", "RNA-seq", "meta_info.csv", package = "ggcoverage") sample.meta <- utils::read.csv(meta.file) + # track folder track.folder <- system.file("extdata", "RNA-seq", package = "ggcoverage") + # load bigwig file track.df <- LoadTrackFile( - track.folder = track.folder, format = "bw", region = "chr14:21,677,306-21,737,601", - extend = 2000, meta.info = sample.meta + track.folder = track.folder, + format = "bw", + region = "chr14:21,677,306-21,737,601", + extend = 2000, + meta.info = sample.meta ) } diff --git a/man/figures/README-aa_color_scheme-1.png b/man/figures/README-aa_color_scheme-1.png index f833829..39bdad2 100644 Binary files a/man/figures/README-aa_color_scheme-1.png and b/man/figures/README-aa_color_scheme-1.png differ diff --git a/man/figures/README-base_aa_coverage-1.png b/man/figures/README-base_aa_coverage-1.png index ca4a142..38c0e3d 100644 Binary files a/man/figures/README-base_aa_coverage-1.png and b/man/figures/README-base_aa_coverage-1.png differ diff --git a/man/figures/README-base_aa_coverage_highlight-1.png b/man/figures/README-base_aa_coverage_highlight-1.png index f041b1c..2cc380a 100644 Binary files a/man/figures/README-base_aa_coverage_highlight-1.png and b/man/figures/README-base_aa_coverage_highlight-1.png differ diff --git a/man/figures/README-base_aa_coverage_star-1.png b/man/figures/README-base_aa_coverage_star-1.png index 9fdaa90..c85ae54 100644 Binary files a/man/figures/README-base_aa_coverage_star-1.png and b/man/figures/README-base_aa_coverage_star-1.png differ diff --git 
a/man/figures/README-base_color_scheme-1.png b/man/figures/README-base_color_scheme-1.png index bf66892..4711087 100644 Binary files a/man/figures/README-base_color_scheme-1.png and b/man/figures/README-base_color_scheme-1.png differ diff --git a/man/figures/README-basic_coverage-1.png b/man/figures/README-basic_coverage-1.png index 96fd179..3170da2 100644 Binary files a/man/figures/README-basic_coverage-1.png and b/man/figures/README-basic_coverage-1.png differ diff --git a/man/figures/README-basic_coverage_2-1.png b/man/figures/README-basic_coverage_2-1.png index 235c9c7..c140810 100644 Binary files a/man/figures/README-basic_coverage_2-1.png and b/man/figures/README-basic_coverage_2-1.png differ diff --git a/man/figures/README-basic_coverage_3-1.png b/man/figures/README-basic_coverage_3-1.png index 4414e30..13ea576 100644 Binary files a/man/figures/README-basic_coverage_3-1.png and b/man/figures/README-basic_coverage_3-1.png differ diff --git a/man/figures/README-basic_coverage_chip-1.png b/man/figures/README-basic_coverage_chip-1.png index f2dacd3..0f4fd0b 100644 Binary files a/man/figures/README-basic_coverage_chip-1.png and b/man/figures/README-basic_coverage_chip-1.png differ diff --git a/man/figures/README-basic_coverage_dna-1.png b/man/figures/README-basic_coverage_dna-1.png index c5d0ea3..48d4940 100644 Binary files a/man/figures/README-basic_coverage_dna-1.png and b/man/figures/README-basic_coverage_dna-1.png differ diff --git a/man/figures/README-basic_coverage_hic-1.png b/man/figures/README-basic_coverage_hic-1.png index 0d153d1..ff6f01b 100644 Binary files a/man/figures/README-basic_coverage_hic-1.png and b/man/figures/README-basic_coverage_hic-1.png differ diff --git a/man/figures/README-basic_coverage_joint-1.png b/man/figures/README-basic_coverage_joint-1.png index a899407..6437c09 100644 Binary files a/man/figures/README-basic_coverage_joint-1.png and b/man/figures/README-basic_coverage_joint-1.png differ diff --git 
a/man/figures/README-basic_coverage_joint_avg-1.png b/man/figures/README-basic_coverage_joint_avg-1.png index e224675..dd8efc8 100644 Binary files a/man/figures/README-basic_coverage_joint_avg-1.png and b/man/figures/README-basic_coverage_joint_avg-1.png differ diff --git a/man/figures/README-basic_coverage_protein-1.png b/man/figures/README-basic_coverage_protein-1.png index e83e8b2..d4e0c66 100644 Binary files a/man/figures/README-basic_coverage_protein-1.png and b/man/figures/README-basic_coverage_protein-1.png differ diff --git a/man/figures/README-basic_coverage_protein_feature-1.png b/man/figures/README-basic_coverage_protein_feature-1.png index 6f4265e..caaf3ac 100644 Binary files a/man/figures/README-basic_coverage_protein_feature-1.png and b/man/figures/README-basic_coverage_protein_feature-1.png differ diff --git a/man/figures/README-cnv_basic_coverage_dna-1.png b/man/figures/README-cnv_basic_coverage_dna-1.png index db54526..7b585a5 100644 Binary files a/man/figures/README-cnv_basic_coverage_dna-1.png and b/man/figures/README-cnv_basic_coverage_dna-1.png differ diff --git a/man/figures/README-cnv_gc_coverage-1.png b/man/figures/README-cnv_gc_coverage-1.png index 1778280..4b4f42d 100644 Binary files a/man/figures/README-cnv_gc_coverage-1.png and b/man/figures/README-cnv_gc_coverage-1.png differ diff --git a/man/figures/README-gc_coverage-1.png b/man/figures/README-gc_coverage-1.png index ed8330f..e7532ca 100644 Binary files a/man/figures/README-gc_coverage-1.png and b/man/figures/README-gc_coverage-1.png differ diff --git a/man/figures/README-gene_coverage-1.png b/man/figures/README-gene_coverage-1.png index 77e1aae..ae4c87f 100644 Binary files a/man/figures/README-gene_coverage-1.png and b/man/figures/README-gene_coverage-1.png differ diff --git a/man/figures/README-hic_coverage-1.png b/man/figures/README-hic_coverage-1.png index 5d6f5eb..d290975 100644 Binary files a/man/figures/README-hic_coverage-1.png and b/man/figures/README-hic_coverage-1.png 
differ diff --git a/man/figures/README-ideogram_coverage_1-1.png b/man/figures/README-ideogram_coverage_1-1.png index 78f1b3b..8adde79 100644 Binary files a/man/figures/README-ideogram_coverage_1-1.png and b/man/figures/README-ideogram_coverage_1-1.png differ diff --git a/man/figures/README-ideogram_coverage_2-1.png b/man/figures/README-ideogram_coverage_2-1.png index a0a3aea..12cbb1b 100644 Binary files a/man/figures/README-ideogram_coverage_2-1.png and b/man/figures/README-ideogram_coverage_2-1.png differ diff --git a/man/figures/README-peak_coverage-1.png b/man/figures/README-peak_coverage-1.png index 418bd0e..d180408 100644 Binary files a/man/figures/README-peak_coverage-1.png and b/man/figures/README-peak_coverage-1.png differ diff --git a/man/figures/README-transcript_coverage-1.png b/man/figures/README-transcript_coverage-1.png index 61ce28e..af74162 100644 Binary files a/man/figures/README-transcript_coverage-1.png and b/man/figures/README-transcript_coverage-1.png differ diff --git a/man/figures/README-transcript_coverage_tight-1.png b/man/figures/README-transcript_coverage_tight-1.png index 2189a8d..81e72bc 100644 Binary files a/man/figures/README-transcript_coverage_tight-1.png and b/man/figures/README-transcript_coverage_tight-1.png differ diff --git a/man/geom_feature.Rd b/man/geom_feature.Rd index 1cdb6d7..a22b6b8 100644 --- a/man/geom_feature.Rd +++ b/man/geom_feature.Rd @@ -34,12 +34,27 @@ Add Feature Annotation to Coverage Plot. 
} \examples{ # library(ggcoverage) -# coverage.file <- system.file("extdata", "Proteomics", "MS_BSA_coverage.xlsx", package = "ggcoverage") -# fasta.file <- system.file("extdata", "Proteomics", "MS_BSA_coverage.fasta", package = "ggcoverage") +# coverage.file <- system.file( +# "extdata", "Proteomics", "MS_BSA_coverage.xlsx", package = "ggcoverage" +# ) +# fasta.file <- system.file( +# "extdata", "Proteomics", "MS_BSA_coverage.fasta", package = "ggcoverage" +# ) # protein.id = "sp|P02769|ALBU_BOVIN" -# protein.coverage = ggprotein(coverage.file = coverage.file, fasta.file = fasta.file, protein.id = protein.id) -# feature.df = data.frame(ProteinID = protein.id, start = c(1, 19, 25), end = c(18, 24, 607), -# Type = c("Signal", "Propeptide", "Chain")) +# protein.coverage = ggprotein( +# coverage.file = coverage.file, +# fasta.file = fasta.file, +# protein.id = protein.id +# ) +# feature.df = data.frame( +# ProteinID = protein.id, +# start = c(1, 19, 25), +# end = c(18, 24, 607), +# Type = c("Signal", "Propeptide", "Chain") +# ) # protein.coverage + -# geom_feature(feature.df = feature.df, feature.color = c("#4d81be","#173b5e","#6a521d")) +# geom_feature( +# feature.df = feature.df, +# feature.color = c("#4d81be","#173b5e","#6a521d") +# ) } diff --git a/man/geom_protein.Rd b/man/geom_protein.Rd index 5577086..3f8e370 100644 --- a/man/geom_protein.Rd +++ b/man/geom_protein.Rd @@ -17,7 +17,7 @@ geom_protein( mark.color = "red", mark.alpha = 0.5, show.table = TRUE, - table.position = c("right_top", "left_top", "left_bottom", "right_bottom"), + table.position = c("top_right", "top_left", "bottom_right", "bottom_left"), table.size = 4, table.color = "black", range.size = 3, @@ -52,7 +52,7 @@ geom_protein( \item{table.position}{The position of the coverage summary table, choose from right_top, left_top, left_bottom, right_bottom. Default: right_top.} -\item{table.size}{The font size of coverage summary table. 
Default: 4.} +\item{table.size}{The font size of coverage summary table. Default: 12.} \item{table.color}{The font color of coverage summary table. Default: black.} @@ -70,8 +70,12 @@ Layer for Protein Coverage Plot. \examples{ library(ggplot2) library(ggcoverage) -coverage.file <- system.file("extdata", "Proteomics", "MS_BSA_coverage.xlsx", package = "ggcoverage") -fasta.file <- system.file("extdata", "Proteomics", "MS_BSA_coverage.fasta", package = "ggcoverage") +coverage.file <- system.file( + "extdata", "Proteomics", "MS_BSA_coverage.xlsx", package = "ggcoverage" +) +fasta.file <- system.file( + "extdata", "Proteomics", "MS_BSA_coverage.fasta", package = "ggcoverage" +) protein.id = "sp|P02769|ALBU_BOVIN" ggplot() + geom_protein(coverage.file = coverage.file, fasta.file = fasta.file, protein.id = protein.id) diff --git a/man/ggprotein.Rd b/man/ggprotein.Rd index ffc1b45..62022f8 100644 --- a/man/ggprotein.Rd +++ b/man/ggprotein.Rd @@ -17,12 +17,11 @@ ggprotein( mark.color = "red", mark.alpha = 0.5, show.table = TRUE, - table.position = c("right_top", "left_top", "left_bottom", "right_bottom"), - table.size = 4, + table.position = c("top_right", "top_left", "bottom_left", "bottom_right"), + table.size = 10, table.color = "black", range.size = 3, - range.position = c("in", "out"), - plot.space = 0.2 + range.position = c("in", "out") ) } \arguments{ @@ -69,9 +68,18 @@ A ggplot2 object. Create Mass Spectrometry Protein Coverage Plot. 
} \examples{ -# library(ggcoverage) -# coverage.file <- system.file("extdata", "Proteomics", "MS_BSA_coverage.xlsx", package = "ggcoverage") -# fasta.file <- system.file("extdata", "Proteomics", "MS_BSA_coverage.fasta", package = "ggcoverage") -# protein.id = "sp|P02769|ALBU_BOVIN" -# ggprotein(coverage.file = coverage.file, fasta.file = fasta.file, protein.id = protein.id) +library(ggcoverage) +coverage.file <- system.file( + "extdata", "Proteomics", "MS_BSA_coverage.xlsx", package = "ggcoverage" +) +fasta.file <- system.file( + "extdata", "Proteomics", "MS_BSA_coverage.fasta", package = "ggcoverage" +) +protein.id = "sp|P02769|ALBU_BOVIN" + +ggprotein( + coverage.file = coverage.file, + fasta.file = fasta.file, + protein.id = protein.id +) } diff --git a/vignettes/ggcoverage.Rmd b/vignettes/ggcoverage.Rmd index 43c0cc9..6ffa104 100644 --- a/vignettes/ggcoverage.Rmd +++ b/vignettes/ggcoverage.Rmd @@ -26,380 +26,453 @@ BiocStyle::markdown() ``` ```{r setup, echo=FALSE, warning=FALSE} -library(knitr) -htmltools::tagList(rmarkdown::html_dependency_font_awesome()) - -# set dpi knitr::opts_chunk$set( collapse = TRUE, comment = "#>", - dpi=60 + dpi = 60, + fig.path = "png/", + crop = NULL ) ``` -# Getting started +## Introduction + +The goal of `ggcoverage` is to simplify the process of visualizing omics coverage. It contains three main parts: + +* **Load the data**: `ggcoverage` can load BAM, BigWig (.bw), BedGraph, txt/xlsx files from various omics data, including WGS, RNA-seq, ChIP-seq, ATAC-seq, proteomics, etc. +* **Create omics coverage plot** +* **Add annotations**: `ggcoverage` supports ten different annotations: + * **base and amino acid annotation**: Visualize genome coverage at single-nucleotide level with bases and amino acids. 
+ * **GC annotation**: Visualize genome coverage with GC content + * **CNV annotation**: Visualize genome coverage with copy number variation (CNV) + * **gene annotation**: Visualize genome coverage across genes + * **transcription annotation**: Visualize genome coverage across different transcripts + * **ideogram annotation**: Visualize the region showing on whole chromosome + * **peak annotation**: Visualize genome coverage and peak identified + * **contact map annotation**: Visualize genome coverage with Hi-C contact map + * **link annotation**: Visualize genome coverage with contacts + * **protein feature annotation**: Visualize protein coverage with features + +`ggcoverage` utilizes `ggplot2` plotting system, so its usage is **ggplot2-style**! + + +## Installation `ggcoverage` is an R package distributed as part of the [CRAN](https://cran.r-project.org/). -To install the package, start R and enter: +To install the package, start R and enter one of the following commands: ```{r install, eval=FALSE} -# install via CRAN (v0.7.1) # old version, it's better to install via Github +# install via CRAN (not yet available) install.packages("ggcoverage") -# install via Github (v1.2.0) -# install.package("remotes") #In case you have not installed it. -# BiocManager::install("areyesq89/GenomeMatrix") # In case of possible dependency error +# OR install via Github +install.packages("remotes") remotes::install_github("showteeth/ggcoverage") ``` In general, it is **recommended** to install from [Github repository](https://github.com/showteeth/ggcoverage) (update more timely). -Once `ggcoverage` is installed, it can be loaded by the following command. 
+Once `ggcoverage` is installed, it can be loaded (together with other libraries) like this: ```{r library, message=FALSE, warning=FALSE} library("rtracklayer") -library("graphics") library("ggcoverage") library("ggpattern") ``` -# Introduction - +## Manual - The goal of `ggcoverage` is simplify the process of visualizing omics coverage. It contains three main parts: +`ggcoverage` provides two [vignettes](https://showteeth.github.io/ggcoverage/): + +* **detailed manual**: step-by-step usage +* **customize the plot**: customize the plot and add additional layer -* **Load the data**: `ggcoverage` can load `BAM`, `BigWig (.bw)`, `BedGraph`, `txt/xlsx` files from various omics data, including WGS, RNA-seq, ChIP-seq, ATAC-seq, proteomics, et al. -* **Create omics coverage plot** -* **Add annotations**: `ggcoverage` supports six different annotations: - * **base and amino acid annotation**: Visualize genome coverage at single-nucleotide level with bases and amino acids. - * **GC annotation**: Visualize genome coverage with GC content - * **CNV annotation**: Visualize genome coverage with copy number variation (CNV) - * **gene annotation**: Visualize genome coverage across genes - * **transcription annotation**: Visualize genome coverage across different transcripts - * **ideogram annotation**: Visualize the region showing on whole chromosome - * **peak annotation**: Visualize genome coverage and peak identified - * **contact map annotation**: Visualize genome coverage with Hi-C contact map - * **link annotation**: Visualize genome coverage with contacts - * **peotein feature annotation**: Visualize protein coverage with features -------------- +## RNA-seq data + +### Load the data -# RNA-seq data -## Load the data The RNA-seq data used here are from [Transcription profiling by high throughput sequencing of HNRNPC knockdown and control HeLa cells](https://bioconductor.org/packages/release/data/experiment/html/RNAseqData.HNRNPC.bam.chr14.html), we select four sample to use 
as example: ERR127307_chr14, ERR127306_chr14, ERR127303_chr14, ERR127302_chr14, and all bam files are converted to bigwig file with [deeptools](https://deeptools.readthedocs.io/en/develop/). Load metadata: + ```{r load_metadata} # load metadata -meta.file <- system.file("extdata", "RNA-seq", "meta_info.csv", package = "ggcoverage") -sample.meta = read.csv(meta.file) -sample.meta +meta_file <- + system.file("extdata", "RNA-seq", "meta_info.csv", package = "ggcoverage") +sample_meta <- read.csv(meta_file) +sample_meta ``` Load track files: + ```{r load_track} # track folder -track.folder = system.file("extdata", "RNA-seq", package = "ggcoverage") +track_folder <- system.file("extdata", "RNA-seq", package = "ggcoverage") # load bigwig file -track.df = LoadTrackFile(track.folder = track.folder, format = "bw", - region = "chr14:21,677,306-21,737,601", extend = 2000, - meta.info = sample.meta) +track_df <- LoadTrackFile( + track.folder = track_folder, + format = "bw", + region = "chr14:21,677,306-21,737,601", + extend = 2000, + meta.info = sample_meta +) # check data -head(track.df) +head(track_df) ``` Prepare mark region: + ```{r prepare_mark} # create mark region -mark.region=data.frame(start=c(21678900,21732001,21737590), - end=c(21679900,21732400,21737650), - label=c("M1", "M2", "M3")) +mark_region <- data.frame( + start = c(21678900, 21732001, 21737590), + end = c(21679900, 21732400, 21737650), + label = c("M1", "M2", "M3") +) # check data -mark.region +mark_region ``` -------------- +### Load GTF -## Load GTF To add **gene annotation**, the gtf file should contain **gene_type** and **gene_name** attributes in **column 9**; to add **transcript annotation**, the gtf file should contain **transcript_name** attribute in **column 9**. 
+ ```{r load_gtf} -gtf.file = system.file("extdata", "used_hg19.gtf", package = "ggcoverage") -gtf.gr = rtracklayer::import.gff(con = gtf.file, format = 'gtf') +gtf_file <- + system.file("extdata", "used_hg19.gtf", package = "ggcoverage") +gtf_gr <- rtracklayer::import.gff(con = gtf_file, format = "gtf") ``` -------------- +### Basic coverage -## Basic coverage The basic coverage plot has **two types**: * **facet**: Create subplot for every track (specified by `facet.key`). This is default. * **joint**: Visualize all tracks in a single plot. #### joint view + Create line plot for **every sample** (`facet.key = "Type"`) and color by **every sample** (`group.key = "Type"`): -```{r basic_coverage_joint, eval=FALSE} -basic.coverage = ggcoverage(data = track.df, - plot.type = "joint", facet.key = "Type", group.key = "Type", - mark.region = mark.region, range.position = "out") -basic.coverage -``` -```{r basic_coverage_joint_plot, echo=FALSE, fig.height = 4, fig.width = 12, fig.align = "center"} -knitr::include_graphics("../man/figures/README-basic_coverage_joint-1.png") +```{r basic_coverage_joint, warning=FALSE, fig.height = 4, fig.width = 12, fig.align = "center"} +basic_coverage <- ggcoverage( + data = track_df, + plot.type = "joint", + facet.key = "Type", + group.key = "Type", + mark.region = mark_region, + range.position = "out" +) + +basic_coverage ``` Create **group average line plot** (sample is indicated by `facet.key = "Type"`, group is indicated by `group.key = "Group"`): -```{r basic_coverage_joint_avg, eval=FALSE} -basic.coverage = ggcoverage(data = track.df, - plot.type = "joint", facet.key = "Type", group.key = "Group", - joint.avg = TRUE, - mark.region = mark.region, range.position = "out") -basic.coverage -``` -```{r basic_coverage_joint_avg_plot, echo=FALSE, fig.height = 4, fig.width = 12, fig.align = "center"} -knitr::include_graphics("../man/figures/README-basic_coverage_joint_avg-1.png") -``` +```{r basic_coverage_joint_avg, warning=FALSE, 
fig.height = 4, fig.width = 12, fig.align = "center"} +basic_coverage <- ggcoverage( + data = track_df, + plot.type = "joint", + facet.key = "Type", + group.key = "Group", + joint.avg = TRUE, + mark.region = mark_region, + range.position = "out" +) -#### facet view -```{r basic_coverage, eval=FALSE} -basic.coverage = ggcoverage(data = track.df, plot.type = "facet", - mark.region = mark.region, range.position = "out") -basic.coverage +basic_coverage ``` -```{r basic_coverage_plot, echo=FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} -knitr::include_graphics("../man/figures/README-basic_coverage-1.png") +#### Facet view + +```{r basic_coverage, warning=FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} +basic_coverage <- ggcoverage( + data = track_df, + plot.type = "facet", + mark.region = mark_region, + range.position = "out" +) + +basic_coverage ``` #### Custom Y-axis style + **Change the Y-axis scale label in/out of plot region with `range.position`**: -```{r basic_coverage_2, eval=FALSE} -basic.coverage = ggcoverage(data = track.df, plot.type = "facet", - mark.region = mark.region, range.position = "in") -basic.coverage -``` -```{r basic_coverage_2_plot, echo=FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} -knitr::include_graphics("../man/figures/README-basic_coverage_2-1.png") +```{r basic_coverage_2, warning=FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} +basic_coverage <- ggcoverage( + data = track_df, + plot.type = "facet", + mark.region = mark_region, + range.position = "in" +) + +basic_coverage ``` **Shared/Free Y-axis scale with `facet.y.scale`**: -```{r basic_coverage_3, eval=FALSE} -basic.coverage = ggcoverage(data = track.df, plot.type = "facet", - mark.region = mark.region, range.position = "in", - facet.y.scale = "fixed") -basic.coverage -``` -```{r basic_coverage_3_plot, echo=FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} 
-knitr::include_graphics("../man/figures/README-basic_coverage_3-1.png") -``` +```{r basic_coverage_3, warning=FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} +basic_coverage <- ggcoverage( + data = track_df, + plot.type = "facet", + mark.region = mark_region, + range.position = "in", + facet.y.scale = "fixed" +) -------------- +basic_coverage +``` -## Add gene annotation +### Add gene annotation -```{r gene_coverage, eval=FALSE} -basic.coverage + - geom_gene(gtf.gr=gtf.gr) +```{r gene_coverage, warning=FALSE, fig.height = 8, fig.width = 12, fig.align = "center"} +basic_coverage + + geom_gene(gtf.gr = gtf_gr) ``` -```{r gene_coverage_plot, echo=FALSE, fig.height = 8, fig.width = 12, fig.align = "center"} -knitr::include_graphics("../man/figures/README-gene_coverage-1.png") -``` -------------- +### Add transcript annotation -## Add transcript annotation **In "loose" stype (default style; each transcript occupies one line)**: -```{r transcript_coverage, eval=FALSE} -basic.coverage + - geom_transcript(gtf.gr=gtf.gr,label.vjust = 1.5) -``` -```{r transcript_coverage_plot, echo=FALSE, fig.height = 12, fig.width = 12, fig.align = "center"} -knitr::include_graphics("../man/figures/README-transcript_coverage-1.png") +```{r transcript_coverage, warning=FALSE, fig.height = 12, fig.width = 12, fig.align = "center"} +basic_coverage + + geom_transcript(gtf.gr = gtf_gr, label.vjust = 1.5) ``` **In "tight" style (place non-overlap transcripts in one line)**: -```{r transcript_coverage_tight, eval=FALSE} -basic.coverage + - geom_transcript(gtf.gr=gtf.gr, overlap.style = "tight", label.vjust = 1.5) -``` -```{r transcript_coverage_tight_plot, echo=FALSE, fig.height = 12, fig.width = 12, fig.align = "center"} -knitr::include_graphics("../man/figures/README-transcript_coverage_tight-1.png") +```{r transcript_coverage_tight, warning=FALSE, fig.height = 12, fig.width = 12, fig.align = "center"} +basic_coverage + + geom_transcript(gtf.gr = gtf_gr, + overlap.style = "tight", 
+ label.vjust = 1.5) ``` -------------- +### Add ideogram -## Add ideogram -```{r ideogram_coverage_1, eval=FALSE} -basic.coverage + - geom_gene(gtf.gr=gtf.gr) + - geom_ideogram(genome = "hg19",plot.space = 0) +```{r ideogram_coverage_1, eval = FALSE} +basic_coverage + + geom_gene(gtf.gr = gtf_gr) + + geom_ideogram(genome = "hg19", plot.space = 0) ``` -```{r ideogram_coverage_1_plot, echo=FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} +```{r ideogram_coverage_1_plot, echo = FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} knitr::include_graphics("../man/figures/README-ideogram_coverage_1-1.png") ``` -```{r ideogram_coverage_2, eval=FALSE} -basic.coverage + - geom_transcript(gtf.gr=gtf.gr,label.vjust = 1.5) + - geom_ideogram(genome = "hg19",plot.space = 0) +```{r ideogram_coverage_2, eval = FALSE} +basic_coverage + + geom_transcript(gtf.gr = gtf_gr, label.vjust = 1.5) + + geom_ideogram(genome = "hg19", plot.space = 0) ``` -```{r ideogram_coverage_2_plot, echo=FALSE, fig.height = 14, fig.width = 12, fig.align = "center"} +```{r ideogram_coverage_2_plot, echo = FALSE, fig.height = 14, fig.width = 12, fig.align = "center"} knitr::include_graphics("../man/figures/README-ideogram_coverage_2-1.png") ``` -------------- +## DNA-seq data + +### CNV + +#### Example 1 + +##### Load the data -# DNA-seq data -## CNV -### Example 1 -#### Load the data The DNA-seq data used here are from [Copy number work flow](http://bioconductor.org/help/course-materials/2014/SeattleOct2014/B02.2.3_CopyNumber.html), we select tumor sample, and get bin counts with `cn.mops::getReadCountsFromBAM` with `WL` 1000. 
```{r load_bin_counts} # prepare metafile -cnv.meta.info = data.frame( +cnv_meta_info <- data.frame( SampleName = c("CNV_example"), Type = c("tumor"), Group = c("tumor") ) + # track file -track.file = system.file("extdata", "DNA-seq", "CNV_example.txt", package = "ggcoverage") +track_file <- system.file("extdata", + "DNA-seq", "CNV_example.txt", package = "ggcoverage") + # load txt file -track.df = LoadTrackFile(track.file = track.file, format = "txt", region = "chr4:61750000-62,700,000", - meta.info = cnv.meta.info) +track_df <- LoadTrackFile( + track.file = track_file, + format = "txt", + region = "chr4:61750000-62,700,000", + meta.info = cnv_meta_info +) + # check data -head(track.df) +head(track_df) ``` -#### Basic coverage -```{r basic_coverage_dna, eval=FALSE} -basic.coverage = ggcoverage(data = track.df,color = "grey", mark.region = NULL, - range.position = "out") -basic.coverage -``` +##### Basic coverage -```{r basic_coverage_dna_plot, echo=FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} -knitr::include_graphics("../man/figures/README-basic_coverage_dna-1.png") +```{r basic_coverage_dna, warning=FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} +basic_coverage <- ggcoverage( + data = track_df, + color = "grey", + mark.region = NULL, + range.position = "out" +) + +basic_coverage ``` -#### Add annotations -Add **GC**, **ideogram** and **gene** annotations. +##### Add GC annotations + +Add **GC**, **ideogram** and **gene** annotations. 
-```{r gc_coverage, eval=FALSE} +```{r gc_coverage, eval = FALSE} # load genome data library("BSgenome.Hsapiens.UCSC.hg19") + # create plot -basic.coverage + - geom_gc(bs.fa.seq=BSgenome.Hsapiens.UCSC.hg19) + - geom_gene(gtf.gr=gtf.gr) + +basic_coverage + + geom_gc(bs.fa.seq = BSgenome.Hsapiens.UCSC.hg19) + + geom_gene(gtf.gr = gtf_gr) + geom_ideogram(genome = "hg19") ``` -```{r gc_coverage_plot, echo=FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} + +```{r gc_coverage_plot, echo = FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} knitr::include_graphics("../man/figures/README-gc_coverage-1.png") ``` +#### Example 2 + +##### Load the data -### Example 2 -#### Load the data The DNA-seq data used here are from [Genome-wide copy number analysis of single cells](https://www.nature.com/articles/nprot.2012.039), and the accession number is [SRR054616](https://trace.ncbi.nlm.nih.gov/Traces/index.html?run=SRR054616). ```{r cnv_load_track_file} # track file -track.file <- system.file("extdata", "DNA-seq", "SRR054616.bw", package = "ggcoverage") +track_file <- + system.file("extdata", "DNA-seq", "SRR054616.bw", package = "ggcoverage") + # load track -track.df = LoadTrackFile(track.file = track.file, format = "bw", region = "4:1-160000000") +track_df <- LoadTrackFile(track.file = track_file, + format = "bw", + region = "4:1-160000000") + # add chr prefix -track.df$seqnames = paste0("chr", track.df$seqnames) +track_df$seqnames <- paste0("chr", track_df$seqnames) + # check data -head(track.df) +head(track_df) ``` -#### Basic coverage -```{r cnv_basic_coverage_dna, eval=FALSE} -basic.coverage = ggcoverage(data = track.df, color = "grey", - mark.region = NULL, range.position = "out") -basic.coverage -``` +##### Basic coverage + +```{r cnv_basic_coverage_dna} +basic_coverage <- ggcoverage( + data = track_df, + color = "grey", + mark.region = NULL, + range.position = "out" +) -```{r cnv_basic_coverage_dna_plot, echo=FALSE, fig.height = 6, fig.width = 12, 
fig.align = "center"} -knitr::include_graphics("../man/figures/README-cnv_basic_coverage_dna-1.png") +basic_coverage ``` -#### Load CNV file +##### Load CNV file + ```{r cnv_load_cnv} # prepare files -cnv.file <- system.file("extdata", "DNA-seq", "SRR054616_copynumber.txt", package = "ggcoverage") +cnv_file <- + system.file("extdata", "DNA-seq", "SRR054616_copynumber.txt", + package = "ggcoverage") + # read CNV -cnv.df = read.table(file = cnv.file, sep = "\t", header = TRUE) +cnv_df <- read.table(file = cnv_file, sep = "\t", header = TRUE) + # check data -head(cnv.df) +head(cnv_df) ``` -#### Add annotations +##### Add annotations + Add **GC**, **ideogram** and **CNV** annotations. ```{r cnv_gc_coverage, eval=FALSE} -# load genome data -library("BSgenome.Hsapiens.UCSC.hg19") # create plot -basic.coverage + - geom_gc(bs.fa.seq=BSgenome.Hsapiens.UCSC.hg19) + - geom_cnv(cnv.df = cnv.df, bin.col = 3, cn.col = 4) + - geom_ideogram(genome = "hg19",plot.space = 0, highlight.centromere = TRUE) -``` - -```{r cnv_gc_coverage_plot, echo=FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} +basic_coverage + + geom_gc(bs.fa.seq = BSgenome.Hsapiens.UCSC.hg19) + + geom_cnv(cnv.df = cnv_df, + bin.col = 3, + cn.col = 4) + + geom_ideogram( + genome = "hg19", + plot.space = 0, + highlight.centromere = TRUE + ) +``` + +```{r cnv_gc_coverage_plot, echo = FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} knitr::include_graphics("../man/figures/README-cnv_gc_coverage-1.png") ``` ---------------------- -## Single-nucleotide level -### Load the data +### Single-nucleotide level + +#### Load the data + ```{r load_single_nuc} # prepare sample metadata -sample.meta <- data.frame( +sample_meta <- data.frame( SampleName = c("tumorA.chr4.selected"), Type = c("tumorA"), Group = c("tumorA") ) + # load bam file -bam.file = system.file("extdata", "DNA-seq", "tumorA.chr4.selected.bam", package = "ggcoverage") -track.df <- LoadTrackFile( - track.file = bam.file, - meta.info = 
sample.meta, - single.nuc=TRUE, single.nuc.region="chr4:62474235-62474295" +bam_file <- system.file("extdata", + "DNA-seq", "tumorA.chr4.selected.bam", + package = "ggcoverage") + +track_df <- LoadTrackFile( + track.file = bam_file, + meta.info = sample_meta, + single.nuc = TRUE, + single.nuc.region = "chr4:62474235-62474295" ) -head(track.df) + +head(track_df) ``` -### Default color scheme +#### Default color scheme + For base and amino acid annotation, we have following default color schemes, you can change with `nuc.color` and `aa.color` parameters. Default color scheme for base annotation is `Clustal-style`, more popular color schemes is available [here](https://www.biostars.org/p/171056/). + ```{r base_color_scheme, warning=FALSE, fig.height = 2, fig.width = 6, fig.align = "center"} # color scheme -nuc.color = c("A" = "#ff2b08", "C" = "#009aff", "G" = "#ffb507", "T" = "#00bc0d") -opar <- graphics::par() +nuc_color <- c( + "A" = "#ff2b08", "C" = "#009aff", "G" = "#ffb507", "T" = "#00bc0d" +) +opar <- graphics::par() + # create plot graphics::par(mar = c(1, 5, 1, 1)) graphics::image( - 1:length(nuc.color), 1, as.matrix(1:length(nuc.color)), - col = nuc.color, - xlab = "", ylab = "", xaxt = "n", yaxt = "n", bty = "n" + seq_along(nuc_color), + 1, + as.matrix(seq_along(nuc_color)), + col = nuc_color, + xlab = "", + ylab = "", + xaxt = "n", + yaxt = "n", + bty = "n" ) -graphics::text(1:length(nuc.color), 1, names(nuc.color)) +graphics::text(seq_along(nuc_color), 1, names(nuc_color)) graphics::mtext( - text = "Base", adj = 1, las = 1, + text = "Base", + adj = 1, + las = 1, side = 2 ) @@ -408,24 +481,36 @@ graphics::par(opar) ``` Default color scheme for amino acid annotation is from [Residual colours: a proposal for aminochromography](https://academic.oup.com/peds/article/10/7/743/1593029?login=false): + ```{r aa_color_scheme, warning=FALSE, fig.height = 9, fig.width = 10, fig.align = "center"} -aa.color = c( - "D" = "#FF0000", "S" = "#FF2400", "T" = "#E34234", "G" = 
"#FF8000", "P" = "#F28500", - "C" = "#FFFF00", "A" = "#FDFF00", "V" = "#E3FF00", "I" = "#C0FF00", "L" = "#89318C", - "M" = "#00FF00", "F" = "#50C878", "Y" = "#30D5C8", "W" = "#00FFFF", "H" = "#0F2CB3", - "R" = "#0000FF", "K" = "#4b0082", "N" = "#800080", "Q" = "#FF00FF", "E" = "#8F00FF", - "*" = "#FFC0CB", " " = "#FFFFFF", " " = "#FFFFFF", " " = "#FFFFFF", " " = "#FFFFFF" +aa_color <- c( + "D" = "#FF0000", "S" = "#FF2400", "T" = "#E34234", "G" = "#FF8000", + "P" = "#F28500", "C" = "#FFFF00", "A" = "#FDFF00", "V" = "#E3FF00", + "I" = "#C0FF00", "L" = "#89318C", "M" = "#00FF00", "F" = "#50C878", + "Y" = "#30D5C8", "W" = "#00FFFF", "H" = "#0F2CB3", "R" = "#0000FF", + "K" = "#4b0082", "N" = "#800080", "Q" = "#FF00FF", "E" = "#8F00FF", + "*" = "#FFC0CB", " " = "#FFFFFF", " " = "#FFFFFF", " " = "#FFFFFF", + " " = "#FFFFFF" ) graphics::par(mar = c(1, 5, 1, 1)) graphics::image( - 1:5, 1:5, matrix(1:length(aa.color),nrow=5), - col = rev(aa.color), - xlab = "", ylab = "", xaxt = "n", yaxt = "n", bty = "n" + 1:5, + 1:5, + matrix(seq_along(aa_color), nrow = 5), + col = rev(aa_color), + xlab = "", + ylab = "", + xaxt = "n", + yaxt = "n", + bty = "n" ) -graphics::text(expand.grid(1:5,1:5), names(rev(aa.color))) + +graphics::text(expand.grid(1:5, 1:5), names(rev(aa_color))) graphics::mtext( - text = "Amino acids", adj = 1, las = 1, + text = "Amino acids", + adj = 1, + las = 1, side = 2 ) @@ -433,290 +518,341 @@ graphics::mtext( graphics::par(opar) ``` -### Add base and amino acid annotation +#### Add base and amino acid annotation + **Use twill to mark position with SNV**: -```{r base_aa_coverage, eval=FALSE} -library(ggpattern) + +```{r base_aa_coverage, eval =FALSE} # create plot with twill mark -ggcoverage(data = track.df, color = "grey", range.position = "out", - single.nuc=T, rect.color = "white") + - geom_base(bam.file = bam.file, +ggcoverage( + data = track_df, + color = "grey", + range.position = "out", + single.nuc = TRUE, + rect.color = "white" +) + + geom_base(bam.file 
= bam_file, bs.fa.seq = BSgenome.Hsapiens.UCSC.hg19, mark.type = "twill") + - geom_ideogram(genome = "hg19",plot.space = 0) + geom_ideogram(genome = "hg19", plot.space = 0) ``` -```{r base_aa_coverage_plot, echo=FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} + +```{r base_aa_coverage_plot, echo = FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} knitr::include_graphics("../man/figures/README-base_aa_coverage-1.png") ``` **Use star to mark position with SNV**: -```{r base_aa_coverage_star, eval=FALSE} + +```{r base_aa_coverage_star, eval = FALSE} # create plot with star mark -ggcoverage(data = track.df, color = "grey", range.position = "out", - single.nuc=T, rect.color = "white") + - geom_base(bam.file = bam.file, +ggcoverage( + data = track_df, + color = "grey", + range.position = "out", + single.nuc = TRUE, + rect.color = "white" +) + + geom_base(bam.file = bam_file, bs.fa.seq = BSgenome.Hsapiens.UCSC.hg19, mark.type = "star") + - geom_ideogram(genome = "hg19",plot.space = 0) + geom_ideogram(genome = "hg19", plot.space = 0) ``` -```{r base_aa_coverage_star_plot, echo=FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} + +```{r base_aa_coverage_star_plot, echo = FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} knitr::include_graphics("../man/figures/README-base_aa_coverage_star-1.png") ``` **Highlight position with SNV**: -```{r base_aa_coverage_highlight, eval=FALSE} -# highlight -ggcoverage(data = track.df, color = "grey", range.position = "out", - single.nuc=T, rect.color = "white") + - geom_base(bam.file = bam.file, + +```{r base_aa_coverage_highlight, eval = FALSE} +# highlight one base +ggcoverage( + data = track_df, + color = "grey", + range.position = "out", + single.nuc = TRUE, + rect.color = "white" +) + + geom_base(bam.file = bam_file, bs.fa.seq = BSgenome.Hsapiens.UCSC.hg19, mark.type = "highlight") + - geom_ideogram(genome = "hg19",plot.space = 0) + geom_ideogram(genome = "hg19", plot.space = 0) ``` 
-```{r base_aa_coverage_highlight_plot, echo=FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} + +```{r base_aa_coverage_highlight_plot, echo = FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} knitr::include_graphics("../man/figures/README-base_aa_coverage_highlight-1.png") ``` ---------------------- +## ChIP-seq data -# ChIP-seq data The ChIP-seq data used here are from [DiffBind](https://bioconductor.org/packages/release/bioc/html/DiffBind.html), I select four sample to use as example: Chr18_MCF7_input, Chr18_MCF7_ER_1, Chr18_MCF7_ER_3, Chr18_MCF7_ER_2, and all bam files are converted to bigwig file with [deeptools](https://deeptools.readthedocs.io/en/develop/). Create metadata: + ```{r load_metadata_chip} # load metadata -sample.meta = data.frame(SampleName=c('Chr18_MCF7_ER_1','Chr18_MCF7_ER_2','Chr18_MCF7_ER_3','Chr18_MCF7_input'), - Type = c("MCF7_ER_1","MCF7_ER_2","MCF7_ER_3","MCF7_input"), - Group = c("IP", "IP", "IP", "Input")) -sample.meta +sample_meta <- data.frame( + SampleName = c( + "Chr18_MCF7_ER_1", + "Chr18_MCF7_ER_2", + "Chr18_MCF7_ER_3", + "Chr18_MCF7_input" + ), + Type = c("MCF7_ER_1", "MCF7_ER_2", "MCF7_ER_3", "MCF7_input"), + Group = c("IP", "IP", "IP", "Input") +) + +sample_meta ``` Load track files: + ```{r load_track_chip} # track folder -track.folder = system.file("extdata", "ChIP-seq", package = "ggcoverage") +track_folder <- system.file("extdata", "ChIP-seq", package = "ggcoverage") + # load bigwig file -track.df = LoadTrackFile(track.folder = track.folder, format = "bw", region = "chr18:76822285-76900000", - meta.info = sample.meta) +track_df <- LoadTrackFile( + track.folder = track_folder, + format = "bw", + region = "chr18:76822285-76900000", + meta.info = sample_meta +) + # check data -head(track.df) +head(track_df) ``` Prepare mark region: + ```{r prepare_mark_chip} # create mark region -mark.region=data.frame(start=c(76822533), - end=c(76823743), - label=c("Promoter")) +mark_region <- data.frame( + start = 
c(76822533), + end = c(76823743), + label = c("Promoter") +) + # check data -mark.region +mark_region ``` -------------- - -## Basic coverage -```{r basic_coverage_chip, eval=FALSE} -basic.coverage = ggcoverage(data = track.df, - mark.region=mark.region, show.mark.label = FALSE) -basic.coverage -``` +### Basic coverage -```{r basic_coverage_chip_plot, echo=FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} -knitr::include_graphics("../man/figures/README-basic_coverage_chip-1.png") +```{r basic_coverage_chip, warning=FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} +basic_coverage <- ggcoverage(data = track_df, + mark.region = mark_region, + show.mark.label = FALSE) +basic_coverage ``` -------------- - -## Get consensus peaks -Before create peak annotation, we first **get consensus peaks** from replicates with [MSPC](https://github.com/Genometric/MSPC). -```{r consensus_peaks} -# load peak file -peak.file <- system.file("extdata", "ChIP-seq", "consensus.peak", package = "ggcoverage") -# get consensus peak (do nothing when there is only one file) -# notice: this step requires MSPC, specific the installation path with mspc.path -peak.df <- GetConsensusPeak(peak.file = peak.file) -``` +### Add annotations +Add **gene**, **ideogram** and **peak** annotations. To create peak annotation, we first **get consensus peaks** with [MSPC](https://github.com/Genometric/MSPC). 
-## Add annotations -Add **gene**, **ideogram** and **peak** annotations: -```{r peak_coverage, eval=FALSE} +```{r peak_coverage, eval = FALSE} # get consensus peak file -peak.file = system.file("extdata", "ChIP-seq", "consensus.peak", package = "ggcoverage") -# create with peak file -basic.coverage + - geom_gene(gtf.gr=gtf.gr) + - geom_peak(bed.file = peak.file) + - geom_ideogram(genome = "hg19",plot.space = 0) +peak_file <- system.file("extdata", + "ChIP-seq", + "consensus.peak", + package = "ggcoverage") -# create with peak dataframe -basic.coverage + - geom_gene(gtf.gr=gtf.gr) + - geom_peak(peak.df = peak.df) + - geom_ideogram(genome = "hg19",plot.space = 0) +basic_coverage + + geom_gene(gtf.gr = gtf_gr) + + geom_peak(bed.file = peak_file) + + geom_ideogram(genome = "hg19", plot.space = 0) ``` -```{r peak_coverage_plot, echo=FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} +```{r peak_coverage_plot, echo = FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} knitr::include_graphics("../man/figures/README-peak_coverage-1.png") ``` ---------------------- +## Hi-C data -# Hi-C data The Hi-C data are from [pyGenomeTracks: reproducible plots for multivariate genomic datasets](https://academic.oup.com/bioinformatics/article/37/3/422/5879987?login=false). The Hi-C matrix visualization is implemented by [HiCBricks](https://github.com/koustav-pal/HiCBricks). 
-## Load track data +### Load track data + ```{r hic_track} -library(ggcoverage) -library(GenomicRanges) # prepare track dataframe -track.file = system.file("extdata", "HiC", "H3K36me3.bw", package = "ggcoverage") -track.df = LoadTrackFile(track.file = track.file, format = "bw", - region = "chr2L:8050000-8300000", extend = 0) -track.df$score = ifelse(track.df$score <0, 0, track.df$score) +track_file <- + system.file("extdata", "HiC", "H3K36me3.bw", package = "ggcoverage") + +track_df <- LoadTrackFile( + track.file = track_file, + format = "bw", + region = "chr2L:8050000-8300000", + extend = 0 +) + +track_df$score <- ifelse(track_df$score < 0, 0, track_df$score) + # check the data -head(track.df) +head(track_df) ``` -## Load Hi-C data +### Load Hi-C data + Matrix: ```{r hic_load_hic_matrix} ## matrix -hic.mat.file = system.file("extdata", "HiC", "HiC_mat.txt", package = "ggcoverage") -hic.mat = read.table(file = hic.mat.file, sep = "\t") -hic.mat = as.matrix(hic.mat) +hic_mat_file <- system.file("extdata", + "HiC", "HiC_mat.txt", package = "ggcoverage") +hic_mat <- read.table(file = hic_mat_file, sep = "\t") +hic_mat <- as.matrix(hic_mat) ``` Bin table: ```{r hic_load_hic_bin} ## bin -hic.bin.file = system.file("extdata", "HiC", "HiC_bin.txt", package = "ggcoverage") -hic.bin = read.table(file = hic.bin.file, sep = "\t") -colnames(hic.bin) = c("chr", "start", "end") -hic.bin.gr = GenomicRanges::makeGRangesFromDataFrame(df = hic.bin) +hic_bin_file <- + system.file("extdata", "HiC", "HiC_bin.txt", package = "ggcoverage") +hic_bin <- read.table(file = hic_bin_file, sep = "\t") +colnames(hic_bin) <- c("chr", "start", "end") +hic_bin_gr <- GenomicRanges::makeGRangesFromDataFrame(df = hic_bin) + ## transfrom func -FailSafe_log10 <- function(x){ +failsafe_log10 <- function(x) { x[is.na(x) | is.nan(x) | is.infinite(x)] <- 0 - return(log10(x+1)) + return(log10(x + 1)) } ``` Data transfromation method: -```{r hic_load_hic_transformation} -## transfrom func -FailSafe_log10 <- 
function(x){ - x[is.na(x) | is.nan(x) | is.infinite(x)] <- 0 - return(log10(x+1)) -} -``` +### Load link -## Load link ```{r hic_load_link} # prepare arcs -link.file = system.file("extdata", "HiC", "HiC_link.bedpe", package = "ggcoverage") +link_file <- + system.file("extdata", "HiC", "HiC_link.bedpe", package = "ggcoverage") ``` -## Basic coverage -```{r basic_coverage_hic, eval=FALSE} -basic.coverage = ggcoverage(data = track.df, color = "grey", - mark.region = NULL, range.position = "out") -basic.coverage -``` +### Basic coverage + +```{r basic_coverage_hic, warning=FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} +basic_coverage <- + ggcoverage( + data = track_df, + color = "grey", + mark.region = NULL, + range.position = "out" + ) -```{r basic_coverage_hic_plot, echo=FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} -knitr::include_graphics("../man/figures/README-basic_coverage_hic-1.png") +basic_coverage ``` -## Add annotations -Add **link**, **contact map**annotations: +### Add annotations -```{r hic_coverage, eval=FALSE} -basic.coverage + - geom_tad(matrix = hic.mat, granges = hic.bin.gr, value.cut = 0.99, - color.palette = "viridis", transform.fun = FailSafe_log10, - top = FALSE, show.rect = TRUE) + - geom_link(link.file = link.file, file.type = "bedpe", show.rect = TRUE) -``` +Add **link** and **contact map** annotations: -```{r hic_coverage_plot, echo=FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} +```{r hic_coverage, eval = FALSE} +basic_coverage + + geom_tad( + matrix = hic_mat, + granges = hic_bin_gr, + value.cut = 0.99, + color.palette = "viridis", + transform.fun = failsafe_log10, + top = FALSE, + show.rect = TRUE + ) + + geom_link(link.file = link_file, + file.type = "bedpe", + show.rect = TRUE) +``` + +```{r hic_coverage_plot, echo = FALSE, fig.height = 10, fig.width = 12, fig.align = "center"} knitr::include_graphics("../man/figures/README-hic_coverage-1.png") ``` ---------------------- +## Mass spectrometry 
protein coverage -# Mass spectrometry protein coverage [Mass spectrometry (MS) is an important method for the accurate mass determination and characterization of proteins, and a variety of methods and instrumentations have been developed for its many uses](https://en.wikipedia.org/wiki/Protein_mass_spectrometry). After MS, we can check the coverage of protein to check the quality of the data and find the reason why the segment did not appear and improve the experiment. -## Load coverage +### Load coverage + The exported coverage from [Proteome Discoverer](https://www.thermofisher.cn/cn/zh/home/industrial/mass-spectrometry/liquid-chromatography-mass-spectrometry-lc-ms/lc-ms-software/multi-omics-data-analysis/proteome-discoverer-software.html?adobe_mc=MCMID%7C90228073352279367993013412919222863692%7CMCAID%3D3208C32C269355DE-4000028116B65FEB%7CMCORGID%3D5B135A0C5370E6B40A490D44%40AdobeOrg%7CTS=1614293705): + ```{r ms_coverage_data} library(openxlsx) # prepare coverage dataframe -coverage.file <- system.file("extdata", "Proteomics", "MS_BSA_coverage.xlsx", package = "ggcoverage") -coverage.df <- openxlsx::read.xlsx(coverage.file) +coverage_file <- + system.file("extdata", + "Proteomics", "MS_BSA_coverage.xlsx", package = "ggcoverage") +coverage_df <- openxlsx::read.xlsx(coverage_file, sheet = "Sheet1") # check the data -head(coverage.df) +head(coverage_df) ``` The input protein fasta: + ```{r ms_coverage_fasta} -library(Biostrings) -fasta.file <- system.file("extdata", "Proteomics", "MS_BSA_coverage.fasta", package = "ggcoverage") +fasta_file <- + system.file("extdata", + "Proteomics", "MS_BSA_coverage.fasta", package = "ggcoverage") + # prepare track dataframe -protein.set <- Biostrings::readAAStringSet(fasta.file) +protein_set <- Biostrings::readAAStringSet(fasta_file) + # check the data -protein.set +protein_set ``` -## Protein coverage -```{r basic_coverage_protein, eval=FALSE} -protein.coverage = ggprotein(coverage.file = coverage.file, fasta.file = fasta.file, - 
protein.id = "sp|P02769|ALBU_BOVIN", range.position = "out") -protein.coverage +### Protein coverage + +```{r basic_coverage_protein, eval = FALSE} +protein_coverage <- ggprotein( + coverage.file = coverage_file, + fasta.file = fasta_file, + protein.id = "sp|P02769|ALBU_BOVIN", + range.position = "out" +) + +protein_coverage ``` -```{r basic_coverage_protein_plot, echo=FALSE, fig.height = 6, fig.width = 12, fig.align = "center"} +```{r basic_coverage_protein_plot, echo = FALSE, fig.height = 6, fig.width = 10, fig.align = "center"} knitr::include_graphics("../man/figures/README-basic_coverage_protein-1.png") ``` -## Add annotation +### Add annotation + We can obtain features of the protein from [UniProt](https://www.uniprot.org/). For example, the above protein coverage plot shows that there is empty region in 1-24, and this empty region in [UniProt](https://www.uniprot.org/uniprotkb/P02769/entry) is annotated as Signal peptide and Propeptide peptide. When the protein is mature and released extracellular, these peptides will be cleaved. This is the reason why there is empty region in 1-24. 
-```{r basic_coverage_protein_feature, eval=FALSE} +```{r basic_coverage_protein_feature, eval = FALSE} # protein feature obtained from UniProt -protein.feature.df = data.frame(ProteinID = "sp|P02769|ALBU_BOVIN", start = c(1, 19, 25), - end = c(18, 24, 607), - Type = c("Signal", "Propeptide", "Chain")) +protein_feature_df <- data.frame( + ProteinID = "sp|P02769|ALBU_BOVIN", + start = c(1, 19, 25), + end = c(18, 24, 607), + Type = c("Signal", "Propeptide", "Chain") +) + # add annotation -protein.coverage + - geom_feature(feature.df = protein.feature.df, feature.color = c("#4d81be","#173b5e","#6a521d")) +protein_coverage + + geom_feature(feature.df = protein_feature_df, + feature.color = c("#4d81be", "#173b5e", "#6a521d")) ``` -```{r basic_coverage_protein_feature_plot, echo=FALSE, fig.height = 8, fig.width = 12, fig.align = "center"} +```{r basic_coverage_protein_feature_plot, echo = FALSE, fig.height = 6, fig.width = 10, fig.align = "center"} knitr::include_graphics("../man/figures/README-basic_coverage_protein_feature-1.png") ``` +## Code of Conduct + +Please note that the `ggcoverage` project is released with a [Contributor Code of Conduct](https://contributor-covenant.org/version/2/0/CODE_OF_CONDUCT.html). By contributing to this project, you agree to abide by its terms. + --------------------- -# Session info +## Session info ```{r session} sessionInfo() ``` - - - - - - - - - -