Skip to content

Commit

Permalink
set up t2t support
Browse files Browse the repository at this point in the history
  • Loading branch information
ShixiangWang committed Dec 12, 2023
1 parent a8f0b5c commit c68bda0
Show file tree
Hide file tree
Showing 31 changed files with 3,585 additions and 81 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,5 @@ inst/extdata/COSMIC_v3.4_DBS_GRCh37.rds
inst/extdata/COSMIC_v3.4_RNA-SBS_GRCh37.rds
inst/extdata/COSMIC_v3.4_SBS_GRCh37.rds
inst/extdata/COSMIC_v3.4_SV_GRCh38.rds
inst/extdata/human_T2T_gene_info.rds
inst/extdata/ce11_gene_info.rds
8 changes: 4 additions & 4 deletions R/get.R
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ get_cnlist <- function(CopyNumber, ignore_chrs = NULL, add_index = FALSE) {

get_features_wang <- function(CN_data,
cores = 1,
genome_build = c("hg19", "hg38", "mm10", "mm9"),
genome_build = c("hg19", "hg38", "T2T", "mm10", "mm9", "ce11"),
feature_setting = sigminer::CN.features) {
genome_build <- match.arg(genome_build)
# get chromosome lengths and centromere locations
Expand Down Expand Up @@ -108,7 +108,7 @@ get_features_wang <- function(CN_data,
# Get copy number length profile ------------------------------------------

get_LengthFraction <- function(CN_data,
genome_build = c("hg19", "hg38", "mm10", "mm9"),
genome_build = c("hg19", "hg38", "T2T", "mm10", "mm9", "ce11"),
seg_cols = c("Chromosome", "Start.bp", "End.bp", "modal_cn"),
samp_col = "sample") {
stopifnot(is.list(CN_data) | is.data.frame(CN_data))
Expand Down Expand Up @@ -280,7 +280,7 @@ get_LengthFraction <- function(CN_data,

# Get arm location --------------------------------------------------------

get_ArmLocation <- function(genome_build = c("hg19", "hg38", "mm10", "mm9")) {
get_ArmLocation <- function(genome_build = c("hg19", "hg38", "T2T", "mm10", "mm9", "ce11")) {
genome_build <- match.arg(genome_build)
# get chromosome lengths and centromere locations
chrlen <- get_genome_annotation(data_type = "chr_size", genome_build = genome_build)
Expand Down Expand Up @@ -335,7 +335,7 @@ get_ArmLocation <- function(genome_build = c("hg19", "hg38", "mm10", "mm9")) {

# Get summary of copy number variation per sample ------------------------------------

get_cnsummary_sample <- function(segTab, genome_build = c("hg19", "hg38", "mm10", "mm9"),
get_cnsummary_sample <- function(segTab, genome_build = c("hg19", "hg38", "T2T", "mm10", "mm9", "ce11"),
genome_measure = c("called", "wg")) {
genome_build <- match.arg(genome_build)
genome_measure <- match.arg(genome_measure)
Expand Down
54 changes: 42 additions & 12 deletions R/get_genome_annotation.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,26 +34,56 @@
#' expect_equal(identical(df3, df4), FALSE)
#' expect_equal(identical(df5, df6), FALSE)
#' @export
get_genome_annotation <- function(data_type = c("chr_size", "centro_loc", "cytobands", "transcript"),
get_genome_annotation <- function(data_type = c("chr_size", "centro_loc", "cytobands", "transcript", "gene"),
chrs = paste0("chr", c(1:22, "X", "Y")),
genome_build = c("hg19", "hg38", "mm10", "mm9")) {
genome_build = c("hg19", "hg38", "T2T", "mm10", "mm9", "ce11")) {
data_type <- match.arg(data_type)
genome_build <- match.arg(genome_build)

prefix <- switch(data_type,
chr_size = "chromsize",
centro_loc = "centromeres",
cytobands = "cytobands",
transcript = "transcript"
chr_size = "chromsize",
centro_loc = "centromeres",
cytobands = "cytobands",
transcript = "transcript",
gene = "gene"
)

annot <- get(paste(prefix, genome_build, sep = "."), envir = tryCatch(
as.environment("package:sigminer"),
error = function(e) {
eval(parse(text = "attachNamespace('sigminer')"))
as.environment("package:sigminer")
if (prefix == "gene") {
gene_file <- switch(
genome_build,
mm9 = file.path(
system.file("extdata", package = "sigminer"),
"mouse_mm9_gene_info.rds"
),
mm10 = file.path(
system.file("extdata", package = "sigminer"),
"mouse_mm10_gene_info.rds"
),
ce11 = file.path(
system.file("extdata", package = "sigminer"),
"ce11_gene_info.rds"
),
file.path(
system.file("extdata", package = "sigminer"),
paste0("human_", genome_build, "_gene_info.rds")
)
)
ok <- TRUE
if (!file.exists(gene_file)) ok <- query_remote_data(basename(gene_file))
if (!ok) {
return(invisible(NULL))
}
))
annot <- readRDS(gene_file)
} else {
annot <- get(paste(prefix, genome_build, sep = "."), envir = tryCatch(
as.environment("package:sigminer"),
error = function(e) {
eval(parse(text = "attachNamespace('sigminer')"))
as.environment("package:sigminer")
}
))
}

res <- annot %>%
dplyr::filter(.data$chrom %in% chrs) %>%
dplyr::arrange(factor(.data$chrom, chrs))
Expand Down
6 changes: 3 additions & 3 deletions R/helper_derive_cn_features.R
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ getCN <- function(abs_profiles) {

# Number of Chromosome with CNV
getNChrV <- function(abs_profiles, genome_build = "hg38") {
genome_build <- match.arg(genome_build, choices = c("hg19", "hg38", "mm10", "mm9"))
genome_build <- match.arg(genome_build, choices = c("hg19", "hg38", "T2T", "mm10", "mm9", "ce11"))
abs_profiles <- handle_sex(abs_profiles)

if (genome_build %in% c("hg19", "hg38")) {
Expand All @@ -246,7 +246,7 @@ getNChrV <- function(abs_profiles, genome_build = "hg38") {
# The chromosome sequences (using integer as index) with copy number variation
# The count of this result represents the burden (contribution) of chromosome
getBoChr <- function(abs_profiles, genome_build = "hg38") {
genome_build <- match.arg(genome_build, choices = c("hg19", "hg38", "mm10", "mm9"))
genome_build <- match.arg(genome_build, choices = c("hg19", "hg38", "T2T", "mm10", "mm9", "ce11"))
abs_profiles <- handle_sex(abs_profiles)

if (genome_build %in% c("hg19", "hg38")) {
Expand Down Expand Up @@ -279,7 +279,7 @@ getBoChr <- function(abs_profiles, genome_build = "hg38") {

# The minimal number of chromosome with 50% CNV
getNC50 <- function(abs_profiles, genome_build = "hg38") {
genome_build <- match.arg(genome_build, choices = c("hg19", "hg38", "mm10", "mm9"))
genome_build <- match.arg(genome_build, choices = c("hg19", "hg38", "T2T", "mm10", "mm9", "ce11"))
abs_profiles <- handle_sex(abs_profiles)

if (genome_build %in% c("hg19", "hg38")) {
Expand Down
2 changes: 1 addition & 1 deletion R/read_copynumber.R
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ read_copynumber <- function(input,
use_all = add_loh,
min_segnum = 0L,
max_copynumber = 20L,
genome_build = c("hg19", "hg38", "mm10", "mm9"),
genome_build = c("hg19", "hg38", "T2T", "mm10", "mm9", "ce11"),
genome_measure = c("called", "wg"),
complement = FALSE,
...) {
Expand Down
25 changes: 1 addition & 24 deletions R/read_vcf.R
Original file line number Diff line number Diff line change
Expand Up @@ -79,30 +79,7 @@ read_vcf <- function(vcfs, samples = NULL,
vcfs$Gene_ID <- "Unknown"

# Annotate gene symbol
gene_file <- switch(genome_build,
mm9 = file.path(
system.file("extdata", package = "sigminer"),
"mouse_mm9_gene_info.rds"
),
mm10 = file.path(
system.file("extdata", package = "sigminer"),
"mouse_mm10_gene_info.rds"
),
ce11 = file.path(
system.file("extdata", package = "sigminer"),
"ce11_gene_info.rds"
),
file.path(
system.file("extdata", package = "sigminer"),
paste0("human_", genome_build, "_gene_info.rds")
)
)
ok <- TRUE
if (!file.exists(gene_file)) ok <- query_remote_data(basename(gene_file))
if (!ok) {
return(invisible(NULL))
}
gene_dt <- readRDS(gene_file)
gene_dt <- get_genome_annotation("gene", genome_build = genome_build)

if (verbose) message("Annotating mutations to first matched gene based on database ", gene_file, "...")
dt <- gene_dt[, c("chrom", "start", "end", "gene_name", "gene_id")]
Expand Down
2 changes: 1 addition & 1 deletion R/show_cn_circos.R
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
show_cn_circos <- function(data, samples = NULL,
show_title = TRUE,
chrs = paste0("chr", 1:22),
genome_build = c("hg19", "hg38", "mm10", "mm9"),
genome_build = c("hg19", "hg38", "T2T", "mm10", "mm9", "ce11"),
col = NULL,
side = "inside",
...) {
Expand Down
2 changes: 1 addition & 1 deletion R/show_cn_freq_circos.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ show_cn_freq_circos <- function(data,
resolution_factor = 1L,
title = c("AMP", "DEL"),
chrs = paste0("chr", 1:22),
genome_build = c("hg19", "hg38", "mm10", "mm9"),
genome_build = c("hg19", "hg38", "T2T", "mm10", "mm9", "ce11"),
cols = NULL,
plot_ideogram = TRUE,
track_height = 0.5,
Expand Down
23 changes: 2 additions & 21 deletions R/show_cn_group_profile.R
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ show_cn_group_profile <- function(data,
fill_area = TRUE,
cols = NULL,
chrs = paste0("chr", c(1:22, "X")),
genome_build = c("hg19", "hg38", "mm10", "mm9"),
genome_build = c("hg19", "hg38", "T2T", "mm10", "mm9", "ce11"),
cutoff = 2L,
resolution_factor = 1L,
force_y_limit = TRUE,
Expand Down Expand Up @@ -134,26 +134,7 @@ show_cn_group_profile <- function(data,
))

if (!is.null(highlight_genes)) {
gene_file <- switch(genome_build,
mm10 = file.path(
system.file("extdata", package = "sigminer"),
"mouse_mm10_gene_info.rds"
),
mm9 = file.path(
system.file("extdata", package = "sigminer"),
"mouse_mm9_gene_info.rds"
),
file.path(
system.file("extdata", package = "sigminer"),
paste0("human_", genome_build, "_gene_info.rds")
)
)
ok <- TRUE
if (!file.exists(gene_file)) ok <- query_remote_data(basename(gene_file))
if (!ok) {
return(invisible(NULL))
}
gene_dt <- readRDS(gene_file)
gene_dt <- get_genome_annotation("gene", genome_build = genome_build)
gene_dt <- gene_dt[gene_dt$gene_name %in% highlight_genes][
, c("chrom", "start", "end", "gene_name")
]
Expand Down
2 changes: 1 addition & 1 deletion R/show_cn_profile.R
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ show_cn_profile <- function(data, samples = NULL,
show_labels = NULL,
chrs = paste0("chr", 1:22),
position = NULL,
genome_build = c("hg19", "hg38", "mm10", "mm9"),
genome_build = c("hg19", "hg38", "T2T", "mm10", "mm9", "ce11"),
ylim = NULL,
nrow = NULL, ncol = NULL,
return_plotlist = FALSE) {
Expand Down
2 changes: 1 addition & 1 deletion R/sigprofiler.R
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ sigprofiler_extract <- function(nmf_matrix, output,
"nndsvd", "nndsvda", "nndsvdar"
),
cores = -1L,
genome_build = c("hg19", "hg38", "mm10", "mm9"),
genome_build = c("hg19", "hg38", "T2T", "mm10", "mm9", "ce11"),
use_conda = FALSE,
py_path = NULL,
sigprofiler_version = "1.1.3") {
Expand Down
2 changes: 1 addition & 1 deletion R/transform_seg_table.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
#' expect_is(x, "data.table")
#' expect_is(x2, "data.table")
transform_seg_table <- function(data,
genome_build = c("hg19", "hg38", "mm10", "mm9"),
genome_build = c("hg19", "hg38", "T2T", "mm10", "mm9", "ce11"),
ref_type = c("cytoband", "gene"),
values_fill = NA,
values_fn = function(x, ...) {
Expand Down
2 changes: 1 addition & 1 deletion R/utils.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Download to extdata/
query_remote_data <- function(x) {
x_url <- paste0("https://zenodo.org/record/4771552/files/", x)
x_url <- paste0("https://zenodo.org/record/10360995/files/", x)
x_dest <- file.path(system.file("extdata", package = "sigminer"), x)
message("Downloading ", x_url, " to ", x_dest)
tryCatch(
Expand Down
105 changes: 105 additions & 0 deletions data-raw/T2T.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# Build the T2T-CHM13 v2.0 annotation tables: chromosome sizes, cytobands and
# centromere intervals are saved with usethis::use_data(); the raw GTF loaded
# at the end of this part feeds the transcript/gene steps further down.
#
# Source https://github.com/marbl/CHM13
# https://s3-us-west-2.amazonaws.com/human-pangenomics/index.html?prefix=T2T/CHM13/assemblies/annotation/
# wget https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/annotation/chm13v2.0_combined3.gene_annotation_v0.1.gtf
#
# NOTE(review): the script prints `centromeres.hg38` and reads files under
# "data-raw/", so it presumably expects the package to be loaded and the
# working directory to be the package root -- confirm before running.

# Chromosome size ---------------------------------------------------------

chromsize.T2T <- GenomicRanges::seqinfo(BSgenome.Hsapiens.NCBI.T2T.CHM13v2.0::BSgenome.Hsapiens.NCBI.T2T.CHM13v2.0)
chromsize.T2T
# The sequence names are prefixed with "chr" to build the `chrom` column.
chromsize.T2T <- data.frame(
  chrom = paste0("chr", chromsize.T2T@seqnames),
  size = chromsize.T2T@seqlengths
)

str(chromsize.T2T)

usethis::use_data(chromsize.T2T, overwrite = TRUE)


# Cytobands ---------------------------------------------------------------

cytobands.T2T <- data.table::fread(
  "data-raw/chm13v2.0_cytobands_allchrs.bed",
  data.table = FALSE,
  header = FALSE
)
head(cytobands.T2T)
colnames(cytobands.T2T) <- c("chrom", "start", "end", "band", "stain")

str(cytobands.T2T)

usethis::use_data(cytobands.T2T, overwrite = TRUE)


# Centromeres -------------------------------------------------------------

# Print the existing hg38 table for comparison with the T2T table built below.
centromeres.hg38

centromeres.T2T <- data.table::fread(
  "data-raw/chm13.draft_v2.0.cen_mask.bed",
  header = FALSE,
  data.table = FALSE
)
centromeres.T2T
colnames(centromeres.T2T) <- c("chrom", "left.base", "right.base")

str(centromeres.T2T)

usethis::use_data(centromeres.T2T, overwrite = TRUE)


# Transcript --------------------------------------------------------------

library(IRanges)

# Machine-specific location of the GTF fetched with the wget command above;
# adjust to wherever the file was downloaded.
gtf_path <- "~/../Downloads/chm13v2.0_combined3.gene_annotation_v0.1.gtf"
gtf_T2T <- data.table::fread(gtf_path, sep = "\t", header = FALSE)

head(gtf_T2T)
# Quick sanity checks: row counts per value of V7 and per value of V1
# (used further down as strand and chromosome respectively).
table(gtf_T2T$V7)
table(gtf_T2T$V1)

# Extract the value of one attribute from GTF attribute strings.
#
# Searches each element of `x` for `<name> <value>;` and returns `<value>`
# with all double quotes removed. Elements where the attribute is absent
# yield NA.
#
# @param x Character vector of attribute strings (the script passes GTF
#   column V9).
# @param name Attribute key such as "gene_id". It is pasted into a regular
#   expression unescaped.
# @return Character vector of the same length as `x`.
#
# NOTE(review): the key is not anchored, so "gene_id" would also match inside
# a longer key such as "ref_gene_id" if that appears earlier in the string --
# confirm the input GTF has no such attributes.
extract_col <- function(x, name) {
  # Base regex functions are used so that calling this helper does not attach
  # any package as a side effect.
  key <- paste0(name, " ")
  hit <- regexpr(paste0(key, "([^;]+);"), x, perl = TRUE)
  found <- !is.na(hit) & hit != -1L

  value <- rep(NA_character_, length(x))
  # regmatches() returns only the matched substrings, in order.
  value[found] <- regmatches(x, hit)

  # Strip the leading key, every double quote, then the trailing semicolon.
  value <- sub(key, "", value, perl = TRUE)
  value <- gsub("\"", "", value, fixed = TRUE)
  sub(";", "", value, fixed = TRUE)
}

#gtf_T2T[, gene_type := extract_col(V9, "gene_type")]

## Keep only protein coding region
# Add gene_id / gene_name columns (by reference) parsed from the attribute
# column V9.
gtf_T2T[, gene_id := extract_col(V9, "gene_id")]
gtf_T2T[, gene_name := extract_col(V9, "gene_name")]

# The hg38 gene table supplies the gene_type used to decide which gene names
# count as protein coding.
hg38_gene <- get_genome_annotation("gene", genome_build = "hg38")
# get_genome_annotation() returns NULL when the gene file is missing and the
# download fails; stop here rather than silently building empty tables.
stopifnot(!is.null(hg38_gene))
hg38_gene
# `%in%` instead of `==`: an NA gene_type would otherwise put an NA into
# coding_ids, which would then match transcripts with a missing gene_name.
coding_ids <- unique(hg38_gene$gene_name[hg38_gene$gene_type %in% "protein_coding"])

T2T <- gtf_T2T[V3 == "transcript" & gene_name %in% coding_ids, .(V1, V4, V5, V7)]
T2T
# Merge overlapping transcript intervals within each strand (V7) and
# chromosome (V1) group.
T2T <- T2T[
  ,
  data.table::as.data.table(IRanges::reduce(IRanges::IRanges(V4, V5))),
  by = .(V7, V1)
]
colnames(T2T)[1:2] <- c("strand", "chrom")
T2T
T2T$width <- NULL

T2T
## Save to package
transcript.T2T <- T2T
usethis::use_data(transcript.T2T, overwrite = TRUE)

# Gene --------------------------------------------------------------------

## T2T gene
## Original note: the GTF has no gene rows. Gene spans are therefore derived
## below as min(start) / max(end) over all rows that share chrom, strand and
## gene_name.

gene_T2T2 <- gtf_T2T[, .(V1, V4, V5, V7, gene_name, gene_id)]
colnames(gene_T2T2)[1:4] <- c("chrom", "start", "end", "strand")
gene_T2T2
# t2t_gene_id = paste(gene_id, collapse = ",")
gene_T2T2 <- gene_T2T2[
  !is.na(gene_name),
  list(start = min(start), end = max(end)),
  by = .(chrom, strand, gene_name)
]
gene_T2T2
gene_T2T2[5, 1:5]

# > gene_T2T2[gene_name == "TP53"]
# chrom strand gene_name gene_id start end
# 1: chr17 - TP53 XLOC_026526 7565929 7591642
# 2: chr17 - TP53 XLOC_026527 7589364 7590475

hg38_gene
# Borrow gene_id and gene_type from hg38 by chrom/strand/gene_name; all.x keeps
# T2T genes that have no hg38 match.
# NOTE(review): if hg38 has several gene_id/gene_type rows for one key, the
# merge yields several rows for that gene -- confirm this is acceptable.
gene_T2T2_2 <- merge(
  gene_T2T2,
  unique(hg38_gene[, list(chrom, strand, gene_name, gene_id, gene_type)]),
  by = c("chrom", "strand", "gene_name"),
  all.x = TRUE
)
gene_T2T2_2

hg38_gene
# Put the columns in the same order as the hg38 gene table.
data.table::setcolorder(gene_T2T2_2, colnames(hg38_gene))
## Save to extdata
saveRDS(gene_T2T2_2, file = "inst/extdata/human_T2T_gene_info.rds")

23 changes: 23 additions & 0 deletions data-raw/chm13.draft_v2.0.cen_mask.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
chr1 121619169 142242033
chr2 92300802 94695067
chr3 90804701 96415026
chr4 49705154 55303192
chr5 46830042 50962194
chr6 58286706 61058390
chr7 60410644 63714499
chr8 44243546 46325080
chr9 44938599 76694047
chr10 39633793 41926237
chr11 51023358 54476419
chr12 34593492 37202490
chr13 0 17508596
chr14 0 12708411
chr15 0 17694466
chr16 35834066 52219756
chr17 23433372 27571319
chr18 15641581 21121235
chr19 24570766 29769351
chr20 26383658 32969590
chr21 0 11306378
chr22 0 15711065
chrX 57819763 60927195
Loading

0 comments on commit c68bda0

Please sign in to comment.