-
Notifications
You must be signed in to change notification settings - Fork 19
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
a8f0b5c
commit c68bda0
Showing
31 changed files
with
3,585 additions
and
81 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
# Source https://github.com/marbl/CHM13 | ||
# https://s3-us-west-2.amazonaws.com/human-pangenomics/index.html?prefix=T2T/CHM13/assemblies/annotation/ | ||
# wget https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/annotation/chm13v2.0_combined3.gene_annotation_v0.1.gtf | ||
|
||
# Chromosome size --------------------------------------------------------- | ||
|
||
chromsize.T2T = GenomicRanges::seqinfo(BSgenome.Hsapiens.NCBI.T2T.CHM13v2.0::BSgenome.Hsapiens.NCBI.T2T.CHM13v2.0) | ||
chromsize.T2T | ||
chromsize.T2T = data.frame(chrom = paste0("chr", chromsize.T2T@seqnames), size = chromsize.T2T@seqlengths) | ||
|
||
chromsize.T2T %>% str() | ||
|
||
usethis::use_data(chromsize.T2T, overwrite = TRUE) | ||
|
||
|
||
# Cytobands --------------------------------------------------------------- | ||
|
||
cytobands.T2T <- data.table::fread("data-raw/chm13v2.0_cytobands_allchrs.bed", data.table = FALSE, header = FALSE) | ||
head(cytobands.T2T) | ||
colnames(cytobands.T2T) <- c("chrom", "start", "end", "band", "stain") | ||
|
||
cytobands.T2T %>% str() | ||
|
||
usethis::use_data(cytobands.T2T, overwrite = TRUE) | ||
|
||
|
||
# Centromeres ------------------------------------------------------------- | ||
|
||
centromeres.hg38 | ||
|
||
centromeres.T2T <- data.table::fread("data-raw/chm13.draft_v2.0.cen_mask.bed", header = FALSE, data.table = FALSE) | ||
centromeres.T2T | ||
colnames(centromeres.T2T) = c("chrom", "left.base", "right.base") | ||
|
||
centromeres.T2T %>% str() | ||
|
||
usethis::use_data(centromeres.T2T, overwrite = TRUE) | ||
|
||
|
||
# Transcript -------------------------------------------------------------- | ||
|
||
library(IRanges) | ||
|
||
gtf_T2T <- data.table::fread("~/../Downloads/chm13v2.0_combined3.gene_annotation_v0.1.gtf", sep = "\t", header = FALSE) | ||
|
||
head(gtf_T2T) | ||
table(gtf_T2T$V7) | ||
table(gtf_T2T$V1) | ||
|
||
extract_col <- function(x, name) { | ||
library(magrittr) | ||
stringr::str_extract(x, paste0(name, " ([^;]+);")) %>% | ||
stringr::str_remove(paste0(name, " ")) %>% | ||
stringr::str_remove_all("\"") %>% | ||
stringr::str_remove(";") | ||
} | ||
|
||
#gtf_T2T[, gene_type := extract_col(V9, "gene_type")] | ||
|
||
## Keep only protein coding region | ||
gtf_T2T[, gene_id := extract_col(V9, "gene_id")] | ||
gtf_T2T[, gene_name := extract_col(V9, "gene_name")] | ||
|
||
hg38_gene = get_genome_annotation("gene", genome_build = "hg38") | ||
hg38_gene | ||
coding_ids = unique(hg38_gene$gene_name[hg38_gene$gene_type == "protein_coding"]) | ||
|
||
T2T <- gtf_T2T[V3 == "transcript" & gene_name %in% coding_ids, .(V1, V4, V5, V7)] | ||
T2T | ||
T2T <- T2T[, data.table::as.data.table(reduce(IRanges(V4, V5))), by = .(V7, V1)] | ||
colnames(T2T)[1:2] <- c("strand", "chrom") | ||
T2T | ||
T2T$width <- NULL | ||
|
||
T2T | ||
## Save to package | ||
transcript.T2T <- T2T | ||
usethis::use_data(transcript.T2T, overwrite = TRUE) | ||
|
||
# Gene -------------------------------------------------------------------- | ||
|
||
## T2T gene # No gene rows here, use merged transcript instead | ||
|
||
gene_T2T2 <- gtf_T2T[, .(V1, V4, V5, V7, gene_name, gene_id)] | ||
colnames(gene_T2T2)[1:4] <- c("chrom", "start", "end", "strand") | ||
gene_T2T2 | ||
# t2t_gene_id = paste(gene_id, collapse = ",") | ||
gene_T2T2 <- gene_T2T2[!is.na(gene_name), list(start = min(start), end = max(end)), by = .(chrom, strand, gene_name)] | ||
gene_T2T2 | ||
gene_T2T2[5, 1:5] | ||
|
||
# > gene_T2T2[gene_name == "TP53"] | ||
# chrom strand gene_name gene_id start end | ||
# 1: chr17 - TP53 XLOC_026526 7565929 7591642 | ||
# 2: chr17 - TP53 XLOC_026527 7589364 7590475 | ||
|
||
hg38_gene | ||
gene_T2T2_2 = merge(gene_T2T2, unique(hg38_gene[, list(chrom, strand, gene_name, gene_id, gene_type)]), by = c("chrom", "strand", "gene_name"), all.x = TRUE) | ||
gene_T2T2_2 | ||
|
||
hg38_gene | ||
data.table::setcolorder(gene_T2T2_2, colnames(hg38_gene)) | ||
## Save to extdata | ||
saveRDS(gene_T2T2_2, file = "inst/extdata/human_T2T_gene_info.rds") | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
chr1 121619169 142242033 | ||
chr2 92300802 94695067 | ||
chr3 90804701 96415026 | ||
chr4 49705154 55303192 | ||
chr5 46830042 50962194 | ||
chr6 58286706 61058390 | ||
chr7 60410644 63714499 | ||
chr8 44243546 46325080 | ||
chr9 44938599 76694047 | ||
chr10 39633793 41926237 | ||
chr11 51023358 54476419 | ||
chr12 34593492 37202490 | ||
chr13 0 17508596 | ||
chr14 0 12708411 | ||
chr15 0 17694466 | ||
chr16 35834066 52219756 | ||
chr17 23433372 27571319 | ||
chr18 15641581 21121235 | ||
chr19 24570766 29769351 | ||
chr20 26383658 32969590 | ||
chr21 0 11306378 | ||
chr22 0 15711065 | ||
chrX 57819763 60927195 |
Oops, something went wrong.