GRCh38_STEP1_optional.Rmd

---
title: "wind: wORKFLOW FOR PiRNAs AnD BEYONd"
subtitle: "Optional workflow to extract various information  regarding the small RNA sequences, from the created GTF file of STEP 1"
author: "Constantinos Yeles (Konstantinos Geles)"
date: "`r format(Sys.Date(), '%a %b %d %Y')`"
output:
  html_document:
    toc: yes
    toc_depth: 3
    theme: paper 
  pdf_document:
    toc: yes
    toc_depth: 3
editor_options:
  chunk_output_type: console
---

```{r setup, include=FALSE}
# Global chunk defaults: echo = TRUE shows each chunk's code in the rendered
# document, eval = FALSE means knitting does not execute any chunk.
knitr::opts_chunk$set(echo = TRUE, eval = FALSE)
```

# 1. Provide additional information regarding piRNA sequences and their genomic locations. 

## 1.1 piRNA Cluster DataBase
We use the [piRNA Cluster DataBase](https://www.smallrnagroup.uni-mainz.de/piCdb/) to find which piRNA sequences of the new GTF lie inside piRNA clusters.

```{bash piCdb}
# Start an interactive container from the workflow image; the current
# directory is mounted inside it at /home/my_data.
# NOTE(review): the relative my_data/... paths below assume the container's
# working directory is /home — confirm.
docker run --rm -ti -v $(pwd):/home/my_data  congelos/sncrna_workflow

# for hg38
# Download the human piRNA cluster GTF from piCdb into human_data/.
wget https://www.smallrnagroup.uni-mainz.de/piCdb/data/Homo_sapiens/piRNAclusters.gtf -O my_data/human_data/piRNAclusters_piCdb_hg38.gtf

# for mm10
# Download the mouse piRNA cluster GTF from piCdb into mouse_data/.
wget https://www.smallrnagroup.uni-mainz.de/piCdb/data/Mus_musculus/piRNAclusters.gtf -O my_data/mouse_data/piRNAclusters_piCdb_mm10.gtf
# Leave the interactive container (--rm removes it on exit).
exit

# Start a second container with the current directory mounted at /home/0 and
# container port 8787 published on the host (presumably RStudio Server —
# confirm against the image documentation).
# NOTE(review): PASSWORD=12345 is a hard-coded, weak credential; consider
# replacing it before exposing the port on a shared machine.
docker run --rm -v $(pwd):/home/0 -p 8787:8787 -e PASSWORD=12345 \
-e USER=$UID congelos/rocker_tidyverse_plus_de_pckages
```
### i. load libraries

```{r}
suppressPackageStartupMessages({
library("tidyverse")
library("plyranges")  
  })
```
### ii. Import the complete gtf and the piCdb

```{r sncRNA and piCdb}
# Complete sncRNA annotation (piRNABank + RNAcentral) created in STEP 1.
gtf_piB_RCentr <- read_gff2(
  file.path("human_data", "sncRNA_piRNBnk_RNACent_GRCh38_v34.gtf")
)

# piCdb clusters: keep the standard chromosomes, prefix every sequence name
# with "chr" and label each cluster as "<seqname>_<start>_<end>".
piCdb <- read_gff2(file.path("human_data", "piRNAclusters_piCdb_hg38.gtf")) %>%
  keepStandardChromosomes(pruning.mode = "coarse") %>%
  as_tibble() %>%
  mutate(
    seqnames = as_factor(str_c("chr", seqnames)),
    piCluster = str_c(seqnames, start, end, sep = "_")
  ) %>%
  as_granges()
```

### same for mouse
```{bash}
# Fetch and install the NCBI-Hackathons Master_gff3_parser project
# (presumably the provider of the `seqconv` command used below — confirm).
git clone https://github.com/NCBI-Hackathons/Master_gff3_parser.git

cd Master_gff3_parser/
python setup.py install

cd ..
# Convert the sequence identifiers of the mouse piCdb GTF for mm10 and write
# the result to a new file (presumably "rs" = RefSeq and "uc" = UCSC naming —
# TODO confirm against the seqconv documentation).
seqconv convert --ref mm10 --in rs --out uc  my_data/mouse_data/piRNAclusters_piCdb_mm10.gtf > my_data/mouse_data/piRNAclusters_piCdb_mm10_ucsc_chr_ids.gtf

```
### iii. Find overlaps between the two GTF files

```{r overlaps piCdb}
# piRNA ranges on standard chromosomes that overlap a piCdb cluster by at
# least 15 nt (maxgap = -1L is the findOverlaps default, i.e. no gap allowed).
gtf_piB_RCentr_piCdb <- gtf_piB_RCentr %>%
  filter(gene_type == "piRNA") %>%
  keepStandardChromosomes(pruning.mode = "coarse") %>%
  find_overlaps(piCdb, maxgap = -1L, minoverlap = 15L)

# Lookup table with one row per (piRNA range, cluster) pair. The range is
# encoded as the key "<seqnames>_<start>_<end>_<width>_<strand>".
piRNA_GRs_Clusters <- gtf_piB_RCentr_piCdb %>%
  select(piCluster) %>%
  as_tibble() %>%
  unite(col = "sequences_piRNA", seqnames:strand, sep = "_") %>%
  distinct(sequences_piRNA, piCluster, .keep_all = TRUE)

# add as metadata the information about piRNA clusters
# The key is built with remove = FALSE and dropped after the join, so the
# original coordinate columns are kept untouched. Splitting the key back on
# "_" would break for sequence names that themselves contain "_"
# (e.g. unplaced or alternative contigs in the unfiltered GTF).
gtf_piB_RCentr_piCdb <- gtf_piB_RCentr %>%
  as_tibble() %>%
  unite(
    col = "sequences_piRNA", seqnames:strand,
    sep = "_", remove = FALSE
  ) %>%
  left_join(piRNA_GRs_Clusters, by = "sequences_piRNA") %>%
  dplyr::select(-sequences_piRNA) %>%
  mutate(
    source = as.character(source),
    # ranges that fall inside a cluster get ",proTRAC" appended to `source`
    source = if_else(
      !is.na(piCluster),
      true = str_c(source, ",proTRAC"),
      false = source
    )
  ) %>%
  as_granges()

# save the new gtf
# showWarnings = FALSE keeps re-runs quiet when the directory already exists
dir.create(
  file.path("human_data", "additional_info_piRNAs_GRs"),
  showWarnings = FALSE, recursive = TRUE
)
gtf_piB_RCentr_piCdb %>%
  write_gff2(file.path(
    "human_data", "additional_info_piRNAs_GRs",
    "sncRNA_piRNBnk_RNACent_piCdb_GRCh38_v34.gtf"
  ))
```

## 1.2 Find multimapping between piRNAs

```{r multimapping piRNAs}
# Strand-aware self-overlap of all annotated sncRNA ranges: every row pairs a
# range with one range it overlaps; the partner's metadata columns carry the
# ".overlap" suffix.
multi_test <- gtf_piB_RCentr_piCdb %>%
  # filter(gene_type == "piRNA") %>%
  select(gene_id, seq_RNA, gene_type) %>%
  join_overlap_self_directed() %>%
  arrange(seqnames)

# For piRNA ranges, count the sncRNA types of the overlapping partners,
# skipping pairs that share the same gene_id, and save the table.
multi_test %>%
  filter(
    gene_type == "piRNA",
    gene_id != gene_id.overlap
  ) %>%
  as_tibble() %>%
  count(gene_type, gene_type.overlap, sort = TRUE) %>%
  write_tsv(file.path(
    "human_data", "additional_info_piRNAs_GRs",
    "piRNA_GR_overlapping_sncRNAs_gene_types.txt"
  ))

# Number of genomic ranges per piRNA sequence (gene_id), most frequent first.
gtf_piB_RCentr_piCdb %>%
  filter(gene_type == "piRNA") %>%
  as_tibble() %>%
  count(gene_id, sort = TRUE) %>%
  write_tsv(file.path(
    "human_data", "additional_info_piRNAs_GRs",
    "GRs_per_piRNA.txt"
  ))
```
## 1.3 Find how many piRNAs are in common or not between piRNABank and RNAcentral in the new gtf 
```{r}
# human -------------------------------
# One row per sequence (gene_id); the metadata of the first range of each
# sequence is kept. Shared by the three source-specific tables below.
hs_per_sequence <- gtf_piB_RCentr_piCdb %>%
  dplyr::select(gene_id, gene_type, sRNA_id, source) %>%
  as_tibble() %>%
  distinct(gene_id, .keep_all = TRUE)

# piRNA sequences only in piRNAbank: 353
piRNA_BANK <- hs_per_sequence %>%
  filter(str_detect(source, "piRNA_BANK$|piRNA_BANK,proTRAC$"))

# piRNA sequences only in RNAcentral:  8,444
RNAcentral <- hs_per_sequence %>%
  filter(
    gene_type == "piRNA",
    !is.na(sRNA_id),
    !str_detect(source, "piRNA_BANK")
  )

# piRNA sequences common between RNAcentral and piRNABank:  19,203
common <- hs_per_sequence %>%
  filter(
    str_detect(source, "piRNA_BANK,"),
    !is.na(sRNA_id)
  )

# all piRNA sequences: 28,000
# (here the piRNA filter is applied before de-duplicating by gene_id)
all <- gtf_piB_RCentr_piCdb %>%
  dplyr::select(gene_id, gene_type, sRNA_id, source) %>%
  filter(gene_type == "piRNA") %>%
  as_tibble() %>%
  distinct(gene_id, .keep_all = TRUE)


# mouse-------------------------------
# NOTE(review): `gtf_piB_RCentr_mouse` is not created anywhere in this file;
# it has to exist in the session before this part is run. The tables below
# overwrite the human ones of the same name.
# One row per piRNA sequence (gene_id) of the mouse annotation.
mm_piRNA_per_sequence <- gtf_piB_RCentr_mouse %>%
  dplyr::select(gene_id, gene_type, sRNA_id, source) %>%
  filter(gene_type == "piRNA") %>%
  as_tibble() %>%
  distinct(gene_id, .keep_all = TRUE)

# piRNA sequences only in piRNAbank: 2,213
piRNA_BANK <- mm_piRNA_per_sequence %>%
  filter(str_detect(source, "piRNA_BANK$"))

# piRNA sequences only in RNAcentral:  29,114
RNAcentral <- mm_piRNA_per_sequence %>%
  filter(
    !is.na(sRNA_id),
    !str_detect(source, "piRNA_BANK")
  )

# piRNA sequences common between RNAcentral and piRNABank:  34,306
common <- mm_piRNA_per_sequence %>%
  filter(
    str_detect(source, "piRNA_BANK,"),
    !is.na(sRNA_id)
  )

# all piRNA sequences: 65,633
all <- mm_piRNA_per_sequence

# the three groups together; compare with nrow(all)
nrow(common) + nrow(RNAcentral) + nrow(piRNA_BANK)
```

## 1.4. Find which piRNAs are inside Transposable Elements

We download a GTF file with the genomic regions of Transposable Elements for
the human genome from
<http://labshare.cshl.edu/shares/mhammelllab/www-data/TEtranscripts/TE_GTF/>,
more precisely:
[GRCh38_GENCODE_rmsk_TE.gtf.gz](http://labshare.cshl.edu/shares/mhammelllab/www-data/TEtranscripts/TE_GTF/GRCh38_GENCODE_rmsk_TE.gtf.gz)

### i. download the TE information

```{bash}
# Download the GRCh38 GENCODE/RepeatMasker TE annotation next to the other
# additional piRNA files (the target directory must already exist).
wget http://labshare.cshl.edu/shares/mhammelllab/www-data/TEtranscripts/TE_GTF/GRCh38_GENCODE_rmsk_TE.gtf.gz -O my_data/human_data/additional_info_piRNAs_GRs/GRCh38_GENCODE_rmsk_TE.gtf.gz
```
### ii. create a gtf with metadata about TEs

```{r Trasposable Elements annotation}
# Transposable-element annotation restricted to the standard chromosomes.
# The metadata columns are renamed with a "TE_" prefix so they can be told
# apart from the piRNA columns after the overlap.
TEs <- read_gff2(
  file.path(
    "human_data", "additional_info_piRNAs_GRs",
    "GRCh38_GENCODE_rmsk_TE.gtf.gz"
  )
) %>%
  select(
    TE_gene_id = gene_id,
    TE_transcript_id = transcript_id,
    TE_family_id = family_id,
    TE_class_id = class_id
  ) %>%
  keepStandardChromosomes(pruning.mode = "coarse") %>%
  arrange(seqnames)

# Strand-aware overlap of the piRNA ranges with the TEs, written as a GTF.
gtf_piB_RCentr_piCdb %>%
  filter(gene_type == "piRNA") %>%
  find_overlaps_directed(TEs) %>%
  write_gff2(
    file.path(
      "human_data", "additional_info_piRNAs_GRs",
      "TEs_piRNAbank_rCentral_piCdb.gtf"
    )
  )
```

## 1.5 Find which genes the piRNAs are located in
### i. Load libraries

```{r bumphunter libraries}
suppressPackageStartupMessages({
library('TxDb.Hsapiens.UCSC.hg38.knownGene')
library('org.Hs.eg.db')
library('bumphunter')
library('BiocParallel')
library('stats')
})
```
### ii. Import regions of transcripts

```{r transcript regions bumphunter}
# Transcript annotation from the UCSC knownGene TxDb, restricted to the
# standard chromosomes and ordered by chromosome.
genes <- TxDb.Hsapiens.UCSC.hg38.knownGene %>%
  annotateTranscripts(annotation = "org.Hs.eg.db") %>%
  keepStandardChromosomes(pruning.mode = "coarse") %>%
  arrange(seqnames)

# piRNA ranges only, prepared the same way.
gtf_only_piRNAs <- gtf_piB_RCentr_piCdb %>%
  filter(gene_type == "piRNA") %>%
  keepStandardChromosomes(pruning.mode = "coarse") %>%
  arrange(seqnames)

# sanity check: do both objects carry the same sequence levels?
identical(seqlevels(genes), seqlevels(gtf_only_piRNAs))

# total number of piRNA genomic ranges
length(gtf_only_piRNAs)


# check how many GRs per chromosome
gtf_only_piRNAs %>%
  as_tibble() %>%
  dplyr::count(seqnames, sort = TRUE)
# chr15 has the most values of GRs with 14562
# we will parallelize per chr.
```

### iii. gene regions identification 
```{r gene regions bumphunter}
# we will work with 4 workers
n_workers <- 4L

# Pick the BiocParallel back-end by platform: SnowParam on Windows,
# MulticoreParam everywhere else. The worker count is set inside each
# branch, so the platform-specific choice is no longer overwritten by an
# unconditional MulticoreParam() call.
mt_param <- if (.Platform$OS.type == "windows") {
  SnowParam(workers = n_workers)
} else {
  MulticoreParam(workers = n_workers)
}

# Worker for bpmapply(): annotates one set of query ranges (`our_Grs`)
# against one set of transcript ranges (`genes_GRs`) with matchGenes() and
# returns the annotation as a tibble. The packages are attached inside the
# function body, so each call loads what it needs by itself.
matchGenes_fun <- function(our_Grs, genes_GRs){
  suppressPackageStartupMessages({
    library('dplyr')
    library('bumphunter')
  })
  message("working on matchGenes")
  matched <- matchGenes(
    our_Grs, genes_GRs,
    type = "any",
    promoterDist = 2500,
    skipExons = FALSE,
    verbose = TRUE
  )
  as_tibble(matched)
}

# Chromosome names of the transcript annotation, named by themselves so the
# lists built from them are named per chromosome.
chr_names <- genes %>%
  seqlevels() %>%
  purrr::set_names()

# One GRanges per chromosome, for the transcripts and for the piRNAs; both
# lists follow the same chromosome order.
genes_chr <- map(chr_names, ~ filter(genes, seqnames == .x))
piR_chr <- map(chr_names, ~ filter(gtf_only_piRNAs, seqnames == .x))

# test for small chrs --------
small_chrs <- c("chrM", "chrY")
gen_test <- genes_chr[small_chrs]
piR_test <- piR_chr[small_chrs]

res_chr <- bpmapply(
  matchGenes_fun,
  piR_test, gen_test,
  USE.NAMES = TRUE, SIMPLIFY = FALSE,
  BPREDO = list(), BPPARAM = mt_param
)

# run the complete list -------
# (replaces the result of the test run above)
res_chr <- bpmapply(
  matchGenes_fun,
  piR_chr, genes_chr,
  USE.NAMES = TRUE, SIMPLIFY = FALSE,
  BPREDO = list(), BPPARAM = mt_param
)

# stack the per-chromosome annotation tables
res_chr <- bind_rows(res_chr)

# rebuild the piRNA table from the per-chromosome list so that its rows are
# in the same order as the stacked annotation
gtf_only_piRNAs <- as_tibble(bind_ranges(piR_chr))

#make one dataframe
gtf_only_piRNAs %>%
  bind_cols(res_chr) %>%
  write_tsv(
    file.path(
      "human_data", "additional_info_piRNAs_GRs",
      "Gene_Regions_piRNAbank_RNACentral.txt"
    )
  )
```
# 2. piRNA targets prediction
Extract the 3' UTR, 5' UTR and CDS regions from the primary GENCODE GTF of GRCh38.

### i. find the genomic regions 3', 5' UTR and CDS

```{r CDS and UTR}
library("BSgenome.Hsapiens.UCSC.hg38")
library(tidyverse)
library(plyranges)
library(GenomicFeatures)

# Build a TxDb from the GENCODE primary-assembly annotation.
genecode <- makeTxDbFromGFF(
  file = file.path(
    "human_data", "GRCh38",
    "gencode.v34.primary_assembly.annotation.gtf.gz"
  ),
  dataSource = "gencode",
  organism = "Homo sapiens"
)

# CDS, 3' UTR and 5' UTR ranges grouped by transcript name, limited to the
# standard chromosomes.
CDS <- genecode %>%
  cdsBy("tx", use.names = TRUE) %>%
  keepStandardChromosomes(pruning.mode = "tidy")
UTR3 <- genecode %>%
  threeUTRsByTranscript(use.names = TRUE) %>%
  keepStandardChromosomes(pruning.mode = "tidy")
UTR5 <- genecode %>%
  fiveUTRsByTranscript(use.names = TRUE) %>%
  keepStandardChromosomes(pruning.mode = "tidy")

# Genome views for every individual range (the per-transcript lists are
# flattened first).
CDS_seq <- Views(BSgenome.Hsapiens.UCSC.hg38, unlist(CDS))
UTR3_seq <- Views(BSgenome.Hsapiens.UCSC.hg38, unlist(UTR3))
UTR5_seq <- Views(BSgenome.Hsapiens.UCSC.hg38, unlist(UTR5))

# Sequences that will be written as fasta.
fasta_CDS_seq <- DNAStringSet(CDS_seq)
fasta_UTR3_seq <- DNAStringSet(UTR3_seq)
fasta_UTR5_seq <- DNAStringSet(UTR5_seq)

# One fasta header per range: the transcript name followed by the exon rank
# (CDS) or the exon name (UTRs).
CDS_names <- unlist(CDS) %>%
  mutate(fasta_names = str_c(names(.), "_exon_rank_", exon_rank)) %>%
  as_tibble() %>%
  dplyr::select(fasta_names)

UTR3_names <- unlist(UTR3) %>%
  mutate(fasta_names = str_c(names(.), "_exon_name_", exon_name)) %>%
  as_tibble() %>%
  dplyr::select(fasta_names)

UTR5_names <- unlist(UTR5) %>%
  mutate(fasta_names = str_c(names(.), "_exon_name_", exon_name)) %>%
  as_tibble() %>%
  dplyr::select(fasta_names)

# Attach the headers to the sequences.
names(fasta_CDS_seq) <- CDS_names$fasta_names
names(fasta_UTR3_seq) <- UTR3_names$fasta_names
names(fasta_UTR5_seq) <- UTR5_names$fasta_names

# One output directory per region type under human_data/piRNA_Targets.
targets_dirs <- file.path(
  "human_data", "piRNA_Targets",
  c("CDS", "UTR3", "UTR5")
)
map(targets_dirs, ~ dir.create(.x, recursive = TRUE))

# Write "<region>_hg38.fasta" into the matching directory; the list and
# `targets_dirs` are paired by position.
map2(
  list(
    "CDS" = fasta_CDS_seq,
    "UTR3" = fasta_UTR3_seq,
    "UTR5" = fasta_UTR5_seq
  ),
  targets_dirs,
  ~ Biostrings::writeXStringSet(
    x = .x,
    filepath = file.path(.y, str_c(basename(.y), "_hg38.fasta"))
  )
)
```
### ii. make a fasta with only piRNA sequences

```{r piRNA fasta}
library(tidyverse)
# One row per piRNA sequence: the sequence itself in `seq_RNA`, the gene_id
# as row name.
piRNAs_gtf <- plyranges::read_gff2(
  file.path("human_data", "sncRNA_piRNBnk_RNACent_GRCh38_v34.gtf")
) %>%
  filter(gene_type == "piRNA") %>%
  as_tibble() %>%
  distinct(gene_id, .keep_all = TRUE) %>%
  dplyr::select(gene_id, seq_RNA) %>%
  column_to_rownames("gene_id")

# Named sequence set: the fasta headers are the gene_ids.
piRNA_fa_hg38 <- Biostrings::DNAStringSet(
  purrr::set_names(piRNAs_gtf$seq_RNA, rownames(piRNAs_gtf))
)

# Written next to the target fasta directories
# (human_data/piRNA_Targets must already exist).
piRNA_fa_hg38 %>%
  Biostrings::writeXStringSet(
    file.path(
      "human_data",
      "piRNA_Targets",
      "piRNAs_for_target_prediction_v0.2.fa"
    )
  )
```
### iii. make the indexes for bowtie

```{bash run the bowtie docker}
# NOTE(review): these three directories are created in the current
# directory, but every command below reads and writes under
# my_data/human_data/piRNA_Targets/ — confirm whether this mkdir is needed.
mkdir UTR3 CDS UTR5

# Interactive container providing bowtie (and samtools, used below); the
# current directory is mounted at /home/my_data.
docker run --rm -ti -v $(pwd):/home/my_data  congelos/bowtie_bowtie2
# Build one bowtie index per region fasta (-f: fasta input, -o 3: offrate 3,
# 8 threads); the index prefix sits next to the fasta file.
#UTR3 index
bowtie-build -f -o 3 --threads 8 my_data/human_data/piRNA_Targets/UTR3/UTR3_hg38.fasta my_data/human_data/piRNA_Targets/UTR3/UTR3_hg38

#UTR5 index
bowtie-build -f -o 3 --threads 8 my_data/human_data/piRNA_Targets/UTR5/UTR5_hg38.fasta my_data/human_data/piRNA_Targets/UTR5/UTR5_hg38

#CDS index
bowtie-build -f -o 3 --threads 8 my_data/human_data/piRNA_Targets/CDS/CDS_hg38.fasta my_data/human_data/piRNA_Targets/CDS/CDS_hg38

# Align the piRNA fasta (-f) against each index: no forward-strand
# alignments (--nofw), up to 3 mismatches (-v 3), all alignments of the best
# stratum (-a --best --strata), 6 threads, SAM output (-S). samtools view
# -F 4 drops unaligned reads and, without -h, omits the SAM header, so each
# result file holds alignment records only.
# UTR3 run
bowtie --nofw -v 3 -a --best --strata -p 6 -x my_data/human_data/piRNA_Targets/UTR3/UTR3_hg38 -S -f \
my_data/human_data/piRNA_Targets/piRNAs_for_target_prediction_v0.2.fa | \
samtools view -F 4 -@ 2 -  > my_data/human_data/piRNA_Targets/res_UTR3.txt

# UTR5 run
bowtie --nofw -v 3 -a --best --strata -p 6 -x my_data/human_data/piRNA_Targets/UTR5/UTR5_hg38 -S -f \
my_data/human_data/piRNA_Targets/piRNAs_for_target_prediction_v0.2.fa | \
samtools view -F 4 -@ 2 -  > my_data/human_data/piRNA_Targets/res_UTR5.txt

# CDS run
# (the output name carries an extra "_hg38" compared with the UTR results)
bowtie --nofw -v 3 -a --best --strata -p 6 -x my_data/human_data/piRNA_Targets/CDS/CDS_hg38 -S -f \
my_data/human_data/piRNA_Targets/piRNAs_for_target_prediction_v0.2.fa | \
samtools view -F 4 -@ 2 -  > my_data/human_data/piRNA_Targets/res_CDS_hg38.txt
```
### iv. make a dataframe with all predicted gene targets

```{r export predicted targeted genes}
# GENCODE annotation as a plain table.
gtf_again <- file.path(
  "human_data", "GRCh38",
  "gencode.v34.primary_assembly.annotation.gtf.gz"
) %>%
  read_gff2() %>%
  as_tibble()

# Unique transcript_id / gene_name / gene_type combinations.
# dplyr verbs are namespaced because GenomicFeatures (attached for the
# CDS/UTR step) brings in AnnotationDbi, whose select() masks dplyr's and
# has no method for tibbles.
gen_names <- gtf_again %>%
  dplyr::select(
    gene_id, gene_type, gene_name,
    transcript_id, transcript_name
  ) %>%
  dplyr::filter(!is.na(transcript_id)) %>%
  dplyr::count(transcript_id, gene_name, gene_type) %>%
  dplyr::select(-n)

# Load the targets
# Columns X1 (read name = piRNA id) and X3 (reference name = fasta header of
# the hit) are read from every result file; `file` is reduced to the region
# label (UTR3, UTR5 or CDS). The fasta header is split back into the
# transcript id and the exon rank (CDS) or exon name (UTRs), which both end
# up in the `exon_rank` column. The annotated table is saved and also kept
# as `targets` (write_tsv returns its input).
targets <- list.files(
  path = file.path("human_data", "piRNA_Targets"),
  pattern = "(UTR|CDS).+txt",
  full.names = TRUE
) %>%
  vroom::vroom(
    col_names = FALSE,
    id = "file",
    col_select = c("file", "X1", "X3")
  ) %>%
  mutate(
    file = file %>%
      basename() %>%
      str_remove_all("res_|_hg38.txt|.txt")
  ) %>%
  separate(
    X3,
    c("transcript_id", "exon_rank"),
    sep = "_exon_rank_|_exon_name_"
  ) %>%
  left_join(gen_names, by = "transcript_id") %>%
  dplyr::rename(
    piRNA_id = X1,
    Target_gene_name = gene_name
  ) %>%
  arrange(Target_gene_name) %>%
  write_tsv(file.path(
    "human_data",
    "piRNA_Targets",
    "piRNA_predicted_Targets.v02.txt"
  ))
```

# 3. Graph for sncRNAs genomic regions and sequences
Create bar charts that count, for each sncRNA type, the annotated genomic locations and the distinct sequences.

### i. import gtf annotation
```{r}
# sncRNA annotation used for the plots below.
# NOTE(review): this reads the gzipped file (".gtf.gz"), whereas section 1
# reads "sncRNA_piRNBnk_RNACent_GRCh38_v34.gtf" — confirm which file exists.
gtf_piB_RCentr <- read_gff2(
  file.path("human_data", "sncRNA_piRNBnk_RNACent_GRCh38_v34.gtf.gz")
)
```

### ii. make Histogram
```{r}
library(scales)

# Horizontal bar chart with the number of rows of `annotation_tbl` per
# `gene_type`.
#
# annotation_tbl: data frame / tibble with a `gene_type` column.
# plot_title:     title shown above the chart.
# Returns the ggplot object; nothing is drawn until it is printed.
plot_gene_type_counts <- function(annotation_tbl, plot_title) {
  ggplot(annotation_tbl, aes(x = gene_type)) +
    # geom_bar() counts the rows per x value by default; it replaces
    # geom_histogram(stat = "count"), which is meant for binned continuous
    # data and warns about ignored binning parameters.
    geom_bar(width = 0.8) +
    theme_minimal() +
    scale_y_continuous(
      name = "Counts",
      # trans = log10_trans(),
      breaks = waiver(),
      minor_breaks = waiver(),
      n.breaks = 8,
      labels = comma
    ) +
    ggtitle(label = plot_title) +
    coord_flip()
}

## genomic locations
# one bar per sncRNA type, counting every annotated genomic range
pdf("human_data/genomic_locations_histogram_sncRNAs.pdf")
# print() is explicit so the plot is drawn even when the code is source()d
print(
  gtf_piB_RCentr %>%
    as_tibble() %>%
    plot_gene_type_counts(
      "Histogram of genomic locations for each smallRNA type"
    )
)
dev.off()

## sequences
# one bar per sncRNA type, counting every sequence (gene_id) once
pdf("human_data/sequences_histogram_sncRNAs.pdf")
print(
  gtf_piB_RCentr %>%
    as_tibble() %>%
    distinct(gene_id, .keep_all = TRUE) %>%
    plot_gene_type_counts(
      "Histogram of different sequences for each smallRNA type"
    )
)
dev.off()
```

# 4. RNAcentral mapping IDs

## mouse mm10
### load the GTF and the IDs
```{r}
library(tidyverse)
library(plyranges)
library(vroom)
# load the gtf file
gtf_piB_RCentr <- file.path(
  "mouse_data", "sncRNA_piRNBnk_RNACent_GRCm38_v34.gtf.gz"
) %>%
  read_gff2()

# load the RNAcentral IDs
# NOTE(review): this path goes through "../wind/", unlike the other paths in
# this file, which are relative to the project directory — confirm.
rnacentral_ids_mm10 <- vroom(
  "../wind/mouse_data/RNACentral/mus_musculus.GRCm38.id_mapping.tsv.gz",
  col_names = c(
    "RNAcentral_id",
    "Database",
    "external_id",
    "NCBI_taxon_id",
    "RNA_type",
    "gene_name"
  )
) %>%
  filter(NCBI_taxon_id == "10090") %>%
  dplyr::select(-c(NCBI_taxon_id, RNA_type))

# rnacentral_ids_mm10 %>% count(RNAcentral_id, sort = T)
# 185,348 sncRNAs

rnacentral_ids_mm10
# Sort, then keep the first record per (id, database) and per
# (id, external id) pair.
rnacentral_ids_mm10 <- rnacentral_ids_mm10 %>%
  arrange(RNAcentral_id, gene_name, external_id) %>%
  distinct(RNAcentral_id, Database, .keep_all = TRUE) %>%
  distinct(RNAcentral_id, external_id, .keep_all = TRUE)

# RNAcentral ids that still have more than one record
multipleDBS_ids <- rnacentral_ids_mm10 %>%
  dplyr::count(RNAcentral_id) %>%
  filter(n > 1) %>%
  dplyr::select(RNAcentral_id) %>%
  deframe()

# ids with a single record: kept as they are
part_2_DF <- rnacentral_ids_mm10 %>%
  filter(!RNAcentral_id %in% multipleDBS_ids) %>%
  arrange(RNAcentral_id, gene_name, external_id)

# ids with several records: only the first one (in the sort order above)
part_1_DF <- rnacentral_ids_mm10 %>%
  filter(RNAcentral_id %in% multipleDBS_ids) %>%
  distinct(RNAcentral_id, .keep_all = TRUE)

# exactly one record per RNAcentral id
attach_to_GTF <- bind_rows(part_1_DF, part_2_DF)

# merge the two files
gtf_piB_RCentr
attach_to_GTF %>% filter(RNAcentral_id %in% gtf_piB_RCentr$sRNA_id)

# Ranges without an RNAcentral record fall back to their own `source` and
# `gene_id`. coalesce() with as.character() is used instead of ifelse():
# ifelse() drops the factor attributes of `source` and would return the
# integer level codes rather than the labels.
new_gtf <- gtf_piB_RCentr %>%
  as_tibble() %>%
  left_join(attach_to_GTF, by = c("sRNA_id" = "RNAcentral_id")) %>%
  mutate(
    Database = coalesce(Database, as.character(source)),
    external_id = coalesce(external_id, as.character(gene_id))
  ) %>%
  dplyr::rename("sncRNA_name" = gene_name)

new_gtf %>%
  as_granges() %>%
  write_gff2(file.path(
    "mouse_data",
    "sncRNA_piRNBnk_RNACent_gene_names_GRCm38_v34.gtf.gz"
  ))
```

## human hg38
### load the GTF and the IDs
```{r}
library(tidyverse)
library(plyranges)
library(vroom)
# load the gtf file
gtf_piB_RCentr <- read_gff2(
  file.path(
    "human_data", "additional_info_piRNAs_GRs",
    "sncRNA_piRNBnk_RNACent_piCdb_GRCh38_v34.gtf"
  )
)

# load the RNAcentral IDs
# NOTE(review): the file read here is named after mus_musculus / GRCm38 and
# lives under mouse_data, yet the human taxon (9606) is selected — confirm
# that this file really contains the human records.
id_mapping_columns <- c(
  "RNAcentral_id",
  "Database",
  "external_id",
  "NCBI_taxon_id",
  "RNA_type",
  "gene_name"
)
rnacentral_ids_hg38 <- vroom(
  "./mouse_data/RNACentral/mus_musculus.GRCm38.id_mapping.tsv.gz",
  col_names = id_mapping_columns
) %>%
  filter(NCBI_taxon_id == "9606") %>% # pick human taxon
  select(-NCBI_taxon_id)

# rnacentral_ids_hg38 %>% count(RNAcentral_id, sort = T)
# 253,851 sncRNAs
```

#### miRNA
```{r}
# Reference list: every RNAcentral id annotated as miRNA, one row per id
# (used by the consistency check at the end of this chunk).
miRNA_id <- rnacentral_ids_hg38 %>% 
  filter(RNA_type == "miRNA") %>% 
  count(RNAcentral_id) %>% 
  magrittr::extract("RNAcentral_id")
  
# Choose a display name (gene_name_2) for every miRNA record:
#   1. gene_name is missing            -> external_id
#   2. gene_name contains "mir"        -> gene_name
#   3. external_id contains "mir"      -> external_id
#   4. otherwise                       -> gene_name
# (matching is case-insensitive)
miRNA <- rnacentral_ids_hg38 %>% 
   filter(RNA_type == "miRNA") %>%  
   mutate(
     gene_name_2 = case_when(
     is.na(gene_name) ~ external_id,
     str_detect(gene_name, regex("mir", ignore_case = TRUE)) ~ gene_name,
     str_detect(external_id, regex("hsa-mir-|mir", ignore_case = TRUE)) ~ external_id,
     TRUE ~ gene_name)) 

# Records whose chosen name contains "hsa-mir" or "hsa-let": keep the first
# record per RNAcentral id and lower-case the name.
miRNA_hsa <- miRNA %>% 
  filter(str_detect(gene_name_2, regex("hsa-mir|hsa-let", ignore_case = TRUE))) %>% 
  distinct(RNAcentral_id, .keep_all = TRUE) %>% 
  select(-c(RNA_type, gene_name)) %>% 
  mutate(gene_name_2 = tolower(gene_name_2))

# Remaining ids (no hsa-mir/hsa-let name at all): sort and keep the first
# record per RNAcentral id; the sort order decides which record wins.
miRNA_rest <- miRNA %>% 
  filter(!RNAcentral_id %in% miRNA_hsa$RNAcentral_id) %>% 
  arrange(RNAcentral_id, gene_name, desc(gene_name_2)) %>%
  distinct(RNAcentral_id, .keep_all = TRUE) %>% 
  select(-c(RNA_type, gene_name)) 

# Combine both groups; the chosen name becomes the final `gene_name`.
miRNA <- bind_rows(miRNA_hsa, miRNA_rest) %>% 
  dplyr::rename("gene_name" = gene_name_2)

# Consistency check: TRUE when the combined table holds exactly the ids of
# the reference list, each of them once.
identical(arrange(miRNA, RNAcentral_id)$RNAcentral_id, arrange(miRNA_id, RNAcentral_id)$RNAcentral_id )
```

#### precursor miRNA
```{r}
# Reference list: every RNAcentral id annotated as precursor_RNA, one row
# per id (used by the consistency check at the end of this chunk).
# NOTE(review): arrange() is called without sorting columns, so it leaves
# the row order as produced by count() — confirm whether
# arrange(RNAcentral_id) was intended.
precursor_miRNA_id <- rnacentral_ids_hg38 %>% 
  filter(RNA_type == "precursor_RNA") %>% 
  count(RNAcentral_id) %>% 
  magrittr::extract("RNAcentral_id") %>% 
  arrange()
  
# Choose a display name (gene_name_2) for every precursor record:
#   1. gene_name is missing                    -> external_id
#   2. gene_name contains "mir"                -> gene_name
#   3. external_id contains "mir" or "let"     -> external_id
#   4. otherwise                               -> gene_name
# (matching is case-insensitive)
precursor_miRNA <- rnacentral_ids_hg38 %>% 
  filter(RNA_type == "precursor_RNA") %>% 
  #group_by(RNAcentral_id) %>% 
   mutate(
     gene_name_2 = case_when(
     is.na(gene_name) ~ external_id,
     str_detect(gene_name, regex("mir", ignore_case = TRUE)) ~ gene_name,
     str_detect(external_id, regex("mir|let", ignore_case = TRUE)) ~ external_id,
     TRUE ~ gene_name)) 

# Records whose chosen name contains "mir" or "let" (no "hsa-" prefix is
# required here): keep the first record per RNAcentral id and lower-case
# the name.
precursor_miRNA_hsa <- precursor_miRNA %>% 
  filter(str_detect(gene_name_2, regex("mir|let", ignore_case = TRUE))) %>% 
  distinct(RNAcentral_id, .keep_all = TRUE) %>% 
  select(-c(RNA_type, gene_name)) %>% 
  mutate(gene_name_2 = tolower(gene_name_2))

# Remaining ids: sort and keep the first record per RNAcentral id; the sort
# order decides which record wins.
precursor_miRNA_rest <- precursor_miRNA %>% 
  filter(!RNAcentral_id %in% precursor_miRNA_hsa$RNAcentral_id) %>% 
  arrange(RNAcentral_id, gene_name_2) %>%
  distinct(RNAcentral_id, .keep_all = TRUE) %>% 
  select(-c(RNA_type, gene_name)) 

# Combine both groups; the chosen name becomes the final `gene_name`.
precursor_miRNA <- bind_rows(precursor_miRNA_hsa, precursor_miRNA_rest) %>% 
  dplyr::rename("gene_name" = gene_name_2)

# Consistency check: TRUE when the combined table holds exactly the ids of
# the reference list, each of them once.
identical(arrange(precursor_miRNA, RNAcentral_id)$RNAcentral_id, arrange(precursor_miRNA_id, RNAcentral_id)$RNAcentral_id )
```

#### other sncRNAs
```{r}
rnacentral_ids_hg38

# Everything that is neither miRNA nor precursor: sort, then keep the first
# record per (id, database) and per (id, external id) pair.
# Note: this overwrites `rnacentral_ids_hg38`, so the miRNA and precursor
# chunks have to be run before this one.
rnacentral_ids_hg38 <- rnacentral_ids_hg38 %>%
  filter(!RNA_type %in% c("miRNA", "precursor_RNA")) %>%
  arrange(RNAcentral_id, gene_name, external_id) %>%
  distinct(RNAcentral_id, Database, .keep_all = TRUE) %>%
  distinct(RNAcentral_id, external_id, .keep_all = TRUE)

# RNAcentral ids that still have more than one record
multipleDBS_ids <- rnacentral_ids_hg38 %>%
  dplyr::count(RNAcentral_id) %>%
  filter(n > 1) %>%
  dplyr::select(RNAcentral_id) %>%
  deframe()

# ids with a single record: kept as they are
part_2_DF <- rnacentral_ids_hg38 %>%
  filter(!RNAcentral_id %in% multipleDBS_ids) %>%
  arrange(RNAcentral_id, gene_name, external_id) %>%
  dplyr::select(-RNA_type)

# ids with several records: only the first one (in the sort order above)
part_1_DF <- rnacentral_ids_hg38 %>%
  filter(RNAcentral_id %in% multipleDBS_ids) %>%
  distinct(RNAcentral_id, .keep_all = TRUE) %>%
  dplyr::select(-RNA_type)

# miRNAs, precursors and all other sncRNAs in one lookup table
attach_to_GTF <- bind_rows(miRNA, part_1_DF, part_2_DF, precursor_miRNA) %>%
  arrange(RNAcentral_id)

# merge the two files
gtf_piB_RCentr
attach_to_GTF %>% filter(RNAcentral_id %in% gtf_piB_RCentr$sRNA_id)

# Ranges without an RNAcentral record fall back to their own `source` and
# `gene_id`. coalesce() with as.character() is used instead of ifelse():
# ifelse() drops the factor attributes of `source` and would return the
# integer level codes rather than the labels.
new_gtf <- gtf_piB_RCentr %>%
  as_tibble() %>%
  left_join(attach_to_GTF, by = c("sRNA_id" = "RNAcentral_id")) %>%
  mutate(
    Database = coalesce(Database, as.character(source)),
    external_id = coalesce(external_id, as.character(gene_id))
  ) %>%
  dplyr::rename("sncRNA_name" = gene_name)

new_gtf %>%
  as_granges() %>%
  write_gff2(file.path(
    "human_data",
    "sncRNA_piRNBnk_RNACent_piCdb_gene_names_GRCh38_v34.gtf.gz"
  ))
```