20241112_mpra_tf_annotations.Rmd

---
title: "TF columns for MPRA data"
output: html_document
author: Max Dippel
date: "2024-10-23"
---

The goal of this code is to create TF data which I can easily add to the big MPRA table. 

Load libraries
```{r}
library(readxl)
library(readr)
library(tidyverse)
```

Establish directories
```{r}
# Root folder holding the MPRA workbook, liftover table and motifbreakR results.
data.dir <- "~/Desktop/Ho et al. writings/Code_from_scratch2/data/"
# Sub-folder for the transcription-factor inputs and outputs of this notebook.
tf.dat.dir <- "~/Desktop/Ho et al. writings/Code_from_scratch2/data/tf/"
# Cluster location of the TF columns that get merged into the big MPRA table.
tf.column.dir <- "/nfs/jray/screens/ALL_MPRAs/Ho_et_al_analysis/Downstream_Analysis/20240310_mpra_analysis/data/tf_columns_mpra_merge/"
```

Get the rsids for the MPRA library
```{r}
# Import the MPRA table (sheet 3 of the supplementary workbook).
#mpra <- read_excel(paste0(data.dir,"240914_Ho_et_al_tables (Max's Version).xlsx"),sheet=1)
mpra <- read_excel(paste0(data.dir, "41588_2022_1056_MOESM4_ESM.xlsx"), sheet = 3)
# The rsid column is imported under the name `rsid...2`; copy it to a plain
# `rsid` column so it can be used as a merge key later.
# NOTE(review): the `...2` suffix looks like readxl's repair of a duplicated
# column name -- confirm that column 2 is the rsid you want.
mpra$rsid <- mpra$rsid...2
mpra_rsid <- subset(mpra, select = c(rsid))

# Write one rsid per line with no header: this is the file pasted into ANANASTRA.
rsid_file <- paste0(tf.dat.dir, "mpra_rsid.txt")
write.table(mpra_rsid, rsid_file,
            row.names = FALSE, col.names = FALSE, sep = "\t", quote = FALSE)

# Read the file back. It was written WITHOUT a header, so it must be read with
# header = FALSE; otherwise the first rsid is consumed as the column name and
# silently dropped from the data.
mpra_rsid <- read.table(rsid_file, sep = "\t", header = FALSE, col.names = "rsid")
```

Submit the rsids to the ANANASTRA website: https://ananastra.autosome.org/ 
You cannot submit more than 10000 rsids at one time and we have 18000 variants, so I suggest submitting two batches of 9000. It takes a while to process that many SNPs. I used the default settings. 
Download the table in the transcription factors tab,
e.g. ananastra_billcipher_7y1kzw42.tf.tsv

Create an Ananastra column for MPRA data
```{r}
# Build the ANANASTRA transcription-factor column for the MPRA table.
# Reads : the two ANANASTRA exports in tf.dat.dir, the hg38 liftover table in
#         data.dir, and the `mpra` data frame created in the rsid chunk.
# Writes: ananastra_data_for_mpra_hg19.txt in tf.dat.dir, one row per SNP19 with a
#         comma-separated list of transcription factors.

# Import the two ANANASTRA exports (two files because the rsids were submitted in
# two batches) and stack them.
ananastra1 <- readr::read_tsv(paste0(tf.dat.dir, "ananastra_billcipher_7y1kzw42.tf.tsv"))
ananastra2 <- readr::read_tsv(paste0(tf.dat.dir, "ananastra_billcipher_6dxy9h35.tf.tsv"))
ananastra <- rbind(ananastra1, ananastra2)
# 1433 rows
ananastra$rsid <- ananastra$rs_id
# Keep only the columns needed downstream
ananastra <- subset(ananastra,
                    select = c(chromosome, position, ref, alt, rsid,
                               transcription_factor, finemapping))
# Drop the "_HUMAN" suffix from the TF names
ananastra$transcription_factor <- sub("_HUMAN", "", ananastra$transcription_factor)

# Create an hg38 SNP id (chr:pos:ref:alt) for every MPRA variant so ANANASTRA's
# coordinates and alleles can be checked against the MPRA ones.
hg_liftover <- read.table(paste0(data.dir, "mpra_snps.hg38_liftover.txt"),
                          sep = "\t", header = TRUE)
# Get rid of the word "chr" in the chromosome column
hg38_liftover_chr_mod <- gsub("chr", "", hg_liftover$chr)
# Keep only the ":ref:alt" tail of the hg19 id by removing its first two
# colon-separated fields (chromosome and position). Removing every digit instead
# would leave a non-numeric chromosome label (e.g. X) glued into the new id.
hg38_liftover_id_mod <- sub("^[^:]+:[^:]+", "", hg_liftover$snpid_hg19)
# Assemble the hg38 id from chromosome, hg38 position and the allele tail
hg_liftover$SNP38 <- paste0(hg38_liftover_chr_mod, ":", hg_liftover$pos_hg38,
                            hg38_liftover_id_mod)
hg_liftover$SNP <- hg_liftover$snpid_hg19
# Remember the hg19 id under its own name before merging
mpra$SNP19 <- mpra$SNP
mpra <- merge(mpra, hg_liftover, by = "SNP", all.x = TRUE, all.y = TRUE)
mpra <- mpra[!duplicated(mpra$SNP), ]

mpra_coordinates <- subset(mpra, select = c(rsid, SNP38, SNP19))

# Attach the ANANASTRA rows to the MPRA coordinates via the rsid
ananastra_mpra_table <- merge(mpra_coordinates, ananastra, by = "rsid")

# Make sure the position for ANANASTRA and MPRA is the same
ananastra_mpra_table$ananastra_pos <- ananastra_mpra_table$position
ananastra_mpra_table$mpra_pos <- sub("^[^:]+:([^:]+):.*$", "\\1", ananastra_mpra_table$SNP38)
# 1 when the positions match. Compare as numbers: `numeric == character` coerces
# the number to text, and round positions such as 100000 become "1e+05", which
# would never equal "100000". Unparseable MPRA positions (e.g. "NA" from a failed
# liftover) become NA and are dropped by the subset below, hence the targeted
# suppressWarnings().
ananastra_mpra_table$pos_agree <- as.integer(
  as.numeric(ananastra_mpra_table$ananastra_pos) ==
    suppressWarnings(as.numeric(ananastra_mpra_table$mpra_pos))
)
# Subset to only variants in which the position in the MPRA and ANANASTRA match
ananastra_mpra_table <- subset(ananastra_mpra_table, pos_agree == 1)
# No variants lost

# Make sure the reference allele for ANANASTRA and MPRA is the same
ananastra_mpra_table$ananastra_ref <- ananastra_mpra_table$ref
ananastra_mpra_table$mpra_ref <- sub("^[^:]+:[^:]+:([^:]+):.*$", "\\1", ananastra_mpra_table$SNP38)
# 1 when the reference alleles match
ananastra_mpra_table$ref_agree <- as.integer(
  ananastra_mpra_table$ananastra_ref == ananastra_mpra_table$mpra_ref
)
# Subset to only variants in which the reference allele in the MPRA and ANANASTRA match
ananastra_mpra_table <- subset(ananastra_mpra_table, ref_agree == 1)
# Lost 6 variants

# Make sure the alternate allele for ANANASTRA and MPRA is the same
ananastra_mpra_table$ananastra_alt <- ananastra_mpra_table$alt
ananastra_mpra_table$mpra_alt <- sub("^[^:]+:[^:]+:[^:]+:([^:]+)$", "\\1", ananastra_mpra_table$SNP38)
# 1 when the alternate alleles match
ananastra_mpra_table$alt_agree <- as.integer(
  ananastra_mpra_table$ananastra_alt == ananastra_mpra_table$mpra_alt
)
# Subset to only variants in which the alternate allele in the MPRA and ANANASTRA match
ananastra_mpra_table <- subset(ananastra_mpra_table, alt_agree == 1)
# Lost no variants

# Collapse to one row per hg19 SNP with all of its TFs in a single string
ananastra.data.for.mpra <- subset(ananastra_mpra_table, select = c(SNP19, transcription_factor))

ananastra.data.for.mpra <- ananastra.data.for.mpra %>%
  group_by(SNP19) %>%
  summarise(ananastra_tf = toString(transcription_factor)) %>%
  ungroup()

write.table(ananastra.data.for.mpra,
            paste0(tf.dat.dir, "ananastra_data_for_mpra_hg19.txt"),
            row.names = FALSE, col.names = TRUE, sep = "\t", quote = FALSE)
```

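Create a motifbreakR column for the MPRA data. Much of this code repeats the hg38 liftover and allele-matching steps used for the ANANASTRA column above.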
```{r}
# Build the motifbreakR transcription-factor column for the MPRA table.
# Reads : the motifbreakR results, the MPRA glm output and the hg38 liftover table
#         (all under data.dir).
# Writes: motifbreakr_data_for_mpra_hg19.txt in tf.dat.dir, one row per MPRA SNP
#         with a comma-separated list of TFs (NA when motifbreakR reported none).

# Import motif data. This is created in the motifbreakr enrichment analysis. I am including the data here so that I can make this column in the big table, even though I use the big table to make the motifbreakr results. This little paradox helps simplify the code a lot, and you can see the raw data I got from motifbreakr.
# (Despite its name, motifbreakr_dir holds the path of the results FILE.)
motifbreakr_dir <- paste0(data.dir, "motifbreakr/", "2023.11.15.hg38.tcells.glm.all.snps.hocomoco.bed", ".motifbreakr.results.txt")
motif.dat <- read.table(motifbreakr_dir, sep = "\t", header = TRUE)

# Import MPRA data. This will have more variants than normal but that's fine because I am only using this to merge a single column into the big table and we are only going to be using the appropriate columns for that.
mpra <- read.table(paste0(data.dir, "OLJR.C_Tcell_emVAR_glm_20240310.out"), sep = "\t", header = TRUE)
mpra <- subset(mpra, select = c(SNP, ref_allele, alt_allele))

# Create an hg38 SNP id (chr:pos:ref:alt) for the MPRA to be able to merge with
# the motifbreakR info.
hg_liftover <- read.table(paste0(data.dir, "mpra_snps.hg38_liftover.txt"), sep = "\t", header = TRUE)
# Get rid of the word "chr" in the chromosome column
hg38_liftover_chr_mod <- gsub("chr", "", hg_liftover$chr)
# Keep only the ":ref:alt" tail of the hg19 id by removing its first two
# colon-separated fields (chromosome and position). Removing every digit instead
# would leave a non-numeric chromosome label (e.g. X) glued into the new id.
hg38_liftover_id_mod <- sub("^[^:]+:[^:]+", "", hg_liftover$snpid_hg19)
# Assemble the hg38 id from chromosome, hg38 position and the allele tail
hg_liftover$SNP38 <- paste0(hg38_liftover_chr_mod, ":", hg_liftover$pos_hg38, hg38_liftover_id_mod)
hg_liftover$SNP <- hg_liftover$snpid_hg19
mpra <- merge(mpra, hg_liftover, by = "SNP", all.x = TRUE, all.y = TRUE)
mpra <- mpra[!duplicated(mpra$SNP), ]

# Makes geneSymbol (TF name) all uppercase
motif.dat$geneSymbol <- toupper(motif.dat$geneSymbol)
# make geneSymbol (TF Name) change from slash / to underscore so that the plots can save
motif.dat$geneSymbol <- gsub("/", "_", motif.dat$geneSymbol)

# Subset the motif data
motif.dat <- subset(motif.dat, select = c(seqnames, end, REF, ALT, SNP_id, geneSymbol, alleleDiff))
# Subset MPRA
#mpra <- subset(mpra, select=c(SNP,A.log2FC,B.log2FC,LogSkew,mpra_sig,rsid,ref_allele,alt_allele))

######## code for merging mpra and motifbreakr by chromosome and position ########
# Get rid of the chr in motifbreakr SNP_id column to conform with mpra SNP column
motif.dat$SNP38 <- gsub("chr", "", motif.dat$SNP_id)
# Merge Motifbreakr and MPRA ( we expect a similar number to the 61570 that are in the motif.dat right now (example run not represenative of all runs))
motif.mpra.dat <- merge(motif.dat, mpra, by = "SNP38", all.x = TRUE, all.y = TRUE)

# Make sure reference alleles for Motifbreakr and MPRA are the same
motif.mpra.dat$motifbreakr_ref <- motif.mpra.dat$REF
motif.mpra.dat$mpra_ref <- motif.mpra.dat$ref_allele
# 1 when the motifbreakr and mpra reference alleles match (NA when either is missing)
motif.mpra.dat$ref_agree <- as.integer(motif.mpra.dat$motifbreakr_ref == motif.mpra.dat$mpra_ref)
# Subset to only variants in which the reference allele in the MPRA and motifbreakr match
motif.mpra.dat <- subset(motif.mpra.dat, ref_agree == 1)

# Make sure alternate alleles for Motifbreakr and MPRA are the same
motif.mpra.dat$motifbreakr_alt <- motif.mpra.dat$ALT
motif.mpra.dat$mpra_alt <- motif.mpra.dat$alt_allele
# 1 when the motifbreakr and mpra alternate alleles match (NA when either is missing)
motif.mpra.dat$alt_agree <- as.integer(motif.mpra.dat$motifbreakr_alt == motif.mpra.dat$mpra_alt)
# Subset to only variants in which the alternate allele in the MPRA and motifbreakr match
motif.mpra.dat <- subset(motif.mpra.dat, alt_agree == 1)

# Add back in the variants without motifbreakr alleles
motif.mpra.dat <- merge(motif.mpra.dat, mpra, by = names(mpra), all.x = TRUE, all.y = TRUE)

# Keep one row per SNP/TF pair: the one with the largest |alleleDiff|.
motif.mpra.dat$unique_snp_tf <- paste0(motif.mpra.dat$seqnames, ":", motif.mpra.dat$end, ":",
                                       motif.mpra.dat$REF, ":", motif.mpra.dat$ALT, "_", motif.mpra.dat$geneSymbol)
motif.mpra.dat <- motif.mpra.dat[order(-abs(motif.mpra.dat$alleleDiff)), ]
# Variants that were just added back have no motifbreakR fields, so they all share
# the key "NA:NA:NA:NA_NA". De-duplicating on that key would collapse every one of
# them into a single row and undo the add-back above, so only rows that carry a
# motifbreakR hit are de-duplicated.
has_motif_hit <- !is.na(motif.mpra.dat$SNP_id)
motif.mpra.dat <- motif.mpra.dat[!has_motif_hit | !duplicated(motif.mpra.dat$unique_snp_tf), ]

################################################################################

motifbreakr.data.for.mpra <- subset(motif.mpra.dat, select = c(SNP, geneSymbol))

# One row per SNP; SNPs without any hit get a real NA rather than the string "NA"
# (both are written to the file as NA).
motifbreakr.data.for.mpra <- motifbreakr.data.for.mpra %>%
  group_by(SNP) %>%
  summarise(
    motifbreakr_tf_2024 = if (all(is.na(geneSymbol))) NA_character_ else toString(geneSymbol)
  ) %>%
  ungroup()

write.table(motifbreakr.data.for.mpra,
            paste0(tf.dat.dir, "motifbreakr_data_for_mpra_hg19.txt"),
            row.names = FALSE, col.names = TRUE, sep = "\t", quote = FALSE)
```