Skip to content

Latest commit

 

History

History
242 lines (175 loc) · 9.05 KB

003.Figure 2C (homer to do motif enrichment and visualization).md

File metadata and controls

242 lines (175 loc) · 9.05 KB

1.1 Select promoter-localized, BIA-related THSs for motif enrichment

library(ggplot2)
library(ggpointdensity) 
library(viridis)
library("ChIPseeker")
library("GenomicFeatures")

## Annotate the four-tissue peak set against the poppy gene models, keep the
## peaks assigned to BIA genes, and export those within 3 kb of a TSS.
poppy_TxDb <- makeTxDbFromGFF("D:/Papaver_somniferum.gene.gff3")
X <- annotatePeak(
  "D:/ATAC_peak/BIA/four_tissue_sort.bed",
  tssRegion = c(-3000, 3000),
  TxDb = poppy_TxDb
)
perk_related_to_gene <- as.data.frame(as.GRanges(X))

## BIA gene table: the first CSV column is dropped; gene IDs are then taken
## from column 2 of what remains.
bia_gene <- read.csv("D:/ATAC_peak/BIA/BIA_gene.csv", stringsAsFactors = FALSE)
bia_gene <- bia_gene[, -1]

perk_related_to_gene <- perk_related_to_gene[
  perk_related_to_gene$geneId %in% bia_gene[, 2],
]

## Column 21 is used as the peak-to-TSS distance
## (presumably ChIPseeker's `distanceToTSS` -- confirm against the column names).
## Peaks within 5 kb are only tabulated per gene; they are not exported.
perk_related_to_gene_2 <- perk_related_to_gene[abs(perk_related_to_gene[[21]]) <= 5000, ]
a_1 <- as.data.frame(table(perk_related_to_gene_2$geneId))

## Peaks within 3 kb are tabulated per gene and exported below.
perk_related_to_gene_1 <- perk_related_to_gene[abs(perk_related_to_gene[[21]]) <= 3000, ]
a <- as.data.frame(table(perk_related_to_gene_1$geneId))

## Shift column 2 (start) down by one
## (presumably to restore 0-based BED starts -- confirm).
perk_related_to_gene_1[[2]] <- perk_related_to_gene_1[[2]] - 1
perk_related_to_gene_1 <- perk_related_to_gene_1[, c(1, 2, 3, 6, 19, 21, 14, 15, 16)]

## Headerless, tab-separated output; column 4 is later used as the peak ID and
## column 5 as the target gene.
write.table(
  perk_related_to_gene_1,
  file = "D:/ATAC_peak/BIA/homer/BIA_3kb.bed",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)

1.2 Homer to find motif

cd /mnt/d/ATAC_peak/BIA/homer
## Reorder the BED columns into a five-column, tab-separated peak file:
## peak ID (BED column 4), chromosome, start, end, and a fixed "+" strand.
awk 'BEGIN { OFS = "\t" } { print $4, $1, $2, $3, "+" }' BIA_3kb.bed > homer_peaks.tmp
conda activate homer
## Motif enrichment on the peaks with 4 threads; results go to BIA_3kb_motifDir.
findMotifsGenome.pl ./homer_peaks.tmp /Papaver_somniferum.ref.fa BIA_3kb_motifDir -p 4

1.3 BLAST to identify the opium poppy genes corresponding to the motif names

Because HOMER reports many motifs under names such as "athb6", we must determine which opium poppy gene corresponds to each of these names (e.g. which gene is "athb6").
## NOTE(review): rm(list = ls()) wipes every object in the current workspace,
## including anything created by the previous section -- confirm this is wanted.
rm(list = ls())
## Get the TF names of the significantly enriched known motifs from the HOMER results
library(tidyverse)

homer_result <- read.table(
  "D:/ATAC_peak/BIA/homer/BIA_3kb_motifDir/knownResults.txt",
  sep = "\t",
  skip = 1  # the file's own header line is skipped; names are assigned below
)
colnames(homer_result) <- c(
  "Motif Name",
  "Consensus",
  "P-value",
  "Log P-value",
  "q-value (Benjamini)",
  "of Target Sequences with Motif",
  "% of Target Sequences with Motif",
  "of Background Sequences with Motif",
  "% of Background Sequences with Motif"
)

## "Motif Name" is split on "/": the first piece becomes TF_name, the second
## data_set, and any further pieces are discarded by separate() with a warning.
homer_result1 <- homer_result %>%
  separate(col = `Motif Name`, into = c("TF_name", "data_set"), sep = "/")

## Keep motifs with P-value <= 0.05 ("P-value" is column 4 after the split).
homer_result1 <- homer_result1[homer_result1[["P-value"]] <= 0.05, ]

## Keep only the text before the first "(" as the TF name.
homer_result1 <- homer_result1 %>%
  separate(col = TF_name, into = "TF", sep = "\\(")
homer_result1 <- as.data.frame(homer_result1[, 1])

## Write all TF names on a single, space-separated line.
write.table(
  t(homer_result1),
  file = "D:/ATAC_peak/BIA/homer_tf/known_ATAC_tf_only_name_knownResults.txt",
  sep = " ",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)
cd /mnt/d/ATAC_peak/BIA/homer_tf
## "known_uniprot-ATAC_tf_only_name_knownResults.fasta" is a protein FASTA file.
## It was obtained by searching the UniProt database (Arabidopsis as the reference
## species) with the TF names listed in "known_ATAC_tf_only_name_knownResults.txt".
## NOTE(review): the database is named "blastdp" here -- confirm this matches the
## name used when the BLAST database was built.
blastp \
  -query ./known_uniprot-ATAC_tf_only_name_knownResults.fasta \
  -db blastdp \
  -evalue 1e-5 \
  -qcov_hsp_perc 75.0 \
  -best_hit_score_edge 0.05 \
  -best_hit_overhang 0.25 \
  -outfmt 6 \
  -num_threads 4 \
  -out known_blast_result.txt

1.4 Get TFs

The following steps are the same as in "003Joint analysis of RNA-seq and ATAC-seq".

library(tidyverse)
library(readxl)
library(XML)
library(bitops)
library(RCurl)

## Load the tabular BLASTP result (-outfmt 6) and keep the hits whose third
## column (percent identity in the default outfmt 6 layout) exceeds 40.
tissue_homer <- read.table(
  "D:/ATAC_peak/BIA/homer_tf/known_blast_result.txt",
  sep = "\t",
  stringsAsFactors = FALSE
)
tissue_homer <- tissue_homer[tissue_homer[["V3"]] > 40, c("V1", "V2")]

## The query ID (V1) is split on "|" into three pieces; the second piece is kept
## as the UniProt entry and the subject ID (V2) as the poppy gene.
tissue_homer <- separate(
  data = tissue_homer,
  col = V1,
  into = c("1", "2", "3"),
  sep = "\\|"
)
tissue_homer <- tissue_homer[, c("V2", "2")]
colnames(tissue_homer) <- c("poppy_name", "uniprot_name")

## The UniProt annotation table is read without a header; its first row is then
## promoted to column names, so every column stays character and the names are
## not altered by read.table.
tf_information <- read.table(
  "D:/ATAC_peak/BIA/homer_tf/uniprot_anno.txt",
  sep = "\t",
  stringsAsFactors = FALSE
)
colnames(tf_information) <- tf_information[1, ]
tf_information <- tf_information[-1, ]

## Attach the annotation to every BLAST hit through the UniProt entry.
tf_merge <- merge(
  tissue_homer,
  tf_information,
  by.x = "uniprot_name",
  by.y = "Entry"
)


## Parse HOMER's knownResults.html and collect, for every motif row of the
## result table, its rank (column 1) and motif name (column 3).
URL <- "file:///D:/ATAC_peak/BIA/homer/BIA_3kb_motifDir/knownResults.html"
temp <- getURL(URL)
web_tissue <- htmlParse(temp)
root <- xmlRoot(web_tissue)

## One entry per child node of the first <table> in <body>.
length_motif <- as.data.frame(xmlSApply(root[["body"]][["table"]], xmlValue))

## The first table child is skipped (presumably the header row -- confirm).
## seq_len(n)[-1] is empty when the table has fewer than two rows, whereas
## 2:n would count backwards (2, 1) in that case.
motif_rows <- seq_len(nrow(length_motif))[-1]
if (length(motif_rows) == 0) {
  stop("No motif rows found in knownResults.html", call. = FALSE)
}

## Convert each table row separately, then bind all rows once instead of
## growing the data frame inside a loop.
tissue_motif_matrix <- do.call(
  rbind,
  lapply(motif_rows, function(i) {
    xmlToDataFrame(root[["body"]][["table"]][i])
  })
)
tissue_motif_matrix <- data.frame(
  lapply(tissue_motif_matrix, as.character),
  stringsAsFactors = FALSE
)
tissue_motif_matrix <- tissue_motif_matrix[, c(1, 3)]
## NOTE(review): "rnak" looks like a typo for "rank"; it is kept because the
## name ends up in the header of homer_tf_all_known.csv.
colnames(tissue_motif_matrix) <- c("rnak", "Motif Name")


## Reduce each motif name to the bare TF symbol: the text before the first "/",
## then the text before the first "(". separate() discards the remaining pieces
## with a warning.
tissue_motif_matrix_true1 <- tissue_motif_matrix %>%
  separate(col = `Motif Name`, into = "TF", sep = "/") %>%
  separate(col = TF, into = "TF_name", sep = "\\(")

## Link the annotated poppy TFs to the HOMER motifs through the TF symbol.
tf_poppy <- merge(
  tf_merge,
  tissue_motif_matrix_true1,
  by.x = "homer_name",
  by.y = "TF_name"
)

## Column 6 holds the path of the motif file, built from column 5
## (presumably the motif rank -- confirm the column layout of tf_poppy).
tf_poppy[, 6] <- paste0(
  "/mnt/d/ATAC_peak/BIA/homer/BIA_3kb_motifDir/knownResults/",
  "known",
  tf_poppy[, 5],
  ".motif"
)

## Unique motif file paths as a one-column character data frame, followed by
## one extra trailing entry "test".
path <- as.data.frame(unique(tf_poppy[, 6]))
path <- data.frame(lapply(path, as.character), stringsAsFactors = FALSE)
len <- nrow(path) + 1
path[len, 1] <- "test"

## All paths go on a single, space-separated line.
write.table(
  t(path),
  file = "D:/ATAC_peak/BIA/homer_tf/regulatory_network/path_to_motif/homer_tf_path_known.txt",
  sep = " ",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)
## NOTE(review): fields are written unquoted, so an annotation value containing a
## comma would shift columns when this CSV is read back -- confirm none do.
write.csv(
  tf_poppy,
  file = "D:/ATAC_peak/BIA/homer_tf/regulatory_network/homer_tf_all_known.csv",
  quote = FALSE,
  row.names = FALSE
)
conda activate homer
cd /mnt/d/ATAC_peak/BIA/homer_tf/regulatory_network/path_to_motif/
## Every file in this directory lists motif file paths separated by whitespace.
## For each list file, concatenate the listed motif files into
## <regulatory_network>/<list file name>.motifs.
out_dir=/mnt/d/ATAC_peak/BIA/homer_tf/regulatory_network
for list_file in *; do
  ## Iterate with a glob instead of parsing `ls`; skip anything that is not a regular file.
  [ -f "$list_file" ] || continue
  ## Start from an empty output so that re-running does not append duplicate motifs.
  : > "$out_dir/$list_file.motifs"
  while read -r motif_paths; do
    ## Word splitting is intended here: one line holds several paths.
    ## Paths that do not exist only make cat print an error; the rest are still copied.
    cat $motif_paths >> "$out_dir/$list_file.motifs"
  done < "$list_file"
done
cd ..
## Scan the BIA peaks for the collected motifs.
annotatePeaks.pl ./BIA_3kb.bed Papaver_somniferum.ref.fa -m homer_tf_path_known.txt.motifs > homer_tf_path_known.txt.motifs.txt

1.5 Get motif binding information

## Reshape the annotatePeaks.pl output into a table with one row per peak, one
## column per motif, and a final row that holds the TF label of each column.
options(stringsAsFactors = FALSE)
motif_binding <- read.csv(
  "D:/ATAC_peak/BIA/homer_tf/regulatory_network/homer_tf_path_known.txt.motifs.txt",
  sep = "\t",
  stringsAsFactors = FALSE
)
a <- as.data.frame(table(motif_binding[, 1]))

## Keep column 1 (peak ID) plus columns 22 onward (one per motif), then
## transpose so that each motif becomes a row.
motif_binding <- as.data.frame(t(motif_binding[c(1, 22:ncol(motif_binding))]))

## Append the row names as a new last column (auto-named "V<index>") and keep
## only the text before the first "." as the TF name.
add_col_num <- ncol(motif_binding) + 1
motif_binding[, add_col_num] <- rownames(motif_binding)
motif_binding <- separate(
  data = motif_binding,
  col = paste0("V", add_col_num),
  into = "TF_name",
  sep = "\\."
)
a <- as.data.frame(table(motif_binding[, add_col_num]))

## Rows 7 and 8 are given distinct labels by suffix. These positions are
## hard-coded (presumably the two ATHB6 motifs -- confirm against the row
## names printed below whenever the motif list changes).
rownames(motif_binding)[7]
motif_binding[7, add_col_num] <- paste0(motif_binding[7, add_col_num], "_", "DAP")
rownames(motif_binding)[8]
motif_binding[8, add_col_num] <- paste0(motif_binding[8, add_col_num], "_", "CHIP")

## Transpose back: peaks in rows, motifs in columns, TF labels in the last row.
motif_binding <- as.data.frame(t(motif_binding))


## Build a long table with one row per (peak, TF) pair for which the motif
## column of that TF is non-empty for that peak.
##
## Layout of motif_binding used here: column 1 holds the peak ID, every other
## column is one motif, and the last row holds the TF label of each column.
## Rows are produced motif by motif, and peak by peak within each motif.
n_rows <- nrow(motif_binding)
peak_rows <- seq_len(max(n_rows - 1, 0))      # every row except the TF-label row
tf_cols <- seq_len(ncol(motif_binding))[-1]   # every column except the peak ID

hits_per_tf <- lapply(tf_cols, function(tf) {
  hit_text <- motif_binding[peak_rows, tf]
  ## A motif column without any hit may contain NA rather than "", so missing
  ## values are treated as "no hit" instead of breaking the comparison.
  has_hit <- !is.na(hit_text) & hit_text != ""
  data.frame(
    peak = motif_binding[peak_rows, 1][has_hit],
    tf = rep(as.character(motif_binding[n_rows, tf]), sum(has_hit)),
    stringsAsFactors = FALSE
  )
})

## Bind once at the end instead of growing the table inside nested loops.
result_data <- do.call(rbind, hits_per_tf)

## Attach the target gene to every (peak, TF) pair: in BIA_3kb.bed, column 4 is
## used as the peak ID and column 5 as the gene.
peak <- read.csv(
  "D:/ATAC_peak/BIA/homer_tf/regulatory_network/BIA_3kb.bed",
  sep = "\t",
  header = FALSE
)
peak <- peak[, c("V4", "V5")]
result_data <- merge(result_data, peak, by.x = "peak", by.y = "V4")
colnames(result_data) <- c("peak", "tf", "target_gene")
write.csv(
  result_data,
  "D:/ATAC_peak/BIA/homer_tf/regulatory_network/bia_tf_gene_network.csv"
)

## Number of distinct target genes per TF (tabulated only, not written out).
result_data_table <- unique(result_data[, c("tf", "target_gene")])
result_data_table <- as.data.frame(table(result_data_table[, 1]))

## Poppy TF annotation: the first three columns of the exported table.
poppy_tf <- read.csv(
  "D:/ATAC_peak/BIA/homer_tf/regulatory_network/homer_tf_all_known.csv"
)
poppy_tf <- poppy_tf[, 1:3]
## Rows 16-21 are relabelled so that they match the "_DAP"/"_CHIP" TF labels.
## These row positions are hard-coded -- confirm them whenever the input changes.
poppy_tf[c(16, 18, 20), 1] <- "ATHB6_DAP"
poppy_tf[c(17, 19, 21), 1] <- "ATHB6_CHIP"

result_data <- merge(result_data, poppy_tf, by.x = "tf", by.y = "homer_name")

write.csv(
  result_data,
  "D:/ATAC_peak/BIA/homer_tf/regulatory_network/bia_tf_gene_network_all_infor.csv"
)

1.6 visualization


## Extract the sequences of the intervals in hb6_getfasta_2.bed from the reference
## genome, with the strand (-s) and name (-name) options enabled.
bedtools getfasta -s -name \
  -fi /mnt/d/001poppy_atac_new_genome/Papaver_somniferum.ref.fa \
  -bed ./hb6_getfasta_2.bed \
  > hb6_common_motif_2.fasta
library(ggplot2)
library(ggmsa)
library(seqmagick)

## Draw the extracted sequences as three alignment panels (positions 1-10,
## 11-20 and 21-30), one PDF per panel, written to the working directory.
seqence <- seqmagick::fa_read("D:/001poppy_atac_new_genome/ATAC_peak/BIA/homer/hb6_common_motif_2.fasta")

## Custom palette that paints every nucleotide white; used for panels 1-10
## and 21-30.
color_data <- data.frame(
  names = c("A", "T", "G", "C"),
  color = c("white", "white", "white", "white"),
  stringsAsFactors = FALSE
)
my_cutstom <- color_data

## Write one plot to a PDF file.
##   file:     path of the PDF to create.
##   msa_plot: plot object to draw; it is evaluated after the device is opened.
## The plot is printed explicitly, so the PDF is not left empty when this
## script is run through source(), where top-level results are not
## auto-printed. The device is closed even if drawing fails.
save_msa_pdf <- function(file, msa_plot) {
  pdf(file)
  on.exit(dev.off(), add = TRUE)
  print(msa_plot)
  invisible(file)
}

## Optional extra layers for any panel:
## + geom_seqlogo(color = "Taylor_NT") + geom_msaBar()
save_msa_pdf(
  "motif_seqence_flanking_1_10.pdf",
  ggmsa(seqence, 1, 10, custom_color = my_cutstom, seq_name = TRUE)
)

## Panel 11-20 uses the built-in "Chemistry_NT" colour scheme.
save_msa_pdf(
  "motif_seqence_flanking_11_20.pdf",
  ggmsa(seqence, 11, 20, color = "Chemistry_NT", seq_name = TRUE)
)

save_msa_pdf(
  "motif_seqence_flanking_21_30.pdf",
  ggmsa(seqence, 21, 30, custom_color = my_cutstom, seq_name = TRUE)
)