predict_correlated_expressed_gene.Rmd

---
title: "predict_correlated_deg"
output: html_document
---


```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
dyn.load("/home/rstudio/libs/libxml2.so.2")
library(DESeq2)
library(dplyr)
library(GenomicRanges)
library(tibble)
library(rtracklayer)
library(ggplot2)
library(stringr)
library(scales)
```

Load helper functions:

```{r, include=T}
source("helper_functions.R")
```

Define constants:

```{r, include=T}
# Global settings for the whole document. Several of them (fdr, min.l2fc,
# min.baseMean, vicinity.radius, vicinity.sampling.n, output.suffix) are pasted
# into the input/output file names used below, so changing a value also changes
# which files are read and written.

set.seed(10) # reproduce exactly the same sequence of pseudorandom numbers each run

# Parameters for choosing significantly regulated regions
fdr = 0.01
min.l2fc = 1 # Min log2(fold change)
min.baseMean = 20 # 100, 40

# NOTE(review): fdr.matches is not referenced in the part of the document
# reviewed here; confirm that it is still needed.
fdr.matches = 0.05

# expression.cutoff = 5 # min number of normalised reads

# pass.count.cutoff = 2 # min number of RNA-seq samples with at least expression.cutoff normalised reads

# Radius handed to generate_vicinity_radius() when building the vicinities.
vicinity.radius = 500 # kbp

# Threshold applied to the BH-adjusted empirical p-values of the PCCs.
pcc.fdr = 0.05

# Passed to multiply_vicinities() as the number of samplings per vicinity.
vicinity.sampling.n = 5 # 3

expression.median = 5 # minimal median expression of a gene across all samples to count the gene as expressed

expression.amplitude.remove = 0.5 # the lower proportion of genes to remove by expression amplitude | 0.25

# Suffix appended to output file names to record the expression filters in use.
# NOTE(review): the suffix starts with "_" and the paste0() calls below put
# another "_" in front of it, so file names contain a double underscore.
output.suffix = paste0("_expr-median", expression.median, "_ampl", expression.amplitude.remove)
```

Define cutoff sets:

```{r, include=T}
# Twelve named cutoff sets ("set1" ... "set12"), each a list with the elements
# "fdr", "min.l2fc" and "min.baseMean". The sets go from the strictest to the
# most permissive: min.baseMean is lowered first (100 -> 20), then min.l2fc
# (2 -> 0.5), and finally fdr is raised (0.01 -> 0.05). The three vectors below
# are read column-wise: position i of each vector defines set i.
region.cutoffs = setNames(
  Map(function(fdr, min.l2fc, min.baseMean) {
        list("fdr" = fdr, "min.l2fc" = min.l2fc, "min.baseMean" = min.baseMean)
      },
      c(rep(0.01, 8), 0.02, 0.03, 0.04, 0.05), # fdr
      c(rep(2, 5), 1.5, 1, rep(0.5, 5)),       # min.l2fc
      c(100, 80, 60, 40, rep(20, 8))),         # min.baseMean
  paste0("set", 1:12))
```

Load the master table:

```{r, include=T}
# Read the object stored in the RDS file and keep its "counts" assay as the
# gene expression master table (presumably a SummarizedExperiment of gene-level
# Salmon counts, judging by the file name; TODO confirm).
master.table = assays(readRDS("../input/salmon.merged.gene_counts.rds"))$counts

# Row names of the count table, kept as the vector of gene names.
gene.names = rownames(master.table)
```

Normalise gene expression across replicates using DESeq2:

```{r, include=T}
# One normalised gene count table per domain (p1, p2, pM). The helper
# generate_norm_gene_counts() is not defined in this document (presumably it
# comes from helper_functions.R); it receives the full count table and the
# domain label.
p1.norm.gene.counts.deseq2 = generate_norm_gene_counts(master.table, "p1")

p2.norm.gene.counts.deseq2 = generate_norm_gene_counts(master.table, "p2")

pM.norm.gene.counts.deseq2 = generate_norm_gene_counts(master.table, "pM")
```

Select expressed genes:

```{r, include=T}
# Per-domain selection of expressed genes using the `expression.median`
# threshold. select_expressed_genes() is defined outside this document; its
# result is treated below as a vector of gene names (length() is reported and
# it is used on the right-hand side of %in%).
p1.expr.genes = select_expressed_genes(p1.norm.gene.counts.deseq2, expression.median)

p2.expr.genes = select_expressed_genes(p2.norm.gene.counts.deseq2, expression.median)

pM.expr.genes = select_expressed_genes(pM.norm.gene.counts.deseq2, expression.median)

# Report how many genes passed the filter in each domain.
cat("Number of expressed genes in p1 ( expression.median =", expression.median, "):", length(p1.expr.genes), "\n")

cat("Number of expressed genes in p2 ( expression.median =", expression.median, "):", length(p2.expr.genes), "\n")

cat("Number of expressed genes in pM ( expression.median =", expression.median, "):", length(pM.expr.genes), "\n")
```

Scale the normalised expression counts (calculate z-scores) of expressed genes:

```{r, include=T}
# For each domain, keep only the rows whose `gene_names` value is among the
# expressed genes of that domain, then pass the subset to
# generate_scaled_counts() (defined outside this document). The second argument
# is the name of the identifier column.
p1.norm.gene.counts = generate_scaled_counts(p1.norm.gene.counts.deseq2[p1.norm.gene.counts.deseq2$gene_names %in% p1.expr.genes, ], "gene_names")

p2.norm.gene.counts = generate_scaled_counts(p2.norm.gene.counts.deseq2[p2.norm.gene.counts.deseq2$gene_names %in% p2.expr.genes, ], "gene_names")

pM.norm.gene.counts = generate_scaled_counts(pM.norm.gene.counts.deseq2[pM.norm.gene.counts.deseq2$gene_names %in% pM.expr.genes, ], "gene_names")
```

Calculate gene expression amplitude across samples and remove the lower `expression.amplitude.remove` proportion:

```{r, include=T}
# Filter the scaled tables by expression amplitude. select_genes_by_amplitude()
# is defined outside this document and receives the proportion
# `expression.amplitude.remove`.
p1.norm.gene.counts = select_genes_by_amplitude(p1.norm.gene.counts, expression.amplitude.remove)

p2.norm.gene.counts = select_genes_by_amplitude(p2.norm.gene.counts, expression.amplitude.remove)

pM.norm.gene.counts = select_genes_by_amplitude(pM.norm.gene.counts, expression.amplitude.remove)

# From here on, the *.expr.genes vectors hold only the genes that survived the
# amplitude filter (they overwrite the vectors from the expression filter).
p1.expr.genes = p1.norm.gene.counts$gene_names

p2.expr.genes = p2.norm.gene.counts$gene_names

pM.expr.genes = pM.norm.gene.counts$gene_names

cat("Number of amplitude-selected genes in p1 ( expression.amplitude.remove =", expression.amplitude.remove, "):", 
    length(p1.expr.genes), "\n")

cat("Number of amplitude-selected genes in p2 ( expression.amplitude.remove =", expression.amplitude.remove, "):", 
    length(p2.expr.genes), "\n")

cat("Number of amplitude-selected genes in pM ( expression.amplitude.remove =", expression.amplitude.remove, "):", 
    length(pM.expr.genes), "\n")

# Output recorded from a run with expression.amplitude.remove = 0.25; the
# numbers will differ for any other value of that setting:
# Number of amplitude-selected genes in p1 ( expression.amplitude.remove = 0.25 ): 9980 
# Number of amplitude-selected genes in p2 ( expression.amplitude.remove = 0.25 ): 10028 
# Number of amplitude-selected genes in pM ( expression.amplitude.remove = 0.25 ): 9868
```

The reason for doing this selection is that genes with more variable expression are more likely to be regulated in gliogenesis.

Write down the RNA-seq sample names:

```{r, include=T}
# Collect the RNA-seq sample names of each domain from the names of the master
# table. The pattern keeps the part of a name that starts at "WT_D", continues
# with one or more of the digits 1, 7 and 9, then the domain tag ("_p1", "_p2"
# or "_pM") and everything after it; names without a match contribute nothing.
p1.gene.sample.names = names(master.table) %>%
  stringr::str_match_all("WT_D[179]+_p1.*") %>%
  unlist()

p2.gene.sample.names = names(master.table) %>%
  stringr::str_match_all("WT_D[179]+_p2.*") %>%
  unlist()

pM.gene.sample.names = names(master.table) %>%
  stringr::str_match_all("WT_D[179]+_pM.*") %>%
  unlist()
```

Remove the gene expression master table from memory:

```{r, include=T}
# Drop the gene expression table; the name `master.table` is assigned again
# further down when the chromatin accessibility table is read.
rm(master.table)
```

Upload the chromatin accessibility master table:

```{r, include=T}
# Read the tab-separated count table of consensus peaks. The first line of the
# file is skipped (presumably the featureCounts command/comment line; TODO
# confirm) so that the second line is used as the header.
master.table = read.delim(file = "../input/consensus_peaks.mRp.clN.featureCounts.txt",
                          header = TRUE,
                          sep = "\t",
                          skip = 1)

# Strip the BAM suffix from the column names. The suffix is matched literally
# (fixed = TRUE): as a regular expression, every "." in it would match any
# character, not just a dot.
names(master.table) = gsub(pattern = ".mLb.clN.bam",
                           replacement = "",
                           x = names(master.table),
                           fixed = TRUE)

# Annotation columns of the regions, kept separately from the count columns.
region.annot = master.table %>%
  dplyr::select(Geneid,
                Chr,
                Start,
                End,
                Strand,
                Length)
```

Normalise chromatin accessibility across replicates using DESeq2:

```{r, include=T}
# One normalised accessibility table per domain (p1, p2, pM).
# generate_norm_chrom_counts() is defined outside this document; it receives
# the full count table, the region annotation columns and the domain label.
p1.norm.region.counts.deseq2 = generate_norm_chrom_counts(master.table, region.annot, "p1")

p2.norm.region.counts.deseq2 = generate_norm_chrom_counts(master.table, region.annot, "p2")

pM.norm.region.counts.deseq2 = generate_norm_chrom_counts(master.table, region.annot, "pM")
```

Scale the normalised region accessibility (calculate z-scores):

```{r, include=T}
# Scale the normalised accessibility tables with the same helper that is used
# for the gene tables; here the identifier column is "region_names".
p1.norm.region.counts = generate_scaled_counts(p1.norm.region.counts.deseq2, "region_names")

p2.norm.region.counts = generate_scaled_counts(p2.norm.region.counts.deseq2, "region_names")

pM.norm.region.counts = generate_scaled_counts(pM.norm.region.counts.deseq2, "region_names")
```

Write down the ATAC-seq sample names:

```{r, include=T}
# Collect the ATAC-seq sample names of each domain from the column names of the
# master table. The pattern keeps the part of a name that starts at "WT_D",
# continues with one or more of the digits 1, 7 and 9, then the domain tag and
# everything after it; columns without a match contribute nothing.
p1.region.sample.names = names(master.table) %>%
  stringr::str_match_all("WT_D[179]+_p1.*") %>%
  unlist()

p2.region.sample.names = names(master.table) %>%
  stringr::str_match_all("WT_D[179]+_p2.*") %>%
  unlist()

pM.region.sample.names = names(master.table) %>%
  stringr::str_match_all("WT_D[179]+_pM.*") %>%
  unlist()
```

Remove the chromatin accessibility master table from memory:

```{r, include=T}
# Drop the chromatin accessibility table; only the derived objects are used
# from here on.
rm(master.table)
```

Upload sets of NFIA-dependent regions:

```{r, include=T}
# Import the BED file of NFIA-dependent regions for each domain. The file name
# encodes the region cutoffs (fdr, min.l2fc, min.baseMean) set at the top of
# the document, so the imported set follows those settings.
p1.dep.regions = import(sprintf("../r_results/select_diff_regions/p1_dep_ranges_fdr%s_min-l2fc%s_min-baseMean%s.bed",
                                fdr, min.l2fc, min.baseMean))

p2.dep.regions = import(sprintf("../r_results/select_diff_regions/p2_dep_ranges_fdr%s_min-l2fc%s_min-baseMean%s.bed",
                                fdr, min.l2fc, min.baseMean))

pM.dep.regions = import(sprintf("../r_results/select_diff_regions/pM_dep_ranges_fdr%s_min-l2fc%s_min-baseMean%s.bed",
                                fdr, min.l2fc, min.baseMean))
```

Print the number of NFIA-dependent regions:

```{r, include=T}
# Report the number of imported regions per domain. The ranges themselves are
# counted (length() of the imported object) rather than the optional `name`
# column, which would be NULL, and so reported as 0, for a BED file
# without names.
cat("Number of NFIA-dependent regions in p1:", length(p1.dep.regions), "\n")

cat("Number of NFIA-dependent regions in p2:", length(p2.dep.regions), "\n")

cat("Number of NFIA-dependent regions in pM:", length(pM.dep.regions), "\n")
```

Upload the genome annotation for mm10 that was used for the expression quantification analysis:

```{r, include=T}
# The BED6 file lists transcripts rather than genes. The word "genes" is kept
# in the file and object names to trace the file back to the gene.bed used for
# expression quantification with the nf-core rnaseq pipeline (gene.bed also
# contains transcripts, not genes).
mm10.tx = setNames(read.delim(file = "../results/genes_bed6.bed",
                              header = FALSE),
                   c("chr", "start", "stop", "tx_name", "score", "strand"))

# Two-column table linking each gene name to a transcript name.
mm10.gene2tx = setNames(read.delim(file = "../results/mm10_genes_to_transcripts.tsv",
                                   header = FALSE),
                        c("gene_name", "tx_name"))

# Add the gene name to every transcript row; transcripts without a match keep
# their row and get NA as gene name (left join).
mm10.annot.genes = left_join(mm10.tx, mm10.gene2tx, by = "tx_name")
```

Generate genomic ranges for the TSSs of the expressed genes:

```{r, include=T}
# Build the TSS ranges of the selected genes for each domain.
# generate_tss_annot() is defined outside this document; it receives the gene
# names, the transcript annotation and the path of a BED file.
p1.expr.tss.ranges = generate_tss_annot(
  p1.expr.genes,
  mm10.annot.genes,
  file.path("../r_results/predict_correlated_expressed_gene", "p1_expr_genes_tss.bed"))

p2.expr.tss.ranges = generate_tss_annot(
  p2.expr.genes,
  mm10.annot.genes,
  file.path("../r_results/predict_correlated_expressed_gene", "p2_expr_genes_tss.bed"))

pM.expr.tss.ranges = generate_tss_annot(
  pM.expr.genes,
  mm10.annot.genes,
  file.path("../r_results/predict_correlated_expressed_gene", "pM_expr_genes_tss.bed"))
```

Calculate the distribution of distances between NFIA-dependent regions and the closest expressed TSSs (on the same strand) to make sure that regions are mostly enhancers, as it would make sense to analyse promoters (regions overlapping TSSs) separately:

```{r, include=T}
# Run the region-to-TSS distance analysis once per domain.
# calc_dist_distributions() is defined outside this document; it receives the
# NFIA-dependent regions, the TSS ranges of the selected genes and the domain
# label. Its return value is not assigned here.
calc_dist_distributions(p1.dep.regions, p1.expr.tss.ranges, "p1")

calc_dist_distributions(p2.dep.regions, p2.expr.tss.ranges, "p2")

calc_dist_distributions(pM.dep.regions, pM.expr.tss.ranges, "pM")
```

Up to 1.6% of NFIA-dependent regions are in core or proximal promoters, hence I will need to analyse them separately (and, probably, to assign them to the closest expressed TSS). For now, I continue to analyse all NFIA-dependent regions together, as the proportion of promoter ones is very small in any of the three domains.

Upload mm10 chromosome sizes:

```{r, include=T}
# Headerless two-column table: V1 is used as the chromosome name and V2 as
# its size.
mm10.chr.sizes.df = read.delim(file = "../input/mm10.chrom.sizes",
                               header = FALSE,
                               sep = "\t")

# Vector of sizes (second column) named by chromosome (first column).
mm10.chr.sizes = setNames(mm10.chr.sizes.df$V2, mm10.chr.sizes.df$V1)
```

Generate the region-gene assignment areas as vicinities of a fixed radius around the NFIA-dependent regions:

```{r, include=T}
# Build the fixed-radius vicinities around the NFIA-dependent regions of each
# domain. generate_vicinity_radius() is defined outside this document; it
# receives the regions, the radius and the path of a BED file whose name
# encodes the radius and the region cutoffs.
p1.vicinity.radius = generate_vicinity_radius(
  p1.dep.regions,
  vicinity.radius,
  sprintf(paste0("../r_results/predict_correlated_expressed_gene/",
                 "p1_vicinities_radius_%skbp_dep_regions_fdr%s_min-l2fc%s_min-baseMean%s.bed"),
          vicinity.radius, fdr, min.l2fc, min.baseMean))

p2.vicinity.radius = generate_vicinity_radius(
  p2.dep.regions,
  vicinity.radius,
  sprintf(paste0("../r_results/predict_correlated_expressed_gene/",
                 "p2_vicinities_radius_%skbp_dep_regions_fdr%s_min-l2fc%s_min-baseMean%s.bed"),
          vicinity.radius, fdr, min.l2fc, min.baseMean))

pM.vicinity.radius = generate_vicinity_radius(
  pM.dep.regions,
  vicinity.radius,
  sprintf(paste0("../r_results/predict_correlated_expressed_gene/",
                 "pM_vicinities_radius_%skbp_dep_regions_fdr%s_min-l2fc%s_min-baseMean%s.bed"),
          vicinity.radius, fdr, min.l2fc, min.baseMean))
```

Calculate Pearson correlation coefficients (PCCs) between NFIA-dependent regions and expressed genes within the assignment areas defined by a fixed radius around the regions:

```{r, include=T}
# Region-gene correlations inside the real vicinities, one table per domain.
# calc_correlations() is defined outside this document; it receives the
# vicinities, the TSS ranges, the scaled region and gene tables, the sample
# names of both assays, the domain label and the path of an RDS file. The file
# name encodes the radius, the region cutoffs and `output.suffix`.
p1.corr.radius = calc_correlations(
  p1.vicinity.radius, p1.expr.tss.ranges,
  p1.norm.region.counts, p1.norm.gene.counts,
  p1.region.sample.names, p1.gene.sample.names,
  "p1",
  sprintf(paste0("../r_results/predict_correlated_expressed_gene/",
                 "p1_corr_in_radius_%skbp_dep_regions_fdr%s_min-l2fc%s_min-baseMean%s_%s.rds"),
          vicinity.radius, fdr, min.l2fc, min.baseMean, output.suffix))

p2.corr.radius = calc_correlations(
  p2.vicinity.radius, p2.expr.tss.ranges,
  p2.norm.region.counts, p2.norm.gene.counts,
  p2.region.sample.names, p2.gene.sample.names,
  "p2",
  sprintf(paste0("../r_results/predict_correlated_expressed_gene/",
                 "p2_corr_in_radius_%skbp_dep_regions_fdr%s_min-l2fc%s_min-baseMean%s_%s.rds"),
          vicinity.radius, fdr, min.l2fc, min.baseMean, output.suffix))

pM.corr.radius = calc_correlations(
  pM.vicinity.radius, pM.expr.tss.ranges,
  pM.norm.region.counts, pM.norm.gene.counts,
  pM.region.sample.names, pM.gene.sample.names,
  "pM",
  sprintf(paste0("../r_results/predict_correlated_expressed_gene/",
                 "pM_corr_in_radius_%skbp_dep_regions_fdr%s_min-l2fc%s_min-baseMean%s_%s.rds"),
          vicinity.radius, fdr, min.l2fc, min.baseMean, output.suffix))
```

Generate background PCC distributions:

```{r, include=T}
# Background for each domain in two steps:
#   1. shuffle_regions_across_vicinities() (defined outside this document)
#      receives the real vicinities and the path of a BED file;
#   2. calc_correlations() is run on the shuffled vicinities with the same
#      inputs as for the target correlations.
# File names encode the radius and the region cutoffs; the RDS names also
# carry `output.suffix`.

# p1
p1.vicinity.radius.random = shuffle_regions_across_vicinities(
  p1.vicinity.radius,
  sprintf(paste0("../r_results/predict_correlated_expressed_gene/",
                 "p1_random_vicinities_radius_%skbp_dep_regions_fdr%s_min-l2fc%s_min-baseMean%s.bed"),
          vicinity.radius, fdr, min.l2fc, min.baseMean))

p1.bkgd.radius = calc_correlations(
  p1.vicinity.radius.random, p1.expr.tss.ranges,
  p1.norm.region.counts, p1.norm.gene.counts,
  p1.region.sample.names, p1.gene.sample.names,
  "p1",
  sprintf(paste0("../r_results/predict_correlated_expressed_gene/",
                 "p1_bkgd_in_radius_%skbp_dep_regions_fdr%s_min-l2fc%s_min-baseMean%s_%s.rds"),
          vicinity.radius, fdr, min.l2fc, min.baseMean, output.suffix))

# p2
p2.vicinity.radius.random = shuffle_regions_across_vicinities(
  p2.vicinity.radius,
  sprintf(paste0("../r_results/predict_correlated_expressed_gene/",
                 "p2_random_vicinities_radius_%skbp_dep_regions_fdr%s_min-l2fc%s_min-baseMean%s.bed"),
          vicinity.radius, fdr, min.l2fc, min.baseMean))

p2.bkgd.radius = calc_correlations(
  p2.vicinity.radius.random, p2.expr.tss.ranges,
  p2.norm.region.counts, p2.norm.gene.counts,
  p2.region.sample.names, p2.gene.sample.names,
  "p2",
  sprintf(paste0("../r_results/predict_correlated_expressed_gene/",
                 "p2_bkgd_in_radius_%skbp_dep_regions_fdr%s_min-l2fc%s_min-baseMean%s_%s.rds"),
          vicinity.radius, fdr, min.l2fc, min.baseMean, output.suffix))

# pM
pM.vicinity.radius.random = shuffle_regions_across_vicinities(
  pM.vicinity.radius,
  sprintf(paste0("../r_results/predict_correlated_expressed_gene/",
                 "pM_random_vicinities_radius_%skbp_dep_regions_fdr%s_min-l2fc%s_min-baseMean%s.bed"),
          vicinity.radius, fdr, min.l2fc, min.baseMean))

pM.bkgd.radius = calc_correlations(
  pM.vicinity.radius.random, pM.expr.tss.ranges,
  pM.norm.region.counts, pM.norm.gene.counts,
  pM.region.sample.names, pM.gene.sample.names,
  "pM",
  sprintf(paste0("../r_results/predict_correlated_expressed_gene/",
                 "pM_bkgd_in_radius_%skbp_dep_regions_fdr%s_min-l2fc%s_min-baseMean%s_%s.rds"),
          vicinity.radius, fdr, min.l2fc, min.baseMean, output.suffix))
```

Plot the background and the target distributions of the absolute PCC values:

```{r, include=T}
# For each domain: stack the target and the background correlation tables
# (target rows first), label the rows in `pair_type`, draw overlaid density
# curves of `abs_pcc` and save the plot as a PDF. The PDF name encodes the
# radius, the region cutoffs and `output.suffix`.

# p1
p1.corr.bkgd.radius = bind_rows(mutate(p1.corr.radius, pair_type = "Target"),
                                mutate(p1.bkgd.radius, pair_type = "Background"))

p1.plot = ggplot(p1.corr.bkgd.radius, aes(x = abs_pcc, fill = pair_type)) +
  geom_density(alpha = 0.5) +
  theme_classic()

ggsave(filename = sprintf(paste0("../r_results/predict_correlated_expressed_gene/plots/",
                                 "p1_corr_and_bkgd_in_radius_%skbp_dep_regions_fdr%s",
                                 "_min-l2fc%s_min-baseMean%s_%s_density.pdf"),
                          vicinity.radius, fdr, min.l2fc, min.baseMean, output.suffix),
       plot = p1.plot)

# p2
p2.corr.bkgd.radius = bind_rows(mutate(p2.corr.radius, pair_type = "Target"),
                                mutate(p2.bkgd.radius, pair_type = "Background"))

p2.plot = ggplot(p2.corr.bkgd.radius, aes(x = abs_pcc, fill = pair_type)) +
  geom_density(alpha = 0.5) +
  theme_classic()

ggsave(filename = sprintf(paste0("../r_results/predict_correlated_expressed_gene/plots/",
                                 "p2_corr_and_bkgd_in_radius_%skbp_dep_regions_fdr%s",
                                 "_min-l2fc%s_min-baseMean%s_%s_density.pdf"),
                          vicinity.radius, fdr, min.l2fc, min.baseMean, output.suffix),
       plot = p2.plot)

# pM
pM.corr.bkgd.radius = bind_rows(mutate(pM.corr.radius, pair_type = "Target"),
                                mutate(pM.bkgd.radius, pair_type = "Background"))

pM.plot = ggplot(pM.corr.bkgd.radius, aes(x = abs_pcc, fill = pair_type)) +
  geom_density(alpha = 0.5) +
  theme_classic()

ggsave(filename = sprintf(paste0("../r_results/predict_correlated_expressed_gene/plots/",
                                 "pM_corr_and_bkgd_in_radius_%skbp_dep_regions_fdr%s",
                                 "_min-l2fc%s_min-baseMean%s_%s_density.pdf"),
                          vicinity.radius, fdr, min.l2fc, min.baseMean, output.suffix),
       plot = pM.plot)
```

Hereafter, I will use the term "PCC" for "absolute PCC."

Overall, the target distributions and the background distributions look exactly the same, especially for the higher PCCs. But let us calculate adjusted p-values anyway.

Calculate empirical p-values of PCCs:

```{r, include=T}
# For each domain: compute a p-value per row with calc_empirical_pvalue()
# (defined outside this document; called row by row with the row's abs_pcc and
# the whole background table), adjust the p-values with Benjamini-Hochberg and
# flag the rows whose adjusted p-value is below `pcc.fdr`. The threshold is
# stored in a column named `fdr`; inside mutate() that column, not the global
# `fdr` constant, is what `padj < fdr` compares against.

# p1
p1.corr.radius.sign = p1.corr.radius %>%
  rowwise() %>%
  mutate(pvalue = calc_empirical_pvalue(abs_pcc, p1.bkgd.radius)) %>%
  ungroup() %>%
  mutate(padj = p.adjust(pvalue, method = "BH"),
         fdr = pcc.fdr,
         sign.padj = (padj < fdr))

cat("Number of significant PCCs ( FDR =", pcc.fdr, ") in p1, radius:",
    nrow(filter(p1.corr.radius.sign, sign.padj)), "\n")

# p2
p2.corr.radius.sign = p2.corr.radius %>%
  rowwise() %>%
  mutate(pvalue = calc_empirical_pvalue(abs_pcc, p2.bkgd.radius)) %>%
  ungroup() %>%
  mutate(padj = p.adjust(pvalue, method = "BH"),
         fdr = pcc.fdr,
         sign.padj = (padj < fdr))

cat("Number of significant PCCs ( FDR =", pcc.fdr, ") in p2, radius:",
    nrow(filter(p2.corr.radius.sign, sign.padj)), "\n")

# pM
pM.corr.radius.sign = pM.corr.radius %>%
  rowwise() %>%
  mutate(pvalue = calc_empirical_pvalue(abs_pcc, pM.bkgd.radius)) %>%
  ungroup() %>%
  mutate(padj = p.adjust(pvalue, method = "BH"),
         fdr = pcc.fdr,
         sign.padj = (padj < fdr))

cat("Number of significant PCCs ( FDR =", pcc.fdr, ") in pM, radius:",
    nrow(filter(pM.corr.radius.sign, sign.padj)), "\n")
```

Just single significant pairs:

```{r, include=T}
# For each domain: show the significant rows in the rendered document, then
# write the FULL table (significant or not) to a tab-separated file. The file
# name encodes the radius, the region cutoffs and `output.suffix`.
# NOTE(review): the files are written without a header line
# (col.names = FALSE); confirm that downstream readers expect that.

# p1
filter(p1.corr.radius.sign, sign.padj)

write.table(p1.corr.radius.sign,
            file = sprintf(paste0("../r_results/predict_correlated_expressed_gene/",
                                  "p1_corr_radius_sign_shuffled_radius_%skbp_dep_regions_fdr%s",
                                  "_min-l2fc%s_min-baseMean%s_%s.tsv"),
                           vicinity.radius, fdr, min.l2fc, min.baseMean, output.suffix),
            quote = FALSE,
            sep = "\t",
            row.names = FALSE,
            col.names = FALSE)

# p2
filter(p2.corr.radius.sign, sign.padj)

write.table(p2.corr.radius.sign,
            file = sprintf(paste0("../r_results/predict_correlated_expressed_gene/",
                                  "p2_corr_radius_sign_shuffled_radius_%skbp_dep_regions_fdr%s",
                                  "_min-l2fc%s_min-baseMean%s_%s.tsv"),
                           vicinity.radius, fdr, min.l2fc, min.baseMean, output.suffix),
            quote = FALSE,
            sep = "\t",
            row.names = FALSE,
            col.names = FALSE)

# pM
filter(pM.corr.radius.sign, sign.padj)

write.table(pM.corr.radius.sign,
            file = sprintf(paste0("../r_results/predict_correlated_expressed_gene/",
                                  "pM_corr_radius_sign_shuffled_radius_%skbp_dep_regions_fdr%s",
                                  "_min-l2fc%s_min-baseMean%s_%s.tsv"),
                           vicinity.radius, fdr, min.l2fc, min.baseMean, output.suffix),
            quote = FALSE,
            sep = "\t",
            row.names = FALSE,
            col.names = FALSE)
```

There are only five significant region-gene pairs across the three domains. 

A reason for the lack of significant region-gene pairs could be a high correlation of NFIA-dependent regions with each other, so that they are interchangeable across vicinities. Let us check this hypothesis by calculating pairwise PCC distributions between NFIA-dependent regions (here, PCCs are signed Pearson correlation coefficients in the range -1..1, not absolute values):

```{r, include=T}
# Run the pairwise-PCC analysis of NFIA-dependent regions once per domain.
# calc_and_plot_dep_region_pccs() is defined outside this document; it receives
# the regions, the scaled accessibility table and the domain label. Its return
# value is not assigned here.
calc_and_plot_dep_region_pccs(p1.dep.regions,
                              p1.norm.region.counts,
                              "p1")

calc_and_plot_dep_region_pccs(p2.dep.regions,
                              p2.norm.region.counts,
                              "p2")

calc_and_plot_dep_region_pccs(pM.dep.regions,
                              pM.norm.region.counts,
                              "pM")
```

Indeed, pairwise PCCs of NFIA-dependent regions in each domain are very high. Consequently, the regions are indeed interchangeable, and shuffling them around vicinities will not give a meaningful background.

Check region-gene pairs obtained for the strictest region set, expressed genes selected by the expression in at least a certain number of samples, vicinity radius 500 kbp, Pearson correlation method: if some NFIA-dependent regions from the pairs are actually elements and if some expressed genes from the pairs are actually NFIA-dependent:

```{r, include=T}
# Load the per-domain NFIA element sets. The file names are hard-coded to the
# cutoffs fdr0.01 / min-l2fc2 / min-baseMean100 and do not follow the `fdr`,
# `min.l2fc` and `min.baseMean` settings at the top of the document.
p1.nfia.elements = readRDS("../r_results/select_diff_regions/p1_dep_nfia_ranges_with_sites_plus-strand_80pc_match-fdr_0.05_in-regions-of_fdr0.01_min-l2fc2_min-baseMean100.rds")

p2.nfia.elements = readRDS("../r_results/select_diff_regions/p2_dep_nfia_ranges_with_sites_plus-strand_80pc_match-fdr_0.05_in-regions-of_fdr0.01_min-l2fc2_min-baseMean100.rds")

pM.nfia.elements = readRDS("../r_results/select_diff_regions/pM_dep_nfia_ranges_with_sites_plus-strand_80pc_match-fdr_0.05_in-regions-of_fdr0.01_min-l2fc2_min-baseMean100.rds")

# Look up hard-coded region identifiers among the element names (presumably
# the regions of the significant pairs of an earlier run; TODO confirm). An
# empty result means the region is not in the element set.
p1.nfia.elements[names(p1.nfia.elements) == "Interval_88153"]

p2.nfia.elements[names(p2.nfia.elements) == "Interval_33534"]

pM.nfia.elements[names(pM.nfia.elements) %in% c("Interval_5278", "Interval_169839", "Interval_199973")]

# Load the per-domain gene sets stored by the differential expression analysis.
p1.dep.genes = readRDS("../r_results/diff_expression/tables/p1_ref_genes.rds")

p2.dep.genes = readRDS("../r_results/diff_expression/tables/p2_ref_genes.rds")

pM.dep.genes = readRDS("../r_results/diff_expression/tables/pM_ref_genes.rds")

# Look up hard-coded gene names in those sets (presumably the genes of the
# same significant pairs; TODO confirm).
p1.dep.genes[p1.dep.genes == "Pla2g7"]

p2.dep.genes[p2.dep.genes == "Aldoc"]

pM.dep.genes[pM.dep.genes %in% c("Serpine2", "Timp4", "3110039I08Rik")]
```

Let us try another background then: Choose a random new place for each NFIA-dependent region so that its new vicinity does not overlap with the real target vicinities. In this way, I will calculate the correlation of the region's accessibility with the expression of genes which presumably are not regulated by any of the NFIA-dependent regions (enhancers). To make this work, I may need to decrease the vicinity radii and to sample each vicinity several times to have at least the same number of region-gene correlations as in the target distributions (random vicinities that are placed outside of real vicinities may not have enough expressed genes).

First, for different vicinity radii, count expressed genes in and outside the vicinities:

```{r, include=T}
# Total number of selected genes per domain.
p1.expr.genes.n = length(p1.expr.genes)

p2.expr.genes.n = length(p2.expr.genes)

pM.expr.genes.n = length(pM.expr.genes)

# Number of selected genes that fall inside the vicinities.
# count_expr_genes_inside_vicinity() is defined outside this document; it
# receives the vicinities and the TSS ranges, and its result is used below as
# a single count.
p1.expr.in.vicinity = count_expr_genes_inside_vicinity(p1.vicinity.radius, p1.expr.tss.ranges)

p2.expr.in.vicinity = count_expr_genes_inside_vicinity(p2.vicinity.radius, p2.expr.tss.ranges)

pM.expr.in.vicinity = count_expr_genes_inside_vicinity(pM.vicinity.radius, pM.expr.tss.ranges)

# Report the genes outside the vicinities as a count and as a percentage of
# all selected genes. The radius in the message is taken from
# `vicinity.radius`, so the text stays correct when the radius is changed
# (with the radius set to 500 the output is the same as before).
cat(paste0("The number of expressed genes outside of ", vicinity.radius, "-kbp vicinities in p1:"),
    p1.expr.genes.n - p1.expr.in.vicinity,
    "(", round((p1.expr.genes.n - p1.expr.in.vicinity) / p1.expr.genes.n * 100, 2), "%)\n")

cat(paste0("The number of expressed genes outside of ", vicinity.radius, "-kbp vicinities in p2:"),
    p2.expr.genes.n - p2.expr.in.vicinity,
    "(", round((p2.expr.genes.n - p2.expr.in.vicinity) / p2.expr.genes.n * 100, 2), "%)\n")

cat(paste0("The number of expressed genes outside of ", vicinity.radius, "-kbp vicinities in pM:"),
    pM.expr.genes.n - pM.expr.in.vicinity,
    "(", round((pM.expr.genes.n - pM.expr.in.vicinity) / pM.expr.genes.n * 100, 2), "%)\n")
```

Hence, around half of the expressed genes (or many more, in the case of p1) are outside the vicinities, so there should be enough expressed genes for the background calculations.

First, multiply each set of vicinities `vicinity.sampling.n` times to collect at least as many background region-gene pairs as in the target distribution (just one sampling gave fewer than a half of pairs for p1, for example):

```{r, include=T}
# Replicate the vicinities of each domain `vicinity.sampling.n` times.
# multiply_vicinities() is defined outside this document; it receives the
# vicinities, the number of samplings and the path of a BED file whose name
# encodes the radius, the number of samplings and the region cutoffs.
p1.vicinity.radius.mult = multiply_vicinities(
  p1.vicinity.radius, vicinity.sampling.n,
  sprintf(paste0("../r_results/predict_correlated_expressed_gene/",
                 "p1_vicinities_radius_%skbp_mult_%s_dep_regions_fdr%s_min-l2fc%s_min-baseMean%s.bed"),
          vicinity.radius, vicinity.sampling.n, fdr, min.l2fc, min.baseMean))

p2.vicinity.radius.mult = multiply_vicinities(
  p2.vicinity.radius, vicinity.sampling.n,
  sprintf(paste0("../r_results/predict_correlated_expressed_gene/",
                 "p2_vicinities_radius_%skbp_mult_%s_dep_regions_fdr%s_min-l2fc%s_min-baseMean%s.bed"),
          vicinity.radius, vicinity.sampling.n, fdr, min.l2fc, min.baseMean))

pM.vicinity.radius.mult = multiply_vicinities(
  pM.vicinity.radius, vicinity.sampling.n,
  sprintf(paste0("../r_results/predict_correlated_expressed_gene/",
                 "pM_vicinities_radius_%skbp_mult_%s_dep_regions_fdr%s_min-l2fc%s_min-baseMean%s.bed"),
          vicinity.radius, vicinity.sampling.n, fdr, min.l2fc, min.baseMean))
```

I shuffled the 500-kbp vicinities using `bedtools shuffle` (BEDTools `v2.30.0`) so that the randomly placed vicinities do not overlap the original vicinities. A command for p1 is as follows (and commands for p2 and pM are the same):

```
time bedtools shuffle \
    -i p1_vicinities_radius_500kbp_mult_3_dep_regions_fdr0.01_min-l2fc2_min-baseMean100.bed \
    -g ../../input/mm10.chrom.sizes \
    -excl p1_vicinities_radius_500kbp_mult_3_dep_regions_fdr0.01_min-l2fc2_min-baseMean100.bed \
    -seed 128 \
    -f 0 > \
    p1_random_placed_vicinities_radius_500kbp_dep_regions_fdr0.01_min-l2fc2_min-baseMean100.bed
```

All three commands together took less than a second on `Intel Xeon CPU E5-2640 v3 @ 2.60GHz` (no explicit parallelization).

This command should be run in `r_results/predict_correlated_expressed_gene`.

Generate new background PCC distributions:

```{r, include=T}
# p1.vicinity.radius.random.placed = import(paste0("../r_results/predict_correlated_expressed_gene/", 
#                                                  "p1_random_placed_vicinities_radius_500kbp_dep_regions_fdr0.01_min-l2fc2_min-baseMean100.bed"))
# 
# p2.vicinity.radius.random.placed = import(paste0("../r_results/predict_correlated_expressed_gene/", 
#                                                  "p2_random_placed_vicinities_radius_500kbp_dep_regions_fdr0.01_min-l2fc2_min-baseMean100.bed"))
# 
# pM.vicinity.radius.random.placed = import(paste0("../r_results/predict_correlated_expressed_gene/", 
#                                                  "pM_random_placed_vicinities_radius_500kbp_dep_regions_fdr0.01_min-l2fc2_min-baseMean100.bed"))

# p1.vicinity.radius.random.placed = import(paste0("../r_results/predict_correlated_expressed_gene/", 
#                                                  "p1_random_placed_vicinities_radius_250kbp_dep_regions_fdr0.01_min-l2fc2_min-baseMean100.bed"))
# 
# p2.vicinity.radius.random.placed = import(paste0("../r_results/predict_correlated_expressed_gene/", 
#                                                  "p2_random_placed_vicinities_radius_250kbp_dep_regions_fdr0.01_min-l2fc2_min-baseMean100.bed"))
# 
# pM.vicinity.radius.random.placed = import(paste0("../r_results/predict_correlated_expressed_gene/", 
#                                                  "pM_random_placed_vicinities_radius_250kbp_dep_regions_fdr0.01_min-l2fc2_min-baseMean100.bed"))

# Read the randomly placed vicinities of each domain (p1, p2, pM) from BED and
# pass every set through trim_vicinity_gr() straight after the import.
# NOTE(review): the cutoffs in these file names (radius 500 kbp, FDR 0.01,
# min l2fc 2, min baseMean 40) are hard-coded, whereas the output names used
# further down are assembled from vicinity.radius, fdr, min.l2fc and
# min.baseMean -- confirm that both refer to the same cutoff set.
p1.vicinity.radius.random.placed =
  paste0("../r_results/predict_correlated_expressed_gene/",
         "p1_random_placed_vicinities_radius_500kbp_dep_regions_fdr0.01_min-l2fc2_min-baseMean40.bed") %>%
  import() %>%
  trim_vicinity_gr()

p2.vicinity.radius.random.placed =
  paste0("../r_results/predict_correlated_expressed_gene/",
         "p2_random_placed_vicinities_radius_500kbp_dep_regions_fdr0.01_min-l2fc2_min-baseMean40.bed") %>%
  import() %>%
  trim_vicinity_gr()

pM.vicinity.radius.random.placed =
  paste0("../r_results/predict_correlated_expressed_gene/",
         "pM_random_placed_vicinities_radius_500kbp_dep_regions_fdr0.01_min-l2fc2_min-baseMean40.bed") %>%
  import() %>%
  trim_vicinity_gr()

# Background PCCs: run calc_correlations() on the randomly placed vicinities of
# each domain. The last argument is an .rds path whose name encodes the current
# cutoffs (presumably where the helper stores its result -- confirm in
# helper_functions.R).

# Tail of the .rds file name shared by the three domains.
placed.bkgd.rds.tail = paste0(vicinity.radius, "kbp_dep_regions_fdr", fdr, "_min-l2fc", min.l2fc,
                              "_min-baseMean", min.baseMean, ".rds")

p1.bkgd.radius.placed = calc_correlations(
  p1.vicinity.radius.random.placed, p1.expr.tss.ranges,
  p1.norm.region.counts, p1.norm.gene.counts,
  p1.region.sample.names, p1.gene.sample.names,
  "p1",
  paste0("../r_results/predict_correlated_expressed_gene/p1_bkgd_placed_in_radius_",
         placed.bkgd.rds.tail)
)

p2.bkgd.radius.placed = calc_correlations(
  p2.vicinity.radius.random.placed, p2.expr.tss.ranges,
  p2.norm.region.counts, p2.norm.gene.counts,
  p2.region.sample.names, p2.gene.sample.names,
  "p2",
  paste0("../r_results/predict_correlated_expressed_gene/p2_bkgd_placed_in_radius_",
         placed.bkgd.rds.tail)
)

pM.bkgd.radius.placed = calc_correlations(
  pM.vicinity.radius.random.placed, pM.expr.tss.ranges,
  pM.norm.region.counts, pM.norm.gene.counts,
  pM.region.sample.names, pM.gene.sample.names,
  "pM",
  paste0("../r_results/predict_correlated_expressed_gene/pM_bkgd_placed_in_radius_",
         placed.bkgd.rds.tail)
)
```

Plot the background and the target distributions of the absolute PCC values:

```{r, include=T}
# For each domain: stack the target pairs and the placed-background pairs into
# one table (labelled in pair_type), draw overlaid densities of abs_pcc and
# save the figure as a PDF whose name encodes the current cutoffs.

# Tail of the PDF file name shared by the three domains.
placed.density.pdf.tail = paste0(vicinity.radius, "kbp_dep_regions_fdr", fdr, "_min-l2fc", min.l2fc,
                                 "_min-baseMean", min.baseMean, "_density.pdf")

# p1
p1.corr.bkgd.radius.placed = bind_rows(
  mutate(p1.corr.radius, pair_type = "Target"),
  mutate(p1.bkgd.radius.placed, pair_type = "Background")
)

p1.plot.placed = ggplot(p1.corr.bkgd.radius.placed,
                        aes(x = abs_pcc, fill = pair_type)) +
  geom_density(alpha = 0.5) +
  theme_classic()

ggsave(filename = paste0("../r_results/predict_correlated_expressed_gene/plots/p1_corr_and_bkgd_placed_in_radius_",
                         placed.density.pdf.tail),
       plot = p1.plot.placed)

# p2
p2.corr.bkgd.radius.placed = bind_rows(
  mutate(p2.corr.radius, pair_type = "Target"),
  mutate(p2.bkgd.radius.placed, pair_type = "Background")
)

p2.plot.placed = ggplot(p2.corr.bkgd.radius.placed,
                        aes(x = abs_pcc, fill = pair_type)) +
  geom_density(alpha = 0.5) +
  theme_classic()

ggsave(filename = paste0("../r_results/predict_correlated_expressed_gene/plots/p2_corr_and_bkgd_placed_in_radius_",
                         placed.density.pdf.tail),
       plot = p2.plot.placed)

# pM
pM.corr.bkgd.radius.placed = bind_rows(
  mutate(pM.corr.radius, pair_type = "Target"),
  mutate(pM.bkgd.radius.placed, pair_type = "Background")
)

pM.plot.placed = ggplot(pM.corr.bkgd.radius.placed,
                        aes(x = abs_pcc, fill = pair_type)) +
  geom_density(alpha = 0.5) +
  theme_classic()

ggsave(filename = paste0("../r_results/predict_correlated_expressed_gene/plots/pM_corr_and_bkgd_placed_in_radius_",
                         placed.density.pdf.tail),
       plot = pM.plot.placed)
```

Overall, the target distributions and the background distributions look almost the same, but the target distribution is now consistently higher than the background distribution for high PCCs in all three domains. Let us calculate and adjust the p-values of the target region-gene pairs.

Calculate empirical p-values of PCCs:

```{r, include=T}
# For each domain: call calc_empirical_pvalue() row by row with the pair's
# abs_pcc and the placed background of the same domain, adjust the p-values
# with Benjamini-Hochberg, store the threshold pcc.fdr in column fdr and flag
# pairs with padj < fdr in sign.padj. The number of flagged pairs is reported.

# p1
p1.corr.radius.sign.placed = p1.corr.radius %>%
  rowwise() %>%
  mutate(pvalue = calc_empirical_pvalue(abs_pcc, p1.bkgd.radius.placed)) %>%
  ungroup() %>%
  mutate(padj = p.adjust(pvalue, method = "BH"),
         fdr = pcc.fdr,
         sign.padj = (padj < fdr))

cat("Number of significant PCCs ( FDR =", pcc.fdr, ") in p1, radius, placed background:",
    sum(p1.corr.radius.sign.placed$sign.padj, na.rm = TRUE), "\n")

# p2
p2.corr.radius.sign.placed = p2.corr.radius %>%
  rowwise() %>%
  mutate(pvalue = calc_empirical_pvalue(abs_pcc, p2.bkgd.radius.placed)) %>%
  ungroup() %>%
  mutate(padj = p.adjust(pvalue, method = "BH"),
         fdr = pcc.fdr,
         sign.padj = (padj < fdr))

cat("Number of significant PCCs ( FDR =", pcc.fdr, ") in p2, radius, placed background:",
    sum(p2.corr.radius.sign.placed$sign.padj, na.rm = TRUE), "\n")

# pM
pM.corr.radius.sign.placed = pM.corr.radius %>%
  rowwise() %>%
  mutate(pvalue = calc_empirical_pvalue(abs_pcc, pM.bkgd.radius.placed)) %>%
  ungroup() %>%
  mutate(padj = p.adjust(pvalue, method = "BH"),
         fdr = pcc.fdr,
         sign.padj = (padj < fdr))

cat("Number of significant PCCs ( FDR =", pcc.fdr, ") in pM, radius, placed background:",
    sum(pM.corr.radius.sign.placed$sign.padj, na.rm = TRUE), "\n")
```

There are only 8 significant pairs in pM:

```{r, include=T}
# For each domain: display the pairs flagged in sign.padj, then write the table
# to a tab-separated file whose name encodes the current cutoffs.
# Note that the complete table (not only the flagged pairs) is written, and
# that it is written without a header line and without row names.

# Tail of the .tsv file name shared by the three domains.
sign.placed.tsv.tail = paste0("radius_", vicinity.radius, "kbp_dep_regions_fdr", fdr, "_min-l2fc", min.l2fc,
                              "_min-baseMean", min.baseMean, ".tsv")

# p1
filter(p1.corr.radius.sign.placed, sign.padj)

write.table(p1.corr.radius.sign.placed,
            file = paste0("../r_results/predict_correlated_expressed_gene/p1_corr_radius_sign_placed_",
                          sign.placed.tsv.tail),
            quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)

# p2
filter(p2.corr.radius.sign.placed, sign.padj)

write.table(p2.corr.radius.sign.placed,
            file = paste0("../r_results/predict_correlated_expressed_gene/p2_corr_radius_sign_placed_",
                          sign.placed.tsv.tail),
            quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)

# pM
filter(pM.corr.radius.sign.placed, sign.padj)

write.table(pM.corr.radius.sign.placed,
            file = paste0("../r_results/predict_correlated_expressed_gene/pM_corr_radius_sign_placed_",
                          sign.placed.tsv.tail),
            quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)
```

Four genes (Serpine2, Aldoc, Timp4 and 3110039I08Rik) were also in significant pairs with the previous background, although Aldoc was in a significant pair found in p2, not pM.