-
Notifications
You must be signed in to change notification settings - Fork 3
/
gene_exprs_data_prep.R
148 lines (115 loc) · 4.59 KB
/
gene_exprs_data_prep.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
## It is assumed your working directory is where this file is
# Clear R console screen output by printing a form-feed character ("\014").
# NOTE(review): presumably this only has a visible effect in consoles that
# honour form feed (e.g. RStudio) - confirm.
cat("\014")
# Load required libraries
#library(CovariateAnalysis)
library(data.table)
library(tidyr)
library(plyr)
library(dplyr)
library(stringr)
library(synapseClient)
library(knitr)
library(githubr)
library(biomaRt)
library(ComplexHeatmap)
# synapseLogin()
# Utility function: fetch a delimited (tsv/csv) file from Synapse and load it
# into memory.
#
# Arguments:
#   id   Synapse entity id, handed to synapseClient::synGet(); the file is read
#        from the @filePath slot of the returned object.
#   ...  Extra arguments forwarded unchanged to data.table::fread().
#
# Returns the parsed table as a plain data.frame (data.table = FALSE), with the
# first line of the file treated as the header (header = TRUE).
downloadFile <- function(id, ...) {
  # TRUE/FALSE are spelled out because T and F are ordinary, reassignable
  # variables; the call is the function's value, so no temporary is needed.
  data.table::fread(
    synapseClient::synGet(id)@filePath,
    data.table = FALSE,
    header = TRUE,
    ...
  )
}
# Input tables pulled from Synapse via downloadFile():
#   covariates - sample covariates
#   logcpm     - logcpm expression table
#   diffexp    - differential expression results
covariates <- downloadFile("syn6132532")
logcpm <- downloadFile("syn6132534")
diffexp <- downloadFile("syn6132536")
# Build `counts`: the logcpm rows that also appear in the differential
# expression results.
#
# Disabled alternative, kept for reference:
# counts <- logcpm %>%
# dplyr::select(-Gene.ID) %>%
# filter(ensembl_gene_id %in% diffexp$ensembl_gene_id) %>%
# group_by(ensembl_gene_id) %>%
# slice(1) %>%
# ungroup() %>%
# as.data.frame()

# The first column is renamed to "GeneID" (removes the period from Gene.ID)
names(logcpm)[1] <- "GeneID"

# Keep a row only when all three conditions hold:
#   * GeneID is not NA
#   * its ensembl_gene_id occurs in diffexp
#   * its ensembl_gene_id contains "ENSG", which removes non-Ensembl entries
#     (e.g. '_alignment_not_sufficient_')
counts <- logcpm %>%
  dplyr::filter(
    !is.na(GeneID),
    ensembl_gene_id %in% diffexp$ensembl_gene_id,
    grepl("ENSG", ensembl_gene_id)
  ) %>%
  as.data.frame()

# Rows are named by Ensembl gene id
rownames(counts) <- counts$ensembl_gene_id
# counts$ensembl_gene_id <- NULL   (disabled: the id column stays in counts here)

# Any remaining NA cell becomes 0
counts[is.na(counts)] <- 0

# One-column data frame of the retained Ensembl gene ids, used below as a filter
counts_genes <- data.frame(ensembl_gene_id = counts$ensembl_gene_id)
# Index covariates by sample ID so they can be matched against the column
# names of counts
covariates <- data.frame(covariates)
rownames(covariates) <- covariates$ID

# Restrict counts (columns) and covariates (rows) to the samples present in
# both, in the order they appear in covariates. The shared set is computed once;
# drop = FALSE keeps counts a data frame even if only one sample is shared.
shared_samples <- intersect(rownames(covariates), colnames(counts))
counts <- counts[, shared_samples, drop = FALSE]
covariates <- covariates[shared_samples, ]

# Sort the samples and keep only the columns used downstream.
# arrange() does not carry row names over, so they are set again afterwards.
covariates <- covariates %>%
  arrange(Study, BrainRegion, Status, Gender) %>%
  dplyr::select(ID, Study, BrainRegion, Status, Gender) %>%
  data.frame()
rownames(covariates) <- covariates$ID

# Convert factor variables to character. vapply() always yields a logical
# vector, whereas sapply() changes its return type on empty input.
is_factor_col <- vapply(covariates, is.factor, logical(1))
covariates[is_factor_col] <- lapply(covariates[is_factor_col], as.character)
# Wide table of log fold changes: one row per (ensembl_gene_id, hgnc_symbol),
# one column per Study_Tissue_Contrast label, sorted by ensembl_gene_id.
# Only rows with a non-missing hgnc_symbol whose gene is listed in
# counts_genes are used.
ad_data <- diffexp %>%
  dplyr::filter(
    !is.na(hgnc_symbol),
    ensembl_gene_id %in% counts_genes$ensembl_gene_id
  ) %>%
  dplyr::select(ensembl_gene_id, hgnc_symbol, Study, Tissue, Contrast, logFC) %>%
  tidyr::unite(Study.Tissue.Contrast, Study, Tissue, Contrast, sep = "_") %>%
  tidyr::spread(Study.Tissue.Contrast, logFC) %>%
  dplyr::arrange(ensembl_gene_id)

# Value columns only, with the Ensembl gene ids moved into the row names
ad_data_matrix <- dplyr::select(ad_data, -ensembl_gene_id, -hgnc_symbol)
rownames(ad_data_matrix) <- ad_data$ensembl_gene_id
# Sample-level annotation: one row per distinct Study/Tissue/Contrast
# combination found in diffexp (rows with a missing hgnc_symbol are ignored).
# The combined label is added as Study.Tissue.Contrast while the three source
# columns are kept (remove = FALSE); rows are sorted and named by that label.
phenoData <- diffexp %>%
  dplyr::filter(!is.na(hgnc_symbol)) %>%
  dplyr::select(Study, Tissue, Contrast) %>%
  tidyr::unite(
    Study.Tissue.Contrast, Study, Tissue, Contrast,
    sep = "_", remove = FALSE
  ) %>%
  unique() %>%
  dplyr::arrange(Study.Tissue.Contrast) %>%
  data.frame()
rownames(phenoData) <- phenoData$Study.Tissue.Contrast
# Gene-level annotation: the distinct (ensembl_gene_id, hgnc_symbol) pairs of
# diffexp that have a non-missing hgnc_symbol and a gene listed in
# counts_genes, sorted and named by ensembl_gene_id.
featureData <- diffexp %>%
  dplyr::filter(
    !is.na(hgnc_symbol),
    ensembl_gene_id %in% counts_genes$ensembl_gene_id
  ) %>%
  dplyr::select(ensembl_gene_id, hgnc_symbol) %>%
  unique() %>%
  dplyr::arrange(ensembl_gene_id) %>%
  data.frame()
rownames(featureData) <- featureData$ensembl_gene_id
# ExpressionSet of log fold changes (rows: genes, columns: Study_Tissue_Contrast).
#
# phenoData and featureData are reindexed by the dimnames of ad_data_matrix so
# that their rows line up with the assay matrix. The matrix columns are ordered
# by spread() while phenoData is ordered by arrange(), and the two are not
# guaranteed to sort the labels identically; phenoData is also built without
# the counts_genes filter. When the orders already agree this is a no-op.
#
# NOTE(review): ExpressionSet() and AnnotatedDataFrame() are Biobase functions,
# and no library(Biobase) call is visible in this file - confirm Biobase is on
# the search path when this runs.
eset.logFC <- ExpressionSet(
  assayData = as.matrix(ad_data_matrix),
  phenoData = AnnotatedDataFrame(
    phenoData[colnames(ad_data_matrix), , drop = FALSE]
  ),
  featureData = AnnotatedDataFrame(
    featureData[rownames(ad_data_matrix), , drop = FALSE]
  )
)
# Wide table of adjusted p-values (adj.P.Val): one row per
# (ensembl_gene_id, hgnc_symbol), one column per Study_Tissue_Contrast label,
# sorted by ensembl_gene_id. Row filters are the same as for the logFC table.
ad_data_pvalue <- diffexp %>%
  dplyr::filter(
    !is.na(hgnc_symbol),
    ensembl_gene_id %in% counts_genes$ensembl_gene_id
  ) %>%
  dplyr::select(ensembl_gene_id, hgnc_symbol, Study, Tissue, Contrast, adj.P.Val) %>%
  tidyr::unite(Study.Tissue.Contrast, Study, Tissue, Contrast, sep = "_") %>%
  tidyr::spread(Study.Tissue.Contrast, adj.P.Val) %>%
  dplyr::arrange(ensembl_gene_id)

# ad_data_matrix is overwritten here: from this point it holds the p-value
# columns, with the Ensembl gene ids as row names
ad_data_matrix <- dplyr::select(ad_data_pvalue, -ensembl_gene_id, -hgnc_symbol)
rownames(ad_data_matrix) <- ad_data_pvalue$ensembl_gene_id
# ExpressionSet of the values currently held in ad_data_matrix
# (rows: genes, columns: Study_Tissue_Contrast).
#
# phenoData and featureData are reindexed by the dimnames of ad_data_matrix so
# that their rows line up with the assay matrix: the matrix columns are ordered
# by spread() while phenoData is ordered by arrange(), and the two are not
# guaranteed to sort the labels identically. When the orders already agree
# this is a no-op.
eset.pval <- ExpressionSet(
  assayData = as.matrix(ad_data_matrix),
  phenoData = AnnotatedDataFrame(
    phenoData[colnames(ad_data_matrix), , drop = FALSE]
  ),
  featureData = AnnotatedDataFrame(
    featureData[rownames(ad_data_matrix), , drop = FALSE]
  )
)
# Final output for heatmap viewer/explorer: expression values from counts,
# with rows taken in featureData order and columns in covariates order so the
# assay matrix lines up with both annotation tables.
# drop = FALSE keeps the selection a matrix even when only one gene or one
# sample is left; without it the result would collapse to a plain vector.
eset.mRNA <- ExpressionSet(
  assayData = as.matrix(counts)[
    rownames(featureData), rownames(covariates),
    drop = FALSE
  ],
  phenoData = AnnotatedDataFrame(covariates),
  featureData = AnnotatedDataFrame(featureData)
)