-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathinit_dataLoad.R
62 lines (55 loc) · 3.05 KB
/
init_dataLoad.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
ts=proc.time()[3]
options(stringsAsFactors=F)
library(dplyr)
library(randomForest)
library(caret)
library(doParallel)
library(scales)
##-------------------------------Load Data---------------------------------------------------------
fnge='CCLE_DepMap_18q3_RNAseq_RPKM_20180718.hgnc_sym_log2_rpkm_plus_1.rdata' #gene expressions file (format is .rdata)
fnscore='ExpandedGeneZSolsCleaned.csv' ## dependency scores file (format is csv)
m=get(load(fnge)) ## load gene expressions
score=read.table(fnscore, sep=',' ,header=T, row.names=1, check.names=F) #read the score csv file it
m_sd<-get(load("CCLE_DepMap_18q3_RNAseq_RPKM_20180718.hgnc_sym_log2_rpkm_plus_1.mean_sd.rdata"))# load r data w/ the gene expression standard deviations
## LOADS THE CUT DATA THAT IS GENERATED FROM seesaw_tscore_gene_importance_filter
#load("seesaw_results_gene_imp_cut.rdata")
cl1=colnames(score)## names of cell lines in scores
cl2=colnames(m)## names of cell lines in gene expression data
##> cl2[1:10]
## [1] "22RV1_PROSTATE (ACH-000956)"
## [2] "2313287_STOMACH (ACH-000948)"
## [3] "253JBV_URINARY_TRACT (ACH-000026)"
## [4] "253J_URINARY_TRACT (ACH-000011)"
## [5] "42MGBA_CENTRAL_NERVOUS_SYSTEM (ACH-000323)"
## [6] "5637_URINARY_TRACT (ACH-000905)"
## [7] "59M_OVARY (ACH-000520)"
## [8] "639V_URINARY_TRACT (ACH-000973)"
## [9] "647V_URINARY_TRACT (ACH-000896)"
##[10] "697_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE (ACH-000070)"
cl2_=stringr::str_split(cl2, ' ', simplify=T)[,1] ## splits the gene expression cell line names from its ID in the original database
##> str(cl1)
## chr [1:501] "22RV1_PROSTATE" "697_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE" ...
##> str(cl2)
## chr [1:1156] "22RV1_PROSTATE (ACH-000956)" ...
##> str(cl2_)
## chr [1:1156] "22RV1_PROSTATE" "2313287_STOMACH" ...
common=intersect(cl1, cl2_) ## checks for cell lines that exists in both datasets
##> str(common)
## chr [1:487] "22RV1_PROSTATE" "697_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE" ...
colnames(m)=cl2_ ## remove the database ID from m
score=score[, common] ## filter cell lines to what's common
m=m[, common]
m<-t(m) ## transpose matrix
score<-t(score) ## transpose matrix
##-------------------------------------------------Data Processing, SDDI Project------------------------------------------------------------------
genes<-rownames(m_sd) ## names of all the genes
m_sd<-as.data.frame(m_sd, row.names=TRUE) ## convert standard deviation data to a data frame
m_sd$gene<-genes ## create a column with gene names
m_sd<-m_sd%>%arrange(desc(sd)) ## sort the genes my standard deviation largest-> smallest; done with the dplyr package
gene_most_var<-m_sd$gene[1:200] ## select names of the 200 genes with highest SD
NO_HAEM<-!grepl("HAE",rownames(m)) ## logical vector, haematopoietic=true, not=false
## sets the filter
filt=NO_HAEM
## select cells according to filter
m_filtered<-m[filt,]
score_filtered<-score[filt,]