forked from lzw150/DIA-BERT
-
Notifications
You must be signed in to change notification settings - Fork 0
/
FigureS1.R
191 lines (178 loc) · 9.82 KB
/
FigureS1.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
path="D:/LZW/Guomics/DIA-BERT/Benchmark/fdrs_DIANN19/"
setwd(path)
#DIANN proteome
DIA_NN_Human_proteome <- readxl::read_xlsx('DIA_NN_19_lib_proteome.xlsx',sheet = 'Human_proteome')
DIA_NN_Three_species_proteome <- readxl::read_xlsx('DIA_NN_19_lib_proteome.xlsx',sheet = 'Three_species_proteome')
#DIA_BERT proteome
DIA_BERT_Human_proteome <- readxl::read_xlsx('DIA_BERT_proteome.xlsx',sheet = 'Human_proteome')
DIA_BERT_Three_species_proteome <- readxl::read_xlsx('DIA_BERT_proteome.xlsx',sheet = 'Three_species_proteome')
pgs=read.csv('pgs.csv',row.names = 1)
###summary of number of peptides per protein
peptides_per_protein <- function(data,Dataset){
#data=DIA_BERT_Human_proteome; Dataset='Human_tissue'
rn=unique(data$Protein_id)
rn=rn[!is.na(rn)]
cn=c(sort(unique(data$Filename)))
D1=as.data.frame(matrix(ncol=length(cn), nrow=length(rn)))
colnames(D1)=cn
rownames(D1)=rn
for(i in cn){
T1=data[data$Filename==i, ]
T2=table(T1$Protein_id[T1$Protein_id!='' & T1$Dataset==Dataset])
D1[names(T2),i]=T2
}
round(apply(D1,1,mean,na.rm=T),0)
}
DIA_BERT_summary_count_Human_tissue <- peptides_per_protein(DIA_BERT_Human_proteome, Dataset='Human_tissue')
DIA_NN_summary_count_Human_tissue <- peptides_per_protein(DIA_NN_Human_proteome, Dataset='Human_tissue')
DIA_BERT_summary_count_Human_tissue=DIA_BERT_summary_count_Human_tissue[!is.na(DIA_BERT_summary_count_Human_tissue)]
DIA_NN_summary_count_Human_tissue =DIA_NN_summary_count_Human_tissue[!is.na(DIA_NN_summary_count_Human_tissue)]
DIA_BERT_summary_count_Human_tissue[DIA_BERT_summary_count_Human_tissue>20]=20
DIA_NN_summary_count_Human_tissue[DIA_NN_summary_count_Human_tissue>20]=20
pdf('Points plot for summary of number of peptides per protein.pdf', width = 8, height = 5)
plot(table(DIA_BERT_summary_count_Human_tissue),col='#D78B61', pch=19, cex=0.5, xlim=c(0,20),type='b',lwd=1, ylab='Numbers', main='Number of precursors per protein',ylim=c(0,4000))#
lines(table(DIA_NN_summary_count_Human_tissue),col='#8FBFA4',pch=19, cex=0.5, xlim=c(0,20),type='b',lwd=1)
# Add a legend
legend("topright", legend = c('Human tissue (DIA-BERT)', 'Human tissue (DIA-NN)'),
col=c("#D78B61" , "#8FBFA4"),
pch = '_', bty = "n", pt.cex = 3, cex = 1.2, horiz = F, inset = c(0.1, 0.1))
dev.off()
sum(DIA_BERT_summary_count_Human_tissue<10);sum(DIA_BERT_summary_count_Human_tissue>=10);#9501;1640
sum(DIA_NN_summary_count_Human_tissue<10);sum(DIA_NN_summary_count_Human_tissue>=10);#7691;1323
###quantification comparison
boxplot2 <- function(data,main,lab1="Common",lab2="Unique",ylab1="Log2intensity",addpoints=TRUE){
#data=prs_Human_proteome_long; main="Human proteome (precursors)"
par(mar=c(3,4,3,1))
myplot <- boxplot(quant ~ type*var , data=data,
boxwex=0.4 , ylab=ylab1,
main=main,
col=c( "slateblue1" , "tomato"),
xaxt="n", cex=0.1,xlab = '')
# To add the label of x axis
my_names <- sapply(strsplit(myplot$names , '\\.') , function(x) x[[2]] )
my_names <- my_names[seq(1 , length(my_names) , 2)]
axis(1,
at = seq(1.5 , length(my_names)*2 , 2),
labels = my_names , las=2,
tick=FALSE , cex=0.3)
# Add the grey vertical lines
for(i in seq(0.5 , length(my_names)*2+1 , 2)){
abline(v=i,lty=1, col="grey")
}
if(addpoints==TRUE){
# Add data points
data$type2=paste(data$var,data$type,sep='_')
mylevels <- sort(unique(data$type2))
levelProportions <- table(data$type2)/nrow(data)
for(i in 1:length(mylevels)){
#i=2
thislevel <- mylevels[i]
thisvalues <- data[data$type2==thislevel, "quant"]
# take the x-axis indices and add a jitter, proportional to the N in each level
myjitter <- jitter(rep(i, length(thisvalues)), amount=levelProportions[mylevels[i]]/2)
points(myjitter, thisvalues, pch=20, col=rgb(0,0,0,.9))
}
}
# Add a legend
legend("topright", legend = c(lab1, lab2),
col=c("slateblue1" , "tomato"),
pch = 15, bty = "n", pt.cex = 3, cex = 1.2, horiz = F, inset = c(0.1, 0.1))
}
##comparison of quantification
comparison_of_quantification <- function(dataset,filenames,prs_file,pgs_file,rename,type,proteome1,proteome2,pdfwidth=8,subdir='DIA_BERT/',softaware='In DIA-BERT '){
#dataset='Tumor15_IPX0001981000';filenames=rownames(pgs)[-c(1:6)];
#prs_file='DIA_BERT_precursor1_Tumor15_IPX0001981000.tsv';pgs_file='DIA_BERT_protein1_Tumor15_IPX0001981000.tsv';
#rename=paste(pgs$rks_id[-c(1:6)],pgs$Disease[-c(1:6)],sep='_');type='Human_tissue';
#proteome1=DIA_BERT_Human_proteome;proteome2=DIA_NN_Human_proteome
#pdfwidth=8;subdir='DIA_BERT/';softaware='In DIA-BERT '
cat(dataset,'\n',
filenames,'\n',
prs_file,'\n',
pgs_file,'\n',
rename,'\n',
type,'\n')
path_1=paste0(path,subdir,dataset)
#precursors
prs_quant_ <- read.table(paste0(path_1,'/',prs_file), sep='\t',header = T)
Precursor_ <- prs_quant_$Precursor
prs_quant_1 <- prs_quant_[,-c(1:3)]
rownames(prs_quant_1) <- Precursor_
colnames(prs_quant_1) <- gsub('.mzML$','', colnames(prs_quant_1))
colnames(prs_quant_1) <- gsub('\\.','-', colnames(prs_quant_1))
colnames(prs_quant_1) <- gsub('^X','', colnames(prs_quant_1))
prs_quant_1=prs_quant_1[, filenames]
colnames(prs_quant_1) <- rename
prs_quant_1[prs_quant_1==0]=NA
#proteins
pgs_quant_ <- read.table(paste0(path_1,'/',pgs_file), sep='\t',header = T)
pgs_quant_ <- pgs_quant_[pgs_quant_$Protein_id!='',]
rownames(pgs_quant_) <- pgs_quant_$Protein_id
colnames(pgs_quant_) <- gsub('.mzML$','', colnames(pgs_quant_))
colnames(pgs_quant_) <- gsub('\\.','-', colnames(pgs_quant_))
colnames(pgs_quant_) <- gsub('^X','', colnames(pgs_quant_))
pgs_quant_=pgs_quant_[, filenames]
colnames(pgs_quant_) <- rename
pgs_quant_[pgs_quant_==0]=NA
#comparison of precursors
colnames(proteome1)[colnames(proteome1)=='transition_group_id']='Precursor'
colnames(proteome1)[colnames(proteome1)=='Protein']='Protein_id'
colnames(proteome2)[colnames(proteome2)=='transition_group_id']='Precursor'
colnames(proteome2)[colnames(proteome2)=='Protein']='Protein_id'
set1=proteome1$Precursor[proteome1$Dataset==type]
set2=proteome2$Precursor[proteome2$Dataset==type]
set1=unique(set1[set1!='' & !is.na(set1)]); cat('precursors set1:',length(unique(set1)),'\n') #
set2=unique(set2[set2!='' & !is.na(set2)]); cat('precursors set2:',length(unique(set2)),'\n') #
prs_quant_1_unique=prs_quant_1[setdiff(set1, set2),]
prs_quant_1_common=prs_quant_1[intersect(set1, set2),]
prs_quant_1_long=data.frame()
for(i in colnames(prs_quant_1)){
#i=colnames(prs_quant_1)[1]
T1=data.frame(var=i, type='Unique precursors', quant=log(prs_quant_1_unique[,i],2))
T2=data.frame(var=i, type='Common precursors', quant=log(prs_quant_1_common[,i],2))
prs_quant_1_long=rbind(prs_quant_1_long, T1, T2)
}
prs_quant_1_long=prs_quant_1_long[!is.na(prs_quant_1_long$quant) & prs_quant_1_long$quant!='', ]
#ttest for precursor quant comparison
cat('ttest for precursor quant comparison\n')
print(sapply(colnames(prs_quant_1),function(x){
t.test(log(prs_quant_1_unique[,x],2),
log(prs_quant_1_common[,x],2))$p.value
}))
#comparison of proteins
set1=proteome1$Protein_id[proteome1$Dataset==type]
set2=proteome2$Protein_id[proteome2$Dataset==type]
set1=unique(set1[set1!='' & !is.na(set1)]); cat('proteins set1:',length(unique(set1)),'\n') #
set2=unique(set2[set2!='' & !is.na(set2)]); cat('proteins set2:',length(unique(set2)),'\n') #
pgs_quant__unique=pgs_quant_[setdiff(set1, set2),]
pgs_quant__common=pgs_quant_[intersect(set1, set2),]
pgs_quant__long=data.frame()
for(i in colnames(pgs_quant_)){
#i=colnames(pgs_quant_)[1]
T1=data.frame(var=i, type='Unique proteins', quant=log(pgs_quant__unique[,i],2))
T2=data.frame(var=i, type='Common proteins', quant=log(pgs_quant__common[,i],2))
pgs_quant__long=rbind(pgs_quant__long, T1, T2)
}
pgs_quant__long=pgs_quant__long[!is.na(pgs_quant__long$quant) & pgs_quant__long$quant!='', ]
#ttest for protein quant comparison
cat('ttest for protein quant comparison\n')
print(sapply(colnames(pgs_quant_),function(x){
t.test(log(pgs_quant__unique[,x],2),
log(pgs_quant__common[,x],2))$p.value
}))
#boxplot for comparison between unique and common prs or pgs
pdf(paste0(path,softaware,type,' proteome boxplot for comparison between unique and common prs or pgs.pdf'), width=pdfwidth,height = 5)
boxplot2(prs_quant_1_long, main=paste0(type," proteome (precursors)"),addpoints=FALSE)
boxplot2(pgs_quant__long, main=paste0(type," proteome (proteins)"),addpoints=FALSE)
dev.off()
return(list(prs_quant_1_long, pgs_quant__long))
}
#In DIA-BERT
quant_long_Tumor15_IPX0001981000_DIA_BERT = comparison_of_quantification(dataset='Tumor15_IPX0001981000',filenames=rownames(pgs)[-c(1:6)],
prs_file='DIA_BERT_precursor1_Tumor15_IPX0001981000.tsv',pgs_file='DIA_BERT_protein1_Tumor15_IPX0001981000.tsv',
rename=paste(pgs$rks_id[-c(1:6)],pgs$Disease[-c(1:6)],sep='_'),type='Human_tissue',
proteome1=DIA_BERT_Human_proteome,proteome2=DIA_NN_Human_proteome)
#In DIA-NN
quant_long_human_DIA_NN = comparison_of_quantification(dataset='Tumor15_IPX0001981000',filenames=rownames(pgs)[-c(1:6)],
prs_file='DIA_NN_precursor1_Tumor15_IPX0001981000.tsv',pgs_file='DIA_NN_protein1_Tumor15_IPX0001981000.tsv',
rename=paste(pgs$rks_id[-c(1:6)],pgs$Disease[-c(1:6)],sep='_'),type='Human_tissue',subdir='DIA_NN_19_lib/',softaware='In DIA-NN ',
proteome1=DIA_NN_Human_proteome,proteome2=DIA_BERT_Human_proteome)