-
Notifications
You must be signed in to change notification settings - Fork 0
/
compute_binding_Kd.Rmd
executable file
·406 lines (324 loc) · 25.3 KB
/
compute_binding_Kd.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
---
title: "Compute per-barcode CGG binding affinity"
author: "Tyler Starr"
date: "8/6/2021"
output:
github_document:
html_preview: false
editor_options:
chunk_output_type: inline
---
This notebook reads in per-barcode counts from `count_variants.ipynb` for CGG-binding Tite-seq experiments, computes functional scores for scFv CGG-binding affinity, and does some basic QC on variant binding functional scores.
```{r setup, message=FALSE, warning=FALSE, error=FALSE}
# Attach knitr with library() so that a missing installation stops the notebook
# immediately (require() would only return FALSE and carry on).
library("knitr")
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(dev.args = list(png = list(type = "cairo")))

# list of packages to install/load
packages <- c("yaml", "data.table", "tidyverse", "gridExtra")

# install any packages not already installed
installed_packages <- packages %in% rownames(installed.packages())
if (!all(installed_packages)) {
  install.packages(packages[!installed_packages])
}

# load packages
invisible(lapply(packages, library, character.only = TRUE))

# read in config file
config <- read_yaml("config.yaml")

# make output directory; recursive = TRUE so that a nested output path whose
# parent directories do not exist yet is still created
if (!dir.exists(config$Titeseq_Kds_dir)) {
  dir.create(file.path(config$Titeseq_Kds_dir), recursive = TRUE)
}
```
Session info for reproducing environment:
```{r print_sessionInfo}
# Print the R version, platform, and loaded package versions into the knitted output
sessionInfo()
```
## Setup
First, we will read in metadata on our sort samples, the table giving number of reads of each barcode in each of the sort bins, and the barcode-variant lookup tables, and merge these tables together.
```{r input_data}
# Load the table of barcode sequencing runs and drop its R1 column
barcode_runs <- read.csv(file = config$barcode_runs, stringsAsFactors = FALSE)
barcode_runs <- subset(barcode_runs, select = -c(R1))

# keep only the runs whose sample_type is "TiteSeq"
barcode_runs <- barcode_runs[barcode_runs$sample_type == "TiteSeq", ]
titeseq_runs <- barcode_runs$sample_type == "TiteSeq"

# per-barcode read counts in each sort partition, restricted to the samples
# listed in the Tite-seq rows of barcode_runs
counts <- data.table(
  read.csv(file = config$variant_counts_file, stringsAsFactors = FALSE)
)
counts <- subset(counts, sample %in% barcode_runs[titeseq_runs, "sample"])

# barcode-variant lookup table, joined to the counts on library + barcode
dt <- data.table(
  read.csv(file = config$codon_variant_table_file, stringsAsFactors = FALSE)
)
dt <- merge(counts, dt, by = c("library", "barcode"))
rm(counts)

# table pairing each Tite-seq sample name (sample_type + zero-padded
# concentration index) with its CGG incubation concentration
# NOTE(review): the concentrations are hard-coded and are matched to the sample
# names purely by position -- confirm the order against barcode_runs
titeseq_labels <- unique(paste(
  barcode_runs[titeseq_runs, "sample_type"],
  formatC(barcode_runs[titeseq_runs, "concentration"], width = 2, flag = "0"),
  sep = "_"
))
samples_TiteSeq <- data.frame(
  sample = titeseq_labels,
  conc = c(10^-6, 10^-7, 10^-8, 10^-9, 10^-10, 10^-11, 10^-12, 10^-13, 0)
)
```
Convert from Illumina read counts to estimates of the number of cells that were sorted into a bin, and add some other useful information to our data frame.
```{r downweight_counts_by_cells}
# For each sorted bin (one row of barcode_runs), rescale read counts so that they
# reflect the number of cells recovered in that bin. seq_len() keeps the loop from
# running at all when barcode_runs is empty (1:nrow() would iterate over 1 and 0).
for (i in seq_len(nrow(barcode_runs))) {
  lib <- as.character(barcode_runs$library[i])
  bin <- as.character(barcode_runs$sample[i])
  # reads per sorted cell in this library/bin
  ratio <- dt[library == lib & sample == bin, sum(count)] / barcode_runs$number_cells[i]
  if (ratio < 1) {
    # fewer reads than cells sorted: leave the counts un-normalized
    dt[library == lib & sample == bin, count.norm := as.numeric(count)]
    # print to console to inform of undersampled bins
    print(paste("reads < cells for", lib, bin, ", un-normalized (ratio", ratio, ")"))
  } else {
    # normalize read counts by the average read:cell ratio
    dt[library == lib & sample == bin, count.norm := as.numeric(count / ratio)]
    print(paste("read:cell ratio for", lib, bin, "is", ratio))
  }
}

# annotate each barcode with its variant class: wildtype (no codon changes),
# synonymous, stop (any "*" among the amino-acid substitutions), exactly one
# nonsynonymous substitution, or more than one nonsynonymous substitution
dt[, variant_class := as.character(NA)]
dt[n_codon_substitutions == 0, variant_class := "wildtype"]
dt[n_codon_substitutions > 0 & n_aa_substitutions == 0, variant_class := "synonymous"]
dt[n_aa_substitutions > 0 & grepl("*", aa_substitutions, fixed = TRUE), variant_class := "stop"]
dt[n_aa_substitutions == 1 & !grepl("*", aa_substitutions, fixed = TRUE), variant_class := "1 nonsynonymous"]
dt[n_aa_substitutions > 1 & !grepl("*", aa_substitutions, fixed = TRUE), variant_class := ">1 nonsynonymous"]

# cast to wide format: one row per barcode, one count.norm column per sample
dt <- dcast(
  dt,
  library + barcode + target + variant_class + aa_substitutions + n_aa_substitutions ~ sample,
  value.var = "count.norm"
)
```
## Calculating mean bin for each barcode at each sample concentration
Next, for each barcode at each of the CGG concentrations, calculate the "mean bin" response variable. This is calculated as a simple mean, where the value of each bin is the integer value of the bin (bin1=unbound, bin4=highly bound) -- because of how bins are defined, the mean fluorescence of cells in each bin are equally spaced on a log-normal scale, so mean bin correlates with simple mean fluorescence.
We do not use the fluorescence boundaries of the FACS bins in our calculations here, but we provide them for posterity's sake below. The fluorescence boundaries for bins 1-4 are as follows:
```
(-288, 136), (137, 2000), (2001, 29421), (29422, 262143)
```
```{r calculate_mean_bin}
# Mean bin and total cell count for one barcode at one concentration.
#
# vec            numeric vector of cell counts in bins 1-4 (in that order)
# split13filter  fraction of the total above which bins 1 AND 3 both being
#                populated marks the sample as bimodal
# split24filter  same, for bins 2 and 4
# split14filter  same, for bins 1 and 4
#
# Returns list(mean bin, total count). Both are NA_real_ when the total is NA or
# when any bimodality filter trips. Typed NA_real_ (not logical NA) is returned so
# the result has the same type as a successful computation when it is assigned
# into a column group by group.
calc.meanbin <- function(vec, split13filter = 0.4, split24filter = 0.4, split14filter = 0.2) {
  total <- sum(vec)
  if (is.na(total)) {
    return(list(NA_real_, NA_real_))
  }
  # scalar tests, so use the short-circuit operators
  bimodal <-
    (vec[1] > split13filter * total && vec[3] > split13filter * total) ||
    (vec[2] > split24filter * total && vec[4] > split24filter * total) ||
    (vec[1] > split14filter * total && vec[4] > split14filter * total)
  if (bimodal) {
    return(list(NA_real_, NA_real_))
  }
  # count-weighted average of the integer bin labels 1-4
  mean_bin <- (vec[1] * 1 + vec[2] * 2 + vec[3] * 3 + vec[4] * 4) /
    (vec[1] + vec[2] + vec[3] + vec[4])
  list(mean_bin, total)
}
# For every Tite-seq sample (concentration), compute the mean bin and total cell
# count of each barcode from its four bin columns. seq_len() makes the loop a
# no-op when samples_TiteSeq has no rows (1:nrow() would iterate over 1 and 0).
for (i in seq_len(nrow(samples_TiteSeq))) {
  sample_name <- as.character(samples_TiteSeq[i, "sample"])
  # output column names for this concentration
  meanbin_out <- paste0(sample_name, "_meanbin")
  totalcount_out <- paste0(sample_name, "_totalcount")
  # input column names: cell counts in bins 1-4 of this concentration
  bin1_in <- paste0(sample_name, "_bin1")
  bin2_in <- paste0(sample_name, "_bin2")
  bin3_in <- paste0(sample_name, "_bin3")
  bin4_in <- paste0(sample_name, "_bin4")
  dt[, c(meanbin_out, totalcount_out) :=
       calc.meanbin(c(get(bin1_in), get(bin2_in), get(bin3_in), get(bin4_in))),
     by = c("barcode", "library")]
}
```
## Fit titration curves
We will use nonlinear least squares regression to fit curves to each barcode's titration series. We will also include a minimum cell count that is required for a meanbin estimate to be used in the titration fit, and a minimum number of concentrations with determined meanbin that is required for a titration to be reported.
```{r fit_titrations}
# Minimum number of cells used as the QC / filtering threshold below
cutoff <- 5

# the nine per-concentration total-count columns, TiteSeq_01 ... TiteSeq_09
totalcount_cols <- sprintf("TiteSeq_%02d_totalcount", 1:9)

# average number of cells sampled for a barcode across the 9 concentrations
dt[, TiteSeq_avgcount := mean(unlist(.SD), na.rm = TRUE),
   by = c("library", "barcode"), .SDcols = totalcount_cols]

# number of concentrations whose total count is below cutoff, plus the number
# whose total count is missing (e.g. filtered for bimodality)
dt[, TiteSeq_min_cell_filtered := {
  cell_counts <- unlist(.SD)
  sum(cell_counts < cutoff, na.rm = TRUE) + sum(is.na(cell_counts))
}, by = c("library", "barcode"), .SDcols = totalcount_cols]
# Fit a titration curve  y = a * x / (x + Kd) + b  to one barcode's mean-bin
# series by bounded nonlinear least squares (nls, "port" algorithm).
#
# y.vals       mean bin at each concentration
# x.vals       concentrations, same order as y.vals
# count.vals   total cell count behind each mean bin
# min.cfu      a point is used only if its count is strictly greater than this
# min.means    minimum fraction of the concentrations that must remain usable
# min.average  minimum mean count (NAs ignored) across all concentrations
# Kd.start, a.start, b.start            starting values
# a.lower, a.upper, b.lower, b.upper    bounds on a and b
#
# Kd is bounded between 1/100 of the lowest non-zero concentration and 10x the
# highest concentration.
#
# Returns list(Kd, Kd std. error, a, b, nMSR), where nMSR is the mean squared
# residual after dividing the residuals by the fitted a. Returns five NA_real_
# values when the count filters are not met. Errors raised by nls are not caught
# here.
fit.titration <- function(y.vals, x.vals, count.vals, min.cfu = cutoff,
                          min.means = 0.8, min.average = cutoff, Kd.start = 1e-11,
                          a.start = 3, a.lower = 2, a.upper = 3,
                          b.start = 1, b.lower = 1, b.upper = 1.5) {
  no_fit <- rep(list(NA_real_), 5)

  # A missing count is treated as "not enough cells"; without the explicit
  # is.na() test an NA count paired with a non-NA mean bin would put NA into
  # both x and y.
  usable <- !is.na(count.vals) & count.vals > min.cfu & !is.na(y.vals)
  y <- y.vals[usable]
  x <- x.vals[usable]

  too_few_points <- length(y) < min.means * length(y.vals)
  if (too_few_points || mean(count.vals, na.rm = TRUE) < min.average) {
    return(no_fit)
  }

  positive_conc <- x.vals[x.vals > 0]
  fit <- nls(y ~ a * (x / (x + Kd)) + b,
             start = list(a = a.start, b = b.start, Kd = Kd.start),
             lower = list(a = a.lower, b = b.lower, Kd = min(positive_conc) / 100),
             upper = list(a = a.upper, b = b.upper, Kd = max(positive_conc) * 10),
             algorithm = "port")

  # compute the coefficient table once instead of once per extracted value
  coefs <- summary(fit)$coefficients
  a_hat <- as.numeric(coefs["a", "Estimate"])

  # residuals scaled by the fitted response range, then mean-squared
  resid.norm <- (y - predict(fit, newdata = list(x = x))) / a_hat
  nMSR <- mean(resid.norm^2, na.rm = TRUE)

  list(as.numeric(coefs["Kd", "Estimate"]),
       as.numeric(coefs["Kd", "Std. Error"]),
       a_hat,
       as.numeric(coefs["b", "Estimate"]),
       as.numeric(nMSR))
}
#fit titration to CGG Titeseq data for each barcode (one fit per library/barcode group)
#the five new columns receive, in order, the five values returned by fit.titration
#any error raised inside fit.titration is caught and replaced by five NA values, so a failed fit shows up as NA instead of stopping the notebook
#NOTE(review): y.vals and count.vals are listed as TiteSeq_01..TiteSeq_09 while x.vals comes from samples_TiteSeq$conc -- this assumes the rows of samples_TiteSeq are in that same order; confirm
dt[,c("Kd_CGG","Kd_SE_CGG","response_CGG","baseline_CGG","nMSR_CGG") :=
tryCatch(fit.titration(y.vals=c(TiteSeq_01_meanbin,TiteSeq_02_meanbin,TiteSeq_03_meanbin,TiteSeq_04_meanbin,
TiteSeq_05_meanbin,TiteSeq_06_meanbin,TiteSeq_07_meanbin,TiteSeq_08_meanbin,
TiteSeq_09_meanbin),
x.vals=samples_TiteSeq$conc,
count.vals=c(TiteSeq_01_totalcount,TiteSeq_02_totalcount,TiteSeq_03_totalcount,TiteSeq_04_totalcount,
TiteSeq_05_totalcount,TiteSeq_06_totalcount,TiteSeq_07_totalcount,TiteSeq_08_totalcount,TiteSeq_09_totalcount)),
error=function(e){list(as.numeric(NA),as.numeric(NA),as.numeric(NA),as.numeric(NA),as.numeric(NA))}),by=c("library","barcode")]
```
## QC and sanity checks
We will do some QC to make sure we got good titration curves for most of our library barcodes. We will also spot check titration curves from across our measurement range, and spot check curves whose fit parameters hit the different boundary conditions of the fit variables.
We successfully generated *K*<sub>D</sub> estimates for `r sum(!is.na(dt[library=="lib1",Kd_CGG]))` of our lib1 barcodes (`r round(sum(!is.na(dt[library=="lib1",Kd_CGG]))/nrow(dt[library=="lib1",])*100,digits=2)`%) and `r sum(!is.na(dt[library=="lib2",Kd_CGG]))` of our lib2 barcodes (`r round(sum(!is.na(dt[library=="lib2",Kd_CGG]))/nrow(dt[library=="lib2",])*100,digits=2)`%).
Why were estimates not returned for some barcodes? The histograms below show that many barcodes with unsuccessful titration fits have lower average cell counts and more concentrations with fewer than the minimum cutoff number of cells (cutoff=`r cutoff`) than those that were fit. Therefore, we can see that the majority of unfit barcodes come from our minimum read cutoffs, meaning there weren't too many curves that failed to be fit for issues such as nls convergence.
```{r avgcount, fig.width=8, fig.height=8, fig.align="center",dpi=300,dev="png"}
par(mfrow = c(2, 2))

# top row: log10 average cell count per barcode, fitted barcodes in gray with
# unfitted barcodes overlaid in red, one panel per library
for (lib_name in c("lib1", "lib2")) {
  hist(log10(dt[library == lib_name & !is.na(Kd_CGG), TiteSeq_avgcount] + 0.5),
       breaks = 20, xlim = c(0, 5), main = lib_name, col = "gray50",
       xlab = "average cell count across concentration samples")
  hist(log10(dt[library == lib_name & is.na(Kd_CGG), TiteSeq_avgcount] + 0.5),
       breaks = 30, add = TRUE, col = "red")
}

# bottom row: number of concentrations below the cell-count cutoff, same colors
for (lib_name in c("lib1", "lib2")) {
  hist(dt[library == lib_name & !is.na(Kd_CGG), TiteSeq_min_cell_filtered],
       breaks = 5, main = lib_name, col = "gray50",
       xlab = "number of sample concentrations below cutoff cell number",
       xlim = c(0, 10))
  hist(dt[library == lib_name & is.na(Kd_CGG), TiteSeq_min_cell_filtered],
       breaks = 20, add = TRUE, col = "red")
}
```
To allow manual checks of what the data looks like for different curve fits, I define functions that take a row from the dt table and the corresponding table of fits, and plots the meanbin estimates and the fit titration curve (if converged). This allows for quick and easy troubleshooting and spot-checking of curves.
```{r plot_titration_functions}
# Plot the titration series of a single row of dt, for spot-checking curves.
#
# row          row index into dt
# output.text  if TRUE, return a one-row table of identifying and fit columns
#              for that row (for troubleshooting and interactive work)
#
# Points with a total count above `cutoff` and a non-missing mean bin are drawn.
# If the row has a Kd_CGG estimate, the curve is refit to those points (starting
# from the stored Kd) and drawn with a legend giving the stored Kd. Rows without
# a Kd estimate only get their points plotted: the refit is skipped, because nls
# cannot start from an NA Kd.
# Reads the globals dt, samples_TiteSeq and cutoff.
plot.titration <- function(row, output.text = FALSE) {
  meanbin_cols <- paste0(samples_TiteSeq$sample, "_meanbin")
  totalcount_cols <- paste0(samples_TiteSeq$sample, "_totalcount")
  y.vals <- unlist(dt[row, meanbin_cols, with = FALSE])
  x.vals <- samples_TiteSeq$conc
  count.vals <- unlist(dt[row, totalcount_cols, with = FALSE])

  if (dt[row, variant_class] %in% c("wildtype", "synonymous")) {
    title <- dt[row, target]
  } else {
    title <- paste(dt[row, target], dt[row, aa_substitutions])
  }

  # same point selection as fit.titration: enough cells and a defined mean bin
  indices <- !is.na(count.vals) & count.vals > cutoff & !is.na(y.vals)
  y.vals <- y.vals[indices]
  x.vals <- x.vals[indices]
  plot(x.vals, y.vals, xlab = "[CGG] (M)",
       ylab = "mean bin", log = "x", ylim = c(1, 4), xlim = c(1e-13, 1e-6),
       pch = 19, main = title)

  Kd_var <- "Kd_CGG"
  Kd_fit <- dt[[Kd_var]][row]
  if (!is.na(Kd_fit)) {
    fit <- nls(y.vals ~ a * (x.vals / (x.vals + Kd)) + b,
               start = list(a = 3, b = 1, Kd = Kd_fit),
               lower = list(a = 2, b = 1, Kd = 1e-15),
               upper = list(a = 3, b = 1.5, Kd = 1e-5),
               algorithm = "port")
    conc_grid <- 10^seq(-13, -6, 0.25)
    lines(conc_grid, predict(fit, newdata = list(x.vals = conc_grid)))
    legend("topleft", bty = "n", cex = 1,
           legend = paste("Kd", format(Kd_fit, digits = 3), "M"))
  }

  if (isTRUE(output.text)) {
    vars <- c("library", "barcode", "target", "variant_class", "aa_substitutions",
              "TiteSeq_avgcount", "TiteSeq_min_cell_filtered", "Kd_CGG", "Kd_SE_CGG",
              "baseline_CGG", "response_CGG", "nMSR_CGG")
    return(dt[row, ..vars])
  }
  invisible(NULL)
}
```
Distribution of Kd estimates, with wt/syn barcodes in purple:
```{r Kd_distribution, fig.width=5, fig.height=4, fig.align="center",dpi=300,dev="png"}
par(mfrow = c(1, 1))

# log10 Kd of every barcode (gray), with wildtype/synonymous barcodes overlaid (purple)
log10_Kd_all <- log10(dt[, Kd_CGG])
log10_Kd_wt_syn <- log10(dt[variant_class %in% c("synonymous", "wildtype"), Kd_CGG])
hist(log10_Kd_all, col = "gray40", breaks = 60,
     xlab = "log10(KD), CGG (M)", main = "", xlim = c(-13, -5))
hist(log10_Kd_wt_syn, col = "#92278F", add = TRUE, breaks = 60)

# copy the current figure to a pdf in the output directory
invisible(dev.print(pdf, paste0(config$Titeseq_Kds_dir, "/hist_Kd-per-barcode.pdf")))
```
Some stop variants eked through our scFv+ gating, either perhaps because of stop codon readthrough, improper PacBio sequence annotation, or other weirdness. Either way, the vast majority of nonsense mutants were purged before this step, and the remaining ones are biased toward unreliable and so we remove them.
```{r remove_stops}
# Blank out every fit result for stop-codon variants. The few that make it this
# far either a) still have low counts and give poor fits as a result, or b) seem
# to be dubious PacBio calls (lower variant_call_support) or carry late stop
# codons that perhaps don't totally ablate function. The vast majority were
# purged before this step, so the remainder are not kept.
stop_fit_cols <- c("Kd_CGG", "Kd_SE_CGG", "response_CGG", "baseline_CGG", "nMSR_CGG")
dt[variant_class == "stop", (stop_fit_cols) := rep(list(NA_real_), 5)]
```
Let's take a look at some of the curves with *K*<sub>D,app</sub> values across this distribution to get a broad sense of how things look.
First, curves with *K*<sub>D,app</sub> around 10<sup>-6</sup> (because of nonspecific background signal, which we partially deplete, all curves rise at the higher titration concentrations so nothing gets fit to the imposed fit maximum 10^-5). These curves are noisy background and are truly expected to be non-binders.
```{r 1e-6_Kd, fig.width=8, fig.height=8, fig.align="center",dpi=300, warning=FALSE, results=FALSE,dev="png"}
par(mfrow = c(2, 2))
# first two barcodes of each library with 1e-6 < Kd_CGG < 1.2e-6
for (lib_name in c("lib1", "lib2")) {
  in_window <- which(dt$library == lib_name & dt$Kd_CGG > 1e-6 & dt$Kd_CGG < 1.2e-6)
  plot.titration(in_window[1])
  plot.titration(in_window[2])
}
```
With *K*<sub>D,app</sub> around 10<sup>-7</sup>, still substantial noise in the curves, would not reliably call any of these binders compared to those above at 10^-6.
```{r 1e-7_Kd, fig.width=8, fig.height=8, fig.align="center",dpi=300, warning=FALSE, results=FALSE,dev="png"}
par(mfrow = c(2, 2))
# first two barcodes of each library with 1e-7 < Kd_CGG < 1.2e-7
for (lib_name in c("lib1", "lib2")) {
  in_window <- which(dt$library == lib_name & dt$Kd_CGG > 1e-7 & dt$Kd_CGG < 1.2e-7)
  plot.titration(in_window[1])
  plot.titration(in_window[2])
}
```
At *K*<sub>D,app</sub> of 10<sup>-8</sup>, it seems we're beginning to pick up genuine curves with weaker binding
```{r 1e-8_Kd, fig.width=8, fig.height=8, fig.align="center",dpi=300, warning=FALSE, results=FALSE,dev="png"}
par(mfrow = c(2, 2))
# first two barcodes of each library with 1e-8 < Kd_CGG < 1.2e-8
for (lib_name in c("lib1", "lib2")) {
  in_window <- which(dt$library == lib_name & dt$Kd_CGG > 1e-8 & dt$Kd_CGG < 1.2e-8)
  plot.titration(in_window[1])
  plot.titration(in_window[2])
}
```
Same at *K*<sub>D,app</sub> of 10<sup>-9</sup>, looking pretty good.
```{r 1e-9_Kd, fig.width=8, fig.height=8, fig.align="center",dpi=300, warning=FALSE, results=FALSE,dev="png"}
par(mfrow = c(2, 2))
# first two barcodes of each library with 1e-9 < Kd_CGG < 1.2e-9
for (lib_name in c("lib1", "lib2")) {
  in_window <- which(dt$library == lib_name & dt$Kd_CGG > 1e-9 & dt$Kd_CGG < 1.2e-9)
  plot.titration(in_window[1])
  plot.titration(in_window[2])
}
```
*K*<sub>D,app</sub> of 10<sup>-10</sup>, is beginning to move into the 'mode' of most mutant binding
```{r 1e-10_Kd, fig.width=8, fig.height=8, fig.align="center",dpi=300, warning=FALSE, results=FALSE,dev="png"}
par(mfrow = c(2, 2))
# first two barcodes of each library with 1e-10 < Kd_CGG < 1.2e-10
for (lib_name in c("lib1", "lib2")) {
  in_window <- which(dt$library == lib_name & dt$Kd_CGG > 1e-10 & dt$Kd_CGG < 1.2e-10)
  plot.titration(in_window[1])
  plot.titration(in_window[2])
}
```
*K*<sub>D,app</sub> of 10<sup>-10.5</sup> is near the mode of the distribution, which includes the WT binders. Curves look quite nice.
```{r 1e-10.5_Kd, fig.width=8, fig.height=8, fig.align="center",dpi=300, warning=FALSE, results=FALSE,dev="png"}
par(mfrow = c(2, 2))
# first two barcodes of each library with 3.1e-11 < Kd_CGG < 3.2e-11
for (lib_name in c("lib1", "lib2")) {
  in_window <- which(dt$library == lib_name & dt$Kd_CGG > 3.1e-11 & dt$Kd_CGG < 3.2e-11)
  plot.titration(in_window[1])
  plot.titration(in_window[2])
}
```
*K*<sub>D,app</sub> ~ 10<sup>-11</sup>. This is just past the wt-like 'mode' of binding affinity
```{r 1e-11_Kd, fig.width=8, fig.height=8, fig.align="center",dpi=300, warning=FALSE, results=FALSE,dev="png"}
par(mfrow = c(2, 2))
# first two barcodes of each library with 1e-11 < Kd_CGG < 2e-11
for (lib_name in c("lib1", "lib2")) {
  in_window <- which(dt$library == lib_name & dt$Kd_CGG > 1e-11 & dt$Kd_CGG < 2e-11)
  plot.titration(in_window[1])
  plot.titration(in_window[2])
}
```
*K*<sub>D,app</sub> ~ 10<sup>-11.5</sup>.
```{r 1e-11.5_Kd, fig.width=8, fig.height=8, fig.align="center",dpi=300, warning=FALSE, results=FALSE,dev="png"}
par(mfrow = c(2, 2))
# first two barcodes of each library with 3.1e-12 < Kd_CGG < 3.2e-12
for (lib_name in c("lib1", "lib2")) {
  in_window <- which(dt$library == lib_name & dt$Kd_CGG > 3.1e-12 & dt$Kd_CGG < 3.2e-12)
  plot.titration(in_window[1])
  plot.titration(in_window[2])
}
```
*K*<sub>D,app</sub> ~ 10<sup>-12</sup>.
```{r 1e-12_Kd, fig.width=8, fig.height=8, fig.align="center",dpi=300, warning=FALSE, results=FALSE,dev="png"}
par(mfrow = c(2, 2))
# first two barcodes of each library with 1e-12 < Kd_CGG < 4e-12
for (lib_name in c("lib1", "lib2")) {
  in_window <- which(dt$library == lib_name & dt$Kd_CGG > 1e-12 & dt$Kd_CGG < 4e-12)
  plot.titration(in_window[1])
  plot.titration(in_window[2])
}
```
## Data filtering by fit quality
Next, let's filter out poor fits using the value we previously computed, the *normalized* mean square residual (nMSR). This metric computes the residual between the observed response variable and that predicted from the titration fit, normalizes this residual by the response range of the titration fit (which is allowed to vary between 2 and 3 per the titration fits above), and computes the mean-square of these normalized residuals.
Look at the nMSR metric versus the avgcount value, and layer on the nMSR filtering cutoff, set at 20x the global median (annotated with the percentage of fits filtered). Fits with nMSR above this cutoff are set to NA.
```{r nMSR_v_cell_count, fig.width=4, fig.height=4, fig.align="center",dpi=300,dev="png"}
# filtering cutoff: `threshold` times the median nMSR over all fitted barcodes
median.nMSR <- median(dt$nMSR_CGG, na.rm = TRUE)
threshold <- 20
nMSR_cutoff <- threshold * median.nMSR

# nMSR against log10 average cell count, with the cutoff as a dashed red line
par(mfrow = c(1, 1))
plot(log10(dt[, TiteSeq_avgcount]), dt[, nMSR_CGG],
     main = "", pch = 19, col = "#00000010",
     xlab = "average cell count (log10)", ylab = "nMSR",
     xlim = c(0, 3.5), ylim = c(0, 0.5))
abline(h = nMSR_cutoff, col = "red", lty = 2)

# annotate with the percentage of fits that lie above the cutoff
n_with_nMSR <- nrow(dt[!is.na(nMSR_CGG), ])
n_above_cutoff <- nrow(dt[nMSR_CGG > nMSR_cutoff & !is.na(nMSR_CGG), ])
legend("topleft", bty = "n", cex = 1,
       legend = paste(format(100 * n_above_cutoff / n_with_nMSR, digits = 3), "%"))

# set the fit parameters of those barcodes to NA (the nMSR column itself is kept)
dt[nMSR_CGG > nMSR_cutoff,
   c("Kd_CGG", "Kd_SE_CGG", "response_CGG", "baseline_CGG") :=
     list(NA_real_, NA_real_, NA_real_, NA_real_)]
```
Last, convert our *K*<sub>D,app</sub> to 1) a log<sub>10</sub>-scale, and 2) *K*<sub>A,app</sub>, the inverse of *K*<sub>D,app</sub>, such that higher values are associated with tighter binding, as is more intuitive. (If we want to continue to discuss in terms of *K*<sub>D,app</sub>, since people are often more familiar with *K*<sub>D</sub>, we can refer to the log<sub>10</sub>(*K*<sub>A,app</sub>) as -log<sub>10</sub>(*K*<sub>D,app</sub>), which are identical.)
```{r convert_log10Ka}
# log10(Ka) = -log10(Kd). log10() is vectorized over the whole column, so no
# per-barcode grouping is needed; every row gets the same value as before.
dt[, log10Ka := -log10(Kd_CGG)]
```
Let's visualize the final binding measurements as violin plots for the different wildtype targets. In next notebook, we'll evaluate count depth and possibly apply further filtering to remove low-count expression estimates
```{r binding_distribution_vioplot, echo=T, fig.width=6, fig.height=4.5, fig.align="center", dpi=300,dev="png"}
# violin plot of log10(Ka) per variant class, one facet per target, with a point
# at each median; only barcodes that have a binding measurement are drawn
with_binding <- dt[!is.na(log10Ka), ]
p1 <- ggplot(with_binding, aes(x = variant_class, y = log10Ka)) +
  geom_violin(scale = "width") +
  stat_summary(fun = median, geom = "point", size = 1) +
  ggtitle("huCGG, log10(Ka)") +
  xlab("variant class") +
  theme(axis.text.x = element_text(angle = -90, hjust = 0)) +
  facet_wrap(~target, nrow = 4)

grid.arrange(p1, ncol = 1)

# copy the current figure to a pdf in the output directory
invisible(dev.print(pdf, paste0(config$Titeseq_Kds_dir, "/violin-plot_log10Ka-by-target.pdf")))
```
We have generated binding measurements for `r round(nrow(dt[!is.na(log10Ka)])/nrow(dt)*100,digits=2)`% of the barcodes in our libraries.
## Data Output
Finally, let's output our measurements for downstream analyses.
```{r output_data}
# columns written out for downstream analyses
output_cols <- c("library", "barcode", "target", "variant_class",
                 "aa_substitutions", "n_aa_substitutions",
                 "TiteSeq_avgcount", "log10Ka")

# round numeric columns to 6 digits and write the table to the configured csv
dt[, ..output_cols] %>%
  mutate_if(is.numeric, round, digits = 6) %>%
  write.csv(file = config$Titeseq_Kds_file, row.names = FALSE)
```