---
title: "Supporting Information 3 (SI 3): PCRedux package - an introduction"
author: "The PCRedux package authors"
date: "`r Sys.Date()`"
output:
rmarkdown::pdf_document:
number_sections: true
toc: true
toc_depth: 5
header-includes:
- \usepackage[font={small}]{caption}
bibliography: "literature.bib"
---
```{r, include=FALSE, echo=FALSE, eval=FALSE}
options(tinytex.verbose = TRUE)
```
```{r, echo=FALSE}
knitr::opts_chunk$set(fig.lp = "", out.extra='', warning = FALSE, message = FALSE)
amplification_curve_ROI <- "Phases of amplification curves as Region of Interest (ROI). A) For amplification curves, the fluorescence signal (RFU, relative fluorescence units) of the reporter dye is plotted against the cycle number. Positive amplification curves possess three ROIs: ground phase, exponential phase and plateau phase. These ROIs can be used to determine predictors such as the takedown point (`tdp`) or the standard deviation within the ground phase (`sd\\_bg`). The exponential range (red dots) is used to determine the Cq values and amplification efficiency (not shown). A linear regression model (red) can be used to calculate the slope in this region. B) PCRs without an amplification reaction usually show a flat (non-sigmoidal) signal. C) The exponential phase of PCR reactions can vary greatly depending on the DNA starting quantity and other factors. Amplification curves that appear in later cycles often have a lower slope in the exponential phase."
amplification_curve_ROI_short <- "Phases of amplification curves as Region of Interest (ROI)"
figure_quantification_points <- "Frequently used methods for the analysis of quantification points. A) The amplification curve is intersected by a gray horizontal line. This is the background signal (3$\\sigma$) determined from the \\textit{68-95-99.7 rule} from the fluorescence emission of cycles 1 to 10. The black horizontal line is the user-defined threshold (Ct value) in the exponential phase. Based on this, the cycle at which the amplification curve differs significantly from the background is calculated. B) The amplification curve can also be analyzed by fitting a multi-parametric model (black line, five parameters). The red line is the first derivative of the amplification curve with a maximum at 17.59 cycles. The first derivative maximum (`cpD1`) is used as a quantification point (Cq value) in some qPCR systems. The green line shows the second derivative of the amplification curve, with a maximum at 15.68 cycles and a minimum at 19.5 cycles. The maximum of the second derivative (`cpD2`) is used as the Cq value in many systems. The blue line shows the amplification efficiency estimated from the trajectory of the exponential region. The `Eff` value of 1.795 means that the amplification efficiency is approximately 89\\%. `cpDdiff` is the difference between the first and second derivative maximum ($cpDdiff = cpD1 - cpD2$)."
figure_quantification_points_short <- "Frequently used methods for the analysis of quantification points"
figure_curve_classification <- "Variations in the classification of amplification curves. A prerequisite for the development of machine-learning models is the availability of manually classified amplification curves. Amplification curves (n = 8858) from the `htPCR` data set have been classified by one user eight times at different points over time (classes: ambiguous (a), positive (y) or negative (n)). During this process, the amplification curves were presented in random order. The example shows that different (subjective) class mappings may occur for the same data set. While only a few amplification curves were classified as negative in the first three classification cycles (A-C), their proportion increased almost tenfold in later classification cycles (D-H)."
figure_curve_classification_short <- "Variations of the classification of amplification curves"
htPCR_nap <- "Examples of negative, ambiguous and positive amplification curves. A) A negative (black), ambiguous (red) and positive (green) amplification curve were selected from the `htPCR` data set. The negative amplification curve is non-sigmoid and has a positive trend. The ambiguous amplification curve is similar to a sigmoidal amplification curve, but shows a positive slope in the ground phase (cycle 1 $\\rightarrow$ 5). The positive amplification curve (green) is sigmoid. It starts with a flat baseline (cycle 1 $\\rightarrow$ 5). This is followed by the exponential phase (cycle 5 $\\rightarrow$ 25) and ends in a flat plateau phase (cycle 26 $\\rightarrow$ 35). B) Amplification curves of the `vermeulen1` data set were divided into groups with \\textit{negative}, \\textit{ambiguous} and \\textit{positive} classification. Negative amplification curves have a low signal level. Interestingly, there is a spontaneous increase (probably due to a sensor calibration) in cycles 1 to 2, followed by a linear signal decrease. In principle, the ambiguous amplification curves have a sigmoid curve shape. However, the plateau phase is fairly broad. When different qPCR users were asked what they find ambiguous, they responded that there is an additional change in the slope between cycles 15 and 25. This made some believe that the reaction is not valid. One of the ambiguous amplification curves begins to rise sharply at cycle 45. The positive amplification curves have a characteristic sigmoid curve shape."
htPCR_nap_short <- "Examples of negative, ambiguous and positive amplification curves"
htPCR_nap_frequency <- "Frequency of amplification curve classes and conformity in the `htPCR` data set. The `htPCR` data set was classified by hand eight times. Due to the unusual amplification curve shape and input errors during classification, many amplification curves were classified differently. A) Frequency of negative (black), ambiguous (red) and positive (green) amplification curves in the `htPCR` data set. The combined number of ambiguous and negative amplification curves appears to be higher than the number of positive amplification curves. B) The number of observations where all classification cycles made the same decision (conformity == TRUE) accounts for only 5\\% of the total number of observations. TRUE, all eight classifications of the amplification curve matched. FALSE, at least one of the eight classifications assigned a different class."
htPCR_nap_short_frequency <- "Frequency of amplification curve classes and conformity in the `htPCR` data set."
qPCR2fdata <- "Shape-based clustering of amplification curves. A) The clustering of the amplification curves of the `testdat` data set was based on the Hausdorff distance. B) The amplification curves were converted with the qPCR2fdata() function, and the Hausdorff distance of the curves was determined by cluster analysis. There were no errors in distinguishing between negative (n) and positive (y) amplification curves."
qPCR2fdata_short <- "Shape-based grouping of amplification curves"
HCU32 <- "Clustering and variation analysis of amplification curves. The amplification curves of the 32HCU were converted with the qPCR2fdata() function and then subjected to cluster analysis (Hausdorff distance). A) Amplification curves were plotted from the raw data. B) Overall, the signal-to-noise ratios of the amplification curves were similar between all cavities. C) The Cq values and amplification efficiency were calculated using the efficiency(pcrfit()) [\\texttt{qpcR}] function. The median Cq is shown as a vertical line. Cqs deviating by more than 0.1 from the median Cq ($\\tilde{x}$) are marked with observation labels. D) The cluster analysis showed no specific pattern with respect to the amplification curve signals. It appears that the observations D1, E1, F1, F3, G3 and H1 differ most from the other amplification curves."
HCU32_short <- "Clustering and variation analysis of amplification curves"
winklR_principle <- "Concept of the winklR() function. Analysis of the amplification curves of the `RAS002` data set with the winklR() function. Two amplification curves (A: positive, B: negative) were used. The red point shows the origin (first negative derivative maximum) while the green and blue points show the minimum and maximum of the second negative derivative. The angle is calculated from these points. Positive curves have smaller angles than negative curves."
winklR_principle_short <- "Concept of the winklR() function"
winklR <- "Analysis of the amplification curves of the `RAS002` data set with the winklR() function. A) All amplification curves of the `RAS002` data set were analyzed with the winklR() function. Negative amplification curves are shown in red and positive amplification curves in black. B) The stripchart of the analysis of positive and negative amplification curves shows a separation. C) The cdplot calculates the conditional densities of x based on the values of y, weighted by the marginal distribution of y. The densities are derived cumulatively over the values of y. The probability that the decision is negative (n) when the angle equals 30 is approximately 100\\%."
winklR_short <- "Variation analysis of amplification curves with the winklR() function"
curve_fit_fail <- "Incorrect model adjustment for amplification curves. A positive (black) and a negative (red) amplification curve were randomly selected from the `RAS002` data set. The positive amplification curve has a baseline signal of about 2500 RFU (relative fluorescence units) and a definite sigmoidal shape. The negative amplification curve has a baseline signal of approx. 4200 RFU, but only a moderately positive slope (no sigmoidal shape). A logistic function with seven parameters (`l7`) has been fitted to both amplification curves. A Cq value of 25.95 was determined for the positive amplification curve. The negative amplification curve had a Cq value of 9.41. However, it can be seen that the latter model fitting is not appropriate for calculating a trustworthy Cq value. An automatic calculation without user control would give a false-positive result. Note: This plot was shown in linear scale to demonstrate typical pitfalls."
curve_fit_fail_short <- "Incorrect model adjustment for amplification curves"
plot_models <- "Frequencies of the fitted multiparametric models and Cq values. The amplification curves (n = 3302) of the `data\\_sample` data set were analyzed with the encu() function. The amplification curves have been stratified according to their classes (negative: grey, positive: green). A) The optimal multiparametric model was selected for each amplification curve based on the Akaike information criterion. lNA stands for `no model` and l4 \\ldots l7 for a model with four to seven parameters. B) All Cq values were calculated from optimal multiparametric models. Cqs of positive amplification curves accumulate in the range between 15 and 30 PCR cycles (50\\%). For the negative amplification curves, the Cqs are distributed over the entire span of the cycles. Note: The Cqs of the negative amplification curves are false-positive!"
plot_models_short <- "Frequencies of the fitted multiparametric models and Cq values"
figure_cpD2_range <- "Location of the predictors `cpD2\\_range`, `bg.start`, `bg.stop` within an amplification curve. The minimum (cpD2m) and maximum (cpD2) of the second derivative were calculated numerically using the diffQ2() function. This function also returns the maximum of the first derivative (cpD1). The `cpD2\\_range` is defined as $cpD2\\_range = |cpD2 - cpD2m|$. Large `cpD2\\_range` values indicate a low amplification efficiency or a negative amplification reaction. The predictor `bg.start` is an estimate for the end of the ground phase. `bg.stop` is an approximation for the onset of the plateau phase."
figure_cpD2_range_short <- "Location of the predictors `cpD2\\_range`, `bg.start`, `bg.stop` within an amplification curve"
plot_dat_EffTop <- "Values of predictors calculated from negative and positive amplification curves. Amplification curve predictors from the `data\\_sample\\_subset` data set were used as they contain positive and negative amplification curves, and amplification curves that exhibit a \\textit{hook effect} or non-sigmoid shapes. A) `eff`, optimized PCR efficiency found within a sliding window. B) `sliwin`, PCR efficiency by the `window-of-linearity` method. C) `cpDdiff`, difference between the Cq values calculated from the first and the second derivative maximum. D) `loglin\\_slope`, slope from the cycle at the second derivative maximum to the second derivative minimum. E) `cpD2\\_range`, absolute value of the difference between the minimum and the maximum of the second derivative. F) `top`, takeoff point. G) `f.top`, fluorescence intensity at takeoff point. H) `tdp`, takedown point. I) `f.tdp`, fluorescence intensity at takedown point. J) `bg.stop`, estimated end of the ground phase. K) `amp.stop`, estimated end of the exponential phase. L) `convInfo\\_iteratons`, number of iterations until convergence."
plot_dat_EffTop_short <- "Analysis of location predictors"
loglin_slope <- "Concept of the `loglin\\_slope` predictor. The algorithm determines the fluorescence values of the raw data at the approximate positions of the maximum of the first derivative, the minimum of the second derivative and the maximum of the second derivative, which are in the exponential phase of the amplification curve. The data were taken from the `RAS002` data set. A linear model is created from these parameter sets and the slope is determined. A) Positive amplification curves have a clearly positive slope. B) Negative amplification curves usually have a low, sometimes negative slope."
loglin_slope_short <- "Concept of the `loglin\\_slope` predictor"
plot_sd_bg <- "Standard deviation in the ground phase of various qPCR devices. The `sd\\_bg` predictor was used to determine if the standard deviation between thermo-cyclers and between positive and negative amplification curves was different. The standard deviation was determined from the fluorescence values from the first cycle to the takeoff point. If the takeoff point could not be determined, the standard deviation from the first cycle to the eighth cycle was calculated. The Mann-Whitney test was used to compare the medians of the two populations (y, positive; n, negative). The differences were significant for A) LC\\_480 (Roche), B) CFX96 (Bio-Rad) and C) LC96 (Roche)."
plot_sd_bg_short <- "Standard deviation in the ground phase of various qPCR devices"
plot_bg_pt <- "Values of predictors calculated from negative and positive amplification curves. Amplification curve predictors from the `data\\_sample\\_subset` data set were used as they contain positive and negative amplification curves, as well as amplification curves that exhibit a \\textit{hook effect} or non-sigmoid shapes. A) `eff`, optimized PCR efficiency in a sliding window. B) `sliwin`, PCR efficiency according to the window-of-linearity method. C) `cpDdiff`, difference between the Cq values calculated from the first and the second derivative maximum. D) `loglin\\_slope`, slope from cycle at second derivative maximum to second derivative minimum. E) `cpD2\\_range`, absolute difference between the minimum and maximum of the second derivative. F) `top`, takeoff point. G) `f.top`, fluorescence intensity at takeoff point. H) `tdp`, takedown point. I) `f.tdp`, fluorescence intensity at the takedown point. J) `bg.stop`, estimated end of the ground phase. K) `amp.stop`, estimated end of the exponential phase. L) `convInfo\\_iteratons`, number of iterations until convergence when fitting a multiparametric model. The classes were compared using the Wilcoxon Rank Sum Test."
plot_bg_pt_short <- "Values of predictors calculated from negative and positive amplification curves"
plot_model_param <- "Values of predictors calculated from negative and positive amplification curves. Amplification curve predictors from the `data\\_sample\\_subset` data set were used as they contain positive and negative amplification curves, as well as amplification curves that exhibit a \\textit{hook effect} or non-sigmoid shapes. A) `c\\_model\\_param`, is the c model parameter of the seven parameter model. B) `d\\_model\\_param`, is the d model parameter of the seven parameter model. C) `e\\_model\\_param`, is the e model parameter of the seven parameter model. D) `f\\_model\\_param`, is the f model parameter of the seven parameter model. The classes were compared using the Wilcoxon Rank Sum Test."
plot_model_param_short <- "Values of predictors calculated from negative and positive amplification curves"
plot_Logistic_Regression <- "Machine classification by means of binomial logistic regression using the `loglin\\_slope` predictor. A) For the calculation of a binomial logistic regression model, the categorical response variable $Y$ (decision with classes: negative and positive) must be converted to a numerical value. With binomial logistic regression, the probability of a categorical response can be estimated using the $X$ predictor variable. In this example, the predictor variable `loglin\\_slope` is used. Grey measurement points (70\\% of the data set) were used for training. Red dots represent the values used for testing. The regression curve of the binomial logistic regression is shown in blue. The grey horizontal line at 0.5 marks the threshold of probability above which it is determined whether an amplification curve is negative or positive. B) The performance indicators were calculated using the performeR() function. Sensitivity, TPR; Specificity, SPC; Precision, PPV; Negative prediction value, NPV; Fall-out, FPR; False negative rate, FNR; False detection rate, FDR; Accuracy, ACC; F1 score, F1; Matthews correlation coefficient, MCC; Cohen's kappa (binary classification), kappa ($\\kappa$)."
plot_Logistic_Regression_short <- "Machine classification by means of binomial logistic regression using the `loglin\\_slope` predictor"
statistical_methods_amptester <- "Analysis of amplification curves with the ``amptester()`` function. A \\& B) The threshold test (THt) is based on the Wilcoxon rank-sum test and compares 20\\% of the fluorescence values of the ground phase with 15\\% of the plateau phase. In the example, a significant difference ($p = 0.000512$) was found for the positive amplification curve. However, this did not apply to the negative amplification curve ($p = 0.621$). C \\& D) A Q-Q diagram is used to graphically compare two probability distributions. In this study the probability distribution of the amplification curve was compared with a theoretical normal distribution. The orange line is the theoretical normal quantile-quantile line that passes through the probabilities of the first and third quartiles. The Shapiro-Wilk test (SHt) of normality tests the null hypothesis that the underlying measurement data of the amplification curve are normally distributed. Since the p-value of $7.09 e^{-9}$ of the positive amplification curve is below the significance level $\\alpha = 5e^{-4}$, the null hypothesis is rejected. However, this does not apply to the negative amplification curve ($p = 0.895$). E \\& F) The linear regression test (LRt) calculates the coefficient of determination ($R^{2}$) using an ordinary least square regression where all measured values are integrated into the model in a cycle-dependent manner. Experience shows that the non-linear part of an amplification curve has an $R^{2}$ smaller than 0.8, which is also shown in the example."
statistical_methods_amptester_short <- "Analysis of amplification curves with the amptester() function"
figure_autocorrelation_tau <- "Effect of tau"
figure_autocorrelation_tau_short <- "Effect of tau"
autocorrelation <- "Autocorrelation analysis of the amplification curves of the `RAS002` data set. A) Display of all amplification curves of the data set `RAS002`. Negative amplification curves are shown in red and positive amplification curves in black. The autocorrelation\\_test() function was used to analyze all amplification curves. B) The density diagram of the autocorrelation of positive and negative amplification curves shows a bimodal distribution. C) The cdplot calculates the conditional densities of x based on the values of y, weighted by the marginal distribution of y. The densities are derived cumulatively over the values of y. The probability that the decision is negative (n) when the autocorrelation equals 0.85 is approximately 100\\%. D) Performance analysis using the performeR() function (see \\autoref{section_helper_functions} for details)."
autocorrelation_short <- "Autocorrelation analysis for amplification curves of the `RAS002` data set"
earlyreg_slopes <- "Analysis of the ground phase with the earlyreg() function and the `C127EGHP` data set (n = 64 amplification curves). This data set consists of 32 samples, which were simultaneously monitored with the intercalator EvaGreen or hydrolysis probes. A) All amplification curves possess slightly different slopes and intercepts in the first cycles of the ground phase (ROI: Cycles 1 to 8). Both the slope and the intercept of each amplification curve were used for cluster analysis (k-means, Hartigan-Wong algorithm, number of centers \\textit{k = 2}). B) The amplification curves were assigned to two clusters, depending on their slope and their intercept (red, black). C) Finally, the clusters were associated with the detection chemistries (EvaGreen (EG) or hydrolysis probes (HP))."
earlyreg_slopes_short <- "Analysis of the ground phase with the earlyreg() function"
figure_head2tailratio <- "Ratio between the head and the tail of a quantitative PCR amplification curve. A) Plot of quantile normalized amplification curves from the `RAS002` data set. Data points used in the head and tail are highlighted by circles. The intervals for the Robust Linear Regression are automatically selected using the 25\\% and 75\\% quantiles. Therefore, not all data points are used in the regression model. The straight line is the regression line from the robust linear model. The slopes of the positive and negative amplification curves differ. B) Boxplot for the comparison of the $head/tail$ ratio. Positive amplification curves have a lower ratio than negative curves. The difference between the classes is significant."
figure_head2tailratio_short <- "Ratio between the head and the tail of a quantitative PCR amplification curve"
plot_mblrr <- "Robust local regression to analyze amplification curves. The amplification curves were arbitrarily selected from the `RAS002` data set. In the qPCR setup, the target genes beta globin (B. globin) and HPRT1 were simultaneously measured in a PCR cavity using two specific hydrolysis probes (duplex qPCR). Both positive (A, C, E) and negative (B, D, F) amplification curves were used. The amplification curves are normalized to the 99\\% quantile. The differences in slopes and intercepts are indicated by the blue and orange lines and dots. The mblrr() function is presumably useful for data sets which are accompanied by noise and artifacts."
plot_mblrr_short <- "Robust local regression to analyze amplification curves"
plot_FFTrees <- "Visualization of decisions in Fast and Frugal Trees after data analysis of amplification curves via the mblrr() function. \\textbf{Top row} `Data`) Overview of the data set, stating the total number of observations (N = 192) and the percentage of positive (22\\%) and negative (78\\%) amplification curves. \\textbf{Middle row} `FFT \\#1 (of 6)`) Decision tree with the number of observations classified at each level of the tree. Six predictors (nBG, intercept of head region; mBG, slope of head region; rBG, Pearson correlation of head region; nTP, intercept of tail region; mTP, slope of tail region; rTP, Pearson correlation of tail region) have been used for the analysis. After two tree levels (nBG, nTP), the decision tree is created, where all positive amplification curves (N = 40) are correctly classified. Two observations are classified as false-negative in the negative amplification curves. \\textbf{Lower row} `Performance`) The FFTrees() [FFTrees] function determines several performance statistics. For the training data, there is a classification table on the left side showing the relationship between the tree `decision` and the `truth`. The correct rejection (`Cor Rej`) and `Hit` are the right decisions. `Miss` and false alarm (`False Al`) are wrong decisions. The centre shows the cumulative tree performance in terms of mean of used cues (`mcu`), percentage of ignored cues (`pci`), sensitivity (`sens`), specificity (`spec`), accuracy (`acc`) and weighted accuracy (`wacc`). The receiver operating characteristic (ROC) curve on the right-hand side compares the performance of all trees in the FFTrees object. The system also displays the performance of the fast frugal trees (`\\#`, green), CART (`C`, red), logistic regression (`L`, blue), random forest (`R`, violet) and the support vector machine (`S`, yellow)."
plot_FFTrees_short <- "Visualization of decisions in Fast and Frugal Trees after data analysis of amplification curves via the mblrr() function"
plot_peaks_ratio <- "Working principle of the `peaks\\_ratio` predictor. The computation is based on a sequential linking of functions. The diffQ() function determines numerically the first derivative of an amplification curve. This derivative is passed to the mcaPeaks() [\\texttt{MBmca}] function. The output contains all local minima and maxima. The ranges are calculated from the minima and maxima. The lagged differences are determined from the ranges of the minima and maxima. Finally, the ratio of the differences (maximum/minimum) is calculated."
plot_peaks_ratio_short <- "Working principle of the `peaks\\_ratio` predictor"
plot_cp_area <- "Values of predictors calculated from negative and positive amplification curves. Amplification curve predictors from the `data\\_sample\\_subset` data set were used as they contain positive and negative amplification curves and amplification curves that exhibit a \\textit{hook effect} or non-sigmoid shapes. A) `polyarea`, is the area under the amplification curve determined by the Gauss polygon area formula. B) `peaks\\_ratio`, is the ratio of the local minima and the local maxima. C) `cp\\_e.agglo`, makes use of energy agglomerative clustering. Positive amplification curves have fewer change points than negative amplification curves. These two change point analyses generally separate positive and negative amplification curves. D) `cp\\_bcp`, analyses change points by a Bayesian approach. Positive amplification curves appear to contain more change points than negative amplification curves. Nevertheless, there is an overlap between the positive and negative amplification curves in both methods. This can lead to false-positive or false-negative classifications. E) `amptester\\_polygon` is the cycle normalized order of a polygon. F) `amptester\\_slope.ratio` is the slope (linear model) of the raw fluorescence values at the approximate first derivative maximum, second derivative minimum and second derivative maximum."
plot_cp_area_short <- "Analysis of predictors that describe the area and changepoints of an amplification curve"
plot_cpa <- "Bayesian and energy agglomerative change point analysis on negative and positive amplification curves. An analysis of a negative and a positive amplification curve from the `RAS002` data set was performed using the pcrfit\\_single() function. In this process, the amplification curves were analysed for change points using Bayesian change point analysis and energy agglomerative clustering. A) The negative amplification curve has a base signal of approximately 2450 RFU and only a small signal increase to 2650 RFU. There is a clear indication of the signal variation (noise). B) The first negative derivative amplifies the noise so that some peaks are visible. C) The change point analysis shows changes in energy agglomerative clustering at several positions (green vertical line). The Bayesian change point analysis rarely exceeds a probability of 0.6 (grey horizontal line). D) The positive amplification curve has a lower base signal ($\\sim 2450$ RFU) and increases up to the 40th cycle ($\\sim 3400$ RFU). A sigmoid shape of the curve is visible. E) The first negative derivative of the positive amplification curve shows a distinctive peak with a minimum at cycle 25. F) The change point analysis in energy agglomerative clustering shows changes (green vertical line) only at two positions. The Bayesian change point analysis shows a probability higher than 0.6 (grey horizontal line) at several positions."
plot_cpa_short <- "Bayesian and energy agglomerative change point analysis on negative and positive amplification curves"
plot_random_forest <- "The predictors `amptester\\_lrt` (lrt), `amptester\\_rgt` (rgt), `amptester\\_tht` (tht), `amptester\\_slt` (slt), `amptester\\_polygon` (polygon) and `amptester\\_slope.ratio` (slope.ratio) were used for classification using random forest. A) This plot shows the error depending on the number of trees. The error decreases as more and more trees are added and averaged. B) Mean Decrease Accuracy shows how much the model accuracy decreases if a variable is dropped. C) Mean Decrease Gini shows the importance of a variable based on the Gini impurity index used for the calculation of splits in trees."
plot_random_forest_short <- "Random Forest"
#----------Tables-------------------------------------
```
\newpage
\begin{figure}[ht]
\centering
\scalebox{0.6}{
\includegraphics[clip=true,trim=1cm 1cm 1cm 1cm]{Logo.pdf}
}
\end{figure}
- A comprehensive PDF version of this document (including domain knowledge about qPCRs and machine learning) is available as an **[online supplement](https://github.com/devSJR/PCRedux/raw/master/docs/articles/PCRedux.pdf)**. This online document also contains the code that was used to generate the plots in this introduction.
- The code for all figures and analysis in the manuscript "PCRedux: A Data Mining and Machine Learning Toolkit for qPCR Experiments" is available at **[https://github.com/PCRuniversum/PCRedux-supplements](https://github.com/PCRuniversum/PCRedux-supplements)**.
# Aims of the Project
A review of the literature (PubMed, Google Scholar; 1984-01-01 - `r Sys.Date()`) and discussion with peers revealed that there is no open source software package to calculate predictors from quantitative PCR amplification curves for machine learning applications. A predictor is a quantifiable *informative* property of an amplification curve. In particular, there is no information available about predictors that can be derived from amplification curves apart from measures that describe quantification points, amplification efficiencies and signal levels. Although several amplification curve data sets are available, no curated labeled data sets are described in the literature or repositories such as GitHub\footnote{\url{https://github.com/}}, Bitbucket\footnote{\url{https://bitbucket.org/}}, SourceForge\footnote{\url{https://sourceforge.net/}} or Kaggle\footnote{\url{https://www.kaggle.com/}}.
Therefore, the aim of the study was to:
1. create a collection of classified amplification curve data,
2. propose algorithms that can be used to calculate predictors from amplification curves,
3. evaluate pipelines that can be used for an automatic classification of amplification curves based on the curve shape and
4. bundle the findings in a publicly available open source software and open data package.
# Introduction and important information about the \texttt{PCRedux} package\label{label1}
In the **[online supplement](https://github.com/devSJR/PCRedux/raw/master/docs/articles/PCRedux.pdf)** the reader finds an introduction to nucleic acids, including nucleic acid detection methods (e.g., melting curve analysis, photometric measurements) for the analysis of patient and forensic sample material. Special attention is paid to the quantitative Polymerase Chain Reaction (qPCR), since this method is the *de facto* standard for the detection and high-precision quantification of nucleic acids.
The focus of this study is the development of statistical and bioinformatical algorithms for the \texttt{PCRedux} software (version `r packageVersion("PCRedux")`). This software can be used to automatically calculate putative predictors (*features*) from qPCR amplification curves. A predictor herein refers to a quantifiable *informative* property of an amplification curve, employable for data mining, machine learning applications and classification tasks.
On the basis of these observations, concepts for predictors (*features*) were developed and implemented in algorithms to describe amplification curves. The functions described in the following are aimed at experimental studies. It is important to note that the concepts for the predictors proposed herein emerged from a *critical reasoning* process and the *domain knowledge* of the \texttt{PCRedux} package creator. The aim of the package is to propose a set of predictors, functions and data for independent research.
## Development, Implementation, Installation, Version Control and Continuous Integration
The **[online supplement](https://github.com/devSJR/PCRedux/raw/master/docs/articles/PCRedux.pdf)** deals with elements of software engineering (e. g., continuous integration, Donald Knuth's *Literate Programming*, unit testing) used within the \texttt{PCRedux} software. \autoref{section_reasoning_and_analysis} gives an introduction to qPCR data and their analysis and explains why there is a need for the \texttt{PCRedux} software. In addition, the data analysis using machine learning is concisely described, after which the work focuses on the analysis of the measured data.
The proposed algorithms were partially tested with machine learning methods. For this purpose, a brief introduction to the subject \emph{machine learning} is given in the **[online supplement](https://github.com/devSJR/PCRedux/raw/master/docs/articles/PCRedux.pdf)**.
Information on the statistical analysis of qPCR amplification curves is presented in \autoref{section_DataAnalysis} ff. This covers the description of the curvature and the challenges of the calculations.
All scientific and engineering work depends on data. In particular, *open data* are becoming a cornerstone in science. As data sets of classified amplification curves were not available anywhere else, the **[online supplement](https://github.com/devSJR/PCRedux/raw/master/docs/articles/PCRedux.pdf)** summarizes the aggregation, maintenance, and distribution of classified qPCR amplification curve data sets. The manual classification of amplification curves is a time-consuming and error-prone task when working with large data sets. To facilitate the manual analysis procedure, helper tools like ``humanrater()`` are presented in the **[online supplement](https://github.com/devSJR/PCRedux/raw/master/docs/articles/PCRedux.pdf)**. A novel approach for *curve-shape based group classification* is shown too.
An achievement of this study is the extensive portfolio of statistical algorithms for predictor calculation. Central findings of the research are presented in \autoref{section_Functions_of_PCRedux}.
It is expected that these implementations will allow the automatic analysis of large data sets for machine learning applications. The expectations of the findings are critically discussed in \autoref{section_Summary_and_conclusions}.
## PCRedux-app
\texttt{PCRedux-app} is a web server, based on the shiny technology [@shiny_2016], wrapped around the ``encu()`` function (\autoref{section_pcrfit_single_pcrfit_parallel}). A user can upload qPCR data and download the obtained amplification curve features.
There are different ways to use the function.
- Through \texttt{RScript} (Scripting Front-End for R):
- Enter the command ``Rscript -e 'PCRedux::run_PCRedux()'`` in a console and open the displayed URL in a browser.
- Through Graphical User Interfaces:
- The function can be started directly in \texttt{RStudio} or \texttt{RKWard} [@roediger_rkward_2012] by:
```{r, echo=TRUE, eval=FALSE}
# run the Shiny app
PCRedux::run_PCRedux()
```
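Independently of the web front-end, the underlying ``encu()`` function can also be called directly on a qPCR data set. The following minimal sketch assumes that ``encu()`` accepts a data frame in which the first column contains the cycle numbers and all further columns contain the fluorescence values (here, two curves of the `RAS002` data set):
```{r, echo=TRUE, eval=FALSE}
# Minimal sketch (assumption: encu() takes a data frame with the cycles in
# the first column and the fluorescence values in the remaining columns).
library(PCRedux)
res_features <- encu(RAS002[, 1:3])
head(res_features)
```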
## Analysis of Sigmoid-Shaped Curves for Data Mining and Machine Learning Applications\label{section_reasoning_and_analysis}
The following sections describe \texttt{PCRedux} regarding the analysis,
numerical description and predictor calculation from a sigmoid curve. A
predictor herein refers to a quantifiable *informative* property of a sigmoid
curve. The predictors (\autoref{section_Functions_of_PCRedux}), sometimes
referred to as descriptors, can be used for applications such as data mining,
machine learning and automatic classification (e. g., negative or positive
amplification).
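As a minimal sketch (assuming that ``pcrfit_single()`` accepts the fluorescence values of a single amplification curve as a numeric vector and returns a data frame of predictors), the predictors of one curve of the `RAS002` data set could be calculated as follows:
```{r, echo=TRUE, eval=FALSE}
# Minimal sketch: predictor calculation for one amplification curve.
# Assumption: pcrfit_single() takes a numeric vector of fluorescence values.
library(PCRedux)
single_curve_features <- pcrfit_single(RAS002[, "A01_gDNA.._unkn_B.Globin"])
str(single_curve_features)
```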
Machine learning is a scientific discipline that deals with the use of simple to
sophisticated algorithms to learn from large volumes of data. A number of
approaches to machine learning exist. Supervised learning algorithms are trained
with data which contain correct answers [@zielesny_curve_2011;
@walsh_correct_2015; @fernandez-delgado_we_2014]. This allows to create models
that assign the data to the answers and use these for further processing and
predictions [@tolson_machine_2001]. Unsupervised algorithms learn from data
without answers. They use large, diverse data sets for self-improvement. Neural
networks or artificial neural networks are a type of machine learning that
roughly resembles the function of human neurons. They are computer programs that
use several levels of nodes (neurons), work in parallel for learning, recognize
patterns and make decisions in a human-like manner [@gunther_neuralnet:_2010].
Deep Learning uses a deep neural network with many neuronal layers and an
extensive volume of data [@shin_deep_2016]. Such networks solve complex, non-linear
problems and are responsible for groundbreaking innovations through artificial
intelligence, such as the processing of a natural language or images
[@tolson_machine_2001]. Applications in the life sciences have already been
described for each of these methods. Up to now there appears to be no study that
uses machine learning for the classification of amplification curves in a
scientific setting.
The determination of quantification points such as the Cq value is a typical
task during the analysis of qPCR experiments. This is also covered by the
\texttt{PCRedux} software in dedicated sections and the **[online
supplement](https://github.com/devSJR/PCRedux/raw/master/docs/articles/PCRedux.pdf)**.
Characteristics of amplification curves that can be used for the statistical and
analytical description are discussed in the **[online
supplement](https://github.com/devSJR/PCRedux/raw/master/docs/articles/PCRedux.pdf)** in more
detail. The examples described focus on the concepts for **binary
(dichotomous) classification** [@kruppa_probability_2014] as negative or
positive. The mere binary classification into the classes "positive" or "negative"
is not necessarily the aim of the \texttt{PCRedux} package. Instead, it aims
to provide a tool set for automatic **multicategory (polychotomous)
classification** of amplification curves into any conceivable class. Such
a classification could, for example, rate the quality of an amplification curve as
negative, ambiguous or positive (\autoref{htPCR_nap}A & B). A definition of
binary (dichotomous) classification and multicategory (polychotomous)
classification is presented in @kruppa_probability_2014.
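For example, the multicategory classification of the `vermeulen1` amplification curves used for \autoref{htPCR_nap}B can be tabulated as follows (a short sketch based on the classification file shipped with \texttt{PCRedux}):
```{r, echo=TRUE, eval=FALSE}
# Sketch: tabulate the three classes (n, a, y) of the classified vermeulen1
# amplification curves (same file and column as used in the figure below).
filename <- system.file("decision_res_vermeulen1.csv", package = "PCRedux")
dec <- read.csv(filename)[, 2]
table(factor(dec, levels = c("n", "a", "y")))
```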
```{r htPCR_nap, echo=FALSE, fig.cap=htPCR_nap, fig.scap=htPCR_nap_short}
# Create graphic device for the plot(s)
layout(matrix(c(1,1,2,3,4,5),2,3,byrow=TRUE))
data <- qpcR::htPCR
matplot(
data[, 1], log(data[, c(552, 512, 616)]), xlab = "Cycles", ylab = "log(RFU)",
main = "", type = "l", lty = 1, lwd = 2
)
legend("topleft", c(
paste("negative ", colnames(data)[552]),
paste("ambiguous ", colnames(data)[512]),
paste("positive ", colnames(data)[616])
), col = 1:3, pch = 19, bty = "n")
mtext("A", cex = 1, side = 3, adj = 0, font = 2)
# Empty plot
plot(NA, NA, xlim = c(0,1), ylim = c(0,1), xlab ="", ylab = "", xaxt ="n", yaxt="n", bty ="n")
# Load the decision_res_vermeulen1.csv data set from a csv file.
filename <- system.file("decision_res_vermeulen1.csv", package = "PCRedux")
dec <- read.csv(filename)[, 2]
usr_colors <- c(
adjustcolor("black", alpha.f = 0.80), adjustcolor("red", alpha.f = 0.80), adjustcolor("green", alpha.f = 0.80)
)
colors <- as.character(factor(dec,
levels = c("n", "a", "y"),
labels = usr_colors))
data <- qpcR::vermeulen1
dy_range <- range(data[, -1], na.rm=TRUE)
decision <- c("n", "a", "y")
labels <- c("B negative", " ambiguous", " positive")
for(i in 1L:length(decision)) {
samples <- ifelse(length(which(dec == decision[i])) > 50, 25, length(which(dec == decision[i])))
matplot(
data[, 1], log(data[, sample(which(dec == decision[i]) + 1, samples)]), xlab = "Cycles", ylab = "log(RFU)", main = "", type = "l", lty = 1, lwd = 2, col = usr_colors[i]#, ylim = y_range
)
abline(v = 2, col = "grey")
legend("topleft",
c("negative ", "ambiguous ", "positive "),
col = usr_colors, pch = 19, bty = "n")
mtext(labels[i], cex = 1, side = 3, adj = 0, font = 2)
}
```
## Relation of Machine Learning to the Classification of Amplification Curves\label{section_technologies_amplification_curves_ML}
A few scientific approaches have previously been shown in which machine learning
was used for the analysis of amplification curves. The intention of
@gunay_machine_2016 was to improve the determination of Cq values, without
dealing with classification. The authors postulated that they had developed an
improved prediction of Cq values using a modified three-parameter model. One
assumption of their approach was that their modified three-parameter model could
be applied to any amplification curve. However, there are reasons why such an
assumption is not valid.
In the **[online
supplement](https://github.com/devSJR/PCRedux/raw/master/docs/articles/PCRedux.pdf)** it was described that a considerable proportion of
amplification curves deviate clearly from a three-parameter model.
Multiparametric models with more than four parameters are more frequently
adapted to amplification curves. In addition, the multiparametric models tend to
adapt to noise (\autoref{curve_fit_fail}). Unsurprisingly, a Cq value is then
calculated even for negative amplification curves, demonstrating that a
three-parameter model alone cannot provide reliable predictions. However, a
correct model is important for the extraction of Cq values, for the
determination of predictors from the curves and consequently for classification.
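A sketch of such a model selection is shown below. The chosen amplification curve and the candidate model list are arbitrary examples; mselect() [\texttt{qpcR}] is used to compare the l4 to l7 models by information criteria:
```{r, echo=TRUE, eval=FALSE}
# Sketch: fit one amplification curve with a four-parameter model and let
# mselect() [qpcR] propose a better model from l4 ... l7 (arbitrary example).
library(qpcR)
amp <- PCRedux::RAS002[, c("cyc", "A01_gDNA.._unkn_B.Globin")]
fit_l4 <- pcrfit(data = amp, cyc = 1, fluo = 2, model = l4)
fit_best <- mselect(fit_l4, fctList = list(l4, l5, l6, l7), do.all = TRUE)
summary(fit_best)
```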
Data mining and machine learning can be used for descriptive and predictive
tasks during the analysis of complex data sets. Data mining uses specific
methods from statistical inference, software engineering and domain knowledge to
get a better understanding of the data, and to extract *hidden knowledge* from
the preprocessed data [@kruppa_probability_2014; @herrera_multiple_2016]. All
this implies that a human being interacts with the data at the different stages
of the whole process as part of the workflow in data mining. Elements of the
data mining process are the preprocessing of the data, the description of the
data, the exploration of the data and the search for connections and causes.
The availability of classified amplification curve data sets and technologies
for the classification of amplification curves is of high importance to train
and validate models. This is dealt with in the **[online
supplement](https://github.com/devSJR/PCRedux/raw/master/docs/articles/PCRedux.pdf)**.
For machine learning, the type of learning task is the first thing that needs to
be defined. The learning task can be a classification, clustering or regression
problem. Next, suitable algorithms can be selected depending on the task. In the
case of classification problems, the goal is to predict a *discrete valued*
output. The labels ($y$) are usually categorical and represent a finite number
of classes (e. g. "negative", "positive" $\rightarrow$ binary
classification). In regression tasks, the goal is to predict a
*continuously valued* output. Clustering is primarily about forming groups
(clusters) based on their similarities. More examples are presented in the
**[online
supplement](https://github.com/devSJR/PCRedux/raw/master/docs/articles/PCRedux.pdf)**.
In contrast, machine learning uses instructions and data in software modules to
create models that can be used to make predictions on novel data. In machine
learning, the human being is far less involved in the overall process.
Processes (algorithms) are used to create models with tunable parameters. These
models automatically adapt their performance to the information (predictors)
from the data. Well-known examples of machine learning technologies are Decision
Trees (DT), Boosting, Random Forests (RF), Support Vector Machines (SVM),
generalized linear models (GLM), logistic regression (LR) and deep neural
networks (DNN) [@lee_statistical_2010]. The three following concepts of machine
learning that are frequently described in the literature are *Supervised
learning*, *Unsupervised learning* and *Reinforcement Learning* which are
described in detail in the **[online
supplement](https://github.com/devSJR/PCRedux/raw/master/docs/articles/PCRedux.pdf)**.
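Regardless of which of these classifiers is used, its predictions for amplification curves have to be compared against a reference classification. The following sketch uses made-up prediction and reference vectors and the ``performeR()`` function (\autoref{section_helper_functions}) to calculate common performance measures:
```{r, echo=TRUE, eval=FALSE}
# Sketch with made-up data: performance measures of a binary classification
# (1 = positive amplification curve, 0 = negative amplification curve).
library(PCRedux)
reference <- c(1, 1, 0, 0, 1, 0, 1, 0)   # reference classes
prediction <- c(1, 1, 0, 1, 1, 0, 0, 0)  # predicted classes
performeR(sample = prediction, reference = reference)
```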
# Why is there a need for the \texttt{PCRedux} software?\label{why_PCRedux}
The classification of an amplification curve is feasible using bioanalytical methods such as melting curve analysis [@roediger_RJ_2013] or electrophoretic separation [@westermeier2004]. However, this is not always possible or desirable.
- Melting curve analysis is used in some qPCRs as a post-processing step to identify samples which contain the specific target sequence (*positive*) based on a specific melting temperature. However, some detection probe systems like hydrolysis probes do not permit such classification. Moreover, nucleic acids with similar biochemical properties but different sequences may have the same melting temperature.
- An electrophoretic separation (classification of target DNA sequences by size and quantity) often requires too much effort for experiments with high sample throughput.
- There are mathematical qPCR analysis algorithms such as \texttt{linreg} [@ruijter_amplification_2009] that require information on whether an amplification curve is negative or positive for subsequent calculation.
- Raw data of amplification curves can be fitted with sigmoid functions. Sigmoid functions are non-linear, real-valued, have an S-shaped curvature (\autoref{figure_sigmoid_curve_models}) and can be differentiated (e. g., first derivative maximum, with one local minimum and one local maximum). With the obtained model, predictions can be made. For example, the position of the second derivative maximum can be calculated from this (\autoref{section_DataAnalysis}). In the context of amplification curves, the second derivative maximum is commonly used to describe the relationship between the cycle number and the PCR product formation (\autoref{section_DataAnalysis}). All software packages assume that the amplification resembles a sigmoid curve shape (ideal positive amplification reaction), or a flat low line (ideal negative amplification reaction). For example, @Ritz2008 published the \texttt{qpcR} \texttt{R} package that contains functions to fit several multi-parameter models. This includes the five-parameter Richards function [@richards_flexible_1959] (\autoref{l5}). The \texttt{qpcR} package [@Ritz2008] contains an amplification curve test via the ``modlist()`` function. The parameter `check="uni2"` offers an analytical approach, as part of a method for kinetic outlier detection. It tries to check for a sigmoid structure of the amplification curve. Then, ``modlist()`` tests for the location of the first derivative maximum and the second derivative maximum. However, multi-parameter functions fit "successfully" in most cases, including noise, and give false-positive results. This is shown exemplarily in later sections in combination with the ``amptester()`` [\texttt{chipPCR}] function [@roediger2015chippcr], which uses fixed thresholds and frequentist inference to identify amplification curves that exceed the threshold ($\mapsto$ classified as positive). However, the analysis can also lead to false-positive classifications as exemplified below and in \autoref{curve_fit_fail}. Therefore, additional classification concepts would be beneficial.
```{r curve_fit_fail, echo=FALSE, fig.cap=curve_fit_fail, fig.scap=curve_fit_fail_short, fig.width=6}
# Load the qpcR package for the model fit.
library(qpcR)
library(chipPCR)
# Select one positive and one negative amplification curve from the PCRedux
# package.
amp_data <- PCRedux::RAS002[, c("cyc", "A01_gDNA.._unkn_B.Globin",
"B07_gDNA.._unkn_HPRT1")]
colnames(amp_data) <- c("cyc", "positive", "negative")
# Arrange graphs in a matrix and set the plot parameters. Then plot the positive
# and negative amplification curves.
height <- c(3100, 4100)
plot(NA, NA, xlim = range(amp_data[, "cyc"]),
ylim = range(amp_data[, c("positive", "negative")]),
xlab = "Cycles", ylab = "RFU", main = "")
# Apply the amptester() function from the chipPCR package to the amplification
# curve data and write the results to the main of the plots.
for (i in 2:3) {
res.ampt <- suppressMessages(amptester(amp_data[, i]))
# Make a logical connection between three tests (shap.noisy, lrt.test and
# tht.dec) of amptester to decide if an amplification reaction is
# positive or negative.
decision <- ifelse(!res.ampt@decisions[1] &&
res.ampt@decisions[2] &&
res.ampt@decisions[4],
"positive", "negative"
)
# The amplification curves were fitted (l7 model) with pcrfit() function.
# The Cq was determined with the efficiency() function.
fit <- pcrfit(data = amp_data, cyc = 1, fluo = i, model = l7)
res <- efficiency(fit, plot = FALSE)
lines(predict(fit), pch = 19, lty = 1, xlab = "Cycles", ylab = "RFU",
main = "", col = i - 1)
abline(h = res[["fluo"]], col = "grey")
points(res[["cpD2"]], res[["fluo"]], pch = 19)
legend(1, height[i-1], paste0(colnames(amp_data)[i],
" curve -> Decision: ",
decision, " Cq: ", res[["cpD2"]]),
bty = "n", cex = 1, col = "red"
)
}
```
- The analysis and classification of sigmoid data (e. g., quantitative PCR) is a manageable task if the data volume is low, or dedicated analysis software is available. An example of a low number of amplification curves is shown in \autoref{figure_sigmoid_curve}A. All `r ncol(chipPCR::C127EGHP)-1` curves exhibit a sigmoid curve shape. It is trivial to classify them as positive by hand. In contrast, the vast number of amplification curves in \autoref{figure_sigmoid_curve}B is barely manageable with a reasonable effort by simple visual inspection. These data originate from a high-throughput experiment that encompasses in total `r suppressMessages(ncol(qpcR::htPCR))-1` amplification curves, of which only 200 are shown. A manual analysis of the data is time-consuming and prone to errors. Even for an experienced user, it is difficult to classify the amplification curves unambiguously and reproducibly, as will be shown later in \autoref{section_data_sets}.
- qPCRs are performed in thermo-cyclers, which are equipped with a real-time monitoring technology. There are numerous commercial manufacturers producing thermo-cyclers (\autoref{table-datasets}). An example of a thermo-cycler that originated from a scientific project is the VideoScan technology [@roediger_highly_2013]. Most of the thermo-cyclers have a thermal block with wells at certain positions. Reaction vessels containing the PCR mix are inserted into the wells. There are also thermo-cyclers that use capillary tubes that are heated and cooled by air (e. g., Roche Light Cycler 1.0). The thermo-cycler raises and lowers the temperature in the reaction vessels in discrete, pre-programmed steps so that PCR cycling can take place. Instruments with a real-time monitoring functionality have sensors to measure changes of the fluorescence intensity in the reaction vessel. All thermo-cycler systems use software to process the amplification curves. Plots of the fluorescence observations versus cycle number obtained from two different qPCR systems are shown in \autoref{figure_sigmoid_curve}A and B. The thermo-cyclers produce different amplification curve shapes even with the same sample material and PCR mastermix because of their technical design, sensors, and software. These factors need to be taken into account during the development of analysis algorithms.
```{r figure_sigmoid_curve_models, echo=FALSE, fig.scap="Sigmoid models of amplification curves", fig.height=6, out.extra='', fig.cap="A) Model function of a one-parameter sigmoid function. B) Model function of a sigmoid function with an intercept $n$ = 0.2 RFU (shift in base-line). C) Model function of a sigmoid function with an intercept ($n$ \\textasciitilde 0.2 RFU) and a square portion $m * x^{2}, m = -0.0005, n = 0.2 RFU$ (hook-effect-like). D) Model function of a sigmoid function with an intercept ($n$) and a square portion of $m * x^{2}$ and additional noise $\\epsilon$ (normal distributed, $\\mu = 0.01, \\sigma = 0.05$). Note: This plot was shown in linear scale to demonstrate typical pitfalls."}
x_val <- seq(-10, 10, 0.5)
y_val <- 1 / (1 + exp(-x_val))
y_val_slope <- 1 / (1 + exp(-x_val)) + 0.2
y_val_slope_quadratic <- 1 / (1 + exp(-x_val)) + -0.0005 * x_val ^ 2 + 0.2
y_val_slope_quadratic_noise <- 1 / (1 + exp(-x_val)) + -0.0005 * x_val ^ 2 + 0.2 + rnorm(length(x_val), mean = 0.01, sd = 0.05)
y_lim <- c(-0.05, max(c(
y_val, y_val_slope, y_val_slope_quadratic,
y_val_slope_quadratic_noise
)) * 1.2)
par(mfrow=c(2,2))
plot(x_val, y_val, type = "l", xlab = "x", ylab = "f(x)", ylim = y_lim)
abline(h = 0, col = "grey")
legend("topleft", expression(y == frac(1, (1 + e ^ {
-x
}))), bty = "n", cex = 0.9)
mtext("A", cex = 1, side = 3, adj = 0, font = 2)
plot(x_val, y_val_slope, type = "l", xlab = "x", ylab = "f(x)", ylim = y_lim)
abline(h = 0, col = "grey")
legend("topleft", expression(y == frac(1, (1 + e ^ {
-x
})) + n), bty = "n", cex = 0.9)
mtext("B", cex = 1, side = 3, adj = 0, font = 2)
plot(
x_val, y_val_slope_quadratic, type = "l", xlab = "x", ylab = "f(x)",
ylim = y_lim
)
abline(h = 0, col = "grey")
legend("topleft", expression(y == frac(1, (1 + e ^ {
-x
})) + m * x ^ 2 + n), bty = "n", cex = 0.9)
mtext("C", cex = 1, side = 3, adj = 0, font = 2)
plot(
x_val, y_val_slope_quadratic_noise, type = "l", xlab = "x", ylab = "f(x)",
ylim = y_lim
)
abline(h = 0, col = "grey")
legend("topleft", expression(y == frac(1, (1 + e ^ {
-x
})) + m * x ^ 2 + n + epsilon, epsilon %~% N(0, sigma)), bty = "n", cex = 0.9)
mtext("D", cex = 1, side = 3, adj = 0, font = 2)
```
```{r figure_sigmoid_curve, echo=FALSE, fig.scap="Shape of amplification curves", fig.height=6, out.extra='', fig.cap="Amplification curve data from an iQ5 (Bio-Rad) thermo-cycler and a high-throughput experiment in the Biomark HD (Fluidigm). A) The `C127EGHP` data set with 64 amplification curves was produced in a conventional thermo-cycler with an 8 x 12 PCR grid. B) The `htPCR` data set, which contains 8858 amplification curves, was produced in a 95 x 96 PCR grid. Only 200 amplification curves are shown. In contrast to `A)`, all amplification curves in `B)` have a strong off-set (intercept) between -2.5 and 0 log(RFU). This requires proper baselining."}
library(chipPCR)
library(qpcR)
# Create graphic device for the plot(s)
par(mfrow = c(2, 1), las = 0, bty = "o", oma = c(0, 0, 0, 0))
# Load C127EGHP data set from the chipPCR package
data <- chipPCR::C127EGHP
colors <- rainbow(1000, alpha = 0.7)
matplot(
data[, 2], log(data[, c(-1, -2)]), xlab = "Cycles", ylab = "log(RFU)",
main = "", type = "l", lty = 1, lwd = 2, col = sample(colors,
ncol(data) - 2)
)
abline(h = -2, col = "grey")
mtext("A iQ5 C127EGHP data set", cex = 1, side = 3, adj = 0, font = 2)
matplot(
htPCR[, 1], log(htPCR[, c(2L:201)]), xlab = "Cycles", ylab = "log(RFU)",
main = "", type = "l", lty = 1, lwd = 2, col = sample(colors, 200)
)
abline(h = -1, col = "grey")
mtext("B Biomark HD htPCR data set", cex = 1, side = 3, adj = 0, font = 2)
```
There are several open source and closed source software tools for the analysis of qPCR data [@pabinger_2014]. The software packages deal for example with challenges like missing values and non-detects [@mccall_non-detects_2014], quantification cycle estimation [@Ritz2008; @ruijter_evaluation_2013], relative gene expression analysis [@dvinge_htqpcr:_2009; @pabinger_qpcr:_2009; @neve_unifiedwmwqpcr:_2014] and data analysis pipelines [@pabinger_qpcr:_2009; @ronde_practical_2017; @mallona_pcrefficiency:_2011; @mallona_chainy:_nodate]. More information can be found in the **[online supplement](https://github.com/devSJR/PCRedux/raw/master/docs/articles/PCRedux.pdf)**.
However, a bottleneck of qPCR data analysis is the lack of predictors and software to build classifiers for amplification curves. A classifier herein refers to a vector of **interpretable** predictors that can be used to distinguish the amplification curves only by their shape. A predictor, also referred to as *feature*, is an entity that characterizes an object. A few potential predictors for amplification curves are described in the literature.
# Principles of Amplification Curve Data Analysis and Predictor Calculation\label{section_DataAnalysis}
The shape of a positive amplification curve is in most cases sigmoidal. Many factors such as the sample quality, qPCR chemistry, and technical problems (e. g., sensor errors) contribute to various curve shapes [@ruijter_2014]. The curvature of the amplification curve can be used as a quality measure. For example, fragmentation, inhibitors and sample material handling errors during the extraction can be identified. The kinetics of fluorescence emission is proportional to the quantity of the synthesized DNA. Typical amplification curves have three phases.
1. **Ground phase**: This phase occurs during the first cycles of the PCR, where the fluorescence emission is in most cases flat. Here, the sensor system detects only noise and no product formation, and the PCR product signal is an insignificantly small component of the total signal. This is often referred to as the base-line or background signal. Apparently, there is only a phase shift or no signal at all, primarily due to the limited sensitivity of the instrument. Even in a perfect PCR (doubling of the product per cycle), qPCR instruments cannot detect the fluorescence signal from the amplification during these early cycles. Fragmentation, inhibitors and sample handling errors would result in a prolonged ground phase. Nevertheless, this phase may indicate some typical properties of the qPCR system or probe system.
In many instruments, this phase is used to determine the base-line level for the calculation of the cycle threshold (Ct). The Ct value marks the cycle at which the increase of the signal originating from the amplicon is considered statistically relevant, i. e., it rises outside of the noise range (threshold). In some qPCR systems, a flat amplification signal is expected in this phase. Slight deviations from this trend are presumably due to changes (e. g., disintegration of probes) in the fluorophores. Background correction algorithms are often used here to ensure that flat amplification curves without slope are generated. However, this can result in errors and inevitably leads to a loss of information about the waveform of the raw data [@nolan_2006]. The slope, level and variance of this phase can serve as predictors.
2. **Exponential phase**: This phase follows the ground phase and is also called *log-linear phase*. It is characterized by a strong increase of the emitted fluorescence, as the DNA amount roughly doubles in each cycle under ideal conditions and the amount of the synthesized fluorescently labeled PCR product is high enough to be detected by the sensor system. This phase is used for the calculation of the quantification point (Cq) and the curve-specific amplification efficiency. The most important measurement from qPCRs is the Cq, which signifies the PCR cycle at which the fluorescence exceeds a ``threshold value``. However, there is an ongoing debate as to what a significant and robust threshold value is. An overview and performance comparison of Cq methods is given in @ruijter_evaluation_2013. There are several mathematical methods to calculate the Cq.
    - The 'classical' threshold value (cycle threshold, Ct) is the intersection between a manually defined straight horizontal line and the quasi-linear part of the exponential amplification phase (\autoref{figure_quantification_points}A & B). This simple-to-implement method requires that amplification curves are properly baselined prior to analysis. The Ct method assumes that the amplification efficiency (~ slope in the log-linear phase) is equal across all compared amplification curves [@ruijter_evaluation_2013]. Evidently, this is not always the case, as exemplified in \autoref{amplification_curve_ROI}C. The Ct method is widely used, presumably due to the familiarity of users with this approach (e. g., chemical analysis procedures). However, this method is statistically unreliable [@ruijter_evaluation_2013; @spiess_impact_2015; @spiess_system-specific_2016]. Moreover, the Ct method does not give stable predictions if different users are given the same data set to be analyzed. *Therefore, this method is not used within the \texttt{PCRedux} package*.
    - Another Cq method uses the maximum of the second derivative (SDM) [@roediger2015r] (\autoref{figure_quantification_points}C); a minimal code sketch for both Cq approaches is given below, after the description of the ROIs. In all cases, the Cq value can be used to calculate the concentration of the target sequence in a sample (low Cq \textrightarrow high target concentration). In contrast, negative or ambiguous amplification curves loosely resemble noise. This noise may appear linear or exhibit a curvature similar to a specific amplification curve (\autoref{htPCR_nap}). This, however, may result in a faulty interpretation of the amplification curves. Fragmentation, inhibitors and sample handling errors would decrease the slope of the amplification curve [@spiess_highly_2008; @Ritz2008]. The slope and its variation can be considered as predictors. Since the Cq depends on the initial template amount and the amplification efficiency, there seemingly is no immediate use of the Cq as a predictor.
3. **Plateau phase**: This phase follows the exponential phase and is a consequence of the exhaustion of limited reagents (incl. primers, nucleotides, enzyme activity) in the reaction vessel, which limits the amplification reaction so that the theoretical maximum amplification efficiency (doubling per cycle) no longer prevails. This turning point, and the progressive limitation of resources, finally leads to a plateau. In the plateau phase, there is in some cases a signal decrease called the *hook effect* (\autoref{why_PCRedux} and [@barratt_improving_2002; @isaac_essentials_2009; @burdukiewicz_algorithms_2018]). The slope (*hook effect*), level and variation can be considered as predictors.
If the amplification curve has only a slight positive slope and no perceptible/measurable exponential phase, it can be assumed that the amplification reaction did not occur (\autoref{amplification_curve_ROI}B). Causes may include poor specificity of the PCR primers (non-specific PCR products), degraded sample material, degraded probes or detector failures. If a lot of input DNA is present in a sample, the amplification curve starts to increase in early PCR cycles (cycles 1 - 12). Some PCR devices have software that corrects this feature without rechecking, resulting in an amplification curve with a negative trend.
The discussed phases are considered as regions of interest (ROI). As an example, the \textit{ground phase} is in the head area, while the \textit{plateau phase} is in the tail area. The \textit{exponential phase} is located between these two ROIs.
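As a minimal, hedged sketch of the two Cq approaches discussed above, a threshold-based Ct and an SDM-based Cq can be estimated with functions from the \texttt{qpcR} and \texttt{chipPCR} packages. The choice of the first ten cycles as background and the use of the `testdat` example curve are assumptions for illustration only, not fixed standards.
```{r, eval=FALSE}
# Sketch (not run): a background-based threshold (Ct) and an SDM-based
# Cq (cpD2) for one amplification curve of the testdat data set.
library(qpcR)
library(chipPCR)
cycles <- testdat[, 1]
rfu <- testdat[, 2]
# Threshold derived from the ground phase: mean + 3 standard deviations
# of the first ten cycles (illustrative rule)
threshold <- mean(rfu[1:10]) + 3 * sd(rfu[1:10])
res_ct <- th.cyc(cycles, rfu, r = threshold, linear = FALSE)
# Cq from the maximum of the second derivative (SDM) of a sigmoid fit
fit <- pcrfit(testdat, cyc = 1, fluo = 2, model = l5)
res_eff <- efficiency(fit, plot = FALSE)
c(Ct = res_ct[1, 1], cpD2 = res_eff$cpD2)
```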
```{r amplification_curve_ROI, echo=FALSE, fig.scap=amplification_curve_ROI_short, fig.cap=amplification_curve_ROI, fig.height=8, fig.width=8}
library(qpcR)
library(PCRedux)
colors <- rainbow(10, alpha = 0.15)
x_range <- 1L:35
d <- testdat[x_range, ]
amp_data <- data.frame(
d[, 1],
pos = d[, 3] + 0.9,
posReverse = (max(d[, 3]) - rev(d[, 3])) + 0.9,
neg = d[, 4] + 0.9 + 0.0005 * d[, 1] ^ 2
)
# Calculation for the normal data
res_amp_data <- pcrfit(amp_data, 1, 2, l5)
res_takeoff <- takeoff(res_amp_data)
# Calculation of sd_bg
res_sd_bg <- sd(amp_data[1:res_takeoff[[1]], 2])
# Calculation for the reversed data
res_amp_data_reverse <- pcrfit(amp_data, 1, 3, l5)
res_takeoff_reverse <- takeoff(res_amp_data_reverse)
res_takeoff_reverse[[1]] <- nrow(d) - res_takeoff_reverse[[1]]
res_takeoff_reverse[[2]] <- amp_data[res_takeoff_reverse[[1]], 2] - res_takeoff_reverse[[2]] + min(amp_data[, 3])
exponentialRange <- c((res_takeoff[[1]] + 1):(res_takeoff_reverse[[1]] - 1))
backgroundplateu <- function(x) {
bg <- mean(head(x, res_takeoff[[1]])) + 3 * sd(head(x, res_takeoff[[1]]))
plat <- mean(tail(x, 10)) - 3 * sd(tail(x, 10))
list(bg = bg, plateau = plat)
}
res_lm <- lm(amp_data[exponentialRange, 2] ~ amp_data[exponentialRange, 1])
y_lim <- max(amp_data[, 2:4]) * 1.15
res_bgpl <- unlist(backgroundplateu(amp_data[, 2]))
# Create graphic device for the plot(s)
layout(matrix(c(1, 1, 2, 3), 2, 2, byrow = TRUE), respect = TRUE)
plot(amp_data[, 1], amp_data[, 2], ylim = c(-0.1, y_lim), xlab = "Cycles", ylab = "RFU", type = "b", lwd = 2, pch = 19)
text(c(2,30), c(10.5,10.5), c("Head", "Tail"), cex = 1.2, col = "red")
rect(0, 0, res_takeoff[[1]] + 1, res_takeoff[[2]] * 1.25, col = colors[1], border = NA)
text(5, res_bgpl[1] * 1.45, "Ground phase")
rect(res_takeoff_reverse[[1]] - 1, res_takeoff_reverse[[2]] * 0.95, nrow(amp_data), y_lim, col = colors[5], border = NA)
text(32, res_bgpl[2] * 1.1, "Plateau phase")
text(res_takeoff_reverse[[1]], mean(amp_data[, 2]), "Exponential\nregion")
points(
c(res_takeoff[[1]], res_takeoff_reverse[[1]]),
c(res_takeoff[[2]], res_takeoff_reverse[[2]]), pch = 12, cex = 2.5
)
text(
c(res_takeoff[[1]], res_takeoff_reverse[[1]]),
c(res_takeoff[[2]], res_takeoff_reverse[[2]]) + c(1.05, -1.05), c("top", "tdp")
)
arrows(20, 0, 20, res_bgpl[1], code = 3, length = 0.1)
text(30, res_bgpl[1] / 2, "Background")
arrows(5, res_bgpl[2], 5, max(amp_data[, 2]), code = 3, length = 0.1)
text(15, res_bgpl[2] * 0.95, "Plateau")
abline(res_lm, col = "red")
points(amp_data[exponentialRange, 1], amp_data[exponentialRange, 2], pch = 19, col = "red")
abline(h = res_bgpl, col = c("green", "blue"))
abline(h = 0, col = "grey")
legend(2, 12, paste0(
"Slope: ", signif(coef(res_lm)[2], 3),
"\nBackground (mean): ", signif(res_bgpl[1], 3),
"\nsd_bg: ", signif(res_sd_bg, 3),
"\nPlateau: ", signif(res_bgpl[2], 3),
"\ntop: ", signif(res_takeoff[[1]], 3),
"\ntdp: ", signif(res_takeoff_reverse[[1]], 3)
), bty = "n")
mtext("A Positive", cex = 1, side = 3, adj = 0, font = 2)
y_lim <- 2
plot(amp_data[, 1], amp_data[, 4], ylim = c(-0.1, y_lim), xlab = "Cycles", ylab = "RFU", type = "b", lwd = 2, pch = 19)
res_bgpl <- unlist(backgroundplateu(amp_data[, 4]))
abline(h = res_bgpl, col = c("green", "blue"))
abline(h = 0, col = "grey")
mtext("B Negative", cex = 1, side = 3, adj = 0, font = 2)
curve_colors <- c(rainbow(ncol(boggy) - 1, alpha = .5))
matplot(boggy[, 1], boggy[, -1], type = "l", col = curve_colors, xlab = "Cycles", ylab = "RFU", lty = 1)
rect(22, 2, 40, 2.3, border = "blue")
text(27.5, 2.1, "Hook effect", col = "blue")
mtext("C boggy data set", cex = 1, side = 3, adj = 0, font = 2)
```
The amplification curve shape, the amplification efficiency and the Cq value are important measures to judge the outcome of a qPCR reaction. In all phases of the PCR, the curves should be smooth. Possible artifacts in the curves may be due to unstable light sources in the instrument or problems during sample preparation, such as the presence of bubbles in the reaction vessel, incorrectly assigned dye detectors, errors during the calibration of dyes for the instrument, errors during the preparation of the PCR master mix, sample degradation, lack of a sample in the PCR, too much sample material in the PCR mix or a low detection probe concentration [@ruijter_amplification_2009; @ruijter_2014; @spiess_impact_2015]. Smoothing and filtering cause alterations to the raw data that affect the Cq value and the amplification efficiency.
Most commercial qPCR systems do not display the raw data of the amplification curves on the screen. Instead, raw data are often processed by the instrument software to remove fluorophore-specific effects and noise in all ROIs. Commonly employed preprocessing steps in qPCR are smoothing and filtering to remove noise, which can have different causes [@spiess_impact_2015].
The ordinate often does not display the measured fluorescence, but rather the change in fluorescence per cycle ($\varDelta RFU = RFU_{cycle + 1} - RFU_{cycle}$). Some qPCR systems display periodicity in the amplification curve data, thereby exposing the risk of introducing artificial shifts in the Cq values [@spiess_system-specific_2016].
In particular, the cycle threshold method (Ct method) (\autoref{section_DataAnalysis}) is affected by these factors [@spiess_impact_2015; @spiess_system-specific_2016]. Therefore, it is advisable to clarify in advance which processing steps the amplification curves have been subjected to. Failure to do so may result in misinterpretations and incorrect amplification curve fitting models [@nolan_2006; @roediger2015r; @roediger2015chippcr; @spiess_impact_2015].
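As a brief illustration of the $\varDelta RFU$ formula above, the per-cycle change of the fluorescence can be computed with base R's `diff()` function; the use of the `testdat` data set is an example only.
```{r, eval=FALSE}
# Sketch (not run): per-cycle change of the fluorescence (Delta RFU)
# of one curve from the testdat data set (qpcR package).
library(qpcR)
delta_rfu <- diff(testdat[, 2])  # RFU_{cycle + 1} - RFU_{cycle}
head(delta_rfu)
```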
```{r figure_quantification_points, results='hide', message=FALSE, echo=FALSE, fig.scap=figure_quantification_points_short, fig.cap=figure_quantification_points, fig.height=8, fig.width=8}
library(qpcR)
library(chipPCR)
res_model <- pcrfit(testdat, cyc = 1, fluo = 2, model = l5)
res_takeoff <- takeoff(res_model, pval = 0.05, nsig = 3)
res_model_predict <- predict(res_model)
r_user <- 2.356
res_th.cyc <- th.cyc(testdat[, 1], testdat[, 2], r = r_user, linear = FALSE)
# Create graphic device for the plot(s)
layout(matrix(c(1, 2, 3, 3), 2, 2, byrow = TRUE), respect = TRUE)
plot(testdat[, 1], testdat[, 2], xlab = "Cycles", ylab = "Raw fluorescence")
abline(h = (mean(testdat[1:10, 2]) + 3 * sd(testdat[1:10, 2])), col = "grey")
abline(h = res_th.cyc[1, 2], col = "black")
text(28, r_user + 0.3, paste0("Threshold: ", r_user))
arrows(res_th.cyc[1, 1], res_th.cyc[1, 2], res_th.cyc[1, 1], 0, angle = 25,
length = 0.1, lwd = 2)
mtext(paste0("A ", "Ct = ", signif(res_th.cyc[1, 1], 4)), cex = 1, side = 3,
adj = 0, font = 2)
plot(testdat[, 1], log(testdat[, 2]), xlab = "Cycles",
ylab = "log(Raw fluorescence)")
abline(h = log(res_th.cyc[1, 2]), col = "black")
arrows(res_th.cyc[1, 1], log(res_th.cyc[1, 2]), res_th.cyc[1, 1],
min(log(testdat[, 2]), na.rm = TRUE), angle = 25, length = 0.1, lwd = 2)
mtext(paste0("B ", "Ct = ", signif(res_th.cyc[1, 1], 4)), cex = 1,
side = 3, adj = 0, font = 2)
res_efficiency <- efficiency(res_model)
cpDdiff <- sqrt((res_efficiency$cpD1 - res_efficiency$cpD2)^2)
arrows(res_takeoff[[1]], res_takeoff[[2]], res_takeoff[[1]], -0.2, angle = 25,
length = 0.1, lwd = 2)
abline(v = 19.5)
mtext(paste0("C ", "cpDdiff: ", cpDdiff), cex = 1, side = 3, adj = 0,
font = 2)
```
## Data Analysis Functions of the \texttt{PCRedux} Package \label{section_Functions_of_PCRedux}
### Helper Functions of the \texttt{PCRedux} Package \label{section_helper_functions}
The \texttt{PCRedux} package contains functions for analyzing amplification curves. These are divided into helper functions and analysis functions. Details about the two helper functions
- ``performeR()`` - Performance Analysis for Binary Classification and
- ``qPCR2fdata()`` - A Helper Function to Convert Amplification Curve Data to the `fdata` format
can be found in the **[online supplement](https://github.com/devSJR/PCRedux/raw/master/docs/articles/PCRedux.pdf)**.
### ``pcrfit_single()`` and ``encu()`` - Predictor Calculation from an Amplification Curve \label{section_pcrfit_single_pcrfit_parallel}
The following sections give a concise description of the algorithms used to calculate predictor vectors by the ``pcrfit_single()`` function. Based on considerations and experience, the algorithms of the ``pcrfit_single()`` function are restricted to ROIs (\autoref{amplification_curve_ROI}) to calculate specific predictors.
The ``encu()`` function is a wrapper for the ``pcrfit_single()`` function. ``encu()`` can be used to process large records of amplification curve data arranged in columns. The progress of processing is displayed in the form of a progress bar along with the estimated run-time. Additionally, ``encu()`` allows the user to specify which monitoring chemistry (e. g., DNA binding dye, sequence-specific probes) and which thermo-cycler was used. @ruijter_2014 demonstrated that the monitoring chemistry and the type of input DNA (single stranded, double stranded) are important when analysing qPCR data, because they have an influence on the shape of the amplification curve. For simplicity, the documentation will describe only the ``pcrfit_single()`` function.
The underlying hypotheses and concepts of the predictors are formulated and supported by *exemplary applications*. Different representative data sets were used to support a concept or predictor. For example, the `RAS002` data set represents a typical qPCR. This means that the positive amplification curves start with a flat ground phase (base-line) and then transition into the sigmoid shape with a plateau. The negative amplification curves display no significant peculiarities. For both positive and negative amplification curves, there is a shift from the origin. The `htPCR` data set serves as a problem example in several places, since it contains many observations (amplification curves from high-throughput experiments). In addition, the amplification curves have a high diversity of curve shapes that cannot be uniquely and reproducibly classified even by experienced users. Other data sets are used in the documentation, but these are not discussed in detail.
To underscore the usability of the algorithms and their predictors, `r d <- PCRedux::data_sample; paste0(nrow(d), " observations ", "(", sum(d[["decision"]] == "n"), " negative amplification curves, ", sum(d[["decision"]] == "y"), " positive amplification curves)")` from the `batsch1`, `boggy`, `C126EG595`, `competimer`, `dil4reps94`, `guescini1`, `karlen1`, `lievens1`, `reps384`, `rutledge`, `testdat`, `vermeulen1`, `VIMCFX96_60`, `stepone_std`, `RAS002`, `RAS003`, `HCU32_aggR` and `lc96_bACTXY` were analyzed with the ``encu()`` function and the results (predictors) were combined in the file **`data_sample.rda`**. Users of this function should independently verify and validate the results of the methods for their own applications.
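The exact call used to generate **`data_sample.rda`** is not reproduced here. As a hedged sketch, predictors for all curves of a single data set can be collected column by column with ``pcrfit_single()``; this is essentially what ``encu()`` automates, including the meta information mentioned above.
```{r, eval=FALSE}
# Sketch (not run): calculate the predictors for every amplification
# curve of the RAS002 data set and combine them into one data frame.
library(PCRedux)
res_predictors <- do.call(rbind, lapply(2L:ncol(RAS002), function(i) {
  pcrfit_single(RAS002[, i])
}))
head(res_predictors)
```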
A new data set called `data_sample_subset_balanced` has been compiled from the `data_sample` data set for some of the applications. Selection criteria included:
- both positive and negative amplification curves had to be included in a similar ratio,
- there should not be a dominating thermo-cycler platform,
- the amplification curves should represent typical amplification curves (subjective criterion).
The compilation of the data sets `batsch1`, `HCU32_aggR`, `lc96_bACTXY`, `RAS002`, `RAS003` and `stepone_std` met this requirement satisfactorily.
```{r, echo=FALSE}
data_sample_subset_balanced <- data_sample[data_sample$dataset %in%
c("batsch1", "boggy", "C126EG595", "HCU32_aggR", "lc96_bACTXY",
"RAS002", "RAS003", "stepone_std", "testdat"), ]
# Dimension of data_sample_subset_balanced
dim(data_sample_subset_balanced) ## Observations predictors
```
```{r, echo=FALSE, fig.width=3.8}
# Show the counts of negative and positive amplification
# curves in a bar plot
# Build a contingency table of the counts at each
# combination of factor levels.
dec_table <- table(data_sample_subset_balanced[["decision"]])
barplot(dec_table, ylab = "Number of Observations", col = c("green", "black"),
border = "white")
text(c(0.7, 1.9), rep(min(dec_table) * 0.9, length(dec_table)),
c(paste("y = ", dec_table[1]), paste("n = ", dec_table[2])),
col = c("black", "white"))
mtext("data_sample_subset_balanced", cex = 1, side = 3, adj = 0, font = 2,
las = 0)
```
For the comparison of predictors, the data set was enlarged. Selection criteria for the data sets were comparatively less stringent.
```{r, echo=TRUE}
data_sample_subset <- data_sample[data_sample$dataset %in% c("stepone_std",
"RAS002", "RAS003",
"lc96_bACTXY",
"C126EG595",
"dil4reps94",
"testdat",
"boggy"), ]
# Dimension of data_sample_subset
dim(data_sample_subset) ## Observations predictors
```
```{r, echo=TRUE, fig.width=3.8}
# Show the counts of negative and positive amplification
# curves in a bar plot
# Build a contingency table of the counts at each
# combination of factor levels.
dec_table <- table(data_sample_subset[["decision"]])
barplot(dec_table, ylab = "Number of Observations", col = c("green", "black"),
border = "white")
text(c(0.7, 1.9), rep(min(dec_table) * 0.9, length(dec_table)),
c(paste("y = ", dec_table[1]), paste("n = ", dec_table[2])),
col = c("black", "white"))
mtext("data_sample_subset", cex = 1, side = 3, adj = 0, font = 2,
las = 0)
```
The goal is to demonstrate the basic functionality of the algorithms for predictor calculation. Similar concepts are presented in groups. The algorithms are divided into the following broad categories:
- algorithms that determine slopes, signal levels,
- algorithms that determine turning points and
- algorithms that determine areas.
The algorithms in
- ``earlyreg()`` (\autoref{section_earlyreg}),
- ``head2tailratio()`` (\autoref{section_head2tailratio}),
- ``hookreg()`` & ``hookregNL()`` (\autoref{section_hookreg}),
- ``mblrr()`` (\autoref{section_mblrr}) and
- ``autocorrelation_test()`` (\autoref{section_autocorrelation_test})
were implemented as standalone functions to make them available for other applications.
The output below shows the predictors and their data types (`num`, numeric; `int`, integer; `Factor`, factor; `logi`, logical) that were determined with the ``pcrfit_single()`` function.
```{r, echo=TRUE}
library(PCRedux)
# Calculate predictor vector of column two from the RAS002 data set.
str(pcrfit_single(RAS002[, 2]))
```
### Amplification Curve Preprocessing
The ``pcrfit_single()`` function performs preprocessing steps before each calculation, including checking whether an amplification curve contains missing values. Missing values (NA) are measuring points in a data set where no measured values are available or where values have been removed arbitrarily. NAs may occur if no measurement has been carried out (e. g., defective detector) or if the lengths of the vectors (number of cycles) differ between observations. Such missing values are automatically imputed by spline interpolation as described in @roediger2015chippcr.
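A minimal sketch of such an imputation, here with base R's `spline()` rather than the internal implementation, is shown below; the artificially removed measuring point is for illustration only.
```{r, eval=FALSE}
# Sketch (not run): impute a single missing fluorescence value by spline
# interpolation. This illustrates the principle only and is not the
# internal implementation of pcrfit_single().
library(qpcR)
cycles <- testdat[, 1]
rfu <- testdat[, 2]
rfu[15] <- NA # artificially introduce a missing measuring point
ok <- !is.na(rfu)
rfu[!ok] <- spline(cycles[ok], rfu[ok], xout = cycles)$y[!ok]
```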
Values of an amplification curve are normalized to their 99\% quantile or, in rare cases, to the maximum for many calculations. The normalization is used to equalize the amplitude differences of amplification curves from different thermo-cyclers (sensor technology, software processing) and detection chemistries. To compare amplification curves from different thermo-cyclers, the values should always be scaled systematically using the same method. Although there are other normalization methods (e. g., minimum-maximum normalization, see @roediger2015chippcr), the normalization by the 99\% quantile preserves the information about the level of the background phase. A normalization to the maximum is not used, to avoid a strong influence of outliers. The data in \autoref{plot_bg_pt}D show that the `maxRFU` values after normalization are approximately 1. There is no statistically significant difference between the `maxRFU` values of positive and negative amplification curves.
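The principle of the quantile normalization can be sketched as follows (illustration only, not the internal code of ``pcrfit_single()``).
```{r, eval=FALSE}
# Sketch (not run): scale an amplification curve by its 99% quantile so
# that amplitudes from different thermo-cyclers become comparable while
# the level of the background phase is preserved.
library(qpcR)
rfu <- testdat[, 2]
rfu_norm <- rfu / quantile(rfu, 0.99, na.rm = TRUE)
summary(rfu_norm)
```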
Selected algorithms of the ``pcrfit_single()`` function use the ``CPP()`` [\texttt{chipPCR}] function to preprocess (e. g., base-lining, smoothing, imputation of missing values) the amplification curves. Further details are given in @roediger2015chippcr. Until package version 0.2.6-4, the ``visdat_pcrfit()`` function was part of the package. ``visdat_pcrfit()`` was used for visualizing the content of data from an analysis with the ``pcrfit_single()`` function. There are other, more powerful packages for this purpose, such as \texttt{visdat} by @Tierney2017, \texttt{assertr} by @assertr and \texttt{xray} by @Seibelt_xray.
During the analysis, several values are determined to describe the amplitude of an amplification curve. The resulting potential predictors are `minRFU` (minimum of the amplification curve, which is determined at the 1\% quantile to minimize the influence of outliers), `init2` (the initial template fluorescence from an exponential model) and `fluo` (raw fluorescence value at the second derivative maximum). The `minRFU`, `init2` and `fluo` values differ significantly between negative and positive amplification curves (\autoref{plot_bg_pt}C, E & F).
### Handling of Missing Predictors
Missing values (NA) can occur if the calculation of a predictor is impossible (e. g., if a logistic function cannot be fitted to noisy raw data). The lack of a predictor is nevertheless useful information (no predictor calculated $\mapsto$ amplification curve deviates from a sigmoid shape). The NAs were left unchanged in the \texttt{PCRedux} package up to version 0.2.5-1. Since version 0.2.6, the NAs are replaced by numerical values (e. g., total number of cycles) or factors (e. g., *lNA* for a non-fitted model). Under the term "imputation", there are a number of procedures based on statistical methods (e. g., neighboring median, spline interpolation) or on user-defined rules [@williams_rattle:_2009; @cook_interactive_2007; @hothorn_handbook_2014]. Rules are mainly used in the functions of \texttt{PCRedux} to relieve the user from the decision as to how to deal with missing values. For example, slope parameters of a model are set to zero when they cannot be determined. The disadvantage is that rules do not necessarily correspond to real-world values.
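A hedged sketch of such a rule is given below; the replacement values (total number of cycles for a missing Cq-like predictor, zero for a missing slope) are illustrative assumptions, not the exact rules of \texttt{PCRedux}.
```{r, eval=FALSE}
# Sketch (not run): rule-based replacement of missing predictors.
n_cycles <- 40
predictors <- list(cpD2 = NA, b_slope = NA)
predictors$cpD2 <- ifelse(is.na(predictors$cpD2), n_cycles, predictors$cpD2)
predictors$b_slope <- ifelse(is.na(predictors$b_slope), 0, predictors$b_slope)
unlist(predictors)
```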
### Multi-parametric Models for Amplification Curve Fitting\label{section_models}
Both the ``pcrfit_single()`` function and the ``encu()`` function use four multi-parametric models based on the findings of @spiess_highly_2008 and @Ritz2008. The ``pcrfit_single()`` function starts by fitting a seven-parameter model, since this model adapts more *easily* and more frequently to a data set (\autoref{plot_models}).
* **l7**:
\begin{equation}\label{l7}
f(x) = c + k1 \cdot x + k2 \cdot x^2 + \frac{d - c}{(1 + exp(b(log(x) - log(e))))^f}
\end{equation}
From that model, the ``pcrfit_single()`` function estimates the predictors `b_slope` and `c_intercept`, which describe the slope and the y-intercept. The number of iterations required to fit the model is also stored. That value is returned by the ``pcrfit_single()`` function as `convInfo_iteratons`. The higher the `convInfo_iteratons` value, the more iterations were necessary to converge from the start parameters (\autoref{plot_dat_EffTop}L). A low `convInfo_iteratons` value is an indicator of
* a sigmoid curve shape or
* close start parameters.
High iteration numbers imply
* noisy amplification curves or
* non-sigmoid amplification curves.
The amplification curve fitting process continues with the four-parameter model (*l4*, \autoref{l4}). This is followed by a model with five parameters (*l5*, \autoref{l5}) and six parameters (*l6*, \autoref{l6}).
* **l4**:
\begin{equation}\label{l4}
f(x) = c + \frac{d - c}{1 + exp(b(log(x) - log(e)))}
\end{equation}
* **l5**:
\begin{equation}\label{l5}
f(x) = c + \frac{d - c}{(1 + exp(b(log(x) - log(e))))^f}
\end{equation}
* **l6**:
\begin{equation}\label{l6}
f(x) = c + k \cdot x + \frac{d - c}{(1 + exp(b(log(x) - log(e))))^f}
\end{equation}
The optimal model is selected on the basis of the Akaike information criterion and used for all further calculations. The ``pcrfit_single()`` function returns `qPCRmodel` as a factor (*l4*, *l5*, *l6*, *l7*). In case no model could be fitted, an *lNA* is returned.
The model is an indicator of the amplification curve shape. Models with many parameters deviate more from an ideal sigmoid shape. For instance, a four-parameter model, unlike the six-parameter model, does not have a linear component. A negative linear slope in the plateau phase is an indicator of a *hook effect* [@burdukiewicz_algorithms_2018].
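A simplified sketch of this model selection is given below; it is not the exact implementation of ``pcrfit_single()``, and the use of `testdat` and of `AIC()` on the fitted model objects is an assumption for illustration.
```{r, eval=FALSE}
# Sketch (not run): fit the l4, l5, l6 and l7 models to one amplification
# curve and select the model with the lowest Akaike information criterion.
library(qpcR)
models <- list(l4 = l4, l5 = l5, l6 = l6, l7 = l7)
fits <- lapply(models, function(m)
  try(pcrfit(testdat, cyc = 1, fluo = 2, model = m), silent = TRUE))
aics <- vapply(fits, function(f)
  if (inherits(f, "try-error")) NA_real_ else AIC(f), numeric(1))
names(which.min(aics)) # name of the best fitting model, e.g., "l5"
```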
```{r plot_models, echo=FALSE, fig.cap=plot_models, fig.scap=plot_models_short, fig.height=3.5}
library(PCRedux)
x <- data_sample$decision
y <- factor(data_sample[["qPCRmodel"]], levels = c("lNA", "l4", "l5", "l6", "l7"))
res_fw <- rbind(negative = table(y[x == "n"]),
positive = table(y[x == "y"])
)
# Define custom colors for the classes
colors <- c(adjustcolor("black", alpha.f = 0.25), adjustcolor("green", alpha.f = 0.25))
# Create graphic device for the plot(s)
par(mfrow = c(1,2))
barplot(res_fw / sum(res_fw) * 100, beside = TRUE, col = colors, xlab = "",
ylab = "Percentage", border = "white")
legend("top", c(
paste0("Negative, n =", length(y[x == "n"])), paste0("Positive, n = ", length(y[x == "y"]))
), fill = colors, bty = "n")
mtext("A Fitted models", cex = 1, side = 3, adj = 0, font = 2)
# Data used for the analysis
data <- data_sample
# Predictor that is going to be analyzed
predictor <- c("cpD2")
# Classes assigned manually by a human beforehand
x <- data$decision
# Helper function `densR()` to add density plots to a stripchart
densR <- function(data, decision, size = 0.4, position = 1.35){
x <- decision
y <- data[, colnames(data) == predictor[i]]
y_density_neg <- density(y[x == "n"])
y_density_pos <- density(y[x == "y"])
max_density <- max(c(y_density_pos$y, y_density_neg$y))
polygon(y_density_neg$y/max_density * size + position, y_density_neg$x, col = colors[1], border = NA)
polygon(y_density_pos$y/max_density * size + position, y_density_pos$x, col = adjustcolor("green", alpha.f = 0.25), border = NA)
}
for(i in 1L:length(predictor)) {
y <- data[, colnames(data) == predictor[i]]
res <- stats::wilcox.test(y ~ x)
h <- max(na.omit(y))
l <- min(na.omit(y))
h_text <- rep(h * 0.976, 2)
par(bg=NA)
stripchart(y ~ x, vertical = TRUE, ylab = "Cq (cpD2)",
method = "jitter", pch = 20, cex = 0.7,
col = adjustcolor("black", alpha.f = 0.8),
ylim = c(l * 0.95, h * 1.05))
densR(data = data, decision = x, size = 0.4, position = 1.35)
arrows(1,-0.25,1.5,25, length = 0.1)
arrows(2,-0.25,1.5,5, length = 0.1)
boxplot(y ~ x, outline = FALSE, add = TRUE, boxwex = 0.35)
legend("topleft", paste0("P = ", signif(res[["p.value"]])),
cex = 1, bty = "n")
mtext(paste0("B", " ", predictor[i]), cex = 1, side = 3,
adj = 0, font = 2, col = ifelse(signif(res[["p.value"]], 2) < 0.05,
"black", "red"))
}
```
### ``winklR()`` - A function to calculate the central angle based on the first and the second derivative of amplification curve data
``winklR()`` is a function that calculates the central angle spanned by the maximum of the first derivative and the minimum and maximum of the second derivative (\autoref{winklR}) of amplification curve data from a quantitative PCR experiment. For the determination of the angle, the origin is the maximum of the first derivative. From this origin, the vectors to the approximate minimum and maximum of the second derivative are determined. The vectors result from the relation of the maximum of the first derivative to the minimum of the second derivative and from the maximum of the first derivative to the maximum of the second derivative. In a simple trigonometric approach, the scalar product of the two vectors is formed first and divided by the product of their absolute values (norms); the arc cosine of this ratio yields the angle. The assumption is that flat curves (negative amplification reactions) have a large angle, while sigmoid curves (positive amplification reactions) have a smaller angle. Another assumption is that this angle is independent of the rotation of the amplification curve. This means that systematic off-sets, such as those caused by incorrect background correction, are of no consequence. The range of cycles to be analyzed is defined by the user. The output contains the angle and the coordinates of the minima and maxima.
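The underlying trigonometry can be sketched with a generic helper; the function name and the coordinates are hypothetical and not part of \texttt{PCRedux}.
```{r, eval=FALSE}
# Sketch (not run): angle between the two vectors that point from the
# maximum of the first derivative (origin) to the minimum (p1) and the
# maximum (p2) of the second derivative.
angle_between <- function(origin, p1, p2) {
  v1 <- p1 - origin
  v2 <- p2 - origin
  acos(sum(v1 * v2) / (sqrt(sum(v1^2)) * sqrt(sum(v2^2)))) * 180 / pi
}
# Hypothetical (cycle, derivative value) coordinates for illustration
angle_between(origin = c(20, 1), p1 = c(17, 0.5), p2 = c(23, -0.5))
```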
```{r, echo=FALSE, fig.cap=winklR_principle, fig.scap=winklR_principle_short, fig.height=4, fig.width=9}
par(mfrow = c(1,2))
for(i in 1:2){
columns <- c(2,3) + 2
class <- c("positive", "negative")
res <- winklR(RAS002[, 1], RAS002[, columns[i]], preprocess = TRUE)
y_lim_range <- range(c(RAS002[, columns[i]], res$origin[2], res[["p1"]][2], res[["p2"]][2]))
plot(RAS002[, 1], RAS002[, columns[i]], type = "l", ylim = y_lim_range,
xlab = "Cycles", ylab = "RFU")
mtext(paste0(LETTERS[i], " ", class[i]), cex = 1, side = 3,
adj = 0, font = 2)
points(res$origin, col = "red", pch = 19)
points(res[["p1"]], col = "green", pch = 19)
points(res[["p2"]], col = "blue", pch = 19)
arrows(res$origin[[1]], res$origin[[2]], res[["p1"]][[1]], res[["p1"]][[2]], length = .05, col = "green")
arrows(res$origin[[1]], res$origin[[2]], res[["p2"]][[1]], res[["p2"]][[2]], length = .05, col = "blue")
text(res$origin[1], res$origin[2] + 250, signif(res[["angle"]], 2))
}
```
```{r winklR, echo=FALSE, fig.cap=winklR, fig.scap=winklR_short, fig.height=4, fig.width=9}
# Calculate the central angles for amplification curves from the RAS002 data set.
library(PCRedux)
# Load the amplification curves from the RAS002 data set.
DATA <- PCRedux::RAS002
# Load the RAS002_decisions data set.
dec <- RAS002_decisions