<!DOCTYPE html>
<html lang="" xml:lang="">
<head>
<meta charset="utf-8" />
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<title>Chapter 14 Two key concepts: causality and non-stationarity | Machine Learning for Factor Investing</title>
<meta name="description" content="Chapter 14 Two key concepts: causality and non-stationarity | Machine Learning for Factor Investing" />
<meta name="generator" content="bookdown 0.21 and GitBook 2.6.7" />
<meta property="og:title" content="Chapter 14 Two key concepts: causality and non-stationarity | Machine Learning for Factor Investing" />
<meta property="og:type" content="book" />
<meta name="twitter:card" content="summary" />
<meta name="twitter:title" content="Chapter 14 Two key concepts: causality and non-stationarity | Machine Learning for Factor Investing" />
<meta name="author" content="Guillaume Coqueret and Tony Guida" />
<meta name="date" content="2021-01-08" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="apple-mobile-web-app-capable" content="yes" />
<meta name="apple-mobile-web-app-status-bar-style" content="black" />
<link rel="prev" href="interp.html"/>
<link rel="next" href="unsup.html"/>
<script src="libs/jquery-2.2.3/jquery.min.js"></script>
<link href="libs/gitbook-2.6.7/css/style.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-table.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-bookdown.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-highlight.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-search.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-fontsettings.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-clipboard.css" rel="stylesheet" />
<script src="libs/accessible-code-block-0.0.1/empty-anchor.js"></script>
<link href="libs/anchor-sections-1.0/anchor-sections.css" rel="stylesheet" />
<script src="libs/anchor-sections-1.0/anchor-sections.js"></script>
<script src="libs/kePrint-0.0.1/kePrint.js"></script>
<link href="libs/lightable-0.0.1/lightable.css" rel="stylesheet" />
<style type="text/css">
code.sourceCode > span { display: inline-block; line-height: 1.25; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode { white-space: pre; position: relative; }
pre.sourceCode { margin: 0; }
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
code.sourceCode { white-space: pre-wrap; }
code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
}
pre.numberSource code
{ counter-reset: source-line 0; }
pre.numberSource code > span
{ position: relative; left: -4em; counter-increment: source-line; }
pre.numberSource code > span > a:first-child::before
{ content: counter(source-line);
position: relative; left: -1em; text-align: right; vertical-align: baseline;
border: none; display: inline-block;
-webkit-touch-callout: none; -webkit-user-select: none;
-khtml-user-select: none; -moz-user-select: none;
-ms-user-select: none; user-select: none;
padding: 0 4px; width: 4em;
color: #aaaaaa;
}
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; }
div.sourceCode
{ }
@media screen {
code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
code span.al { color: #ff0000; font-weight: bold; } /* Alert */
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
code span.at { color: #7d9029; } /* Attribute */
code span.bn { color: #40a070; } /* BaseN */
code span.bu { } /* BuiltIn */
code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
code span.ch { color: #4070a0; } /* Char */
code span.cn { color: #880000; } /* Constant */
code span.co { color: #60a0b0; font-style: italic; } /* Comment */
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
code span.do { color: #ba2121; font-style: italic; } /* Documentation */
code span.dt { color: #902000; } /* DataType */
code span.dv { color: #40a070; } /* DecVal */
code span.er { color: #ff0000; font-weight: bold; } /* Error */
code span.ex { } /* Extension */
code span.fl { color: #40a070; } /* Float */
code span.fu { color: #06287e; } /* Function */
code span.im { } /* Import */
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
code span.kw { color: #007020; font-weight: bold; } /* Keyword */
code span.op { color: #666666; } /* Operator */
code span.ot { color: #007020; } /* Other */
code span.pp { color: #bc7a00; } /* Preprocessor */
code span.sc { color: #4070a0; } /* SpecialChar */
code span.ss { color: #bb6688; } /* SpecialString */
code span.st { color: #4070a0; } /* String */
code span.va { color: #19177c; } /* Variable */
code span.vs { color: #4070a0; } /* VerbatimString */
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
</style>
</head>
<body>
<div class="book without-animation with-summary font-size-2 font-family-1" data-basepath=".">
<div class="book-summary">
<nav role="navigation">
<ul class="summary">
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html"><i class="fa fa-check"></i>Preface</a><ul>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#what-this-book-is-not-about"><i class="fa fa-check"></i>What this book is not about</a></li>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#the-targeted-audience"><i class="fa fa-check"></i>The targeted audience</a></li>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#how-this-book-is-structured"><i class="fa fa-check"></i>How this book is structured</a></li>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#companion-website"><i class="fa fa-check"></i>Companion website</a></li>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#why-r"><i class="fa fa-check"></i>Why R?</a></li>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#coding-instructions"><i class="fa fa-check"></i>Coding instructions</a></li>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#acknowledgments"><i class="fa fa-check"></i>Acknowledgments</a></li>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#future-developments"><i class="fa fa-check"></i>Future developments</a></li>
</ul></li>
<li class="part"><span><b>I Introduction</b></span></li>
<li class="chapter" data-level="1" data-path="notdata.html"><a href="notdata.html"><i class="fa fa-check"></i><b>1</b> Notations and data</a><ul>
<li class="chapter" data-level="1.1" data-path="notdata.html"><a href="notdata.html#notations"><i class="fa fa-check"></i><b>1.1</b> Notations</a></li>
<li class="chapter" data-level="1.2" data-path="notdata.html"><a href="notdata.html#dataset"><i class="fa fa-check"></i><b>1.2</b> Dataset</a></li>
</ul></li>
<li class="chapter" data-level="2" data-path="intro.html"><a href="intro.html"><i class="fa fa-check"></i><b>2</b> Introduction</a><ul>
<li class="chapter" data-level="2.1" data-path="intro.html"><a href="intro.html#context"><i class="fa fa-check"></i><b>2.1</b> Context</a></li>
<li class="chapter" data-level="2.2" data-path="intro.html"><a href="intro.html#portfolio-construction-the-workflow"><i class="fa fa-check"></i><b>2.2</b> Portfolio construction: the workflow</a></li>
<li class="chapter" data-level="2.3" data-path="intro.html"><a href="intro.html#machine-learning-is-no-magic-wand"><i class="fa fa-check"></i><b>2.3</b> Machine learning is no magic wand</a></li>
</ul></li>
<li class="chapter" data-level="3" data-path="factor.html"><a href="factor.html"><i class="fa fa-check"></i><b>3</b> Factor investing and asset pricing anomalies</a><ul>
<li class="chapter" data-level="3.1" data-path="factor.html"><a href="factor.html#introduction"><i class="fa fa-check"></i><b>3.1</b> Introduction</a></li>
<li class="chapter" data-level="3.2" data-path="factor.html"><a href="factor.html#detecting-anomalies"><i class="fa fa-check"></i><b>3.2</b> Detecting anomalies</a><ul>
<li class="chapter" data-level="3.2.1" data-path="factor.html"><a href="factor.html#challenges"><i class="fa fa-check"></i><b>3.2.1</b> Challenges</a></li>
<li class="chapter" data-level="3.2.2" data-path="factor.html"><a href="factor.html#simple-portfolio-sorts"><i class="fa fa-check"></i><b>3.2.2</b> Simple portfolio sorts </a></li>
<li class="chapter" data-level="3.2.3" data-path="factor.html"><a href="factor.html#factors"><i class="fa fa-check"></i><b>3.2.3</b> Factors</a></li>
<li class="chapter" data-level="3.2.4" data-path="factor.html"><a href="factor.html#predictive-regressions-sorts-and-p-value-issues"><i class="fa fa-check"></i><b>3.2.4</b> Predictive regressions, sorts, and p-value issues</a></li>
<li class="chapter" data-level="3.2.5" data-path="factor.html"><a href="factor.html#fama-macbeth-regressions"><i class="fa fa-check"></i><b>3.2.5</b> Fama-Macbeth regressions</a></li>
<li class="chapter" data-level="3.2.6" data-path="factor.html"><a href="factor.html#factor-competition"><i class="fa fa-check"></i><b>3.2.6</b> Factor competition</a></li>
<li class="chapter" data-level="3.2.7" data-path="factor.html"><a href="factor.html#advanced-techniques"><i class="fa fa-check"></i><b>3.2.7</b> Advanced techniques</a></li>
</ul></li>
<li class="chapter" data-level="3.3" data-path="factor.html"><a href="factor.html#factors-or-characteristics"><i class="fa fa-check"></i><b>3.3</b> Factors or characteristics?</a></li>
<li class="chapter" data-level="3.4" data-path="factor.html"><a href="factor.html#hot-topics-momentum-timing-and-esg"><i class="fa fa-check"></i><b>3.4</b> Hot topics: momentum, timing and ESG</a><ul>
<li class="chapter" data-level="3.4.1" data-path="factor.html"><a href="factor.html#factor-momentum"><i class="fa fa-check"></i><b>3.4.1</b> Factor momentum</a></li>
<li class="chapter" data-level="3.4.2" data-path="factor.html"><a href="factor.html#factor-timing"><i class="fa fa-check"></i><b>3.4.2</b> Factor timing</a></li>
<li class="chapter" data-level="3.4.3" data-path="factor.html"><a href="factor.html#the-green-factors"><i class="fa fa-check"></i><b>3.4.3</b> The green factors</a></li>
</ul></li>
<li class="chapter" data-level="3.5" data-path="factor.html"><a href="factor.html#the-links-with-machine-learning"><i class="fa fa-check"></i><b>3.5</b> The links with machine learning</a><ul>
<li class="chapter" data-level="3.5.1" data-path="factor.html"><a href="factor.html#a-short-list-of-recent-references"><i class="fa fa-check"></i><b>3.5.1</b> A short list of recent references</a></li>
<li class="chapter" data-level="3.5.2" data-path="factor.html"><a href="factor.html#explicit-connections-with-asset-pricing-models"><i class="fa fa-check"></i><b>3.5.2</b> Explicit connections with asset pricing models</a></li>
</ul></li>
<li class="chapter" data-level="3.6" data-path="factor.html"><a href="factor.html#coding-exercises"><i class="fa fa-check"></i><b>3.6</b> Coding exercises</a></li>
</ul></li>
<li class="chapter" data-level="4" data-path="Data.html"><a href="Data.html"><i class="fa fa-check"></i><b>4</b> Data preprocessing</a><ul>
<li class="chapter" data-level="4.1" data-path="Data.html"><a href="Data.html#know-your-data"><i class="fa fa-check"></i><b>4.1</b> Know your data</a></li>
<li class="chapter" data-level="4.2" data-path="Data.html"><a href="Data.html#missing-data"><i class="fa fa-check"></i><b>4.2</b> Missing data</a></li>
<li class="chapter" data-level="4.3" data-path="Data.html"><a href="Data.html#outlier-detection"><i class="fa fa-check"></i><b>4.3</b> Outlier detection</a></li>
<li class="chapter" data-level="4.4" data-path="Data.html"><a href="Data.html#feateng"><i class="fa fa-check"></i><b>4.4</b> Feature engineering</a><ul>
<li class="chapter" data-level="4.4.1" data-path="Data.html"><a href="Data.html#feature-selection"><i class="fa fa-check"></i><b>4.4.1</b> Feature selection</a></li>
<li class="chapter" data-level="4.4.2" data-path="Data.html"><a href="Data.html#scaling"><i class="fa fa-check"></i><b>4.4.2</b> Scaling the predictors</a></li>
</ul></li>
<li class="chapter" data-level="4.5" data-path="Data.html"><a href="Data.html#labelling"><i class="fa fa-check"></i><b>4.5</b> Labelling</a><ul>
<li class="chapter" data-level="4.5.1" data-path="Data.html"><a href="Data.html#simple-labels"><i class="fa fa-check"></i><b>4.5.1</b> Simple labels</a></li>
<li class="chapter" data-level="4.5.2" data-path="Data.html"><a href="Data.html#categorical-labels"><i class="fa fa-check"></i><b>4.5.2</b> Categorical labels</a></li>
<li class="chapter" data-level="4.5.3" data-path="Data.html"><a href="Data.html#the-triple-barrier-method"><i class="fa fa-check"></i><b>4.5.3</b> The triple barrier method</a></li>
<li class="chapter" data-level="4.5.4" data-path="Data.html"><a href="Data.html#filtering-the-sample"><i class="fa fa-check"></i><b>4.5.4</b> Filtering the sample</a></li>
<li class="chapter" data-level="4.5.5" data-path="Data.html"><a href="Data.html#horizons"><i class="fa fa-check"></i><b>4.5.5</b> Return horizons</a></li>
</ul></li>
<li class="chapter" data-level="4.6" data-path="Data.html"><a href="Data.html#pers"><i class="fa fa-check"></i><b>4.6</b> Handling persistence</a></li>
<li class="chapter" data-level="4.7" data-path="Data.html"><a href="Data.html#extensions"><i class="fa fa-check"></i><b>4.7</b> Extensions</a><ul>
<li class="chapter" data-level="4.7.1" data-path="Data.html"><a href="Data.html#transforming-features"><i class="fa fa-check"></i><b>4.7.1</b> Transforming features</a></li>
<li class="chapter" data-level="4.7.2" data-path="Data.html"><a href="Data.html#macrovar"><i class="fa fa-check"></i><b>4.7.2</b> Macro-economic variables</a></li>
<li class="chapter" data-level="4.7.3" data-path="Data.html"><a href="Data.html#active-learning"><i class="fa fa-check"></i><b>4.7.3</b> Active learning</a></li>
</ul></li>
<li class="chapter" data-level="4.8" data-path="Data.html"><a href="Data.html#additional-code-and-results"><i class="fa fa-check"></i><b>4.8</b> Additional code and results</a><ul>
<li class="chapter" data-level="4.8.1" data-path="Data.html"><a href="Data.html#impact-of-rescaling-graphical-representation"><i class="fa fa-check"></i><b>4.8.1</b> Impact of rescaling: graphical representation</a></li>
<li class="chapter" data-level="4.8.2" data-path="Data.html"><a href="Data.html#impact-of-rescaling-toy-example"><i class="fa fa-check"></i><b>4.8.2</b> Impact of rescaling: toy example</a></li>
</ul></li>
<li class="chapter" data-level="4.9" data-path="Data.html"><a href="Data.html#coding-exercises-1"><i class="fa fa-check"></i><b>4.9</b> Coding exercises</a></li>
</ul></li>
<li class="part"><span><b>II Common supervised algorithms</b></span></li>
<li class="chapter" data-level="5" data-path="lasso.html"><a href="lasso.html"><i class="fa fa-check"></i><b>5</b> Penalized regressions and sparse hedging for minimum variance portfolios</a><ul>
<li class="chapter" data-level="5.1" data-path="lasso.html"><a href="lasso.html#penalized-regressions"><i class="fa fa-check"></i><b>5.1</b> Penalized regressions</a><ul>
<li class="chapter" data-level="5.1.1" data-path="lasso.html"><a href="lasso.html#penreg"><i class="fa fa-check"></i><b>5.1.1</b> Simple regressions</a></li>
<li class="chapter" data-level="5.1.2" data-path="lasso.html"><a href="lasso.html#forms-of-penalizations"><i class="fa fa-check"></i><b>5.1.2</b> Forms of penalizations</a></li>
<li class="chapter" data-level="5.1.3" data-path="lasso.html"><a href="lasso.html#illustrations"><i class="fa fa-check"></i><b>5.1.3</b> Illustrations</a></li>
</ul></li>
<li class="chapter" data-level="5.2" data-path="lasso.html"><a href="lasso.html#sparse-hedging-for-minimum-variance-portfolios"><i class="fa fa-check"></i><b>5.2</b> Sparse hedging for minimum variance portfolios</a><ul>
<li class="chapter" data-level="5.2.1" data-path="lasso.html"><a href="lasso.html#presentation-and-derivations"><i class="fa fa-check"></i><b>5.2.1</b> Presentation and derivations</a></li>
<li class="chapter" data-level="5.2.2" data-path="lasso.html"><a href="lasso.html#sparseex"><i class="fa fa-check"></i><b>5.2.2</b> Example</a></li>
</ul></li>
<li class="chapter" data-level="5.3" data-path="lasso.html"><a href="lasso.html#predictive-regressions"><i class="fa fa-check"></i><b>5.3</b> Predictive regressions</a><ul>
<li class="chapter" data-level="5.3.1" data-path="lasso.html"><a href="lasso.html#literature-review-and-principle"><i class="fa fa-check"></i><b>5.3.1</b> Literature review and principle</a></li>
<li class="chapter" data-level="5.3.2" data-path="lasso.html"><a href="lasso.html#code-and-results"><i class="fa fa-check"></i><b>5.3.2</b> Code and results</a></li>
</ul></li>
<li class="chapter" data-level="5.4" data-path="lasso.html"><a href="lasso.html#coding-exercise"><i class="fa fa-check"></i><b>5.4</b> Coding exercise</a></li>
</ul></li>
<li class="chapter" data-level="6" data-path="trees.html"><a href="trees.html"><i class="fa fa-check"></i><b>6</b> Tree-based methods</a><ul>
<li class="chapter" data-level="6.1" data-path="trees.html"><a href="trees.html#simple-trees"><i class="fa fa-check"></i><b>6.1</b> Simple trees</a><ul>
<li class="chapter" data-level="6.1.1" data-path="trees.html"><a href="trees.html#principle"><i class="fa fa-check"></i><b>6.1.1</b> Principle</a></li>
<li class="chapter" data-level="6.1.2" data-path="trees.html"><a href="trees.html#treeclass"><i class="fa fa-check"></i><b>6.1.2</b> Further details on classification</a></li>
<li class="chapter" data-level="6.1.3" data-path="trees.html"><a href="trees.html#pruning-criteria"><i class="fa fa-check"></i><b>6.1.3</b> Pruning criteria</a></li>
<li class="chapter" data-level="6.1.4" data-path="trees.html"><a href="trees.html#code-and-interpretation"><i class="fa fa-check"></i><b>6.1.4</b> Code and interpretation</a></li>
</ul></li>
<li class="chapter" data-level="6.2" data-path="trees.html"><a href="trees.html#random-forests"><i class="fa fa-check"></i><b>6.2</b> Random forests</a><ul>
<li class="chapter" data-level="6.2.1" data-path="trees.html"><a href="trees.html#principle-1"><i class="fa fa-check"></i><b>6.2.1</b> Principle</a></li>
<li class="chapter" data-level="6.2.2" data-path="trees.html"><a href="trees.html#code-and-results-1"><i class="fa fa-check"></i><b>6.2.2</b> Code and results</a></li>
</ul></li>
<li class="chapter" data-level="6.3" data-path="trees.html"><a href="trees.html#adaboost"><i class="fa fa-check"></i><b>6.3</b> Boosted trees: Adaboost</a><ul>
<li class="chapter" data-level="6.3.1" data-path="trees.html"><a href="trees.html#methodology"><i class="fa fa-check"></i><b>6.3.1</b> Methodology</a></li>
<li class="chapter" data-level="6.3.2" data-path="trees.html"><a href="trees.html#illustration"><i class="fa fa-check"></i><b>6.3.2</b> Illustration</a></li>
</ul></li>
<li class="chapter" data-level="6.4" data-path="trees.html"><a href="trees.html#boosted-trees-extreme-gradient-boosting"><i class="fa fa-check"></i><b>6.4</b> Boosted trees: extreme gradient boosting</a><ul>
<li class="chapter" data-level="6.4.1" data-path="trees.html"><a href="trees.html#managing-loss"><i class="fa fa-check"></i><b>6.4.1</b> Managing loss</a></li>
<li class="chapter" data-level="6.4.2" data-path="trees.html"><a href="trees.html#penalization"><i class="fa fa-check"></i><b>6.4.2</b> Penalization</a></li>
<li class="chapter" data-level="6.4.3" data-path="trees.html"><a href="trees.html#aggregation"><i class="fa fa-check"></i><b>6.4.3</b> Aggregation</a></li>
<li class="chapter" data-level="6.4.4" data-path="trees.html"><a href="trees.html#tree-structure"><i class="fa fa-check"></i><b>6.4.4</b> Tree structure</a></li>
<li class="chapter" data-level="6.4.5" data-path="trees.html"><a href="trees.html#boostext"><i class="fa fa-check"></i><b>6.4.5</b> Extensions</a></li>
<li class="chapter" data-level="6.4.6" data-path="trees.html"><a href="trees.html#boostcode"><i class="fa fa-check"></i><b>6.4.6</b> Code and results</a></li>
<li class="chapter" data-level="6.4.7" data-path="trees.html"><a href="trees.html#instweight"><i class="fa fa-check"></i><b>6.4.7</b> Instance weighting</a></li>
</ul></li>
<li class="chapter" data-level="6.5" data-path="trees.html"><a href="trees.html#discussion"><i class="fa fa-check"></i><b>6.5</b> Discussion</a></li>
<li class="chapter" data-level="6.6" data-path="trees.html"><a href="trees.html#coding-exercises-2"><i class="fa fa-check"></i><b>6.6</b> Coding exercises</a></li>
</ul></li>
<li class="chapter" data-level="7" data-path="NN.html"><a href="NN.html"><i class="fa fa-check"></i><b>7</b> Neural networks</a><ul>
<li class="chapter" data-level="7.1" data-path="NN.html"><a href="NN.html#the-original-perceptron"><i class="fa fa-check"></i><b>7.1</b> The original perceptron</a></li>
<li class="chapter" data-level="7.2" data-path="NN.html"><a href="NN.html#multilayer-perceptron"><i class="fa fa-check"></i><b>7.2</b> Multilayer perceptron</a><ul>
<li class="chapter" data-level="7.2.1" data-path="NN.html"><a href="NN.html#introduction-and-notations"><i class="fa fa-check"></i><b>7.2.1</b> Introduction and notations</a></li>
<li class="chapter" data-level="7.2.2" data-path="NN.html"><a href="NN.html#universal-approximation"><i class="fa fa-check"></i><b>7.2.2</b> Universal approximation</a></li>
<li class="chapter" data-level="7.2.3" data-path="NN.html"><a href="NN.html#backprop"><i class="fa fa-check"></i><b>7.2.3</b> Learning via back-propagation</a></li>
<li class="chapter" data-level="7.2.4" data-path="NN.html"><a href="NN.html#further-details-on-classification"><i class="fa fa-check"></i><b>7.2.4</b> Further details on classification</a></li>
</ul></li>
<li class="chapter" data-level="7.3" data-path="NN.html"><a href="NN.html#howdeep"><i class="fa fa-check"></i><b>7.3</b> How deep we should go and other practical issues</a><ul>
<li class="chapter" data-level="7.3.1" data-path="NN.html"><a href="NN.html#architectural-choices"><i class="fa fa-check"></i><b>7.3.1</b> Architectural choices</a></li>
<li class="chapter" data-level="7.3.2" data-path="NN.html"><a href="NN.html#frequency-of-weight-updates-and-learning-duration"><i class="fa fa-check"></i><b>7.3.2</b> Frequency of weight updates and learning duration</a></li>
<li class="chapter" data-level="7.3.3" data-path="NN.html"><a href="NN.html#penalizations-and-dropout"><i class="fa fa-check"></i><b>7.3.3</b> Penalizations and dropout</a></li>
</ul></li>
<li class="chapter" data-level="7.4" data-path="NN.html"><a href="NN.html#code-samples-and-comments-for-vanilla-mlp"><i class="fa fa-check"></i><b>7.4</b> Code samples and comments for vanilla MLP</a><ul>
<li class="chapter" data-level="7.4.1" data-path="NN.html"><a href="NN.html#regression-example"><i class="fa fa-check"></i><b>7.4.1</b> Regression example</a></li>
<li class="chapter" data-level="7.4.2" data-path="NN.html"><a href="NN.html#classification-example"><i class="fa fa-check"></i><b>7.4.2</b> Classification example</a></li>
<li class="chapter" data-level="7.4.3" data-path="NN.html"><a href="NN.html#custloss"><i class="fa fa-check"></i><b>7.4.3</b> Custom losses</a></li>
</ul></li>
<li class="chapter" data-level="7.5" data-path="NN.html"><a href="NN.html#recurrent-networks"><i class="fa fa-check"></i><b>7.5</b> Recurrent networks</a><ul>
<li class="chapter" data-level="7.5.1" data-path="NN.html"><a href="NN.html#presentation"><i class="fa fa-check"></i><b>7.5.1</b> Presentation</a></li>
<li class="chapter" data-level="7.5.2" data-path="NN.html"><a href="NN.html#code-and-results-2"><i class="fa fa-check"></i><b>7.5.2</b> Code and results</a></li>
</ul></li>
<li class="chapter" data-level="7.6" data-path="NN.html"><a href="NN.html#other-common-architectures"><i class="fa fa-check"></i><b>7.6</b> Other common architectures</a><ul>
<li class="chapter" data-level="7.6.1" data-path="NN.html"><a href="NN.html#generative-aversarial-networks"><i class="fa fa-check"></i><b>7.6.1</b> Generative adversarial networks</a></li>
<li class="chapter" data-level="7.6.2" data-path="NN.html"><a href="NN.html#autoencoders"><i class="fa fa-check"></i><b>7.6.2</b> Autoencoders</a></li>
<li class="chapter" data-level="7.6.3" data-path="NN.html"><a href="NN.html#a-word-on-convolutional-networks"><i class="fa fa-check"></i><b>7.6.3</b> A word on convolutional networks</a></li>
<li class="chapter" data-level="7.6.4" data-path="NN.html"><a href="NN.html#advanced-architectures"><i class="fa fa-check"></i><b>7.6.4</b> Advanced architectures</a></li>
</ul></li>
<li class="chapter" data-level="7.7" data-path="NN.html"><a href="NN.html#coding-exercise-1"><i class="fa fa-check"></i><b>7.7</b> Coding exercise</a></li>
</ul></li>
<li class="chapter" data-level="8" data-path="svm.html"><a href="svm.html"><i class="fa fa-check"></i><b>8</b> Support vector machines</a><ul>
<li class="chapter" data-level="8.1" data-path="svm.html"><a href="svm.html#svm-for-classification"><i class="fa fa-check"></i><b>8.1</b> SVM for classification</a></li>
<li class="chapter" data-level="8.2" data-path="svm.html"><a href="svm.html#svm-for-regression"><i class="fa fa-check"></i><b>8.2</b> SVM for regression</a></li>
<li class="chapter" data-level="8.3" data-path="svm.html"><a href="svm.html#practice"><i class="fa fa-check"></i><b>8.3</b> Practice</a></li>
<li class="chapter" data-level="8.4" data-path="svm.html"><a href="svm.html#coding-exercises-3"><i class="fa fa-check"></i><b>8.4</b> Coding exercises</a></li>
</ul></li>
<li class="chapter" data-level="9" data-path="bayes.html"><a href="bayes.html"><i class="fa fa-check"></i><b>9</b> Bayesian methods</a><ul>
<li class="chapter" data-level="9.1" data-path="bayes.html"><a href="bayes.html#the-bayesian-framework"><i class="fa fa-check"></i><b>9.1</b> The Bayesian framework</a></li>
<li class="chapter" data-level="9.2" data-path="bayes.html"><a href="bayes.html#bayesian-sampling"><i class="fa fa-check"></i><b>9.2</b> Bayesian sampling</a><ul>
<li class="chapter" data-level="9.2.1" data-path="bayes.html"><a href="bayes.html#gibbs-sampling"><i class="fa fa-check"></i><b>9.2.1</b> Gibbs sampling</a></li>
<li class="chapter" data-level="9.2.2" data-path="bayes.html"><a href="bayes.html#metropolis-hastings-sampling"><i class="fa fa-check"></i><b>9.2.2</b> Metropolis-Hastings sampling</a></li>
</ul></li>
<li class="chapter" data-level="9.3" data-path="bayes.html"><a href="bayes.html#bayesian-linear-regression"><i class="fa fa-check"></i><b>9.3</b> Bayesian linear regression</a></li>
<li class="chapter" data-level="9.4" data-path="bayes.html"><a href="bayes.html#naive-bayes-classifier"><i class="fa fa-check"></i><b>9.4</b> Naive Bayes classifier</a></li>
<li class="chapter" data-level="9.5" data-path="bayes.html"><a href="bayes.html#BART"><i class="fa fa-check"></i><b>9.5</b> Bayesian additive trees</a><ul>
<li class="chapter" data-level="9.5.1" data-path="bayes.html"><a href="bayes.html#general-formulation"><i class="fa fa-check"></i><b>9.5.1</b> General formulation</a></li>
<li class="chapter" data-level="9.5.2" data-path="bayes.html"><a href="bayes.html#priors"><i class="fa fa-check"></i><b>9.5.2</b> Priors</a></li>
<li class="chapter" data-level="9.5.3" data-path="bayes.html"><a href="bayes.html#sampling-and-predictions"><i class="fa fa-check"></i><b>9.5.3</b> Sampling and predictions</a></li>
<li class="chapter" data-level="9.5.4" data-path="bayes.html"><a href="bayes.html#code"><i class="fa fa-check"></i><b>9.5.4</b> Code</a></li>
</ul></li>
</ul></li>
<li class="part"><span><b>III From predictions to portfolios</b></span></li>
<li class="chapter" data-level="10" data-path="valtune.html"><a href="valtune.html"><i class="fa fa-check"></i><b>10</b> Validating and tuning</a><ul>
<li class="chapter" data-level="10.1" data-path="valtune.html"><a href="valtune.html#mlmetrics"><i class="fa fa-check"></i><b>10.1</b> Learning metrics</a><ul>
<li class="chapter" data-level="10.1.1" data-path="valtune.html"><a href="valtune.html#regression-analysis"><i class="fa fa-check"></i><b>10.1.1</b> Regression analysis</a></li>
<li class="chapter" data-level="10.1.2" data-path="valtune.html"><a href="valtune.html#classification-analysis"><i class="fa fa-check"></i><b>10.1.2</b> Classification analysis</a></li>
</ul></li>
<li class="chapter" data-level="10.2" data-path="valtune.html"><a href="valtune.html#validation"><i class="fa fa-check"></i><b>10.2</b> Validation</a><ul>
<li class="chapter" data-level="10.2.1" data-path="valtune.html"><a href="valtune.html#the-variance-bias-tradeoff-theory"><i class="fa fa-check"></i><b>10.2.1</b> The variance-bias tradeoff: theory</a></li>
<li class="chapter" data-level="10.2.2" data-path="valtune.html"><a href="valtune.html#the-variance-bias-tradeoff-illustration"><i class="fa fa-check"></i><b>10.2.2</b> The variance-bias tradeoff: illustration</a></li>
<li class="chapter" data-level="10.2.3" data-path="valtune.html"><a href="valtune.html#the-risk-of-overfitting-principle"><i class="fa fa-check"></i><b>10.2.3</b> The risk of overfitting: principle</a></li>
<li class="chapter" data-level="10.2.4" data-path="valtune.html"><a href="valtune.html#the-risk-of-overfitting-some-solutions"><i class="fa fa-check"></i><b>10.2.4</b> The risk of overfitting: some solutions</a></li>
</ul></li>
<li class="chapter" data-level="10.3" data-path="valtune.html"><a href="valtune.html#the-search-for-good-hyperparameters"><i class="fa fa-check"></i><b>10.3</b> The search for good hyperparameters</a><ul>
<li class="chapter" data-level="10.3.1" data-path="valtune.html"><a href="valtune.html#methods"><i class="fa fa-check"></i><b>10.3.1</b> Methods</a></li>
<li class="chapter" data-level="10.3.2" data-path="valtune.html"><a href="valtune.html#example-grid-search"><i class="fa fa-check"></i><b>10.3.2</b> Example: grid search</a></li>
<li class="chapter" data-level="10.3.3" data-path="valtune.html"><a href="valtune.html#example-bayesian-optimization"><i class="fa fa-check"></i><b>10.3.3</b> Example: Bayesian optimization</a></li>
</ul></li>
<li class="chapter" data-level="10.4" data-path="valtune.html"><a href="valtune.html#short-discussion-on-validation-in-backtests"><i class="fa fa-check"></i><b>10.4</b> Short discussion on validation in backtests</a></li>
</ul></li>
<li class="chapter" data-level="11" data-path="ensemble.html"><a href="ensemble.html"><i class="fa fa-check"></i><b>11</b> Ensemble models</a><ul>
<li class="chapter" data-level="11.1" data-path="ensemble.html"><a href="ensemble.html#linear-ensembles"><i class="fa fa-check"></i><b>11.1</b> Linear ensembles</a><ul>
<li class="chapter" data-level="11.1.1" data-path="ensemble.html"><a href="ensemble.html#principles"><i class="fa fa-check"></i><b>11.1.1</b> Principles</a></li>
<li class="chapter" data-level="11.1.2" data-path="ensemble.html"><a href="ensemble.html#example"><i class="fa fa-check"></i><b>11.1.2</b> Example</a></li>
</ul></li>
<li class="chapter" data-level="11.2" data-path="ensemble.html"><a href="ensemble.html#stacked-ensembles"><i class="fa fa-check"></i><b>11.2</b> Stacked ensembles</a><ul>
<li class="chapter" data-level="11.2.1" data-path="ensemble.html"><a href="ensemble.html#two-stage-training"><i class="fa fa-check"></i><b>11.2.1</b> Two-stage training</a></li>
<li class="chapter" data-level="11.2.2" data-path="ensemble.html"><a href="ensemble.html#code-and-results-3"><i class="fa fa-check"></i><b>11.2.2</b> Code and results</a></li>
</ul></li>
<li class="chapter" data-level="11.3" data-path="ensemble.html"><a href="ensemble.html#extensions-1"><i class="fa fa-check"></i><b>11.3</b> Extensions</a><ul>
<li class="chapter" data-level="11.3.1" data-path="ensemble.html"><a href="ensemble.html#exogenous-variables"><i class="fa fa-check"></i><b>11.3.1</b> Exogenous variables</a></li>
<li class="chapter" data-level="11.3.2" data-path="ensemble.html"><a href="ensemble.html#shrinking-inter-model-correlations"><i class="fa fa-check"></i><b>11.3.2</b> Shrinking inter-model correlations</a></li>
</ul></li>
<li class="chapter" data-level="11.4" data-path="ensemble.html"><a href="ensemble.html#exercise"><i class="fa fa-check"></i><b>11.4</b> Exercise</a></li>
</ul></li>
<li class="chapter" data-level="12" data-path="backtest.html"><a href="backtest.html"><i class="fa fa-check"></i><b>12</b> Portfolio backtesting</a><ul>
<li class="chapter" data-level="12.1" data-path="backtest.html"><a href="backtest.html#protocol"><i class="fa fa-check"></i><b>12.1</b> Setting the protocol</a></li>
<li class="chapter" data-level="12.2" data-path="backtest.html"><a href="backtest.html#turning-signals-into-portfolio-weights"><i class="fa fa-check"></i><b>12.2</b> Turning signals into portfolio weights</a></li>
<li class="chapter" data-level="12.3" data-path="backtest.html"><a href="backtest.html#perfmet"><i class="fa fa-check"></i><b>12.3</b> Performance metrics</a><ul>
<li class="chapter" data-level="12.3.1" data-path="backtest.html"><a href="backtest.html#discussion-1"><i class="fa fa-check"></i><b>12.3.1</b> Discussion</a></li>
<li class="chapter" data-level="12.3.2" data-path="backtest.html"><a href="backtest.html#pure-performance-and-risk-indicators"><i class="fa fa-check"></i><b>12.3.2</b> Pure performance and risk indicators</a></li>
<li class="chapter" data-level="12.3.3" data-path="backtest.html"><a href="backtest.html#factor-based-evaluation"><i class="fa fa-check"></i><b>12.3.3</b> Factor-based evaluation</a></li>
<li class="chapter" data-level="12.3.4" data-path="backtest.html"><a href="backtest.html#risk-adjusted-measures"><i class="fa fa-check"></i><b>12.3.4</b> Risk-adjusted measures</a></li>
<li class="chapter" data-level="12.3.5" data-path="backtest.html"><a href="backtest.html#transaction-costs-and-turnover"><i class="fa fa-check"></i><b>12.3.5</b> Transaction costs and turnover</a></li>
</ul></li>
<li class="chapter" data-level="12.4" data-path="backtest.html"><a href="backtest.html#common-errors-and-issues"><i class="fa fa-check"></i><b>12.4</b> Common errors and issues</a><ul>
<li class="chapter" data-level="12.4.1" data-path="backtest.html"><a href="backtest.html#forward-looking-data"><i class="fa fa-check"></i><b>12.4.1</b> Forward looking data</a></li>
<li class="chapter" data-level="12.4.2" data-path="backtest.html"><a href="backtest.html#backov"><i class="fa fa-check"></i><b>12.4.2</b> Backtest overfitting</a></li>
<li class="chapter" data-level="12.4.3" data-path="backtest.html"><a href="backtest.html#simple-safeguards"><i class="fa fa-check"></i><b>12.4.3</b> Simple safeguards</a></li>
</ul></li>
<li class="chapter" data-level="12.5" data-path="backtest.html"><a href="backtest.html#implication-of-non-stationarity-forecasting-is-hard"><i class="fa fa-check"></i><b>12.5</b> Implication of non-stationarity: forecasting is hard</a><ul>
<li class="chapter" data-level="12.5.1" data-path="backtest.html"><a href="backtest.html#general-comments"><i class="fa fa-check"></i><b>12.5.1</b> General comments</a></li>
<li class="chapter" data-level="12.5.2" data-path="backtest.html"><a href="backtest.html#the-no-free-lunch-theorem"><i class="fa fa-check"></i><b>12.5.2</b> The no free lunch theorem</a></li>
</ul></li>
<li class="chapter" data-level="12.6" data-path="backtest.html"><a href="backtest.html#first-example-a-complete-backtest"><i class="fa fa-check"></i><b>12.6</b> First example: a complete backtest</a></li>
<li class="chapter" data-level="12.7" data-path="backtest.html"><a href="backtest.html#second-example-backtest-overfitting"><i class="fa fa-check"></i><b>12.7</b> Second example: backtest overfitting</a></li>
<li class="chapter" data-level="12.8" data-path="backtest.html"><a href="backtest.html#coding-exercises-4"><i class="fa fa-check"></i><b>12.8</b> Coding exercises</a></li>
</ul></li>
<li class="part"><span><b>IV Further important topics</b></span></li>
<li class="chapter" data-level="13" data-path="interp.html"><a href="interp.html"><i class="fa fa-check"></i><b>13</b> Interpretability</a><ul>
<li class="chapter" data-level="13.1" data-path="interp.html"><a href="interp.html#global-interpretations"><i class="fa fa-check"></i><b>13.1</b> Global interpretations</a><ul>
<li class="chapter" data-level="13.1.1" data-path="interp.html"><a href="interp.html#surr"><i class="fa fa-check"></i><b>13.1.1</b> Simple models as surrogates</a></li>
<li class="chapter" data-level="13.1.2" data-path="interp.html"><a href="interp.html#variable-importance"><i class="fa fa-check"></i><b>13.1.2</b> Variable importance (tree-based)</a></li>
<li class="chapter" data-level="13.1.3" data-path="interp.html"><a href="interp.html#variable-importance-agnostic"><i class="fa fa-check"></i><b>13.1.3</b> Variable importance (agnostic)</a></li>
<li class="chapter" data-level="13.1.4" data-path="interp.html"><a href="interp.html#partial-dependence-plot"><i class="fa fa-check"></i><b>13.1.4</b> Partial dependence plot</a></li>
</ul></li>
<li class="chapter" data-level="13.2" data-path="interp.html"><a href="interp.html#local-interpretations"><i class="fa fa-check"></i><b>13.2</b> Local interpretations</a><ul>
<li class="chapter" data-level="13.2.1" data-path="interp.html"><a href="interp.html#lime"><i class="fa fa-check"></i><b>13.2.1</b> LIME</a></li>
<li class="chapter" data-level="13.2.2" data-path="interp.html"><a href="interp.html#shapley-values"><i class="fa fa-check"></i><b>13.2.2</b> Shapley values</a></li>
<li class="chapter" data-level="13.2.3" data-path="interp.html"><a href="interp.html#breakdown"><i class="fa fa-check"></i><b>13.2.3</b> Breakdown</a></li>
</ul></li>
</ul></li>
<li class="chapter" data-level="14" data-path="causality.html"><a href="causality.html"><i class="fa fa-check"></i><b>14</b> Two key concepts: causality and non-stationarity</a><ul>
<li class="chapter" data-level="14.1" data-path="causality.html"><a href="causality.html#causality-1"><i class="fa fa-check"></i><b>14.1</b> Causality</a><ul>
<li class="chapter" data-level="14.1.1" data-path="causality.html"><a href="causality.html#granger"><i class="fa fa-check"></i><b>14.1.1</b> Granger causality</a></li>
<li class="chapter" data-level="14.1.2" data-path="causality.html"><a href="causality.html#causal-additive-models"><i class="fa fa-check"></i><b>14.1.2</b> Causal additive models</a></li>
<li class="chapter" data-level="14.1.3" data-path="causality.html"><a href="causality.html#structural-time-series-models"><i class="fa fa-check"></i><b>14.1.3</b> Structural time series models</a></li>
</ul></li>
<li class="chapter" data-level="14.2" data-path="causality.html"><a href="causality.html#nonstat"><i class="fa fa-check"></i><b>14.2</b> Dealing with changing environments</a><ul>
<li class="chapter" data-level="14.2.1" data-path="causality.html"><a href="causality.html#non-stationarity-yet-another-illustration"><i class="fa fa-check"></i><b>14.2.1</b> Non-stationarity: yet another illustration</a></li>
<li class="chapter" data-level="14.2.2" data-path="causality.html"><a href="causality.html#online-learning"><i class="fa fa-check"></i><b>14.2.2</b> Online learning</a></li>
<li class="chapter" data-level="14.2.3" data-path="causality.html"><a href="causality.html#homogeneous-transfer-learning"><i class="fa fa-check"></i><b>14.2.3</b> Homogeneous transfer learning</a></li>
</ul></li>
</ul></li>
<li class="chapter" data-level="15" data-path="unsup.html"><a href="unsup.html"><i class="fa fa-check"></i><b>15</b> Unsupervised learning</a><ul>
<li class="chapter" data-level="15.1" data-path="unsup.html"><a href="unsup.html#corpred"><i class="fa fa-check"></i><b>15.1</b> The problem with correlated predictors</a></li>
<li class="chapter" data-level="15.2" data-path="unsup.html"><a href="unsup.html#principal-component-analysis-and-autoencoders"><i class="fa fa-check"></i><b>15.2</b> Principal component analysis and autoencoders</a><ul>
<li class="chapter" data-level="15.2.1" data-path="unsup.html"><a href="unsup.html#a-bit-of-algebra"><i class="fa fa-check"></i><b>15.2.1</b> A bit of algebra</a></li>
<li class="chapter" data-level="15.2.2" data-path="unsup.html"><a href="unsup.html#pca"><i class="fa fa-check"></i><b>15.2.2</b> PCA</a></li>
<li class="chapter" data-level="15.2.3" data-path="unsup.html"><a href="unsup.html#ae"><i class="fa fa-check"></i><b>15.2.3</b> Autoencoders</a></li>
<li class="chapter" data-level="15.2.4" data-path="unsup.html"><a href="unsup.html#application"><i class="fa fa-check"></i><b>15.2.4</b> Application</a></li>
</ul></li>
<li class="chapter" data-level="15.3" data-path="unsup.html"><a href="unsup.html#clustering-via-k-means"><i class="fa fa-check"></i><b>15.3</b> Clustering via k-means</a></li>
<li class="chapter" data-level="15.4" data-path="unsup.html"><a href="unsup.html#nearest-neighbors"><i class="fa fa-check"></i><b>15.4</b> Nearest neighbors</a></li>
<li class="chapter" data-level="15.5" data-path="unsup.html"><a href="unsup.html#coding-exercise-2"><i class="fa fa-check"></i><b>15.5</b> Coding exercise</a></li>
</ul></li>
<li class="chapter" data-level="16" data-path="RL.html"><a href="RL.html"><i class="fa fa-check"></i><b>16</b> Reinforcement learning</a><ul>
<li class="chapter" data-level="16.1" data-path="RL.html"><a href="RL.html#theoretical-layout"><i class="fa fa-check"></i><b>16.1</b> Theoretical layout</a><ul>
<li class="chapter" data-level="16.1.1" data-path="RL.html"><a href="RL.html#general-framework"><i class="fa fa-check"></i><b>16.1.1</b> General framework</a></li>
<li class="chapter" data-level="16.1.2" data-path="RL.html"><a href="RL.html#q-learning"><i class="fa fa-check"></i><b>16.1.2</b> Q-learning</a></li>
<li class="chapter" data-level="16.1.3" data-path="RL.html"><a href="RL.html#sarsa"><i class="fa fa-check"></i><b>16.1.3</b> SARSA</a></li>
</ul></li>
<li class="chapter" data-level="16.2" data-path="RL.html"><a href="RL.html#the-curse-of-dimensionality"><i class="fa fa-check"></i><b>16.2</b> The curse of dimensionality</a></li>
<li class="chapter" data-level="16.3" data-path="RL.html"><a href="RL.html#policy-gradient"><i class="fa fa-check"></i><b>16.3</b> Policy gradient</a><ul>
<li class="chapter" data-level="16.3.1" data-path="RL.html"><a href="RL.html#principle-2"><i class="fa fa-check"></i><b>16.3.1</b> Principle</a></li>
<li class="chapter" data-level="16.3.2" data-path="RL.html"><a href="RL.html#extensions-2"><i class="fa fa-check"></i><b>16.3.2</b> Extensions</a></li>
</ul></li>
<li class="chapter" data-level="16.4" data-path="RL.html"><a href="RL.html#simple-examples"><i class="fa fa-check"></i><b>16.4</b> Simple examples</a><ul>
<li class="chapter" data-level="16.4.1" data-path="RL.html"><a href="RL.html#q-learning-with-simulations"><i class="fa fa-check"></i><b>16.4.1</b> Q-learning with simulations</a></li>
<li class="chapter" data-level="16.4.2" data-path="RL.html"><a href="RL.html#RLemp2"><i class="fa fa-check"></i><b>16.4.2</b> Q-learning with market data</a></li>
</ul></li>
<li class="chapter" data-level="16.5" data-path="RL.html"><a href="RL.html#concluding-remarks"><i class="fa fa-check"></i><b>16.5</b> Concluding remarks</a></li>
<li class="chapter" data-level="16.6" data-path="RL.html"><a href="RL.html#exercises"><i class="fa fa-check"></i><b>16.6</b> Exercises</a></li>
</ul></li>
<li class="part"><span><b>V Appendix</b></span></li>
<li class="chapter" data-level="17" data-path="data-description.html"><a href="data-description.html"><i class="fa fa-check"></i><b>17</b> Data description</a></li>
<li class="chapter" data-level="18" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html"><i class="fa fa-check"></i><b>18</b> Solutions to exercises</a><ul>
<li class="chapter" data-level="18.1" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-3"><i class="fa fa-check"></i><b>18.1</b> Chapter 3</a></li>
<li class="chapter" data-level="18.2" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-4"><i class="fa fa-check"></i><b>18.2</b> Chapter 4</a></li>
<li class="chapter" data-level="18.3" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-5"><i class="fa fa-check"></i><b>18.3</b> Chapter 5</a></li>
<li class="chapter" data-level="18.4" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-6"><i class="fa fa-check"></i><b>18.4</b> Chapter 6</a></li>
<li class="chapter" data-level="18.5" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-7-the-autoencoder-model"><i class="fa fa-check"></i><b>18.5</b> Chapter 7: the autoencoder model</a></li>
<li class="chapter" data-level="18.6" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-8"><i class="fa fa-check"></i><b>18.6</b> Chapter 8</a></li>
<li class="chapter" data-level="18.7" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-11-ensemble-neural-network"><i class="fa fa-check"></i><b>18.7</b> Chapter 11: ensemble neural network</a></li>
<li class="chapter" data-level="18.8" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-12"><i class="fa fa-check"></i><b>18.8</b> Chapter 12</a><ul>
<li class="chapter" data-level="18.8.1" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#ew-portfolios-with-the-tidyverse"><i class="fa fa-check"></i><b>18.8.1</b> EW portfolios with the tidyverse</a></li>
<li class="chapter" data-level="18.8.2" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#advanced-weighting-function"><i class="fa fa-check"></i><b>18.8.2</b> Advanced weighting function</a></li>
<li class="chapter" data-level="18.8.3" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#functional-programming-in-the-backtest"><i class="fa fa-check"></i><b>18.8.3</b> Functional programming in the backtest</a></li>
</ul></li>
<li class="chapter" data-level="18.9" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-15"><i class="fa fa-check"></i><b>18.9</b> Chapter 15</a></li>
<li class="chapter" data-level="18.10" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-16"><i class="fa fa-check"></i><b>18.10</b> Chapter 16</a></li>
</ul></li>
</ul>
</nav>
</div>
<div class="book-body">
<div class="body-inner">
<div class="book-header" role="navigation">
<h1>
<i class="fa fa-circle-o-notch fa-spin"></i><a href="./">Machine Learning for Factor Investing</a>
</h1>
</div>
<div class="page-wrapper" tabindex="-1" role="main">
<div class="page-inner">
<section class="normal" id="section-">
<div id="causality" class="section level1">
<h1><span class="header-section-number">Chapter 14</span> Two key concepts: causality and non-stationarity</h1>
<p>
A prominent criticism directed at ML tools is their inability to uncover <strong>causal</strong> relationships between features and labels because they are designed, for the most part, to capture correlations. Correlations are much weaker than causality because they characterize a two-way relationship (<span class="math inline">\(\textbf{X}\leftrightarrow \textbf{y}\)</span>), while causality specifies a direction: <span class="math inline">\(\textbf{X}\rightarrow \textbf{y}\)</span> or <span class="math inline">\(\textbf{X}\leftarrow \textbf{y}\)</span>. One fashionable example is sentiment. Many academic articles seem to find that sentiment (irrespective of its definition) is a significant driver of future returns. A high sentiment for a particular stock may increase the demand for this stock and push its price up (though contrarian reasoning may also apply: if sentiment is high, it may be a sign that mean-reversion is about to happen). The reverse causation is also plausible: returns may well cause sentiment. If a stock experiences a long period of market growth, people become bullish about this stock and sentiment increases (this notably comes from extrapolation; see <span class="citation">Barberis et al. (<a href="#ref-barberis2015x" role="doc-biblioref">2015</a>)</span> for a theoretical model). In <span class="citation">Coqueret (<a href="#ref-coqueret2018economic" role="doc-biblioref">2020</a>)</span>, it is found (in opposition to most findings in this field) that the latter relationship (returns <span class="math inline">\(\rightarrow\)</span> sentiment) is more likely. This result is backed by causality-driven tests (see Section <a href="causality.html#granger">14.1.1</a>).</p>
<p>Statistical causality is a large field and we refer to <span class="citation">Pearl (<a href="#ref-pearl2009causality" role="doc-biblioref">2009</a>)</span> for a deep dive into this topic. Recently, researchers have sought to link causality with ML approaches (see, e.g., <span class="citation">Peters, Janzing, and Schölkopf (<a href="#ref-peters2017elements" role="doc-biblioref">2017</a>)</span>, <span class="citation">Heinze-Deml, Peters, and Meinshausen (<a href="#ref-heinze2018invariant" role="doc-biblioref">2018</a>)</span>, <span class="citation">Arjovsky et al. (<a href="#ref-arjovsky2019invariant" role="doc-biblioref">2019</a>)</span>). The key notion in their work is <strong>invariance</strong>. </p>
<p>Often, data is collected not all at once, but from different sources at different moments. Some relationships found in these different sources will change, while others may remain the same. The relationships that are invariant to <strong>changing environments</strong> are likely to stem from (and signal) causality. One telling example is the following (related in <span class="citation">Beery, Van Horn, and Perona (<a href="#ref-beery2018recognition" role="doc-biblioref">2018</a>)</span>): training a computer vision algorithm to discriminate between cows and camels will lead the algorithm to focus on grass versus sand! This is because most camels are pictured in the desert, while cows are shown in green fields of grass. Thus, a picture of a camel on grass will be classified as a cow, while a cow on sand will be labelled a camel. It is only with pictures of these two animals in different contexts (environments) that the learner will end up truly finding what distinguishes a cow from a camel. A camel will remain a camel no matter where it is pictured: it should be recognized as such by the learner. If so, the representation of the camel becomes invariant across all datasets and the learner has discovered causality, i.e., the true attributes that make the camel a camel (overall silhouette, shape of the back, face, color (possibly misleading!), etc.).</p>
<p>This search for invariance makes sense for many disciplines like computer vision or natural language processing (cats will always look like cats and languages don’t change much). In finance, it is not obvious that such invariance exists. Market conditions are known to be time-varying, and the relationships between firm characteristics and returns also change from year to year. One solution to this issue may simply be to embrace <strong>non-stationarity</strong> (see Section <a href="notdata.html#notations">1.1</a> for a definition of stationarity). In Chapter <a href="backtest.html#backtest">12</a>, we advocate doing so by updating models as frequently as possible with rolling training sets: this allows the predictions to be based on the most recent trends. In Section <a href="causality.html#nonstat">14.2</a> below, we introduce other theoretical and practical options.</p>
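<p>To fix ideas, the sketch below shows what such a rolling-training scheme can look like in practice. It is a minimal illustration on simulated data, with a plain linear model standing in for any learner (the dataset and variable names are ours, purely for illustration): only the most recent <em>window</em> observations are used each time the model is re-estimated.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># Minimal sketch of rolling-window training (toy data, not the book's dataset)
set.seed(42)
n_dates <- 120                                  # e.g., 10 years of monthly data
data_toy <- data.frame(date = 1:n_dates,
                       x = rnorm(n_dates),      # one predictor
                       y = rnorm(n_dates))      # label
window <- 60                                    # 5-year training window
preds <- rep(NA, n_dates)                       # container for rolling forecasts
for (t in (window + 1):n_dates) {
  train <- data_toy[(t - window):(t - 1), ]     # most recent observations only
  fit <- lm(y ~ x, data = train)                # any model can be plugged in here
  preds[t] <- predict(fit, newdata = data_toy[t, ])
}</code></pre></div>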
<div id="causality-1" class="section level2">
<h2><span class="header-section-number">14.1</span> Causality</h2>
<p>
Traditional machine learning models aim to uncover relationships between variables but do not usually specify <em>directions</em> for these relationships. One typical example is linear regression. If we write <span class="math inline">\(y=a+bx+\epsilon\)</span>, then it is equally true that <span class="math inline">\(x=b^{-1}(y-a-\epsilon)\)</span>, which is of course also a linear relationship (with respect to <span class="math inline">\(y\)</span>). These equations do not establish causation whereby <span class="math inline">\(x\)</span> would be a clear determinant of <span class="math inline">\(y\)</span> (<span class="math inline">\(x \rightarrow y\)</span>, while the reverse need not hold).</p>
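<p>This symmetry is easy to verify numerically: regressing <span class="math inline">\(y\)</span> on <span class="math inline">\(x\)</span> or <span class="math inline">\(x\)</span> on <span class="math inline">\(y\)</span> yields exactly the same <span class="math inline">\(R^2\)</span> (the squared correlation), so the quality of the fit alone cannot reveal a direction. Below, a short illustration on simulated data (our own toy example).</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># Regressions are directionless: y ~ x and x ~ y fit equally well
set.seed(42)
x <- rnorm(500)
y <- 1 + 2 * x + rnorm(500)          # data generated with x -> y
summary(lm(y ~ x))$r.squared         # R-squared of y on x
summary(lm(x ~ y))$r.squared         # same value for the reverse regression
cor(x, y)^2                          # both equal the squared correlation</code></pre></div>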
<div id="granger" class="section level3">
<h3><span class="header-section-number">14.1.1</span> Granger causality</h3>
<p>
The most notable tool, first proposed by <span class="citation">Granger (<a href="#ref-granger1969investigating" role="doc-biblioref">1969</a>)</span>, is probably also the simplest. For simplicity, we consider only two stationary processes, <span class="math inline">\(X_t\)</span> and <span class="math inline">\(Y_t\)</span>. A strict definition of causality could be the following. <span class="math inline">\(X\)</span> can be said to cause <span class="math inline">\(Y\)</span> whenever, for some integer <span class="math inline">\(k\)</span>,
<span class="math display">\[(Y_{t+1},\dots,Y_{t+k})|(\mathcal{F}_{Y,t}\cup \mathcal{F}_{X,t}) \quad \overset{d}{\neq} \quad (Y_{t+1},\dots,Y_{t+k})|\mathcal{F}_{Y,t},\]</span>
that is, when the distribution of future values of <span class="math inline">\(Y_t\)</span>, conditionally on the knowledge of both processes, is not the same as the distribution with the sole knowledge of the filtration <span class="math inline">\(\mathcal{F}_{Y,t}\)</span>. Hence <span class="math inline">\(X\)</span> does have an impact on <span class="math inline">\(Y\)</span> because its trajectory alters that of <span class="math inline">\(Y\)</span>.</p>
<p>Now, this formulation is too vague and impossible to handle numerically, thus we simplify the setting via a linear formulation. We keep the same notations as section 5 of the original paper by <span class="citation">Granger (<a href="#ref-granger1969investigating" role="doc-biblioref">1969</a>)</span>. The test consists of two regressions:
<span class="math display">\[\begin{align*}
X_t&=\sum_{j=1}^ma_jX_{t-j}+\sum_{j=1}^mb_jY_{t-j} + \epsilon_t \\
Y_t&=\sum_{j=1}^mc_jX_{t-j}+\sum_{j=1}^md_jY_{t-j} + \nu_t
\end{align*}\]</span>
where for simplicity, it is assumed that both processes have zero mean. The usual assumptions apply: the Gaussian noises <span class="math inline">\(\epsilon_t\)</span> and <span class="math inline">\(\nu_t\)</span> are uncorrelated in every possible way (mutually and through time). The test is the following: if one <span class="math inline">\(b_j\)</span> is nonzero, then it is said that <span class="math inline">\(Y\)</span> Granger-causes <span class="math inline">\(X\)</span> and if one <span class="math inline">\(c_j\)</span> is nonzero, <span class="math inline">\(X\)</span> Granger-causes <span class="math inline">\(Y\)</span>. The two are not mutually exclusive and it is widely accepted that feedback loops can very well occur.</p>
<p>Statistically, under the null hypothesis, <span class="math inline">\(b_1=\dots=b_m=0\)</span> (<em>resp.</em> <span class="math inline">\(c_1=\dots=c_m=0\)</span>), which can be tested using the usual Fisher distribution. Obviously, the linear restriction can be dismissed but the tests are then much more complex. The main financial article in this direction is <span class="citation">Hiemstra and Jones (<a href="#ref-hiemstra1994testing" role="doc-biblioref">1994</a>)</span>.</p>
<p>Many R packages implement Granger causality tests. One of the most widespread is <em>lmtest</em>, so we work with it below. The syntax is incredibly simple. The <em>order</em> is the maximum lag <span class="math inline">\(m\)</span> in the above equation. We test if market capitalization averaged over the past 6 months Granger-causes 1 month ahead returns for one particular stock (the first in the sample).</p>
<div class="sourceCode" id="cb212"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb212-1"><a href="causality.html#cb212-1"></a><span class="kw">library</span>(lmtest)</span>
<span id="cb212-2"><a href="causality.html#cb212-2"></a>x_granger <-<span class="st"> </span>training_sample <span class="op">%>%</span><span class="st"> </span><span class="co"># X variable =...</span></span>
<span id="cb212-3"><a href="causality.html#cb212-3"></a><span class="st"> </span><span class="kw">filter</span>(stock_id <span class="op">==</span><span class="dv">1</span>) <span class="op">%>%</span><span class="st"> </span><span class="co"># ... stock nb 1</span></span>
<span id="cb212-4"><a href="causality.html#cb212-4"></a><span class="st"> </span><span class="kw">pull</span>(Mkt_Cap_6M_Usd) <span class="co"># ... & Market cap</span></span>
<span id="cb212-5"><a href="causality.html#cb212-5"></a>y_granger <-<span class="st"> </span>training_sample <span class="op">%>%</span><span class="st"> </span><span class="co"># Y variable = ...</span></span>
<span id="cb212-6"><a href="causality.html#cb212-6"></a><span class="st"> </span><span class="kw">filter</span>(stock_id <span class="op">==</span><span class="dv">1</span>) <span class="op">%>%</span><span class="st"> </span><span class="co"># ... stock nb 1</span></span>
<span id="cb212-7"><a href="causality.html#cb212-7"></a><span class="st"> </span><span class="kw">pull</span>(R1M_Usd) <span class="co"># ... & 1M return</span></span>
<span id="cb212-8"><a href="causality.html#cb212-8"></a>fit_granger <-<span class="st"> </span><span class="kw">grangertest</span>(x_granger, <span class="co"># X variable</span></span>
<span id="cb212-9"><a href="causality.html#cb212-9"></a> y_granger, <span class="co"># Y variable</span></span>
<span id="cb212-10"><a href="causality.html#cb212-10"></a> <span class="dt">order =</span> <span class="dv">6</span>, <span class="co"># Maximmum lag</span></span>
<span id="cb212-11"><a href="causality.html#cb212-11"></a> <span class="dt">na.action =</span> na.omit) <span class="co"># What to do with missing data</span></span>
<span id="cb212-12"><a href="causality.html#cb212-12"></a>fit_granger</span></code></pre></div>
<pre><code>## Granger causality test
##
## Model 1: y_granger ~ Lags(y_granger, 1:6) + Lags(x_granger, 1:6)
## Model 2: y_granger ~ Lags(y_granger, 1:6)
## Res.Df Df F Pr(>F)
## 1 149
## 2 155 -6 4.111 0.0007554 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1</code></pre>
<p>The test is directional and only assesses whether <span class="math inline">\(X\)</span> Granger-causes <span class="math inline">\(Y\)</span>. In order to test the reverse effect, one simply swaps the two arguments in the function, as in the sketch below. In the output above, the <span class="math inline">\(p\)</span>-value is very low, hence the probability of observing samples similar to ours knowing that <span class="math inline">\(H_0\)</span> holds is negligible. Thus it seems that market capitalization does Granger-cause one-month returns. We nonetheless underline that Granger causality is arguably weaker than the one defined in the next subsection. A process that Granger-causes another one simply contains useful predictive information, which is not proof of causality in a strict sense. Moreover, our test is limited to a linear model and including nonlinearities may alter the conclusion. Lastly, including other regressors (possibly omitted variables) could also change the results (see, e.g., <span class="citation">Chow, Cotsomitis, and Kwan (<a href="#ref-chow2002multivariate" role="doc-biblioref">2002</a>)</span>).</p>
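<p>As an illustration, the minimal sketch below reuses the <em>x_granger</em> and <em>y_granger</em> vectors defined above and simply inverts their roles to test whether one-month returns Granger-cause average market capitalization:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">fit_granger_rev <- grangertest(y_granger,           # Now: do 1M returns...
                               x_granger,           # ... Granger-cause market cap?
                               order = 6,           # Maximum lag
                               na.action = na.omit) # What to do with missing data
fit_granger_rev</code></pre></div>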
</div>
<div id="causal-additive-models" class="section level3">
<h3><span class="header-section-number">14.1.2</span> Causal additive models</h3>
<p>
The zoo of causal models encompasses a variety of beasts (even BARTs from Section <a href="bayes.html#BART">9.5</a> are used for this purpose in <span class="citation">Hahn, Murray, and Carvalho (<a href="#ref-hahn2019bayesian" role="doc-biblioref">2019</a>)</span>). The interested reader can have a peek at <span class="citation">Pearl (<a href="#ref-pearl2009causality" role="doc-biblioref">2009</a>)</span>, <span class="citation">Peters, Janzing, and Schölkopf (<a href="#ref-peters2017elements" role="doc-biblioref">2017</a>)</span>, <span class="citation">Maathuis et al. (<a href="#ref-maathuis2018handbook" role="doc-biblioref">2018</a>)</span> and <span class="citation">Hünermund and Bareinboim (<a href="#ref-hunermund2019causal" role="doc-biblioref">2019</a>)</span> and the references therein. One central tool in causal models is the <strong>do-calculus</strong> developed by Pearl. Whereas traditional probabilities <span class="math inline">\(P[Y|X]\)</span> give the odds of <span class="math inline">\(Y\)</span> conditionally on <strong>observing</strong> that <span class="math inline">\(X\)</span> takes some value <span class="math inline">\(x\)</span>, the do(<span class="math inline">\(\cdot\)</span>) operator <strong>forces</strong> <span class="math inline">\(X\)</span> to take value <span class="math inline">\(x\)</span>. This is a <em>looking</em> versus <em>doing</em> dichotomy. One classical example is the following. Observing a barometer gives a clue what the weather will be because high pressures are more often associated with sunny days:
<span class="math display">\[P[\text{sunny weather}|\text{barometer says ``high''} ]>P[\text{sunny weather}|\text{barometer says ``low''} ],\]</span>
but if you hack the barometer (force it to display some value),
<span class="math display">\[P[\text{sunny weather}|\text{barometer hacked to ``high''} ]=P[\text{sunny weather}|\text{barometer hacked ``low''} ],\]</span>
because hacking the barometer will have no impact on the weather. In short notation, when there is an intervention on the barometer, <span class="math inline">\(P[\text{weather}|\text{do(barometer)}]=P[\text{weather}]\)</span>. This is an interesting example related to causality. The overarching variable is pressure. Pressure impacts both the weather and the barometer and this joint effect is called confounding. However, it may not be true that the barometer impacts the weather. The interested reader who wants to dive deeper into these concepts should have a closer look at the work of Judea Pearl. Do-calculus is a very powerful theoretical framework, but it is not easy to apply it to any situation or dataset (see for instance the book review <span class="citation">Aronow and Sävje (<a href="#ref-aronow2020book" role="doc-biblioref">2019</a>)</span>).</p>
<p>While we do not formally present an exhaustive tour of the theory behind causal inference, we wish to show some practical implementations because they are easy to interpret. It is always hard to single out one type of model in particular so we choose one that can be explained with simple mathematical tools. We start with the simplest definition of a structural causal model (SCM), where we follow here chapter 3 of <span class="citation">Peters, Janzing, and Schölkopf (<a href="#ref-peters2017elements" role="doc-biblioref">2017</a>)</span>. The idea behind these models is to introduce some hierarchy (i.e., some additional structure) in the model. Formally, this gives
<span class="math display">\[\begin{align*}
X&=\epsilon_X \\
Y&=f(X,\epsilon_Y),
\end{align*}\]</span>
where the <span class="math inline">\(\epsilon_X\)</span> and <span class="math inline">\(\epsilon_Y\)</span> are independent noise variables. Plainly, a realization of <span class="math inline">\(X\)</span> is drawn randomly and then has an impact on the realization of <span class="math inline">\(Y\)</span> via <span class="math inline">\(f\)</span>. Now this scheme could be more complex if the number of observed variables were larger. Imagine a third variable comes in so that
<span class="math display">\[\begin{align*}
X&=\epsilon_X \\
Y&=f(X,\epsilon_Y),\\
Z&=g(Y,\epsilon_Z)
\end{align*}\]</span></p>
<p>In this case, <span class="math inline">\(X\)</span> has a causation effect on <span class="math inline">\(Y\)</span> and then <span class="math inline">\(Y\)</span> has a causation effect on <span class="math inline">\(Z\)</span>. We thus have the following connections:
<span class="math display">\[\begin{array}{ccccccc} X & &&&\\
&\searrow & &&\\
&&Y&\rightarrow&Z. \\
&\nearrow &&\nearrow& \\
\epsilon_Y & &\epsilon_Z
\end{array}\]</span></p>
<p>
The above representation is called a graph and graph theory has its own nomenclature, which we very briefly summarize. The variables are often referred to as <em>vertices</em> (or <em>nodes</em>) and the arrows as <em>edges</em>. Because arrows have a direction, they are called <em>directed</em> edges. When two vertices are connected via an edge, they are called <em>adjacent</em>. A sequence of adjacent vertices is called a <em>path</em>, and it is directed if all edges are arrows. Within a directed path, a vertex that comes first is a parent node and the one just after is a child node.</p>
<p>Graphs can be summarized by adjacency matrices. An adjacency matrix <span class="math inline">\(\textbf{A}=(A_{ij})\)</span> is a matrix filled with zeros and ones. <span class="math inline">\(A_{ij}=1\)</span> whenever there is an edge from vertex <span class="math inline">\(i\)</span> to vertex <span class="math inline">\(j\)</span>. Usually, self-loops (<span class="math inline">\(X \rightarrow X\)</span>) are prohibited so that adjacency matrices have zeros on the diagonal. If we consider a simplified version of the above graph like <span class="math inline">\(X \rightarrow Y \rightarrow Z\)</span>, the corresponding adjacency matrix is</p>
<p><span class="math display">\[\textbf{A}=\begin{bmatrix}
0 & 1 & 0 \\
0 & 0 & 1 \\
0& 0&0
\end{bmatrix}.\]</span></p>
<p>The letters <span class="math inline">\(X\)</span>, <span class="math inline">\(Y\)</span>, and <span class="math inline">\(Z\)</span> are naturally ordered alphabetically. There are only two arrows: from <span class="math inline">\(X\)</span> to <span class="math inline">\(Y\)</span> (first row, second column) and from <span class="math inline">\(Y\)</span> to <span class="math inline">\(Z\)</span> (second row, third column).</p>
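<p>For concreteness, this small matrix is straightforward to code. The sketch below (with variable names of our choosing) builds it in R:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">A <- matrix(c(0, 1, 0,    # X -> Y
              0, 0, 1,    # Y -> Z
              0, 0, 0),   # Z has no outgoing edge
            nrow = 3, byrow = TRUE,
            dimnames = list(c("X", "Y", "Z"),    # Rows: origin vertices
                            c("X", "Y", "Z")))   # Columns: destination vertices
A</code></pre></div>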
<p>A <strong>cycle</strong> is a particular type of path that creates a loop, i.e., when the first vertex is also the last. The sequence <span class="math inline">\(X \rightarrow Y \rightarrow Z \rightarrow X\)</span> is a cycle. Technically, cycles pose problems. To illustrate this, consider the simple sequence <span class="math inline">\(X \rightarrow Y \rightarrow X\)</span>. This would imply that a realization of <span class="math inline">\(X\)</span> causes <span class="math inline">\(Y\)</span>, which in turn would cause the realization of <span class="math inline">\(X\)</span>. While Granger causality can be viewed as allowing this kind of connection, general causal models usually avoid cycles and work with <strong>directed acyclic graphs</strong> (DAGs). Formal graph manipulations (possibly linked to do-calculus) can be computed via the <em>causaleffect</em> package of <span class="citation">Tikka and Karvanen (<a href="#ref-tikka2017identifying" role="doc-biblioref">2017</a>)</span>. Directed acyclic graphs can also be created and manipulated with the <em>dagitty</em> (Textor et al. 2016) and <em>ggdag</em> packages.</p>
<p>Equipped with these tools, we can write down a very general form of causal models:
<span class="math display" id="eq:CAM0">\[\begin{equation}
\tag{14.1}
X_j=f_j\left(\textbf{X}_{\text{pa}_D(j)},\epsilon_j \right),
\end{equation}\]</span></p>
<p>where the noise variables are mutually independent. The notation <span class="math inline">\(\text{pa}_D(j)\)</span> refers to the set of parent nodes of vertex <span class="math inline">\(j\)</span> within the graph structure <span class="math inline">\(D\)</span>. Hence, <span class="math inline">\(X_j\)</span> is a function of all of its parents and some noise term <span class="math inline">\(\epsilon_j\)</span>. An additive causal model is a mild simplification of the above specification:</p>
<p><span class="math display" id="eq:CAM">\[\begin{equation}
\tag{14.2}
X_j=\sum_{k\in \text{pa}_D(j)}f_{j,k}\left(\textbf{X}_{k} \right)+\epsilon_j,
\end{equation}\]</span></p>
<p>where the nonlinear effect of each variable is cumulative, hence the term ‘<em>additive</em>’. Note that there is no time index here. In contrast to Granger causality, there is no natural ordering. Such models are very complex and hard to estimate. The details can be found in <span class="citation">Bühlmann et al. (<a href="#ref-buhlmann2014cam" role="doc-biblioref">2014</a>)</span>. Fortunately, the authors have developed an R package that determines the DAG <span class="math inline">\(D\)</span>.</p>
<p>Below, we build the adjacency matrix pertaining to the small set of predictor variables plus the 1-month ahead return (on the training sample). The original version of the book used the <em>CAM</em> package which has a very simple syntax.<a href="#fn28" class="footnote-ref" id="fnref28"><sup>28</sup></a> Below, we test the more recent <em>InvariantCausalPrediction</em> package.</p>
<p>[[<strong>NOTE</strong>: the remainder of the subsection is under revision.]]</p>
<div class="sourceCode" id="cb214"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb214-1"><a href="causality.html#cb214-1"></a><span class="co"># library(CAM) # Activate the package</span></span>
<span id="cb214-2"><a href="causality.html#cb214-2"></a>data_caus <-<span class="st"> </span>training_sample <span class="op">%>%</span><span class="st"> </span>dplyr<span class="op">::</span><span class="kw">select</span>(<span class="kw">c</span>(<span class="st">"R1M_Usd"</span>, features_short))</span>
<span id="cb214-3"><a href="causality.html#cb214-3"></a><span class="co"># fit_cam <- CAM(data_caus) # The main function</span></span>
<span id="cb214-4"><a href="causality.html#cb214-4"></a><span class="co"># fit_cam$Adj # Showing the adjacency matrix</span></span>
<span id="cb214-5"><a href="causality.html#cb214-5"></a><span class="kw">library</span>(InvariantCausalPrediction)</span>
<span id="cb214-6"><a href="causality.html#cb214-6"></a><span class="kw">ICP</span>(<span class="dt">X =</span> training_sample <span class="op">%>%</span><span class="st"> </span>dplyr<span class="op">::</span><span class="kw">select</span>(<span class="kw">all_of</span>(features_short)) <span class="op">%>%</span><span class="st"> </span><span class="kw">as.matrix</span>(),</span>
<span id="cb214-7"><a href="causality.html#cb214-7"></a> <span class="dt">Y =</span> training_sample <span class="op">%>%</span><span class="st"> </span>dplyr<span class="op">::</span><span class="kw">pull</span>(<span class="st">"R1M_Usd"</span>),</span>
<span id="cb214-8"><a href="causality.html#cb214-8"></a> <span class="dt">ExpInd =</span> <span class="kw">round</span>(<span class="kw">runif</span>(<span class="kw">nrow</span>(training_sample))),</span>
<span id="cb214-9"><a href="causality.html#cb214-9"></a> <span class="dt">alpha =</span> <span class="fl">0.05</span>)</span></code></pre></div>
<pre><code>##
## *** 2% complete: tested 2 of 128 sets of variables</code></pre>
<pre><code>##
## Invariant Linear Causal Regression at level 0.05 (including multiplicity correction for the number of variables)
## Model has been rejected at the chosen level 0.05, that is no subset of variables leads to invariance across the environments. This can be for example due to presence of
## (a) non-linearities or
## (b) hidden variables or
## (c) interventions on the target variable.
##
## We will try to extend the functionality soon to allow non-linear models and address issue (a) [non-linearity], which currently leads to rejection of the linear model.
## If the reason might be related to issue (b) [presence of hidden variables], one can use function hiddenICP which allows for hidden variables.</code></pre>
<p>With the original <em>CAM</em> estimation (commented out above), the adjacency matrix was not too sparse, which means that the model had uncovered many relationships between the variables within the sample. Sadly, none were in the direction that is of interest for the prediction task that we seek. Indeed, the first variable is the one we want to predict and its column was empty. However, its row was full, which indicates the reverse effect: future returns cause the predictor values, which may seem rather counter-intuitive, given the nature of the features.</p>
<p>For the sake of completeness, we also provide an implementation of the <em>pcalg</em> package (<span class="citation">Kalisch et al. (<a href="#ref-kalisch2012causal" role="doc-biblioref">2012</a>)</span>).<a href="#fn29" class="footnote-ref" id="fnref29"><sup>29</sup></a> Below, an estimation via the so-called PC algorithm (named after its authors <strong>P</strong>eter Spirtes and <strong>C</strong>lark Glymour) is performed. The details of the algorithm are out of the scope of the book, and the interested reader can have a look at section 5.4 of <span class="citation">Spirtes et al. (<a href="#ref-spirtes2000causation" role="doc-biblioref">2000</a>)</span> or section 2 from <span class="citation">Kalisch et al. (<a href="#ref-kalisch2012causal" role="doc-biblioref">2012</a>)</span> for more information on this subject. We use the <em>Rgraphviz</em> package available at <a href="https://www.bioconductor.org/packages/release/bioc/html/Rgraphviz.html" class="uri">https://www.bioconductor.org/packages/release/bioc/html/Rgraphviz.html</a>.</p>
<div class="sourceCode" id="cb217"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb217-1"><a href="causality.html#cb217-1"></a><span class="kw">library</span>(pcalg) <span class="co"># Load packages</span></span>
<span id="cb217-2"><a href="causality.html#cb217-2"></a><span class="kw">library</span>(Rgraphviz)</span>
<span id="cb217-3"><a href="causality.html#cb217-3"></a>est_caus <-<span class="st"> </span><span class="kw">list</span>(<span class="dt">C =</span> <span class="kw">cor</span>(data_caus), <span class="dt">n =</span> <span class="kw">nrow</span>(data_caus)) <span class="co"># Compute correlations</span></span>
<span id="cb217-4"><a href="causality.html#cb217-4"></a>pc.fit <-<span class="st"> </span><span class="kw">pc</span>(est_caus, <span class="dt">indepTest =</span> gaussCItest, <span class="co"># Estimate model</span></span>
<span id="cb217-5"><a href="causality.html#cb217-5"></a> <span class="dt">p =</span> <span class="kw">ncol</span>(data_caus),<span class="dt">alpha =</span> <span class="fl">0.01</span>)</span>
<span id="cb217-6"><a href="causality.html#cb217-6"></a><span class="kw">iplotPC</span>(pc.fit) <span class="co"># Plot model</span></span></code></pre></div>
<div class="figure"><span id="fig:pcalg"></span>
<img src="ML_factor_files/figure-html/pcalg-1.png" alt="Representation of a directed graph." width="624" />
<p class="caption">
FIGURE 14.1: Representation of a directed graph.
</p>
</div>
<p>A bidirectional arrow is shown when the model was unable to determine the edge orientation. While the adjacency matrix is different compared to the first model, there are still no predictors that seem to have a clear causal effect on the dependent variable (first circle).</p>
</div>
<div id="structural-time-series-models" class="section level3">
<h3><span class="header-section-number">14.1.3</span> Structural time series models</h3>
<p>
We end the topic of causality by mentioning a particular type of structural models: <strong>structural time series</strong>. Because we illustrate their relevance for a particular kind of causal inference, we closely follow the notations of <span class="citation">Brodersen et al. (<a href="#ref-brodersen2015inferring" role="doc-biblioref">2015</a>)</span>. The model is driven by two equations:</p>
<p><span class="math display">\[\begin{align*}
y_t&=\textbf{Z}_t'\boldsymbol{\alpha}_t+\epsilon_t \\
\boldsymbol{\alpha}_{t+1}& =\textbf{T}_t\boldsymbol{\alpha}_{t}+\textbf{R}_t\boldsymbol{\eta}_t.
\end{align*}\]</span></p>
<p>The dependent variable is expressed as a linear function of state variables <span class="math inline">\(\boldsymbol{\alpha}_t\)</span> plus an error term. These variables are in turn linear functions of their past values plus another error term which can have a complex structure (it is the product of a matrix <span class="math inline">\(\textbf{R}_t\)</span> with a centered Gaussian term <span class="math inline">\(\boldsymbol{\eta}_t\)</span>). This specification nests many models as special cases, like ARIMA for instance.</p>
<p>The goal of <span class="citation">Brodersen et al. (<a href="#ref-brodersen2015inferring" role="doc-biblioref">2015</a>)</span> is to detect causal impacts via regime changes. They estimate the above model over a given training period and then predict the model’s response on some test set. If the aggregate (summed/integrated) error between realized and predicted values is significant (based on some statistical test), then the authors conclude that the breaking point is relevant. Originally, the aim of the approach was to quantify the effect of an intervention by looking at how a model trained before the intervention behaves after the intervention.</p>
<p>Below, we test if the 100<span class="math inline">\(^{th}\)</span> date point in the sample (April 2008) is a turning point. Arguably, this date belongs to the time span of the subprime financial crisis. We use the <em>CausalImpact</em> package which uses the <em>bsts</em> library (Bayesian structural time series).</p>
<div class="sourceCode" id="cb218"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb218-1"><a href="causality.html#cb218-1"></a><span class="kw">library</span>(CausalImpact)</span>
<span id="cb218-2"><a href="causality.html#cb218-2"></a>stock1_data <-<span class="st"> </span>data_ml <span class="op">%>%</span><span class="st"> </span><span class="kw">filter</span>(stock_id <span class="op">==</span><span class="st"> </span><span class="dv">1</span>) <span class="co"># Data of first stock</span></span>
<span id="cb218-3"><a href="causality.html#cb218-3"></a>struct_data <-<span class="st"> </span><span class="kw">data.frame</span>(<span class="dt">y =</span> stock1_data<span class="op">$</span>R1M_Usd) <span class="op">%>%</span><span class="st"> </span><span class="co"># Combine label...</span></span>
<span id="cb218-4"><a href="causality.html#cb218-4"></a><span class="st"> </span><span class="kw">cbind</span>(stock1_data <span class="op">%>%</span><span class="st"> </span>dplyr<span class="op">::</span><span class="kw">select</span>(features_short)) <span class="co"># ... and features</span></span>
<span id="cb218-5"><a href="causality.html#cb218-5"></a>pre.period <-<span class="st"> </span><span class="kw">c</span>(<span class="dv">1</span>,<span class="dv">100</span>) <span class="co"># Pre-break period (pre-2008)</span></span>
<span id="cb218-6"><a href="causality.html#cb218-6"></a>post.period <-<span class="st"> </span><span class="kw">c</span>(<span class="dv">101</span>,<span class="dv">200</span>) <span class="co"># Post-break period</span></span>
<span id="cb218-7"><a href="causality.html#cb218-7"></a>impact <-<span class="st"> </span><span class="kw">CausalImpact</span>(<span class="kw">zoo</span>(struct_data), pre.period, post.period)</span>
<span id="cb218-8"><a href="causality.html#cb218-8"></a><span class="kw">summary</span>(impact)</span></code></pre></div>
<pre><code>## Posterior inference {CausalImpact}
##
## Average Cumulative
## Actual 0.016 1.638
## Prediction (s.d.) 0.031 (0.017) 3.091 (1.712)
## 95% CI [-0.0023, 0.064] [-0.2331, 6.430]
##
## Absolute effect (s.d.) -0.015 (0.017) -1.453 (1.712)
## 95% CI [-0.048, 0.019] [-4.792, 1.871]
##
## Relative effect (s.d.) -47% (55%) -47% (55%)
## 95% CI [-155%, 61%] [-155%, 61%]
##
## Posterior tail-area probability p: 0.19309
## Posterior prob. of a causal effect: 81%
##
## For more details, type: summary(impact, "report")</code></pre>
<div class="sourceCode" id="cb220"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb220-1"><a href="causality.html#cb220-1"></a><span class="co">#summary(impact, "report") # Get the full report (see below)</span></span></code></pre></div>
<p>The time series associated with the model are shown in Figure <a href="causality.html#fig:structbayplot">14.2</a>.</p>
<div class="sourceCode" id="cb221"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb221-1"><a href="causality.html#cb221-1"></a><span class="kw">plot</span>(impact)</span></code></pre></div>
<div class="figure"><span id="fig:structbayplot"></span>
<img src="ML_factor_files/figure-html/structbayplot-1.png" alt="Output of the causal impact study." width="672" />
<p class="caption">
FIGURE 14.2: Output of the causal impact study.
</p>
</div>
<p>Below, we copy and paste the report generated by the function (obtained by the commented line in the above code). The conclusions do not support a marked effect of the crisis on the model, probably because the errors in the post period constantly change sign.</p>
<p><em>During the post-intervention period, the response variable had an average value of approx. 0.016. In the absence of an intervention, we would have expected an average response of 0.031. The 95% interval of this counterfactual prediction is [-0.0059, 0.063]. Subtracting this prediction from the observed response yields an estimate of the causal effect the intervention had on the response variable. This effect is -0.015 with a 95% interval of [-0.047, 0.022].</em></p>
<p><em>Summing up the individual data points during the post-intervention period (which can only sometimes be meaningfully interpreted), the response variable had an overall value of 1.64. Had the intervention not taken place, we would have expected a sum of 3.09. The 95% interval of this prediction is [-0.59, 6.34]. The above results are given in terms of absolute numbers. In relative terms, the response variable showed a decrease of -47%. The 95% interval of this percentage is [-152%, +72%].</em></p>
<p><em>This means that, although it may look as though the intervention has exerted a negative effect on the response variable when considering the intervention period as a whole, this effect is not statistically significant, and so cannot be meaningfully interpreted. The apparent effect could be the result of random fluctuations that are unrelated to the intervention. This is often the case when the intervention period is very long and includes much of the time when the effect has already worn off. It can also be the case when the intervention period is too short to distinguish the signal from the noise. Finally, failing to find a significant effect can happen when there are not enough control variables or when these variables do not correlate well with the response variable during the learning period.</em></p>
<p><em>The probability of obtaining this effect by chance is p = 0.199. This means the effect may be spurious and would generally not be considered statistically significant.</em></p>
</div>
</div>
<div id="nonstat" class="section level2">
<h2><span class="header-section-number">14.2</span> Dealing with changing environments</h2>
<p>The most common assumption in machine learning contributions is that the samples that are studied are i.i.d. realizations of a phenomenon that we are trying to characterize. This constraint is natural because if the relationship between <span class="math inline">\(X\)</span> and <span class="math inline">\(y\)</span> always changes, then it is very hard to infer anything from observations. One major problem in finance is that this is often the case: markets, behaviors, policies, etc., evolve all the time. This is at least partly related to the notion of absence of arbitrage: if a trading strategy worked all the time, all agents would eventually adopt it via herding, which would annihilate the corresponding gains.<a href="#fn30" class="footnote-ref" id="fnref30"><sup>30</sup></a> If the strategy is kept private, its holder would become infinitely rich, which obviously has never happened.</p>
<p>There are several ways to define changes in environments. If we denote with <span class="math inline">\(\mathbb{P}_{XY}\)</span> the multivariate distribution of all variables (features and label), with <span class="math inline">\(\mathbb{P}_{XY}=\mathbb{P}_{X}\mathbb{P}_{Y|X}\)</span>, then two simple changes are possible:</p>
<ul>
<li><strong>covariate shift</strong>: <span class="math inline">\(\mathbb{P}_{X}\)</span> changes but <span class="math inline">\(\mathbb{P}_{Y|X}\)</span> does not: the features have a fluctuating distribution, but their relationship with <span class="math inline">\(Y\)</span> remains unchanged;<br />
</li>
<li><strong>concept drift</strong>: <span class="math inline">\(\mathbb{P}_{Y|X}\)</span> changes but <span class="math inline">\(\mathbb{P}_{X}\)</span> does not: feature distributions are stable, but their relation to <span class="math inline">\(Y\)</span> is altered.</li>
</ul>
<p>Obviously, we omit the case when both items change, as it is too complex to handle. In factor investing, the feature engineering process (see Section <a href="Data.html#feateng">4.4</a>) is partly designed to bypass the risk of covariate shift. Uniformization guarantees that the marginals stay the same but correlations between features may of course change. The main issue is probably concept drift when the way features explain the label changes through time. In <span class="citation">Cornuejols, Miclet, and Barra (<a href="#ref-cornuejols2011apprentissage" role="doc-biblioref">2018</a>)</span>,<a href="#fn31" class="footnote-ref" id="fnref31"><sup>31</sup></a> the authors distinguish four types of drifts, which we reproduce in Figure <a href="causality.html#fig:conceptchange">14.3</a>. In factor models, changes are presumably a combination of all four types: they can be abrupt during crashes, but most of the time they are progressive (gradual or incremental) and never-ending (continuously recurring).</p>
<div class="figure" style="text-align: center"><span id="fig:conceptchange"></span>
<img src="images/conceptchange.png" alt="Different flavors of concept change." width="300px" />
<p class="caption">
FIGURE 14.3: Different flavors of concept change.
</p>
</div>
<p>Naturally, if we acknowledge that the environment changes, it appears logical to adapt models accordingly, i.e., dynamically. This gives rise to the so-called <strong>stability-plasticity dilemma</strong>. This dilemma is a trade-off between model <strong>reactiveness</strong> (new instances have an important impact on updates) versus <strong>stability</strong> (these instances may not be representative of a slower trend and they may thus shift the model in a suboptimal direction).</p>
<p>Practically, there are two ways to shift the cursor with respect to this dilemma: alter the chronological depth of the training sample (e.g., go further back in time) or, when it is possible, allocate more weight to recent instances. We discuss the first option in Section <a href="backtest.html#protocol">12.1</a> and the second is mentioned in Section <a href="trees.html#adaboost">6.3</a> (though the purpose in Adaboost is precisely to let the algorithm handle the weights). In neural networks, it is possible, in all generality, to introduce instance-based weights in the computation of the loss function, though this option is not (yet) available in Keras (to the best of our knowledge: the framework evolves rapidly). For simple regressions, this idea is known as <strong>weighted least squares</strong>, wherein errors are weighted inside the loss:
<span class="math display">\[L=\sum_{i=1}^Iw_i(y_i-\textbf{x}_i\textbf{b})^2.\]</span>
In matrix terms, <span class="math inline">\(L=(\textbf{y}-\textbf{Xb})'\textbf{W}(\textbf{y}-\textbf{Xb})\)</span>, where <span class="math inline">\(\textbf{W}\)</span> is a diagonal matrix of weights. The gradient with respect to <span class="math inline">\(\textbf{b}\)</span> is equal to <span class="math inline">\(2\textbf{X}'\textbf{WX}\textbf{b}-2\textbf{X}'\textbf{Wy}\)</span> so that the loss is minimized for <span class="math inline">\(\textbf{b}^*=(\textbf{X}'\textbf{WX})^{-1}\textbf{X}'\textbf{Wy}\)</span>. The standard least-square solution is recovered for <span class="math inline">\(\textbf{W}=\textbf{I}\)</span>. In order to fine-tune the reactiveness of the model, the weights must be a function that decreases as instances become older in the sample.</p>
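<p>A minimal sketch of such a weighting scheme on simulated data is shown below (the decay factor 0.99 and the sample size are arbitrary choices for illustration). The <em>weights</em> argument of the lm() function minimizes exactly the loss above.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">set.seed(42)
T_obs <- 200                      # Number of (chronologically ordered) instances
x <- rnorm(T_obs)                 # Simulated predictor
y <- 0.5 * x + rnorm(T_obs)       # Simulated label
w <- 0.99^((T_obs - 1):0)         # Decaying weights: recent points weigh more
fit_wls <- lm(y ~ x, weights = w) # Weighted least squares
coef(fit_wls)                     # The estimate b*</code></pre></div>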
<p>There is of course no perfect solution to changing financial environments. Below, we mention two routes that are taken in the ML literature to overcome the problem of non-stationarity in the data generating process. But first, we propose yet another clear verification that markets do experience time-varying distributions.</p>
<div id="non-stationarity-yet-another-illustration" class="section level3">
<h3><span class="header-section-number">14.2.1</span> Non-stationarity: yet another illustration</h3>
<p>One of the most basic practices in (financial) econometrics is to work with returns (relative price changes). The simple reason is that returns seem to behave consistently through time (monthly returns are bounded: they usually lie between -1 and +1). Prices on the other hand shift and, often, some prices never come back to past values. This makes prices harder to study.</p>
<p>Stationarity is a key notion in financial econometrics: it is much easier to characterize a phenomenon with distributional properties that remain the same through time (this makes them possible to capture). Sadly, the distribution of returns is not stationary: both the mean and the variance of returns change along cycles.</p>
<p>Below, in Figure <a href="causality.html#fig:statplot">14.4</a>, we illustrate this fact by computing the average monthly return for all calendar years in the whole dataset.</p>
<div class="sourceCode" id="cb222"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb222-1"><a href="causality.html#cb222-1"></a>data_ml <span class="op">%>%</span><span class="st"> </span></span>
<span id="cb222-2"><a href="causality.html#cb222-2"></a><span class="st"> </span><span class="kw">mutate</span>(<span class="dt">year =</span> <span class="kw">year</span>(date)) <span class="op">%>%</span><span class="st"> </span><span class="co"># Create a year variable</span></span>
<span id="cb222-3"><a href="causality.html#cb222-3"></a><span class="st"> </span><span class="kw">group_by</span>(year) <span class="op">%>%</span><span class="st"> </span><span class="co"># Group by year</span></span>
<span id="cb222-4"><a href="causality.html#cb222-4"></a><span class="st"> </span><span class="kw">summarize</span>(<span class="dt">avg_ret =</span> <span class="kw">mean</span>(R1M_Usd)) <span class="op">%>%</span><span class="st"> </span><span class="co"># Compute average return</span></span>
<span id="cb222-5"><a href="causality.html#cb222-5"></a><span class="st"> </span><span class="kw">ggplot</span>(<span class="kw">aes</span>(<span class="dt">x =</span> year, <span class="dt">y =</span> avg_ret)) <span class="op">+</span><span class="st"> </span><span class="kw">geom_col</span>() <span class="op">+</span><span class="st"> </span><span class="kw">theme_grey</span>()</span></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:statplot"></span>
<img src="ML_factor_files/figure-html/statplot-1.png" alt="Average monthly return on a yearly basis." width="350px" />
<p class="caption">
FIGURE 14.4: Average monthly return on a yearly basis.
</p>
</div>
<p>These changes in the mean are also accompanied by variations in the second moment (variance/volatility). This effect, known as volatility clustering, has been widely documented ever since the theoretical breakthrough of <span class="citation">Engle (<a href="#ref-engle1982autoregressive" role="doc-biblioref">1982</a>)</span> (and even well before). We refer for instance to <span class="citation">Cont (<a href="#ref-cont2007volatility" role="doc-biblioref">2007</a>)</span> for more details on this topic. For the computation of realized volatility in R, we strongly recommend chapter 4 in <span class="citation">Regenstein (<a href="#ref-regenstein2018reproducible" role="doc-biblioref">2018</a>)</span>.</p>
<p>In terms of machine learning models, this is also true. Below, we estimate a pure characteristic regression with one predictor, the market capitalization averaged over the past 6 months (<span class="math inline">\(r_{t+1,n}=\alpha+\beta x_{t,n}^{\text{cap}}+\epsilon_{t+1,n}\)</span>). The label is the 6-month forward return and the estimation is performed over every calendar year.</p>
<div class="sourceCode" id="cb223"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb223-1"><a href="causality.html#cb223-1"></a>data_ml <span class="op">%>%</span></span>
<span id="cb223-2"><a href="causality.html#cb223-2"></a><span class="st"> </span><span class="kw">mutate</span>(<span class="dt">year =</span> <span class="kw">year</span>(date)) <span class="op">%>%</span><span class="st"> </span><span class="co"># Create a year variable</span></span>
<span id="cb223-3"><a href="causality.html#cb223-3"></a><span class="st"> </span><span class="kw">group_by</span>(year) <span class="op">%>%</span><span class="st"> </span><span class="co"># Group by year</span></span>
<span id="cb223-4"><a href="causality.html#cb223-4"></a><span class="st"> </span><span class="kw">summarize</span>(<span class="dt">beta_cap =</span> <span class="kw">lm</span>(R6M_Usd <span class="op">~</span><span class="st"> </span>Mkt_Cap_6M_Usd) <span class="op">%>%</span><span class="st"> </span><span class="co"># Perform regression</span></span>
<span id="cb223-5"><a href="causality.html#cb223-5"></a><span class="st"> </span><span class="kw">coef</span>() <span class="op">%>%</span><span class="st"> </span><span class="co"># Extract coefs</span></span>
<span id="cb223-6"><a href="causality.html#cb223-6"></a><span class="st"> </span><span class="kw">t</span>() <span class="op">%>%</span><span class="st"> </span><span class="co"># Transpose</span></span>
<span id="cb223-7"><a href="causality.html#cb223-7"></a><span class="st"> </span><span class="kw">data.frame</span>() <span class="op">%>%</span><span class="st"> </span><span class="co"># Format into df</span></span>
<span id="cb223-8"><a href="causality.html#cb223-8"></a><span class="st"> </span><span class="kw">pull</span>(Mkt_Cap_6M_Usd)) <span class="op">%>%</span><span class="st"> </span><span class="co"># Pull coef (remove intercept)</span></span>
<span id="cb223-9"><a href="causality.html#cb223-9"></a><span class="st"> </span><span class="kw">ggplot</span>(<span class="kw">aes</span>(<span class="dt">x =</span> year, <span class="dt">y =</span> beta_cap)) <span class="op">+</span><span class="st"> </span><span class="kw">geom_col</span>() <span class="op">+</span><span class="st"> </span><span class="co"># Plot</span></span>
<span id="cb223-10"><a href="causality.html#cb223-10"></a><span class="st"> </span><span class="kw">theme_grey</span>()</span></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:conceptdriftemp"></span>
<img src="ML_factor_files/figure-html/conceptdriftemp-1.png" alt="Variations in betas with respect to 6-month market capitalization." width="350px" />
<p class="caption">
FIGURE 14.5: Variations in betas with respect to 6-month market capitalization.
</p>
</div>
<p>The bars in Figure <a href="causality.html#fig:conceptdriftemp">14.5</a> highlight the concept drift: overall, the relationship between capitalization and returns is negative (the <strong>size effect</strong> again). Sometimes it is markedly negative, sometimes, not so much. The ability of capitalization to explain returns is time-varying and models must adapt accordingly.</p>
</div>
<div id="online-learning" class="section level3">
<h3><span class="header-section-number">14.2.2</span> Online learning</h3>
<p>
Online learning refers to a subset of machine learning in which new information arrives progressively and the integration of this flow is performed iteratively (the term ‘<em>online</em>’ is not linked to the internet). In order to take the latest data updates into account, it is imperative to update the model (stating the obvious). This is clearly the case in finance and this topic is closely related to the discussion on learning windows in Section <a href="backtest.html#protocol">12.1</a>.</p>
<p>The problem is that if a 2019 model is trained on data from 2010 to 2019, the (dynamic) 2020 model will have to be re-trained with the whole dataset including the latest points from 2020. This can be computationally heavy and including just the latest points in the learning process would substantially decrease the cost. In neural networks, the sequential batch updating of weights can allow a progressive change in the model. Nonetheless, this is typically impossible for decision trees because the splits are decided once and for all. One notable exception is <span class="citation">Basak (<a href="#ref-basak2004online" role="doc-biblioref">2004</a>)</span>, but, in that case, the construction of the trees differs strongly from the original algorithm.</p>
<p>The simplest example of online learning is the Widrow-Hoff algorithm (originally from <span class="citation">Widrow and Hoff (<a href="#ref-widrow1960adaptive" role="doc-biblioref">1960</a>)</span>). The idea comes from the so-called ADALINE (ADAptive LInear NEuron) model, which is a neural network with one hidden layer with a linear activation function (i.e., like a perceptron, but with a different activation).</p>
<p>Suppose the model is linear, that is <span class="math inline">\(\textbf{y}=\textbf{Xb}+\textbf{e}\)</span> (a constant can be added to the list of predictors) and that the amount of data is both massive and coming in at a high frequency so that updating the model on the full sample is proscribed because it is technically intractable. A simple and heuristic way to update the values of <span class="math inline">\(\textbf{b}\)</span> is to compute
<span class="math display">\[\textbf{b}_{t+1} \longleftarrow \textbf{b}_t-\eta (\textbf{x}_t\textbf{b}-y_t)\textbf{x}_t',\]</span>
where <span class="math inline">\(\textbf{x}_t\)</span> is the row vector of instance <span class="math inline">\(t\)</span>. The justification is simple. The quadratic error <span class="math inline">\((\textbf{x}_t\textbf{b}_t-y_t)^2\)</span> has a gradient with respect to <span class="math inline">\(\textbf{b}_t\)</span> equal to <span class="math inline">\(2(\textbf{x}_t\textbf{b}_t-y_t)\textbf{x}_t'\)</span>; therefore, the above update is a simple example of gradient descent. <span class="math inline">\(\eta\)</span> must of course be quite small: if not, each new point will considerably alter <span class="math inline">\(\textbf{b}\)</span>, thereby resulting in a volatile model.</p>
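<p>A minimal sketch of this recursive scheme on simulated data is shown below (the learning rate <span class="math inline">\(\eta=0.01\)</span> and the true coefficients are arbitrary choices for illustration).</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">set.seed(42)
n <- 1000                           # Number of instances in the stream
eta <- 0.01                         # Learning rate (must be small)
X <- cbind(1, rnorm(n))             # Features (constant + one predictor)
y <- X %*% c(1, 2) + rnorm(n)       # Labels, with true b = (1, 2)
b <- c(0, 0)                        # Initial guess
for (t in 1:n) {                    # One pass over the data stream
  err <- sum(X[t, ] * b) - y[t]     # Prediction error for instance t
  b <- b - eta * err * X[t, ]       # Widrow-Hoff update
}
b                                   # Should be close to (1, 2)</code></pre></div>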
<p>An exhaustive review of techniques pertaining to online learning is presented in <span class="citation">Hoi et al. (<a href="#ref-hoi2018online" role="doc-biblioref">2018</a>)</span> (section 4.11 is even dedicated to portfolio selection). The book <span class="citation">Hazan and others (<a href="#ref-hazan2016introduction" role="doc-biblioref">2016</a>)</span> covers online convex optimization which is a very close domain with a large overlap with online learning. The presentation below is adapted from the second and third parts of the first survey.</p>
<p>Datasets are indexed by time: we write <span class="math inline">\(\textbf{X}_t\)</span> and <span class="math inline">\(\textbf{y}_t\)</span> for features and labels (the usual column index (<span class="math inline">\(k\)</span>) and row index (<span class="math inline">\(i\)</span>) will not be used in this section). Time has a bounded horizon <span class="math inline">\(T\)</span>. The machine learning model depends on some parameters <span class="math inline">\(\boldsymbol{\theta}\)</span> and we denote it with <span class="math inline">\(f_{\boldsymbol{\theta}}\)</span>. At time <span class="math inline">\(t\)</span> (when dataset (<span class="math inline">\(\textbf{X}_t\)</span>, <span class="math inline">\(\textbf{y}_t\)</span>) is gathered), the loss function <span class="math inline">\(L\)</span> of the trained model naturally depends on the data (<span class="math inline">\(\textbf{X}_t\)</span>, <span class="math inline">\(\textbf{y}_t\)</span>) and on the model via <span class="math inline">\(\boldsymbol{\theta}_t\)</span> which are the parameter values fitted to the time-<span class="math inline">\(t\)</span> data. For notational simplicity, we henceforth write <span class="math inline">\(L_t(\boldsymbol{\theta}_t)=L(\textbf{X}_t,\textbf{y}_t,\boldsymbol{\theta}_t )\)</span>. The key quantity in online learning is the regret over the whole time sequence:
<span class="math display" id="eq:regret">\[\begin{equation}
\tag{14.3}
R_T=\sum_{t=1}^TL_t(\boldsymbol{\theta}_t)-\underset{\boldsymbol{\theta}^*\in \boldsymbol{\Theta}}{\inf} \ \sum_{t=1}^TL_t(\boldsymbol{\theta}^*).
\end{equation}\]</span></p>
<p>The regret is the total loss incurred by the models <span class="math inline">\(\boldsymbol{\theta}_t\)</span> minus the minimal loss that could have been obtained with full knowledge of the data sequence (hence computed in hindsight). The basic methods in online learning are in fact quite similar to the batch-training of neural networks. The updating of the parameter is based on
<span class="math display" id="eq:online1">\[\begin{equation}
\tag{14.4}
\textbf{z}_{t+1}=\boldsymbol{\theta}_t-\eta_t\nabla L_t(\boldsymbol{\theta}_t),
\end{equation}\]</span>
where <span class="math inline">\(\nabla L_t(\boldsymbol{\theta}_t)\)</span> denotes the gradient of the current loss <span class="math inline">\(L_t\)</span>. One problem that can arise is when <span class="math inline">\(\textbf{z}_{t+1}\)</span> falls out of the bounds that are prescribed for <span class="math inline">\(\boldsymbol{\theta}_t\)</span>. Thus, the candidate vector for the new parameters, <span class="math inline">\(\textbf{z}_{t+1}\)</span>, is projected onto the feasible domain which we call <span class="math inline">\(S\)</span> here:
<span class="math display" id="eq:online2">\[\begin{equation}
\tag{14.5}
\boldsymbol{\theta}_{t+1}=\Pi_S(\textbf{z}_{t+1}), \quad \text{with} \quad \Pi_S(\textbf{u}) = \underset{\boldsymbol{\theta}\in S}{\text{argmin}} \ ||\boldsymbol{\theta}-\textbf{u}||_2.
\end{equation}\]</span>
Hence <span class="math inline">\(\boldsymbol{\theta}_{t+1}\)</span> is as close as possible to the intermediate choice <span class="math inline">\(\textbf{z}_{t+1}\)</span>. In <span class="citation">Hazan, Agarwal, and Kale (<a href="#ref-hazan2007logarithmic" role="doc-biblioref">2007</a>)</span>, it is shown that under suitable assumptions (e.g., <span class="math inline">\(L_t\)</span> being strictly convex with bounded gradient <span class="math inline">\(\left|\left|\underset{\boldsymbol{\theta}}{\sup} \, \nabla L_t(\boldsymbol{\theta})\right|\right|\le G\)</span>), the regret <span class="math inline">\(R_T\)</span> satisfies
<span class="math display">\[R_T \le \frac{G^2}{2H}(1+\log(T)),\]</span>
where <span class="math inline">\(H\)</span> is a scaling factor for the learning rate (also called step sizes): <span class="math inline">\(\eta_t=(Ht)^{-1}\)</span>.</p>
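<p>To fix ideas on the projection step <a href="causality.html#eq:online2">(14.5)</a>, if the feasible domain <span class="math inline">\(S\)</span> is an <span class="math inline">\(l_2\)</span> ball of radius <span class="math inline">\(r\)</span>, then <span class="math inline">\(\Pi_S\)</span> simply rescales any candidate vector that falls outside the ball. A minimal sketch (the function name is ours):</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">project_ball <- function(u, r = 1) {  # Projection onto {theta: ||theta||_2 <= r}
  nrm <- sqrt(sum(u^2))               # l2 norm of the candidate vector
  if (nrm <= r) u else r * u / nrm    # Rescale only if outside the ball
}
project_ball(c(3, 4), r = 1)          # Returns (0.6, 0.8)</code></pre></div>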
<p>More sophisticated online algorithms generalize <a href="causality.html#eq:online1">(14.4)</a> and <a href="causality.html#eq:online2">(14.5)</a> by integrating the Hessian matrix <span class="math inline">\(\nabla^2 L_t(\boldsymbol{\theta}):=[\nabla^2 L_t]_{i,j}=\frac{\partial}{\partial \boldsymbol{\theta}_i \partial \boldsymbol{\theta}_j}L_t( \boldsymbol{\theta})\)</span> and/or by including penalizations to reduce instability in <span class="math inline">\(\boldsymbol{\theta}_t\)</span>. We refer to section 2 in <span class="citation">Hoi et al. (<a href="#ref-hoi2018online" role="doc-biblioref">2018</a>)</span> for more details on these extensions.</p>
<p>An interesting stream of parameter updating is that of the passive-aggressive algorithms (PAAs) formalized in <span class="citation">Crammer et al. (<a href="#ref-crammer2006online" role="doc-biblioref">2006</a>)</span>. The base case involves classification tasks, but we stick to the regression setting below (section 5 in <span class="citation">Crammer et al. (<a href="#ref-crammer2006online" role="doc-biblioref">2006</a>)</span>). One strong limitation with PAAs is that they rely on the set of parameters where the loss is either zero or negligible: <span class="math inline">\(\boldsymbol{\Theta}^*_\epsilon=\{\boldsymbol{\theta}, L_t(\boldsymbol{\theta})< \epsilon\}\)</span>. For general loss functions and learner <span class="math inline">\(f\)</span>, this set is largely inaccessible. Thus, the algorithms in <span class="citation">Crammer et al. (<a href="#ref-crammer2006online" role="doc-biblioref">2006</a>)</span> are restricted to a particular case, namely linear <span class="math inline">\(f\)</span> and <span class="math inline">\(\epsilon\)</span>-insensitive hinge loss:</p>
<p><span class="math display">\[L_\epsilon(\boldsymbol{\theta})=\left\{ \begin{array}{ll}
0 & \text{if } \ |\boldsymbol{\theta}'\textbf{x}-y|\le \epsilon \quad (\text{close enough prediction}) \\
|\boldsymbol{\theta}'\textbf{x}-y|- \epsilon & \text{if } \ |\boldsymbol{\theta}'\textbf{x}-y| > \epsilon \quad (\text{prediction too far})
\end{array}\right.,\]</span></p>
<p>for some parameter <span class="math inline">\(\epsilon>0\)</span>. If the weight <span class="math inline">\(\boldsymbol{\theta}\)</span> is such that the model is close enough to the true value, then the loss is zero; if not, it is equal to the absolute value of the error minus <span class="math inline">\(\epsilon\)</span>. In PAA, the update of the parameter is given by
<span class="math display">\[\boldsymbol{\theta}_{t+1}= \underset{\boldsymbol{\theta}}{\text{argmin}} ||\boldsymbol{\theta}-\boldsymbol{\theta}_t||_2^2, \quad \text{subject to} \quad L_\epsilon(\boldsymbol{\theta})=0,\]</span>
hence the new parameter values are chosen such that two conditions are satisfied:<br />
- the loss is zero (by the definition of the loss, this means that the model is close enough to the true value);<br />
- and, the parameter is as close as possible to the previous parameter values.</p>
<p>By construction, if the model is good enough, the model does not move (passive phase), but if not, it is rapidly shifted towards values that yield satisfactory results (aggressive phase).</p>
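<p>In the simplest (‘hard’) version of the algorithm applied to regression, the update admits a closed form: the parameter moves along <span class="math inline">\(\textbf{x}_t\)</span> just enough to bring the loss back to zero (see <span class="citation">Crammer et al. (<a href="#ref-crammer2006online" role="doc-biblioref">2006</a>)</span>). The sketch below (with a function name of our choosing) illustrates one such step.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">pa_update <- function(theta, x, y, eps = 0.1) { # One passive-aggressive step
  err <- sum(theta * x) - y                     # Signed prediction error
  loss <- max(0, abs(err) - eps)                # Epsilon-insensitive hinge loss
  if (loss == 0) return(theta)                  # Passive phase: close enough
  tau <- loss / sum(x^2)                        # Smallest step that nullifies the loss
  theta - sign(err) * tau * x                   # Aggressive phase: shift the parameter
}
pa_update(theta = c(0, 0), x = c(1, 2), y = 1)  # Simple one-step illustration</code></pre></div>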
<p>We end this section with a historical note. Some of the ideas from online learning stem from the financial literature and from the concept of <strong>universal portfolios</strong> originally coined by <span class="citation">Cover (<a href="#ref-cover1991universal" role="doc-biblioref">1991</a>)</span> in particular. The setting is the following. The function <span class="math inline">\(f\)</span> is assumed to be linear <span class="math inline">\(f(\textbf{x}_t)=\boldsymbol{\theta}'\textbf{x}_t\)</span> and the data <span class="math inline">\(\textbf{x}_t\)</span> consists of asset returns, thus, the values are portfolio returns as long as <span class="math inline">\(\boldsymbol{\theta}'\textbf{1}_N=1\)</span> (the budget constraint). The loss functions <span class="math inline">\(L_t\)</span> correspond to a concave utility function (e.g., logarithmic) and the regret is reversed:
<span class="math display">\[R_T=\underset{\boldsymbol{\theta}^*\in \boldsymbol{\Theta}}{\sup} \ \sum_{t=1}^TL_t(\textbf{r}_t'\boldsymbol{\theta}^*)-\sum_{t=1}^TL_t(\textbf{r}_t'\boldsymbol{\theta}_t),\]</span>
where <span class="math inline">\(\textbf{r}_t\)</span> is the vector of asset returns. Thus, the program is transformed into the maximization of a concave function. Several articles (often from the Computer Science or ML communities) have proposed solutions to this type of problem: <span class="citation">Blum and Kalai (<a href="#ref-blum1999universal" role="doc-biblioref">1999</a>)</span>, <span class="citation">Agarwal et al. (<a href="#ref-agarwal2006algorithms" role="doc-biblioref">2006</a>)</span> and <span class="citation">Hazan, Agarwal, and Kale (<a href="#ref-hazan2007logarithmic" role="doc-biblioref">2007</a>)</span>. Most contributions work with price data only, with the notable exception of <span class="citation">Cover and Ordentlich (<a href="#ref-cover1996universal" role="doc-biblioref">1996</a>)</span>, which mentions external data (‘<em>side information</em>’). In the latter article, it is proven that constantly rebalanced portfolios distributed according to two random distributions achieve growth rates that are close to the unattainable optimal rates. The two distributions are the uniform law (equal weighting, once again) and the Dirichlet distribution with constant parameters equal to 1/2. Under this universal distribution, <span class="citation">Cover and Ordentlich (<a href="#ref-cover1996universal" role="doc-biblioref">1996</a>)</span> show that the wealth obtained is bounded below by:
<span class="math display">\[\text{wealth universal} \ge \frac{\text{wealth from optimal strategy}}{2(n+1)^{(m-1)/2}}, \]</span>
where <span class="math inline">\(m\)</span> is the number of assets and <span class="math inline">\(n\)</span> is the number of periods.</p>
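<p>To get a sense of the magnitude of this guarantee, a quick computation (with hypothetical values for <span class="math inline">\(m\)</span> and <span class="math inline">\(n\)</span>):</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">m <- 5                      # Number of assets
n <- 120                    # Number of periods (e.g., 10 years of months)
2 * (n + 1)^((m - 1) / 2)   # Worst-case wealth ratio: 2 * 121^2 = 29282</code></pre></div>
<p>The denominator is large in absolute terms, but it grows only polynomially in <span class="math inline">\(n\)</span>, whereas wealth typically compounds exponentially, which is why the guarantee is meaningful in the long run.</p>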
<p>The literature on online portfolio allocation is reviewed in <span class="citation">Li and Hoi (<a href="#ref-li2014online" role="doc-biblioref">2014</a>)</span> and outlined in more detail in <span class="citation">Li and Hoi (<a href="#ref-li2018online" role="doc-biblioref">2018</a>)</span>. Online learning, combined with early stopping for neural networks, is applied to factor investing in <span class="citation">Wong et al. (<a href="#ref-wong2020non" role="doc-biblioref">2020</a>)</span>. Finally, online learning is associated with clustering methods for portfolio choice in <span class="citation">Khedmati and Azin (<a href="#ref-khedmati2020online" role="doc-biblioref">2020</a>)</span>.</p>
</div>
<div id="homogeneous-transfer-learning" class="section level3">
<h3><span class="header-section-number">14.2.3</span> Homogeneous transfer learning</h3>
<p>
This subsection is mostly conceptual: apart from a couple of toy sketches below, it is not illustrated by full-blown coded applications. The concepts behind transfer learning are nonetheless valuable in that they can foster novel ideas, which is why we briefly present them.</p>
<p>Transfer learning has been surveyed numerous times. One classical reference is <span class="citation">Pan and Yang (<a href="#ref-pan2009survey" role="doc-biblioref">2009</a>)</span>, but <span class="citation">Weiss, Khoshgoftaar, and Wang (<a href="#ref-weiss2016survey" role="doc-biblioref">2016</a>)</span> is more recent and more exhaustive. Suppose we are given two datasets <span class="math inline">\(D_S\)</span> (source) and <span class="math inline">\(D_T\)</span> (target). Each dataset has its own features <span class="math inline">\(\textbf{X}^S\)</span> and <span class="math inline">\(\textbf{X}^T\)</span> and labels <span class="math inline">\(\textbf{y}^S\)</span> and <span class="math inline">\(\textbf{y}^T\)</span>. In classical supervised learning, the patterns of the target set are learned only through <span class="math inline">\(\textbf{X}^T\)</span> and <span class="math inline">\(\textbf{y}^T\)</span>. Transfer learning proposes to improve the function <span class="math inline">\(f^T\)</span> (obtained by fitting <span class="math inline">\(y_i^T=f^T(\textbf{x}_i^T)+\epsilon^T_i\)</span> on the target data) with the help of the function <span class="math inline">\(f^S\)</span> (from <span class="math inline">\(y_i^S=f^S(\textbf{x}_i^S)+\varepsilon^S_i\)</span> on the source data). Homogeneous transfer learning refers to the case when the feature space does not change, which holds in our setting. In asset management, this may fail if, for instance, new predictors are included (e.g., based on alternative data like sentiment, satellite imagery, credit card logs, etc.).</p>
<p>There are many subcategories of transfer learning, depending on what changes between the source <span class="math inline">\(S\)</span> and the target <span class="math inline">\(T\)</span>: is it the feature space, the distribution of the labels, and/or the relationship between the two? These are the same questions as in Section <a href="causality.html#nonstat">14.2</a>. The last case is of interest in finance because the link with non-stationarity is evident: it corresponds to the model <span class="math inline">\(f\)</span> in <span class="math inline">\(\textbf{y}=f(\textbf{X})\)</span> changing through time. In transfer learning jargon, it is written as <span class="math inline">\(P[\textbf{y}^S|\textbf{X}^S]\neq P[\textbf{y}^T|\textbf{X}^T]\)</span>: the conditional law of the label given the features is not the same when switching from the source to the target. Often, the term ‘domain adaptation’ is used as a synonym for transfer learning. Because of a shift in the data, the model must be adapted to maintain its accuracy. These topics are reviewed in a series of chapters in the collection edited by <span class="citation">Quionero-Candela et al. (<a href="#ref-quionero2009dataset" role="doc-biblioref">2009</a>)</span>.</p>
<p>An important and elegant result in the theory was proven by <span class="citation">Ben-David et al. (<a href="#ref-ben2010theory" role="doc-biblioref">2010</a>)</span> in the case of binary classification. We state it below, for two classifiers <span class="math inline">\(f\)</span> and <span class="math inline">\(h\)</span> with values in <span class="math inline">\(\{0,1 \}\)</span>. The average error between the two over the domain <span class="math inline">\(S\)</span> is defined by
<span class="math display">\[\epsilon_S(f,h)=\mathbb{E}_S[|f(\textbf{x})-h(\textbf{x})|].\]</span>
Then,
<span class="math display">\[\begin{equation}
\small
\epsilon_T(f_T,h)\le \epsilon_S(f_S,h)+\underbrace{2 \sup_B|P_S(B)-P_T(B)|}_{\text{ difference between domains }} + \underbrace{ \min\left(\mathbb{E}_S[|f_S(\textbf{x})-f_T(\textbf{x})|],\mathbb{E}_T[|f_S(\textbf{x})-f_T(\textbf{x})|]\right)}_{\text{difference between the two learning tasks}}, \nonumber
\end{equation}\]</span></p>
<p>where <span class="math inline">\(P_S\)</span> and <span class="math inline">\(P_T\)</span> denote the distributions of the two domains. The above inequality is a bound on the generalization performance of <span class="math inline">\(h\)</span>. If we take <span class="math inline">\(f_S\)</span> to be the best possible classifier for <span class="math inline">\(S\)</span> and <span class="math inline">\(f_T\)</span> the best one for <span class="math inline">\(T\)</span>, then the error generated by <span class="math inline">\(h\)</span> in <span class="math inline">\(T\)</span> is smaller than the sum of three components (the second of which is estimated heuristically in the sketch below):<br />
- the error in the <span class="math inline">\(S\)</span> space;<br />
- the distance between the two domains (by how much the data space has shifted);<br />
- the distance between the two best models (generators).</p>
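<p>To get a feel for this second component, a common heuristic (sometimes called the <em>proxy A-distance</em>) trains a classifier to tell source observations from target observations: if the classifier fails, the two domains overlap and the divergence term is small. The sketch below is only an illustration on synthetic data with a plain logistic regression and in-sample errors; it is not the exact quantity appearing in the bound of <span class="citation">Ben-David et al. (<a href="#ref-ben2010theory" role="doc-biblioref">2010</a>)</span>.</p>
<div class="sourceCode"><pre class="sourceCode r"><code>set.seed(1)                                       # synthetic source and target samples
n &lt;- 500
X_S &lt;- matrix(rnorm(n * 3), ncol = 3)             # source features
X_T &lt;- matrix(rnorm(n * 3, mean = 0.5), ncol = 3) # shifted target features
d &lt;- data.frame(rbind(X_S, X_T), domain = rep(0:1, each = n))
fit &lt;- glm(domain ~ ., data = d, family = binomial) # discriminate S versus T
p_hat &lt;- predict(fit, type = "response")          # in-sample probabilities, for brevity
err &lt;- mean((p_hat &gt; 0.5) != d$domain)            # classification error
2 * (1 - 2 * err)                                 # heuristic proxy of the divergence term</code></pre></div>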
<p>One solution that is often mentioned in transfer learning is instance weighting. We present it here in a general setting. In machine learning, we seek to minimize
<span class="math display">\[\begin{align*}
\epsilon_T(f)=\mathbb{E}_T\left[L(\text{y},f(\textbf{X})) \right],
\end{align*}\]</span>
where <span class="math inline">\(L\)</span> is some loss function that depends on the task (regression versus classification). This expectation can be rewritten as
<span class="math display">\[\begin{align*}
\epsilon_T(f)&=\sum_{\textbf{y},\textbf{X}}P_T(\textbf{y},\textbf{X}) L(\text{y},f(\textbf{X})) \\
&=\sum_{\textbf{y},\textbf{X}}P_S(\textbf{y},\textbf{X})\frac{P_T(\textbf{y},\textbf{X})}{P_S(\textbf{y},\textbf{X})} L(\text{y},f(\textbf{X})) \\
&=\mathbb{E}_S \left[\frac{P_T(\textbf{y},\textbf{X})}{P_S(\textbf{y},\textbf{X})} L(\text{y},f(\textbf{X})) \right].
\end{align*}\]</span></p>
<p>The key quantity is thus the transition ratio <span class="math inline">\(\frac{P_T(\textbf{y},\textbf{X})}{P_S(\textbf{y},\textbf{X})}\)</span> (a Radon–Nikodym derivative under some assumptions). Of course, this ratio is largely inaccessible in practice, but it is possible to find a weighting scheme (over the instances) that yields improvements over the error in the target space. The weighting scheme, just as in <span class="citation">Coqueret and Guida (<a href="#ref-coqueret2019training" role="doc-biblioref">2020</a>)</span>, can be binary, thereby simply excluding some observations from the computation of the error: merely removing observations from the training sample can have beneficial effects.</p>
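<p>A minimal sketch of such a binary scheme is given below, in the spirit of <span class="citation">Coqueret and Guida (<a href="#ref-coqueret2019training" role="doc-biblioref">2020</a>)</span>: the weights keep only ‘tail’ observations in the empirical loss. The data, the fitted model and the threshold are all synthetic choices of ours, for illustration only.</p>
<div class="sourceCode"><pre class="sourceCode r"><code>set.seed(2)                                       # synthetic data, illustrative only
n &lt;- 1000
x &lt;- rnorm(n)                                     # feature
y &lt;- 2 * x + rnorm(n)                             # labels
f_hat &lt;- function(x) 1.8 * x                      # some previously fitted model
loss &lt;- (y - f_hat(x))^2                          # pointwise squared loss
w &lt;- as.numeric(abs(y) &gt; 1)                       # binary weights: keep tail points only
sum(w * loss) / sum(w)                            # weighted empirical error</code></pre></div>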
<p>
More generally, the above expression can be viewed as a theoretical invitation for user-specified instance weighting (as in Section <a href="trees.html#instweight">6.4.7</a>). In asset allocation parlance, this amounts to expressing views on which observations are the most informative, e.g., value stocks can be given a larger weight in the computation of the loss if the user believes they carry more relevant information. Naturally, it then remains to minimize this loss.</p>
<p>We close this topic by mentioning a practical application of transfer learning developed in <span class="citation">Koshiyama et al. (<a href="#ref-koshiyama2020quantnet" role="doc-biblioref">2020</a>)</span>. The authors propose a neural network architecture that allows the learning process of trading strategies to be shared across several markets. Among other things, this method aims to alleviate the backtest overfitting problem.</p>
</div>
</div>
</div>
<h3>References</h3>
<div id="refs" class="references">
<div id="ref-agarwal2006algorithms">
<p>Agarwal, Amit, Elad Hazan, Satyen Kale, and Robert E Schapire. 2006. “Algorithms for Portfolio Management Based on the Newton Method.” In <em>Proceedings of the 23rd International Conference on Machine Learning</em>, 9–16. ACM.</p>
</div>
<div id="ref-arjovsky2019invariant">
<p>Arjovsky, Martin, Léon Bottou, Ishaan Gulrajani, and David Lopez-Paz. 2019. “Invariant Risk Minimization.” <em>arXiv Preprint</em>, no. 1907.02893.</p>
</div>
<div id="ref-aronow2020book">
<p>Aronow, Peter M., and Fredrik Sävje. 2019. “Book Review. The Book of Why: The New Science of Cause and Effect.” <em>Journal of the American Statistical Association</em> 115 (529): 482–85.</p>
</div>
<div id="ref-barberis2015x">
<p>Barberis, Nicholas, Robin Greenwood, Lawrence Jin, and Andrei Shleifer. 2015. “X-CAPM: An Extrapolative Capital Asset Pricing Model.” <em>Journal of Financial Economics</em> 115 (1): 1–24.</p>
</div>
<div id="ref-basak2004online">
<p>Basak, Jayanta. 2004. “Online Adaptive Decision Trees.” <em>Neural Computation</em> 16 (9): 1959–81.</p>
</div>
<div id="ref-beery2018recognition">
<p>Beery, Sara, Grant Van Horn, and Pietro Perona. 2018. “Recognition in Terra Incognita.” In <em>Proceedings of the European Conference on Computer Vision (Eccv)</em>, 456–73.</p>
</div>
<div id="ref-ben2010theory">
<p>Ben-David, Shai, John Blitzer, Koby Crammer, Alex Kulesza, Fernando Pereira, and Jennifer Wortman Vaughan. 2010. “A Theory of Learning from Different Domains.” <em>Machine Learning</em> 79 (1-2): 151–75.</p>
</div>
<div id="ref-blum1999universal">
<p>Blum, Avrim, and Adam Kalai. 1999. “Universal Portfolios with and Without Transaction Costs.” <em>Machine Learning</em> 35 (3): 193–205.</p>
</div>
<div id="ref-brodersen2015inferring">
<p>Brodersen, Kay H, Fabian Gallusser, Jim Koehler, Nicolas Remy, Steven L Scott, and others. 2015. “Inferring Causal Impact Using Bayesian Structural Time-Series Models.” <em>Annals of Applied Statistics</em> 9 (1): 247–74.</p>
</div>
<div id="ref-buhlmann2014cam">
<p>Bühlmann, Peter, Jonas Peters, Jan Ernest, and others. 2014. “CAM: Causal Additive Models, High-Dimensional Order Search and Penalized Regression.” <em>Annals of Statistics</em> 42 (6): 2526–56.</p>
</div>
<div id="ref-chow2002multivariate">
<p>Chow, Ying-Foon, John A Cotsomitis, and Andy CC Kwan. 2002. “Multivariate Cointegration and Causality Tests of Wagner’s Hypothesis: Evidence from the UK.” <em>Applied Economics</em> 34 (13): 1671–77.</p>
</div>
<div id="ref-cont2007volatility">
<p>Cont, Rama. 2007. “Volatility Clustering in Financial Markets: Empirical Facts and Agent-Based Models.” In <em>Long Memory in Economics</em>, 289–309. Springer.</p>
</div>
<div id="ref-coqueret2018economic">
<p>Coqueret, Guillaume. 2020. “Stock Specific Sentiment and Return Predictability.” <em>Quantitative Finance</em> Forthcoming.</p>
</div>
<div id="ref-coqueret2019training">
<p>Coqueret, Guillaume, and Tony Guida. 2020. “Training Trees on Tails with Applications to Portfolio Choice.” <em>Annals of Operations Research</em> 288: 181–221.</p>
</div>
<div id="ref-cornuejols2011apprentissage">
<p>Cornuejols, Antoine, Laurent Miclet, and Vincent Barra. 2018. <em>Apprentissage Artificiel: Deep Learning, Concepts et Algorithmes</em>. Eyrolles.</p>
</div>
<div id="ref-cover1991universal">
<p>Cover, Thomas M. 1991. “Universal Portfolios.” <em>Mathematical Finance</em> 1 (1): 1–29.</p>
</div>
<div id="ref-cover1996universal">
<p>Cover, Thomas M, and Erik Ordentlich. 1996. “Universal Portfolios with Side Information.” <em>IEEE Transactions on Information Theory</em> 42 (2): 348–63.</p>
</div>
<div id="ref-crammer2006online">
<p>Crammer, Koby, Ofer Dekel, Joseph Keshet, Shai Shalev-Shwartz, and Yoram Singer. 2006. “Online Passive-Aggressive Algorithms.” <em>Journal of Machine Learning Research</em> 7 (Mar): 551–85.</p>
</div>
<div id="ref-engle1982autoregressive">
<p>Engle, Robert F. 1982. “Autoregressive Conditional Heteroscedasticity with Estimates of the Variance of United Kingdom Inflation.” <em>Econometrica</em>, 987–1007.</p>
</div>
<div id="ref-granger1969investigating">
<p>Granger, Clive WJ. 1969. “Investigating Causal Relations by Econometric Models and Cross-Spectral Methods.” <em>Econometrica</em>, 424–38.</p>
</div>
<div id="ref-hahn2019bayesian">
<p>Hahn, P Richard, Jared S Murray, and Carlos Carvalho. 2019. “Bayesian Regression Tree Models for Causal Inference: Regularization, Confounding, and Heterogeneous Effects.” <em>arXiv Preprint</em>, no. 1706.09523.</p>
</div>
<div id="ref-hazan2007logarithmic">
<p>Hazan, Elad, Amit Agarwal, and Satyen Kale. 2007. “Logarithmic Regret Algorithms for Online Convex Optimization.” <em>Machine Learning</em> 69 (2-3): 169–92.</p>
</div>
<div id="ref-hazan2016introduction">
<p>Hazan, Elad, and others. 2016. “Introduction to Online Convex Optimization.” <em>Foundations and Trends in Optimization</em> 2 (3-4): 157–325.</p>
</div>
<div id="ref-heinze2018invariant">
<p>Heinze-Deml, Christina, Jonas Peters, and Nicolai Meinshausen. 2018. “Invariant Causal Prediction for Nonlinear Models.” <em>Journal of Causal Inference</em> 6 (2).</p>
</div>
<div id="ref-hiemstra1994testing">
<p>Hiemstra, Craig, and Jonathan D Jones. 1994. “Testing for Linear and Nonlinear Granger Causality in the Stock Price-Volume Relation.” <em>Journal of Finance</em> 49 (5): 1639–64.</p>
</div>
<div id="ref-hoi2018online">
<p>Hoi, Steven CH, Doyen Sahoo, Jing Lu, and Peilin Zhao. 2018. “Online Learning: A Comprehensive Survey.” <em>arXiv Preprint</em>, no. 1802.02871.</p>
</div>
<div id="ref-hunermund2019causal">
<p>Hünermund, Paul, and Elias Bareinboim. 2019. “Causal Inference and Data-Fusion in Econometrics.” <em>arXiv Preprint</em>, no. 1912.09104.</p>
</div>
<div id="ref-kalisch2012causal">
<p>Kalisch, Markus, Martin Mächler, Diego Colombo, Marloes H Maathuis, Peter Bühlmann, and others. 2012. “Causal Inference Using Graphical Models with the R Package Pcalg.” <em>Journal of Statistical Software</em> 47 (11): 1–26.</p>
</div>
<div id="ref-khedmati2020online">
<p>Khedmati, Majid, and Pejman Azin. 2020. “An Online Portfolio Selection Algorithm Using Clustering Approaches and Considering Transaction Costs.” <em>Expert Systems with Applications</em> Forthcoming: 113546.</p>
</div>
<div id="ref-koshiyama2020quantnet">
<p>Koshiyama, Adriano, Sebastian Flennerhag, Stefano B Blumberg, Nick Firoozye, and Philip Treleaven. 2020. “QuantNet: Transferring Learning Across Systematic Trading Strategies.” <em>arXiv Preprint</em>, no. 2004.03445.</p>
</div>
<div id="ref-li2014online">
<p>Li, Bin, and Steven CH Hoi. 2014. “Online Portfolio Selection: A Survey.” <em>ACM Computing Surveys (CSUR)</em> 46 (3): 35.</p>
</div>
<div id="ref-li2018online">
<p>Li, Bin, and Steven Chu Hong Hoi. 2018. <em>Online Portfolio Selection: Principles and Algorithms</em>. CRC Press.</p>
</div>
<div id="ref-maathuis2018handbook">
<p>Maathuis, Marloes, Mathias Drton, Steffen Lauritzen, and Martin Wainwright. 2018. <em>Handbook of Graphical Models</em>. CRC Press.</p>
</div>
<div id="ref-pan2009survey">
<p>Pan, Sinno Jialin, and Qiang Yang. 2009. “A Survey on Transfer Learning.” <em>IEEE Transactions on Knowledge and Data Engineering</em> 22 (10): 1345–59.</p>
</div>
<div id="ref-pearl2009causality">
<p>Pearl, Judea. 2009. <em>Causality: Models, Reasoning and Inference. Second Edition</em>. Vol. 29. Cambridge University Press.</p>
</div>
<div id="ref-peters2017elements">
<p>Peters, Jonas, Dominik Janzing, and Bernhard Schölkopf. 2017. <em>Elements of Causal Inference: Foundations and Learning Algorithms</em>. MIT Press.</p>
</div>
<div id="ref-quionero2009dataset">
<p>Quionero-Candela, Joaquin, Masashi Sugiyama, Anton Schwaighofer, and Neil D Lawrence. 2009. <em>Dataset Shift in Machine Learning</em>. MIT Press.</p>
</div>
<div id="ref-regenstein2018reproducible">
<p>Regenstein, Jonathan K. 2018. <em>Reproducible Finance with R: Code Flows and Shiny Apps for Portfolio Analysis</em>. Chapman & Hall / CRC.</p>
</div>
<div id="ref-spirtes2000causation">
<p>Spirtes, Peter, Clark N Glymour, Richard Scheines, and David Heckerman. 2000. <em>Causation, Prediction, and Search</em>. MIT Press.</p>
</div>
<div id="ref-tikka2017identifying">
<p>Tikka, Santtu, and Juha Karvanen. 2017. “Identifying Causal Effects with the R Package Causaleffect.” <em>Journal of Statistical Software</em> 76 (1): 1–30.</p>
</div>
<div id="ref-weiss2016survey">
<p>Weiss, Karl, Taghi M Khoshgoftaar, and DingDing Wang. 2016. “A Survey of Transfer Learning.” <em>Journal of Big Data</em> 3 (1): 9.</p>
</div>
<div id="ref-widrow1960adaptive">
<p>Widrow, Bernard, and Marcian E Hoff. 1960. “Adaptive Switching Circuits.” In <em>IRE Wescon Convention Record</em>, 4:96–104.</p>
</div>
<div id="ref-wong2020non">
<p>Wong, Steven YK, Jennifer Chan, Lamiae Azizi, and Richard YD Xu. 2020. “Time-Varying Neural Network for Stock Return Prediction.” <em>arXiv Preprint</em>, no. 2003.02515.</p>
</div>
</div>
<div class="footnotes">
<hr />
<ol start="28">
<li id="fn28"><p>The CAM package was removed from CRAN in November 2019 but can still be installed manually. First, download the content of the package: <a href="https://cran.r-project.org/web/packages/CAM/index.html" class="uri">https://cran.r-project.org/web/packages/CAM/index.html</a>. Second, copy it into the directory obtained by typing <em>.libPaths()</em> in the console.<a href="causality.html#fnref28" class="footnote-back">↩︎</a></p></li>
<li id="fn29"><p>Another possible choice is the <em>baycn</em> package documented in <span class="citation">E. A. Martin and Fu (<a href="#ref-martin2019bayesian" role="doc-biblioref">2019</a>)</span>.<a href="causality.html#fnref29" class="footnote-back">↩︎</a></p></li>
<li id="fn30"><p>See for instance the papers on herding in factor investing: <span class="citation">Krkoska and Schenk-Hoppé (<a href="#ref-krkoska2019herding" role="doc-biblioref">2019</a>)</span> and <span class="citation">Santi and Zwinkels (<a href="#ref-santi2018exploring" role="doc-biblioref">2018</a>)</span>.<a href="causality.html#fnref30" class="footnote-back">↩︎</a></p></li>
<li id="fn31"><p>This book is probably the most complete reference for theoretical results in machine learning, but it is in French.<a href="causality.html#fnref31" class="footnote-back">↩︎</a></p></li>
</ol>
</div>
</section>
</div>
</div>
</div>
<a href="interp.html" class="navigation navigation-prev " aria-label="Previous page"><i class="fa fa-angle-left"></i></a>
<a href="unsup.html" class="navigation navigation-next " aria-label="Next page"><i class="fa fa-angle-right"></i></a>
</div>
</div>
<script src="libs/gitbook-2.6.7/js/app.min.js"></script>
<script src="libs/gitbook-2.6.7/js/lunr.js"></script>
<script src="libs/gitbook-2.6.7/js/clipboard.min.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-search.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-sharing.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-fontsettings.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-bookdown.js"></script>
<script src="libs/gitbook-2.6.7/js/jquery.highlight.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-clipboard.js"></script>
<script>
gitbook.require(["gitbook"], function(gitbook) {
gitbook.start({
"sharing": {
"github": false,
"facebook": false,
"twitter": true,
"linkedin": true,
"weibo": false,
"instapaper": false,
"vk": false,
"all": ["facebook", "twitter", "linkedin", "weibo", "instapaper"]
},
"fontsettings": {
"theme": "white",
"family": "sans",
"size": 2
},
"edit": null,
"history": {
"link": null,
"text": null
},
"view": {
"link": null,
"text": null
},
"download": null,
"toc": {
"collapse": "section",
"scroll_highlight": true
},
"toolbar": {
"position": "fixed",
"download": false
},
"search": true,
"info": true
});
});
</script>
<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
(function () {
var script = document.createElement("script");
script.type = "text/javascript";
var src = "true";
if (src === "" || src === "true") src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-MML-AM_CHTML";
if (location.protocol !== "file:")
if (/^https?:/.test(src))
src = src.replace(/^https?:/, '');
script.src = src;
document.getElementsByTagName("head")[0].appendChild(script);