-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.html
744 lines (518 loc) · 36.4 KB
/
index.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=2">
<meta name="theme-color" content="#222">
<meta name="generator" content="Hexo 6.1.0">
<link rel="apple-touch-icon" sizes="180x180" href="/images/apple-touch-icon-next.png">
<link rel="icon" type="image/png" sizes="32x32" href="/images/favicon-32x32-next.png">
<link rel="icon" type="image/png" sizes="16x16" href="/images/favicon-16x16-next.png">
<link rel="mask-icon" href="/images/logo.svg" color="#222">
<link rel="stylesheet" href="/css/main.css">
<link rel="stylesheet" href="/lib/font-awesome/css/all.min.css">
<script id="hexo-configurations">
// Theme namespace: reuse window.NexT if an earlier script already created it,
// otherwise start from an empty object.
var NexT = window.NexT || {};
// Site/theme settings embedded in the page as a single object literal.
// Other inline scripts in this document read it (CONFIG.comments) and extend it (CONFIG.page).
// NOTE(review): presumably emitted by the site generator from the theme config — confirm
// before hand-editing, since a rebuild would likely overwrite changes.
var CONFIG = {"hostname":"example.com","root":"/","scheme":"Pisces","version":"7.8.0","exturl":false,"sidebar":{"position":"left","display":"post","padding":18,"offset":12,"onmobile":false},"copycode":{"enable":false,"show_result":false,"style":null},"back2top":{"enable":true,"sidebar":false,"scrollpercent":false},"bookmark":{"enable":false,"color":"#222","save":"auto"},"fancybox":false,"mediumzoom":false,"lazyload":false,"pangu":false,"comments":{"style":"tabs","active":null,"storage":true,"lazyload":false,"nav":null},"algolia":{"hits":{"per_page":10},"labels":{"input_placeholder":"Search for Posts","hits_empty":"We didn't find any results for the search: ${query}","hits_stats":"${hits} results found in ${time} ms"}},"localsearch":{"enable":false,"trigger":"auto","top_n_per_article":1,"unescape":false,"preload":false},"motion":{"enable":true,"async":false,"transition":{"post_block":"fadeIn","post_header":"slideDownIn","post_body":"slideDownIn","coll_header":"slideLeftIn","sidebar":"slideUpIn"}}};
</script>
<meta property="og:type" content="website">
<meta property="og:title" content="5880">
<meta property="og:url" content="http://example.com/index.html">
<meta property="og:site_name" content="5880">
<meta property="og:locale" content="zh_CN">
<meta property="article:author" content="籍虞兰羽">
<meta name="twitter:card" content="summary">
<link rel="canonical" href="http://example.com/">
<script id="page-configurations">
// Per-page values attached to the global CONFIG object, which must already exist
// when this script runs. Field names follow Hexo's page variables:
// https://hexo.io/docs/variables.html
CONFIG.page = {
  sidebar: "",   // empty string on this page
  isHome: true,  // this document is the site index
  isPost: false,
  lang: "zh-CN"
};
</script>
<title>5880</title>
<noscript>
<style>
.use-motion .brand,
.use-motion .menu-item,
.sidebar-inner,
.use-motion .post-block,
.use-motion .pagination,
.use-motion .comments,
.use-motion .post-header,
.use-motion .post-body,
.use-motion .collection-header { opacity: initial; }
.use-motion .site-title,
.use-motion .site-subtitle {
opacity: initial;
top: initial;
}
.use-motion .logo-line-before i { left: initial; }
.use-motion .logo-line-after i { right: initial; }
</style>
</noscript>
</head>
<body itemscope itemtype="http://schema.org/WebPage">
<div class="container use-motion">
<div class="headband"></div>
<header class="header" itemscope itemtype="http://schema.org/WPHeader">
<div class="header-inner"><div class="site-brand-container">
<div class="site-nav-toggle">
<div class="toggle" aria-label="切换导航栏">
<span class="toggle-line toggle-line-first"></span>
<span class="toggle-line toggle-line-middle"></span>
<span class="toggle-line toggle-line-last"></span>
</div>
</div>
<div class="site-meta">
<a href="/" class="brand" rel="start">
<span class="logo-line-before"><i></i></span>
<h1 class="site-title">5880</h1>
<span class="logo-line-after"><i></i></span>
</a>
</div>
<div class="site-nav-right">
<div class="toggle popup-trigger">
</div>
</div>
</div>
<nav class="site-nav">
<ul id="menu" class="main-menu menu">
<li class="menu-item menu-item-home">
<a href="/" rel="section"><i class="fa fa-home fa-fw"></i>首页</a>
</li>
<li class="menu-item menu-item-archives">
<a href="/archives/" rel="section"><i class="fa fa-archive fa-fw"></i>归档</a>
</li>
</ul>
</nav>
</div>
</header>
<div class="back-to-top">
<i class="fa fa-arrow-up"></i>
<span>0%</span>
</div>
<div class="reading-progress-bar"></div>
<main class="main">
<div class="main-inner">
<div class="content-wrap">
<div class="content index posts-expand">
<article itemscope itemtype="http://schema.org/Article" class="post-block" lang="zh-CN">
<link itemprop="mainEntityOfPage" href="http://example.com/2022/04/05/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0-%E7%AC%94%E8%AE%B0/">
<span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
<meta itemprop="image" content="/images/avatar.gif">
<meta itemprop="name" content="籍虞兰羽">
<meta itemprop="description" content="">
</span>
<span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
<meta itemprop="name" content="5880">
</span>
<header class="post-header">
<h2 class="post-title" itemprop="name headline">
<a href="/2022/04/05/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0-%E7%AC%94%E8%AE%B0/" class="post-title-link" itemprop="url">机器学习-笔记</a>
</h2>
<div class="post-meta">
<span class="post-meta-item">
<span class="post-meta-item-icon">
<i class="far fa-calendar"></i>
</span>
<span class="post-meta-item-text">发表于</span>
<time title="创建时间:2022-04-05 20:48:58 / 修改时间:20:49:33" itemprop="dateCreated datePublished" datetime="2022-04-05T20:48:58+08:00">2022-04-05</time>
</span>
</div>
</header>
<div class="post-body" itemprop="articleBody">
<h2 id="机器学习-决策树"><a href="#机器学习-决策树" class="headerlink" title="机器学习 决策树"></a>机器学习 决策树</h2><h3 id="随机森林-1-创建、使用和评估"><a href="#随机森林-1-创建、使用和评估" class="headerlink" title="随机森林-1-创建、使用和评估"></a>随机森林-1-创建、使用和评估</h3><ol>
<li><code>create a bootstrapped dataset </code>随机抽取样本,抽取样本量和原始数据量一致,可以重复</li>
<li><code>only considering a random subset of variables at each step</code> 从上文选择的bootstrapped数据集中选取随机变量(变量最优数量随后再说),选一个作为根节点,然后继续之前的操作选除了根节点变量以外的选随机变量作为叶节点</li>
<li><code>go back to step 1 and repeat</code>新建新的bootstrapped 数据集,在每一步考虑变量的子集新建更多的树</li>
<li>随机森林要比单独的决策树要更有效</li>
<li>参考所有树的结果,根据投票结果,票多的胜,进行最后的判断</li>
<li><code>Bootstrapping the data plus using the aggregate to make a decision is called "Bagging"</code></li>
</ol>
<p>在bootstrapped dataset中,没有被选中的数据称为Out-Of-Bag dataset,袋外结果如果没有被正确分类,就被称为Out-Of-Bag Error,可以用于评估随机森林效果,这有助于我们在step 2时确定选择变量的数目(微调变量数目),一般选择总变量数目的平方根的变量,并在附近的数字上微调</p>
<h3 id="缺失数据与clustering"><a href="#缺失数据与clustering" class="headerlink" title="缺失数据与clustering"></a>缺失数据与clustering</h3><p>两种类型的缺测:</p>
<ol>
<li>用于创建随机森林的数据中包含缺测</li>
<li>用于分类的新样本中包含缺测</li>
</ol>
<h2 id="Regression-Trees"><a href="#Regression-Trees" class="headerlink" title="Regression Trees"></a>Regression Trees</h2><ol>
<li>选定少于x个样本时,无法形成分叉节点,只能形成叶子(所属叶子中样本的平均作为叶子的数值),</li>
<li>通过不断改变节点阈值,找到残差(残差平方和)最小的阈值作为节点阈值,并进行分叉</li>
<li>重复以上步骤,直至无法进行分叉</li>
</ol>
<h3 id="如何对回归树剪枝"><a href="#如何对回归树剪枝" class="headerlink" title="如何对回归树剪枝"></a>如何对回归树剪枝</h3><p>剪枝是为了防止过度拟合</p>
<ol>
<li>计算每个叶子的残差,将一棵树的所有叶子的残差相加,就是树的残差</li>
<li>计算剪去各个枝叶后的树的残差。<code>Tree Score=SSR+alpha*T</code>,alpha是微调参数,我们可以在交叉验证中调整,T为树中叶子的数量,这里的<code>alpha*T</code>是为了补偿不同树中叶子数量差异带来的影响</li>
<li>选择Tree Score最小的树作为剪枝后的树</li>
<li>如何选择alpha:给定不同的alpha值,通过剪枝选取当前alpha下Tree Score最小的树,代入测试集中,找出测试集中Tree Score最小的树对应的alpha值;继续使用新的训练集和测试集数据,重复以上操作,选取alpha值,重复十次,将十次中选中的alpha值做平均</li>
</ol>
<h2 id="AdaBoost"><a href="#AdaBoost" class="headerlink" title="AdaBoost"></a>AdaBoost</h2><ol>
<li>Stump(树桩)只有一个变量,一个根两个叶子,所以学习能力很差,是弱学习器,但AdaBoost很喜欢它,并且普遍存在于树中</li>
<li>随机森林中,每棵树是独立的,这与AdaBoost形成鲜明对比;AdaBoost第一棵树的误差会影响第二棵树,以此类推</li>
</ol>
<p>AdaBoost的三个理念:</p>
<ol>
<li>AdaBoost结合了许多弱学习器,这些弱学习器几乎都是树桩</li>
<li>一些树桩相对其他树桩的话语权更大</li>
<li>每个树桩都是考虑了前一个树桩的误差</li>
</ol>
<p>AdaBoost的计算流程</p>
<p><img src="C:\Users\tangyuheng\AppData\Roaming\Typora\typora-user-images\image-20211222221132546.png" alt="image-20211222221132546"></p>
<p>m表示类别的数量,pm表示样本属于第m类的概率,基尼系数越小 ,分类越好</p>
<ol>
<li>给每个样本相同的权重</li>
<li>构建多个弱学习器,查看每个弱学习器的Gini系数,系数最低的是森林的第一个树桩</li>
<li>接下来确定步骤2的树桩的发言权,根据树桩对样本分类的程度给定发言权的大小</li>
<li>由于样本权重的总和为1,总误差总是在0到1之间,0是最好的树桩,1是最差的树桩</li>
<li>使用总误差决定树桩的发言权</li>
</ol>
<p><img src="C:\Users\tangyuheng\AppData\Roaming\Typora\typora-user-images\image-20211222222623530.png" alt="image-20211222222623530"></p>
<p>如果总误差是0.5的话,则树桩发言权是0</p>
<ol start="6">
<li>我们需要修改样本权重,以便下一个树桩能够考虑到上一个树桩的结果,对上一个树桩分类错误的样本增加权重,上一个树桩发言权越大,被错误分类样本的权重越大</li>
</ol>
<p><img src="C:\Users\tangyuheng\AppData\Roaming\Typora\typora-user-images\image-20211222223315898.png" alt="image-20211222223315898"></p>
<ol start="7">
<li>对上一个树桩分类正确的样本降低权重,树桩发言权越大,正确样本权重就越小</li>
</ol>
<p><img src="C:\Users\tangyuheng\AppData\Roaming\Typora\typora-user-images\image-20211222223718329.png" alt="image-20211222223718329"></p>
<ol start="8">
<li><p>更新样本权重,将样本权重之和归一(每个权重除以样本权重之和)</p>
</li>
<li><p>重复以上步骤,创建第二个树桩,可以使用权重Gini系数选择一个变量;我们也可以不用权重Gini系数,我们从0-1中取值,数值所属的样本被选中,所以样本权重越大的样本越容易被选中,这样所有的样本具有相同的权重</p>
</li>
<li><p>形成森林后,对判断为A类的树桩的发言权求和,对判断为B类的树桩的发言权求和,哪一个发言权大就是哪一类</p>
</li>
</ol>
<h2 id="Gradient-Boost"><a href="#Gradient-Boost" class="headerlink" title="Gradient Boost"></a>Gradient Boost</h2><p>Note:GB是用于预测连续的数值,这里我们用GB做回归</p>
<ol>
<li>不同于AdaBoost,GB开始是用叶子而不是树桩,随后根据上一棵树的误差,建立下一棵树,这棵树一般要比树桩更复杂,并且叶子数量设置在8-32之间</li>
<li>选定一个数值变量,确定第一个叶子的阈值,计算每个样本和阈值的差,并保存为Residual,每个样本都有一个Residual</li>
<li>利用除step2的其他变量构建树,树的节点数值为分类后样本的step2中Residual的平均,这样这棵树上每个叶子都有一个数值</li>
<li>通过学习率对这棵树计算结果进行加权</li>
</ol>
<p><img src="C:\Users\tangyuheng\AppData\Roaming\Typora\typora-user-images\image-20211223122558210.png" alt="image-20211223122558210"></p>
<ol start="5">
<li>计算得到新的Residual,对Residual进行更新</li>
</ol>
<p><img src="C:\Users\tangyuheng\AppData\Roaming\Typora\typora-user-images\image-20211223122734939.png" alt="image-20211223122734939"></p>
<ol start="6">
<li>基于前一个树计算得到的Residual构建新的树,并用新的树和之前的树还有叶子进行计算新的Residual</li>
</ol>
<p><img src="C:\Users\tangyuheng\AppData\Roaming\Typora\typora-user-images\image-20211223123109927.png" alt="image-20211223123109927"></p>
<h3 id="数学过程"><a href="#数学过程" class="headerlink" title="数学过程"></a>数学过程</h3><p>输入:需要有训练样本以及可导的损失函数</p>
<ol>
<li>初始化模型,并给定一个常数使得损失函数之和最小,这里给的是所有数值的平均,是损失函数最小的数</li>
<li>做一个树的循环,总共新建M棵树,A中gamma表示第m棵树中第i个样本的Residual,yi表示第i个样本的标签值,Fm-1(xi)表示第m-1棵树的结果,所以A是更新样本Residual;B中新建一个回归树,创建叶子Rjm,j表示第j个叶子;C计算B中新树结果加上之前树的Residual得到的结果与样本标签值损失函数最小时对应的Residual值,得到叶子的Residual值Residual jm;D上一个树的结果加上学习率乘以当前树的Residual值,更新得到当前森林结构下预测的数值</li>
</ol>
<p><img src="C:\Users\tangyuheng\AppData\Roaming\Typora\typora-user-images\image-20211223134236979.png" alt="image-20211223134236979"></p>
<p><img src="C:\Users\tangyuheng\AppData\Roaming\Typora\typora-user-images\image-20211223135702865.png" alt="image-20211223135702865"></p>
<p><img src="C:\Users\tangyuheng\AppData\Roaming\Typora\typora-user-images\image-20211223135802189.png" alt="image-20211223135802189"></p>
<h2 id="XGBoost"><a href="#XGBoost" class="headerlink" title="XGBoost"></a>XGBoost</h2><p>Note:XGBoost被用于大型复杂数据集当中,是extreme Gradient Boost</p>
<ol>
<li>首先预测出一个数值</li>
</ol>
<p><img src="C:\Users\tangyuheng\AppData\Roaming\Typora\typora-user-images\image-20211223144241526.png" alt="image-20211223144241526"></p>
<ol start="2">
<li>构建出一个树,计算根和叶的相似系数</li>
</ol>
<p><img src="C:\Users\tangyuheng\AppData\Roaming\Typora\typora-user-images\image-20211223144542252.png" alt="image-20211223144542252"></p>
<ol start="3">
<li>计算增益系数</li>
</ol>
<p><img src="C:\Users\tangyuheng\AppData\Roaming\Typora\typora-user-images\image-20211223144703829.png" alt="image-20211223144703829"></p>
<ol start="4">
<li>更换分叉的阈值,继续计算增益系数,哪个增益系数越大,就用哪个分叉阈值</li>
</ol>
<p><img src="C:\Users\tangyuheng\AppData\Roaming\Typora\typora-user-images\image-20211223144833610.png" alt="image-20211223144833610"></p>
<ol start="5">
<li>对叶子节点继续分叉,计算相似系数,计算这个叶子分叉的增益系数,选定增益系数最大的分叉阈值,当叶子中样本数量少于一定值时不再分叉</li>
<li>给定一个剪枝系数gamma,如果增益系数小于剪枝系数,则进行剪枝,但是不能跳过叶子节点分叉直接对根节点分叉进行剪枝,我们可以将所有的叶子节点和根节点减去,只留step1的数值</li>
<li>将相似系数中的lambda设置为1,此为正则项,叶子中样本数量越少,加入正则项后的惩罚越大,增益系数是可以小于零的,所以即使step6中的gamma=0,我们也可以进行剪枝</li>
<li>叶子的输出结果和相似系数很类似,但是并没有对Residual和求平方,这里的lambda=1,当lambda=0时,输出结果就是叶子中Residual的平均</li>
</ol>
<p><img src="C:\Users\tangyuheng\AppData\Roaming\Typora\typora-user-images\image-20211223152842621.png" alt="image-20211223152842621"></p>
<ol start="9">
<li>默认学习率是0.3,输出结果考虑最开始的预测值,加上学习率乘以树的预测值</li>
</ol>
<p><img src="C:\Users\tangyuheng\AppData\Roaming\Typora\typora-user-images\image-20211223153250518.png" alt="image-20211223153250518"></p>
<h3 id="数学过程-1"><a href="#数学过程-1" class="headerlink" title="数学过程"></a>数学过程</h3><ol>
<li>回归模型损失函数</li>
</ol>
<p><img src="C:\Users\tangyuheng\AppData\Roaming\Typora\typora-user-images\image-20211223154901543.png" alt="image-20211223154901543"></p>
<ol start="2">
<li>分类模型损失函数</li>
</ol>
<p><img src="C:\Users\tangyuheng\AppData\Roaming\Typora\typora-user-images\image-20211223155055943.png" alt="image-20211223155055943"></p>
<ol start="3">
<li>损失函数还有一项是剪枝项,由于剪枝是在树构建完成后进行的,其对树的构建并没有作用,因此这里忽略,最后一项lambda是惩罚项,O^2是所有输出值的平方</li>
</ol>
<p><img src="C:\Users\tangyuheng\AppData\Roaming\Typora\typora-user-images\image-20211223155240880.png" alt="image-20211223155240880"></p>
<ol start="4">
<li>损失函数也可以这样写,通过改变输出值的大小,找出损失函数最小的对应的输出值,对损失函数的求导,需要考虑到二阶导数,也就是用二阶泰勒展开,一阶导数用g(gradient)表示,二阶导数用h(hessian)表示</li>
</ol>
<p><img src="C:\Users\tangyuheng\AppData\Roaming\Typora\typora-user-images\image-20211223155611122.png" alt="image-20211223155611122"></p>
<p><img src="C:\Users\tangyuheng\AppData\Roaming\Typora\typora-user-images\image-20211223160433280.png" alt="image-20211223160433280"></p>
<ol start="5">
<li>展开成级数后求一阶导数,我们知道g和h的值,就可以推导出O的公式,O取这个值时,损失函数最小</li>
</ol>
<p><img src="C:\Users\tangyuheng\AppData\Roaming\Typora\typora-user-images\image-20211223160702175.png" alt="image-20211223160702175"></p>
<p><img src="C:\Users\tangyuheng\AppData\Roaming\Typora\typora-user-images\image-20211223160901819.png" alt="image-20211223160901819"></p>
<p><img src="C:\Users\tangyuheng\AppData\Roaming\Typora\typora-user-images\image-20211223160915423.png" alt="image-20211223160915423"></p>
<ol start="6">
<li>相似度推导,损失函数二阶展开,前面加负号,将之前算得的O代入</li>
</ol>
<p><img src="C:\Users\tangyuheng\AppData\Roaming\Typora\typora-user-images\image-20211223161946572.png" alt="image-20211223161946572"></p>
<p><img src="C:\Users\tangyuheng\AppData\Roaming\Typora\typora-user-images\image-20211223162133354.png" alt="image-20211223162133354"></p>
<ol start="7">
<li>最后得到相似度定义,去除1/2就是相似度,代入g和h 得到</li>
</ol>
<p><img src="C:\Users\tangyuheng\AppData\Roaming\Typora\typora-user-images\image-20211223162306817.png" alt="image-20211223162306817"></p>
<p><img src="C:\Users\tangyuheng\AppData\Roaming\Typora\typora-user-images\image-20211223162419545.png" alt="image-20211223162419545"></p>
<h2 id="Optimizations"><a href="#Optimizations" class="headerlink" title="Optimizations"></a>Optimizations</h2><p><img src="C:\Users\tangyuheng\AppData\Roaming\Typora\typora-user-images\image-20211223163244117.png" alt="image-20211223163244117"></p>
<ol>
<li>近似贪心算法:改变阈值使得增益系数最大,但不考虑随后叶子将如何分支(目光短浅,但是建树快),当有许多样本时,贪心算法会遍历所有的样本,找到增益系数最大的阈值,速度会变得很慢;如果不用贪心算法,XGBoost将会更加慎重考虑阈值大小,目光长远 </li>
<li>将数据划分出若干分位数,用分位数表示阈值,选择更少,速度更快,一般用33个左右的分位数</li>
<li>使用加权分位草图(sketch),这里的加权是对样本加权,样本的权重是h(hessian)也就是1,对于分类问题,概率分布在0.5附近的样本权重大,分布在0或者1附近的样本权重小,选用加权分位草图使得我们使用的时候有更小的分位数</li>
</ol>
<p><img src="C:\Users\tangyuheng\AppData\Roaming\Typora\typora-user-images\image-20211223182036722.png" alt="image-20211223182036722"></p>
<p><img src="C:\Users\tangyuheng\AppData\Roaming\Typora\typora-user-images\image-20211223182245444.png" alt="image-20211223182245444"></p>
<p><img src="C:\Users\tangyuheng\AppData\Roaming\Typora\typora-user-images\image-20211223182401084.png" alt="image-20211223182401084"></p>
<p><img src="C:\Users\tangyuheng\AppData\Roaming\Typora\typora-user-images\image-20211223182423035.png" alt="image-20211223182423035"></p>
<ol start="4">
<li>Sparsity-Aware Split Finding:如果数据中有缺测,将无缺测数据分为一组,有缺测数据分为另一组,用无缺测数据构建树,将有缺测数据分别放在左侧叶子和右侧叶子,分别计算增益系数,更换分叉阈值,继续将缺测数据放在左侧和右侧叶子中,分别计算增益系数,选取增益系数最大的阈值和缺测的位置,这里缺测的位置供未来参考</li>
<li>Cache-Aware Access</li>
</ol>
<p><img src="C:\Users\tangyuheng\AppData\Roaming\Typora\typora-user-images\image-20211223183314004.png" alt="image-20211223183314004"></p>
<ol start="6">
<li>Blocks for Out-of-Core Computation CPU可以同时读取两个硬盘中的内容,加快读取速度</li>
</ol>
<h2 id="Distance-correlation(距离相关系数)"><a href="#Distance-correlation(距离相关系数)" class="headerlink" title="Distance correlation(距离相关系数)"></a>Distance correlation(距离相关系数)</h2><p>距离相关系数可以测量高维度的非线性关系,普通的相关系数比如皮尔森系数只适用于测量一元线性关系,且可以给出线性关系是正向的还是负向的。</p>
<p>In <a target="_blank" rel="noopener" href="https://en.wikipedia.org/wiki/Statistics">statistics</a> and in <a target="_blank" rel="noopener" href="https://en.wikipedia.org/wiki/Probability_theory">probability theory</a>, <strong>distance correlation</strong> or <strong>distance covariance</strong> is a measure of <a target="_blank" rel="noopener" href="https://en.wikipedia.org/wiki/Independence_(probability_theory)">dependence</a> between two paired <a target="_blank" rel="noopener" href="https://en.wikipedia.org/wiki/Random_vector">random vectors</a> of arbitrary, not necessarily equal, <a target="_blank" rel="noopener" href="https://en.wikipedia.org/wiki/Euclidean_vector">dimension</a>. The population distance correlation coefficient is zero if and only if the random vectors are <a target="_blank" rel="noopener" href="https://en.wikipedia.org/wiki/Independence_(probability_theory)">independent</a>. Thus, distance correlation measures both linear and nonlinear association between two random variables or random vectors. This is in contrast to <a target="_blank" rel="noopener" href="https://en.wikipedia.org/wiki/Pearson's_correlation">Pearson’s correlation</a>, which can only detect linear association between two <a target="_blank" rel="noopener" href="https://en.wikipedia.org/wiki/Random_variable">random variables</a>.</p>
<p>Distance correlation can be used to perform a <a target="_blank" rel="noopener" href="https://en.wikipedia.org/wiki/Statistical_hypothesis_testing">statistical test</a> of dependence with a <a target="_blank" rel="noopener" href="https://en.wikipedia.org/wiki/Permutation_test">permutation test</a>. One first computes the distance correlation (involving the re-centering of Euclidean distance matrices) between two random vectors, and then compares this value to the distance correlations of many shuffles of the data.</p>
<p><img src="https://img-blog.csdn.net/20180611174521364?watermark/2/text/aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2ppYW9hb2RlY2h1bmx2/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70" alt="img"></p>
<p><img src="https://img-blog.csdn.net/20180611174553732?watermark/2/text/aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2ppYW9hb2RlY2h1bmx2/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70" alt="img"></p>
<p>[11] 王黎明, 吴香华, 赵天良,等. 基于距离相关系数和支持向量机回归的PM_(2.5)浓度滚动统计预报方案[J]. 环境科学学报, 2017,37(4):1268-1276.(我是从这篇论文上找的,维基百科上有更细致的,可惜我看不下去啊)</p>
</div>
<footer class="post-footer">
<div class="post-eof"></div>
</footer>
</article>
<article itemscope itemtype="http://schema.org/Article" class="post-block" lang="zh-CN">
<link itemprop="mainEntityOfPage" href="http://example.com/2022/04/05/%E6%88%91%E7%9A%84%E5%8D%9A%E5%AE%A2/">
<span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
<meta itemprop="image" content="/images/avatar.gif">
<meta itemprop="name" content="籍虞兰羽">
<meta itemprop="description" content="">
</span>
<span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
<meta itemprop="name" content="5880">
</span>
<header class="post-header">
<h2 class="post-title" itemprop="name headline">
<a href="/2022/04/05/%E6%88%91%E7%9A%84%E5%8D%9A%E5%AE%A2/" class="post-title-link" itemprop="url">我的博客</a>
</h2>
<div class="post-meta">
<span class="post-meta-item">
<span class="post-meta-item-icon">
<i class="far fa-calendar"></i>
</span>
<span class="post-meta-item-text">发表于</span>
<time title="创建时间:2022-04-05 17:24:48" itemprop="dateCreated datePublished" datetime="2022-04-05T17:24:48+08:00">2022-04-05</time>
</span>
</div>
</header>
<div class="post-body" itemprop="articleBody">
</div>
<footer class="post-footer">
<div class="post-eof"></div>
</footer>
</article>
<article itemscope itemtype="http://schema.org/Article" class="post-block" lang="zh-CN">
<link itemprop="mainEntityOfPage" href="http://example.com/2022/04/05/my-first-blog/">
<span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
<meta itemprop="image" content="/images/avatar.gif">
<meta itemprop="name" content="籍虞兰羽">
<meta itemprop="description" content="">
</span>
<span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
<meta itemprop="name" content="5880">
</span>
<header class="post-header">
<h2 class="post-title" itemprop="name headline">
<a href="/2022/04/05/my-first-blog/" class="post-title-link" itemprop="url">my-first-blog</a>
</h2>
<div class="post-meta">
<span class="post-meta-item">
<span class="post-meta-item-icon">
<i class="far fa-calendar"></i>
</span>
<span class="post-meta-item-text">发表于</span>
<time title="创建时间:2022-04-05 17:20:44 / 修改时间:17:24:32" itemprop="dateCreated datePublished" datetime="2022-04-05T17:20:44+08:00">2022-04-05</time>
</span>
</div>
</header>
<div class="post-body" itemprop="articleBody">
<h2 id="这是我第一篇网络博客"><a href="#这是我第一篇网络博客" class="headerlink" title="这是我第一篇网络博客"></a>这是我第一篇网络博客</h2><p><strong>作者:籍虞兰羽</strong></p>
<p>你好,互联网</p>
</div>
<footer class="post-footer">
<div class="post-eof"></div>
</footer>
</article>
<article itemscope itemtype="http://schema.org/Article" class="post-block" lang="zh-CN">
<link itemprop="mainEntityOfPage" href="http://example.com/2022/04/05/hello-world/">
<span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
<meta itemprop="image" content="/images/avatar.gif">
<meta itemprop="name" content="籍虞兰羽">
<meta itemprop="description" content="">
</span>
<span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
<meta itemprop="name" content="5880">
</span>
<header class="post-header">
<h2 class="post-title" itemprop="name headline">
<a href="/2022/04/05/hello-world/" class="post-title-link" itemprop="url">Hello World</a>
</h2>
<div class="post-meta">
<span class="post-meta-item">
<span class="post-meta-item-icon">
<i class="far fa-calendar"></i>
</span>
<span class="post-meta-item-text">发表于</span>
<time title="创建时间:2022-04-05 16:26:06" itemprop="dateCreated datePublished" datetime="2022-04-05T16:26:06+08:00">2022-04-05</time>
</span>
</div>
</header>
<div class="post-body" itemprop="articleBody">
<p>Welcome to <a target="_blank" rel="noopener" href="https://hexo.io/">Hexo</a>! This is your very first post. Check <a target="_blank" rel="noopener" href="https://hexo.io/docs/">documentation</a> for more info. If you get any problems when using Hexo, you can find the answer in <a target="_blank" rel="noopener" href="https://hexo.io/docs/troubleshooting.html">troubleshooting</a> or you can ask me on <a target="_blank" rel="noopener" href="https://github.com/hexojs/hexo/issues">GitHub</a>.</p>
<h2 id="Quick-Start"><a href="#Quick-Start" class="headerlink" title="Quick Start"></a>Quick Start</h2><h3 id="Create-a-new-post"><a href="#Create-a-new-post" class="headerlink" title="Create a new post"></a>Create a new post</h3><figure class="highlight bash"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">$ hexo new <span class="string">"My New Post"</span></span><br></pre></td></tr></table></figure>
<p>More info: <a target="_blank" rel="noopener" href="https://hexo.io/docs/writing.html">Writing</a></p>
<h3 id="Run-server"><a href="#Run-server" class="headerlink" title="Run server"></a>Run server</h3><figure class="highlight bash"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">$ hexo server</span><br></pre></td></tr></table></figure>
<p>More info: <a target="_blank" rel="noopener" href="https://hexo.io/docs/server.html">Server</a></p>
<h3 id="Generate-static-files"><a href="#Generate-static-files" class="headerlink" title="Generate static files"></a>Generate static files</h3><figure class="highlight bash"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">$ hexo generate</span><br></pre></td></tr></table></figure>
<p>More info: <a target="_blank" rel="noopener" href="https://hexo.io/docs/generating.html">Generating</a></p>
<h3 id="Deploy-to-remote-sites"><a href="#Deploy-to-remote-sites" class="headerlink" title="Deploy to remote sites"></a>Deploy to remote sites</h3><figure class="highlight bash"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">$ hexo deploy</span><br></pre></td></tr></table></figure>
<p>More info: <a target="_blank" rel="noopener" href="https://hexo.io/docs/one-command-deployment.html">Deployment</a></p>
</div>
<footer class="post-footer">
<div class="post-eof"></div>
</footer>
</article>
</div>
<script>
// Restore and remember which comment tab is active.
//
// On 'tabs:register': pick the preferred tab (the stored choice wins over the
// configured default when CONFIG.comments.storage is truthy) and click the
// matching `a[href="#comment-<name>"]` link if one exists.
// On 'tabs:click' (only wired up when storage is enabled): store the second
// class of the clicked tab pane under the 'comments_active' key.
//
// NOTE(review): the CONFIG.comments object serialized earlier in this page has an
// `active` key but no `activeClass`, so the destructured default below appears to
// be undefined here — confirm against whatever generates CONFIG.
window.addEventListener('tabs:register', () => {
  let { activeClass } = CONFIG.comments;
  if (CONFIG.comments.storage) {
    try {
      activeClass = localStorage.getItem('comments_active') || activeClass;
    } catch (err) {
      // Storage can be unavailable (e.g. blocked by browser settings);
      // fall back to the configured default instead of aborting the listener.
    }
  }
  if (!activeClass) return;
  // The stored value is user-modifiable, so escape it before building the selector;
  // an unescaped quote or backslash would make querySelector throw a SyntaxError.
  const activeTab = document.querySelector(`a[href="#comment-${CSS.escape(activeClass)}"]`);
  if (activeTab) {
    activeTab.click();
  }
});
if (CONFIG.comments.storage) {
  window.addEventListener('tabs:click', event => {
    if (!event.target.matches('.tabs-comment .tab-content .tab-pane')) return;
    // Relies on the pane's second class being the comment-system name.
    const commentClass = event.target.classList[1];
    try {
      localStorage.setItem('comments_active', commentClass);
    } catch (err) {
      // Persisting the choice is best-effort; ignore blocked or full storage.
    }
  });
}
</script>
</div>
<div class="toggle sidebar-toggle">
<span class="toggle-line toggle-line-first"></span>
<span class="toggle-line toggle-line-middle"></span>
<span class="toggle-line toggle-line-last"></span>
</div>
<aside class="sidebar">
<div class="sidebar-inner">
<ul class="sidebar-nav motion-element">
<li class="sidebar-nav-toc">
文章目录
</li>
<li class="sidebar-nav-overview">
站点概览
</li>
</ul>
<!--noindex-->
<div class="post-toc-wrap sidebar-panel">
</div>
<!--/noindex-->
<div class="site-overview-wrap sidebar-panel">
<div class="site-author motion-element" itemprop="author" itemscope itemtype="http://schema.org/Person">
<p class="site-author-name" itemprop="name">籍虞兰羽</p>
<div class="site-description" itemprop="description"></div>
</div>
<div class="site-state-wrap motion-element">
<nav class="site-state">
<div class="site-state-item site-state-posts">
<a href="/archives/">
<span class="site-state-item-count">4</span>
<span class="site-state-item-name">日志</span>
</a>
</div>
</nav>
</div>
<div class="links-of-author motion-element">
<span class="links-of-author-item">
<a href="mailto:[email protected]" title="E-Mail → mailto:[email protected]" rel="noopener" target="_blank"><i class="fa fa-envelope fa-fw"></i>E-Mail</a>
</span>
</div>
</div>
</div>
</aside>
<div id="sidebar-dimmer"></div>
</div>
</main>
<footer class="footer">
<div class="footer-inner">
<div class="copyright">
©
<span itemprop="copyrightYear">2022</span>
<span class="with-love">
<i class="fa fa-heart"></i>
</span>
<span class="author" itemprop="copyrightHolder">籍虞兰羽</span>
</div>
<div class="powered-by">由 <a href="https://hexo.io/" class="theme-link" rel="noopener" target="_blank">Hexo</a> & <a href="https://pisces.theme-next.org/" class="theme-link" rel="noopener" target="_blank">NexT.Pisces</a> 强力驱动
</div>
<div class="busuanzi-count">
<script async src="https://busuanzi.ibruce.info/busuanzi/2.3/busuanzi.pure.mini.js"></script>
<span class="post-meta-item" id="busuanzi_container_site_uv" style="display: none;">
<span class="post-meta-item-icon">
<i class="fa fa-user"></i>
</span>
<span class="site-uv" title="总访客量">
<span id="busuanzi_value_site_uv"></span>
</span>
</span>
<span class="post-meta-divider">|</span>
<span class="post-meta-item" id="busuanzi_container_site_pv" style="display: none;">
<span class="post-meta-item-icon">
<i class="fa fa-eye"></i>
</span>
<span class="site-pv" title="总访问量">
<span id="busuanzi_value_site_pv"></span>
</span>
</span>
</div>
</div>
</footer>
</div>
<script size="300" alpha="0.6" zIndex="-1" src="/lib/canvas-ribbon/canvas-ribbon.js"></script>
<script src="/lib/anime.min.js"></script>
<script src="/lib/velocity/velocity.min.js"></script>
<script src="/lib/velocity/velocity.ui.min.js"></script>
<script src="/js/utils.js"></script>
<script src="/js/motion.js"></script>
<script src="/js/schemes/pisces.js"></script>
<script src="/js/next-boot.js"></script>
</body>
</html>