<!-- index.html -->

<!DOCTYPE html>
<html lang="zh-CN">
<head>
  <meta charset="UTF-8">
<meta name="viewport" content="width=device-width">
<meta name="theme-color" content="#222"><meta name="generator" content="Hexo 7.3.0">

  <link rel="apple-touch-icon" sizes="180x180" href="/images/apple-touch-icon-next.png">
  <link rel="icon" type="image/png" sizes="32x32" href="/images/favicon-32x32-next.png">
  <link rel="icon" type="image/png" sizes="16x16" href="/images/favicon-16x16-next.png">
  <link rel="mask-icon" href="/images/logo.svg" color="#222">

<link rel="stylesheet" href="/css/main.css">


<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.6.0/css/all.min.css" integrity="sha256-5eIC48iZUHmSlSUz9XtjRyK2mzQkHScZY1WdMaoz74E=" crossorigin="anonymous">
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/animate.css/3.1.1/animate.min.css" integrity="sha256-PR7ttpcvz8qrF57fur/yAx1qXMFJeJFiA6pSzWi0OIE=" crossorigin="anonymous">
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/fancyapps-ui/5.0.31/fancybox/fancybox.css" integrity="sha256-gkQVf8UKZgQ0HyuxL/VnacadJ+D2Kox2TCEBuNQg5+w=" crossorigin="anonymous">

<script class="next-config" data-name="main" type="application/json">{"hostname":"chengmingbo.github.io","root":"/","images":"/images","scheme":"Gemini","darkmode":false,"version":"8.21.0","exturl":false,"sidebar":{"position":"right","width_expanded":320,"width_dual_column":240,"display":"always","padding":18,"offset":12,"b2t":true,"scrollpercent":true,"onmobile":false},"hljswrap":true,"copycode":{"enable":false,"style":null},"fold":{"enable":false,"height":500},"bookmark":{"enable":false,"color":"#222","save":"auto"},"mediumzoom":false,"lazyload":false,"pangu":false,"comments":{"style":"tabs","active":null,"storage":true,"lazyload":false,"nav":null},"stickytabs":false,"motion":{"enable":true,"async":false,"transition":{"menu_item":"fadeInDown","post_block":"fadeIn","post_header":"slideDownIn","post_body":"slideDownIn","coll_header":"slideLeftIn","sidebar":"slideUpIn"}},"i18n":{"placeholder":"搜索...","empty":"没有找到任何搜索结果：${query}","hits_time":"找到 ${hits} 个搜索结果（用时 ${time} 毫秒）","hits":"找到 ${hits} 个搜索结果"}}</script><script src="/js/config.js"></script>

    <meta name="description" content="Mingbo">
<meta property="og:type" content="website">
<meta property="og:title" content="Mingbo">
<meta property="og:url" content="http://chengmingbo.github.io/index.html">
<meta property="og:site_name" content="Mingbo">
<meta property="og:description" content="Mingbo">
<meta property="og:locale" content="zh_CN">
<meta property="article:author" content="Mingbo Cheng">
<meta name="twitter:card" content="summary">


<link rel="canonical" href="http://chengmingbo.github.io/">


<script class="next-config" data-name="page" type="application/json">{"sidebar":"","isHome":true,"isPost":false,"lang":"zh-CN","comments":"","permalink":"","path":"index.html","title":""}</script>

<script class="next-config" data-name="calendar" type="application/json">""</script>
<title>Mingbo</title>
  

  <noscript>
    <link rel="stylesheet" href="/css/noscript.css">
  </noscript>
<!-- hexo injector head_end start -->
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.12.0/dist/katex.min.css">

<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/hexo-math@4.0.0/dist/style.css">
<!-- hexo injector head_end end --></head>

<body itemscope itemtype="http://schema.org/WebPage" class="use-motion">
  <div class="headband"></div>

  <main class="main">
    <div class="column">
      <header class="header" itemscope itemtype="http://schema.org/WPHeader"><div class="site-brand-container">
  <div class="site-nav-toggle">
    <div class="toggle" aria-label="切换导航栏" role="button">
        <span class="toggle-line"></span>
        <span class="toggle-line"></span>
        <span class="toggle-line"></span>
    </div>
  </div>

  <div class="site-meta">

    <a href="/" class="brand" rel="start">
      <i class="logo-line"></i>
      <h1 class="site-title">Mingbo</h1>
      <i class="logo-line"></i>
    </a>
      <img class="custom-logo-image" src="/images/logo.svg" alt="Mingbo">
  </div>

  <div class="site-nav-right">
    <div class="toggle popup-trigger" aria-label="搜索" role="button">
    </div>
  </div>
</div>


<nav class="site-nav">
  <ul class="main-menu menu"><li class="menu-item menu-item-home"><a href="/" rel="section"><i class="fa fa-home fa-fw"></i>首页</a></li><li class="menu-item menu-item-slides"><a href="/slides/" rel="section"><i class="fa fa-chart-area fa-fw"></i>slides</a></li><li class="menu-item menu-item-archives"><a href="/archives/" rel="section"><i class="fa fa-archive fa-fw"></i>归档</a></li><li class="menu-item menu-item-about"><a href="/about/" rel="section"><i class="fa fa-user fa-fw"></i>关于</a></li>
  </ul>
</nav>


</header>
        
  
  <aside class="sidebar">

    <div class="sidebar-inner sidebar-overview-active">
      <ul class="sidebar-nav">
        <li class="sidebar-nav-toc">
          文章目录
        </li>
        <li class="sidebar-nav-overview">
          站点概览
        </li>
      </ul>

      <div class="sidebar-panel-container">
        <!--noindex-->
        <div class="post-toc-wrap sidebar-panel">
        </div>
        <!--/noindex-->

        <div class="site-overview-wrap sidebar-panel">
          <div class="site-author animated" itemprop="author" itemscope itemtype="http://schema.org/Person">
  <p class="site-author-name" itemprop="name">Mingbo Cheng</p>
  <div class="site-description" itemprop="description">Mingbo</div>
</div>
<div class="site-state-wrap animated">
  <nav class="site-state">
      <div class="site-state-item site-state-posts">
        <a href="/archives/">
          <span class="site-state-item-count">20</span>
          <span class="site-state-item-name">日志</span>
        </a>
      </div>
      <div class="site-state-item site-state-categories">
        <span class="site-state-item-count">1</span>
        <span class="site-state-item-name">分类</span>
      </div>
      <div class="site-state-item site-state-tags">
        <span class="site-state-item-count">5</span>
        <span class="site-state-item-name">标签</span>
      </div>
  </nav>
</div>
  <div class="links-of-author animated">
      <span class="links-of-author-item">
        <a href="https://github.com/chengmingbo" title="GitHub → https:&#x2F;&#x2F;github.com&#x2F;chengmingbo" rel="noopener me" target="_blank"><i class="fab fa-github fa-fw"></i>GitHub</a>
      </span>
  </div>

        </div>
      </div>
    </div>

    
    <div class="sidebar-inner sidebar-blogroll">
      <div class="links-of-blogroll animated">
        <div class="links-of-blogroll-title"><i class="fa fa-globe fa-fw"></i>
          链接
        </div>
        <ul class="links-of-blogroll-list">
            <li class="links-of-blogroll-item">
              <a href="https://deeplearningmath.org/supervised-machine-learning" title="https:&#x2F;&#x2F;deeplearningmath.org&#x2F;supervised-machine-learning" rel="noopener" target="_blank">deeplearningmath</a>
            </li>
            <li class="links-of-blogroll-item">
              <a href="https://lilianweng.github.io/archives/" title="https:&#x2F;&#x2F;lilianweng.github.io&#x2F;archives&#x2F;" rel="noopener" target="_blank">Lil'Log</a>
            </li>
            <li class="links-of-blogroll-item">
              <a href="https://www.zybuluo.com/codeep/note/163962" title="https:&#x2F;&#x2F;www.zybuluo.com&#x2F;codeep&#x2F;note&#x2F;163962" rel="noopener" target="_blank">mathjax grammar</a>
            </li>
            <li class="links-of-blogroll-item">
              <a href="http://vividfree.github.io/" title="http:&#x2F;&#x2F;vividfree.github.io&#x2F;" rel="noopener" target="_blank">vividfree</a>
            </li>
            <li class="links-of-blogroll-item">
              <a href="http://colah.github.io/" title="http:&#x2F;&#x2F;colah.github.io&#x2F;" rel="noopener" target="_blank">colah</a>
            </li>
            <li class="links-of-blogroll-item">
              <a href="https://www.autonlab.org/tutorials" title="https:&#x2F;&#x2F;www.autonlab.org&#x2F;tutorials" rel="noopener" target="_blank">Andrew Moore</a>
            </li>
            <li class="links-of-blogroll-item">
              <a href="https://plot.ly/matlab/plot/" title="https:&#x2F;&#x2F;plot.ly&#x2F;matlab&#x2F;plot&#x2F;" rel="noopener" target="_blank">matlabplot</a>
            </li>
            <li class="links-of-blogroll-item">
              <a href="http://www.ryanzhang.info/blog/" title="http:&#x2F;&#x2F;www.ryanzhang.info&#x2F;blog&#x2F;" rel="noopener" target="_blank">Ryan’s Cabinet</a>
            </li>
            <li class="links-of-blogroll-item">
              <a href="http://www.cnblogs.com/jerrylead/tag/Machine%20Learning/" title="http:&#x2F;&#x2F;www.cnblogs.com&#x2F;jerrylead&#x2F;tag&#x2F;Machine%20Learning&#x2F;" rel="noopener" target="_blank">JerryLead</a>
            </li>
            <li class="links-of-blogroll-item">
              <a href="https://yxzf.github.io/" title="https:&#x2F;&#x2F;yxzf.github.io&#x2F;" rel="noopener" target="_blank">YXZF'S BLOG</a>
            </li>
            <li class="links-of-blogroll-item">
              <a href="http://vonng.com/" title="http:&#x2F;&#x2F;vonng.com" rel="noopener" target="_blank">VONNG</a>
            </li>
        </ul>
      </div>
    </div>
  </aside>


    </div>

    <div class="main-inner index posts-expand">

    
<div class="post-block">
  
  
  <article itemscope itemtype="http://schema.org/Article" class="post-content" lang="en">
    <link itemprop="mainEntityOfPage" href="http://chengmingbo.github.io/2023/05/10/from_laplacian_to_hodge_laplacian/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="image" content="/images/avatar.gif">
      <meta itemprop="name" content="Mingbo Cheng">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="Mingbo">
      <meta itemprop="description" content="Mingbo">
    </span>

    <span hidden itemprop="post" itemscope itemtype="http://schema.org/CreativeWork">
      <meta itemprop="name" content="From graph laplacian to hodge laplacian | Mingbo">
      <meta itemprop="description" content="">
    </span>
      <header class="post-header">
        <h2 class="post-title" itemprop="name headline">
          <a href="/2023/05/10/from_laplacian_to_hodge_laplacian/" class="post-title-link" itemprop="url">From graph laplacian to hodge laplacian</a>
        </h2>

        <div class="post-meta-container">
          <div class="post-meta">
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-calendar"></i>
      </span>
      <span class="post-meta-item-text">发表于</span>

      <time title="创建时间：2023-05-10 21:17:56" itemprop="dateCreated datePublished" datetime="2023-05-10T21:17:56+02:00">2023-05-10</time>
    </span>

  
</div>

        </div>
      </header>

    
    <div class="post-body" itemprop="articleBody">
          <h2 id="preface">Preface</h2>
<p>The graph laplacian is familiar to computational science researchers; with
it we can perform spectral analyses such as diffusion maps, eigenmaps
or spectral clustering. Here, we discuss how to generalize the graph
laplacian to its higher-order form, i.e., the Hodge laplacian.</p>
<h2 id="graph-laplacian">Graph laplacian</h2>
<p>In the previous post we know that the graph laplacian can be obtained
by degree matrix <span class="math inline">\(D\)</span> and adjacency
matrix <span class="math inline">\(A\)</span> such that: <span
class="math display">\[\begin{equation}
L_0 = D - A
\end{equation}\]</span></p>
<p>We can find the adjacency matrix and degree matrix of the graph below
with nine vertices: <img
src="https://cmb.oss-cn-qingdao.aliyuncs.com/hodge_graph_9_nodes.svg" alt="Example graph with nine vertices in three connected components" /></p>
<p>such that:</p>
<p><span class="math display">\[\begin{equation}
A = \begin{array}[r]{c |c c c c c c c c c }
           &amp; 1 &amp; 2 &amp; 3 &amp; 4 &amp; 5 &amp; 6 &amp; 7 &amp;
8 &amp; 9\\
            \hline
             1&amp; 0 &amp; 1 &amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0
&amp; 0 &amp; 0\\
             2&amp; 1 &amp; 0 &amp; 1 &amp; 0 &amp; 0 &amp; 0 &amp; 0
&amp; 0 &amp; 0\\
             3&amp; 0 &amp; 1 &amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0
&amp; 0 &amp; 0\\
             4&amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 1 &amp; 1 &amp; 0
&amp; 0 &amp; 0\\
             5&amp; 0 &amp; 0 &amp; 0 &amp; 1 &amp; 0 &amp; 1 &amp; 0
&amp; 0 &amp; 0\\
             6&amp; 0 &amp; 0 &amp; 0 &amp; 1 &amp; 1 &amp; 0 &amp; 1
&amp; 0 &amp; 0\\
             7&amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 1 &amp; 0
&amp; 0 &amp; 0\\
             8&amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0
&amp; 0 &amp; 1\\
             9&amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0
&amp; 1 &amp; 0\\
    \end{array}
\end{equation}\]</span></p>
<p>and</p>
<p><span class="math display">\[\begin{equation}
D = \begin{array}[r]{c |c c c c c c c c c }
           &amp; 1 &amp; 2 &amp; 3 &amp; 4 &amp; 5 &amp; 6 &amp; 7 &amp;
8 &amp; 9\\
            \hline
             1&amp; 1 &amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0
&amp; 0 &amp; 0\\
             2&amp; 0 &amp; 2 &amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0
&amp; 0 &amp; 0\\
             3&amp; 0 &amp; 0 &amp; 1 &amp; 0 &amp; 0 &amp; 0 &amp; 0
&amp; 0 &amp; 0\\
             4&amp; 0 &amp; 0 &amp; 0 &amp; 2 &amp; 0 &amp; 0 &amp; 0
&amp; 0 &amp; 0\\
             5&amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 2 &amp; 0 &amp; 0
&amp; 0 &amp; 0\\
             6&amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 3 &amp; 0
&amp; 0 &amp; 0\\
             7&amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 1
&amp; 0 &amp; 0\\
             8&amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0
&amp; 1 &amp; 0\\
             9&amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0
&amp; 0 &amp; 1\\
    \end{array}
\end{equation}\]</span></p>
<p>And the graph laplacian thus can be calculated as:</p>
<p><span class="math display">\[\begin{equation}
L_0 = D - A =  \begin{array}[r]{c |c c c c c c c c c }
           &amp; 1 &amp; 2 &amp; 3 &amp; 4 &amp; 5 &amp; 6 &amp; 7 &amp;
8 &amp; 9\\
            \hline
             1&amp; 1 &amp; -1 &amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0
&amp; 0 &amp; 0\\
             2&amp; -1 &amp; 2 &amp; -1 &amp; 0 &amp; 0 &amp; 0 &amp; 0
&amp; 0 &amp; 0\\
             3&amp; 0 &amp; -1 &amp; 1 &amp; 0 &amp; 0 &amp; 0 &amp; 0
&amp; 0 &amp; 0\\
             4&amp; 0 &amp; 0 &amp; 0 &amp; 2 &amp; -1 &amp; -1 &amp; 0
&amp; 0 &amp; 0\\
             5&amp; 0 &amp; 0 &amp; 0 &amp; -1 &amp; 2 &amp; -1 &amp; 0
&amp; 0 &amp; 0\\
             6&amp; 0 &amp; 0 &amp; 0 &amp; -1 &amp; -1 &amp; 3 &amp; -1
&amp; 0 &amp; 0\\
             7&amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; -1 &amp; 1
&amp; 0 &amp; 0\\
             8&amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0
&amp; 1 &amp; -1\\
             9&amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0
&amp; -1 &amp; 1\\
    \end{array}
\end{equation}\]</span></p>
<p>However, there's another way to represent the graph laplacian. Before
that, we first introduce the incidence matrix of a graph. First, we can
convert an undirected graph into a directed one by simply assigning the
direction of each edge in bookkeeping order. Next, let's construct the
incidence matrix: each row represents a vertex, and each column an edge.
We set the entry to 1 if the edge enters the vertex and -1 if it leaves
the vertex, and to 0 if there's no connection between the vertex and the
edge. Thus the incidence matrix of a graph <span class="math inline">\(G\)</span> is
<span class="math inline">\(B_1\)</span> such that: <span
class="math display">\[\begin{equation}
B_1 =
    \begin{array}[r]{c | c c c c c c c}
              &amp; \cdots &amp; [4,5] &amp; [4,6]&amp; [5,6]&amp;
[6,7]&amp; \cdots\\
            \hline
            \vdots &amp; \cdots &amp; &amp; \cdots &amp;  &amp;
\cdots  &amp;  \\
            4 &amp; \cdots &amp; -1 &amp; -1 &amp; 0 &amp; 0
&amp;  \cdots\\
            5 &amp; \cdots &amp; 1 &amp; 0 &amp; -1 &amp; 0 &amp;
\cdots\\
            6 &amp; \cdots &amp; 0 &amp; 1 &amp; 1 &amp; -1 &amp;
\cdots\\
            7 &amp; \cdots &amp; 0 &amp; 0 &amp; 0 &amp; 1 &amp;
\cdots\\
             \vdots &amp; \cdots &amp; &amp; \cdots &amp;  &amp;
\cdots  &amp;  \\
    \end{array}
\end{equation}\]</span> Next, let's construct the graph laplacian matrix using
<span class="math inline">\(B_1\)</span>. Actually, the graph laplacian
is: <span class="math display">\[\begin{equation}
L_0=B_1 B_1^\top =  \begin{array}[r]{c |c c c c c c c c c }
           &amp; 1 &amp; 2 &amp; 3 &amp; 4 &amp; 5 &amp; 6 &amp; 7 &amp;
8 &amp; 9\\
            \hline
             1&amp; 1 &amp; -1 &amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0
&amp; 0 &amp; 0\\
             2&amp; -1 &amp; 2 &amp; -1 &amp; 0 &amp; 0 &amp; 0 &amp; 0
&amp; 0 &amp; 0\\
             3&amp; 0 &amp; -1 &amp; 1 &amp; 0 &amp; 0 &amp; 0 &amp; 0
&amp; 0 &amp; 0\\
             4&amp; 0 &amp; 0 &amp; 0 &amp; 2 &amp; -1 &amp; -1 &amp; 0
&amp; 0 &amp; 0\\
             5&amp; 0 &amp; 0 &amp; 0 &amp; -1 &amp; 2 &amp; -1 &amp; 0
&amp; 0 &amp; 0\\
             6&amp; 0 &amp; 0 &amp; 0 &amp; -1 &amp; -1 &amp; 3 &amp; -1
&amp; 0 &amp; 0\\
             7&amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; -1 &amp; 1
&amp; 0 &amp; 0\\
             8&amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0
&amp; 1 &amp; -1\\
             9&amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0 &amp; 0
&amp; -1 &amp; 1\\
\end{array}
\end{equation}\]</span> As we can see, <span
class="math inline">\(L_0\)</span> is the same as that obtained from
the degree matrix and adjacency matrix.</p>
<h2 id="zero-eigenvaules-of-graph-laplacian">Zero eigenvalues of graph
laplacian</h2>
<p>We know that for applications such as diffusion maps or spectral
analysis, we have to drop the first eigenvector because its entries are
all the same and its eigenvalue is 0. The second eigenvector is also
called the Fiedler vector. However, these applications usually create a
connected graph, whereas the graph we show here is a disconnected graph.
We can stop here to take a guess: is the zero eigenvalue still there,
and if so, how many zero eigenvalues are there?</p>
<p>Now, let's perform the eigenvalue decomposition: <span
class="math display">\[\begin{equation}
L_0 = Q\Lambda Q^\top
\end{equation}\]</span></p>
<p>Interestingly, the top 3 eigenvalues are all zero and we also have 3
connected components in the graph. Is this a coincidence or a property
of graph laplacian decomposition? <img
src="https://cmb.oss-cn-qingdao.aliyuncs.com/hodge_graph_9_nodes_eigenvalues.svg" alt="Eigenvalues of the graph laplacian of the nine-vertex graph" /></p>
<p>Furthermore, the top 3 eigenvectors corresponding to the zero
eigenvalues are also interesting. Notice that the first eigenvector can
differentiate the component <span
class="math inline">\(\{4,5,6,7\}\)</span> from the rest of the vertices.
Likewise, the second eigenvector selects the connected component <span
class="math inline">\(\{8,9\}\)</span> and the third chooses the
component <span class="math inline">\(\{1,2,3\}\)</span>. <img
src="https://cmb.oss-cn-qingdao.aliyuncs.com/hodge_graph_9_nodes_eigenvectors.svg" alt="First three eigenvectors of the graph laplacian, one per connected component" /></p>
<p>In fact, in the field of Topological Data Analysis (TDA), <span
class="math inline">\(L_0\)</span> is a special case of the hodge
laplacian (<span class="math inline">\(L_k\)</span>). The connected
components are called 0-dimensional cycles, and the graph
laplacian can capture these cycles. The 1-dimensional cycles
correspond to holes; we will go into detail in the next section.</p>
<h2 id="hodge-laplacian">Hodge laplacian</h2>
<p>We can see the graph laplacian as the zero-order hodge laplacian, and the
formula can be represented as: <span
class="math display">\[\begin{equation}
L_0 = \mathbf{0}^\top \mathbf{0} + B_1 B_1^\top
\end{equation}\]</span></p>
<p>Similarly, we can obtain <span
class="math inline">\(L_1\)</span>:</p>
<p><span class="math display">\[\begin{equation}
L_1 = B_1^\top B_1 + B_2 B_2^\top
\end{equation}\]</span></p>
<p>You may ask where <span class="math inline">\(B_2\)</span> comes
from. We know that <span class="math inline">\(B_1\)</span> captures the
relationship between vertices and edges. Thus, <span
class="math inline">\(B_2\)</span> captures the relationship between
edges and triangles. We can also define <span
class="math inline">\(B_3\)</span> to capture the relationship between
triangles and tetrahedra, and so on and so forth. So what is a triangle
or a tetrahedron in a graph? We will not go into the details of the
theory of simplices and simplicial complexes. Here we just need to know that
three mutually connected vertices form a triangle. Similarly, four mutually
connected vertices form a tetrahedron, which is the higher-order analogue
of a triangle. To define <span class="math inline">\(B_2\)</span> of a graph <span
class="math inline">\(G\)</span>, we check the direction of an
edge <span class="math inline">\(e_j\)</span> relative to the triangle <span
class="math inline">\(\bigtriangleup_q\)</span> it belongs to: if it has
the same direction as the triangle, the entry is <span
class="math inline">\(1\)</span>; if the direction is opposite, the
entry is -1; otherwise the entry is zero. Specifically:</p>
<p><span class="math display">\[\begin{equation}
    {B}_2[j, q] =
       \begin{cases}
1 &amp; \text{if } e_j \in \bigtriangleup_q \quad \text{with}\quad
\text{same}\quad \text{direction}    \\
-1  &amp; \text{if } e_j \in \bigtriangleup_q \quad \text{with}\quad
\text{opposite}\quad \text{direction} \\
        0  &amp; \text{otherwise}
       \end{cases}
\end{equation}\]</span></p>
<p>With the definition, we can obtain <span
class="math inline">\(B_2\)</span> of the graph aforementioned: <span
class="math display">\[\begin{equation}
B_2 =
    \begin{array}[r]{c | c }
            &amp; [4,5,6]\\
            \hline
             \vdots &amp; \cdots \\
              [4,5] &amp; 1\\
              [4,6] &amp; -1\\
              [5,6]&amp; 1\\
              [6,7] &amp; 0\\
              \vdots &amp; \cdots \\
    \end{array}
\end{equation}\]</span></p>
<p>We next introduce the normalized form and the decomposition of hodge
1-laplacian. The normalized form of hodge 1-laplacian is given:</p>
<p><span class="math display">\[\begin{equation}
        \mathcal{L}_1 = {D}_2 {B}_1^\top {D}_1^{-1} {B}_1 + {B}_2 {D}_3
{B}_2^\top {D}_2^{-1}
\end{equation}\]</span> where <span
class="math inline">\(\mathbf{D}_1\)</span> is the vertices degree
matrix, <span class="math inline">\({D}_2\)</span> is <span
class="math inline">\(\max{(\text{diag}(|{B}_2| \mathbf{1}),
\mathbf{I})}\)</span> and <span class="math inline">\({D}_3\)</span> is
<span class="math inline">\(\frac{1}{3}\mathbf{I}\)</span>. Since the
normalized <span class="math inline">\(L_1\)</span> is not necessarily
symmetric, we next need to define the symmetric normalized Hodge
1-Laplacian such that: <span class="math display">\[\begin{equation}
    \begin{aligned}
        \mathcal{L}_1^s
        &amp; = {D}_2^{-1/2} \mathcal{L}_1 {D}_2^{1/2}\\
        &amp; = {D}_2^{1/2} {B}_1^\top {D}_1^{-1} {B}_1 {D}_2^{1/2} +
{D}_2^{-1/2} {B}_2 {D}_3 {B}_2^\top {D}_2^{-1/2}
    \end{aligned}
\end{equation}\]</span></p>
<p>We use the graph with three holes to illustrate the hodge 1-laplacian: <img
src="https://cmb.oss-cn-qingdao.aliyuncs.com/hodge_graph_3holes.svg" alt="Example graph with three holes" /></p>
<p>Next, we can perform eigenvalue decomposition on <span
class="math inline">\(\mathcal{L}_1\)</span>: <span
class="math display">\[\begin{equation}
        \begin{aligned}
            \mathcal{L}_1
             &amp; = \mathbf{D}_2^{1/2} \mathcal{L}_1^s
\mathbf{D}_2^{-1/2}   \\
             &amp; =  \mathbf{D}_2^{1/2} Q \Lambda Q^\top
\mathbf{D}_2^{-1/2} \\
             &amp; = \mathbf{U} \Lambda \mathbf{U}^{-1}
        \end{aligned}
\end{equation}\]</span> Interestingly, the top 3 eigenvalues are also all
zero, corresponding to the 3 holes, namely, the three 1-dimensional
cycles. <img
src="https://cmb.oss-cn-qingdao.aliyuncs.com/hodge_graph_3holes_eigenvalues.svg" alt="Eigenvalues of the hodge 1-laplacian of the three-hole graph" /></p>
<p>When it comes to the eigenvectors, we can also notice that the top
three eigenvectors are concentrated around the three holes. <img
src="https://cmb.oss-cn-qingdao.aliyuncs.com/hodge_graph_3holes_eigenvectors.svg" alt="Top three eigenvectors of the hodge 1-laplacian, each concentrated around one hole" /></p>
<p>In algebraic geometry, these eigenvectors with zero eigenvalues
are called harmonic functions or harmonics. These harmonic functions around
holes are useful for analyses such as clustering to find the
1-dimensional cycles, etc.</p>
<h2 id="conclusion">Conclusion</h2>
<p>Today, we reviewed the graph laplacian and found that the zero
eigenvalues and their corresponding eigenvectors can be used to find
connected components. The higher-order graph laplacian, the hodge laplacian,
has similar properties; we presented the hodge 1-laplacian and its
eigenvalue decomposition. We found that the number of zero eigenvalues
indicates the number of holes of a graph. Furthermore, the corresponding
eigenvectors with zero eigenvalues are concentrated around the holes.</p>

      
    </div>

    
    <footer class="post-footer">
        <div class="post-eof"></div>
      
    </footer>
  </article>
</div>


<div class="post-block">
  
  
  <article itemscope itemtype="http://schema.org/Article" class="post-content" lang="en">
    <link itemprop="mainEntityOfPage" href="http://chengmingbo.github.io/2020/12/07/spectral_2/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="image" content="/images/avatar.gif">
      <meta itemprop="name" content="Mingbo Cheng">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="Mingbo">
      <meta itemprop="description" content="Mingbo">
    </span>

    <span hidden itemprop="post" itemscope itemtype="http://schema.org/CreativeWork">
      <meta itemprop="name" content="Spectral analysis (2) | Mingbo">
      <meta itemprop="description" content="">
    </span>
      <header class="post-header">
        <h2 class="post-title" itemprop="name headline">
          <a href="/2020/12/07/spectral_2/" class="post-title-link" itemprop="url">Spectral analysis (2)</a>
        </h2>

        <div class="post-meta-container">
          <div class="post-meta">
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-calendar"></i>
      </span>
      <span class="post-meta-item-text">发表于</span>

      <time title="创建时间：2020-12-07 21:10:01" itemprop="dateCreated datePublished" datetime="2020-12-07T21:10:01+01:00">2020-12-07</time>
    </span>

  
</div>

        </div>
      </header>

    
    <div class="post-body" itemprop="articleBody">
          <h2 id="diffussion-map">Diffusion map</h2>
<p>Eigenvalue decomposition of the weighted graph laplacian is used to
obtain the diffusion map. The asymmetric matrix, or transition matrix, is <span
class="math inline">\(P=D^{-1}W\)</span>. Its decomposition is: <span
class="math display">\[\begin{equation}
P = D^{-1/2}S\Lambda S^\top D^{1/2}
\end{equation}\]</span> In fact, the random walk on the graph would give
rise to the transition graph such that: <span
class="math display">\[\begin{equation}
p^{t+1}= p^t D^{-1}W = p^t P
\end{equation}\]</span> where <span class="math inline">\(p\)</span> is
an initial state of the graph (vertex weights). That is, after a certain
number of random-walk steps on the graph, we would reach a steady state in
which further steps would not change the weights. Let <span
class="math inline">\(Q=D^{-1/2}S\)</span>, then <span
class="math inline">\(Q^{-1}=S^{\top} D^{1/2}\)</span>, and we can derive
<span class="math inline">\(P = Q\Lambda Q^{-1}\)</span>. The random
walk mentioned above can then be represented as: <span
class="math display">\[\begin{equation}
\begin{aligned}
P^t &amp;= Q\Lambda Q^{-1}Q\Lambda Q^{-1}\cdots Q\Lambda Q^{-1}\\
    &amp;= Q\Lambda^t Q^{-1}
\end{aligned}
\end{equation}\]</span> Since <span
class="math inline">\(\Lambda\)</span> is diagonal, the random walk on a
graph can be easily calculated by using the eigenvalue decomposition
outcome. The columns of <span class="math inline">\(Q\)</span> are the
diffusion map dimensions.</p>
<h2 id="diffusion-map-example">Diffusion map example</h2>
<p>To understand the diffusion map, we here introduce an example dataset and
run a diffusion map on it. The data is a single-cell fibroblast-to-neuron
dataset with 392 cells. We first download the data from NCBI and load
it into memory:</p>
<figure class="highlight r"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br></pre></td><td class="code"><pre><span class="line">import requests</span><br><span class="line">url <span class="operator">=</span> <span class="string">&#x27;https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE67310&amp;format=file&amp;file=GSE67310%5FiN%5Fdata%5Flog2FPKM%5Fannotated.txt.gz&#x27;</span></span><br><span class="line">r <span class="operator">=</span> requests.get<span class="punctuation">(</span>url<span class="punctuation">,</span> allow_redirects<span class="operator">=</span>True<span class="punctuation">)</span></span><br><span class="line">open<span class="punctuation">(</span><span class="string">&#x27;GSE67310.txt.gz&#x27;</span><span class="punctuation">,</span> <span class="string">&#x27;wb&#x27;</span><span class="punctuation">)</span>.write<span class="punctuation">(</span>r.content<span class="punctuation">)</span></span><br><span class="line">data <span class="operator">=</span> pd.read_csv<span class="punctuation">(</span><span class="string">&#x27;GSE67310.txt.gz&#x27;</span><span class="punctuation">,</span> sep<span class="operator">=</span><span class="string">&#x27;\t&#x27;</span><span class="punctuation">,</span> index_col<span class="operator">=</span><span class="number">0</span><span class="punctuation">)</span></span><br><span class="line">data <span class="operator">=</span> data.loc<span class="punctuation">[</span>data<span class="punctuation">[</span><span class="string">&#x27;assignment&#x27;</span><span class="punctuation">]</span> <span class="operator">!=</span><span class="string">&#x27;Fibroblast&#x27;</span><span class="punctuation">]</span></span><br><span class="line">group <span class="operator">=</span> data<span 
class="punctuation">[</span><span class="string">&#x27;assignment&#x27;</span><span class="punctuation">]</span></span><br></pre></td></tr></table></figure>
<p>As we can see from the code, we extract the cell types from the
data and store them in the variable <code>group</code>. The rest
of the data is the normalized gene expression count matrix.</p>
<p>To do the pre-processing, we first get the log-normalized count
matrix <span class="math inline">\(X\)</span>, revert it to
counts, apply log geometric scaling to the count matrix, and
then store the result in matrix <span
class="math inline">\(Y\)</span>. <figure class="highlight r"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre></td><td class="code"><pre><span class="line">X <span class="operator">=</span> np.array<span class="punctuation">(</span>data.iloc<span class="punctuation">[</span><span class="operator">:</span><span class="punctuation">,</span> <span class="number">5</span><span class="operator">:</span><span class="punctuation">]</span><span class="punctuation">)</span>.T</span><br><span class="line">X <span class="operator">=</span> np.power<span class="punctuation">(</span><span class="number">2</span><span class="punctuation">,</span> X<span class="punctuation">[</span>np.apply_along_axis<span class="punctuation">(</span>np.var<span class="punctuation">,</span> <span class="number">1</span><span class="punctuation">,</span> X<span class="punctuation">)</span><span class="operator">&gt;</span><span class="number">0</span><span class="punctuation">,</span> <span class="operator">:</span><span class="punctuation">]</span><span class="punctuation">)</span> <span class="operator">-</span> <span class="number">1</span></span><br><span class="line">Y <span class="operator">=</span> np.log<span class="punctuation">(</span>gscale<span class="punctuation">(</span>X<span class="operator">+</span><span class="number">0.5</span><span class="punctuation">)</span><span class="punctuation">)</span>.T</span><br></pre></td></tr></table></figure> The geomatric scaling
implementation is: <figure class="highlight r"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br></pre></td><td class="code"><pre><span class="line">def gscale<span class="punctuation">(</span>X<span class="operator">:</span>np.ndarray<span class="punctuation">)</span> <span class="operator">-&gt;</span> np.ndarray<span class="operator">:</span></span><br><span class="line">    assert<span class="punctuation">(</span>X.all<span class="punctuation">(</span><span class="punctuation">)</span><span class="operator">&gt;=</span><span class="number">0</span><span class="punctuation">)</span></span><br><span class="line">    div_ <span class="operator">=</span> np.divide<span class="punctuation">(</span>X.T<span class="punctuation">,</span> np.apply_along_axis<span class="punctuation">(</span>lambda x<span class="operator">:</span>np.exp<span class="punctuation">(</span>np.mean<span class="punctuation">(</span>np.log<span class="punctuation">(</span>x<span class="punctuation">)</span><span class="punctuation">)</span><span class="punctuation">)</span><span class="punctuation">,</span> <span class="number">1</span><span class="punctuation">,</span></span><br><span class="line">    X<span class="punctuation">)</span><span class="punctuation">)</span>.T</span><br><span class="line">    scale_ <span class="operator">=</span> np.apply_along_axis<span class="punctuation">(</span>np.median<span class="punctuation">,</span><span class="number">0</span><span class="punctuation">,</span> div_<span class="punctuation">)</span></span><br><span class="line">    sc <span class="operator">=</span> StandardScaler<span class="punctuation">(</span>with_mean<span class="operator">=</span>False<span 
class="punctuation">)</span></span><br><span class="line">    sc.fit<span class="punctuation">(</span>X<span class="punctuation">)</span></span><br><span class="line">    sc.scale_ <span class="operator">=</span> scale_</span><br><span class="line">    <span class="built_in">return</span> sc.transform<span class="punctuation">(</span>X<span class="punctuation">)</span></span><br></pre></td></tr></table></figure> After the pre-processing, we next
run PCA to perform dimensionality reduction, use the PCs to calculate
Euclidean distances between cells, and then run the diffusion map.</p>
<figure class="highlight r"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre></td><td class="code"><pre><span class="line">pc <span class="operator">=</span> run_pca<span class="punctuation">(</span>Y<span class="punctuation">,</span> <span class="number">100</span><span class="punctuation">)</span></span><br><span class="line">R <span class="operator">=</span> distance_matrix<span class="punctuation">(</span>pc<span class="punctuation">,</span> pc<span class="punctuation">)</span></span><br><span class="line">d <span class="operator">=</span> diffusionMaps<span class="punctuation">(</span>R<span class="punctuation">,</span><span class="number">7</span><span class="punctuation">)</span></span><br></pre></td></tr></table></figure>
<p>We first take a look at the PCA figure, which shows the first 2 PCs of
the dataset. PCA is a linear method, which cannot reflect the cell fate
differentiation process. <img
src="https://cmb.oss-cn-qingdao.aliyuncs.com/fib2neuron_pca.png" />
Since the diffusion map applies the Gaussian kernel during the distance
calculation, it is usually a better way to capture cell differentiation
events. Here we show diffusion dimensions 1,2 and 1,3. It's clear that
1,2 captures the cell differentiation from fibroblasts to neurons, and 1,3
captures the differentiation from fibroblasts to myocytes. <img
src="https://cmb.oss-cn-qingdao.aliyuncs.com/fib2neuron_diffusions.png" /></p>
<p>We can change the bandwidth from 7 to 100, which uses the 100th
nearest neighbor as the bandwidth instead of the 7th. The following shows the
diffusion map with bandwidth=100. <img
src="https://cmb.oss-cn-qingdao.aliyuncs.com/fib2neuron_bandwidth100.png" /></p>
<h2 id="pseudo-time-by-random-walk">pseudo time by random walk</h2>
<p>The eigenvalue decomposition of the graph Laplacian can also be used to
infer the pseudo time, which simulates the timing of cell differentiation
events. Here we assign <span class="math inline">\(1/m\)</span> to each of the top
<span class="math inline">\(m\)</span> progenitor cells (MEF)
and 0 to all other cells as the initial state. Next, we perform a random walk
to get the new pseudo time <span class="math inline">\(u\)</span> such that:
<span class="math display">\[\begin{equation}
u = [\frac{1}{m}, \frac{1}{m}, \cdots, 0, \cdots 0](D^{-1}W)^t
\end{equation}\]</span> By testing different numbers of random walk steps
we can check the new pseudo time <span class="math inline">\(u\)</span>.
Here we show the results for 1, 10, 100 and 200 time steps. From the figure we
can see that after a certain number of steps, the pseudo time no longer
changes. That means the random walk has reached the steady state. <img
src="https://cmb.oss-cn-qingdao.aliyuncs.com/fib2neuron_pseudotime.png" /></p>
<p>To test when we can reach the steady state, I use the graph
mentioned in my last post <a
href="https://chengmingbo.github.io/2020/06/07/spectral/">Spectral
analysis (1)</a>:
<img src="https://cmb.oss-cn-qingdao.aliyuncs.com/a_graph.png" height="250px"></p>
<p>Here we randomly generate two initial states (<span
class="math inline">\(v_1\)</span>, <span
class="math inline">\(v_2\)</span>) to do the random walk such that:
<span class="math display">\[\begin{equation}
\begin{aligned}
v_1 &amp;= [0.15,0.41,0.54,0.9,0.62,0.93,0.1,0.46,0.01,0.88]\\
v_2 &amp;= [0.89,0.93,0.07,0.41,0.52,0.88,0.43,0.09,0.1,0.2]
\end{aligned}
\end{equation}\]</span></p>
<p>From the figure below we can see that <span
class="math inline">\(v_1\)</span> reaches the steady state in 30 steps
whereas <span class="math inline">\(v_2\)</span> reaches the steady state in
100 steps. In short, both initial states eventually reach a steady
state that no longer changes. <img
src="https://cmb.oss-cn-qingdao.aliyuncs.com/fib2neuron_steady.png" /></p>
<h2 id="spectral-clustering">spectral clustering</h2>
<p>The eigenvectors of the graph Laplacian can also be used to do
clustering. From the figure we can find a clear cut using the 1st, 2nd and
3rd diffusion dimensions. In practice, we can use the k-means, DBSCAN,
Leiden, or Louvain algorithm to perform clustering on the diffusion
dimensions. <img
src="https://cmb.oss-cn-qingdao.aliyuncs.com/fib2neuron_clustering.png" /></p>
<h2 id="appendix">Appendix</h2>
<h5 id="pca-function">PCA function</h5>
<figure class="highlight r"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br></pre></td><td class="code"><pre><span class="line">def run_pca<span class="punctuation">(</span>mtx<span class="punctuation">,</span> n_components<span class="operator">=</span><span class="number">2</span><span class="punctuation">,</span> random_state<span class="operator">=</span><span class="number">2022</span><span class="punctuation">)</span><span class="operator">:</span></span><br><span class="line">    dm <span class="operator">=</span> None</span><br><span class="line">        <span class="keyword">if</span> scipy.sparse.issparse<span class="punctuation">(</span>mtx<span class="punctuation">)</span><span class="operator">:</span></span><br><span class="line">        clf <span class="operator">=</span> TruncatedSVD<span class="punctuation">(</span>n_components<span class="punctuation">,</span> random_state<span class="operator">=</span>random_state<span class="punctuation">)</span></span><br><span class="line">        dm <span class="operator">=</span> clf.fit_transform<span class="punctuation">(</span>mtx<span class="punctuation">)</span></span><br><span class="line">    <span class="keyword">else</span><span class="operator">:</span></span><br><span class="line">        pca <span class="operator">=</span> PCA<span class="punctuation">(</span>n_components<span class="operator">=</span>n_components<span class="punctuation">,</span> random_state<span class="operator">=</span>random_state<span class="punctuation">)</span></span><br><span class="line">        dm <span class="operator">=</span> pca.fit_transform<span class="punctuation">(</span>mtx<span class="punctuation">)</span></span><br><span class="line">        <span 
class="built_in">return</span> dm</span><br></pre></td></tr></table></figure>
<h5 id="affinity-matrix-function">Affinity matrix function</h5>
<figure class="highlight r"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br></pre></td><td class="code"><pre><span class="line">def affinity<span class="punctuation">(</span>R<span class="punctuation">,</span> k<span class="operator">=</span><span class="number">7</span><span class="punctuation">,</span> sigma<span class="operator">=</span>None<span class="punctuation">,</span> <span class="built_in">log</span><span class="operator">=</span>False<span class="punctuation">)</span><span class="operator">:</span></span><br><span class="line">    <span class="string">&quot;&quot;</span><span class="string">&quot;</span></span><br><span class="line"><span class="string">    Gaussian affinity matrix constructor</span></span><br><span class="line"><span class="string">    W = exp(-r_&#123;ij&#125;^2/sigma)</span></span><br><span class="line"><span class="string">    &quot;</span><span class="string">&quot;&quot;</span></span><br><span class="line">    def top_k<span class="punctuation">(</span>lst<span class="punctuation">,</span> k<span class="operator">=</span><span class="number">1</span><span class="punctuation">)</span><span class="operator">:</span></span><br><span class="line">        assert<span class="punctuation">(</span>len<span class="punctuation">(</span>lst<span class="punctuation">)</span> <span class="operator">&gt;</span>k<span class="punctuation">)</span></span><br><span 
class="line">        <span class="built_in">return</span> np.partition<span class="punctuation">(</span>lst<span class="punctuation">,</span> k<span class="punctuation">)</span><span class="punctuation">[</span>k<span class="punctuation">]</span></span><br><span class="line"></span><br><span class="line">    R <span class="operator">=</span> np.array<span class="punctuation">(</span>R<span class="punctuation">)</span></span><br><span class="line">    <span class="keyword">if</span> not sigma<span class="operator">:</span></span><br><span class="line">        s <span class="operator">=</span> <span class="punctuation">[</span>top_k<span class="punctuation">(</span>R<span class="punctuation">[</span><span class="operator">:</span><span class="punctuation">,</span> i<span class="punctuation">]</span><span class="punctuation">,</span> k<span class="operator">=</span>k<span class="punctuation">)</span> <span class="keyword">for</span> i <span class="keyword">in</span> <span class="built_in">range</span><span class="punctuation">(</span>R.shape<span class="punctuation">[</span><span class="number">1</span><span class="punctuation">]</span><span class="punctuation">)</span><span class="punctuation">]</span></span><br><span class="line">        S <span class="operator">=</span> np.sqrt<span class="punctuation">(</span>np.outer<span class="punctuation">(</span>s<span class="punctuation">,</span> s<span class="punctuation">)</span><span class="punctuation">)</span></span><br><span class="line">    <span class="keyword">else</span><span class="operator">:</span></span><br><span class="line">        S <span class="operator">=</span> sigma</span><br><span class="line">        logW <span class="operator">=</span> <span class="operator">-</span>np.power<span class="punctuation">(</span>np.divide<span class="punctuation">(</span>R<span class="punctuation">,</span> S<span class="punctuation">)</span><span class="punctuation">,</span> <span class="number">2</span><span 
class="punctuation">)</span></span><br><span class="line">    <span class="keyword">if</span> <span class="built_in">log</span><span class="operator">:</span></span><br><span class="line">        <span class="built_in">return</span> logW</span><br><span class="line">    <span class="built_in">return</span> np.exp<span class="punctuation">(</span>logW<span class="punctuation">)</span></span><br></pre></td></tr></table></figure>
<h5 id="diffusion-map-function">Diffusion map function</h5>
<figure class="highlight r"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br></pre></td><td class="code"><pre><span class="line">def diffusionMaps<span class="punctuation">(</span>R<span class="punctuation">,</span>k<span class="operator">=</span><span class="number">7</span><span class="punctuation">,</span>sigma<span class="operator">=</span>None<span class="punctuation">)</span><span class="operator">:</span></span><br><span class="line">    <span class="string">&quot;&quot;</span><span class="string">&quot;</span></span><br><span class="line"><span class="string">    Diffusion map(Coifman, 2005)</span></span><br><span class="line"><span class="string">    https://en.wikipedia.org/wiki/Diffusion_map</span></span><br><span class="line"><span class="string">    ----------</span></span><br><span class="line"><span class="string">    dic:</span></span><br><span class="line"><span class="string">        psi: right eigvector of P = D^&#123;-1/2&#125; * evec</span></span><br><span class="line"><span class="string">        phi: left eigvector of P = D^&#123;1/2&#125; * evec</span></span><br><span class="line"><span class="string">        eig: eigenvalues</span></span><br><span 
class="line"><span class="string">    &quot;</span><span class="string">&quot;&quot;</span></span><br><span class="line">    k<span class="operator">=</span>k<span class="operator">-</span><span class="number">1</span> <span class="comment">## k is R version minus 1 for the index</span></span><br><span class="line">    logW <span class="operator">=</span> affinity<span class="punctuation">(</span>R<span class="punctuation">,</span>k<span class="punctuation">,</span>sigma<span class="punctuation">,</span><span class="built_in">log</span><span class="operator">=</span>True<span class="punctuation">)</span></span><br><span class="line">    rs <span class="operator">=</span> np.exp<span class="punctuation">(</span><span class="punctuation">[</span>logsumexp<span class="punctuation">(</span>logW<span class="punctuation">[</span>i<span class="punctuation">,</span><span class="operator">:</span><span class="punctuation">]</span><span class="punctuation">)</span> <span class="keyword">for</span> i <span class="keyword">in</span> <span class="built_in">range</span><span class="punctuation">(</span>logW.shape<span class="punctuation">[</span><span class="number">0</span><span class="punctuation">]</span><span class="punctuation">)</span><span class="punctuation">]</span><span class="punctuation">)</span> <span class="comment">## dii=\sum_j</span></span><br><span class="line">    w_<span class="punctuation">&#123;</span>i<span class="punctuation">,</span>j<span class="punctuation">&#125;</span></span><br><span class="line">    D <span class="operator">=</span> np.diag<span class="punctuation">(</span>np.sqrt<span class="punctuation">(</span>rs<span class="punctuation">)</span><span class="punctuation">)</span></span><br><span class="line">    <span class="comment">## D^&#123;1/2&#125;</span></span><br><span class="line">    Dinv <span class="operator">=</span> np.diag<span class="punctuation">(</span><span class="number">1</span><span class="operator">/</span>np.sqrt<span 
class="punctuation">(</span>rs<span class="punctuation">)</span><span class="punctuation">)</span> <span class="comment">##D^&#123;-1/2&#125;</span></span><br><span class="line">    Ms <span class="operator">=</span> Dinv <span class="operator">@</span> np.exp<span class="punctuation">(</span>logW<span class="punctuation">)</span> <span class="operator">@</span> Dinv <span class="comment">## D^&#123;-1/2&#125; W D^&#123;-1/2&#125;</span></span><br><span class="line">    e <span class="operator">=</span> np.linalg.eigh<span class="punctuation">(</span>Ms<span class="punctuation">)</span> <span class="comment">## eigen decomposition of P&#x27;</span></span><br><span class="line">    evalue<span class="operator">=</span> e<span class="punctuation">[</span><span class="number">0</span><span class="punctuation">]</span><span class="punctuation">[</span><span class="operator">::</span><span class="operator">-</span><span class="number">1</span><span class="punctuation">]</span></span><br><span class="line">    evec <span class="operator">=</span> np.flip<span class="punctuation">(</span>e<span class="punctuation">[</span><span class="number">1</span><span class="punctuation">]</span><span class="punctuation">,</span> axis<span class="operator">=</span><span class="number">1</span><span class="punctuation">)</span></span><br><span class="line">    s <span class="operator">=</span> np.sum<span class="punctuation">(</span>np.sqrt<span class="punctuation">(</span>rs<span class="punctuation">)</span> <span class="operator">*</span> evec<span class="punctuation">[</span><span class="operator">:</span><span class="punctuation">,</span><span class="number">0</span><span class="punctuation">]</span><span class="punctuation">)</span> <span class="comment"># scaling</span></span><br><span class="line">    <span class="comment"># Phi is orthonormal under the weighted inner product</span></span><br><span class="line">    <span class="comment">#0:Psi, 1:Phi, 
2:eig</span></span><br><span class="line">    dic <span class="operator">=</span> <span class="punctuation">&#123;</span><span class="string">&#x27;psi&#x27;</span><span class="operator">:</span>s <span class="operator">*</span> Dinv<span class="operator">@</span>evec<span class="punctuation">,</span> <span class="string">&#x27;phi&#x27;</span><span class="operator">:</span> <span class="punctuation">(</span><span class="number">1</span><span class="operator">/</span>s<span class="punctuation">)</span><span class="operator">*</span>D<span class="operator">@</span>evec<span class="punctuation">,</span> <span class="string">&quot;eig&quot;</span><span class="operator">:</span> evalue<span class="punctuation">&#125;</span></span><br><span class="line">    <span class="built_in">return</span> dic</span><br></pre></td></tr></table></figure>

      
    </div>

    
    <footer class="post-footer">
        <div class="post-eof"></div>
      
    </footer>
  </article>
</div>


<div class="post-block">
  
  
  <article itemscope itemtype="http://schema.org/Article" class="post-content" lang="">
    <link itemprop="mainEntityOfPage" href="http://chengmingbo.github.io/2020/06/07/spectral/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="image" content="/images/avatar.gif">
      <meta itemprop="name" content="Mingbo Cheng">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="Mingbo">
      <meta itemprop="description" content="Mingbo">
    </span>

    <span hidden itemprop="post" itemscope itemtype="http://schema.org/CreativeWork">
      <meta itemprop="name" content="undefined | Mingbo">
      <meta itemprop="description" content="">
    </span>
      <header class="post-header">
        <h2 class="post-title" itemprop="name headline">
          <a href="/2020/06/07/spectral/" class="post-title-link" itemprop="url">Spectral analysis (1)</a>
        </h2>

        <div class="post-meta-container">
          <div class="post-meta">
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-calendar"></i>
      </span>
      <span class="post-meta-item-text">发表于</span>

      <time title="创建时间：2020-06-07 18:30:22" itemprop="dateCreated datePublished" datetime="2020-06-07T18:30:22+02:00">2020-06-07</time>
    </span>

  
</div>

        </div>
      </header>

    
    <div class="post-body" itemprop="articleBody">
          <h2 id="introduction">Introduction</h2>
<p>Newton used a glass prism to separate white sunlight into red, orange,
yellow, green, blue, indigo and violet single colors. Light spectrum
analysis can help scientists interpret the world. For example, we can
detect the elements of our solar system as well as of distant stars in the
universe. Spectral analysis is also a field in mathematics. In
graph theory, the Laplacian matrix is used to represent a graph. We
can obtain features from the undirected graph below (<a
target="_blank" rel="noopener" href="https://en.wikipedia.org/wiki/Laplacian_matrix">wikipedia</a>).
<img src="https://cmb.oss-cn-qingdao.aliyuncs.com/a_graph.png" height="300px"></p>
<p>For example, we can check the degree of each vertex, which forms our
Degree matrix <span class="math inline">\(D\)</span> such that: <span
class="math display">\[\begin{equation}
D = \begin{pmatrix}
  2 &amp; 0 &amp;  0 &amp;  0 &amp; 0 &amp; 0\\
  0 &amp;  3 &amp; 0 &amp;  0 &amp; 0 &amp; 0\\
   0 &amp; 0 &amp;  2 &amp; 0 &amp;  0 &amp; 0\\
   0 &amp;  0 &amp; 0 &amp;  3 &amp; 0 &amp;0\\
  0 &amp; 0 &amp;  0 &amp; 0 &amp;  3 &amp; 0\\
   0 &amp;  0 &amp;  0 &amp; 0 &amp;  0 &amp; 1
\end{pmatrix}
\end{equation}\]</span></p>
<p>By checking the connections between all pairs of nodes, we can create an
adjacency matrix:</p>
<p><span class="math display">\[\begin{equation}
A = \begin{pmatrix}
  0 &amp; 1 &amp;  0 &amp;  0 &amp; 1 &amp; 0\\
  1 &amp;  0 &amp; 1 &amp;  0 &amp; 1 &amp; 0\\
   0 &amp; 1 &amp;  0 &amp; 1 &amp;  0 &amp; 0\\
   0 &amp;  0 &amp; 1 &amp;  0 &amp; 1 &amp;1\\
  1 &amp; 1 &amp;  0 &amp; 1 &amp;  0 &amp; 0\\
   0 &amp;  0 &amp;  0 &amp; 1 &amp;  0 &amp; 0
\end{pmatrix}
\end{equation}\]</span></p>
<h2 id="graph-laplacian">graph laplacian</h2>
<p>The graph Laplacian <span class="math inline">\(L\)</span> extracts
all useful information from a graph and is defined as: <span
class="math display">\[\begin{equation}
L=D-A =\begin{pmatrix}
  2 &amp; -1 &amp;  0 &amp;  0 &amp; -1 &amp; 0\\
  -1 &amp;  3 &amp; -1 &amp;  0 &amp; -1 &amp; 0\\
   0 &amp; -1 &amp;  2 &amp; -1 &amp;  0 &amp; 0\\
   0 &amp;  0 &amp; -1 &amp;  3 &amp; -1 &amp; -1\\
  -1 &amp; -1 &amp;  0 &amp; -1 &amp;  3 &amp; 0\\
   0 &amp;  0 &amp;  0 &amp; -1 &amp;  0 &amp; 1
\end{pmatrix}
\end{equation}\]</span></p>
<p>In fact, the graph Laplacian matrix is symmetric and also positive
semidefinite (PSD), which means if we perform eigenvalue decomposition,
the eigenvalues are all real and nonnegative. We can normalize the graph
Laplacian by left multiplying by <span
class="math inline">\(D^{-1}\)</span>, which makes all the diagonal entries of the matrix equal to <span
class="math inline">\(1\)</span>,
namely: <span class="math display">\[\begin{equation}
\text{norm}({L}) = D^{-1}L = \begin{pmatrix}1 &amp; -0.50 &amp;  0
&amp;  0 &amp; -0.50 &amp; 0\\
  -0.33 &amp;  1 &amp; -0.33 &amp;  0 &amp; -0.33 &amp; 0\\
   0 &amp; -0.50 &amp;  1 &amp; -0.50 &amp;  0 &amp; 0\\
   0 &amp;  0 &amp; -0.33 &amp;  1 &amp; -0.33 &amp;-0.33\\
  -0.33 &amp; -0.33 &amp;  0 &amp; -0.33 &amp;  1 &amp; 0\\
   0 &amp;  0 &amp;  0 &amp; -1 &amp;  0 &amp; 1
\end{pmatrix}
\end{equation}\]</span> This matrix is closely related to the transition matrix of a random walk on the graph, <span class="math inline">\(D^{-1}A = I - D^{-1}L\)</span>.
However, the matrix does not keep the symmetry and the PSD property
after the normalization. We will come back to discuss the spectrum of
the normalized graph Laplacian.</p>
<h2 id="weighted-graph">Weighted graph</h2>
<p>In practice, graphs are usually weighted. The weight between vertices
can be the Euclidean distance or another measure. The figure below shows the
weights of the graph. Here we apply a Gaussian kernel to the Euclidean
distances between vertices such that: <img
src="https://cmb.oss-cn-qingdao.aliyuncs.com/a_graph_weighted.png"
alt="weighted graph" /></p>
<p><span class="math display">\[\begin{equation}
w_{ij} =\exp (-r_{ij}^2/\sigma) = \exp \big(\frac{-\|x_i -
x_j\|^2}{\sigma_i\sigma_j}\big)
\end{equation}\]</span> where <span
class="math inline">\(r_{ij}\)</span> is the Euclidean distance between
vertex <span class="math inline">\(i\)</span> and <span
class="math inline">\(j\)</span>, and <span
class="math inline">\(\sigma\)</span> controls the bandwidth. We call the
matrix <span class="math inline">\(W=(w_{ij})\)</span> the Gaussian affinity
matrix such that: <span class="math display">\[\begin{equation}
\begin{pmatrix}
0 &amp; w_{12} &amp; w_{13} &amp; w_{14} &amp; w_{15}&amp; w_{16}\\
w_{21} &amp; 0 &amp; w_{23} &amp; w_{24} &amp; w_{25}&amp; w_{26}\\
w_{31} &amp; w_{32} &amp; 0 &amp; w_{34} &amp; w_{35}&amp; w_{36}\\
w_{41} &amp; w_{42} &amp; w_{43} &amp; 0 &amp; w_{45}&amp; w_{46}\\
w_{51} &amp; w_{52} &amp; w_{53} &amp; w_{54} &amp; 0&amp; w_{56}\\
w_{61} &amp; w_{62} &amp; w_{63} &amp; w_{64} &amp; w_{65}&amp; 0
\end{pmatrix}
\end{equation}\]</span> The Gaussian kernel effectively enlarges the distance
between vertices that are too far apart. Similar to the unweighted case, we can also
construct the graph Laplacian matrix using the Gaussian affinity matrix.
First, we need to find the weighted degree based on <span
class="math inline">\(W\)</span> such that: <span
class="math display">\[\begin{equation}
d_{ii} = \sum_j{w_{ij}}
\end{equation}\]</span> With the diagonal degree matrix and the affinity
matrix, we can now define the weighted Laplacian: <span
class="math display">\[\begin{equation}
L = D - W
\end{equation}\]</span> Likewise, we next give the normalized form of
Laplacian such that: <span class="math display">\[\begin{equation}
\text{norm}{(L)}= D^{-1}L = I - D^{-1}W
\end{equation}\]</span> To facilitate the eigenvalue decomposition, we
need to apply a trick to the asymmetric matrix <span
class="math inline">\(D^{-1}L\)</span>. Since the eigenvectors of <span
class="math inline">\(D^{-1}L\)</span> and <span
class="math inline">\(D^{-1}W\)</span> are the same, we apply the trick
to <span class="math inline">\(P = D^{-1}W\)</span> to simplify the
problem. Let's construct <span class="math inline">\(P&#39;\)</span> such
that: <span class="math display">\[\begin{equation}
P&#39; = D^{1/2} P D^{-1/2} = D^{-1/2}WD^{-1/2}
\end{equation}\]</span> It is obvious that <span
class="math inline">\(P&#39;\)</span> is symmetric due to the
symmetry of <span class="math inline">\(W\)</span>. We can perform
eigenvalue decomposition on <span class="math inline">\(P&#39;\)</span>
such that: <span class="math display">\[\begin{equation}
P&#39; = S\Lambda S^\top
\end{equation}\]</span> where <span class="math inline">\(S\)</span> stores the eigenvectors of <span
class="math inline">\(P&#39;\)</span> and the diagonal of <span
class="math inline">\(\Lambda\)</span> records the eigenvalues of <span
class="math inline">\(P&#39;\)</span>. We can also get the decomposition
of <span class="math inline">\(P\)</span> such that: <span
class="math display">\[\begin{equation}
P = D^{-1/2}S\Lambda S^\top D^{1/2}
\end{equation}\]</span> Let <span
class="math inline">\(Q=D^{-1/2}S\)</span>, then <span
class="math inline">\(Q^{-1}=S^{\top}D^{1/2}\)</span>. We therefore find
the right and left eigenvectors of <span class="math inline">\(P\)</span>
such that: <span class="math display">\[\begin{equation}
\psi = D^{-1/2} S \qquad
\phi = S^{\top}D^{1/2}
\end{equation}\]</span> In fact, the columns of <span
class="math inline">\(\psi\)</span> store the spectrum of the graph,
which are also called the diffusion map dimensions.</p>

      
    </div>

    
    <footer class="post-footer">
        <div class="post-eof"></div>
      
    </footer>
  </article>
</div>


<div class="post-block">
  
  
  <article itemscope itemtype="http://schema.org/Article" class="post-content" lang="">
    <link itemprop="mainEntityOfPage" href="http://chengmingbo.github.io/2020/03/07/pytorch_abc/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="image" content="/images/avatar.gif">
      <meta itemprop="name" content="Mingbo Cheng">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="Mingbo">
      <meta itemprop="description" content="Mingbo">
    </span>

    <span hidden itemprop="post" itemscope itemtype="http://schema.org/CreativeWork">
      <meta itemprop="name" content="undefined | Mingbo">
      <meta itemprop="description" content="">
    </span>
      <header class="post-header">
        <h2 class="post-title" itemprop="name headline">
          <a href="/2020/03/07/pytorch_abc/" class="post-title-link" itemprop="url">Pytorch ABC</a>
        </h2>

        <div class="post-meta-container">
          <div class="post-meta">
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-calendar"></i>
      </span>
      <span class="post-meta-item-text">发表于</span>

      <time title="创建时间：2020-03-07 22:50:11" itemprop="dateCreated datePublished" datetime="2020-03-07T22:50:11+01:00">2020-03-07</time>
    </span>

  
</div>

        </div>
      </header>

    
    <div class="post-body" itemprop="articleBody">
          <h2 id="fundamentals">Fundamentals</h2>
<h4 id="torch.tensor">Torch.tensor</h4>
<pre><code><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br></pre></td><td class="code"><pre><span class="line">t = tensor([[<span class="number">1</span>,<span class="number">2</span>,<span class="number">4</span>],[<span class="number">4</span>,<span class="number">5</span>,<span class="number">6</span>]])</span><br><span class="line">t.shape: (<span class="number">2</span>,<span class="number">3</span>)</span><br><span class="line">t.ndim: <span class="number">2</span></span><br><span class="line"><span class="built_in">type</span>: scalar, vector, matrix, tensor</span><br></pre></td></tr></table></figure></code></pre>
<h4 id="tensor-data-produce">Tensor data produce</h4>
<pre><code><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line">random_tensor = torch.rand(size=(<span class="number">3</span>, <span class="number">4</span>, <span class="number">5</span>))</span><br><span class="line">zeros = torch.zeros(size=(<span class="number">3</span>, <span class="number">4</span>))</span><br><span class="line">ones = torch.ones(size=(<span class="number">3</span>, <span class="number">4</span>))</span><br><span class="line">zero_to_ten = torch.arange(start=<span class="number">0</span>, end=<span class="number">10</span>, step=<span class="number">1</span>)</span><br><span class="line">ten_zeros = torch.zeros_like(<span class="built_in">input</span>=zero_to_ten) <span class="comment"># same shape but all zeros</span></span><br></pre></td></tr></table></figure></code></pre>
<h4 id="float">Float</h4>
<pre><code><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br></pre></td><td class="code"><pre><span class="line">torch.float32/torch.<span class="built_in">float</span></span><br><span class="line">torch.float16</span><br><span class="line">torch.half</span><br><span class="line">torch.float64/torch.double</span><br></pre></td></tr></table></figure></code></pre>
<h4 id="types-specific-for-gpu-or-cpu">types specific for GPU or
CPU</h4>
<pre><code><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br></pre></td><td class="code"><pre><span class="line">device=<span class="string">&#x27;cuda&#x27;</span> <span class="keyword">if</span> torch.cuda.is_available() <span class="keyword">else</span> <span class="string">&#x27;cpu&#x27;</span></span><br><span class="line">t = tensor([<span class="number">1</span>,<span class="number">2</span>,<span class="number">3</span>], device=device)</span><br></pre></td></tr></table></figure></code></pre>
<h4 id="tensor-operations">tensor operations</h4>
<pre><code><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br></pre></td><td class="code"><pre><span class="line">tensor_A = torch.tensor([[<span class="number">1</span>,<span class="number">2</span>],[<span class="number">3</span>,<span class="number">4</span>],[<span class="number">5</span>,<span class="number">6</span>]],</span><br><span class="line">                        dtype = torch.float32)</span><br></pre></td></tr></table></figure></code></pre>
<h5 id="multiply">multiply</h5>
<pre><code><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br></pre></td><td class="code"><pre><span class="line">tensor = torch.tensor([<span class="number">1</span>, <span class="number">2</span>, <span class="number">3</span>])</span><br><span class="line">tensor + <span class="number">10</span></span><br><span class="line">torch.multiply(tensor, <span class="number">10</span>)</span><br><span class="line">tensor * tensor <span class="comment"># tensor([1,4,9])</span></span><br><span class="line">tensorA @ tensorB <span class="comment"># matrix multiplication -&gt; tensor(14)</span></span><br><span class="line">torch.matmul(tensor, tensor)/torch.mm <span class="comment"># 1*1 + 2*2 + 3*3 = tensor(14)</span></span><br><span class="line">tensor.T</span><br></pre></td></tr></table></figure></code></pre>
<h5 id="layer">layer</h5>
<pre><code><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment">#Torch.nn.Linear</span></span><br><span class="line">y = x A^T + b</span><br><span class="line">torch.manual_seed(<span class="number">42</span>)</span><br><span class="line">linear = torch.nn.Linear(in_features=<span class="number">2</span>, <span class="comment"># in_features = matches inner dimension of input </span></span><br><span class="line">             out_features=<span class="number">6</span>) <span class="comment"># out_features = describes outer value </span></span><br><span class="line">x = tensor_A</span><br><span class="line">output = linear(x)</span><br><span class="line">x.shape, output, output.shape</span><br></pre></td></tr></table></figure></code></pre>
<h5 id="other-operations">other operations</h5>
<pre><code><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br></pre></td><td class="code"><pre><span class="line">tensor = torch.arange(<span class="number">10</span>, <span class="number">100</span>, <span class="number">10</span>) <span class="comment"># tensor([10, 20, 30, 40, 50, 60, 70, 80, 90])</span></span><br><span class="line">tensor.argmax() <span class="comment"># 8</span></span><br><span class="line">tensor.argmin() <span class="comment"># 0</span></span><br><span class="line">tensor.<span class="built_in">type</span>(torch.float16) <span class="comment"># tensor([10., 20., 30., 40., 50., 60., 70., 80., 90.])</span></span><br><span class="line">torch.reshape(new_shape) <span class="comment"># -1 asks torch to infer that dimension automatically</span></span><br><span class="line">tensor.view(new_shape) <span class="comment"># return a new shape view</span></span><br><span class="line">torch.stack(t, dim=<span class="number">0</span>) <span class="comment"># concatenate a sequence of tensors along a new dimension(dim)</span></span><br><span class="line">torch.squeeze() <span class="comment"># remove all dimensions of size 1</span></span><br><span class="line">torch.clamp() <span class="comment"># min=min, max=max, limit the range</span></span><br><span class="line">torch.unsqueeze()</span><br><span class="line">torch.permute() <span class="comment"># torch.Size([224, 224, 3]) -&gt; torch.Size([3, 224, 224])</span></span><br><span class="line">torch.permute_(), x.unsqueeze_() -&gt; inplace operation</span><br></pre></td></tr></table></figure></code></pre>
<h5 id="slice">slice</h5>
<pre><code> <figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br></pre></td><td class="code"><pre><span class="line">x[:, <span class="number">0</span>] <span class="comment">#  tensor([[1, 2, 3]])</span></span><br><span class="line">x[:, :, <span class="number">1</span>] <span class="comment"># tensor([[2,5,8]])</span></span><br><span class="line">x[:, <span class="number">1</span>, <span class="number">1</span>] <span class="comment"># tensor([5])</span></span><br><span class="line">x[<span class="number">0</span>, <span class="number">0</span>, :]/x[<span class="number">0</span>][<span class="number">0</span>]  <span class="comment"># tensor([1,2,3])</span></span><br></pre></td></tr></table></figure></code></pre>
<h5 id="numpy">numpy</h5>
<pre><code><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">tensor = torch.from_numpy(array)</span><br></pre></td></tr></table></figure></code></pre>
<h5 id="random-seed">random seed</h5>
<pre><code><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre></td><td class="code"><pre><span class="line">random seed</span><br><span class="line">torch.manual_seed(seed=RANDOM_SEED) </span><br><span class="line">torch.random.manual_seed(seed=RANDOM_SEED)</span><br></pre></td></tr></table></figure></code></pre>
<h5 id="variable">Variable</h5>
<pre><code><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br></pre></td><td class="code"><pre><span class="line">torch.autograd <span class="keyword">import</span> Variable</span><br><span class="line">.data, .grad, .grad_fn</span><br><span class="line">x_tensor = torch.randn(<span class="number">10</span>, <span class="number">5</span>)</span><br><span class="line">y_tensor = torch.randn(<span class="number">10</span>, <span class="number">5</span>)</span><br><span class="line">x = Variable(x_tensor, requires_grad=<span class="literal">True</span>) </span><br><span class="line">y = Variable(y_tensor, requires_grad=<span class="literal">True</span>)</span><br><span class="line">z = torch.<span class="built_in">sum</span>(x + y)</span><br><span class="line"><span class="built_in">print</span>(z.data) <span class="comment">#-2.1379</span></span><br><span class="line"><span class="built_in">print</span>(z.grad_fn) <span class="comment">#&lt;SumBackward0 object at 0x10da636a0&gt;</span></span><br><span class="line">z.backward()</span><br></pre></td></tr></table></figure></code></pre>
<h5 id="gpu">GPU</h5>
<pre><code><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">if</span> torch.cuda.is_available():</span><br><span class="line">    device = <span class="string">&quot;cuda&quot;</span> <span class="comment"># Use NVIDIA GPU (if available)</span></span><br><span class="line"><span class="keyword">elif</span> torch.backends.mps.is_available():</span><br><span class="line">    device = <span class="string">&quot;mps&quot;</span> <span class="comment"># Use Apple Silicon GPU (if available)</span></span><br><span class="line"><span class="keyword">else</span>:</span><br><span class="line">    device = <span class="string">&quot;cpu&quot;</span> <span class="comment"># Default to CPU if no GPU is available</span></span><br><span class="line"></span><br><span class="line">tensor.to(device)</span><br><span class="line">tensor_on_gpu.cpu().numpy()</span><br></pre></td></tr></table></figure></code></pre>
<h2 id="neural-network">Neural network</h2>
<p><code>torch.nn</code></p>
<blockquote>
<p>Contains all of the building blocks for computational graphs
(essentially a series of computations executed in a particular way).</p>
</blockquote>
<p><code>torch.nn.Parameter</code></p>
<blockquote>
<p>Stores tensors that can be used with nn.Module. If requires_grad=True
gradients (used for updating model parameters via gradient descent) are
calculated automatically, this is often referred to as "autograd".</p>
</blockquote>
<p><code>torch.nn.Module</code></p>
<blockquote>
<p>The base class for all neural network modules, all the building
blocks for neural networks are subclasses. If you're building a neural
network in PyTorch, your models should subclass nn.Module. Requires a
forward() method be implemented.</p>
</blockquote>
<p><code>torch.optim</code></p>
<blockquote>
<p>Contains various optimization algorithms (these tell the model
parameters stored in nn.Parameter how to best change to improve gradient
descent and in turn reduce the loss).</p>
</blockquote>
<p><code>def forward()</code></p>
<blockquote>
<p>All nn.Module subclasses require a forward() method, this defines the
computation that will take place on the data passed to the particular
nn.Module (e.g. the linear regression formula above).</p>
</blockquote>
<h4 id="define-a-net">Define a net</h4>
<pre><code><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">class</span> <span class="title class_">net</span>(nn.Module):</span><br><span class="line">    __init__(<span class="variable language_">self</span>): <span class="built_in">super</span>().__init__() ...</span><br><span class="line">    <span class="keyword">def</span> <span class="title function_">forward</span>(<span class="params">self, x: torch.Tensor</span>) -&gt; torch.Tensor: </span><br><span class="line">        <span class="keyword">return</span> <span class="variable language_">self</span>.weights * x + <span class="variable language_">self</span>.bias</span><br><span class="line"><span class="comment">## example</span></span><br><span class="line"><span class="keyword">class</span> <span class="title class_">LinearRegressionModel</span>(nn.Module):</span><br><span class="line">    <span class="keyword">def</span> <span class="title function_">__init__</span>(<span class="params">self</span>):</span><br><span class="line">        <span class="built_in">super</span>().__init__()</span><br><span class="line">        <span class="variable language_">self</span>.weights = nn.Parameter(torch.randn(<span class="number">1</span>, requires_grad=<span class="literal">True</span>, dtype=torch.<span class="built_in">float</span>))</span><br><span class="line">        <span class="variable language_">self</span>.bias = nn.Parameter(torch.randn(<span class="number">1</span>, requires_grad=<span class="literal">True</span>, dtype=torch.<span 
class="built_in">float</span>))</span><br><span class="line">    <span class="keyword">def</span> <span class="title function_">forward</span>(<span class="params">self, x:torch.Tensor</span>) -&gt; torch.Tensor:</span><br><span class="line">        <span class="keyword">return</span> <span class="variable language_">self</span>.weights * x + <span class="variable language_">self</span>.bias</span><br></pre></td></tr></table></figure></code></pre>
<h5 id="check-module">Check module</h5>
<pre><code><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line">torch.manual_seed(<span class="number">42</span>)</span><br><span class="line">model_0 = LinearRegressionModel()</span><br><span class="line"><span class="built_in">list</span>(model_0.parameters()) <span class="comment">#  tensor([0.3367], requires_grad=True)</span></span><br><span class="line">model_0.state_dict() <span class="comment"># OrderedDict([(&#x27;weights&#x27;, tensor([0.3367])), (&#x27;bias&#x27;, tensor([0.1288]))])</span></span><br><span class="line"><span class="keyword">with</span> torch.inference_mode(): y_preds = model_0(X_test) <span class="comment"># run inference</span></span><br></pre></td></tr></table></figure></code></pre>
<h4 id="training">Training</h4>
<pre><code><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br></pre></td><td class="code"><pre><span class="line">loss_fn = nn.L1Loss() <span class="comment"># MAE loss is same as L1Loss</span></span><br><span class="line">optimizer = torch.optim.SGD(params=model_0.parameters(), lr=<span class="number">0.01</span>) <span class="comment">## lr(learning rate)</span></span><br></pre></td></tr></table></figure></code></pre>
<table style="width:100%;">
<colgroup>
<col style="width: 1%" />
<col style="width: 18%" />
<col style="width: 63%" />
<col style="width: 16%" />
</colgroup>
<thead>
<tr class="header">
<th></th>
<th>Step name</th>
<th>What does it do?</th>
<th>Code example</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>1</td>
<td>Forward pass</td>
<td>The model goes through all of the training data once, performing its
forward() function calculations.</td>
<td>model(x_train)</td>
</tr>
<tr class="even">
<td>2</td>
<td>Calculate the loss</td>
<td>The model's outputs (predictions) are compared to the ground truth
and evaluated to see how wrong they are.</td>
<td>loss = loss_fn(y_pred, y_train)</td>
</tr>
<tr class="odd">
<td>3</td>
<td>Zero gradients</td>
<td>The optimizer's gradients are set to zero (they are accumulated by
default) so they can be recalculated for the specific training
step.</td>
<td>optimizer.zero_grad()</td>
</tr>
<tr class="even">
<td>4</td>
<td>Perform backpropagation on the loss</td>
<td>Computes the gradient of the loss with respect to every model
parameter to be updated (each parameter with requires_grad=True)</td>
<td>loss.backward()</td>
</tr>
<tr class="odd">
<td>5</td>
<td>Update the optimizer (gradient descent)</td>
<td>Update the parameters with requires_grad=True with respect to the
loss gradients in order to improve them.</td>
<td>optimizer.step()</td>
</tr>
</tbody>
</table>
<h5 id="training-example">Training example</h5>
<pre><code><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">for</span> epoch <span class="keyword">in</span> <span class="built_in">range</span>(epochs):</span><br><span class="line">    model.train()</span><br><span class="line">    y_pred = model(X_train)</span><br><span class="line">    loss = loss_fn(y_pred, y_train)</span><br><span class="line">    optimizer.zero_grad()</span><br><span class="line">    loss.backward()</span><br><span class="line">    optimizer.step()</span><br></pre></td></tr></table></figure></code></pre>
<h4 id="test">test</h4>
<table>
<colgroup>
<col style="width: 1%" />
<col style="width: 20%" />
<col style="width: 59%" />
<col style="width: 18%" />
</colgroup>
<tbody>
<tr class="odd">
<td></td>
<td>Forward pass</td>
<td>The model goes through all of the testing data once, performing its
forward() function calculations.</td>
<td>model(x_test)</td>
</tr>
<tr class="even">
<td></td>
<td>Calculate the loss</td>
<td>The model's outputs (predictions) are compared to the ground truth
and evaluated to see how wrong they are.</td>
<td>loss = loss_fn(y_pred, y_test)</td>
</tr>
<tr class="odd">
<td></td>
<td>Calculate evaluation metrics (optional)</td>
<td>Alongside the loss value you may want to calculate other evaluation
metrics such as accuracy on the test set.</td>
<td>Custom functions</td>
</tr>
</tbody>
</table>
<h4 id="inference-and-save-model">Inference and save model</h4>
<p><code>Inference</code></p>
<blockquote>
<p><code>model_0.eval()</code> sets the model in evaluation mode; then run
<code>with torch.inference_mode(): y_preds = model_0(X_test)</code>.</p>
</blockquote>
<p><code>torch.save</code></p>
<blockquote>
<p>Saves a serialized object to disk using Python's pickle utility.
Models, tensors and various other Python objects like dictionaries can
be saved using torch.save.</p>
</blockquote>
<p><code>torch.load</code></p>
<blockquote>
<p>Uses pickle's unpickling features to deserialize and load pickled
Python object files (like models, tensors or dictionaries) into memory.
You can also set which device to load the object to (CPU, GPU etc).</p>
</blockquote>
<p><code>torch.nn.Module.load_state_dict</code> (recommended)</p>
<blockquote>
<p>Loads a model's parameter dictionary (model.state_dict()) using a
saved state_dict() object.</p>
</blockquote>
<h2 id="examples">Examples</h2>
<h4 id="example-1">Example 1</h4>
<pre><code><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br></pre></td><td class="code"><pre><span class="line">torch.manual_seed(<span class="number">42</span>)</span><br><span class="line">epochs = <span class="number">100</span> <span class="comment"># Set the number of epochs </span></span><br><span class="line"></span><br><span class="line"><span class="comment"># Create empty loss lists to track values</span></span><br><span class="line">train_loss_values = []</span><br><span class="line">test_loss_values = []</span><br><span class="line">epoch_count = []</span><br><span class="line"></span><br><span class="line"><span class="keyword">for</span> epoch <span class="keyword">in</span> <span class="built_in">range</span>(epochs):</span><br><span class="line">  
  <span class="comment">### Training</span></span><br><span class="line">    model_0.train() <span class="comment"># Put model in training mode (this is the default state of a model)</span></span><br><span class="line"></span><br><span class="line">    <span class="comment"># 1. Forward pass on train data using the forward() method inside </span></span><br><span class="line">    y_pred = model_0(X_train)</span><br><span class="line">    <span class="comment"># 2. Calculate the loss (how different are our models predictions to the ground truth)</span></span><br><span class="line">    loss = loss_fn(y_pred, y_train)</span><br><span class="line">    optimizer.zero_grad() <span class="comment"># 3. Zero grad of the optimizer</span></span><br><span class="line">    loss.backward() <span class="comment"># 4. Loss backwards</span></span><br><span class="line">    optimizer.step() <span class="comment"># 5. Progress the optimizer</span></span><br><span class="line">     <span class="comment">### Testing</span></span><br><span class="line">    <span class="comment"># Put the model in evaluation mode</span></span><br><span class="line">    model_0.<span class="built_in">eval</span>()</span><br><span class="line"></span><br><span class="line">    <span class="keyword">with</span> torch.inference_mode():</span><br><span class="line">      <span class="comment"># 1. Forward pass on test data</span></span><br><span class="line">      test_pred = model_0(X_test)</span><br><span class="line"></span><br><span class="line">      <span class="comment"># 2. 
Calculate loss on test data</span></span><br><span class="line">       <span class="comment"># predictions come in torch.float datatype, so comparisons need to be done with tensors of the same type</span></span><br><span class="line">      test_loss = loss_fn(test_pred, y_test.<span class="built_in">type</span>(torch.<span class="built_in">float</span>)) </span><br><span class="line"></span><br><span class="line">      <span class="comment"># Print out what&#x27;s happening</span></span><br><span class="line">      <span class="keyword">if</span> epoch % <span class="number">10</span> == <span class="number">0</span>:</span><br><span class="line">            epoch_count.append(epoch)</span><br><span class="line">            train_loss_values.append(loss.detach().numpy())</span><br><span class="line">            test_loss_values.append(test_loss.detach().numpy())</span><br><span class="line">            <span class="built_in">print</span>(<span class="string">f&quot;Epoch: <span class="subst">&#123;epoch&#125;</span> | MAE Train Loss: <span class="subst">&#123;loss&#125;</span> | MAE Test Loss: <span class="subst">&#123;test_loss&#125;</span> &quot;</span>)</span><br><span class="line"></span><br></pre></td></tr></table></figure></code></pre>
<h4 id="example-2">Example 2</h4>
<pre><code><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br></pre></td><td class="code"><pre><span class="line">torch.manual_seed(<span class="number">42</span>)</span><br><span class="line"></span><br><span class="line"><span class="comment"># Set the number of epochs </span></span><br><span class="line">epochs = <span class="number">1000</span> </span><br><span class="line"></span><br><span class="line"><span class="comment"># Put data on the available device</span></span><br><span class="line"><span class="comment"># Without this, error will happen (not all model/data on device)</span></span><br><span class="line">X_train = 
X_train.to(device)</span><br><span class="line">X_test = X_test.to(device)</span><br><span class="line">y_train = y_train.to(device)</span><br><span class="line">y_test = y_test.to(device)</span><br><span class="line"></span><br><span class="line"><span class="keyword">for</span> epoch <span class="keyword">in</span> <span class="built_in">range</span>(epochs):</span><br><span class="line">    <span class="comment">### Training</span></span><br><span class="line">    model_1.train() <span class="comment"># train mode is on by default after construction</span></span><br><span class="line"></span><br><span class="line">    <span class="comment"># 1. Forward pass</span></span><br><span class="line">    y_pred = model_1(X_train)</span><br><span class="line">    <span class="comment"># 2. Calculate loss</span></span><br><span class="line">    loss = loss_fn(y_pred, y_train)</span><br><span class="line"></span><br><span class="line">    <span class="comment"># 3. Zero grad optimizer</span></span><br><span class="line">    optimizer.zero_grad()</span><br><span class="line"></span><br><span class="line">    <span class="comment"># 4. Loss backward</span></span><br><span class="line">    loss.backward()</span><br><span class="line"></span><br><span class="line">    <span class="comment"># 5. Step the optimizer</span></span><br><span class="line">    optimizer.step()</span><br><span class="line"></span><br><span class="line">    <span class="comment">### Testing</span></span><br><span class="line">    model_1.<span class="built_in">eval</span>() <span class="comment"># put the model in evaluation mode for testing (inference)</span></span><br><span class="line">    <span class="comment"># 1. Forward pass</span></span><br><span class="line">    <span class="keyword">with</span> torch.inference_mode():</span><br><span class="line">        test_pred = model_1(X_test)</span><br><span class="line"></span><br><span class="line">        <span class="comment"># 2. 
Calculate the loss</span></span><br><span class="line">        test_loss = loss_fn(test_pred, y_test)</span><br><span class="line"></span><br><span class="line">    <span class="keyword">if</span> epoch % <span class="number">100</span> == <span class="number">0</span>:</span><br><span class="line">        <span class="built_in">print</span>(<span class="string">f&quot;Epoch: <span class="subst">&#123;epoch&#125;</span> | Train loss: <span class="subst">&#123;loss&#125;</span> | Test loss: <span class="subst">&#123;test_loss&#125;</span>&quot;</span>)</span><br><span class="line"></span><br></pre></td></tr></table></figure></code></pre>
<h4 id="vae">VAE</h4>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">class</span> <span class="title class_">VAE</span>(nn.Module):</span><br><span class="line">    <span class="keyword">def</span> <span class="title function_">__init__</span>(<span class="params">self, input_dim=<span class="number">784</span>, hidden_dim=<span class="number">400</span>, latent_dim=<span class="number">200</span>, device=device</span>):</span><br><span class="line">        <span class="built_in">super</span>(VAE, <span class="variable 
language_">self</span>).__init__()</span><br><span class="line"></span><br><span class="line">        <span class="comment"># encoder</span></span><br><span class="line">        <span class="variable language_">self</span>.encoder = nn.Sequential(</span><br><span class="line">            nn.Linear(input_dim, hidden_dim),</span><br><span class="line">            nn.LeakyReLU(<span class="number">0.2</span>),</span><br><span class="line">            nn.Linear(hidden_dim, latent_dim),</span><br><span class="line">            nn.LeakyReLU(<span class="number">0.2</span>)</span><br><span class="line">            )</span><br><span class="line"></span><br><span class="line">        <span class="comment"># latent mean and variance </span></span><br><span class="line">        <span class="variable language_">self</span>.mean_layer = nn.Linear(latent_dim, <span class="number">2</span>)</span><br><span class="line">        <span class="variable language_">self</span>.logvar_layer = nn.Linear(latent_dim, <span class="number">2</span>)</span><br><span class="line"></span><br><span class="line">        <span class="comment"># decoder</span></span><br><span class="line">        <span class="variable language_">self</span>.decoder = nn.Sequential(</span><br><span class="line">            nn.Linear(<span class="number">2</span>, latent_dim),</span><br><span class="line">            nn.LeakyReLU(<span class="number">0.2</span>),</span><br><span class="line">            nn.Linear(latent_dim, hidden_dim),</span><br><span class="line">            nn.LeakyReLU(<span class="number">0.2</span>),</span><br><span class="line">            nn.Linear(hidden_dim, input_dim),</span><br><span class="line">            nn.Sigmoid()</span><br><span class="line">            )</span><br><span class="line">    <span class="keyword">def</span> <span class="title function_">encode</span>(<span class="params">self, x</span>):</span><br><span class="line">        x = <span class="variable 
language_">self</span>.encoder(x)</span><br><span class="line">        mean, logvar = <span class="variable language_">self</span>.mean_layer(x), <span class="variable language_">self</span>.logvar_layer(x)</span><br><span class="line">        <span class="keyword">return</span> mean, logvar</span><br><span class="line"></span><br><span class="line">    <span class="keyword">def</span> <span class="title function_">reparameterization</span>(<span class="params">self, mean, var</span>):</span><br><span class="line">        epsilon = torch.randn_like(var).to(device)      </span><br><span class="line">        z = mean + var*epsilon</span><br><span class="line">        <span class="keyword">return</span> z</span><br><span class="line"></span><br><span class="line">    <span class="keyword">def</span> <span class="title function_">decode</span>(<span class="params">self, x</span>):</span><br><span class="line">        <span class="keyword">return</span> <span class="variable language_">self</span>.decoder(x)</span><br><span class="line"></span><br><span class="line">    <span class="keyword">def</span> <span class="title function_">forward</span>(<span class="params">self, x</span>):</span><br><span class="line">        mean, logvar = <span class="variable language_">self</span>.encode(x)</span><br><span class="line">        z = <span class="variable language_">self</span>.reparameterization(mean, logvar)</span><br><span class="line">        x_hat = <span class="variable language_">self</span>.decode(z)</span><br><span class="line">        <span class="keyword">return</span> x_hat, mean, log_var</span><br></pre></td></tr></table></figure>
<h4 id="cnn">CNN</h4>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># Create a neural net class</span></span><br><span class="line"><span class="keyword">class</span> <span class="title class_">Net</span>(nn.Module):</span><br><span class="line">    <span class="comment"># Constructor</span></span><br><span class="line">    <span class="keyword">def</span> <span class="title function_">__init__</span>(<span class="params">self, num_classes=<span 
class="number">3</span></span>):</span><br><span class="line">        <span class="built_in">super</span>(Net, <span class="variable language_">self</span>).__init__()</span><br><span class="line"></span><br><span class="line">        <span class="comment"># Our images are RGB, so input channels = 3. We&#x27;ll apply 12 filters in the first convolutional layer</span></span><br><span class="line">        <span class="variable language_">self</span>.conv1 = nn.Conv2d(in_channels=<span class="number">3</span>, out_channels=<span class="number">12</span>, kernel_size=<span class="number">3</span>, stride=<span class="number">1</span>, padding=<span class="number">1</span>)</span><br><span class="line"></span><br><span class="line">        <span class="comment"># We&#x27;ll apply max pooling with a kernel size of 2</span></span><br><span class="line">        <span class="variable language_">self</span>.pool = nn.MaxPool2d(kernel_size=<span class="number">2</span>)</span><br><span class="line"></span><br><span class="line">        <span class="comment"># A second convolutional layer takes 12 input channels, and generates 12 outputs</span></span><br><span class="line">        <span class="variable language_">self</span>.conv2 = nn.Conv2d(in_channels=<span class="number">12</span>, out_channels=<span class="number">12</span>, kernel_size=<span class="number">3</span>, stride=<span class="number">1</span>, padding=<span class="number">1</span>)</span><br><span class="line"></span><br><span class="line">        <span class="comment"># A third convolutional layer takes 12 inputs and generates 24 outputs</span></span><br><span class="line">        <span class="variable language_">self</span>.conv3 = nn.Conv2d(in_channels=<span class="number">12</span>, out_channels=<span class="number">24</span>, kernel_size=<span class="number">3</span>, stride=<span class="number">1</span>, padding=<span class="number">1</span>)</span><br><span class="line"></span><br><span class="line">     
   <span class="comment"># A drop layer deletes 20% of the features to help prevent overfitting</span></span><br><span class="line">        <span class="variable language_">self</span>.drop = nn.Dropout2d(p=<span class="number">0.2</span>)</span><br><span class="line">        <span class="comment"># Our 128x128 image tensors will be pooled twice with a kernel size of 2. 128/2/2 is 32.</span></span><br><span class="line">        <span class="comment"># So our feature tensors are now 32 x 32, and we&#x27;ve generated 24 of them</span></span><br><span class="line">        <span class="comment"># We need to flatten these and feed them to a fully-connected layer</span></span><br><span class="line">        <span class="comment"># to map them to  the probability for each class</span></span><br><span class="line">        <span class="variable language_">self</span>.fc = nn.Linear(in_features=<span class="number">32</span> * <span class="number">32</span> * <span class="number">24</span>, out_features=num_classes)</span><br><span class="line"></span><br><span class="line">    <span class="keyword">def</span> <span class="title function_">forward</span>(<span class="params">self, x</span>):</span><br><span class="line">        <span class="comment"># Use a relu activation function after layer 1 (convolution 1 and pool)</span></span><br><span class="line">        x = F.relu(<span class="variable language_">self</span>.pool(<span class="variable language_">self</span>.conv1(x)))</span><br><span class="line"></span><br><span class="line">        <span class="comment"># Use a relu activation function after layer 2 (convolution 2 and pool)</span></span><br><span class="line">        x = F.relu(<span class="variable language_">self</span>.pool(<span class="variable language_">self</span>.conv2(x)))</span><br><span class="line"></span><br><span class="line">        <span class="comment"># Select some features to drop after the 3rd convolution to prevent 
overfitting</span></span><br><span class="line">        x = F.relu(<span class="variable language_">self</span>.drop(<span class="variable language_">self</span>.conv3(x)))</span><br><span class="line"></span><br><span class="line">        <span class="comment"># Only drop the features if this is a training pass</span></span><br><span class="line">        x = F.dropout(x, training=<span class="variable language_">self</span>.training)</span><br><span class="line"></span><br><span class="line">        <span class="comment"># Flatten</span></span><br><span class="line">        x = x.view(-<span class="number">1</span>, <span class="number">32</span> * <span class="number">32</span> * <span class="number">24</span>)</span><br><span class="line">        <span class="comment"># Feed to fully-connected layer to predict class</span></span><br><span class="line">        x = <span class="variable language_">self</span>.fc(x)</span><br><span class="line">        <span class="comment"># Return log_softmax tensor </span></span><br><span class="line">        <span class="keyword">return</span> F.log_softmax(x, dim=<span class="number">1</span>)</span><br></pre></td></tr></table></figure>
<h4 id="lstm">LSTM</h4>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> torch</span><br><span class="line"><span class="keyword">import</span> torch.autograd <span class="keyword">as</span> autograd</span><br><span class="line"><span class="keyword">import</span> torch.nn <span class="keyword">as</span> nn</span><br><span class="line"><span class="keyword">import</span> torch.functional <span class="keyword">as</span> F</span><br><span class="line"><span class="keyword">import</span> torch.optim <span class="keyword">as</span> optim</span><br><span class="line"><span class="keyword">from</span> torch.nn.utils.rnn <span class="keyword">import</span> pack_padded_sequence, pad_packed_sequence</span><br><span class="line"></span><br><span class="line"><span 
class="keyword">class</span> <span class="title class_">LSTMClassifier</span>(nn.Module):</span><br><span class="line">  <span class="keyword">def</span> <span class="title function_">__init__</span>(<span class="params">self, vocab_size, embedding_dim, hidden_dim, output_size</span>):</span><br><span class="line">    <span class="built_in">super</span>(LSTMClassifier, <span class="variable language_">self</span>).__init__()</span><br><span class="line">    <span class="variable language_">self</span>.embedding_dim = embedding_dim</span><br><span class="line">    <span class="variable language_">self</span>.hidden_dim = hidden_dim</span><br><span class="line">    <span class="variable language_">self</span>.vocab_size = vocab_size</span><br><span class="line">    <span class="variable language_">self</span>.embedding = nn.Embedding(vocab_size, embedding_dim)</span><br><span class="line">    <span class="variable language_">self</span>.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=<span class="number">1</span>)</span><br><span class="line">    <span class="variable language_">self</span>.hidden2out = nn.Linear(hidden_dim, output_size)</span><br><span class="line">    <span class="variable language_">self</span>.softmax = nn.LogSoftmax()</span><br><span class="line">    <span class="variable language_">self</span>.dropout_layer = nn.Dropout(p=<span class="number">0.2</span>)</span><br><span class="line">  <span class="keyword">def</span> <span class="title function_">init_hidden</span>(<span class="params">self, batch_size</span>):</span><br><span class="line">    <span class="keyword">return</span>(autograd.Variable(torch.randn(<span class="number">1</span>, batch_size, <span class="variable language_">self</span>.hidden_dim)),</span><br><span class="line">            autograd.Variable(torch.randn(<span class="number">1</span>, batch_size, <span class="variable language_">self</span>.hidden_dim)))</span><br><span class="line"></span><br><span class="line">  
<span class="keyword">def</span> <span class="title function_">forward</span>(<span class="params">self, batch, lengths</span>): </span><br><span class="line"></span><br><span class="line">    <span class="variable language_">self</span>.hidden = <span class="variable language_">self</span>.init_hidden(batch.size(-<span class="number">1</span>))</span><br><span class="line">    embeds = <span class="variable language_">self</span>.embedding(batch)</span><br><span class="line">    packed_input = pack_padded_sequence(embeds, lengths)</span><br><span class="line">    outputs, (ht, ct) = <span class="variable language_">self</span>.lstm(packed_input, <span class="variable language_">self</span>.hidden)</span><br><span class="line">    <span class="comment"># ht is the last hidden state of the sequences</span></span><br><span class="line">    <span class="comment"># ht = (1 x batch_size x hidden_dim)</span></span><br><span class="line">    <span class="comment"># ht[-1] = (batch_size x hidden_dim)</span></span><br><span class="line">    output = <span class="variable language_">self</span>.dropout_layer(ht[-<span class="number">1</span>])</span><br><span class="line">    output = <span class="variable language_">self</span>.hidden2out(output)</span><br><span class="line">    output = <span class="variable language_">self</span>.softmax(output)</span><br><span class="line"></span><br><span class="line">    <span class="keyword">return</span> output</span><br></pre></td></tr></table></figure>
<h2 id="references">References</h2>
<pre><code>1. https://github.com/mrdbourke/pytorch-deep-learning
2. https://readmedium.com/@rekalantar/variational-auto-encoder-vae-pytorch-tutorial-dce2d2fe0f5f
3. https://github.com/MicrosoftDocs/ml-basics/blob/master/05b%20-%20Convolutional%20Neural%20Networks%20(PyTorch).ipynb
4. https://github.com/ritchieng/the-incredible-pytorch?tab=readme-ov-file
5. https://github.com/claravania/lstm-pytorch
6. https://machinelearningmastery.com/pytorch-tutorial-develop-deep-learning-models/</code></pre>

      
    </div>

    
    <footer class="post-footer">
        <div class="post-eof"></div>
      
    </footer>
  </article>
</div>


<div class="post-block">
  
  
  <article itemscope itemtype="http://schema.org/Article" class="post-content" lang="">
    <link itemprop="mainEntityOfPage" href="http://chengmingbo.github.io/2019/05/10/CCA/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="image" content="/images/avatar.gif">
      <meta itemprop="name" content="Mingbo Cheng">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="Mingbo">
      <meta itemprop="description" content="Mingbo">
    </span>

    <span hidden itemprop="post" itemscope itemtype="http://schema.org/CreativeWork">
      <meta itemprop="name" content="undefined | Mingbo">
      <meta itemprop="description" content="">
    </span>
      <header class="post-header">
        <h2 class="post-title" itemprop="name headline">
          <a href="/2019/05/10/CCA/" class="post-title-link" itemprop="url">A tutorial on Canonical Correlation Analysis(CCA)</a>
        </h2>

        <div class="post-meta-container">
          <div class="post-meta">
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-calendar"></i>
      </span>
      <span class="post-meta-item-text">发表于</span>

      <time title="创建时间：2019-05-10 19:54:43" itemprop="dateCreated datePublished" datetime="2019-05-10T19:54:43+02:00">2019-05-10</time>
    </span>

  
</div>

        </div>
      </header>

    
    <div class="post-body" itemprop="articleBody">
          <h2 id="introduction">Introduction</h2>
<p>Suppose we have two sets of variables corresponding to two aspects,
such as height and weight, and we want to analyze the relationship between
these two sets. There are several ways to measure the relationship
between them. However, sometimes it is hard to handle datasets with
different dimensions, meaning, if <span class="math inline">\(X\in
\mathbb{R}^m\)</span> and <span class="math inline">\(Y\in
\mathbb{R}^n\)</span>, how to resolve the relationship?</p>
<h2 id="basic-of-cca">Basics of CCA</h2>
<p>Assume there are two sets of data <span
class="math inline">\(X\)</span> and <span
class="math inline">\(Y\)</span>, the size of <span
class="math inline">\(X\)</span> is <span class="math inline">\(n \times
p\)</span>, whereas size of <span class="math inline">\(Y\)</span> is
<span class="math inline">\(n\times q\)</span>. That is, <span
class="math inline">\(X\)</span> and <span
class="math inline">\(Y\)</span> share the same number of rows but
differ in the number of columns. The idea of CCA is simple: find the best
match of <span class="math inline">\(X w_x\)</span> and <span
class="math inline">\(Y w_y\)</span>. Let's just set: <span
class="math display">\[X w_x = z_x\qquad\text{and}\qquad Y w_y =
z_y\]</span></p>
<p>Where <span class="math inline">\(X\in \mathbb{R}^{n\times
p}\)</span>, <span class="math inline">\(w_x \in
\mathbb{R}^{p}\)</span>, <span class="math inline">\(z_x\in
\mathbb{R}^n\)</span>, <span class="math inline">\(Y\in
\mathbb{R}^{n\times q}\)</span>, <span class="math inline">\(w_y \in
\mathbb{R}^{q}\)</span>, <span class="math inline">\(z_y\in
\mathbb{R}^n\)</span>. <span class="math inline">\(w_x\)</span> and
<span class="math inline">\(w_y\)</span> are often referred to as canonical
weight vectors, <span class="math inline">\(z_x\)</span> and <span
class="math inline">\(z_y\)</span> are called images, also known as canonical
variates or canonical scores. To simplify the problem, we assume <span
class="math inline">\(X\)</span> and <span
class="math inline">\(Y\)</span> are standardized to zero mean and unit
variance. Our task is to maximize the cosine of the angle between <span
class="math inline">\(z_x\)</span> and <span
class="math inline">\(z_y\)</span>, meaning:</p>
<p><span class="math display">\[\max_{z_x, z_y \in \mathbb{R}^n}
\langle z_x, z_y\rangle=\max \cos(z_x, z_y)=\max\frac{\langle z_x,
z_y\rangle}{\|z_x\|\|z_y\|}\]</span></p>
<p>subject to: <span class="math inline">\(\|z_x\|_{2}=1,\quad
\|z_y\|_{2}=1\)</span>.</p>
<p>In fact, our task is just to project <span
class="math inline">\(X\)</span> and <span
class="math inline">\(Y\)</span> onto a new coordinate system by applying a
linear transformation to <span class="math inline">\(X\)</span> and
<span class="math inline">\(Y\)</span>.</p>
<h2 id="resolve-cca">Resolve CCA</h2>
<p>There are many solutions to this problem. Before we start, we need to make
some assumptions: each column vector of <span
class="math inline">\(X\)</span> is perpendicular to the others, which
means <span class="math inline">\(X^T X= I\)</span>. The assumption is
the same for <span class="math inline">\(Y\)</span> and <span
class="math inline">\(w_x, w_y\)</span>. We can find <span
class="math inline">\(\min(p,q)\)</span> canonical components, and the
<span class="math inline">\(r\)</span>th component is orthogonal to all
the previous <span class="math inline">\(r-1\)</span> components.</p>
<h4 id="resolve-cca-through-svd">Resolve CCA through SVD</h4>
<p>To solve the CCA problem using SVD, we first introduce the joint
covariance matrix <span class="math inline">\(C\)</span> such that:
<span class="math display">\[\begin{equation}
    C = \begin{pmatrix}
        C_{xx} &amp; C_{xy}\\
        C_{yx} &amp; C_{yy}\\
    \end{pmatrix}
\end{equation}\]</span> Where <span
class="math inline">\(C_{xx}=\frac{1}{n-1}X^\top X\)</span> and <span
class="math inline">\(C_{yy}=\frac{1}{n-1}Y^\top Y\)</span> are the
empirical variance matrices of <span
class="math inline">\(X\)</span> and <span
class="math inline">\(Y\)</span> respectively. The <span
class="math inline">\(C_{xy}=\frac{1}{n-1} X^\top Y\)</span> is the
covariance matrix between <span class="math inline">\(X\)</span> and
<span class="math inline">\(Y\)</span>.</p>
<p>Next, we can reformulate the CCA problem with two linear transformations <span
class="math inline">\(w_x\)</span> and <span
class="math inline">\(w_y\)</span> such that:</p>
<p><span class="math display">\[\begin{equation}
w_x^\top C_{xx} w_x = I_p, \quad w_y^\top C_{yy} w_y = I_q, \quad
w_x^\top C_{xy} w_y = D
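\end{equation}\]</span> where <span class="math inline">\(I_p\)</span> and <span class="math inline">\(I_q\)</span> are the <span class="math inline">\(p\)</span>-dimensional and
<span class="math inline">\(q\)</span>-dimensional identity matrices respectively. The diagonal matrix <span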
class="math inline">\(D = \text{diag}(\gamma_i)\)</span> so that:</p>
<p><span class="math display">\[\begin{equation}
    \begin{pmatrix}
        {w}_x^\top &amp; { 0}\\
        { 0} &amp;  {w}_y^\top
        \end{pmatrix}
        \begin{pmatrix}
        C_{xx} &amp; C_{xy}\\
        C_{yx} &amp; C_{yy}
        \end{pmatrix}
        \begin{pmatrix}
         {w}_x &amp; { 0}\\
        { 0} &amp;  {w}_y
        \end{pmatrix}
        =
        \begin{pmatrix}
        I_p &amp; D\\
        D^\top &amp; I_q
    \end{pmatrix},
\end{equation}\]</span></p>
<p>The canonical variables: <span class="math display">\[\begin{equation}
Z_x = Xw_x, \quad Z_y = Y w_y
\end{equation}\]</span> The diagonal elements <span
class="math inline">\(\gamma_i\)</span> of <span class="math inline">\(D\)</span> denote the canonical
correlations. Thus we find the linear compounds <span
class="math inline">\({Z}_x\)</span> and <span
class="math inline">\({Z}_y\)</span> to maximize the cross-correlations.
Since both <span class="math inline">\(C_{xx}\)</span> and <span
class="math inline">\(C_{yy}\)</span> are symmetric positive definite,
we can perform Cholesky Decomposition on them to get: <span
class="math display">\[\begin{equation}
    C_{xx} = C_{xx}^{\top/2} C_{xx}^{1/2}, \quad C_{yy} =
C_{yy}^{\top/2} C_{yy}^{1/2}
\end{equation}\]</span></p>
<p>where <span class="math inline">\(C_{xx}^{\top/2}\)</span> is the
transpose of <span class="math inline">\(C_{xx}^{1/2}\)</span>. Applying
the inverses of the square root factors symmetrically on the joint
covariance matrix <span class="math inline">\(C\)</span>, the matrix is
transformed into: <span class="math display">\[\begin{equation}
\begin{pmatrix}
    C_{xx}^{-\top/2} &amp; {\mathbf 0}\\
    {\mathbf 0} &amp; C_{yy}^{-\top/2}
    \end{pmatrix}
    \begin{pmatrix}
    C_{xx} &amp; C_{xy}\\
    C_{yx} &amp; C_{yy}
    \end{pmatrix}
    \begin{pmatrix}
    C_{xx}^{-1/2} &amp; {\mathbf 0}\\
    {\mathbf 0} &amp; C_{yy}^{-1/2}
    \end{pmatrix}
    =
    \begin{pmatrix}
    I_p &amp; C_{xx}^{-1/2}C_{xy}C_{yy}^{-1/2}\\
    C_{yy}^{-1/2}C_{yx}C_{xx}^{-1/2} &amp; I_q
\end{pmatrix}.
\end{equation}\]</span></p>
<p>The canonical correlation problem is reduced to that of finding an
SVD of a triple product: <span class="math display">\[\begin{equation}
    U^{\top} (C_{xx}^{-1/2}C_{xy}C_{yy}^{-1/2}) V = D.
\end{equation}\]</span> The matrix <span
class="math inline">\(C\)</span> is thus reduced to the joint covariance
matrix by applying a two-sided Jacobi method such that: <span
class="math display">\[\begin{equation}
    \begin{pmatrix}
        U^\top &amp; {\mathbf 0}\\
        {\mathbf 0} &amp; V^\top
    \end{pmatrix}
    \begin{pmatrix}
        I_p &amp; C_{xx}^{-1/2}C_{xy}C_{yy}^{-1/2}\\
        C_{yy}^{-1/2}C_{yx}C_{xx}^{-1/2} &amp; I_q
    \end{pmatrix}
    \begin{pmatrix}
        U &amp; {\mathbf 0}\\
        {\mathbf 0} &amp; V
    \end{pmatrix} =
    \begin{pmatrix}
    I_p &amp; D\\
    D^\top &amp; I_q
    \end{pmatrix}
\end{equation}\]</span></p>
<p>with the desired transformation <span
class="math inline">\({w}_x\)</span> and <span
class="math inline">\({w}_y\)</span>: <span
class="math display">\[\begin{equation}
    {w}_x = C_{xx}^{-1/2} U, \quad {w}_y = C_{yy}^{-1/2}V
\end{equation}\]</span> where the singular values <span
class="math inline">\(\gamma_i\)</span> are in descending order such
that: <span class="math display">\[\begin{equation}
    \gamma_1 \geq \gamma_2 \geq \cdots \geq 0.
\end{equation}\]</span></p>
<h4 id="resolve-cca-through-standard-eigenvalue-problem">Resolve CCA
through Standard EigenValue Problem</h4>
<p>The problem can be reformulated as the following optimization problem: <span
class="math display">\[\begin{equation}
\underset{w_x \in \mathbb{R}^p, w_y\in \mathbb{R}^q}{\arg \max} w_x^\top
C_{xy} w_y
\end{equation}\]</span> subject to <span
class="math inline">\(\|w_x\|_{C_{xx}} = \sqrt{w_x^\top
C_{xx} w_x}=1\)</span> and <span class="math inline">\(\|w_y\|_{C_{yy}}
= \sqrt{w_y^\top C_{yy} w_y}=1\)</span>. The problem
can be solved with the Lagrange multiplier technique. Let us construct
the Lagrangian <span class="math inline">\(L\)</span> such
that: <span class="math display">\[\begin{equation}
    L = w_x^\top C_{xy} w_y - \frac{\rho_1}{2} w_x^\top C_{xx} w_x -
\frac{\rho_2}{2} w_y^\top C_{yy} w_y
    \end{equation}\]</span></p>
<p>Setting the derivatives of <span class="math inline">\(L\)</span> with respect to <span class="math inline">\(w_x\)</span>
and <span class="math inline">\(w_y\)</span> to zero gives: <span
class="math display">\[\begin{equation}
\begin{aligned}
\frac{\partial L}{\partial w_x} = C_{xy} w_y - \rho_1 C_{xx}w_x =
\mathbf{0}\\
\frac{\partial L}{\partial w_y} = C_{yx} w_x - \rho_2 C_{yy}w_y =
\mathbf{0}
\end{aligned}
\end{equation}\]</span></p>
<p>By left-multiplying the above equations by <span class="math inline">\(w_x^\top\)</span> and <span
class="math inline">\(w_y^\top\)</span> respectively, we have:</p>
<p><span class="math display">\[\begin{equation}
\begin{aligned}
w_x^\top C_{xy} w_y -\rho_1 w_x^\top C_{xx} w_x = 0\\
w_y^\top C_{yx} w_x -\rho_2 w_y^\top C_{yy} w_y = 0
\end{aligned}
\end{equation}\]</span> Since <span class="math inline">\(w_x^\top C_{xx} w_x = 1\)</span> and <span class="math inline">\(w_y^\top C_{yy} w_y = 1\)</span>, and <span class="math inline">\(w_x^\top C_{xy} w_y = w_y^\top C_{yx} w_x\)</span>, we
can obtain that <span class="math inline">\(\rho_1 = \rho_2 =
\rho\)</span>. By substituting <span class="math inline">\(\rho\)</span>
into the first stationarity condition, we get: <span
class="math display">\[\begin{equation}
w_x = \frac{C_{xx}^{-1}C_{xy}w_y}{\rho}
\end{equation}\]</span> Eventually we have the equation: <span
class="math display">\[\begin{equation}
C_{yx} C_{xx}^{-1} C_{xy} w_y = \rho^2 C_{yy} w_y
\end{equation}\]</span> Obviously, this has the form of an eigenvalue
decomposition problem where all eigenvalues are greater than or equal to
zero. By solving the eigenvalue decomposition we can find <span
class="math inline">\(w_x\)</span> and <span
class="math inline">\(w_y\)</span>.</p>

      
    </div>

    
    <footer class="post-footer">
        <div class="post-eof"></div>
      
    </footer>
  </article>
</div>


<div class="post-block">
  
  
  <article itemscope itemtype="http://schema.org/Article" class="post-content" lang="">
    <link itemprop="mainEntityOfPage" href="http://chengmingbo.github.io/2019/03/10/add-comments/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="image" content="/images/avatar.gif">
      <meta itemprop="name" content="Mingbo Cheng">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="Mingbo">
      <meta itemprop="description" content="Mingbo">
    </span>

    <span hidden itemprop="post" itemscope itemtype="http://schema.org/CreativeWork">
      <meta itemprop="name" content="undefined | Mingbo">
      <meta itemprop="description" content="">
    </span>
      <header class="post-header">
        <h2 class="post-title" itemprop="name headline">
          <a href="/2019/03/10/add-comments/" class="post-title-link" itemprop="url">简单增加博客评论</a>
        </h2>

        <div class="post-meta-container">
          <div class="post-meta">
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-calendar"></i>
      </span>
      <span class="post-meta-item-text">发表于</span>

      <time title="创建时间：2019-03-10 20:55:48" itemprop="dateCreated datePublished" datetime="2019-03-10T20:55:48+01:00">2019-03-10</time>
    </span>

  
</div>

        </div>
      </header>

    
    <div class="post-body" itemprop="articleBody">
          <h2 id="博客的评论系统">博客的评论系统</h2>
<p>希望把博客的评论系统建立起来，之前使用的是disqus，重新部署的时候，页面大部分都无法显示。不想再用disqus。看到有人创造性的利用github作为载体建立评论系统，也就是Gitment了。
按照教程在github上设置了Gitment，惊闻Gitment需要请求服务，是作者搭的，作者已经不维护了。按照以下操作：
<figure class="highlight plaintext"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">npm i --save gitment</span><br></pre></td></tr></table></figure></p>
<p>修改自己js，连接自己搭建的服务器，WTF？ <figure class="highlight plaintext"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">/node_modules/gitment/dist/gitment.browser.js</span><br></pre></td></tr></table></figure>
详细修改过程可参照：
https://sherry0429.github.io/2019/02/12/gitment%E4%BF%AE%E5%A4%8D/</p>
<p>后继续寻觅其他可以评论系统，找到这篇文章：
https://wangjiezhe.com/posts/2018-10-29-Hexo-NexT-3/
根据此文章的教程安装了utterances。目前发现还是比较不错。只是现在看到的效果是全局评论。
issue-term不太了解具体，目前不想深入探究，仅仅设置pathname。</p>

      
    </div>

    
    <footer class="post-footer">
        <div class="post-eof"></div>
      
    </footer>
  </article>
</div>


<div class="post-block">
  
  
  <article itemscope itemtype="http://schema.org/Article" class="post-content" lang="">
    <link itemprop="mainEntityOfPage" href="http://chengmingbo.github.io/2019/03/10/Change-theme-to-Next/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="image" content="/images/avatar.gif">
      <meta itemprop="name" content="Mingbo Cheng">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="Mingbo">
      <meta itemprop="description" content="Mingbo">
    </span>

    <span hidden itemprop="post" itemscope itemtype="http://schema.org/CreativeWork">
      <meta itemprop="name" content="undefined | Mingbo">
      <meta itemprop="description" content="">
    </span>
      <header class="post-header">
        <h2 class="post-title" itemprop="name headline">
          <a href="/2019/03/10/Change-theme-to-Next/" class="post-title-link" itemprop="url">Change theme to Next</a>
        </h2>

        <div class="post-meta-container">
          <div class="post-meta">
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-calendar"></i>
      </span>
      <span class="post-meta-item-text">发表于</span>

      <time title="创建时间：2019-03-10 14:07:25" itemprop="dateCreated datePublished" datetime="2019-03-10T14:07:25+01:00">2019-03-10</time>
    </span>

  
</div>

        </div>
      </header>

    
    <div class="post-body" itemprop="articleBody">
          <h2 id="不再折腾主题乖乖的切到next">不再折腾主题，乖乖的切到Next</h2>
<p>又一次，今年年初的又一次，博客系统hexo下的maupassant主题又罢工了。由于年初的各种事情繁琐而多，我就放弃治疗博客系统了，也就是说，有新的博文也无法发出来，先不管那些报错了。</p>
<p>现在稍微腾出一点儿时间，准备把博客系统好好弄一下。其实最简单的办法，也是屡试不爽的方法就是把所有的环境重新安装一遍，显示hexo，再是maupassant。这次不灵了，hexo
generate之后一堆报错。我甚至觉得maupassant已经无法搞定了，搜索错误的关键词，发现没有人遇到与我相同的问题。最后是怀疑我文章里有公式的特殊字符，影响markdown
parse。修改了hexo-renderer-marked的js，仍然有问题。最后决定换其他主题了，然而只要把所有的文章迁移过来一定会报错。报错如下：
<figure class="highlight plaintext"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br></pre></td><td class="code"><pre><span class="line">INFO  Start processing</span><br><span class="line">FATAL Something&#x27;s wrong. Maybe you can find the solution here: http://hexo.io/docs/troubleshooting.html</span><br><span class="line">Template render error: (unknown path) [Line 65, Column 565]</span><br><span class="line">  expected variable end</span><br><span class="line">    at Object._prettifyError (/Users/chengmingbo/blog_deploy/blog/node_modules/nunjucks/src/lib.js:36:11)</span><br><span class="line">    at Template.render (/Users/chengmingbo/blog_deploy/blog/node_modules/nunjucks/src/environment.js:542:21)</span><br><span class="line">    at Environment.renderString (/Users/chengmingbo/blog_deploy/blog/node_modules/nunjucks/src/environment.js:380:17</span><br><span class="line">    ... ...</span><br></pre></td></tr></table></figure></p>
<p>好吧，一切从头来，一个文件一个文件的添加，每次hexo
generate一下。终于找到了一个有问题的文件。先注释掉再说，后面慢慢查是什么特殊字符引起的问题。好在可以更新博客了。</p>
<h3 id="反复">反复</h3>
<p>选定了Next主题，又出现了反复，加上评论系统disqus发现博客白屏了，只有一个文件头显示。可是加功能一时爽，调试火葬场。当时实在记不起来到底是加了什么使得博客又不工作了。只能重头再来。在找主题的过程中发现star排名第四的hexo-theme-apollo已经停止开发，作者一句话让我决定不再折腾什么主题了：
<figure class="highlight plaintext"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">专注文章内容的创作胜过博客样式的美观，祝各位玩的开心:</span><br></pre></td></tr></table></figure></p>
<h3 id="后续工作">后续工作</h3>
<ol type="1">
<li>追查出什么特殊字符引起了hexo generate出现问题</li>
<li>看是否能复原评论系统，如果不能先这样吧，只要不耽误写博文。</li>
</ol>

      
    </div>

    
    <footer class="post-footer">
        <div class="post-eof"></div>
      
    </footer>
  </article>
</div>


<div class="post-block">
  
  
  <article itemscope itemtype="http://schema.org/Article" class="post-content" lang="en">
    <link itemprop="mainEntityOfPage" href="http://chengmingbo.github.io/2017/08/07/Expectation-and-variance-of-poisson-distribution/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="image" content="/images/avatar.gif">
      <meta itemprop="name" content="Mingbo Cheng">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="Mingbo">
      <meta itemprop="description" content="Mingbo">
    </span>

    <span hidden itemprop="post" itemscope itemtype="http://schema.org/CreativeWork">
      <meta itemprop="name" content="undefined | Mingbo">
      <meta itemprop="description" content="">
    </span>
      <header class="post-header">
        <h2 class="post-title" itemprop="name headline">
          <a href="/2017/08/07/Expectation-and-variance-of-poisson-distribution/" class="post-title-link" itemprop="url">Expectation and Variance of Poisson Distribution</a>
        </h2>

        <div class="post-meta-container">
          <div class="post-meta">
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-calendar"></i>
      </span>
      <span class="post-meta-item-text">发表于</span>

      <time title="创建时间：2017-08-07 21:49:00" itemprop="dateCreated datePublished" datetime="2017-08-07T21:49:00+02:00">2017-08-07</time>
    </span>

  
</div>

        </div>
      </header>

    
    <div class="post-body" itemprop="articleBody">
          <p>Pmf of Poisson Distribution is as follows:</p>
<p><span class="math display">\[f(X=k;\lambda)=\frac{\lambda^k
e^{-\lambda}}{k!}\]</span></p>
<p>Our aim is to derive the expectation <span
class="math inline">\(E(X)\)</span> and the variance <span
class="math inline">\(Var(X)\)</span>. By definition, the
expectation is: <span class="math display">\[
E(X)=\sum_{k=0}^{\infty} k \frac{\lambda^k e^{-\lambda }}{k!}
\]</span></p>
<p>Notice that when <span class="math inline">\(k=0\)</span>, the
corresponding term of the sum is equal to 0, that is:</p>
<p><span class="math display">\[k
\frac{\lambda^ke^{-\lambda}}{k!}\Bigg|_{k=0}=0\]</span></p>
<p>Then the formula becomes:</p>
<p><span class="math display">\[E(X)=\sum_{k=1}^{\infty} k
\frac{\lambda^ke^{-\lambda}}{k!}\]</span></p>
<p><span
class="math display">\[\begin{aligned}E(X)&amp;=\sum_{k=1}^{\infty} k
\frac{\lambda^ke^{-\lambda}}{k!}=\sum_{k=1}^{\infty}
\frac{\lambda^ke^{-\lambda}}{(k-1)!}\\&amp;=\sum_{k=1}^{\infty}  \frac{\lambda^{k-1}\lambda
e^{-\lambda}}{(k-1)!}\\&amp;=\lambda
e^{-\lambda}\sum_{k=1}^{\infty}\frac{\lambda^{k-1}}{(k-1)!}\end{aligned}\]</span></p>
<p>Now we need to take advantage of the Taylor expansion. Recall that:</p>
<p><span
class="math display">\[e^x=1+x+\frac{x^2}{2!}+\frac{x^3}{3!}+\cdots+\frac{x^{k-1}}{(k-1)!}+\cdots=\sum_{k=1}^{\infty}\frac{x^{k-1}}{(k-1)!}\]</span></p>
<p>Comparing this with the expression for <span class="math inline">\(E(X)\)</span>, we get:</p>
<p><span class="math display">\[E(X)=\lambda
e^{-\lambda}e^\lambda=\lambda\]</span></p>
<p>Since <span
class="math inline">\(Var(X)=E(X^2)-(E(X))^2\)</span>, we only need to compute <span
class="math inline">\(E(X^2)\)</span>. Given that:</p>
<p><span class="math display">\[E(X)=\sum_{k=1}^{\infty} k
\frac{\lambda^ke^{-\lambda}}{k!}=\lambda\]</span></p>
<p>we can use this formula to derive the <span
class="math inline">\(E(X^2)\)</span>,</p>
<p><span
class="math display">\[\begin{aligned}E(X)=&amp;\sum_{k=1}^{\infty} k
\frac{\lambda^ke^{-\lambda}}{k!}=\lambda\\\Leftrightarrow&amp;\sum_{k=1}^{\infty}
k \frac{\lambda^k}{k!}=\lambda
e^{\lambda}\\\Leftrightarrow&amp;\frac{\partial\sum_{k=1}^{\infty} k
\frac{\lambda^k}{k!}}{\partial \lambda}=\frac{\partial \lambda
e^{\lambda}}{\partial
\lambda}\\\Leftrightarrow&amp;\sum_{k=1}^{\infty}k^2\frac{\lambda^{k-1}}{k!}=e^\lambda+\lambda
e^\lambda\\\Leftrightarrow&amp;\sum_{k=1}^{\infty}k^2\frac{\lambda^{k-1}e^{-\lambda}}{k!}=1+\lambda
\\\Leftrightarrow&amp;\sum_{k=1}^{\infty}k^2\frac{\lambda^{k}e^{-\lambda}}{k!}=\lambda+\lambda^2=E(X^2)\end{aligned}\]</span></p>
<p>then,</p>
<p><span
class="math display">\[Var(X)=E(X^2)-(E(X))^2=\lambda+\lambda^2-(\lambda)^2=\lambda\]</span></p>
<p>Thus, we have proved that the expectation and the variance of the Poisson
distribution are both <span class="math inline">\(\lambda\)</span>.</p>

      
    </div>

    
    <footer class="post-footer">
        <div class="post-eof"></div>
      
    </footer>
  </article>
</div>


<div class="post-block">
  
  
  <article itemscope itemtype="http://schema.org/Article" class="post-content" lang="en">
    <link itemprop="mainEntityOfPage" href="http://chengmingbo.github.io/2017/07/09/Gaussian-Disriminant-Analysis/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="image" content="/images/avatar.gif">
      <meta itemprop="name" content="Mingbo Cheng">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="Mingbo">
      <meta itemprop="description" content="Mingbo">
    </span>

    <span hidden itemprop="post" itemscope itemtype="http://schema.org/CreativeWork">
      <meta itemprop="name" content="undefined | Mingbo">
      <meta itemprop="description" content="">
    </span>
      <header class="post-header">
        <h2 class="post-title" itemprop="name headline">
          <a href="/2017/07/09/Gaussian-Disriminant-Analysis/" class="post-title-link" itemprop="url">Gaussian Discriminant Analysis</a>
        </h2>

        <div class="post-meta-container">
          <div class="post-meta">
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-calendar"></i>
      </span>
      <span class="post-meta-item-text">发表于</span>

      <time title="创建时间：2017-07-09 18:54:33" itemprop="dateCreated datePublished" datetime="2017-07-09T18:54:33+02:00">2017-07-09</time>
    </span>

  
</div>

        </div>
      </header>

    
    <div class="post-body" itemprop="articleBody">
          <h3 id="preface">Preface</h3>
<p>There are many classification algorithms, such as Logistic Regression,
SVM and Decision Tree. Today we'll talk about the Gaussian Discriminant
Analysis (GDA) algorithm, which is not so popular. Actually, Logistic
Regression performs better than GDA because it can fit any
distribution from the exponential family. However, from this algorithm we can learn more
about the Gaussian distribution, which is the
most important distribution in statistics. Furthermore, if you want to
understand the Gaussian Mixture Model or Factor Analysis, GDA is a good
start.</p>
<p>We first talk about the Gaussian Distribution and the Multivariate
Gaussian Distribution; in that section, you'll see plots of Gaussian
distributions with different parameters. Then we will learn the GDA
classification algorithm. Finally, we'll apply GDA to a dataset and see the
result.</p>
<h3 id="multivariate-gaussian-distribution">Multivariate Gaussian
Distribution</h3>
<h4 id="gaussian-distribution">Gaussian Distribution</h4>
<p>As we know, the pdf (Probability Density Function) of a
Gaussian distribution is a bell curve, which is determined by two
parameters <span class="math inline">\(\mu\)</span> and <span
class="math inline">\(\sigma^2\)</span>. The figure below shows us a
Gaussian distribution with <span class="math inline">\(\mu=0\)</span>
and <span class="math inline">\(\sigma^2=1\)</span>. A Gaussian distribution is often
denoted by <span class="math inline">\(\mathcal{N}(\mu,
\sigma^2)\)</span>. Thus, Figure 1 shows the distribution <span
class="math inline">\(\mathcal{N}(0,1)\)</span>. <img
src="http://cmb.oss-cn-qingdao.aliyuncs.com/2017-07-01-normal_1.jpg" alt="Bell-shaped pdf of the Gaussian distribution with mean 0 and variance 1" />
Figure 1. Gaussian Distribution with <span
class="math inline">\(\mu=0\)</span> and <span
class="math inline">\(\sigma^2=1\)</span>.</p>
<p>Actually, the parameters <span class="math inline">\(\mu\)</span> and
<span class="math inline">\(\sigma^2\)</span> are exactly the mean and
the variance of the distribution. Therefore, <span
class="math inline">\(\sigma\)</span> is the standard deviation of the normal
distribution. Let's take a look at the areas between the red lines and between the magenta
lines, which mark the ranges <span
class="math inline">\(\mu\pm\sigma\)</span> and <span
class="math inline">\(\mu\pm2\sigma\)</span> respectively. The area between the red lines
accounts for 68.3% of the total area under the curve. That is,
68.3% of the samples lie between <span
class="math inline">\(\mu-\sigma\)</span> and <span
class="math inline">\(\mu+\sigma\)</span>. Likewise, 95.4% of the
samples lie between <span class="math inline">\(\mu-2\sigma\)</span> and
<span class="math inline">\(\mu+2\sigma\)</span>.</p>
<p>You may want to know how these two parameters influence the shape of
the pdf of a Gaussian distribution. First of all, when we change <span
class="math inline">\(\mu\)</span> with fixed <span
class="math inline">\(\sigma^2\)</span>, the curve keeps the same shape
but moves along the random variable axis.</p>
<p><img
src="http://cmb.oss-cn-qingdao.aliyuncs.com/2017-07-01-normal_2.jpg" /></p>
<p>Figure 2. Probability Density Function curves with <span
class="math inline">\(\mu=\pm2\)</span> and <span
class="math inline">\(\sigma=1\)</span>.</p>
<p>So, what happens when we change <span
class="math inline">\(\sigma\)</span>? Figure 3 illustrates that
a smaller <span class="math inline">\(\sigma\)</span> leads to a sharper
pdf. Conversely, a larger <span
class="math inline">\(\sigma\)</span> gives a broader curve.</p>
<p><img
src="http://cmb.oss-cn-qingdao.aliyuncs.com/2017-07-01-normal_3.jpg" /></p>
<p>Figure 3. Probability Density Function curves with <span
class="math inline">\(\mu=0\)</span> and varying <span
class="math inline">\(\sigma\)</span>.</p>
<p>Some may wonder what the form of <span
class="math inline">\(p(x)\)</span> is for a Gaussian distribution. I
show it here so that you can compare the normal distribution with the Multivariate
Gaussian.</p>
<p><span class="math display">\[\mathcal{N}(x|\mu,
\sigma^2)=\frac{1}{\sqrt{2\pi}\sigma}
e^{-\frac{1}{2}\frac{(x-\mu)^2}{\sigma^2 } } \]</span></p>
<h4 id="multivariate-gaussian">Multivariate Gaussian</h4>
<p>For convenience, we first look at the form of the Multivariate Gaussian
Distribution:</p>
<p><span class="math display">\[\mathcal{N}(x|\mu, \Sigma)=\frac{1}{ {
(2\pi)}^{\frac{d}{2 } } |\Sigma|^{\frac{1}{2 } } }
e^{-\frac{1}{2}(x-\mu)^T \Sigma^{-1}(x-\mu)}\]</span></p>
<p>where <span class="math inline">\(\mu\)</span> is the mean, <span
class="math inline">\(\Sigma\)</span> is the covariance matrix, and <span
class="math inline">\(d\)</span> is the dimension of the random variable
<span class="math inline">\(x\)</span>. Specifically, for a 2-dimensional
Gaussian distribution (<span class="math inline">\(d=2\)</span>), we have:</p>
<p><span class="math display">\[\mathcal{N}(x|\mu,
\Sigma)=\frac{1}{2\pi|\Sigma|^{\frac{1}{2 } } }
e^{-\frac{1}{2}(x-\mu)^T \Sigma^{-1}(x-\mu)}\]</span></p>
<p>In order to get an intuition for the Multivariate Gaussian Distribution,
we first take a look at a distribution with <span
class="math inline">\(\mu=\begin{pmatrix}0\\0\end{pmatrix}\)</span> and
<span
class="math inline">\(\Sigma=\begin{pmatrix}1&amp;0\\0&amp;1\end{pmatrix}\)</span>.</p>
<p><img
src="http://cmb.oss-cn-qingdao.aliyuncs.com/2017-07-01-mvn_1.jpg" /></p>
<p>Figure 4. 2-dimensional gaussian distribution with <span
class="math inline">\(\mu=\begin{pmatrix}0\\0\end{pmatrix}\)</span> and
<span
class="math inline">\(\Sigma=\begin{pmatrix}1&amp;0\\0&amp;1\end{pmatrix}\)</span>.</p>
<p>Notice that the figure is no longer a curve but a 3-dimensional
surface. Just as <span
class="math inline">\(\sigma\)</span> does for the univariate normal pdf, <span
class="math inline">\(\Sigma\)</span> determines the shape of the
figure. However, there are 4 entries of <span
class="math inline">\(\Sigma\)</span> that can be changed in this example.
Given that we need to compute <span class="math inline">\(|\Sigma|\)</span>
in the denominator as well as <span class="math inline">\(\Sigma^{-1}\)</span>,
which requires a non-zero determinant of <span
class="math inline">\(\Sigma\)</span>, we must keep in mind that <span
class="math inline">\(|\Sigma|\)</span> has to be positive.</p>
<h5 id="change-mu">1. change <span
class="math inline">\(\mu\)</span></h5>
<p>Rather than changing <span class="math inline">\(\Sigma\)</span>, we
first take a look at what the contour looks like when changing <span
class="math inline">\(\mu\)</span>. Figure 5 illustrates the contour
variation when changing <span class="math inline">\(\mu\)</span>. As we
can see, only the center of the contour moves as
<span class="math inline">\(\mu\)</span> varies, i.e. <span
class="math inline">\(\mu\)</span> determines the position of the pdf rather
than its shape. Next, we will see how the entries in <span
class="math inline">\(\Sigma\)</span> influence the shape of the pdf.</p>
<p><img
src="http://cmb.oss-cn-qingdao.aliyuncs.com/2017-07-01-contour.jpg" /></p>
<p>Figure 5. Contours when changing <span
class="math inline">\(\mu\)</span> with <span
class="math inline">\(\Sigma=\begin{pmatrix}1&amp;0\\0&amp;1\end{pmatrix}\)</span>.</p>
<h5 id="change-diagonal-entries-of-sigma">2. change diagonal entries of
<span class="math inline">\(\Sigma\)</span></h5>
<p>When we scale the diagonal entries, we can see from Figure 6 that the samples are
concentrated in a smaller range when we change <span
class="math inline">\(\Sigma\)</span> from <span
class="math inline">\(\begin{pmatrix}1&amp;0\\0&amp;1\end{pmatrix}\)</span>
to <span
class="math inline">\(\begin{pmatrix}0.3&amp;0\\0&amp;0.3\end{pmatrix}\)</span>.
Similarly, if we alter <span class="math inline">\(\Sigma\)</span> to
<span
class="math inline">\(\begin{pmatrix}3&amp;0\\0&amp;3\end{pmatrix}\)</span>,
the figure will spread out.</p>
<p><img
src="http://cmb.oss-cn-qingdao.aliyuncs.com/2017-07-01-0.3000.3.jpg" /></p>
<p>Figure 6. Density when scaling diagonal entries to 0.3.</p>
<p>What if we change only one entry of the diagonal? Figure 7 shows the
variation of the density when we change <span
class="math inline">\(\Sigma\)</span> to <span
class="math inline">\(\begin{pmatrix}1&amp;0\\0&amp;5\end{pmatrix}\)</span>.
Notice that the parameter squashes and stretches the figure along the coordinate
axes.</p>
<p><img
src="http://cmb.oss-cn-qingdao.aliyuncs.com/2017-07-01-1005.jpg" /></p>
<p>Figure 7. Density when scaling one of the diagonal entries.</p>
<h5 id="change-secondary-diagonal-entries-of-sigma">3. change secondary
diagonal entries of <span class="math inline">\(\Sigma\)</span></h5>
<p>We now try to change entries along secondary diagonal. Figure 8.
demonstrates that the variation of density is no longer parallel to
<span class="math inline">\(X\)</span> and <span
class="math inline">\(Y\)</span> axis, where <span
class="math inline">\(\Sigma=\begin{pmatrix}1
&amp;0.5\\0.5&amp;1\end{pmatrix}\)</span>.</p>
<p><img
src="http://cmb.oss-cn-qingdao.aliyuncs.com/2017-07-01-10.50.51.jpg" /></p>
<p>Figure 8. Density when scaling secondary diagonal entries to 0.5</p>
<p>When we alter secondary entries to negative 0.5, the direction of
contour presents a mirror to contour when positive.</p>
<p><img
src="http://cmb.oss-cn-qingdao.aliyuncs.com/2017-07-01-1-0.5-0.51.jpg" /></p>
<p>Figure 9. Density when scaling secondary diagonal entries to -0.5</p>
<p>In light of the importance of the determinant of <span
class="math inline">\(\Sigma\)</span>, what will happen if the
determinant is close to zero? Informally, we can think of the
determinant of a matrix as the volume it spans. Similarly, when the
determinant gets smaller, the region over which the density is spread becomes smaller.
Figure 10 illustrates this situation, where <span
class="math inline">\(\Sigma=\begin{pmatrix}1&amp;0.99\\0.99&amp;1\end{pmatrix}\)</span>.</p>
<p><img
src="http://cmb.oss-cn-qingdao.aliyuncs.com/2017-07-01-10.990.991.jpg" /></p>
<p>Figure 10. Density when determinant is close to zero.</p>
<h3 id="gaussian-discriminant-analysis">Gaussian Discriminant
Analysis</h3>
<h4 id="intuition">Intuition</h4>
<p>When the input features <span class="math inline">\(x\)</span> are
continuous variables, we can use GDA to classify data. Firstly, let's take
a look at how GDA does the job. Figure 11 shows us two Gaussian
distributions that share the same covariance <span
class="math inline">\(\Sigma=\begin{pmatrix}1&amp;0\\0&amp;1\end{pmatrix}\)</span>
and have the parameters <span
class="math inline">\(\mu_0=\begin{pmatrix}1\\1\end{pmatrix}\)</span>
and <span
class="math inline">\(\mu_1=\begin{pmatrix}-1\\-1\end{pmatrix}\)</span> respectively.
Imagine you have some data which fall under the first and
second Gaussian distributions. If we can find such distributions to fit
the data, then we'll have the capacity to decide which distribution new data comes
from, the first or the second one.</p>
<p><img
src="http://cmb.oss-cn-qingdao.aliyuncs.com/2017-07-08-gda1.jpg" /></p>
<p>Figure 11. Two gaussian distributions with respect to <span
class="math inline">\(\mu_0=\begin{pmatrix}1\\1\end{pmatrix}\)</span>
and <span
class="math inline">\(\mu_1=\begin{pmatrix}-1\\-1\end{pmatrix}\)</span>
, and <span
class="math inline">\(\Sigma=\begin{pmatrix}1&amp;0\\0&amp;1\end{pmatrix}\)</span></p>
<p>Specifically, let's look at a concrete example, Figure 12 are samples
drawn from two Gaussian distribution. There are 100 blue '+'s and 100
red 'o's. Assume that we have such data to be classified. We can apply
GDA to solve the problem.</p>
<p>CODE:</p>
<figure class="highlight plaintext"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br></pre></td><td class="code"><pre><span class="line">%% octave</span><br><span class="line">pkg load statistics</span><br><span class="line"></span><br><span class="line">m=200;</span><br><span class="line">n=2;</span><br><span class="line">rp=mvnrnd([1 1],[1 0;0 1],m/2);%生成正样本1</span><br><span class="line">rn=mvnrnd([4 4],[1 0;0 1],m/2);%生成负样本0</span><br><span class="line">y=[ones(m/2,1);zeros(m/2,1)];</span><br><span class="line"></span><br><span class="line">figure;hold on;</span><br><span class="line"></span><br><span class="line">plot3(rp(:,1),rp(:,2),y(1:m/2,1),&#x27;b+&#x27;);</span><br><span class="line">plot3(rn(:,1),rn(:,2),y(m/2+1:m,1),&#x27;ro&#x27;);</span><br><span class="line">axis([-3 8 -3 8]);</span><br></pre></td></tr></table></figure>
<p><img
src="http://cmb.oss-cn-qingdao.aliyuncs.com/2017-07-08-samples.jpg" /></p>
<p>Figure 12. 200 samples drawn from two Gaussian Distribution with
parameters <span
class="math inline">\(\mu_0=\begin{bmatrix}1\\1\end{bmatrix},\mu_1=\begin{bmatrix}4\\4\end{bmatrix},\Sigma=\begin{bmatrix}1&amp;0\\0&amp;1\end{bmatrix}\)</span>.</p>
<h4 id="definition">Definition</h4>
<p>Now, let's define the algorithm. Firstly we assume discrete random
variable classes <span class="math inline">\(y\)</span> are distributed
Bernoulli and parameterized by <span
class="math inline">\(\phi\)</span>, then we have:</p>
<p><span class="math display">\[y\sim {\rm Bernoulli}(\phi)\]</span></p>
<p>Concretely, the probability of <span
class="math inline">\(y=1\)</span> is <span
class="math inline">\(\phi\)</span>, and <span
class="math inline">\(1-\phi\)</span> when <span
class="math inline">\(y=0\)</span>. We can combine the two equations into
one:</p>
<p><span
class="math inline">\(p(y|\phi)=\phi^y(1-\phi)^{1-y}\)</span></p>
<p>Apparently, <span class="math inline">\(p(y=1|\phi)=\phi\)</span> and
<span class="math inline">\(p(y=0|\phi)=1-\phi\)</span> given that y can
only be <span class="math inline">\(0\)</span> or <span
class="math inline">\(1\)</span>.</p>
<p>Another assumption is that we consider <span
class="math inline">\(x\)</span> are subject to different Gaussian
Distributions given different <span class="math inline">\(y\)</span>. We
assume the two Gaussian distributions share the same covariance and
different <span class="math inline">\(\mu\)</span>. Based on above all,
then</p>
<p><span class="math display">\[p(x|y=0)=\frac{1}{(2\pi)^{\frac{d}{2 } }
|\Sigma|^{\frac{1}{2 } }
}e^{-\frac{1}{2}(x-\mu_0)^{T}\Sigma^{-1}(x-\mu_0)}\]</span></p>
<p><span class="math display">\[p(x|y=1)=\frac{1}{(2\pi)^{\frac{d}{2 } }
|\Sigma|^{\frac{1}{2 } }
}e^{-\frac{1}{2}(x-\mu_1)^{T}\Sigma^{-1}(x-\mu_1)}\]</span></p>
<p>i.e. <span class="math inline">\(x|y=0 \sim
\mathcal{N}(\mu_0,\Sigma)\)</span> and <span class="math inline">\(x|y=1
\sim \mathcal{N}(\mu_1,\Sigma)\)</span>. Suppose we have <span
class="math inline">\(m\)</span> samples; it is hard to compute <span
class="math inline">\(p(x^{(1)}, x^{(2)},
x^{(3)},\cdots,x^{(m)}|y=0)\)</span> or <span
class="math inline">\(p(x^{(1)}, x^{(2)},
x^{(3)},\cdots,x^{(m)}|y=1)\)</span> directly. In general, we assume that the
probability of <span class="math inline">\(x^{(i)}\)</span>, <span
class="math inline">\(p(x^{(i)}|y=0)\)</span>, is independent of any other
<span class="math inline">\(p(x^{(j)}|y=0)\)</span>; then we have:</p>
<p><span
class="math display">\[p(X|y=0)=\prod_{i=1\,y^{(i)}\neq1}^{m}\frac{1}{(2\pi)^{\frac{d}{2
} } |\Sigma|^{\frac{1}{2 } }
}e^{-\frac{1}{2}(x^{(i)}-\mu_0)^{T}\Sigma^{-1}(x^{(i)}-\mu_0)}\]</span></p>
<p>Vice versa,</p>
<p><span class="math display">\[p(X|y=1)=\prod_{i=1\,y^{(i)}\neq
0}^{m}\frac{1}{(2\pi)^{\frac{d}{2 } } |\Sigma|^{\frac{1}{2 } }
}e^{-\frac{1}{2}(x^{(i)}-\mu_1)^{T}\Sigma^{-1}(x^{(i)}-\mu_1)}\]</span></p>
<p>Here <span class="math inline">\(X=(x^{(1)}, x^{(2)},
x^{(3)},\cdots,x^{(m)})\)</span>. Now, we want to maximize <span
class="math inline">\(p(X|y=0)\)</span> and <span
class="math inline">\(p(X|y=1)\)</span>. Why is that? Because we hope
to find the parameters that make <span
class="math inline">\(p(X|y=0)p(X|y=1)\)</span> as large as possible, given that
the samples come from the two Gaussian distributions: under such parameters, the samples we
have are the most likely to be observed. Thus, our task is to maximize <span
class="math inline">\(p(X|y=0)p(X|y=1)\)</span>, and we let</p>
<p><span
class="math display">\[\mathcal{L}(\phi,\mu_0,\mu_1,\Sigma)=
p(X|y=0)p(X|y=1)=\prod_{i=1}^{m}p(x^{(i)},
y^{(i)};\phi,\mu_0,\mu_1,\Sigma)\]</span></p>
<p>It's tough for us to maximize <span
class="math inline">\(\mathcal{L}(\phi,\mu_0,\mu_1,\Sigma)\)</span> directly.
Notice that the function <span class="math inline">\(\log\)</span> is monotonically
increasing. Thus, we can maximize <span
class="math inline">\(\log\mathcal{L}(\phi,\mu_0,\mu_1,\Sigma)\)</span>
instead of <span
class="math inline">\(\mathcal{L}(\phi,\mu_0,\mu_1,\Sigma)\)</span>,
then:</p>
<p><span
class="math display">\[\begin{aligned}\ell(\phi,\mu_0,\mu_1,\Sigma)&amp;=\log\mathcal{L}(\phi,\mu_0,\mu_1,\Sigma)\\&amp;=\log\prod_{i=1}^{m}p(x^{(i)},y^{(i)};\phi,\mu_0,\mu_1,\Sigma)\\&amp;=\log\prod_{i=1}^{m}p(x^{(i)}|y^{(i)};\mu_0,\mu_1,\Sigma)p(y^{(i)};\phi)\\&amp;=\sum_{i=1}^{m}\left[\log p(x^{(i)}|y^{(i)};\mu_0,\mu_1,\Sigma)+\log p(y^{(i)};\phi)\right]\\&amp;=\sum_{i=1}^{m}\log p(x^{(i)}|y^{(i)};\mu_0,\mu_1,\Sigma)+\sum_{i=1}^{m}\log p(y^{(i)};\phi)\end{aligned}\]</span></p>
<p>By now, we have found a convex function with respect to the parameters <span
class="math inline">\(\mu_0, \mu_1,\Sigma\)</span> and <span
class="math inline">\(\phi\)</span>. In the next section, we'll obtain these
parameters through partial derivatives.</p>
<h4 id="solution">Solution</h4>
<p>To estimate these four parameters, we just apply partial derivative
to <span class="math inline">\(\ell\)</span>. Now we estimate <span
class="math inline">\(\phi\)</span> in the first place. We let <span
class="math inline">\(\frac{\partial \ell}{\partial \phi}=0\)</span>,
then</p>
<p><span class="math display">\[\begin{aligned}\frac{\partial
\ell(\phi,\mu_0,\mu_1,\Sigma)}{\partial
\phi}=0&amp;\Rightarrow\frac{\partial
\left[\sum_{i=1}^{m}\log p(x^{(i)}|y^{(i)};\mu_0,\mu_1,\Sigma)+\sum_{i=1}^{m}\log p(y^{(i)};\phi)\right]}{\partial
\phi}=0\\&amp;\Rightarrow\frac{\partial\sum_{i=1}^{m}\log
p(y^{(i)};\phi)}{\partial
\phi}=0\\&amp;\Rightarrow\frac{\partial\sum_{i=1}^{m}\log \phi^{y^{(i) }
} (1-\phi)^{(1-y^{(i)}) } } {\partial
\phi}=0\\&amp;\Rightarrow\frac{\partial\sum_{i=1}^{m}{y^{(i) } } \log
\phi+{(1-y^{(i)})}\log(1-\phi)}{\partial
\phi}=0\\&amp;\Rightarrow\frac{\partial\sum_{i=1}^{m}{ {
1}{\{y^{(i)}=1\} } }\log \phi+{1}{\{y^{(i)}=0\} } \log(1-\phi)}{\partial
\phi}=0\\&amp;\Rightarrow\phi=\frac{1}{m}\sum_{i=1}^{m}1\{y^{(i)}=1\}\end{aligned}\]</span></p>
<p>Note that <span class="math inline">\(\mu_0\)</span> and <span
class="math inline">\(\mu_1\)</span> are symmetric in the equation; thus,
we only need to derive one of them. Here we take the derivative with respect to <span
class="math inline">\(\mu_0\)</span>:</p>
<p><span class="math display">\[\begin{aligned}\frac{\partial
\ell(\phi,\mu_0,\mu_1,\Sigma)}{\partial
\mu_0}=0&amp;\Rightarrow\frac{\partial
\left[\sum_{i=1}^{m}\log p(x^{(i)}|y^{(i)};\mu_0,\mu_1,\Sigma)+\sum_{i=1}^{m}\log p(y^{(i)};\phi)\right]}{\partial
\mu_0}=0\\&amp;\Rightarrow\frac{\partial\sum_{i=1}^{m} \log
p(x^{(i)}|y^{(i)};\mu_0,\mu_1,\Sigma)}{\partial
\mu_0}=0\\&amp;\Rightarrow\frac{\partial
\sum_{i=1}^{m}\log\frac{1}{(2\pi)^{\frac{d}{2 } } |\Sigma|^{\frac{1}{2 }
} }e^{-\frac{1}{2}(x^{(i)}-\mu_0)^T\Sigma^{-1}(x^{(i)}-\mu_0) } }
{\partial \mu_0}=0\\&amp;\Rightarrow0+\frac{\partial
\sum_{i=1}^{m}{-\frac{1}{2}(x^{(i)}-\mu_0)^T\Sigma^{-1}(x^{(i)}-\mu_0) }
} {\partial \mu_0}=0\end{aligned}\]</span></p>
<p>We have <span class="math inline">\(\frac{\partial X^TAX}{\partial
X}=(A+A^T)X\)</span>. Let <span
class="math inline">\((x^{(i)}-\mu_0)=X\)</span>, then,</p>
<p><span class="math display">\[\begin{aligned}\frac{\partial
\ell(\phi,\mu_0,\mu_1,\Sigma)}{\partial \mu_0}=0&amp;\Rightarrow
0+\frac{\partial
\sum_{i=1}^{m}{-\frac{1}{2}(x^{(i)}-\mu_0)^T\Sigma^{-1}(x^{(i)}-\mu_0) }
} {\partial
\mu_0}=0\\&amp;\Rightarrow{\sum_{i=1}^{m}-\frac{1}{2}((\Sigma^{-1})^T+\Sigma^{-1})(x^{(i)}-\mu_0)\cdot(-1)}=0\\&amp;\Rightarrow
\sum_{i=1}^{m}1\{y^{(i)}=0\}x^{(i)}=\sum_{i=1}^{m}1\{y^{(i)}=0\}\mu_0\\&amp;\Rightarrow\mu_0=\frac{\sum_{i=1}^{m}1\{y^{(i)}=0\}x^{(i)
} } {\sum_{i=1}^{m}1\{y^{(i)}=0\} } \end{aligned}\]</span></p>
<p>Similarly,</p>
<p><span
class="math display">\[\mu_1=\frac{\sum_{i=1}^{m}1\{y^{(i)}=1\}x^{(i) }
} {\sum_{i=1}^{m}1\{y^{(i)}=1\} } \]</span></p>
<p>Before calculating <span class="math inline">\(\Sigma\)</span>, I first
state the facts that <span
class="math inline">\(\frac{\partial|\Sigma|}{\partial\Sigma}=|\Sigma|\Sigma^{-1},\quad
\frac{\partial\Sigma^{-1 } } {\partial\Sigma}=-\Sigma^{-2}\)</span>,
then</p>
<p><span class="math display">\[\begin{aligned}\frac{\partial
\ell(\phi,\mu_0,\mu_1,\Sigma)}{\partial
\Sigma}=0&amp;\Rightarrow\frac{\partial\sum_{i=1}^{m} \log
p(x^{(i)}|y^{(i)};\mu_0,\mu_1,\Sigma)+\sum_{i=1}^{m} \log
p(y^{(i)};\phi)}{\partial \Sigma}=0\\&amp;\Rightarrow\frac{\partial
\sum_{i=1}^{m}\log\frac{1}{(2\pi)^{\frac{d}{2 } } |\Sigma|^{\frac{1}{2 }
} }e^{-\frac{1}{2}(x^{(i)}-\mu_{y^{(i) } }
)^T\Sigma^{-1}(x^{(i)}-\mu_{y^{(i) } } ) } } {\partial
\Sigma}=0\\&amp;\Rightarrow\frac{\partial
\sum_{i=1}^{m}-\frac{d}{2}\log2\pi}{\partial \Sigma}+\frac{\partial
\sum_{i=1}^{m}-\frac{1}{2}\log|\Sigma|}{\partial \Sigma}+\frac{\partial
\sum_{i=1}^{m}{-\frac{1}{2}(x^{(i)}-\mu_{y^{(i) } }
)^T\Sigma^{-1}(x^{(i)}-\mu_{y^{(i) } } ) } } {\partial
\Sigma}=0\\&amp;\Rightarrow\frac{\partial
\sum_{i=1}^{m}-\frac{1}{2}\log|\Sigma|}{\partial \Sigma}+\frac{\partial
\sum_{i=1}^{m}{-\frac{1}{2}(x^{(i)}-\mu_{y^{(i) } }
)^T\Sigma^{-1}(x^{(i)}-\mu_{y^{(i) } } ) } } {\partial
\Sigma}=0\\&amp;\Rightarrow
m\frac{1}{|\Sigma|}|\Sigma|\Sigma^{-1}+\sum_{i=1}^m(x^{(i)}-\mu_{y^{(i)
} } )(x^{(i)}-\mu_{y^{(i) } }
)^T(-\Sigma^{-2})=0\\&amp;\Rightarrow\Sigma=\frac{1}{m}\sum_{i=1}^{m}(x^{(i)}-\mu_{y^{(i)
} } )(x^{(i)}-\mu_{y^{(i) } } )^T\end{aligned}\]</span></p>
<p>In spite of the harshness of the derivation, the outcome is pretty
beautiful. Next, we will apply these parameters and see how the
estimation performs.</p>
<h3 id="apply-gda">Apply GDA</h3>
<p>Notice the data drawn from two Gaussian Distribution is random, thus,
if you run the code, the outcome may be different. However, in most
cases, distributions drawn by estimated parameters are roughly the same
as the original distributions.</p>
<p><span
class="math display">\[\begin{aligned}&amp;\phi=0.5\\&amp;\mu_0=\begin{bmatrix}4.0551\\4.1008\end{bmatrix}\\&amp;\mu_1=\begin{bmatrix}0.85439\\1.03622\end{bmatrix}\\&amp;\Sigma=\begin{bmatrix}1.118822&amp;-0.058976\\-0.058976&amp;1.023049\end{bmatrix}\end{aligned}\]</span></p>
<p>From Figure 13, We can see contours of two Gaussian distribution, and
most of samples are correctly classified.</p>
<p>CODE:</p>
<figure class="highlight plaintext"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br></pre></td><td class="code"><pre><span class="line">%% octave </span><br><span class="line">phi=length(find(y==1))/m;</span><br><span class="line">mu_0 = sum(rn)/length(find(y==0))</span><br><span class="line">mu_1 = sum(rp)/length(find(y==1))</span><br><span class="line">X = [rp;rn];</span><br><span class="line">X_mu1 = X(find(y==1),:)-mu_1;</span><br><span class="line">X_mu0 = X(find(y==0),:)-mu_0;</span><br><span class="line">X_mu = [X_mu1; X_mu0];</span><br><span class="line">sigma = (X_mu&#x27;*X_mu)/m</span><br><span class="line"></span><br><span class="line">[x1 y1]=meshgrid(linspace(-3,8,100)&#x27;,linspace(-3,8,100)&#x27;);</span><br><span class="line">X1=[x1(:) y1(:)];</span><br><span class="line">z1=mvnpdf(X1,mu_1,sigma);</span><br><span class="line">contour(x1,y1,reshape(z1,100,100),8);</span><br><span class="line">hold on;</span><br><span class="line">z2=mvnpdf(X1,mu_0,sigma);</span><br><span class="line">contour(x1,y1,reshape(z2,100,100),8);</span><br></pre></td></tr></table></figure>
<p><img
src="http://cmb.oss-cn-qingdao.aliyuncs.com/2017-07-08-samples_fit.jpg" /></p>
<p>Figure 13. Contours drawn from parameters estimated.</p>
<p>In fact, we can compute the probability of each data point to predict
which distribution it more likely belongs to. For example, if we want to
predict whether <span
class="math inline">\(x=\begin{pmatrix}0.88007\\3.9501\end{pmatrix}\)</span>
belongs to the left distribution or the right one, we evaluate <span
class="math inline">\(x\)</span> under these two distributions:</p>
<p><span
class="math display">\[\begin{aligned}p\left(x=\begin{bmatrix}0.88\\3.95\end{bmatrix}\Bigg|y=0\right)=&amp;\frac{1}{2\pi|\Sigma|^{\frac{1}{2
} }
}e^{-\frac{1}{2}{\begin{bmatrix}x_1-\mu_1\\x_2-\mu_2\end{bmatrix}^T\Sigma^{-1}\begin{bmatrix}x_1-\mu_1\\x_2-\mu_2\end{bmatrix}
} }\\=&amp;\frac{1}{ {
2\pi}\left|\begin{matrix}1.1188&amp;-0.059\\-0.059&amp;1.023\end{matrix}\right|^{\frac{1}{2
} }
}e^{-\frac{1}{2}{\begin{bmatrix}-3.175\\-0.151\end{bmatrix}^T\begin{bmatrix}0.896&amp;0.052\\0.052&amp;0.98\end{bmatrix}\begin{bmatrix}-3.175\\-0.151\end{bmatrix}
} }\\=&amp;\frac{1}{2\pi\sqrt{(1.141) } } e^{-\frac{1}{2}\times
9.11}=0.149\times 0.01=0.0015\end{aligned}\]</span></p>
<p>and</p>
<p><span
class="math display">\[\begin{aligned}p\left(x=\begin{bmatrix}0.88\\3.95\end{bmatrix}\Bigg|
y=1\right)=&amp;\frac{1}{2\pi|\Sigma|^{\frac{1}{2 } }
}e^{-\frac{1}{2}{\begin{bmatrix}x_1-\mu_1\\x_2-\mu_2\end{bmatrix}^T\Sigma^{-1}\begin{bmatrix}x_1-\mu_1\\x_2-\mu_2\end{bmatrix}
} }\\=&amp;\frac{1}{ {
2\pi}\left|\begin{matrix}1.1188&amp;-0.059\\-0.059&amp;1.023\end{matrix}\right|^{\frac{1}{2
} }
}e^{-\frac{1}{2}{\begin{bmatrix}0.03\\2.91\end{bmatrix}^T\begin{bmatrix}0.896&amp;0.052\\0.052&amp;0.98\end{bmatrix}\begin{bmatrix}0.03\\2.91\end{bmatrix}
} }\\=&amp;\frac{1}{2\pi\sqrt{(1.141) } } e^{-\frac{1}{2}\times
8.336}=0.149\times 0.015=0.0022\end{aligned}\]</span></p>
<p>Since <span
class="math inline">\(p(y=1)\)</span> and <span
class="math inline">\(p(y=0)\)</span> are equal (both are <span
class="math inline">\(0.5\)</span>), we just compare <span
class="math inline">\(p\left(x=\begin{bmatrix}0.88\\3.95\end{bmatrix}\Bigg|
y=1\right)\)</span> to <span
class="math inline">\(p\left(x=\begin{bmatrix}0.88\\3.95\end{bmatrix}\Bigg|
y=0\right)\)</span>. Apparently, this data point is predicted to come from the
left distribution, which is a wrong assertion. Actually, in this
example, this is the only data point classified incorrectly.</p>
<p>You may wonder why there is a blue line. It turns out that all the
data points below the blue line are considered to be in the blue class,
whereas data points above the line are classified as the red class.
How does it work?</p>
<p>The blue line is the decision boundary. If we know the expression of this
line, the decision becomes easier to make. In fact, GDA is a linear
classifier; we will prove it later. Consider again the data point above:
if we divide one probability by the other, we just need to check whether the
ratio is larger or smaller than 1. For our example, the ratio is roughly 0.68,
so the data point is classified to be the blue class.</p>
<p><span
class="math display">\[\frac{p\left(x=\begin{bmatrix}0.88\\3.95\end{bmatrix}\Bigg|
y=0\right)p(y=0)}{p\left(x=\begin{bmatrix}0.88\\3.95\end{bmatrix}\Bigg|y=1\right)p(y=1)}=\frac{0.0015}{0.0022}=0.68182&lt;1\]</span></p>
<p><img
src="http://cmb.oss-cn-qingdao.aliyuncs.com/2017-07-08-decison_boundry.jpg" alt="Decision boundary line separating the two classes of data points" /></p>
<p>Figure 14. Decision Boundary</p>
<p>It would be convenient to have a closed-form expression for this ratio. So,
given a new <span class="math inline">\(x\)</span>, the prediction problem
is transformed as follows:</p>
<p><span class="math display">\[x\in \text{red
class}\propto\mathcal{R}=\frac{p(x|y=1)p(y=1)}{p(x|y=0)p(y=0)} &gt;
1\]</span></p>
<p>which is equal to</p>
<p><span
class="math display">\[\mathcal{R}=\log\frac{p(x|y=1)p(y=1)}{p(x|y=0)p(y=0)}
=\log\frac{\phi}{1-\phi}+\log\frac{\mathcal{N}(x;\mu_1,\Sigma)}{\mathcal{N}(x;\mu_0,\Sigma)}&gt;
0\]</span></p>
<p>Then,</p>
<p><span
class="math display">\[\begin{aligned}\mathcal{R}&amp;=\log\frac{\frac{1}{(2\pi)^{\frac{d}{2
} } |\Sigma|^{\frac{1}{2 } }
}\exp(-\frac{1}{2}(x-\mu_1)^T\Sigma^{-1}(x-\mu_1))}{\frac{1}{(2\pi)^{\frac{d}{2
} } |\Sigma|^{\frac{1}{2 } }
}\exp(-\frac{1}{2}(x-\mu_0)^T\Sigma^{-1}(x-\mu_0))}+\log\frac{\phi}{1-\phi}\\&amp;=-\frac{1}{2}(x-\mu_1)^T\Sigma^{-1}(x-\mu_1)+\frac{1}{2}(x-\mu_0)^T\Sigma^{-1}(x-\mu_0)+\log\frac{\phi}{1-\phi}\\&amp;=-\frac{1}{2}x^T\Sigma^{-1}x+\mu_1^T\Sigma^{-1}x-\frac{1}{2}\mu_1^T\Sigma^{-1}\mu_1+\frac{1}{2}x^T\Sigma^{-1}x-\mu_0^T\Sigma^{-1}x+\frac{1}{2}\mu_0^T\Sigma^{-1}\mu_0+\log\frac{\phi}{1-\phi}\\&amp;=(\mu_1-\mu_0)^T\Sigma^{-1}x-\frac{1}{2}\mu_1^T\Sigma^{-1}\mu_1+\frac{1}{2}\mu_0^T\Sigma^{-1}\mu_0+\log\frac{\phi}{1-\phi}\end{aligned}\]</span></p>
<p>Here, <span
class="math inline">\(\mu_1^T\Sigma^{-1}x=x^T\Sigma^{-1}\mu_1\)</span>
because it is a real number. For a real number <span
class="math inline">\(a=a^T\)</span>, moreover, <span
class="math inline">\(\Sigma^{-1}\)</span> is symmetric, so <span
class="math inline">\(\Sigma^{-T}=\Sigma^{-1}\)</span>. Let's set <span
class="math inline">\(w^T=(\mu_1-\mu_0)^T\Sigma^{-1}\)</span> and <span
class="math inline">\(w_0=-\frac{1}{2}\mu_1^T\Sigma^{-1}\mu_1+\frac{1}{2}\mu_0^T\Sigma^{-1}\mu_0+\log\frac{\phi}{1-\phi}\)</span>,
then we have:</p>
<p><span
class="math display">\[\mathcal{R}=\log\frac{p(x|y=1)p(y=1)}{p(x|y=0)p(y=0)}
=w^Tx+w_0\]</span></p>
<p>If you plug parameters in the formula, you will find:</p>
<p><span
class="math inline">\(\mathcal{R}=-3.0279x_1-3.1701x_2+15.575=0\)</span></p>
<p>It is the decision boundary (Figure 14). Once you have the
decision boundary formula, it is convenient to use it
to predict whether a data point <span class="math inline">\(x\)</span>
belongs to the blue or red class. If <span
class="math inline">\(\mathcal{R}&gt;0\)</span>, <span
class="math inline">\(x\in \text{red class}\)</span>, otherwise, <span
class="math inline">\(x\in \text{blue class}\)</span>.</p>
<h3 id="conclusion">Conclusion</h3>
<p>Today, we have talked about the Gaussian distribution and its
multivariate form. Then, we assumed two groups of data drawn from
Gaussian distributions and applied Gaussian Discriminant Analysis to the
data. There are 200 data points, and only one is misclassified. In fact, we
can derive the logistic regression (LR) model from GDA, but GDA cannot be
derived from LR; i.e., LR makes weaker assumptions and is the more robust
classifier, especially when we do not know the distribution of the data.
However, if you know that the data is drawn from Gaussian distributions,
GDA is the better choice.</p>
<h3 id="reference">Reference</h3>
<ol type="1">
<li>Andrew Ng http://cs229.stanford.edu/notes/cs229-notes2.pdf</li>
<li>https://en.wikipedia.org/wiki/Normal_distribution</li>
<li>https://en.wikipedia.org/wiki/Multivariate_normal_distribution</li>
<li>http://www.cnblogs.com/emituofo/archive/2011/12/02/2272584.html</li>
<li>http://m.blog.csdn.net/article/details?id=52190572</li>
<li>张贤达《矩阵分析与应用》:156-158</li>
<li>http://www.tk4479.net/hujingshuang/article/details/46357543</li>
<li>http://www.chinacloud.cn/show.aspx?id=24927&amp;cid=22</li>
<li>http://www.cnblogs.com/jcchen1987/p/4424436.html</li>
<li>http://www.xlgps.com/article/139591.html</li>
<li>http://www.matlabsky.com/thread-10308-1-1.html</li>
<li>http://classes.engr.oregonstate.edu/eecs/fall2015/cs534/notes/GaussianDiscriminantAnalysis.pdf</li>
</ol>

      
    </div>

    
    <footer class="post-footer">
        <div class="post-eof"></div>
      
    </footer>
  </article>
</div>


<div class="post-block">
  
  
  <article itemscope itemtype="http://schema.org/Article" class="post-content" lang="">
    <link itemprop="mainEntityOfPage" href="http://chengmingbo.github.io/2017/06/17/sample-variance/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="image" content="/images/avatar.gif">
      <meta itemprop="name" content="Mingbo Cheng">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="Mingbo">
      <meta itemprop="description" content="Mingbo">
    </span>

    <span hidden itemprop="post" itemscope itemtype="http://schema.org/CreativeWork">
      <meta itemprop="name" content="undefined | Mingbo">
      <meta itemprop="description" content="">
    </span>
      <header class="post-header">
        <h2 class="post-title" itemprop="name headline">
          <a href="/2017/06/17/sample-variance/" class="post-title-link" itemprop="url">样本方差为什么除以N-1?（翻译）</a>
        </h2>

        <div class="post-meta-container">
          <div class="post-meta">
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-calendar"></i>
      </span>
      <span class="post-meta-item-text">发表于</span>

      <time title="创建时间：2017-06-17 17:52:38" itemprop="dateCreated datePublished" datetime="2017-06-17T17:52:38+02:00">2017-06-17</time>
    </span>

  
</div>

        </div>
      </header>

    
    <div class="post-body" itemprop="articleBody">
          <p>原文作者：<a target="_blank" rel="noopener" href="http://www.visiondummy.com/">Vincent Spruyt</a></p>
<p>译者：程明波</p>
<p><a
target="_blank" rel="noopener" href="http://www.visiondummy.com/2014/03/divide-variance-n-1/">英文文章地址</a></p>
<p><a
href="http://chengmingbo.github.io/2017/06/17/sample-variance/">译文地址</a></p>
<p>译者注：由于历史原因，高斯分布(Gaussian
Distribution)，正态分布(Normal Distribution)皆指概率密度函数形如<span
class="math inline">\(\frac{1}{\sqrt{2\pi}\sigma}e^{-\frac{(x-\mu)^2}{2\sigma^2}}\)</span>的分布。文中我会采用正态分布的提法。</p>
<h3 id="简介">简介</h3>
<p>本文，呼应标题，我将推导著名正态分布数据均值和方差的计算公式。如果一些读者对于这个问题的“为什么”并不感兴趣，仅仅是对“什么时候使用”感兴趣，那答案就非常简单了：</p>
<p>如果你想预估一份数据的均值和方差(典型情况)，那么方差公式除的是<span
class="math inline">\(N-1\)</span>，即：</p>
<p><span class="math display">\[\sigma^2 = \frac{1}{N-1}\sum_{i=1}^N
(x_i - \mu)^2\]</span></p>
<p>另一种情况，如果整体的真实均值已知，那么方差公式除的就是<span
class="math inline">\(N\)</span>，即：</p>
<p><span class="math display">\[\sigma^2 = \frac{1}{N}\sum_{i=1}^N (x_i
- \mu)^2\]</span></p>
<p>然而，前一种情况，会是你遇到更典型的情形。一会儿，我会举一个预估高斯白噪音的离散程度例子。例子中高斯白噪音的均值是已知的0，这种情况下，我们只需要估计方差。</p>
<p>如果数据是正态分布，我们可以完全用均值<span
class="math inline">\(\mu\)</span>和方差<span
class="math inline">\(\sigma^2\)</span>刻画这个分布。其中，方差是标准差<span
class="math inline">\(\sigma\)</span>的平方，标准差代表了每个数据点偏离均值点的平均距离，也就是说，方差表示了数据离散程度。对于正态分布，68.3%的数据的值会介于<span
class="math inline">\(\mu-\sigma\)</span>和<span
class="math inline">\(\mu+\sigma\)</span>之间。下面图片展示是一个正态分布的概率密度函数，他的均值是<span
class="math inline">\(\mu=10\)</span>,方差是<span
class="math inline">\(\sigma^2=3^2=9\)</span>：</p>
<p><img
src="http://cmb.oss-cn-qingdao.aliyuncs.com/2017-06-20-110159.jpg" alt="均值为10、方差为9的正态分布概率密度函数" /></p>
<p><strong>图1.</strong> 正态分布概率密度函数.
对于正态分布数据，68%的样本落在均值<span
class="math inline">\(\pm\)</span>标准差的区间内。</p>
<p>通常，我们拿不到全部的全体数据。上面的例子中，典型的情况是我们有一些观察数据，但是，我们没有上图中x轴上所有可能的观察数据。例如我们可能有下面一些观察数据：</p>
<p>表1</p>
<table>
<thead>
<tr class="header">
<th>观察数据ID</th>
<th>观察值</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>观察数据 1</td>
<td>10</td>
</tr>
<tr class="even">
<td>观察数据 2</td>
<td>12</td>
</tr>
<tr class="odd">
<td>观察数据 3</td>
<td>7</td>
</tr>
<tr class="even">
<td>观察数据 4</td>
<td>5</td>
</tr>
<tr class="odd">
<td>观察数据 5</td>
<td>11</td>
</tr>
</tbody>
</table>
<p>现在如果我们通过把所有值相加并除以观察的次数，得到经验均值：</p>
<p><span
class="math display">\[\mu=\frac{10+12+7+5+11}{5}=9\tag{1}\]</span>.</p>
<p>通常，我们会假设经验均值接近分布的未知的真实均值，因此，我们可以假设观测数据来自于均值为<span
class="math inline">\(\mu=9\)</span>的正态分布。在这个例子中，分布真实均值是10，
也就是说，经验均值实际上接近于真实均值。</p>
<p>数据的方差计算如下：</p>
<p><span class="math display">\[\begin{aligned}\sigma^2&amp;=
\frac{1}{N-1}\sum_{i=1}^N (x_i - \mu)^2\\&amp;=
\frac{(10-9)^2+(12-9)^2+(7-9)^2+(5-9)^2+(11-9)^2}{4}\\&amp;=
8.5.\end{aligned}\tag{2}\]</span></p>
<p>同样，我们一般假设经验方差接近于基于分布真实未知方差。在此例中，真实方差是9，所以，经验方差也是接近于真实方差。</p>
<p>那么我们手上的问题现在就是为什么我们用于计算经验均值和经验方差的公式是正确的。事实上，另一个我们经常用于计算方差的公式是这样定义的：</p>
<p><span class="math display">\[\begin{aligned}\sigma^2 &amp;=
\frac{1}{N}\sum_{i=1}^N (x_i - \mu)^2 \\&amp;=
\frac{(10-9)^2+(12-9)^2+(7-9)^2+(5-9)^2+(11-9)^2}{5} \\&amp;=
6.8.\end{aligned}\tag{3}\]</span></p>
<p>公式(2)和公式(3)的唯一不同是前一个公式除的是<span
class="math inline">\(N-1\)</span>，而后一个除的是<span
class="math inline">\(N\)</span>。两个公式都是对的，只是根据不同的场景使用不同的公式。</p>
<p>接下来的部分，我们针对给定一个正态分布的样本集，完成对其未知方差和均值最好估计的完整推导。我们将会看到，一些情况下，方差除的是<span
class="math inline">\(N\)</span>，另一些情况除的是<span
class="math inline">\(N-1\)</span>。</p>
<p>用一个公式近似一个参数(均值或方差)叫做估计量。下面，我们定义一个分布的真实但未知的参数为<span
class="math inline">\(\hat{\mu}\)</span>和<span
class="math inline">\(\hat{\sigma}^2\)</span>。而估计量，例如，经验的平均和经验方差，定义为<span
class="math inline">\(\mu\)</span>和<span
class="math inline">\(\sigma^2\)</span>。</p>
<p>为了找到最优的估计量，首先，假设总体服从均值为<span
class="math inline">\(\mu\)</span>、标准差为<span
class="math inline">\(\sigma\)</span>的正态分布，对于特定的观察点<span
class="math inline">\(x_i\)</span>，我们需要一个观察到该点的似然的解析表达式。参数已知的正态分布一般记为<span
class="math inline">\(N(\mu,\sigma^2)\)</span>。似然函数为：</p>
<p><span class="math display">\[x_i \sim N(\mu,\sigma^2) \Rightarrow
P(x_i;
\mu,\sigma)=\frac{1}{\sqrt{2\pi\sigma^2}}e^{-\frac{1}{2\sigma^2}(x_i-\mu)^2}.\tag{4}\]</span></p>
<p>为了计算均值和方差，显然，我们需要这个分布一个以上的样本。接下来，设<span
class="math inline">\(\vec{x}=(x_1,x_2,\cdots,x_N)\)</span>为包含所有的可用样本的向量（例如：表一中所有的值）。如果所有这些样本统计独立，我们可以写出联合似然函数为所有似然函数的乘积：</p>
<p><span
class="math display">\[\begin{aligned}P(\vec{x};\mu,\sigma^2)&amp;=P(x_1,x_2,\cdots,x_N;\mu,\sigma^2)\\&amp;=P(x_1;\mu,\sigma^2)P(x_2;\mu,\sigma^2)\cdots
P(x_N;\mu,\sigma^2)\\&amp;=\prod_{i=1}^{N}P(x_i;\mu,\sigma^2)\end{aligned}.\tag{5}\]</span></p>
<p>把公式(4)代入公式(5)，可得出联合概率密度函数的分析表达式：</p>
<p><span
class="math display">\[\begin{aligned}P({\vec{x};\mu,\sigma})&amp;=\prod_{i=1}^{N}\frac{1}{\sqrt{2\pi\sigma^2}}e^{-\frac{1}{2\sigma^2}(x_i-\mu)^2}\\&amp;=\frac{1}{(2\pi\sigma^2)^{\frac{N}{2}}}e^{-\frac{1}{2\sigma^2}\sum_{i=1}^{N}(x_i-\mu)^2}\end{aligned}.\tag{6}\]</span></p>
<p>公式(6)在接下来的部分将非常重要。我们会用它推导关于正态分布著名的估计量均值和方差。</p>
<h3 id="最小方差无偏估计量">最小方差，无偏估计量</h3>
<p>决定一个估计量是不是“好”估计量，首先我们需要定义什么是真正的“好”
估计量。说一个估计量好，依赖于两个度量，叫做其偏差(bias)和方差(variance)(是的，我们要讨论均值估计量的方差，以及方差估计量的方差)。本节将简单的讨论这两个度量。</p>
<h4 id="参数偏差">参数偏差</h4>
<p>想象一下，如果我们能拿到全体不同的(互斥)数据子集。类比之前的例子，假设，除了【表1】中的数据，我们还有完全不同观察结果表2及表3。那么，一个关于均值好的估计量，应该使得这个估计量平均下来等于真实的均值。我们可以接受其中一个子集的经验均值不等于真实均值，但是，一个好的估计量应该保证：对于所有子集均值估计的平均值等于真实均值。这个限制条件用数学化的表示，就是估计量的期望值(Expected
Value)应该等于参数值：</p>
<p><span class="math display">\[E(\mu)=\hat{\mu}\qquad
E(\sigma^2)=\hat{\sigma}^2.\tag{7}\]</span></p>
<p>如果满足上面的条件，那么这些估计量就被称之为“无偏估计”。反之，如果上面的条件不满足，这些估计量叫做“有偏的”，也就是说平均来看，他们或者低估或者高估了参数的真实值。</p>
<h4 id="参数方差">参数方差</h4>
<p>无偏估计量保证平均来看，它们估计的值等于真实参数。但是，这并不意味着每次估计是一个好的估计。比如，如果真实均值为10，一个无偏估计量可以估计全体的其中一个子集的均值为50，而另一个均值为-30。期望的估计的值确实是10，也等于真实的参数值，但是，估计量的质量明显依赖每次估计的离散程度。对于全体5个不同子集，一个估计量产生的估计值(10,15,5,12,8)和另一个估计量产生的估计值（50，-30，100，-90，20）都是无偏的（译者注：原文作者最后一个是10，我计算换成20，这样均值才是10）。但是第一个估计量的所有估计值明显比第二个估计量的估计值更接近真实值。</p>
<p>因此，一个好的估计量不仅需要有低偏差，同时也需要低方差。这个方差表示为平均平方误差的估计量：</p>
<p><span
class="math display">\[Var(\mu)=E[(\hat{\mu}-\mu)^2]\]</span></p>
<p><span
class="math display">\[Var(\sigma^2)=E[(\hat{\sigma}^2-\sigma^2)^2]\]</span></p>
<p>因此一个好的估计量是低偏差，低方差的。如果存在最优的估计量，那么这个估计应该是无偏的，而且方差比所有的其他可能估计量都要低。这样的一个估计量被称之为最小方差，无偏（MVU）估计量。下一节，我们将会针对一个正态分布推导均值和方差估计量的数学表达式。我们将会看到，一个正态分布的方差MVU估计量在一些假设下需要除以<span
class="math inline">\(N\)</span>，而在另一些假设下需要除以<span
class="math inline">\(N-1\)</span>。</p>
<h3 id="最大似然估计">最大似然估计</h3>
<p>基于整体的一个子集，尽管有大量的获取一个参数估计量的技术，所有这些技术中最简单的可能就数最大似然估计了。</p>
<p>观察值<span
class="math inline">\(\vec{x}\)</span>的概率在公式(6)定义为<span
class="math inline">\(P(\vec{x};\mu,\sigma^2)\)</span>.
如果我们在此函数中固定<span class="math inline">\(\mu\)</span>和<span
class="math inline">\(\sigma^2\)</span>，当使<span
class="math inline">\(\vec{x}\)</span>变化时，我们就可以获得图(1)的正态分布。但是，我们也可以固定<span
class="math inline">\(\vec{x}\)</span>，使<span
class="math inline">\(\mu\)</span>和（或）<span
class="math inline">\(\sigma^2\)</span>变化。比如，我们可以选择类似前面例子中的<span
class="math inline">\(\vec{x}=(10,12,7,5,11)\)</span>。我们选择固定<span
class="math inline">\(\mu=10\)</span>，同时使<span
class="math inline">\(\sigma^2\)</span>变化。图(2)展示了当<span
class="math inline">\(x\)</span>和<span
class="math inline">\(\mu\)</span>固定时，<span
class="math inline">\(\sigma^2\)</span>对于这个分布取不同值的变化曲线：</p>
<p><img
src="http://cmb.oss-cn-qingdao.aliyuncs.com/2017-06-20-110329.jpg" alt="固定均值为10时似然函数随方差变化的曲线" /></p>
<p>图 2. 此图表示了似然函数在特定观察数据<span
class="math inline">\(\vec{x}\)</span>，下固定<span
class="math inline">\(\mu=10\)</span>，<span
class="math inline">\(\sigma^2\)</span>变化曲线。</p>
<p>上图，我们通过固定<span
class="math inline">\(\mu=10\)</span>，令<span
class="math inline">\(\sigma^2\)</span>变化计算了<span
class="math inline">\(P(\vec{x};\sigma^2)\)</span>的似然函数。在结果曲线的每一个数据点代表了似然度，观察值<span
class="math inline">\(\vec{x}\)</span>是一个正态分布在参数<span
class="math inline">\(\sigma^2\)</span>下的样本。那么对应最大似然度的参数值最有可能是从我们定义的分布中产生数据的参数。因此，我们能通过找到似然度曲线的最大值决定最优的<span
class="math inline">\(\sigma^2\)</span>。在此例中，最大值在<span
class="math inline">\(\sigma^2=7.8\)</span>，这样标准差就是<span
class="math inline">\(\sqrt{\sigma^2}=2.8\)</span>。事实上，如果给定<span
class="math inline">\(\mu=10\)</span>，通过传统的方法计算，我们会发现方差就是7.8：</p>
<p><span
class="math display">\[\frac{(10-10)^2+(12-10)^2+(7-10)^2+(5-10)^2+(11-10)^2}{5}=7.8\]</span></p>
<p>因此，基于样本数据的方差计算公式只需要简单的通过找到最大的似然函数的最高点。此外，除了固定<span
class="math inline">\(\mu\)</span>，我们可以使<span
class="math inline">\(\mu\)</span>和<span
class="math inline">\(\sigma^2\)</span>同时变化。然后找到两个估计量对应在两个维度的似然函数的最大值。</p>
<p>要找一个函数的最大值，也很简单，只需要求导使其等于0。如果想找一个有两个变量函数的最大值，我们需要计算每个变量的偏导，再把两个偏导全部设置为0。接下来，设<span
class="math inline">\(\hat{\mu}_{ML}\)</span>为通过极大似然方法得到的总体均值的最优估计量，设<span
class="math inline">\(\hat{\sigma}^2_{ML}\)</span>为方差的最优估计量。要最大化似然函数，我们可以简单的计算它的(偏)导数，然后赋值为0，如下：</p>
<p><span class="math display">\[\begin{aligned} &amp;\hat{\mu}_{ML} =
\arg\max_\mu P(\vec{x}; \mu, \sigma^2)\\ &amp;\Rightarrow \frac{\partial
P(\vec{x}; \mu, \sigma^2)}{\partial \mu} = 0 \end{aligned}\]</span></p>
<p>及</p>
<p><span class="math display">\[\begin{aligned} &amp;\hat{\sigma}^2_{ML}
= \arg\max_{\sigma^2} P(\vec{x}; \mu, \sigma^2)\\ &amp;\Rightarrow
\frac{\partial P(\vec{x}; \mu, \sigma^2)}{\partial \sigma^2} = 0
\end{aligned}\]</span></p>
<p>下一节，我们将利用这个技术得到<span
class="math inline">\(\mu\)</span>和<span
class="math inline">\(\sigma^2\)</span>的MVU估计量。我们考虑两种情形：</p>
<p>第一种情形，我们假设分布的真正的均值<span
class="math inline">\(\hat{\mu}\)</span>是已知的，因此，我们只需要估计方差，那么问题就变成在参数为<span
class="math inline">\(\sigma^2\)</span>的一维的极大似然函数中对应找其最大值。这种情况不经常出现，但是，在实际应用中确实存在。例如，如果我们知道一个信号(比如：一幅图中一个像素的颜色值)本来应该有特定的值，但是，信号被白噪音污染了（均值为0的高斯噪音），这时分布的均值是已知的，我们只需要估计方差。</p>
<p>第二种情形就是处理均值和方差的真实值都不知道的情况。这种情况最常见，这时，我们需要基于样本数据估计均值和方差。</p>
<p>后面我们将看到，每种情形产生不同的MVU估计量。具体来说，第一种情形方差估计量需要除以<span
class="math inline">\(N\)</span>来标准化MVU。而第二种除的是<span
class="math inline">\(N-1\)</span>。</p>
<h3 id="均值已知的方差估计">均值已知的方差估计</h3>
<h4 id="参数估计">参数估计</h4>
<p>如果分布的均值真实值已知，那么似然函数只有一个参数<span
class="math inline">\(\sigma^2\)</span>。求最大似然估计量也就是解决：</p>
<p><span class="math display">\[\hat{\sigma^2}_{ML}=\arg\max_{\sigma^2}
P(\vec{x};\sigma^2).\tag{8}\]</span></p>
<p>但是，根据公式(6)的定义，如果计算<span
class="math inline">\(P(\vec{x};\sigma^2)\)</span>涉及到计算函数中指数的偏导。事实上，计算对数似然函数比计算似然函数本身的导数要简单的多。因为对数函数是单调递增函数，其最大值取值位置与原似然函数是一样的。因此我们用下面的式子替换：</p>
<p><span
class="math display">\[\hat{\sigma}^2_{ML}=\arg\max_{\sigma^2}\log(P(\vec{x};\sigma^2)).\tag{9}\]</span></p>
<p>下面，我令<span
class="math inline">\(s=\sigma^2\)</span>简化式子。我们通过计算公式(6)的对数的导数赋值为0来最大化对数似然函数：</p>
<p><span class="math display">\[\begin{aligned}&amp;\frac{\partial
\log(P(\vec{x};\sigma^2))}{\partial
\sigma^2}=0\\&amp;\Leftrightarrow\frac{\partial\log(P(\vec{x};s))}{\partial
s}=0\\&amp;\Leftrightarrow\frac{\partial}{\partial
s}\log\left(\frac{1}{(2\pi
s)^{\frac{N}{2}}}e^{-\frac{1}{2s}\sum_{i=1}^{N}(x_i-\mu)^2}
\right)=0\\&amp;\Leftrightarrow\frac{\partial}{\partial
s}\log\left(\frac{1}{(2\pi)^{\frac{N}{2}}}\right)+\frac{\partial}{\partial
s}\log\left(\frac{1}{s^{\frac{N}{2}}}\right)+\frac{\partial}{\partial
s} \log\left(e^{-\frac{1}{2s}\sum_{i=1}^{N}(x_i-\mu)^2}\right
)=0\\&amp;\Leftrightarrow0+\frac{\partial}{\partial
s}\log\left((s)^{-\frac{N}{2}}\right)+\frac{\partial}{\partial
s}\left(-\frac{1}{2s}\sum_{i=1}^{N}(x_i-\mu)^2\right)=0\\&amp;\Leftrightarrow
-\frac{N}{2}\frac{\partial}{\partial s}\log (s)+\frac{1}{2
s^2}\sum_{i=1}^{N}(x_i-\mu)^2=0\\&amp;\Leftrightarrow
-\frac{N}{2s}+\frac{1}{2s^2}\sum_{i=1}^{N}(x_i-\mu)^2=0\\&amp;\Leftrightarrow
\frac{N}{2s^2}\left(-s+\frac{1}{N}\sum_{i=1}^{N}(x_i-\mu)^2\right)=0\\&amp;\Leftrightarrow\frac{N}{2s^2}\left(\frac{1}{N}\sum_{i=1}^{N}(x_i-\mu)^2-s\right)=0\end{aligned}\]</span></p>
<p>很明显，如果<span
class="math inline">\(N&gt;0\)</span>，那么上面等式唯一的解就是：</p>
<p><span
class="math display">\[s=\sigma^2=\frac{1}{N}\sum_{i=1}^{N}(x_i-\mu)^2.\tag{10}\]</span></p>
<p>注意到，实际上<span
class="math inline">\(\hat{\sigma}^2\)</span>的极大似然估计量就是传统上一般计算方差的公式。这里标准化因子是<span
class="math inline">\(\frac{1}{N}\)</span>.</p>
<p>但是，极大似然估计并不保证得出的是一个无偏估计量。另外，就算得到的估计量是无偏的，极大似然估计也不能保证估计是最小方差，即MVU。因此，我们需要检查公式(10)的估计量是否是无偏的。</p>
<h4 id="表现评价">表现评价</h4>
<p>我们需要检查公式(7)的等式是否成立，来确定是否公式(10)中的估计量是无偏的。即判断：</p>
<p><span class="math display">\[E(s)=\hat{s}.\]</span></p>
<p>我们把公式(10)代入到<span
class="math inline">\(E(s)\)</span>，计算：</p>
<p><span class="math display">\[\begin{aligned}E[s] &amp;= E
\left[\frac{1}{N}\sum_{i=1}^N(x_i - \mu)^2 \right] = \frac{1}{N}
\sum_{i=1}^N E \left[(x_i - \mu)^2 \right] = \frac{1}{N} \sum_{i=1}^N E
\left[x_i^2 - 2x_i \mu + \mu^2 \right]\\&amp;= \frac{1}{N} \left( N
E[x_i^2] -2N \mu E[x_i] + N \mu^2 \right)\\&amp;= \frac{1}{N} \left( N
E[x_i^2] -2N \mu^2 + N \mu^2 \right)\\&amp;= \frac{1}{N} \left( N
E[x_i^2] -N \mu^2 \right)\end{aligned}\]</span></p>
<p>另外，真实方差<span class="math inline">\(\hat{s}\)</span>有一个<a
target="_blank" rel="noopener" href="https://en.wikipedia.org/wiki/Variance#Definition">非常重要的性质</a>为<span
class="math inline">\(\hat{s}=E[x_i^2]-E[x_i]^2\)</span>，可变换公式为<span
class="math inline">\(E[x_i^2]=\hat{s}+E[x_i]^2=\hat{s}+\mu^2\)</span>。使用此性质我们可以从上面的公式推出：</p>
<p><span class="math display">\[\begin{aligned}E[s]&amp;=\frac{1}{N}(N
E[x_i^2]-N\mu^2)\\&amp;=\frac{1}{N}(N\hat{s}+N\mu^2-N\mu^2)\\&amp;=\frac{1}{N}(N\hat{s})\\&amp;=\hat{s}\end{aligned}\]</span></p>
<p>满足了公式(7)的条件<span class="math inline">\(E[s]=\hat
s\)</span>，因此，我们得到的数据方差<span class="math inline">\(\hat
s\)</span>的估计量是无偏的。此外，因为极大似然估计量如果是无偏的，那么它也是最小方差的(MVU)，也就是说，不存在比我们得到的估计量更好的估计量。</p>
<p>因此，在分布真实均值已知的情况下，我们不用除以<span
class="math inline">\(N-1\)</span>，而是用除<span
class="math inline">\(N\)</span>计算正态分布的方差。</p>
<h3 id="均值未知的方差估计">均值未知的方差估计</h3>
<h4 id="参数估计-1">参数估计</h4>
<p>上一节，分布的真实均值已知，因此，我们只需要估计数据的方差。但是，如果真实的均值未知，我们均值的估计量就也需要计算了。</p>
<p>此外，方差的估计量需要使用均值的估计量。我们会看到，这时，之前我们得到的方差的估计量就不再无偏了。我们一会儿会通过除以N-1，而不是N来稍微的增加方差估计量的值，从而使方差估计无偏。</p>
<p>与之前一样，基于log似然函数，我们用极大似然估计计算两个估计量。首先我们先计算<span
class="math inline">\(\hat\mu\)</span>的极大似然估计量：</p>
<p><span class="math display">\[\begin{aligned}&amp;\frac{\partial
\log(P(\vec{x}; s, \mu))}{\partial \mu} = 0\\&amp;\Leftrightarrow
\frac{\partial}{\partial \mu} \log \left( \frac{1}{(2 \pi
s)^{\frac{N}{2}}} e^{-\frac{1}{2s}\sum_{i=1}^N(x_i - \mu)^2} \right) =
0\\&amp;\Leftrightarrow \frac{\partial}{\partial \mu} \log \left(
\frac{1}{(2 \pi s)^{\frac{N}{2}}} \right) + \frac{\partial}{\partial \mu}
\log \left(e^{-\frac{1}{2s}\sum_{i=1}^N(x_i - \mu)^2} \right) =
0\\&amp;\Leftrightarrow \frac{\partial}{\partial \mu}
\left(-\frac{1}{2s}\sum_{i=1}^N(x_i - \mu)^2 \right) =
0\\&amp;\Leftrightarrow -\frac{1}{2s}\frac{\partial}{\partial \mu}
\left(\sum_{i=1}^N(x_i - \mu)^2 \right) = 0\\&amp;\Leftrightarrow
-\frac{1}{2s} \left(\sum_{i=1}^N -2(x_i - \mu) \right) =
0\\&amp;\Leftrightarrow \frac{1}{s} \left(\sum_{i=1}^N (x_i - \mu)
\right) = 0 \\&amp;\Leftrightarrow \frac{N}{s} \left( \frac{1}{N}
\sum_{i=1}^N (x_i) - \mu \right) = 0 \end{aligned}\]</span></p>
<p>显然，如果<span
class="math inline">\(N&gt;0\)</span>，那么上面的等式只有一种解：</p>
<p><span
class="math display">\[\mu=\frac{1}{N}\sum_{i=1}^{N}x_i.\tag{11}\]</span></p>
<p>注意到，实际上这是计算一个分布均值的著名公式。虽然我们知道这个公式，但我们现在证明了极大似然估计量估计了一个正态分布未知均值的真实值。现在我们先假定我们之前公式(10)计算的方差<span
class="math inline">\(\hat
s\)</span>的估计量仍然是MVU方差估计量。但下一节我们会证明这个估计量已经是有偏的了。</p>
<h4 id="表现评价-1">表现评价</h4>
<p>我们需要通过检查估计量<span
class="math inline">\(\mu\)</span>对真实<span class="math inline">\(\hat
\mu\)</span>的估计是否无偏来确定公式(7)的条件能否成立：</p>
<p><span
class="math display">\[E[\mu]=E\left[\frac{1}{N}\sum_{i=1}^{N}x_i\right]=\frac{1}{N}\sum_{i=1}^N
E[x_i]=\frac{1}{N}N E[x_i]=\frac{1}{N} N \hat\mu=\hat\mu.\]</span></p>
<p>既然<span
class="math inline">\(E[\mu]=\hat\mu\)</span>，那么也就是说我们对分布均值的估计量是无偏的。因为极大似然估计可以保证在估计是无偏的情况下得到的是最小方差估计量，所以我们就已经是证明了<span
class="math inline">\(\mu\)</span>是均值的MVU估计量。</p>
<p>现在我们检查基于经验均值<span
class="math inline">\(\mu\)</span>，而不是真实均值<span
class="math inline">\(\hat\mu\)</span>的方差估计量<span
class="math inline">\(s\)</span>对真实方差<span
class="math inline">\(\hat
s\)</span>的估计是否仍然是无偏的。我们只需要把得到的估计量<span
class="math inline">\(\mu\)</span>带入到之前在公式(10)推导出的公式：</p>
<p><span class="math display">\[\begin{aligned} s &amp;= \sigma^2 =
\frac{1}{N}\sum_{i=1}^N(x_i - \mu)^2\\&amp;=\frac{1}{N}\sum_{i=1}^N
\left(x_i - \frac{1}{N} \sum_{i=1}^N (x_i)
\right)^2\\&amp;=\frac{1}{N}\sum_{i=1}^N \left[x_i^2 - 2 x_i \frac{1}{N}
\sum_{i=1}^N (x_i) + \left(\frac{1}{N} \sum_{i=1}^N (x_i) \right)^2
\right]\\&amp;=\frac{\sum_{i=1}^N x_i^2}{N} - \frac{2\sum_{i=1}^N x_i
\sum_{i=1}^N x_i}{N^2} + \left(\frac{\sum_{i=1}^N x_i}{N}
\right)^2\\&amp;=\frac{\sum_{i=1}^N x_i^2}{N} - \frac{2\sum_{i=1}^N x_i
\sum_{i=1}^N x_i}{N^2} + \left(\frac{\sum_{i=1}^N x_i}{N}
\right)^2\\&amp;=\frac{\sum_{i=1}^N x_i^2}{N} - \left(\frac{\sum_{i=1}^N
x_i}{N} \right)^2\end{aligned}\]</span></p>
<p>现在我们需要再次检查公式(7)的条件是否成立，来决定估计量是否无偏：</p>
<p><span class="math display">\[\begin{aligned} E[s]&amp;= E \left[
\frac{\sum_{i=1}^N x_i^2}{N} - \left(\frac{\sum_{i=1}^N x_i}{N}
\right)^2 \right ]\\&amp;= \frac{\sum_{i=1}^N E[x_i^2]}{N} -
\frac{E[(\sum_{i=1}^N x_i)^2]}{N^2} \end{aligned}\]</span></p>
<p>记得我们在之前用过方差一个非常重要的性质，真实方差<span
class="math inline">\(\hat s\)</span>可以写成<span
class="math inline">\(\hat s = E[x_i^2]-E[x_i]^2\)</span>，即，<span
class="math inline">\(E[x_i^2]=\hat s + E[x_i]^2=\hat s
+\mu^2\)</span>。利用这个性质我们可以推出：</p>
<p><span class="math display">\[\begin{aligned} E[s] &amp;=
\frac{\sum_{i=1}^N E[x_i^2]}{N} - \frac{E[(\sum_{i=1}^N
x_i)^2]}{N^2}\\&amp;= \hat{s} + \mu^2 - \frac{E[(\sum_{i=1}^N
x_i)^2]}{N^2}\\&amp;= \hat{s} + \mu^2 - \frac{E[\sum_{i=1}^N x_i^2 + \sum_i^N
\sum_{j\neq i}^N x_i x_j]}{N^2}\\&amp;= \hat{s} + \mu^2 - \frac{E[N(\hat{s}+\mu^2) +
\sum_i^N \sum_{j\neq i}^N x_i x_j]}{N^2}\\&amp;= \hat{s} + \mu^2 -
\frac{N(\hat{s}+\mu^2) + \sum_i^N \sum_{j\neq i}^N E[x_i] E[x_j]}{N^2}\\&amp;=
\hat{s} + \mu^2 - \frac{N(\hat{s}+\mu^2) + N(N-1)\mu^2}{N^2}\\&amp;= \hat{s} + \mu^2 -
\frac{N(\hat{s}+\mu^2) + N^2\mu^2 -N\mu^2}{N^2}\\&amp;= \hat{s} + \mu^2 -
\frac{\hat{s}+\mu^2 + N\mu^2 -\mu^2}{N}\\&amp;= \hat{s} + \mu^2 - \frac{\hat{s}}{N} -
\frac{\mu^2}{N} - \mu^2 + \frac{\mu^2}{N}\\&amp;= \hat{s} -
\frac{\hat{s}}{N}\\&amp;= \hat{s} \left( 1 - \frac{1}{N} \right)\\&amp;= \hat{s}
\left(\frac{N-1}{N} \right) \end{aligned}\]</span></p>
<p>显然<span class="math inline">\(E[s]\neq\hat
s\)</span>，上面公式可知分布的方差估计量不再是无偏的了。事实上，平均来看，这个估计量低估了真实方差，比例为<span
class="math inline">\(\frac{N-1}{N}\)</span>。当样本的数量趋于无穷时(<span
class="math inline">\(N\rightarrow\infty\)</span>)，这个偏差趋近于0。但是对于小的样本集，这个偏差就很显著了，需要被消除。</p>
<h4 id="修正偏差">修正偏差</h4>
<p>因为偏差不过是一个因子，我们只需通过对公式(10)的估计量乘以偏差的倒数。这样我们就可以定义一个如下的无偏的估计量<span
class="math inline">\(s\prime\)</span>：</p>
<p><span class="math display">\[\begin{aligned} s\prime &amp;= \left (
\frac{N-1}{N} \right )^{-1} s\\s\prime &amp;= \left ( \frac{N-1}{N}
\right )^{-1} \frac{1}{N}\sum_{i=1}^N(x_i - \mu)^2\\s\prime &amp;=\left
( \frac{N}{N-1} \right ) \frac{1}{N}\sum_{i=1}^N(x_i - \mu)^2\\s\prime
&amp;= \frac{1}{N-1}\sum_{i=1}^N(x_i - \mu)^2\end{aligned}\]</span></p>
<p>这个估计量现在就是无偏的了，事实上，这个公式与传统计算方差的公式非常像，不同的是除的是<span
class="math inline">\(N-1\)</span>而不是<span
class="math inline">\(N\)</span>。然而，你可能注意到这个估计量不再是最小方差估计量，但是这个估计量是所有无偏估计量中最小方差的一个。如果我们除以<span
class="math inline">\(N\)</span>，那么估计量就是有偏的了，如果我们除以<span
class="math inline">\(N-1\)</span>，估计量就不是最小方差估计量。但大体来说，一个有偏的估计量要比一个稍高一点方差的估计量要糟糕的多。因此，如果当总体的均值是未知的情况下，方差除的是<span
class="math inline">\(N-1\)</span>，而不是<span
class="math inline">\(N\)</span>。</p>
<h3 id="总结">总结</h3>
<p>本文，我们推导了如何从分布数据中计算常见的方差和均值公式。此外，我们还证明了在方差估计中，标准化因子在总体均值已知时是<span
class="math inline">\(\frac{1}{N}\)</span>，在均值也需要估计时是<span
class="math inline">\(\frac{1}{N-1}\)</span>。</p>
<p><a
target="_blank" rel="noopener" href="http://cmb.oss-cn-qingdao.aliyuncs.com/%E6%A0%B7%E6%9C%AC%E6%96%B9%E5%B7%AE%E4%B8%BA%E4%BB%80%E4%B9%88%E9%99%A4%E4%BB%A5N-1%3F%EF%BC%88%E7%BF%BB%E8%AF%91%EF%BC%89.pdf">本文PDF</a></p>

      
    </div>

    
    <footer class="post-footer">
        <div class="post-eof"></div>
      
    </footer>
  </article>
</div>


  <nav class="pagination">
    <span class="page-number current">1</span><a class="page-number" href="/page/2/">2</a><a class="extend next" rel="next" title="下一页" aria-label="下一页" href="/page/2/"><i class="fa fa-angle-right"></i></a>
  </nav>

</div>
  </main>

  <footer class="footer">
    <div class="footer-inner">

  <div class="copyright">
    &copy; 
    <span itemprop="copyrightYear">2024</span>
    <span class="with-love">
      <i class=""></i>
    </span>
    <span class="author" itemprop="copyrightHolder">Mingbo Cheng</span>
  </div>
  <div class="powered-by">由 <a href="https://hexo.io/" rel="noopener" target="_blank">Hexo</a> & <a href="https://theme-next.js.org/" rel="noopener" target="_blank">NexT.Gemini</a> 强力驱动
  </div>

    </div>
  </footer>

  
  <div class="toggle sidebar-toggle" role="button">
    <span class="toggle-line"></span>
    <span class="toggle-line"></span>
    <span class="toggle-line"></span>
  </div>
  <div class="sidebar-dimmer"></div>
  <div class="back-to-top" role="button" aria-label="返回顶部">
    <i class="fa fa-arrow-up fa-lg"></i>
    <span>0%</span>
  </div>

<noscript>
  <div class="noscript-warning">Theme NexT works best with JavaScript enabled</div>
</noscript>


  <script src="https://cdnjs.cloudflare.com/ajax/libs/animejs/3.2.1/anime.min.js" integrity="sha256-XL2inqUJaslATFnHdJOi9GfQ60on8Wx1C2H8DYiN1xY=" crossorigin="anonymous"></script>
  <script src="https://cdnjs.cloudflare.com/ajax/libs/fancyapps-ui/5.0.31/fancybox/fancybox.umd.js" integrity="sha256-a+H7FYzJv6oU2hfsfDGM2Ohw/cR9v+hPfxHCLdmCrE8=" crossorigin="anonymous"></script>
<script src="/js/comments.js"></script><script src="/js/utils.js"></script><script src="/js/motion.js"></script><script src="/js/sidebar.js"></script><script src="/js/next-boot.js"></script>

  
  <script src="/js/third-party/fancybox.js"></script>


  <script class="next-config" data-name="enableMath" type="application/json">true</script><script class="next-config" data-name="mathjax" type="application/json">{"enable":true,"tags":"none","js":{"url":"https://cdnjs.cloudflare.com/ajax/libs/mathjax/3.2.2/es5/tex-mml-chtml.js","integrity":"sha256-MASABpB4tYktI2Oitl4t+78w/lyA+D7b/s9GEP0JOGI="}}</script>
<script src="/js/third-party/math/mathjax.js"></script>


<script class="next-config" data-name="utterances" type="application/json">{"enable":true,"repo":"chengmingbo/gitment-comments","issue_term":"pathname","theme":"github-light"}</script>
<script src="/js/third-party/comments/utterances.js"></script>

</body>
</html>