Ch_methodology1.html



<!DOCTYPE html>


<html lang="en" >

  <head>
    <meta charset="utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>3. Methodology I: Three basic tasks &#8212; Principles of Machine Learning: A Deployment-First Perspective</title>
  
  
  <script data-cfasync="false">
    // Copy the saved "mode" and "theme" values from localStorage onto the <html>
    // element's data-* attributes. This inline script sits ahead of the stylesheet
    // links below. Fallbacks: "" for mode and "light" for theme when nothing is
    // stored (or the stored value is an empty string).
    document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
    document.documentElement.dataset.theme = localStorage.getItem("theme") || "light";
  </script>
  
  <!-- Loaded before other Sphinx assets -->
  <link href="_static/styles/theme.css?digest=e353d410970836974a52" rel="stylesheet" />
<link href="_static/styles/bootstrap.css?digest=e353d410970836974a52" rel="stylesheet" />
<link href="_static/styles/pydata-sphinx-theme.css?digest=e353d410970836974a52" rel="stylesheet" />

  
  <link href="_static/vendor/fontawesome/6.1.2/css/all.min.css?digest=e353d410970836974a52" rel="stylesheet" />
  <link rel="preload" as="font" type="font/woff2" crossorigin href="_static/vendor/fontawesome/6.1.2/webfonts/fa-solid-900.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="_static/vendor/fontawesome/6.1.2/webfonts/fa-brands-400.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="_static/vendor/fontawesome/6.1.2/webfonts/fa-regular-400.woff2" />

    <link rel="stylesheet" type="text/css" href="_static/pygments.css" />
    <link rel="stylesheet" href="_static/styles/sphinx-book-theme.css?digest=14f4ca6b54d191a8c7657f6c759bf11a5fb86285" type="text/css" />
    <link rel="stylesheet" type="text/css" href="_static/togglebutton.css" />
    <link rel="stylesheet" type="text/css" href="_static/copybutton.css" />
    <link rel="stylesheet" type="text/css" href="_static/mystnb.4510f1fc1dee50b3e5859aac5469c37c29e427902b24a333a5f9fcb2f0b3ac41.css" />
    <link rel="stylesheet" type="text/css" href="_static/sphinx-thebe.css" />
    <link rel="stylesheet" type="text/css" href="_static/pml_admonitions.css" />
    <link rel="stylesheet" type="text/css" href="_static/custom.css" />
    <link rel="stylesheet" type="text/css" href="_static/design-style.4045f2051d55cab465a707391d5b2007.min.css" />
  
  <!-- Pre-loaded scripts that we'll load fully later -->
  <link rel="preload" as="script" href="_static/scripts/bootstrap.js?digest=e353d410970836974a52" />
<link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52" />

    <script data-url_root="./" id="documentation_options" src="_static/documentation_options.js"></script>
    <script src="_static/jquery.js"></script>
    <script src="_static/underscore.js"></script>
    <script src="_static/_sphinx_javascript_frameworks_compat.js"></script>
    <script src="_static/doctools.js"></script>
    <script src="_static/clipboard.min.js"></script>
    <script src="_static/copybutton.js"></script>
    <script src="_static/scripts/sphinx-book-theme.js?digest=5a5c038af52cf7bc1a1ec88eea08e6366ee68824"></script>
    <script>let toggleHintShow = 'Click to show';</script>
    <script>let toggleHintHide = 'Click to hide';</script>
    <script>let toggleOpenOnPrint = 'true';</script>
    <script src="_static/togglebutton.js"></script>
    <script>var togglebuttonSelector = '.toggle, .admonition.dropdown';</script>
    <script src="_static/design-tabs.js"></script>
    <script async="async" src="https://www.googletagmanager.com/gtag/js?id=G-0HQMPESCSN"></script>
    <script>
                // Analytics bootstrap for the gtag.js script loaded asynchronously above:
                // reuse an existing global dataLayer queue, or create an empty one.
                window.dataLayer = window.dataLayer || [];
                // Pushes the raw `arguments` object (not a copied array) onto the queue,
                // so this must remain a classic function rather than an arrow function.
                function gtag(){ dataLayer.push(arguments); }
                // Queue a 'js' entry carrying the current timestamp, then a 'config'
                // entry for the same measurement ID used in the script URL above.
                gtag('js', new Date());
                gtag('config', 'G-0HQMPESCSN');
            </script>
    <script>const THEBE_JS_URL = "https://unpkg.com/thebe@0.8.2/lib/index.js"
// Selector strings declared as globals alongside THEBE_JS_URL; presumably read by
// _static/sphinx-thebe.js (loaded right after this script) to locate code cells,
// their input element and their output areas — confirm against that script.
const thebe_selector = ".thebe,.cell"
const thebe_selector_input = "pre"
const thebe_selector_output = ".output, .cell_output"
</script>
    <script async="async" src="_static/sphinx-thebe.js"></script>
    <script>window.MathJax = {"options": {"processHtmlClass": "tex2jax_process|mathjax_process|math|output_area"}}</script>
    <script defer="defer" src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
    <script>DOCUMENTATION_OPTIONS.pagename = 'Ch_methodology1';</script>
    <link rel="shortcut icon" href="_static/pml_ico.ico"/>
    <link rel="index" title="Index" href="genindex.html" />
    <link rel="search" title="Search" href="search.html" />
    <link rel="next" title="4. Classification I: The geometric view" href="Ch_classification1.html" />
    <link rel="prev" title="2. Regression" href="Ch_regression.html" />
  <meta name="viewport" content="width=device-width, initial-scale=1"/>
  <meta name="docsearch:language" content="en"/>
  </head>
  
  
  <body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">

  
  <a class="skip-link" href="#main-content">Skip to main content</a>
  
  <input type="checkbox"
          class="sidebar-toggle"
          name="__primary"
          id="__primary"/>
  <label class="overlay overlay-primary" for="__primary"></label>
  
  <input type="checkbox"
          class="sidebar-toggle"
          name="__secondary"
          id="__secondary"/>
  <label class="overlay overlay-secondary" for="__secondary"></label>
  
  <div class="search-button__wrapper">
    <div class="search-button__overlay"></div>
    <div class="search-button__search-container">
<form class="bd-search d-flex align-items-center"
      action="search.html"
      method="get">
  <i class="fa-solid fa-magnifying-glass"></i>
  <input type="search"
         class="form-control"
         name="q"
         id="search-input"
         placeholder="Search this book..."
         aria-label="Search this book..."
         autocomplete="off"
         autocorrect="off"
         autocapitalize="off"
         spellcheck="false"/>
  <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
</form></div>
  </div>
  
    <nav class="bd-header navbar navbar-expand-lg bd-navbar">
    </nav>
  
  <div class="bd-container">
    <div class="bd-container__inner bd-page-width">
      
      <div class="bd-sidebar-primary bd-sidebar">
        

  <div class="sidebar-header-items sidebar-primary__section">
    
    
  </div>
  
    <div class="sidebar-primary-items__start sidebar-primary__section">
        <div class="sidebar-primary-item">
  

<a class="navbar-brand logo" href="welcome.html">
  
  
    <img src="_static/pml_logo.png" class="logo__image only-light" alt="Logo image"/>
    <script>document.write(`<img src="_static/pml_logo.png" class="logo__image only-dark" alt="Logo image"/>`);</script>
  
  
</a></div>
        <div class="sidebar-primary-item"><nav class="bd-links" id="bd-docs-nav" aria-label="Main">
    <div class="bd-toc-item navbar-nav active">
        
        <ul class="nav bd-sidenav bd-sidenav__home-link">
            <li class="toctree-l1">
                <a class="reference internal" href="welcome.html">
                    Welcome to our Principles of Machine Learning
                </a>
            </li>
        </ul>
        <ul class="current nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="Ch_introduction.html">1. Introduction</a></li>
<li class="toctree-l1"><a class="reference internal" href="Ch_regression.html">2. Regression</a></li>
<li class="toctree-l1 current active"><a class="current reference internal" href="#">3. Methodology I: Three basic tasks</a></li>
<li class="toctree-l1"><a class="reference internal" href="Ch_classification1.html">4. Classification I: The geometric view</a></li>
<li class="toctree-l1"><a class="reference internal" href="Ch_discovery.html">5. Structure analysis</a></li>
<li class="toctree-l1"><a class="reference internal" href="Ch_density.html">6. Density estimation</a></li>
<li class="toctree-l1"><a class="reference internal" href="Ch_classification2.html">7. Classification II: The probabilistic view</a></li>
<li class="toctree-l1"><a class="reference internal" href="Ch_methodology2.html">8. Methodology II: Pipelines</a></li>
<li class="toctree-l1"><a class="reference internal" href="Ch_feature.html">9. Feature Engineering</a></li>
<li class="toctree-l1"><a class="reference internal" href="Ch_ensemble.html">10. Ensemble methods</a></li>
<li class="toctree-l1"><a class="reference internal" href="Ch_neuralnets.html">11. Neural networks</a></li>
<li class="toctree-l1"><a class="reference internal" href="Ch_optimisation.html">12. Optimisation methods</a></li>
<li class="toctree-l1"><a class="reference internal" href="Ch_methodology3.html">13. Methodology III: Workflows</a></li>
<li class="toctree-l1"><a class="reference internal" href="Ch_ethics.html">14. The machine learning professional</a></li>
<li class="toctree-l1"><a class="reference internal" href="Ch_appendix.html">15. Appendix</a></li>
</ul>
<hr style="height:2px;border:none;color:#000000;background-color:#000000;width:50%;text-align:center;margin:10px auto auto auto;">
</div>

</nav>
</div></div>

<a><b>Readers:</b></a>
<div style="height:80%;width:80%;">
<script type="text/javascript" id="clstr_globe" src="//clustrmaps.com/globe.js?d=06DuCmf206QlXB0PwXp_5bEXHN0MJWuVeBiYDLQ4Ovc"></script>
<!-- <h1>Test 0</h1> -->
</div>
<hr>
  <div class="sidebar-primary-items__end sidebar-primary__section">
  </div>
  
  <div id="rtd-footer-container"></div>


      </div>
      
      <main id="main-content" class="bd-main">
        
        
<div class="sbt-scroll-pixel-helper"></div>

          <div class="bd-content">
            <div class="bd-article-container">
              
              <div class="bd-header-article">
<div class="header-article-items header-article__inner">
  
    <div class="header-article-items__start">
      
        <div class="header-article-item"><label class="sidebar-toggle primary-toggle btn btn-sm" for="__primary" title="Toggle primary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
  <span class="fa-solid fa-bars"></span>
</label></div>
      
    </div>
  
  
    <div class="header-article-items__end">
      
        <div class="header-article-item">

<div class="article-header-buttons">


<div class="dropdown dropdown-source-buttons">
  <button class="btn dropdown-toggle" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-label="Source repositories">
    <i class="fab fa-github"></i>
  </button>
  <ul class="dropdown-menu">
      
      
      <li><a href="https://github.com/PMLBook/PMLBook.github.io" target="_blank"
   class="btn btn-sm btn-source-repository-button dropdown-item"
   title="Source repository"
   data-bs-placement="left" data-bs-toggle="tooltip"
>
  

<span class="btn__icon-container">
  <i class="fab fa-github"></i>
  </span>
<span class="btn__text-container">Repository</span>
</a>
</li>
      
      
      <li><a href="https://github.com/PMLBook/PMLBook.github.io/issues/new?title=Issue%20on%20page%20%2FCh_methodology1.html&body=Your%20issue%20content%20here." target="_blank"
   class="btn btn-sm btn-source-issues-button dropdown-item"
   title="Open an issue"
   data-bs-placement="left" data-bs-toggle="tooltip"
>
  

<span class="btn__icon-container">
  <i class="fas fa-lightbulb"></i>
  </span>
<span class="btn__text-container">Open issue</span>
</a>
</li>
      
  </ul>
</div>


<div class="dropdown dropdown-download-buttons">
  <button class="btn dropdown-toggle" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-label="Download this page">
    <i class="fas fa-download"></i>
  </button>
  <ul class="dropdown-menu">
      
      
      <li><a href="_sources/Ch_methodology1.md" target="_blank"
   class="btn btn-sm btn-download-source-button dropdown-item"
   title="Download source file"
   data-bs-placement="left" data-bs-toggle="tooltip"
>
  

<span class="btn__icon-container">
  <i class="fas fa-file"></i>
  </span>
<span class="btn__text-container">.md</span>
</a>
</li>
      
      
      <li>
<button onclick="window.print()"
  class="btn btn-sm btn-download-pdf-button dropdown-item"
  title="Print to PDF"
  data-bs-placement="left" data-bs-toggle="tooltip"
>
  

<span class="btn__icon-container">
  <i class="fas fa-file-pdf"></i>
  </span>
<span class="btn__text-container">.pdf</span>
</button>
</li>
      
  </ul>
</div>


<button onclick="toggleFullScreen()"
  class="btn btn-sm btn-fullscreen-button"
  title="Fullscreen mode"
  data-bs-placement="bottom" data-bs-toggle="tooltip"
>
  

<span class="btn__icon-container">
  <i class="fas fa-expand"></i>
  </span>

</button>


<script>
document.write(`
  <button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
    <span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span>
    <span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span>
    <span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span>
  </button>
`);
</script>

<script>
document.write(`
  <button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
    <i class="fa-solid fa-magnifying-glass"></i>
  </button>
`);
</script>
<label class="sidebar-toggle secondary-toggle btn btn-sm" for="__secondary" title="Toggle secondary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
    <span class="fa-solid fa-list"></span>
</label>
</div></div>
      
    </div>
  
</div>
</div>
              
              
<div id="jb-print-docs-body" class="onlyprint">
    <h1>Methodology I: Three basic tasks</h1>
    <!-- Table of contents -->
    <div id="print-main-content">
        <div id="jb-print-toc">
            
            <div>
                <h2> Contents </h2>
            </div>
            <nav aria-label="Page">
                <ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#ventris-decisive-check">3.1. Ventris’ decisive check</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#populations-and-datasets">3.2. Populations and datasets</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#the-notion-of-population">3.2.1. The notion of population</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#from-populations-to-datasets">3.2.2. From populations to datasets</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#the-test-task">3.3. The test task</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#true-and-empirical-qualities">3.3.1. True and empirical qualities</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#testing-as-quality-estimation">3.3.2. Testing as quality estimation</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#the-training-task">3.4. The training task</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#the-error-surface">3.4.1. The error surface</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#looking-for-the-optimal-model">3.4.2. Looking for the optimal model</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#true-and-empirical-error-surfaces">3.4.3. True and empirical error surfaces</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#optimisation-on-the-empirical-error-surface">3.4.4. Optimisation on the empirical error surface</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#overfitting-and-regularisation">3.4.5. Overfitting and regularisation</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#optimisation-quality-and-target-quality">3.4.6. Optimisation quality and target quality</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#the-validation-task">3.5. The validation task</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#summary-and-discussion">3.6. Summary and discussion</a></li>
</ul>
            </nav>
        </div>
    </div>
</div>

              
<div id="searchbox"></div>
                <article class="bd-article" role="main">
                  
  <div class="tex2jax_ignore mathjax_ignore section" id="methodology-i-three-basic-tasks">
<span id="meth1"></span><h1><span class="section-number">3. </span>Methodology I: Three basic tasks<a class="headerlink" href="#methodology-i-three-basic-tasks" title="Permalink to this heading">#</a></h1>
<p>In the <a class="reference internal" href="Ch_regression.html#reg"><span class="std std-ref">Regression</span></a> chapter we explored our first family of machine learning problems. We defined regression as a problem where we seek to predict the value of a continuous label based on a set of predictors. Crucially, we highlighted that what makes regression a machine learning problem is the use of datasets to build our solutions. However, while analysing several cases we discovered a disturbing reality: solution models that appear to work well on a training dataset, might actually perform poorly when deployed. The main take-home message was that we cannot tell how well a model will work when deployed by simply looking at its performance on the training dataset. This is very concerning, as we would like to deploy only those models that we know will work well.</p>
<p>Do these findings reveal an intrinsic limitation of machine learning? The answer is no. What they indicate is that the machine learning methodology that we have used so far is limited. In this chapter, we will focus on developing a rigorous machine learning methodology. The principles that we will present in this chapter are general and applicable to any machine learning problem, be it a regression, a classification or an unsupervised learning one. Our starting point will be a discussion around the fundamental concepts of population and datasets. This discussion will be guided by our deployment-first view of machine learning. Then, we will be covering three main machine learning tasks, namely the test task, the training task and the validation task. As you should expect, in all three tasks we will be using datasets.</p>
<p>Before we immerse ourselves into developing our machine learning methodology, let us travel back in time and revisit one of the greatest scientific achievements of the 20th century: the decipherment of the ancient Linear B script. Out of this story, seemingly unrelated to machine learning, we will extract our third top tip.</p>
<div class="section" id="ventris-decisive-check">
<h2><span class="section-number">3.1. </span>Ventris’ decisive check<a class="headerlink" href="#ventris-decisive-check" title="Permalink to this heading">#</a></h2>
<p>Crete is a mountainous island in the eastern Mediterranean, with the Aegean Sea to the North and the Libyan Sea to the South. There are traces of human settlements in Crete dating back to the Paleolithic and, ever since, a multitude of cultures and civilisations have uninterruptedly inhabited its shores, mountains and valleys. In classical times, Crete was renowned for having been ruled by King Minos, who every nine years would send seven girls and seven boys to be devoured by the fearsome Minotaur, a part-man, part-bull creature trapped inside an elaborate stone maze, the Labyrinth. To say that Crete is one of the epicentres of the classical era is no overstatement.</p>
<p>The historical prominence of Crete attracted by the end of the 19th century robbers, antiquarians, archaeologists and scholars - if they could be distinguished. Among those individuals was the British archaeologist Arthur Evans who, in 1900, started excavating the ruins of the ancient city of Knossos, in central Crete. In Knossos, Evans believed he had discovered the Minotaur Labyrinth and the palace of King Minos, and during his excavations he dug up many fine artifacts, including a bull-shaped drinking cup and two snake goddess statuettes. Perhaps most intriguing of all, Evans found thousands of small, palm-sized clay tablets written in an unknown script (see <a class="reference internal" href="#linearbtablet"><span class="std std-numref">Fig. 3.1</span></a>). Evans called this script ‘Linear Script of Class B’, to distinguish it from a different script also found in Crete, the ‘Linear Script of Class A’, and went on to spend his life trying to decipher it. Evans died in 1941 having failed to decipher the Linear B script. That honour would fall to another Briton, Michael Ventris, an architect who announced his solution to this enigma in 1952.</p>
<div class="figure align-default" id="linearbtablet">
<a class="reference internal image-reference" href="_images/LinearBTablet.jpg"><img alt="_images/LinearBTablet.jpg" src="_images/LinearBTablet.jpg" style="width: 709.2px; height: 208.8px;" /></a>
<p class="caption"><span class="caption-number">Fig. 3.1 </span><span class="caption-text">Clay tablet written in Linear B. Photo taken in The British Museum, London.</span><a class="headerlink" href="#linearbtablet" title="Permalink to this image">#</a></p>
</div>
<p>You might be asking yourself what took the world so long to decipher Linear B, but the right question should be instead, how did we manage to decipher Linear B at all? Those who were trying to decipher Linear B were looking at thousands of fragments of text of unknown contents, written in an unknown script, encoding an unknown language. Think of it for a moment: Is this not an impossible proposition? Despite this, Ventris got to crack Linear B. Ventris’ reflections on the methods that he used to decipher Linear B contain authentic scientific jewels that can be easily overlooked. Let us bring back to life one of these gems, <strong>Ventris’ decisive check</strong>.</p>
<p>According to Ventris, any decipherment process consists of three separate stages, the third of which is</p>
<p><em>a decisive check, preferably with the aid of virgin material, to ensure that the apparent results are not due to fantasy, coincidence or circular reasoning</em>.</p>
<p>Ventris is telling us that no matter how promising our solution looks, our work should not end until we have <em>checked</em> it. Crucially, this final check should be done using material that we have not seen while we were building our solution. The history of decipherment abounds with examples of decipherers announcing mutually incompatible solutions to the same riddle, following what they usually describe as eureka moments. Most of these decipherers are in reality fooling themselves, as they assume that for a solution to be valid, it only has to work on the material that they have used to build it. Good decipherers are not only capable of providing promising solutions: they also know how to rigorously check them.</p>
<div class="tip admonition">
<p class="admonition-title">Our third top tip follows on from Ventris’ advice:</p>
<h3 style="text-align: center;"><b>Don't fool thyself!</b></h3>
</div>
<p>Ventris’ reflections should by now sound familiar to you. In machine learning, as in decipherment, we solve problems using some recorded material, in our case, datasets. Machine learning also shares with decipherment the risk of building solutions that work well on the recorded material, but are actually wrong. In machine learning lingo, decipherers that fool themselves build solutions that <em>overfit</em> to their material and are unaware of it. In machine learning, as in decipherment, being able to check rigorously our solutions is as essential as being able to build them.</p>
<p>In this chapter we will develop a rigorous machine learning methodology that will help us reduce the risk of fooling ourselves. We will call our final, decisive check, the <strong>test task</strong>. On a final note, Ventris announced his solution in July 1952, but its confirmation had to wait until May 1953, when an American archaeologist, Carl Blegen, who was excavating a site in Pylos, Greece, found a new clay tablet written in Linear B and, using Ventris’ solution, was able to read it. This was Ventris’ decisive check to the letter.</p>
</div>
<div class="section" id="populations-and-datasets">
<h2><span class="section-number">3.2. </span>Populations and datasets<a class="headerlink" href="#populations-and-datasets" title="Permalink to this heading">#</a></h2>
<p>We briefly introduced the concept of population in the <a class="reference internal" href="Ch_introduction.html#intro"><span class="std std-ref">Introduction</span></a> chapter, where we defined it as an entity from which samples are extracted. What did we mean by this? And how are datasets, which are collections of samples, related to populations? In this section we will explore the relationship between populations and datasets in detail.</p>
<div class="section" id="the-notion-of-population">
<h3><span class="section-number">3.2.1. </span>The notion of population<a class="headerlink" href="#the-notion-of-population" title="Permalink to this heading">#</a></h3>
<p>In its original meaning, a population is a collection of individuals who inhabit a particular place, such as a town, a region or a country. Statistics as a discipline originated out of the need to understand such -human- populations, but as time went by the field of statistics grew to encompass a wider range of problems. This resulted in the term <em>population</em> abandoning its first, concrete sense and being used to refer to any entity that can be studied using the same methods developed to study human populations.</p>
<p>Given a machine learning problem, a population is the abstract entity that consists of all the possible samples that the problem can refer to. A population in machine learning can produce samples, which is the reason why we sometimes use the term <em>data source</em> instead. Machine learning problems always define a target population and any machine learning solution is meant to work on samples extracted from the target population. Let us reflect on the notion of population using two machine learning examples. First, consider the regression problem of guessing the salary of an individual of whom we know their age. When we formulate this problem, we need to specify which group of individuals we are targeting, as it would be reasonable to expect the relationship between salary and age to be dependent on the time when and place where the individuals live. For instance, we would expect the relationship between age and salary in today’s city of Heraklion, on the island of Crete, to be different from that in 19th century Paris. If we chose Heraklion, our target population would consist of the salary and age of every Heraklian. One sample from this population would therefore be the age and the salary of one single Heraklian.</p>
<p>In our first example it is relatively easy to imagine a population as a group of concrete, physical items, i.e. humans. However, most of the time such means to imagine populations will not be available. For our second example, consider the problem of translating into English a fragment of text written in Linear B. What would be our population? The answer would be, every possible fragment of Linear B text, together with its English translation, whether they physically exist or not. In fact, most of the samples in this population do not exist physically. We could pretend that they all exist buried somewhere on the Mediterranean coast and have not yet been discovered, but we do not need to, as we can abstract away their physical existence. The same could be said about the target population in the machine learning problem of translating a fragment of text written in Hindi into Spanish. This population should include every possible fragment of Hindi text and its Spanish translation, whether they have already been written or not.</p>
<div class="question1 admonition">
<p class="admonition-title">Question for you</p>
<p>Consider the problem of predicting the distance driven by a car moving at constant speed, using its speed and journey duration as predictors. What would be our target population?</p>
<ol class="arabic simple">
<li><p>This problem does not have a target population.</p></li>
<li><p>The values of the distance, speed and duration of every possible car journey.</p></li>
<li><p>All the cars that have existed and will ever exist.</p></li>
</ol>
<p>Submit your response here: <a href="https://forms.office.com/e/gxdeFpu2ek" target = "_blank">Your Response</a></p>
</div>
<p>In the moving car problem, we are seeking to build a model that takes speed and journey duration as input predictors and produces a distance as the output label. It might be tempting to conclude that the population consists of all past, present and future cars. However, this would be wrong. To identify our population we need to first recognise the structure of one individual sample. Samples are abstractions described by a set of attributes. In the moving car problem we are considering samples that have three attributes, namely distance, speed and journey duration. Cars are therefore not samples and our target population is not a collection of cars. Our target population is instead the collection of triplets consisting of the values of distance, speed and duration of every possible car journey.</p>
<p>We have already discussed that, even though the moving car problem is a valid regression problem, we would never use machine learning to solve it. The reason for this is simple: we already know that we can compute the distance by simply multiplying speed and journey duration. The moving car problem is an example of a problem where we have a <strong>perfect description of the population</strong>, as we know the relationship between its attributes exactly. Using a perfect description of our target population, we can identify the model with the highest <em>deployment quality</em>. It is when we lack a perfect description of our population that it makes sense for us to use machine learning approaches. In fact, we could say that machine learning approaches use datasets to build an approximate description of our target population.</p>
</div>
<div class="section" id="from-populations-to-datasets">
<h3><span class="section-number">3.2.2. </span>From populations to datasets<a class="headerlink" href="#from-populations-to-datasets" title="Permalink to this heading">#</a></h3>
<p>The process by which samples are extracted from a population is known as <strong>population sampling</strong>. Understanding population sampling is very important in machine learning, as our starting point is that we lack a perfect description of our target population and because of it, we have no choice but to resort to datasets extracted from it. We sometimes say that machine learning treats datasets as population <em>surrogates</em>, which indirectly provide an imperfect description of our target population. The question is then, how can we use datasets to learn something useful about our target population?</p>
<!-- ```{figure} images/PopulationSampling.jpg
---
name: PopulationSampling
---
Datasets are created by sampling a population, in other words, by extracting a collection of individual samples from it.
``` -->
<div class="figure align-default" id="populationsampling">
<img alt="_images/PopulationSampling.svg" src="_images/PopulationSampling.svg" /><p class="caption"><span class="caption-number">Fig. 3.2 </span><span class="caption-text">Datasets are created by sampling a population, in other words, by extracting a collection of individual samples from it.</span><a class="headerlink" href="#populationsampling" title="Permalink to this image">#</a></p>
</div>
<p>Let us discuss population sampling in the context of an already familiar problem, namely that of predicting the salary of an individual from their age. To build a regression model that predicts the salary of a Heraklian based on their age, we could sample the city of Heraklion by recording the salary and age of a group of its inhabitants. The collection of all the salaries and ages that we have recorded would form our dataset. Needless to say, our ultimate goal would be to build a model that predicts accurately the salary of any inhabitant of Heraklion picked at random, not just the salary of the Heraklians that we have included in our dataset. In other words, our goal is to be able to <strong>generalise</strong> what we have learnt from the dataset, to the entire population.</p>
<p>In order for us to be able to build solutions that generalise well, datasets need to be <strong>representative</strong> of our target population. First we have to ensure that all the samples in our datasets come from the same target population, e.g. no 19th century Parisians should be included in our Heraklian dataset. In statistical lingo, when our samples are extracted from the same population we say that they are <strong>identically distributed</strong>. Second, we need the samples in our dataset to be extracted following the same mechanism that will operate when the model is deployed against the target population. For example, rather than creating a dataset using the salaries and ages of Heraklians that belong to the same family or live in the same neighbourhood, we need the salary and age of Heraklians that have been <strong>randomly</strong> and <strong>independently</strong> drawn from the population. In other words, when extracting samples from our population, we must not impose any relationship between the samples that are being extracted. When we create a dataset following these two rules, we say that the samples in our dataset are <strong>independent and identically distributed</strong> or <strong>IID</strong>. Finally, our datasets need to have a <strong>sufficiently large number of samples</strong> so that we can reduce the risk of representing only partial segments of our population.</p>
</div>
</div>
<div class="section" id="the-test-task">
<h2><span class="section-number">3.3. </span>The test task<a class="headerlink" href="#the-test-task" title="Permalink to this heading">#</a></h2>
<p>In our <a class="reference internal" href="Ch_introduction.html#intro"><span class="std std-ref">Introduction</span></a> chapter we presented a basic machine learning model lifecycle consisting of two basic stages, namely the learning stage and the deployment stage. In addition to understanding what each stage is about, it is worth asking ourselves <em>who</em> is involved in each stage, i.e. who builds a model ready to be deployed and who will deploy a model that has already been built. In many scenarios this will be the same person or team, but in general we should not expect this to be the case. We can in fact see the machine learning world as an ecosystem consisting of multiple actors that produce, distribute and deploy models. In this ecosystem, whoever is responsible for deploying a model, whether they have built the model or not, needs to be confident that the model is good enough to be deployed.</p>
<p>The <strong>test task is arguably the most important task in machine learning</strong>, as it allows us to estimate the future deployment quality of any given model. Whether we create our own models and would like to quantify how good they are, or we are interested in deploying models that have already been built by others, we will always need to run a test task. In this section we will discuss the principles behind machine learning test tasks and how to interpret correctly the results of machine learning testing.</p>
<div class="section" id="true-and-empirical-qualities">
<h3><span class="section-number">3.3.1. </span>True and empirical qualities<a class="headerlink" href="#true-and-empirical-qualities" title="Permalink to this heading">#</a></h3>
<p>In our <a class="reference internal" href="Ch_regression.html#reg"><span class="std std-ref">Regression</span></a> chapter we defined regression as an optimisation problem where the best model was the one with the highest quality <em>on the training dataset</em>. We found this <em>Take 1</em> definition to be naive, as we soon discovered that models that work really well on a training dataset might perform poorly when deployed. Let us update our definition. From now on, <strong>the best model will be defined as the one with the highest quality on the target population</strong>, i.e. in the environment that our model is exposed to when deployed. This is our <strong>Take 2</strong> and final definition.</p>
<p>We will call the quality of a model on our target population its <strong>true quality</strong>. Given a model, its true quality is the metric that we would really like to know. Unfortunately, to know the true quality of a model we need a perfect description of our target population. In machine learning we assess the quality of a model using a dataset consisting of samples extracted from the target population. We call this second metric the <strong>empirical quality</strong> of our model, as it is obtained from indirect observations of our population, i.e. from a dataset. The question arises, what is the relationship between the true quality and the empirical quality of a model? The answer is, <strong>the empirical quality of a model is an <em>estimation</em> of its true quality</strong>. We sometimes reflect this relationship in our mathematical notation. For instance, if we define the notion of quality of a regression model using the MSE, we would denote the true quality of the model as <span class="math notranslate nohighlight">\(MSE\)</span> and its empirical quality as <span class="math notranslate nohighlight">\(\widehat{MSE}\)</span>.</p>
<p>Once we have understood that an empirical quality should be interpreted as an estimation of a true quality, it is easier to establish whether a proposed quality metric is suitable or not. Let us look at the MSE and SSE used in regression. Back in our <a class="reference internal" href="Ch_regression.html#reg"><span class="std std-ref">Regression</span></a> chapter we suggested that we could consider both to be equivalent, but hinted at some differences in interpretation. Now we are in a position to explain what we meant. The notion of MSE can be defined both in a population and a dataset, as it makes sense to talk about an average squared error across all the samples of the population, and an average squared error across the samples of a dataset. By contrast, even though we can compute the SSE on a dataset, the SSE does not always make sense when considering the target population, as the population could potentially be infinite. Furthermore, whereas we can compare meaningfully MSE values computed on different datasets, the same cannot be said about the SSE. For instance if one dataset consists of a low number of samples and another of a large number of samples, the SSE computed on the former will be expected to be lower than the SSE computed on the latter. This difference will not be due to differences in quality, but simply because of the difference in the number of samples. From now on, our quality metrics should be such that we will be able to define them both on a dataset and a population. Consequently, we will prefer the MSE to the SSE.</p>
</div>
<div class="section" id="testing-as-quality-estimation">
<h3><span class="section-number">3.3.2. </span>Testing as quality estimation<a class="headerlink" href="#testing-as-quality-estimation" title="Permalink to this heading">#</a></h3>
<p>The test task in machine learning is the process by which we <strong>assess the true quality</strong> of a given model using datasets. Note that during testing we are not interested in knowing how the model was built or how it works internally, we are only interested in assessing how well it will perform when deployed. In fact, we can test models that have been built using non machine learning approaches. For testing purposes, models can be treated as <strong>black boxes</strong>.</p>
<p>A dataset used for testing purposes is known as a <strong>test dataset</strong> and the empirical quality of the model being tested is known as the <strong>test quality</strong> (see <a class="reference internal" href="#populationsamplingtest"><span class="std std-numref">Fig. 3.3</span></a>). Test datasets need to be <strong>IID</strong> and crucially <strong>independent from datasets used for model training</strong> (remember Ventris’ decisive check?). As an empirical quantity, the test quality of a model is different from the true quality that we actually want to estimate. But how different?</p>
<div class="figure align-default" id="populationsamplingtest">
<img alt="_images/PopulationSamplingTest.svg" src="_images/PopulationSamplingTest.svg" /><p class="caption"><span class="caption-number">Fig. 3.3 </span><span class="caption-text">A test task uses a dataset extracted from the target population (a <em>test dataset</em>) to assess the true quality of a model. The estimation that a test task produces is known as the <em>test quality</em>. The true quality can only be obtained if we have a perfect description of the population.</span><a class="headerlink" href="#populationsamplingtest" title="Permalink to this image">#</a></p>
</div>
<p>To correctly interpret the value of a test quality, we need to understand its <strong>random nature</strong>. Test and in general empirical metrics are random as the datasets from which they are obtained consist of samples extracted randomly from the population. It is important to emphasise that the term <em>random</em> should not be interpreted in a colloquial sense, but strictly in a statistical one. In other words, the numerical value of a test quality will not be just <em>any</em> value (colloquial sense), but we can expect it to lie within a range of values with some probability (statistical sense).</p>
<p><a class="reference internal" href="#populationsamplingtest3"><span class="std std-numref">Fig. 3.4</span></a> illustrates the idea that a test quality is a random value. Consider a regression problem where we use the MSE as our notion of quality. Imagine we are given an already built model whose true quality is <span class="math notranslate nohighlight">\(MSE = 10\)</span>. The true quality is what we would like to know, however in machine learning we do not have direct access to it, so let us pretend we do not know this. If we extracted three separate IID datasets from the population, we would obtain three different test qualities, for instance, <span class="math notranslate nohighlight">\(\widehat{MSE}_1=9.5\)</span>, <span class="math notranslate nohighlight">\(\widehat{MSE}_2=11\)</span> and <span class="math notranslate nohighlight">\(\widehat{MSE}_3=10.9\)</span>. We would never expect the different test qualities to be identical to one another nor equal to the true quality, but we would expect their values to be related to the true quality. How close a test quality is to the underlying true quality depends on how representative our dataset is of the target population.</p>
<div class="figure align-default" id="populationsamplingtest3">
<img alt="_images/PopulationSamplingTest3.svg" src="_images/PopulationSamplingTest3.svg" /><p class="caption"><span class="caption-number">Fig. 3.4 </span><span class="caption-text">Datasets are extracted randomly from the target population. Consequently, a test quality is also a random quantity. If we test the same model on three different datasets, the test quality will be different for each dataset.</span><a class="headerlink" href="#populationsamplingtest3" title="Permalink to this image">#</a></p>
</div>
<p>The random nature of the test quality has crucial implications when we use test tasks to compare different models.</p>
<div class="question1 admonition">
<p class="admonition-title">Question for you</p>
<p>We are trying to decide which model to deploy out of three candidate models built by three separate teams, <span class="math notranslate nohighlight">\(A\)</span>, <span class="math notranslate nohighlight">\(B\)</span> and <span class="math notranslate nohighlight">\(C\)</span>. We compute the test quality of each model on the same test dataset, as shown in <a class="reference internal" href="#populationsamplingtest3model"><span class="std std-numref">Fig. 3.5</span></a>, and obtain <span class="math notranslate nohighlight">\(\widehat{MSE}_A=100\)</span>, <span class="math notranslate nohighlight">\(\widehat{MSE}_B=99.9\)</span> and <span class="math notranslate nohighlight">\(\widehat{MSE}_C=200\)</span>. Which model would we choose, the model built by team <span class="math notranslate nohighlight">\(A\)</span>, <span class="math notranslate nohighlight">\(B\)</span> or <span class="math notranslate nohighlight">\(C\)</span>?</p>
<p>Submit your response here: <a href="https://forms.office.com/e/CuAFZEn5v7" target = "_blank">Your Response</a></p>
</div>
<div class="figure align-default" id="populationsamplingtest3model">
<img alt="_images/PopulationSamplingTest3model_ABC.svg" src="_images/PopulationSamplingTest3model_ABC.svg" /><p class="caption"><span class="caption-number">Fig. 3.5 </span><span class="caption-text">Models can be compared by estimating their deployment qualities using a test task. When comparing their test qualities, we need to remember they are random quantities.</span><a class="headerlink" href="#populationsamplingtest3model" title="Permalink to this image">#</a></p>
</div>
<p>According to the test quality, model <span class="math notranslate nohighlight">\(B\)</span> appears to be the best, as it has the lowest test MSE. Is it, however, the best? By now, we know that the test MSE is different from the true MSE and also that the test MSE, as an empirical quantity, is random. Could it be that model <span class="math notranslate nohighlight">\(B\)</span> appears to be better than <span class="math notranslate nohighlight">\(A\)</span> just by chance? In other words, when is a difference between two random values <em>significant</em>? Machine learning practitioners need to always be aware that random effects can mislead them. One of the best illustrations of this is the so-called <strong>Infinite Monkey Theorem</strong>. According to this tongue-in-cheek theorem, if you have an unlimited number of monkeys typing in a room for long enough, at some point one of them will type the entire works of William Shakespeare. Would you say that a monkey that types a poem is in fact a poet? Would you not say that the poem has been entirely produced by chance? In machine learning, as in the Infinite Monkey Theorem room, randomness is always running against you.</p>
</div>
</div>
<div class="section" id="the-training-task">
<span id="meth1-train"></span><h2><span class="section-number">3.4. </span>The training task<a class="headerlink" href="#the-training-task" title="Permalink to this heading">#</a></h2>
<p>Training tasks are the heart of model-building. As we already saw in our <a class="reference internal" href="Ch_regression.html#reg"><span class="std std-ref">Regression</span></a> chapter, training allows us to set the parameters of a tunable model using a dataset which we call the <strong>training dataset</strong>. Specifically, training uses optimisation approaches to find the <em>best</em> values for a model’s parameters, according to a notion of quality defined <em>on the training dataset</em>.</p>
<p>As we know, populations and datasets can lead to different quality metrics and consequently, the best parameters according to a dataset and the population it comes from can be very different. In this section we will discuss in more detail model training and the crucial role of optimisation theory. Once again, we will think of our datasets as imperfect images, or surrogates, of the target populations from which they have been extracted.</p>
<div class="section" id="the-error-surface">
<h3><span class="section-number">3.4.1. </span>The error surface<a class="headerlink" href="#the-error-surface" title="Permalink to this heading">#</a></h3>
<p>In an optimisation problem we seek to find the best model, known as the <strong>optimal model</strong>, among a family of candidate models. An example of such a family is the family of linear models, which consists of all the straight lines that can be generated changing the values of the intercept and gradient parameters.</p>
<p>The notion of quality in optimisation theory is defined by a mathematical function known as the <strong>cost function</strong>, <strong>loss function</strong> or <strong>error function</strong>. We will use the more familiar term <strong>error function</strong> and will denote it by <span class="math notranslate nohighlight">\(E\{\boldsymbol{w}\}\)</span>, which could be read as <em>the error <span class="math notranslate nohighlight">\(E\)</span> associated to the model with parameters <span class="math notranslate nohighlight">\(\boldsymbol{w}\)</span></em>. The MSE of a regression model on a dataset is an example of an error function: for every model defined by a set of parameters <span class="math notranslate nohighlight">\(\boldsymbol{w}\)</span>, there is one error value <span class="math notranslate nohighlight">\(E\{\boldsymbol{w}\}\)</span>.</p>
<p>Using mathematical notation, we can define the optimal model as the model whose parameters <span class="math notranslate nohighlight">\(\boldsymbol{w}_{opt}\)</span> are</p>
<div class="math notranslate nohighlight" id="equation-eqminime">
<span class="eqno">(3.1)<a class="headerlink" href="#equation-eqminime" title="Permalink to this equation">#</a></span>\[
\boldsymbol{w}_{opt} = \underset{\boldsymbol{w} \in W}{\operatorname{argmin}} E\{\boldsymbol{w}\}
\]</div>
<p>In words, the optimal model is the one that has the lowest error among the family of models <span class="math notranslate nohighlight">\(W\)</span>. To understand optimisation methods, it is sometimes useful to visualise the error function as an <strong>error surface</strong>. Although we can only visualise error surfaces for simple models that have one or at most two parameters, the intuition that we can extract from this low-dimensional visualisation is essential for understanding more complex scenarios.</p>
<p><a class="reference internal" href="#errorsurface1"><span class="std std-numref">Fig. 3.6</span></a> shows the error surface of a model defined by two parameters <span class="math notranslate nohighlight">\(w_0\)</span> and <span class="math notranslate nohighlight">\(w_1\)</span>. The horizontal plane formed by axes <span class="math notranslate nohighlight">\(w_0\)</span> and <span class="math notranslate nohighlight">\(w_1\)</span> is known as the <strong>parameter space</strong>. A point of coordinates <span class="math notranslate nohighlight">\((a,b)\)</span> in the parameter space corresponds to one single model, whose parameters are <span class="math notranslate nohighlight">\(w_0=a\)</span> and <span class="math notranslate nohighlight">\(w_1=b\)</span>. The elevation above each point in the parameter space corresponds to its error. In this representation the error values are also colour coded.</p>
<div class="figure align-default" id="errorsurface1">
<img alt="_images/TrueError.svg" src="_images/TrueError.svg" /><p class="caption"><span class="caption-number">Fig. 3.6 </span><span class="caption-text">Error surface for a model defined by two parameters <span class="math notranslate nohighlight">\(w_0\)</span> and <span class="math notranslate nohighlight">\(w_1\)</span>.</span><a class="headerlink" href="#errorsurface1" title="Permalink to this image">#</a></p>
</div>
<p>An alternative representation of the error function consists of colour-coded or contour maps on the parameter space. <a class="reference internal" href="#errorsurfacemap"><span class="std std-numref">Fig. 3.7</span></a> and <a class="reference internal" href="#errorsurfacecontour"><span class="std std-numref">Fig. 3.8</span></a> represent the same error surface as <a class="reference internal" href="#errorsurface1"><span class="std std-numref">Fig. 3.6</span></a> as respectively a colour-coded map and a contour map.</p>
<div class="figure align-default" id="errorsurfacemap">
<img alt="_images/TrueErrorColourOpt.svg" src="_images/TrueErrorColourOpt.svg" /><p class="caption"><span class="caption-number">Fig. 3.7 </span><span class="caption-text">Error surface for a model defined by two parameters <span class="math notranslate nohighlight">\(w_0\)</span> and <span class="math notranslate nohighlight">\(w_1\)</span> represented as a colour-coded map. The optimal model, which has the lowest error, is identified by the symbol <span class="math notranslate nohighlight">\(\times\)</span>.</span><a class="headerlink" href="#errorsurfacemap" title="Permalink to this image">#</a></p>
</div>
<div class="figure align-default" id="errorsurfacecontour">
<img alt="_images/TrueErrorContourOpt.svg" src="_images/TrueErrorContourOpt.svg" /><p class="caption"><span class="caption-number">Fig. 3.8 </span><span class="caption-text">Error surface for a model defined by two parameters <span class="math notranslate nohighlight">\(w_0\)</span> and <span class="math notranslate nohighlight">\(w_1\)</span> represented as a contour map. The optimal model is identified by the symbol <span class="math notranslate nohighlight">\(\times\)</span>.</span><a class="headerlink" href="#errorsurfacecontour" title="Permalink to this image">#</a></p>
</div>
<p>Moving slowly across the parameter space can be interpreted as changing gradually the values of the parameters of a tunable model. In <a class="reference internal" href="#errorsurfacecontourwalk"><span class="std std-numref">Fig. 3.9</span></a> we visit a sequence of models by keeping <span class="math notranslate nohighlight">\(w_1\)</span> constant and changing <span class="math notranslate nohighlight">\(w_0\)</span> in small steps. As we step from one location in the parameter space to the next, we visit models that have a different error.</p>
<div class="figure align-default" id="errorsurfacecontourwalk">
<img alt="_images/TrueErrorWalk.svg" src="_images/TrueErrorWalk.svg" /><p class="caption"><span class="caption-number">Fig. 3.9 </span><span class="caption-text">By keeping <span class="math notranslate nohighlight">\(w_1\)</span> constant and increasing <span class="math notranslate nohighlight">\(w_0\)</span> in small steps, different locations in the parameter space are visited, which correspond to different models. The error of each model is represented as a vertical line.</span><a class="headerlink" href="#errorsurfacecontourwalk" title="Permalink to this image">#</a></p>
</div>
<p>Visualising the error surface allows us to easily identify the values of the parameters that define the optimal model. However, we must not be misled into thinking that this is how we identify the optimal model in optimisation. Given a model with parameters <span class="math notranslate nohighlight">\(\boldsymbol{w}\)</span>, we can use the error function <span class="math notranslate nohighlight">\(E\{\boldsymbol{w}\}\)</span> to obtain its associated error. However this does not mean that we have the error of every single model at hand, nor a visualisation of the error surface. In other words, in general optimisation assumes we do not <em>see</em> the error surface, let alone the optimal model. The question is then, how can we use what we know about <span class="math notranslate nohighlight">\(E\{\boldsymbol{w}\}\)</span> to find an optimal model?</p>
</div>
<div class="section" id="looking-for-the-optimal-model">
<span id="optimalgradient"></span><h3><span class="section-number">3.4.2. </span>Looking for the optimal model<a class="headerlink" href="#looking-for-the-optimal-model" title="Permalink to this heading">#</a></h3>
<p>As we move across the parameter space, we visit different models with parameters <span class="math notranslate nohighlight">\(\boldsymbol{w}\)</span> whose error is given by <span class="math notranslate nohighlight">\(E\{\boldsymbol{w}\}\)</span>. We can ask ourselves, how much will the error change if we change the values of <span class="math notranslate nohighlight">\(\boldsymbol{w}\)</span> slightly, or visually, if we take a small step in any direction in the parameter space? This is what we know as a <strong>directional derivative</strong> and can be computed using the error surface <span class="math notranslate nohighlight">\(E\{\boldsymbol{w}\}\)</span>. There is one direction along which the error increases the most, and this is known as the <strong>error gradient</strong>, denoted by <span class="math notranslate nohighlight">\(\nabla E\{\boldsymbol{w}\}\)</span>. The gradient of an error surface can also be obtained from the error function <span class="math notranslate nohighlight">\(E\{\boldsymbol{w}\}\)</span> but for now, do not worry too much about how to do this and let us simply assume that we can compute it for every set of parameters <span class="math notranslate nohighlight">\(\boldsymbol{w}\)</span>.</p>
<p>If we happen to be at the location of the <strong>optimal model</strong> in the parameter space, <span class="math notranslate nohighlight">\(\boldsymbol{w}_{opt}\)</span>, and move in any direction, the error will always increase as by definition, the optimal model is the one with the lowest error. As a consequence of this, the gradient at the optimal model is always zero</p>
<div class="math notranslate nohighlight" id="equation-gradzero">
<span class="eqno">(3.2)<a class="headerlink" href="#equation-gradzero" title="Permalink to this equation">#</a></span>\[
\nabla E\{\boldsymbol{w}_{opt}\}=0
\]</div>
<p>Therefore, if we can easily identify which models have zero gradient, one of them will be the optimal model. Unfortunately, in general we will not be able to find an exact solution for <a class="reference internal" href="#equation-gradzero">(3.2)</a> that will return the parameters of the optimal model <span class="math notranslate nohighlight">\(\boldsymbol{w}_{opt}\)</span>. The question is then, how can we find the parameters of a model whose gradient is zero? A popular method that takes advantage of the information provided by the error surface gradient is <strong>gradient descent</strong>. Gradient descent is a numerical optimisation method where we improve iteratively our model until we find a solution whose gradient is close to zero.</p>
<p>In gradient descent, we move across the parameter space following the direction along which the error decreases the most. This direction happens to be the opposite of the direction given by the gradient. Our hope is to eventually reach a point in the parameter space from which we cannot improve the error any further, in other words, to reach a point where the gradient is zero. In gradient descent we start from an initial model, which is usually chosen randomly. During each iteration, the parameters are updated using the following rule:</p>
<div class="math notranslate nohighlight" id="equation-eqgradientupdate">
<span class="eqno">(3.3)<a class="headerlink" href="#equation-eqgradientupdate" title="Permalink to this equation">#</a></span>\[
\boldsymbol{w}_{new} = \boldsymbol{w}_{old} - \epsilon \nabla E\{\boldsymbol{w}_{old}\}
\]</div>
<p>where <span class="math notranslate nohighlight">\(\boldsymbol{w}_{new}\)</span> are the updated parameters, <span class="math notranslate nohighlight">\(\boldsymbol{w}_{old}\)</span> are the previous parameters and <span class="math notranslate nohighlight">\(\epsilon\)</span> is the step size or learning rate. The step size <span class="math notranslate nohighlight">\(\epsilon\)</span> indicates how far from <span class="math notranslate nohighlight">\(\boldsymbol{w}_{old}\)</span> we move following the direction opposite to the gradient. The choice of <span class="math notranslate nohighlight">\(\epsilon\)</span> is important. Small values result in slow convergence to the optimal model, whereas large values risk overshooting the optimal model. An illustration of gradient descent is shown in <a class="reference internal" href="#errorsurfacecontourgradientdescent"><span class="std std-numref">Fig. 3.10</span></a>.</p>
<div class="figure align-default" id="errorsurfacecontourgradientdescent">
<img alt="_images/TrueErrorGradOpt.svg" src="_images/TrueErrorGradOpt.svg" /><p class="caption"><span class="caption-number">Fig. 3.10 </span><span class="caption-text">Gradient descent is an iterative process to find the optimal model defined by an error surface. Starting from a random location in the parameter space, the parameters are updated iteratively following the direction opposite to the error surface gradient.</span><a class="headerlink" href="#errorsurfacecontourgradientdescent" title="Permalink to this image">#</a></p>
</div>
<p>In general, gradient descent might never reach a model whose gradient is exactly zero, hence it is always necessary to include a stopping strategy. Common choices include setting a maximum number of iterations or longest processing time, reaching an acceptable error value and observing a small relative change in the error from iteration to iteration.</p>
<p>It is important to highlight that even though the error gradient is zero at the optimal model, a <strong>zero error gradient does not guarantee that the model is optimal</strong>. <a class="reference internal" href="#errorsurface2"><span class="std std-numref">Fig. 3.11</span></a> shows a complex error surface for which there exist three models whose gradient is zero. Only one of them has the lowest error. This model is known as the <strong>global optimum</strong> whereas the other two models are <strong>local optima</strong>.</p>
<div class="figure align-default" id="errorsurface2">
<img alt="_images/TrueSurfaceComplex.svg" src="_images/TrueSurfaceComplex.svg" /><p class="caption"><span class="caption-number">Fig. 3.11 </span><span class="caption-text">Complex error surface that has two local optima and one global optimum. Local and global optima all have zero gradient.</span><a class="headerlink" href="#errorsurface2" title="Permalink to this image">#</a></p>
</div>
<p>If we use gradient descent on a complex surface such as the one shown in <a class="reference internal" href="#errorsurface2"><span class="std std-numref">Fig. 3.11</span></a>, we will progressively get closer to an optimum model, but there is no guarantee that this will be the global one. Gradient descent can get stuck in a local optimum. Because of this it is common to run gradient descent multiple times from several randomly selected initial models, as shown in <a class="reference internal" href="#errorsurface2grad"><span class="std std-numref">Fig. 3.12</span></a>. After visiting multiple optima, we can select the best among them. This does not guarantee that we have reached the global optimum, but at least it reduces the risk of getting stuck in a bad local optimum.</p>
<div class="figure align-default" id="errorsurface2grad">
<img alt="_images/TrueErrorGradComplexOpt.svg" src="_images/TrueErrorGradComplexOpt.svg" /><p class="caption"><span class="caption-number">Fig. 3.12 </span><span class="caption-text">Running gradient descent multiple times on a complex surface increases the chances of finding a good local optimum.</span><a class="headerlink" href="#errorsurface2grad" title="Permalink to this image">#</a></p>
</div>
</div>
<div class="section" id="true-and-empirical-error-surfaces">
<h3><span class="section-number">3.4.3. </span>True and empirical error surfaces<a class="headerlink" href="#true-and-empirical-error-surfaces" title="Permalink to this heading">#</a></h3>
<p>We have previously distinguished between the quality of a model on the target population (the <em>true quality</em>) and the quality of the model on a dataset extracted from the population (the <em>empirical quality</em>). In the context of model training it is also useful to distinguish between the <strong>true error surface</strong> and the <strong>empirical error surface</strong>.</p>
<p>Given a family of models defined by a parameter vector <span class="math notranslate nohighlight">\(\boldsymbol{w}\)</span>, the <strong>true error surface</strong> corresponds to the error associated to each model when <em>deployed against the target population</em>. The optimal model defined on the true error surface is the model that we would like to find. In machine learning we do not have access to the true error surface, as this requires having a perfect description of our population. Using a dataset extracted from the population we can obtain the <strong>empirical error surface</strong>, which corresponds to the error of each model <em>on the dataset</em>. Using the empirical error surface, we can identify an optimal model, which is the model that performs the best on the dataset. The <strong>empirical error surface can be seen as an <em>estimation</em> of the true error surface</strong>. In general, we should expect them to be different, as illustrated in <a class="reference internal" href="#trueempiricalerrorsurface"><span class="std std-numref">Fig. 3.13</span></a> and <a class="reference internal" href="#trueempiricalerrorsurfacecontour"><span class="std std-numref">Fig. 3.14</span></a>.</p>
<div class="figure align-default" id="trueempiricalerrorsurface">
<img alt="_images/TrueAndEmpirical.svg" src="_images/TrueAndEmpirical.svg" /><p class="caption"><span class="caption-number">Fig. 3.13 </span><span class="caption-text">The true error surface (transparent) and the empirical error surface (opaque) are different. The more representative our training dataset is, the closer the two error surfaces are.</span><a class="headerlink" href="#trueempiricalerrorsurface" title="Permalink to this image">#</a></p>
</div>
<div class="figure align-default" id="trueempiricalerrorsurfacecontour">
<img alt="_images/TrueAndEmpiricalContourOpt.svg" src="_images/TrueAndEmpiricalContourOpt.svg" /><p class="caption"><span class="caption-number">Fig. 3.14 </span><span class="caption-text">The optimal model defined by the true error surface (<span class="math notranslate nohighlight">\(\times\)</span>) is different from the one defined by the empirical error surface (<span class="math notranslate nohighlight">\(\star\)</span>). We would like to find <span class="math notranslate nohighlight">\(\times\)</span>, but can only hope to find <span class="math notranslate nohighlight">\(\star\)</span> as we are using a dataset to train our models.</span><a class="headerlink" href="#trueempiricalerrorsurfacecontour" title="Permalink to this image">#</a></p>
</div>
<p>Since the true error surface and the empirical error surface are in general different, we should expect the optimal model according to the true error surface, which is the model that we <em>would like to</em> find, to be different from the optimal model according to the empirical error surface, which is the model that we <em>can</em> find. In general, the similarity between the empirical and the true error surfaces, and hence the closeness between their optimal solutions, depends on how representative our training dataset is.</p>
</div>
<div class="section" id="optimisation-on-the-empirical-error-surface">
<h3><span class="section-number">3.4.4. </span>Optimisation on the empirical error surface<a class="headerlink" href="#optimisation-on-the-empirical-error-surface" title="Permalink to this heading">#</a></h3>
<p>Given a training dataset and a family of models defined by a parameter vector <span class="math notranslate nohighlight">\(\boldsymbol{w}\)</span>, to identify the optimal model defined by the empirical error surface the simplest method that we could think of consists of directly evaluating as many models as possible and selecting the one with the lowest error. This method is called <strong>exhaustive</strong> or <strong>brute-force</strong> and is illustrated in <a class="reference internal" href="#errorbrute"><span class="std std-numref">Fig. 3.15</span></a>.</p>
<div class="figure align-default" id="errorbrute">
<img alt="_images/TrueErrorBrute.svg" src="_images/TrueErrorBrute.svg" /><p class="caption"><span class="caption-number">Fig. 3.15 </span><span class="caption-text">By evaluating the empirical error of many models, brute-force methods hope to find one that provides a good solution. Evaluated models are identified in the parameter space using the symbol <span class="math notranslate nohighlight">\(\times\)</span>.</span><a class="headerlink" href="#errorbrute" title="Permalink to this image">#</a></p>
</div>
<p>Brute-force methods are simple, but in most cases impractical, especially when the error surface is very complex and the number of parameters is large.</p>
<p>A second avenue to identify the optimal model on the empirical error surface is to directly solve the zero-gradient equation <a class="reference internal" href="#equation-gradzero">(3.2)</a>. This approach can be successfully applied in some cases, for instance to find the optimal model according to the MSE empirical error function of a family of linear models with parameters <span class="math notranslate nohighlight">\(\boldsymbol{w}\)</span>. Let <span class="math notranslate nohighlight">\(\boldsymbol{X}\)</span> and <span class="math notranslate nohighlight">\(\boldsymbol{y}\)</span> be the design matrix and label vector of the training dataset. As we know, the predicted label vector can be calculated from <span class="math notranslate nohighlight">\(\boldsymbol{X}\)</span> and <span class="math notranslate nohighlight">\(\boldsymbol{w}\)</span> as</p>
<div class="math notranslate nohighlight" id="equation-eqpredlabels">
<span class="eqno">(3.4)<a class="headerlink" href="#equation-eqpredlabels" title="Permalink to this equation">#</a></span>\[
\hat{\boldsymbol{y}} =
\boldsymbol{X}\boldsymbol{w}
\]</div>
<p>Using <a class="reference internal" href="#equation-eqpredlabels">(3.4)</a>, the prediction error vector can be obtained as</p>
<div class="math notranslate nohighlight" id="equation-eqprederr">
<span class="eqno">(3.5)<a class="headerlink" href="#equation-eqprederr" title="Permalink to this equation">#</a></span>\[
\boldsymbol{e} =
\boldsymbol{y}-\hat{\boldsymbol{y}} = \boldsymbol{y}-\boldsymbol{X}\boldsymbol{w}
\]</div>
<p>Let us denote the MSE empirical error function by <span class="math notranslate nohighlight">\(E_{\widehat{MSE}}\{\boldsymbol{w}\}\)</span>, where we are using the <em>hat</em> notation <span class="math notranslate nohighlight">\(\widehat{MSE}\)</span> to indicate that as an empirical quantity, it is obtained from a dataset. We can express <span class="math notranslate nohighlight">\(E_{\widehat{MSE}}\)</span> in terms of the error vector <span class="math notranslate nohighlight">\(\boldsymbol{e}\)</span> as follows:</p>
<div class="math notranslate nohighlight" id="equation-eqmseemp">
<span class="eqno">(3.6)<a class="headerlink" href="#equation-eqmseemp" title="Permalink to this equation">#</a></span>\[\begin{split}
E_{\widehat{MSE}}\{\boldsymbol{w}\} &amp;= \frac{1}{N}\boldsymbol{e}^T\boldsymbol{e}\\
&amp;= \frac{1}{N}(\boldsymbol{y}-\hat{\boldsymbol{y}})^T(\boldsymbol{y}-\hat{\boldsymbol{y}}) \\
&amp;= \frac{1}{N}(\boldsymbol{y}-\boldsymbol{X}\boldsymbol{w})^T(\boldsymbol{y}-\boldsymbol{X}\boldsymbol{w})  
\end{split}\]</div>
<p>Note that <a class="reference internal" href="#equation-eqmseemp">(3.6)</a> defines a computation such that, given a parameter vector <span class="math notranslate nohighlight">\(\boldsymbol{w}\)</span> defining a model and a dataset defined by a design matrix <span class="math notranslate nohighlight">\(\boldsymbol{X}\)</span> and a label vector <span class="math notranslate nohighlight">\(\boldsymbol{y}\)</span>, one MSE value is returned. Using <a class="reference internal" href="#equation-eqmseemp">(3.6)</a> we can derive the following mathematical expression for the gradient of the empirical error surface:</p>
<div class="math notranslate nohighlight" id="equation-eqmseempgradient">
<span class="eqno">(3.7)<a class="headerlink" href="#equation-eqmseempgradient" title="Permalink to this equation">#</a></span>\[
\nabla E_{\widehat{MSE}}\{\boldsymbol{w}\} = \frac{-2}{N}\boldsymbol{X}^T(\boldsymbol{y}-\boldsymbol{X}\boldsymbol{w})
\]</div>
<p>As we know, the gradient at the optimal model is zero. Fortunately, we can solve the equation <span class="math notranslate nohighlight">\(\nabla E_{\widehat{MSE}}\{\boldsymbol{w}_{opt}\}=0\)</span>, and obtain the parameters of the optimal model defined by the empirical error surface:</p>
<div class="math notranslate nohighlight" id="equation-eqleastsqder">
<span class="eqno">(3.8)<a class="headerlink" href="#equation-eqleastsqder" title="Permalink to this equation">#</a></span>\[\begin{split}
\nabla E_{\widehat{MSE}}\{\boldsymbol{\boldsymbol{w}_{opt}}\} &amp;= 0\\
\boldsymbol{X}^T(\boldsymbol{y}-\boldsymbol{X}\boldsymbol{\boldsymbol{w}_{opt}}) &amp;= 0 \\
\boldsymbol{X}^T\boldsymbol{y}-\boldsymbol{X}^T\boldsymbol{X}\boldsymbol{\boldsymbol{w}_{opt}} &amp;= 0\\
\boldsymbol{X}^T\boldsymbol{X}\boldsymbol{\boldsymbol{w}_{opt}} &amp;= \boldsymbol{X}^T\boldsymbol{y} \\
(\boldsymbol{X}^T\boldsymbol{X})^{-1}\boldsymbol{X}^T\boldsymbol{X}\boldsymbol{\boldsymbol{w}_{opt}} &amp;= (\boldsymbol{X}^T\boldsymbol{X})^{-1}\boldsymbol{X}^T\boldsymbol{y} \\
\boldsymbol{\boldsymbol{w}_{opt}} &amp;= (\boldsymbol{X}^T\boldsymbol{X})^{-1}\boldsymbol{X}^T\boldsymbol{y}
\end{split}\]</div>
<p>The solution <a class="reference internal" href="#equation-eqleastsqder">(3.8)</a> is precisely the <strong>least squares</strong> solution that was presented in the <a class="reference internal" href="Ch_regression.html#reg"><span class="std std-ref">Regression</span></a> chapter. We have obtained the least squares solution by solving the equation <span class="math notranslate nohighlight">\(\nabla E_{\widehat{MSE}}\{\boldsymbol{w}_{opt}\}=0\)</span> exactly. Unfortunately, solving the zero-gradient equation <a class="reference internal" href="#equation-gradzero">(3.2)</a> exactly is only possible in a limited number of cases. In the majority of the scenarios we will need to use other optimisation methods, such as gradient descent.</p>
<p>To run gradient descent to train a model, it is necessary to calculate the gradient of the empirical error surface in every iteration. Unfortunately, calculating the gradient of the empirical error surface can be very costly when the training dataset is very large. One simple approach to reduce the computational time is to calculate the gradient using a random subset of the training dataset, known as a <strong>batch</strong>. The resulting gradient is in fact an <em>estimation</em> of the actual gradient of the empirical error surface. If the batch that we use is close to the entire training dataset, the estimated gradient will be close to the actual gradient; if it is very small, it will deviate from it. In other words, using batches to estimate the gradient produces <strong>noisy gradients</strong>. For this reason, this is known as <strong>stochastic gradient descent</strong>.</p>
</div>
<div class="section" id="overfitting-and-regularisation">
<h3><span class="section-number">3.4.5. </span>Overfitting and regularisation<a class="headerlink" href="#overfitting-and-regularisation" title="Permalink to this heading">#</a></h3>
<p>We can design approaches that reduce the risk of overfitting by exploiting our understanding of the error surface. <strong>Regularisation</strong> is one such approach and consists of modifying the error surface by adding a new term that effectively restricts the flexibility of our model. For instance the MSE empirical error surface can be modified as follows</p>
<div class="math notranslate nohighlight" id="equation-eqmsereg">
<span class="eqno">(3.9)<a class="headerlink" href="#equation-eqmsereg" title="Permalink to this equation">#</a></span>\[
E_{\widehat{MSE}-R}\{\boldsymbol{w}\} = \frac{1}{N}\boldsymbol{e}^T\boldsymbol{e} + \lambda \boldsymbol{w}^T\boldsymbol{w}
\]</div>
<p>where <span class="math notranslate nohighlight">\(\boldsymbol{w}^T\boldsymbol{w}\)</span> is the sum of the squares of the model’s parameters and the value of <span class="math notranslate nohighlight">\(\lambda\)</span> controls the relative importance of the error term compared to the parameters term. Overfitting models tend to have large parameters <span class="math notranslate nohighlight">\(\boldsymbol{w}\)</span>. Since the term <span class="math notranslate nohighlight">\(\lambda \boldsymbol{w}^T\boldsymbol{w}\)</span> in <a class="reference internal" href="#equation-eqmsereg">(3.9)</a> effectively penalises models that have large parameters, by regularising the MSE empirical error surface we penalise models that overfit.</p>
<p>The optimal solution for the regularised error surface <span class="math notranslate nohighlight">\(E_{\widehat{MSE}-R}\)</span> will strike a balance between reducing the prediction error on the training dataset and preventing the coefficients from taking on large values, which adds rigidity to our solutions. We can show that <a class="reference internal" href="#equation-gradzero">(3.2)</a> can be solved for the regularised MSE empirical error surface, obtaining</p>
<div class="math notranslate nohighlight">
\[
\boldsymbol{w}_{opt} = (\boldsymbol{X}^T\boldsymbol{X}+N\lambda\boldsymbol{I})^{-1}\boldsymbol{X}^T\boldsymbol{y}
\]</div>
<p>where <span class="math notranslate nohighlight">\(N\)</span> is the number of samples in the dataset and <span class="math notranslate nohighlight">\(\boldsymbol{I}\)</span> is the identity matrix. Note that when <span class="math notranslate nohighlight">\(\lambda=0\)</span> we obtain the least squares solution without regularisation.</p>
</div>
<div class="section" id="optimisation-quality-and-target-quality">
<h3><span class="section-number">3.4.6. </span>Optimisation quality and target quality<a class="headerlink" href="#optimisation-quality-and-target-quality" title="Permalink to this heading">#</a></h3>
<p>Regularised error surfaces are used for training purposes only, i.e. to find the values of the parameters of a model. The regularised error does not represent, however, our notion of quality, as it includes a second term whose purpose is to control the complexity of the final solution. When we use the regularised MSE empirical error surface <a class="reference internal" href="#equation-eqmsereg">(3.9)</a> to train a model, we are asking ourselves, <em>find the model that has the lowest MSE on this dataset, but make sure its coefficients are not too large</em>. In other words, we are adding constraints to our problem. After <strong>training a regression model using a regularised MSE</strong>, if we want to assess its future deployment performance we need to <strong>test it using the conventional definition of MSE</strong>. This should sound strange, as we are using a notion of quality during training that is different from our notion of deployment quality.</p>
<p>In future chapters we will come across other examples where the quantity that we are optimising does not directly correspond to our notion of quality during deployment. This could be due to including other constraints in our problem formulation, as in regularisation, or because it might be difficult to formulate mathematically an optimisation problem using the notion of quality during deployment. To illustrate the latter scenario, imagine a company that plans to increase their sales volume using a machine learning model that segments their customers into different groups. Formulating a machine learning problem using the sales volume as the quality metric that needs to be optimised would be very difficult, if not impossible. Instead, they would formulate a different problem using a quality metric that they would hope is related to the sales volume.</p>
<p>To distinguish between the notions of quality during training and deployment, we will reserve the term <strong>quality to describe our target quality during deployment</strong>, and we will use the terms <strong>cost, loss or error to refer to the quantity that we want to optimise during training</strong>. Sometimes both quantities will be the same, but in general, this will not be the case. Our hope will be that the solution obtained during optimisation will also be the one that has the highest quality during deployment.</p>
</div>
</div>
<div class="section" id="the-validation-task">
<h2><span class="section-number">3.5. </span>The validation task<a class="headerlink" href="#the-validation-task" title="Permalink to this heading">#</a></h2>
<p>In machine learning there are many families of models available to us to solve any given problem. Each family of models can produce solutions of different shapes and degrees of complexity and ideally, we should be selecting the right one for each problem at hand. Unfortunately, in machine learning we usually lack any previous insight that would guide us in choosing the right family of models. For instance, consider a simple polynomial regression problem. Changing the value of the degree of the polynomial <span class="math notranslate nohighlight">\(D\)</span> leads to subfamilies of models of different complexity. A linear model is a polynomial model where <span class="math notranslate nohighlight">\(D=1\)</span> and is very rigid, quadratic models are polynomial models where <span class="math notranslate nohighlight">\(D=2\)</span> and are more flexible, and so on. The question would be, how do we identify the right value of <span class="math notranslate nohighlight">\(D\)</span>, in other words, what is the complexity of our problem? Validation tasks allow us to precisely do this by producing an estimation of the deployment quality of several families of models, so that we can choose to train, test and deploy the right one. As usual, to conduct validation we need a dataset extracted from our population (see <a class="reference internal" href="#populationsamplingvalidation"><span class="std std-numref">Fig. 3.16</span></a>).</p>
<div class="figure align-default" id="populationsamplingvalidation">
<img alt="_images/PopulationSamplingValidation.svg" src="_images/PopulationSamplingValidation.svg" /><p class="caption"><span class="caption-number">Fig. 3.16 </span><span class="caption-text">Validation tasks use datasets extracted from the population to assess the potential of a family of models to solve a problem.</span><a class="headerlink" href="#populationsamplingvalidation" title="Permalink to this image">#</a></p>
</div>
<p>Validation involves training each of the families of models that we are considering. It is important to emphasise that models that are trained during a validation task are not meant to be deployed. Training a family of models is a prerequisite for us to be able to estimate their deployment quality. Once we have estimated the deployment quality of each family of models, we can select the best one and train it ready for test and deployment. There are three main validation methods: validation set, leave-one-out cross-validation (LOOCV), and <span class="math notranslate nohighlight">\(k\)</span>-fold cross-validation.</p>
<p>The <strong>validation set</strong> approach is the simplest one and consists of one single round of training followed by deployment quality estimation. Before validation, the available dataset is split into a training dataset and a validation (or hold-out) dataset (<a class="reference internal" href="#validationset"><span class="std std-numref">Fig. 3.17</span></a>).</p>
<div class="figure align-default" id="validationset">
<img alt="_images/ValidationSet_1.svg" src="_images/ValidationSet_1.svg" /><p class="caption"><span class="caption-number">Fig. 3.17 </span><span class="caption-text">In the validation set approach, the available dataset is split into two sets, one of them is used for training each family of models, the other to estimate their deployment quality.</span><a class="headerlink" href="#validationset" title="Permalink to this image">#</a></p>
</div>
<p>To split the available dataset, we need to specify what fraction of samples will be assigned to each split and the assignment should be done randomly. Unfortunately, there is no general rule to decide which fraction of samples should be assigned to each split. Hence, we might ask ourselves, how good is our estimation of the future deployment quality of each family of models? If the validation set is small, the estimation itself will be poor. If it is large, the estimation will be better, however the number of samples used for training will be low and therefore, we will have a poorly trained model. Consequently, we will have a good estimation of the deployment quality of a model that has been poorly trained. In both extreme cases, the estimation of the deployment quality of each family of models will not be very reliable. In other words, it might not reflect the potential of each family of models to solve the machine learning problem.</p>
<p>In <strong>LOOCV</strong> we conduct multiple rounds of training and deployment quality estimation. During each round, we use one sample for performance estimation and the remaining samples for training - hence the name <em>leave-one-out</em>. During the first round we leave the first sample out, during the second round the second sample and so on until the last round, where we leave the last sample out (<a class="reference internal" href="#loocv"><span class="std std-numref">Fig. 3.18</span></a>). Therefore, there are as many training and deployment quality estimation rounds as there are samples.</p>
<div class="figure align-default" id="loocv">
<img alt="_images/LOOCV.svg" src="_images/LOOCV.svg" /><p class="caption"><span class="caption-number">Fig. 3.18 </span><span class="caption-text">In leave-one-out cross-validation <span class="math notranslate nohighlight">\(N\)</span> rounds of training and deployment quality estimation are conducted, where <span class="math notranslate nohighlight">\(N\)</span> is the size of the available dataset. In each round, only one sample is used to assess the deployment quality and at the end, an average is computed.</span><a class="headerlink" href="#loocv" title="Permalink to this image">#</a></p>
</div>
<p>After completing all the rounds, we collect the individual estimations of the deployment quality produced by each round and compute an average. The resulting figure is the final estimation of the performance of the family of models. Compared to the validation set approach, LOOCV is computationally intensive, as it requires one round of training and performance estimation per sample in the dataset, whereas the validation set approach consisted of one single round. On the other hand, models are trained using most of the available samples and hence, are better trained.</p>
<p>The third validation approach is <strong><span class="math notranslate nohighlight">\(k\)</span>-fold</strong> cross validation. This validation approach randomly splits the available dataset into <span class="math notranslate nohighlight">\(k\)</span> subsets, also known as <em>folds</em>. Then it runs <span class="math notranslate nohighlight">\(k\)</span> training and deployment quality estimation rounds. During each round, one of the folds is used for deployment quality estimation and the remaining folds for training (<a class="reference internal" href="#kfoldcv"><span class="std std-numref">Fig. 3.19</span></a>).</p>
<div class="figure align-default" id="kfoldcv">
<img alt="_images/kfoldCV.svg" src="_images/kfoldCV.svg" /><p class="caption"><span class="caption-number">Fig. 3.19 </span><span class="caption-text"><span class="math notranslate nohighlight">\(k\)</span>-fold cross validation conducts <span class="math notranslate nohighlight">\(k\)</span> rounds of training and deployment quality estimation, by splitting the dataset into <span class="math notranslate nohighlight">\(k\)</span> subsets or folds.</span><a class="headerlink" href="#kfoldcv" title="Permalink to this image">#</a></p>
</div>
<p>We have already seen that although simple and computationally inexpensive, the validation set approach produces deployment quality estimations whose accuracy is unknown. Furthermore, since the validation set approach uses a fraction of the available samples, the trained models will be worse than models trained using all the available samples. Therefore, the deployment quality estimation tends to be too pessimistic. LOOCV uses most of the samples for training, and therefore suffers less from overpessimistic estimations, at the expense of increased computational cost. <span class="math notranslate nohighlight">\(k\)</span>-fold cross validation offers a trade-off that reduces the computational cost by reducing the number of rounds.</p>
</div>
<div class="section" id="summary-and-discussion">
<h2><span class="section-number">3.6. </span>Summary and discussion<a class="headerlink" href="#summary-and-discussion" title="Permalink to this heading">#</a></h2>
<p>In machine learning we seek to build solutions to problems that involve a target population, and our main challenge is to build such solutions without having access to a perfect description of the population. Instead of a perfect description of the population, we assume that we have access to datasets extracted from it. Understanding the relationship between populations and datasets is fundamental in machine learning. Since we use datasets as surrogates of our target populations, our datasets have to be <strong>representative</strong>. To achieve this, our datasets need to be <strong>IID</strong> and have a <strong>sufficiently large</strong> number of samples.</p>
<p>To create machine learning models and assess how well they work, we have a methodology which we need to follow rigorously. Perhaps the most important task in this methodology is the <strong>test task</strong>. A test task allows us to evaluate the future deployment quality of an already built model, using a test dataset and a notion of quality. <strong>Test datasets need to be independent from training datasets</strong>, otherwise we risk falling into a data trap and fooling ourselves. In addition, to correctly interpret a test quality, it is important to remember that the test quality is random, due to the random nature of the test dataset. <strong>Training tasks</strong> use optimisation approaches to identify the best model according to a notion of quality on a training dataset, which we call empirical <strong><em>cost</em>, <em>loss</em> or <em>error</em></strong>. Gradient descent is an example of an optimisation method, which navigates the empirical error surface in search of the optimal model. We need to be mindful, however, that the optimal model defined by the empirical error surface might be different from the one defined by the true error surface, which is the one that we would like to find. Finally, <strong>validation tasks</strong> can be used to compare different families of models and assess which one might be more suitable to be trained to solve a particular problem.</p>
<p>Understanding the difference between <strong>true and empirical quantities</strong> is essential in machine learning, as we only have direct access to the latter, but would like to know the former. A test quality and a training error are both empirical quantities defined on datasets, and can be seen as estimations of the true ones defined on the population. Furthermore, the notions of quality during training and deployment are not always the same. This is why we sometimes distinguish between deployment <strong>quality</strong> and training <strong>cost</strong>. Not everyone makes this distinction explicit, hence when reading reports on machine learning projects, we might need to find it out by ourselves.</p>
<p>We started this chapter remembering Ventris’ decisive check and its role in the decipherment of the Linear B script. Indeed, Ventris’ decisive check illustrates one of the most important red lines in the machine learning methodology: never test a machine learning model using samples that you have used for training. There are many other details of the story of the decipherment of Linear B that resonate with machine learning. We suggested that deciphering Linear B should be an impossible proposition, as we are looking at fragments of text of unknown contents, written in an unknown script, encoding an unknown language. Other ancient scripts were only deciphered thanks to the discovery of multilingual documents, which allowed researchers to access the contents of fragments of text and speculate about the sounds represented by its symbols. The Rosetta stone’s role in deciphering Egyptian hieroglyphs is without doubt the best example of this. Ventris came up with his solution after <strong>hypothesising</strong> that the language encoded in Linear B tablets was an archaic variant of classical Greek. This hypothesis went against the general consensus at the time and constituted a leap in the dark, but turned out to be correct. The important point here is that, as a hypothesis, the Greek nature of the language encoded in Linear B was not a conclusion from analysing his data, it was Ventris’ starting point. In machine learning we sometimes make similar choices and adopt angles that are not suggested by our data. These choices are known as hypotheses.</p>
<p>An excellent understanding of our problems and the domain they belong to is fundamental to come up with sensible hypotheses. Remember our first top tip: <strong>know your domain!</strong> Ventris’ success can be ascribed to the amount of effort he put into deciphering Linear B, but also to his excellent knowledge of the ancient world, ancient languages and linguistics. Without this background, he would not have been able to decipher Linear B. Interestingly, Ventris also acknowledges that to decipher an unknown script, it is essential to have sufficient material. This also resonates with our discussion about datasets: we need a sufficiently rich collection of samples, in order for our datasets to be representative.</p>
<p>If you are disappointed that you have not had the opportunity to decipher the Egyptian hieroglyphs or the Linear B script, do not worry, there are still a few writing systems that await decipherment. Examples include, for instance, the Proto-Elamite, the Rongorongo and the Voynichese scripts. Why not give them a try?</p>
</div>
</div>

    <script type="text/x-thebe-config">
    {
        requestKernel: true,
        binderOptions: {
            repo: "binder-examples/jupyter-stacks-datascience",
            ref: "master",
        },
        codeMirrorConfig: {
            theme: "abcdef",
            mode: "python"
        },
        kernelOptions: {
            name: "python3",
            path: "./."
        },
        predefinedOutput: true
    }
    </script>
    <script>kernelName = 'python3'</script>

                </article>
              

                <footer class="bd-footer-article">
                  
<div class="footer-article-items footer-article__inner">
  
    <div class="footer-article-item"><!-- Previous / next buttons -->
<div class="prev-next-area">
    <a class="left-prev"
       href="Ch_regression.html"
       title="previous page">
      <i class="fa-solid fa-angle-left"></i>
      <div class="prev-next-info">
        <p class="prev-next-subtitle">previous</p>
        <p class="prev-next-title"><span class="section-number">2. </span>Regression</p>
      </div>
    </a>
    <a class="right-next"
       href="Ch_classification1.html"
       title="next page">
      <div class="prev-next-info">
        <p class="prev-next-subtitle">next</p>
        <p class="prev-next-title"><span class="section-number">4. </span>Classification I: The geometric view</p>
      </div>
      <i class="fa-solid fa-angle-right"></i>
    </a>
</div></div>
  
</div>

                </footer>
              
            </div>
            
            
                <div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">

  <div class="sidebar-secondary-item">
  <div class="page-toc tocsection onthispage">
    <i class="fa-solid fa-list"></i> Contents
  </div>
  <nav class="bd-toc-nav page-toc">
    <ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#ventris-decisive-check">3.1. Ventris’ decisive check</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#populations-and-datasets">3.2. Populations and datasets</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#the-notion-of-population">3.2.1. The notion of population</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#from-populations-to-datasets">3.2.2. From populations to datasets</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#the-test-task">3.3. The test task</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#true-and-empirical-qualities">3.3.1. True and empirical qualities</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#testing-as-quality-estimation">3.3.2. Testing as quality estimation</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#the-training-task">3.4. The training task</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#the-error-surface">3.4.1. The error surface</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#looking-for-the-optimal-model">3.4.2. Looking for the optimal model</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#true-and-empirical-error-surfaces">3.4.3. True and empirical error surfaces</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#optimisation-on-the-empirical-error-surface">3.4.4. Optimisation on the empirical error surface</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#overfitting-and-regularisation">3.4.5. Overfitting and regularisation</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#optimisation-quality-and-target-quality">3.4.6. Optimisation quality and target quality</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#the-validation-task">3.5. The validation task</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#summary-and-discussion">3.6. Summary and discussion</a></li>
</ul>
  </nav></div>

</div></div>
              
            
          </div>
          <footer class="bd-footer-content">
            
<div class="bd-footer-content__inner container">
  
  <div class="footer-item">
    
<p class="component-author">
By <a href="https://www.linkedin.com/in/jesus-requena-carrion/" target="_blank">Jesús Requena Carrión</a> and <a href="http://nikeshbajaj.in" target="_blank">Nikesh Bajaj</a>

</p>

  </div>
  
  <div class="footer-item">
    
  <p class="copyright">
    
      © Copyright 2023.
      <br/>
    
  </p>

  </div>
  
  <div class="footer-item">
    
  </div>
  
  <div class="footer-item">
    
  </div>
  
</div>
          </footer>
        

      </main>
    </div>
  </div>
  
  <!-- Scripts loaded after <body> so the DOM is not blocked -->
  <script src="_static/scripts/bootstrap.js?digest=e353d410970836974a52"></script>
<script src="_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52"></script>

  <footer class="bd-footer">
  </footer>
  </body>
</html>