<!DOCTYPE html>
<!-- START: inst/pkgdown/templates/layout.html --><!-- Generated by pkgdown: do not edit by hand --><html lang="en" data-bs-theme="auto"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><meta charset="utf-8"><title>Introduction to deep learning: Monitor the training process</title><meta name="viewport" content="width=device-width, initial-scale=1"><script src="assets/themetoggle.js"></script><link rel="stylesheet" type="text/css" href="assets/styles.css"><script src="assets/scripts.js" type="text/javascript"></script><!-- mathjax --><script type="text/x-mathjax-config">
MathJax.Hub.Config({
config: ["MMLorHTML.js"],
jax: ["input/TeX","input/MathML","output/HTML-CSS","output/NativeMML", "output/PreviewHTML"],
extensions: ["tex2jax.js","mml2jax.js","MathMenu.js","MathZoom.js", "fast-preview.js", "AssistiveMML.js", "a11y/accessibility-menu.js"],
TeX: {
extensions: ["AMSmath.js","AMSsymbols.js","noErrors.js","noUndefined.js"]
},
tex2jax: {
inlineMath: [['\\(', '\\)']],
displayMath: [ ['$$','$$'], ['\\[', '\\]'] ],
processEscapes: true
}
});
</script><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js" integrity="sha256-nvJJv9wWKEm88qvoQl9ekL2J+k/RWIsaSScxxlsrv8k=" crossorigin="anonymous"></script><!-- Responsive Favicon for The Carpentries --><link rel="apple-touch-icon" sizes="180x180" href="favicons/incubator/apple-touch-icon.png"><link rel="icon" type="image/png" sizes="32x32" href="favicons/incubator/favicon-32x32.png"><link rel="icon" type="image/png" sizes="16x16" href="favicons/incubator/favicon-16x16.png"><link rel="manifest" href="favicons/incubator/site.webmanifest"><link rel="mask-icon" href="favicons/incubator/safari-pinned-tab.svg" color="#5bbad5"><meta name="msapplication-TileColor" content="#da532c"><meta name="theme-color" media="(prefers-color-scheme: light)" content="white"><meta name="theme-color" media="(prefers-color-scheme: dark)" content="black"></head><body>
<header id="top" class="navbar navbar-expand-md top-nav incubator"><svg xmlns="http://www.w3.org/2000/svg" class="d-none"><symbol id="check2" viewbox="0 0 16 16"><path d="M13.854 3.646a.5.5 0 0 1 0 .708l-7 7a.5.5 0 0 1-.708 0l-3.5-3.5a.5.5 0 1 1 .708-.708L6.5 10.293l6.646-6.647a.5.5 0 0 1 .708 0z"></path></symbol><symbol id="circle-half" viewbox="0 0 16 16"><path d="M8 15A7 7 0 1 0 8 1v14zm0 1A8 8 0 1 1 8 0a8 8 0 0 1 0 16z"></path></symbol><symbol id="moon-stars-fill" viewbox="0 0 16 16"><path d="M6 .278a.768.768 0 0 1 .08.858 7.208 7.208 0 0 0-.878 3.46c0 4.021 3.278 7.277 7.318 7.277.527 0 1.04-.055 1.533-.16a.787.787 0 0 1 .81.316.733.733 0 0 1-.031.893A8.349 8.349 0 0 1 8.344 16C3.734 16 0 12.286 0 7.71 0 4.266 2.114 1.312 5.124.06A.752.752 0 0 1 6 .278z"></path><path d="M10.794 3.148a.217.217 0 0 1 .412 0l.387 1.162c.173.518.579.924 1.097 1.097l1.162.387a.217.217 0 0 1 0 .412l-1.162.387a1.734 1.734 0 0 0-1.097 1.097l-.387 1.162a.217.217 0 0 1-.412 0l-.387-1.162A1.734 1.734 0 0 0 9.31 6.593l-1.162-.387a.217.217 0 0 1 0-.412l1.162-.387a1.734 1.734 0 0 0 1.097-1.097l.387-1.162zM13.863.099a.145.145 0 0 1 .274 0l.258.774c.115.346.386.617.732.732l.774.258a.145.145 0 0 1 0 .274l-.774.258a1.156 1.156 0 0 0-.732.732l-.258.774a.145.145 0 0 1-.274 0l-.258-.774a1.156 1.156 0 0 0-.732-.732l-.774-.258a.145.145 0 0 1 0-.274l.774-.258c.346-.115.617-.386.732-.732L13.863.1z"></path></symbol><symbol id="sun-fill" viewbox="0 0 16 16"><path d="M8 12a4 4 0 1 0 0-8 4 4 0 0 0 0 8zM8 0a.5.5 0 0 1 .5.5v2a.5.5 0 0 1-1 0v-2A.5.5 0 0 1 8 0zm0 13a.5.5 0 0 1 .5.5v2a.5.5 0 0 1-1 0v-2A.5.5 0 0 1 8 13zm8-5a.5.5 0 0 1-.5.5h-2a.5.5 0 0 1 0-1h2a.5.5 0 0 1 .5.5zM3 8a.5.5 0 0 1-.5.5h-2a.5.5 0 0 1 0-1h2A.5.5 0 0 1 3 8zm10.657-5.657a.5.5 0 0 1 0 .707l-1.414 1.415a.5.5 0 1 1-.707-.708l1.414-1.414a.5.5 0 0 1 .707 0zm-9.193 9.193a.5.5 0 0 1 0 .707L3.05 13.657a.5.5 0 0 1-.707-.707l1.414-1.414a.5.5 0 0 1 .707 0zm9.193 2.121a.5.5 0 0 1-.707 0l-1.414-1.414a.5.5 0 0 1 .707-.707l1.414 1.414a.5.5 0 0 1 0 .707zM4.464 4.465a.5.5 0 0 1-.707 0L2.343 3.05a.5.5 0 1 1 .707-.707l1.414 1.414a.5.5 0 0 1 0 .708z"></path></symbol></svg><a class="visually-hidden-focusable skip-link" href="#main-content">Skip to main content</a>
<div class="container-fluid top-nav-container">
<div class="col-md-8">
<div class="large-logo">
<img id="incubator-logo" alt="Carpentries Incubator" src="assets/images/incubator-logo.svg"></div>
</div>
<div class="selector-container">
<div id="theme-selector">
<li class="nav-item dropdown" id="theme-button-list">
<button class="btn btn-link nav-link px-0 px-lg-2 dropdown-toggle d-flex align-items-center" id="bd-theme" type="button" aria-expanded="false" data-bs-toggle="dropdown" data-bs-display="static" aria-label="Toggle theme (auto)">
<svg class="bi my-1 theme-icon-active"><use href="#circle-half"></use></svg><i data-feather="chevron-down"></i>
</button>
<ul class="dropdown-menu dropdown-menu-end" aria-labelledby="bd-theme-text"><li>
<button type="button" class="btn dropdown-item d-flex align-items-center" data-bs-theme-value="light" aria-pressed="false">
<svg class="bi me-2 theme-icon"><use href="#sun-fill"></use></svg>
Light
<svg class="bi ms-auto d-none"><use href="#check2"></use></svg></button>
</li>
<li>
<button type="button" class="btn dropdown-item d-flex align-items-center" data-bs-theme-value="dark" aria-pressed="false">
<svg class="bi me-2 theme-icon"><use href="#moon-stars-fill"></use></svg>
Dark
<svg class="bi ms-auto d-none"><use href="#check2"></use></svg></button>
</li>
<li>
<button type="button" class="btn dropdown-item d-flex align-items-center active" data-bs-theme-value="auto" aria-pressed="true">
<svg class="bi me-2 theme-icon"><use href="#circle-half"></use></svg>
Auto
<svg class="bi ms-auto d-none"><use href="#check2"></use></svg></button>
</li>
</ul></li>
</div>
<div class="dropdown" id="instructor-dropdown">
<button class="btn btn-secondary dropdown-toggle bordered-button" type="button" id="dropdownMenu1" data-bs-toggle="dropdown" aria-expanded="false">
<i aria-hidden="true" class="icon" data-feather="eye"></i> Learner View <i data-feather="chevron-down"></i>
</button>
<ul class="dropdown-menu" aria-labelledby="dropdownMenu1"><li><button class="dropdown-item" type="button" onclick="window.location.href='instructor/3-monitor-the-model.html';">Instructor View</button></li>
</ul></div>
</div>
</div>
<hr></header><nav class="navbar navbar-expand-xl bottom-nav incubator" aria-label="Main Navigation"><div class="container-fluid nav-container">
<button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbarSupportedContent" aria-controls="navbarSupportedContent" aria-expanded="false" aria-label="Toggle Navigation">
<span class="navbar-toggler-icon"></span>
<span class="menu-title">Menu</span>
</button>
<div class="nav-logo">
<img class="small-logo" alt="Carpentries Incubator" src="assets/images/incubator-logo-sm.svg"></div>
<div class="lesson-title-md">
Introduction to deep learning
</div>
<div class="search-icon-sm">
<!-- TODO: do not show until we have search
<i role="img" aria-label="Search the All In One page" data-feather="search"></i>
-->
</div>
<div class="desktop-nav">
<ul class="navbar-nav me-auto mb-2 mb-lg-0"><li class="nav-item">
<span class="lesson-title">
Introduction to deep learning
</span>
</li>
<li class="nav-item">
<a class="nav-link" href="key-points.html">Key Points</a>
</li>
<li class="nav-item">
<a class="nav-link" href="reference.html#glossary">Glossary</a>
</li>
<li class="nav-item">
<a class="nav-link" href="profiles.html">Learner Profiles</a>
</li>
<li class="nav-item dropdown">
<button class="nav-link dropdown-toggle" id="navbarDropdown" data-bs-toggle="dropdown" aria-expanded="false">
More <i data-feather="chevron-down"></i>
</button>
<ul class="dropdown-menu" aria-labelledby="navbarDropdown"><li><a class="dropdown-item" href="reference.html">Reference</a></li>
</ul></li>
</ul></div>
<!--
<form class="d-flex col-md-2 search-form">
<fieldset disabled>
<input class="form-control me-2 searchbox" type="search" placeholder="" aria-label="">
<button class="btn btn-outline-success tablet-search-button" type="submit">
<i class="search-icon" data-feather="search" role="img" aria-label="Search the All In One page"></i>
</button>
</fieldset>
</form>
-->
<a id="search-button" class="btn btn-primary" href="aio.html" role="button" aria-label="Search the All In One page">Search the All In One page</a>
</div><!--/div.container-fluid -->
</nav><div class="col-md-12 mobile-title">
Introduction to deep learning
</div>
<aside class="col-md-12 lesson-progress"><div style="width: 31%" class="percentage">
31%
</div>
<div class="progress incubator">
<div class="progress-bar incubator" role="progressbar" style="width: 31%" aria-valuenow="31" aria-label="Lesson Progress" aria-valuemin="0" aria-valuemax="100">
</div>
</div>
</aside><div class="container">
<div class="row">
<!-- START: inst/pkgdown/templates/navbar.html -->
<div id="sidebar-col" class="col-lg-4">
<div id="sidebar" class="sidebar">
<nav aria-labelledby="flush-headingEleven"><button role="button" aria-label="close menu" alt="close menu" aria-expanded="true" aria-controls="sidebar" class="collapse-toggle" data-collapse="Collapse " data-episodes="Episodes ">
<i class="search-icon" data-feather="x" role="img"></i>
</button>
<div class="sidebar-inner">
<div class="row mobile-row" id="theme-row-mobile">
<div class="col" id="theme-selector">
<li class="nav-item dropdown" id="theme-button-list">
<button class="btn btn-link nav-link px-0 px-lg-2 dropdown-toggle d-flex align-items-center" id="bd-theme" type="button" aria-expanded="false" data-bs-toggle="dropdown" data-bs-display="static" aria-label="Toggle theme (auto)">
<svg class="bi my-1 theme-icon-active"><use href="#circle-half"></use></svg><span class="d-lg-none ms-1" id="bd-theme-text">Toggle Theme</span>
</button>
<ul class="dropdown-menu dropdown-menu-right" aria-labelledby="bd-theme-text"><li>
<button type="button" class="btn dropdown-item d-flex align-items-center" data-bs-theme-value="light" aria-pressed="false">
<svg class="bi me-2 theme-icon"><use href="#sun-fill"></use></svg>
Light
<svg class="bi ms-auto d-none"><use href="#check2"></use></svg></button>
</li>
<li>
<button type="button" class="btn dropdown-item d-flex align-items-center" data-bs-theme-value="dark" aria-pressed="false">
<svg class="bi me-2 theme-icon"><use href="#moon-stars-fill"></use></svg>
Dark
<svg class="bi ms-auto d-none"><use href="#check2"></use></svg></button>
</li>
<li>
<button type="button" class="btn dropdown-item d-flex align-items-center active" data-bs-theme-value="auto" aria-pressed="true">
<svg class="bi me-2 theme-icon"><use href="#circle-half"></use></svg>
Auto
<svg class="bi ms-auto d-none"><use href="#check2"></use></svg></button>
</li>
</ul></li>
</div>
</div>
<div class="row mobile-row">
<div class="col">
<div class="sidenav-view-selector">
<div class="accordion accordion-flush" id="accordionFlush9">
<div class="accordion-item">
<h2 class="accordion-header" id="flush-headingNine">
<button class="accordion-button collapsed" id="instructor" type="button" data-bs-toggle="collapse" data-bs-target="#flush-collapseNine" aria-expanded="false" aria-controls="flush-collapseNine">
<i id="eye" aria-hidden="true" class="icon" data-feather="eye"></i> Learner View
</button>
</h2>
<div id="flush-collapseNine" class="accordion-collapse collapse" aria-labelledby="flush-headingNine" data-bs-parent="#accordionFlush2">
<div class="accordion-body">
<a href="instructor/3-monitor-the-model.html">Instructor View</a>
</div>
</div>
</div><!--/div.accordion-item-->
</div><!--/div.accordion-flush-->
</div><!--div.sidenav-view-selector -->
</div><!--/div.col -->
<hr></div><!--/div.mobile-row -->
<div class="accordion accordion-flush" id="accordionFlush11">
<div class="accordion-item">
<button id="chapters" class="accordion-button show" type="button" data-bs-toggle="collapse" data-bs-target="#flush-collapseEleven" aria-expanded="false" aria-controls="flush-collapseEleven">
<h2 class="accordion-header chapters" id="flush-headingEleven">
EPISODES
</h2>
</button>
<div id="flush-collapseEleven" class="accordion-collapse show collapse" aria-labelledby="flush-headingEleven" data-bs-parent="#accordionFlush11">
<div class="accordion-body">
<div class="accordion accordion-flush" id="accordionFlush1">
<div class="accordion-item">
<div class="accordion-header" id="flush-heading1">
<a href="index.html">Summary and Setup</a>
</div><!--/div.accordion-header-->
</div><!--/div.accordion-item-->
</div><!--/div.accordion-flush-->
<div class="accordion accordion-flush" id="accordionFlush2">
<div class="accordion-item">
<div class="accordion-header" id="flush-heading2">
<a href="1-introduction.html">1. Introduction</a>
</div><!--/div.accordion-header-->
</div><!--/div.accordion-item-->
</div><!--/div.accordion-flush-->
<div class="accordion accordion-flush" id="accordionFlush3">
<div class="accordion-item">
<div class="accordion-header" id="flush-heading3">
<a href="2-keras.html">2. Classification by a neural network using Keras</a>
</div><!--/div.accordion-header-->
</div><!--/div.accordion-item-->
</div><!--/div.accordion-flush-->
<div class="accordion accordion-flush" id="accordionFlushcurrent">
<div class="accordion-item">
<div class="accordion-header" id="flush-headingcurrent">
<button class="accordion-button" type="button" data-bs-toggle="collapse" data-bs-target="#flush-collapsecurrent" aria-expanded="true" aria-controls="flush-collapsecurrent">
<span class="visually-hidden">Current Chapter</span>
<span class="current-chapter">
3. Monitor the training process
</span>
</button>
</div><!--/div.accordion-header-->
<div id="flush-collapsecurrent" class="accordion-collapse collapse show" aria-labelledby="flush-headingcurrent" data-bs-parent="#accordionFlushcurrent">
<div class="accordion-body">
<ul><li><a href="#formulate-outline-the-problem-weather-prediction">1. Formulate / Outline the problem: weather prediction</a></li>
<li><a href="#identify-inputs-and-outputs">2. Identify inputs and outputs</a></li>
<li><a href="#prepare-data">3. Prepare data</a></li>
<li><a href="#choose-a-pretrained-model-or-start-building-architecture-from-scratch">4. Choose a pretrained model or start building architecture from
scratch</a></li>
<li><a href="#intermezzo-how-do-neural-networks-learn">Intermezzo: How do neural networks learn?</a></li>
<li><a href="#choose-a-loss-function-and-optimizer">5. Choose a loss function and optimizer</a></li>
<li><a href="#train-the-model">6. Train the model</a></li>
<li><a href="#perform-a-predictionclassification">7. Perform a Prediction/Classification</a></li>
<li><a href="#measure-performance">8. Measure performance</a></li>
<li><a href="#refine-the-model">9. Refine the model</a></li>
<li><a href="#save-model">10. Save model</a></li>
<li><a href="#outlook">Outlook</a></li>
</ul></div><!--/div.accordion-body-->
</div><!--/div.accordion-collapse-->
</div><!--/div.accordion-item-->
</div><!--/div.accordion-flush-->
<div class="accordion accordion-flush" id="accordionFlush5">
<div class="accordion-item">
<div class="accordion-header" id="flush-heading5">
<a href="4-advanced-layer-types.html">4. Advanced layer types</a>
</div><!--/div.accordion-header-->
</div><!--/div.accordion-item-->
</div><!--/div.accordion-flush-->
<div class="accordion accordion-flush" id="accordionFlush6">
<div class="accordion-item">
<div class="accordion-header" id="flush-heading6">
<a href="5-transfer-learning.html">5. Transfer learning</a>
</div><!--/div.accordion-header-->
</div><!--/div.accordion-item-->
</div><!--/div.accordion-flush-->
<div class="accordion accordion-flush" id="accordionFlush7">
<div class="accordion-item">
<div class="accordion-header" id="flush-heading7">
<a href="6-outlook.html">6. Outlook</a>
</div><!--/div.accordion-header-->
</div><!--/div.accordion-item-->
</div><!--/div.accordion-flush-->
</div>
</div>
</div>
<hr class="half-width"><div class="accordion accordion-flush lesson-resources" id="accordionFlush12">
<div class="accordion-item">
<h2 class="accordion-header" id="flush-headingTwelve">
<button class="accordion-button collapsed" id="lesson-resources" type="button" data-bs-toggle="collapse" data-bs-target="#flush-collapseTwelve" aria-expanded="false" aria-controls="flush-collapseTwelve">
RESOURCES
</button>
</h2>
<div id="flush-collapseTwelve" class="accordion-collapse collapse" aria-labelledby="flush-headingTwelve" data-bs-parent="#accordionFlush12">
<div class="accordion-body">
<ul><li>
<a href="key-points.html">Key Points</a>
</li>
<li>
<a href="reference.html#glossary">Glossary</a>
</li>
<li>
<a href="profiles.html">Learner Profiles</a>
</li>
<li><a href="reference.html">Reference</a></li>
</ul></div>
</div>
</div>
</div>
<hr class="half-width lesson-resources"><a href="aio.html">See all in one page</a>
<hr class="d-none d-sm-block d-md-none"><div class="d-grid gap-1">
</div>
</div><!-- /div.accordion -->
</div><!-- /div.sidebar-inner -->
</nav></div><!-- /div.sidebar -->
</div><!-- /div.sidebar-col -->
<!-- END: inst/pkgdown/templates/navbar.html-->
<!-- START: inst/pkgdown/templates/content-instructor.html -->
<div class="col-xl-8 col-lg-12 primary-content">
<nav class="lesson-content mx-md-4" aria-label="Previous and Next Chapter"><!-- content for small screens --><div class="d-block d-sm-block d-md-none">
<a class="chapter-link" href="2-keras.html"><i aria-hidden="true" class="small-arrow" data-feather="arrow-left"></i>Previous</a>
<a class="chapter-link float-end" href="4-advanced-layer-types.html">Next<i aria-hidden="true" class="small-arrow" data-feather="arrow-right"></i></a>
</div>
<!-- content for large screens -->
<div class="d-none d-sm-none d-md-block">
<a class="chapter-link" href="2-keras.html" rel="prev">
<i aria-hidden="true" class="small-arrow" data-feather="arrow-left"></i>
Previous: Classification by a
</a>
<a class="chapter-link float-end" href="4-advanced-layer-types.html" rel="next">
Next: Advanced layer types...
<i aria-hidden="true" class="small-arrow" data-feather="arrow-right"></i>
</a>
</div>
<hr></nav><main id="main-content" class="main-content"><div class="container lesson-content">
<h1>Monitor the training process</h1>
<p>Last updated on 2024-12-03 |
<a href="https://github.com/carpentries-incubator/deep-learning-intro/edit/main/episodes/3-monitor-the-model.Rmd" class="external-link">Edit this page <i aria-hidden="true" data-feather="edit"></i></a></p>
<div class="text-end">
<button role="button" aria-pressed="false" tabindex="0" id="expand-code" class="pull-right" data-expand="Expand All Solutions " data-collapse="Collapse All Solutions "> Expand All Solutions <i aria-hidden="true" data-feather="plus"></i></button>
</div>
<div class="overview card">
<h2 class="card-header">Overview</h2>
<div class="row g-0">
<div class="col-md-4">
<div class="card-body">
<div class="inner">
<h3 class="card-title">Questions</h3>
<ul><li>How do I create a neural network for a regression task?</li>
<li>How does optimization work?</li>
<li>How do I monitor the training process?</li>
<li>How do I detect (and avoid) overfitting?</li>
<li>What are common options to improve the model performance?</li>
</ul></div>
</div>
</div>
<div class="col-md-8">
<div class="card-body">
<div class="inner bordered">
<h3 class="card-title">Objectives</h3>
<ul><li>Explain the importance of keeping your test set clean, by validating
on the validation set instead of the test set</li>
<li>Use the data splits to plot the training process</li>
<li>Explain how optimization works</li>
<li>Design a neural network for a regression task</li>
<li>Measure the performance of your deep neural network</li>
<li>Interpret the training plots to recognize overfitting</li>
<li>Use normalization as preparation step for deep learning</li>
<li>Implement basic strategies to prevent overfitting</li>
</ul></div>
</div>
</div>
</div>
</div>
<p>In this episode we will explore how to monitor the training progress,
evaluate the model predictions, and fine-tune the model to avoid
overfitting. For that we will use a more complex weather
dataset.</p>
<section><h2 class="section-heading" id="formulate-outline-the-problem-weather-prediction">1. Formulate / Outline the problem: weather prediction<a class="anchor" aria-label="anchor" href="#formulate-outline-the-problem-weather-prediction"></a></h2>
<hr class="half-width"><p>Here we want to work with the <em>weather prediction dataset</em>
(the light version) which can be <a href="https://doi.org/10.5281/zenodo.5071376" class="external-link">downloaded from
Zenodo</a>. It contains daily weather observations from 11 different
European cities or places through the years 2000 to 2010. For all
locations the data contains the variables ‘mean temperature’, ‘max
temperature’, and ‘min temperature’. In addition, for multiple
locations, the following variables are provided: ‘cloud_cover’,
‘wind_speed’, ‘wind_gust’, ‘humidity’, ‘pressure’, ‘global_radiation’,
‘precipitation’, ‘sunshine’, but not all of them are provided for every
location. A more extensive description of the dataset including the
different physical units is given in accompanying metadata file. The
full dataset comprises of 10 years (3654 days) of collected weather data
across Europe.</p>
<figure><img src="fig/03_weather_prediction_dataset_map.png" alt="18 European locations in the weather prediction dataset" class="figure mx-auto d-block"><div class="figcaption">European locations in the weather prediction
dataset</div>
</figure><p>A very common task with weather data is to make a prediction about
the weather sometime in the future, say the next day. In this episode,
we will try to predict tomorrow’s sunshine hours, a
challenging-to-predict feature, using a neural network with the
available weather data for one location: BASEL.</p>
</section><section><h2 class="section-heading" id="identify-inputs-and-outputs">2. Identify inputs and outputs<a class="anchor" aria-label="anchor" href="#identify-inputs-and-outputs"></a></h2>
<hr class="half-width"><div class="section level3">
<h3 id="import-dataset">Import Dataset<a class="anchor" aria-label="anchor" href="#import-dataset"></a></h3>
<p>We will now import and explore the weather data-set:</p>
<div id="load-the-data" class="callout">
<div class="callout-square">
<i class="callout-icon" data-feather="bell"></i>
</div>
<div id="load-the-data" class="callout-inner">
<h3 class="callout-title">Load the data</h3>
<div class="callout-content">
<p>If you have not downloaded the data yet, you can also load it
directly from Zenodo:</p>
<div class="codewrapper sourceCode" id="cb1">
<h3 class="code-label">PYTHON<i aria-hidden="true" data-feather="chevron-left"></i><i aria-hidden="true" data-feather="chevron-right"></i>
</h3>
<pre class="sourceCode python" tabindex="0"><code class="sourceCode python"><span id="cb1-1"><a href="#cb1-1" tabindex="-1"></a>data <span class="op">=</span> pd.read_csv(<span class="st">"https://zenodo.org/record/5071376/files/weather_prediction_dataset_light.csv?download=1"</span>)</span></code></pre>
</div>
</div>
</div>
</div>
<div class="codewrapper sourceCode" id="cb2">
<h3 class="code-label">PYTHON<i aria-hidden="true" data-feather="chevron-left"></i><i aria-hidden="true" data-feather="chevron-right"></i>
</h3>
<pre class="sourceCode python" tabindex="0"><code class="sourceCode python"><span id="cb2-1"><a href="#cb2-1" tabindex="-1"></a><span class="im">import</span> pandas <span class="im">as</span> pd</span>
<span id="cb2-2"><a href="#cb2-2" tabindex="-1"></a></span>
<span id="cb2-3"><a href="#cb2-3" tabindex="-1"></a>filename_data <span class="op">=</span> <span class="st">"weather_prediction_dataset_light.csv"</span></span>
<span id="cb2-4"><a href="#cb2-4" tabindex="-1"></a>data <span class="op">=</span> pd.read_csv(filename_data)</span>
<span id="cb2-5"><a href="#cb2-5" tabindex="-1"></a>data.head()</span></code></pre>
</div>
<table class="table"><colgroup><col width="7%"><col width="7%"><col width="17%"><col width="16%"><col width="21%"><col width="14%"><col width="14%"></colgroup><thead><tr class="header"><th align="right"></th>
<th align="right">DATE</th>
<th align="right">MONTH</th>
<th align="right">BASEL_cloud_cover</th>
<th align="right">BASEL_humidity</th>
<th align="right">BASEL_pressure</th>
<th align="right">…</th>
</tr></thead><tbody><tr class="odd"><td align="right">0</td>
<td align="right">20000101</td>
<td align="right">1</td>
<td align="right">8</td>
<td align="right">0.89</td>
<td align="right">1.0286</td>
<td align="right">…</td>
</tr><tr class="even"><td align="right">1</td>
<td align="right">20000102</td>
<td align="right">1</td>
<td align="right">8</td>
<td align="right">0.87</td>
<td align="right">1.0318</td>
<td align="right">…</td>
</tr><tr class="odd"><td align="right">2</td>
<td align="right">20000103</td>
<td align="right">1</td>
<td align="right">5</td>
<td align="right">0.81</td>
<td align="right">1.0314</td>
<td align="right">…</td>
</tr><tr class="even"><td align="right">3</td>
<td align="right">20000104</td>
<td align="right">1</td>
<td align="right">7</td>
<td align="right">0.79</td>
<td align="right">1.0262</td>
<td align="right">…</td>
</tr><tr class="odd"><td align="right">4</td>
<td align="right">20000105</td>
<td align="right">1</td>
<td align="right">5</td>
<td align="right">0.90</td>
<td align="right">1.0246</td>
<td align="right">…</td>
</tr></tbody></table></div>
<div class="section level3">
<h3 id="brief-exploration-of-the-data">Brief exploration of the data<a class="anchor" aria-label="anchor" href="#brief-exploration-of-the-data"></a></h3>
<p>Let us start with a quick look at the type of features that we find
in the data.</p>
<div class="codewrapper sourceCode" id="cb3">
<h3 class="code-label">PYTHON<i aria-hidden="true" data-feather="chevron-left"></i><i aria-hidden="true" data-feather="chevron-right"></i>
</h3>
<pre class="sourceCode python" tabindex="0"><code class="sourceCode python"><span id="cb3-1"><a href="#cb3-1" tabindex="-1"></a>data.columns</span></code></pre>
</div>
<div class="codewrapper">
<h3 class="code-label">OUTPUT<i aria-hidden="true" data-feather="chevron-left"></i><i aria-hidden="true" data-feather="chevron-right"></i>
</h3>
<pre class="output" tabindex="0"><code>Index(['DATE', 'MONTH', 'BASEL_cloud_cover', 'BASEL_humidity',
'BASEL_pressure', 'BASEL_global_radiation', 'BASEL_precipitation',
'BASEL_sunshine', 'BASEL_temp_mean', 'BASEL_temp_min', 'BASEL_temp_max',
...
'SONNBLICK_temp_min', 'SONNBLICK_temp_max', 'TOURS_humidity',
'TOURS_pressure', 'TOURS_global_radiation', 'TOURS_precipitation',
'TOURS_temp_mean', 'TOURS_temp_min', 'TOURS_temp_max'],
dtype='object')</code></pre>
</div>
<p>There are a total of 9 different measured variables (global_radiation,
humidity, etcetera).</p>
<p>Let’s have a look at the shape of the dataset:</p>
<div class="codewrapper sourceCode" id="cb5">
<h3 class="code-label">PYTHON<i aria-hidden="true" data-feather="chevron-left"></i><i aria-hidden="true" data-feather="chevron-right"></i>
</h3>
<pre class="sourceCode python" tabindex="0"><code class="sourceCode python"><span id="cb5-1"><a href="#cb5-1" tabindex="-1"></a>data.shape</span></code></pre>
</div>
<div class="codewrapper">
<h3 class="code-label">OUTPUT<i aria-hidden="true" data-feather="chevron-left"></i><i aria-hidden="true" data-feather="chevron-right"></i>
</h3>
<pre class="output" tabindex="0"><code>(3654, 91)</code></pre>
</div>
<p>This will give both the number of samples (3654) and the number of
features (89 + month + date).</p>
</div>
</section><section><h2 class="section-heading" id="prepare-data">3. Prepare data<a class="anchor" aria-label="anchor" href="#prepare-data"></a></h2>
<hr class="half-width"><div class="section level3">
<h3 id="select-a-subset-and-split-into-data-x-and-labels-y">Select a subset and split into data (X) and labels (y)<a class="anchor" aria-label="anchor" href="#select-a-subset-and-split-into-data-x-and-labels-y"></a></h3>
<p>The full dataset comprises 10 years (3654 days), from which we will
select only the first 3 years. The present dataset is sorted by “DATE”,
so for each row <code>i</code> in the table we can pick, from row
<code>i+1</code>, the value of the feature and location that we later
want our model to predict. As outlined in step 1, we would like to
predict the sunshine hours for the location: BASEL.</p>
<div class="codewrapper sourceCode" id="cb7">
<h3 class="code-label">PYTHON<i aria-hidden="true" data-feather="chevron-left"></i><i aria-hidden="true" data-feather="chevron-right"></i>
</h3>
<pre class="sourceCode python" tabindex="0"><code class="sourceCode python"><span id="cb7-1"><a href="#cb7-1" tabindex="-1"></a>nr_rows <span class="op">=</span> <span class="dv">365</span><span class="op">*</span><span class="dv">3</span> <span class="co"># 3 years</span></span>
<span id="cb7-2"><a href="#cb7-2" tabindex="-1"></a><span class="co"># data</span></span>
<span id="cb7-3"><a href="#cb7-3" tabindex="-1"></a>X_data <span class="op">=</span> data.loc[:nr_rows] <span class="co"># Select first 3 years</span></span>
<span id="cb7-4"><a href="#cb7-4" tabindex="-1"></a>X_data <span class="op">=</span> X_data.drop(columns<span class="op">=</span>[<span class="st">'DATE'</span>, <span class="st">'MONTH'</span>]) <span class="co"># Drop date and month column</span></span>
<span id="cb7-5"><a href="#cb7-5" tabindex="-1"></a></span>
<span id="cb7-6"><a href="#cb7-6" tabindex="-1"></a><span class="co"># labels (sunshine hours the next day)</span></span>
<span id="cb7-7"><a href="#cb7-7" tabindex="-1"></a>y_data <span class="op">=</span> data.loc[<span class="dv">1</span>:(nr_rows <span class="op">+</span> <span class="dv">1</span>)][<span class="st">"BASEL_sunshine"</span>]</span></code></pre>
</div>
<p>In general, it is important to check if the data contains any
unexpected values such as <code>9999</code>, <code>NaN</code> or
<code>NoneType</code>. You can use the pandas
<code>data.describe()</code> or <code>data.isnull()</code> functions for
this. If present, such values must be removed or replaced. In the present
case the data is luckily well prepared and should not contain such
values, so this step can be omitted.</p>
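<p>As a minimal sketch (not part of the lesson code, but using the
<code>data</code> DataFrame loaded above), such a check could look like
this:</p>
<div class="codewrapper sourceCode">
<h3 class="code-label">PYTHON<i aria-hidden="true" data-feather="chevron-left"></i><i aria-hidden="true" data-feather="chevron-right"></i>
</h3>
<pre class="sourceCode python" tabindex="0"><code class="sourceCode python"># Count missing values over all columns
print(data.isnull().sum().sum())

# Summary statistics help to spot implausible values such as 9999
print(data.describe())</code></pre>
</div>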
</div>
<div class="section level3">
<h3 id="split-data-and-labels-into-training-validation-and-test-set">Split data and labels into training, validation, and test set<a class="anchor" aria-label="anchor" href="#split-data-and-labels-into-training-validation-and-test-set"></a></h3>
<p>As with classical machine learning techniques, it is required in deep
learning to split off a hold-out <em>test set</em> which remains
untouched during model training and tuning. It is later used to evaluate
the model performance. In addition, we will also split off an extra
<em>validation set</em>, the reason for which will hopefully become
clearer later in this lesson.</p>
<p>To make our lives a bit easier, we employ a trick to create these 3
datasets, <code>training set</code>, <code>test set</code> and
<code>validation set</code>, by calling the
<code>train_test_split</code> method of <code>scikit-learn</code>
twice.</p>
<p>First we create the training set and leave the remaining 30% of
the data for the two hold-out sets.</p>
<div class="codewrapper sourceCode" id="cb8">
<h3 class="code-label">PYTHON<i aria-hidden="true" data-feather="chevron-left"></i><i aria-hidden="true" data-feather="chevron-right"></i>
</h3>
<pre class="sourceCode python" tabindex="0"><code class="sourceCode python"><span id="cb8-1"><a href="#cb8-1" tabindex="-1"></a><span class="im">from</span> sklearn.model_selection <span class="im">import</span> train_test_split</span>
<span id="cb8-2"><a href="#cb8-2" tabindex="-1"></a></span>
<span id="cb8-3"><a href="#cb8-3" tabindex="-1"></a>X_train, X_holdout, y_train, y_holdout <span class="op">=</span> train_test_split(X_data, y_data, test_size<span class="op">=</span><span class="fl">0.3</span>, random_state<span class="op">=</span><span class="dv">0</span>)</span></code></pre>
</div>
<p>Now we split this 30% of the data into two equally sized parts.</p>
<div class="codewrapper sourceCode" id="cb9">
<h3 class="code-label">PYTHON<i aria-hidden="true" data-feather="chevron-left"></i><i aria-hidden="true" data-feather="chevron-right"></i>
</h3>
<pre class="sourceCode python" tabindex="0"><code class="sourceCode python"><span id="cb9-1"><a href="#cb9-1" tabindex="-1"></a>X_val, X_test, y_val, y_test <span class="op">=</span> train_test_split(X_holdout, y_holdout, test_size<span class="op">=</span><span class="fl">0.5</span>, random_state<span class="op">=</span><span class="dv">0</span>)</span></code></pre>
</div>
<p>Setting the <code>random_state</code> to <code>0</code> is a
shorthand at this point. Note, however, that changing this seed of the
pseudo-random number generator will also change the composition of your
data sets. For the sake of reproducibility, this is one example of a
parameter that should not be changed at all.</p>
</div>
</section><section><h2 class="section-heading" id="choose-a-pretrained-model-or-start-building-architecture-from-scratch">4. Choose a pretrained model or start building architecture from
scratch<a class="anchor" aria-label="anchor" href="#choose-a-pretrained-model-or-start-building-architecture-from-scratch"></a></h2>
<hr class="half-width"><div class="section level3">
<h3 id="regression-and-classification">Regression and classification<a class="anchor" aria-label="anchor" href="#regression-and-classification"></a></h3>
<p>In episode 2 we trained a dense neural network on a
<em>classification task</em>. For this, one-hot encoding was used
together with a <code>Categorical Crossentropy</code> loss function.
This measured how closely the distribution of the neural network outputs
corresponded to the distribution of the three values in the one-hot
encoding. Now we want to work on a <em>regression task</em>, thus not
predicting a class label (or integer number) for a datapoint. In
regression, we predict one (and sometimes more) values of a feature.
This is typically a floating point number.</p>
<div id="exercise-architecture-of-the-network" class="callout challenge">
<div class="callout-square">
<i class="callout-icon" data-feather="zap"></i>
</div>
<div id="exercise-architecture-of-the-network" class="callout-inner">
<h3 class="callout-title">Exercise: Architecture of the network</h3>
<div class="callout-content">
<p>As we want to design a neural network architecture for a regression
task, see if you can first come up with the answers to the following
questions:</p>
<ol style="list-style-type: decimal"><li>What must be the dimension of our input layer?</li>
<li>We want to output the prediction of a single number. The output
layer of the NN hence cannot be the same as for the classification task
earlier. This is because the <code>softmax</code> activation being used
had a concrete meaning with respect to the class labels which is not
needed here. What output layer design would you choose for regression?
Hint: A layer with <code>relu</code> activation, with
<code>sigmoid</code> activation or no activation at all?</li>
<li>(Optional) How would we change the model if we would like to output
a prediction of the precipitation in Basel in <em>addition</em> to the
sunshine hours?</li>
</ol></div>
</div>
</div>
<div id="accordionSolution1" class="accordion challenge-accordion accordion-flush">
<div class="accordion-item">
<button class="accordion-button solution-button collapsed" type="button" data-bs-toggle="collapse" data-bs-target="#collapseSolution1" aria-expanded="false" aria-controls="collapseSolution1">
<h4 class="accordion-header" id="headingSolution1"> Show me the solution </h4>
</button>
<div id="collapseSolution1" class="accordion-collapse collapse" aria-labelledby="headingSolution1" data-bs-parent="#accordionSolution1">
<div class="accordion-body">
<ol style="list-style-type: decimal"><li>The shape of the input layer has to correspond to the number of
features in our data: 89</li>
<li>The output is a single value per prediction, so the output layer can
consist of a dense layer with only one node. The <em>softmax</em>
activiation function works well for a classification task, but here we
do not want to restrict the possible outcomes to the range of zero and
one. In fact, we can omit the activation in the output layer.</li>
<li>The output layer should have 2 neurons, one for each number that we
try to predict. Our y_train (and val and test) then becomes a
(n_samples, 2) matrix.</li>
</ol></div>
</div>
</div>
</div>
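<p>For the optional question 3 above, a minimal sketch of such a
two-output network could look like the following (the function name
<code>create_nn_two_outputs</code> is only used here for illustration,
and we assume the second target is <code>BASEL_precipitation</code>):</p>
<div class="codewrapper sourceCode">
<h3 class="code-label">PYTHON<i aria-hidden="true" data-feather="chevron-left"></i><i aria-hidden="true" data-feather="chevron-right"></i>
</h3>
<pre class="sourceCode python" tabindex="0"><code class="sourceCode python">from tensorflow import keras

def create_nn_two_outputs():
    inputs = keras.Input(shape=(X_data.shape[1],), name='input')
    layers_dense = keras.layers.Dense(100, 'relu')(inputs)
    layers_dense = keras.layers.Dense(50, 'relu')(layers_dense)
    # Two output nodes: one for sunshine hours, one for precipitation
    outputs = keras.layers.Dense(2)(layers_dense)
    return keras.Model(inputs=inputs, outputs=outputs)

# The labels would then need two columns, for example:
# y_data = data.loc[1:(nr_rows + 1)][["BASEL_sunshine", "BASEL_precipitation"]]</code></pre>
</div>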
<p>In our example we want to predict the sunshine hours in Basel (or any
other place in the dataset) for tomorrow, based on the weather data of
all locations in the dataset today. <code>BASEL_sunshine</code> is a
floating point value (i.e. <code>float64</code>). The network should
hence output a single float value, which is why the last layer of our
network will only consist of a single node.</p>
<p>We compose a network of two hidden layers to start off with
something. We go by a scheme with 100 neurons in the first hidden layer
and 50 neurons in the second layer. As activation function we settle on
the <code>relu</code> function, as it has proved very robust and is
widely used. To make our lives easier later, we wrap the definition of
the network in a function called <code>create_nn</code>.</p>
<div class="codewrapper sourceCode" id="cb10">
<h3 class="code-label">PYTHON<i aria-hidden="true" data-feather="chevron-left"></i><i aria-hidden="true" data-feather="chevron-right"></i>
</h3>
<pre class="sourceCode python" tabindex="0"><code class="sourceCode python"><span id="cb10-1"><a href="#cb10-1" tabindex="-1"></a><span class="im">from</span> tensorflow <span class="im">import</span> keras</span>
<span id="cb10-2"><a href="#cb10-2" tabindex="-1"></a></span>
<span id="cb10-3"><a href="#cb10-3" tabindex="-1"></a><span class="kw">def</span> create_nn():</span>
<span id="cb10-4"><a href="#cb10-4" tabindex="-1"></a> <span class="co"># Input layer</span></span>
<span id="cb10-5"><a href="#cb10-5" tabindex="-1"></a> inputs <span class="op">=</span> keras.Input(shape<span class="op">=</span>(X_data.shape[<span class="dv">1</span>],), name<span class="op">=</span><span class="st">'input'</span>)</span>
<span id="cb10-6"><a href="#cb10-6" tabindex="-1"></a></span>
<span id="cb10-7"><a href="#cb10-7" tabindex="-1"></a> <span class="co"># Dense layers</span></span>
<span id="cb10-8"><a href="#cb10-8" tabindex="-1"></a> layers_dense <span class="op">=</span> keras.layers.Dense(<span class="dv">100</span>, <span class="st">'relu'</span>)(inputs)</span>
<span id="cb10-9"><a href="#cb10-9" tabindex="-1"></a> layers_dense <span class="op">=</span> keras.layers.Dense(<span class="dv">50</span>, <span class="st">'relu'</span>)(layers_dense)</span>
<span id="cb10-10"><a href="#cb10-10" tabindex="-1"></a></span>
<span id="cb10-11"><a href="#cb10-11" tabindex="-1"></a> <span class="co"># Output layer</span></span>
<span id="cb10-12"><a href="#cb10-12" tabindex="-1"></a> outputs <span class="op">=</span> keras.layers.Dense(<span class="dv">1</span>)(layers_dense)</span>
<span id="cb10-13"><a href="#cb10-13" tabindex="-1"></a></span>
<span id="cb10-14"><a href="#cb10-14" tabindex="-1"></a> <span class="cf">return</span> keras.Model(inputs<span class="op">=</span>inputs, outputs<span class="op">=</span>outputs, name<span class="op">=</span><span class="st">"weather_prediction_model"</span>)</span>
<span id="cb10-15"><a href="#cb10-15" tabindex="-1"></a></span>
<span id="cb10-16"><a href="#cb10-16" tabindex="-1"></a>model <span class="op">=</span> create_nn()</span></code></pre>
</div>
<p>The shape of the input layer has to correspond to the number of
features in our data: <code>89</code>. We use
<code>X_data.shape[1]</code> to obtain this value dynamically.</p>
<p>The output layer here is a dense layer with only 1 node, and we have
chosen to use <em>no activation function</em>. While we might use
<em>softmax</em> for a classification task, here we do not want to
restrict the possible outcomes for a start.</p>
<p>In addition, we have chosen to write the network creation as a
function so that we can use it again later to initialize new models.</p>
<p>Let us check what our model looks like by calling the
<code>summary</code> method.</p>
<div class="codewrapper sourceCode" id="cb11">
<h3 class="code-label">PYTHON<i aria-hidden="true" data-feather="chevron-left"></i><i aria-hidden="true" data-feather="chevron-right"></i>
</h3>
<pre class="sourceCode python" tabindex="0"><code class="sourceCode python"><span id="cb11-1"><a href="#cb11-1" tabindex="-1"></a>model.summary()</span></code></pre>
</div>
<div class="codewrapper">
<h3 class="code-label">OUTPUT<i aria-hidden="true" data-feather="chevron-left"></i><i aria-hidden="true" data-feather="chevron-right"></i>
</h3>
<pre class="output" tabindex="0"><code>Model: "weather_prediction_model"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input (InputLayer) [(None, 89)] 0
_________________________________________________________________
dense (Dense) (None, 100) 9000
_________________________________________________________________
dense_1 (Dense) (None, 50) 5050
_________________________________________________________________
dense_2 (Dense) (None, 1) 51
=================================================================
Total params: 14,101
Trainable params: 14,101
Non-trainable params: 0</code></pre>
</div>
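<p>As a quick sanity check of these numbers: a dense layer has one
weight per input-output connection plus one bias per output node, so the
parameter counts follow directly from the layer sizes.</p>
<div class="codewrapper sourceCode">
<h3 class="code-label">PYTHON<i aria-hidden="true" data-feather="chevron-left"></i><i aria-hidden="true" data-feather="chevron-right"></i>
</h3>
<pre class="sourceCode python" tabindex="0"><code class="sourceCode python"># parameters of a dense layer = inputs * outputs + outputs (biases)
print(89 * 100 + 100)    # dense:   9000
print(100 * 50 + 50)     # dense_1: 5050
print(50 * 1 + 1)        # dense_2:   51
print(9000 + 5050 + 51)  # total:  14101</code></pre>
</div>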
<p>When compiling the model we can define a few very important aspects.
We will discuss them now in more detail.</p>
</div>
</section><section><h2 class="section-heading" id="intermezzo-how-do-neural-networks-learn">Intermezzo: How do neural networks learn?<a class="anchor" aria-label="anchor" href="#intermezzo-how-do-neural-networks-learn"></a></h2>
<hr class="half-width"><p>In the introduction we learned about the loss function: it quantifies
the total error of the predictions made by the model. During model
training we aim to find the model parameters that minimize the loss.
This is called optimization, but how does optimization actually
work?</p>
<div class="section level3">
<h3 id="gradient-descent">Gradient descent<a class="anchor" aria-label="anchor" href="#gradient-descent"></a></h3>
<p>Gradient descent is a widely used optimization algorithm; most other
optimization algorithms are based on it. It works as follows: Imagine a
neural network with only one neuron. Take a look at the figure below.
The plot shows the loss as a function of the weight of the neuron. As
you can see, there is a global loss minimum; we would like to find the
weight at this point of the parabola. To do this, we initialize the
model weight with some random value. Then we compute the gradient of the
loss function with respect to the weight. This tells us how much the
loss function will change if we change the weight by a small amount.
Then, we update the weight by taking a small step in the direction of
the negative gradient, so down the slope. This will slightly decrease
the loss. This process is repeated until the loss function reaches a
minimum. The size of the step that is taken in each iteration is called
the ‘learning rate’.</p>
<figure><img src="fig/03_gradient_descent.png" alt="Plot of the loss as a function of the weights. Through gradient descent the global loss minimum is found" class="figure mx-auto d-block"></figure></div>
<div class="section level3">
<h3 id="batch-gradient-descent">Batch gradient descent<a class="anchor" aria-label="anchor" href="#batch-gradient-descent"></a></h3>
<p>You could use the entire training dataset to perform one learning
step in gradient descent, which would mean that one epoch equals one
learning step. In practice, in each learning step we only use a subset
of the training data to compute the loss and the gradients. This subset
is called a ‘batch’; the number of samples in one batch is called the
‘batch size’.</p>
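<p>As an illustration of how the batch size relates to the number of
update steps (assuming the <code>X_train</code> split created earlier),
the number of batches per epoch could be computed like this:</p>
<div class="codewrapper sourceCode">
<h3 class="code-label">PYTHON<i aria-hidden="true" data-feather="chevron-left"></i><i aria-hidden="true" data-feather="chevron-right"></i>
</h3>
<pre class="sourceCode python" tabindex="0"><code class="sourceCode python">import math

batch_size = 32
# one epoch = one pass over the training data, split into batches
n_batches = math.ceil(len(X_train) / batch_size)
print(len(X_train), n_batches)</code></pre>
</div>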
<div id="exercise-gradient-descent" class="callout challenge">
<div class="callout-square">
<i class="callout-icon" data-feather="zap"></i>
</div>
<div id="exercise-gradient-descent" class="callout-inner">
<h3 class="callout-title">Exercise: Gradient descent</h3>
<div class="callout-content">
<p>Answer the following questions:</p>
<div class="section level3">
<h3 id="what-is-the-goal-of-optimization">1. What is the goal of optimization?<a class="anchor" aria-label="anchor" href="#what-is-the-goal-of-optimization"></a></h3>
<ul><li>A. To find the weights that maximize the loss function</li>
<li>B. To find the weights that minimize the loss function</li>
</ul></div>
<div class="section level3">
<h3 id="what-happens-in-one-gradient-descent-step">2. What happens in one gradient descent step?<a class="anchor" aria-label="anchor" href="#what-happens-in-one-gradient-descent-step"></a></h3>
<ul><li>A. The weights are adjusted so that we move in the direction of the
gradient, so up the slope of the loss function</li>
<li>B. The weights are adjusted so that we move in the direction of the
gradient, so down the slope of the loss function</li>
<li>C. The weights are adjusted so that we move in the direction of the
negative gradient, so up the slope of the loss function</li>
<li>D. The weights are adjusted so that we move in the direction of the
negative gradient, so down the slope of the loss function</li>
</ul></div>
<div class="section level3">
<h3 id="when-the-batch-size-is-increased">3. When the batch size is increased:<a class="anchor" aria-label="anchor" href="#when-the-batch-size-is-increased"></a></h3>
<p>(multiple answers might apply)</p>
<ul><li>A. The number of samples in an epoch also increases</li>
<li>B. The number of batches in an epoch goes down</li>
<li>C. The training progress is more jumpy, because more samples are
consulted in each update step (one batch).</li>
<li>D. The memory load (memory as in computer hardware) of the training
process is increased</li>
</ul></div>
</div>
</div>
</div>
<div id="accordionSolution2" class="accordion challenge-accordion accordion-flush">
<div class="accordion-item">
<button class="accordion-button solution-button collapsed" type="button" data-bs-toggle="collapse" data-bs-target="#collapseSolution2" aria-expanded="false" aria-controls="collapseSolution2">
<h4 class="accordion-header" id="headingSolution2"> Show me the solution </h4>
</button>
<div id="collapseSolution2" class="accordion-collapse collapse" aria-labelledby="headingSolution2" data-bs-parent="#accordionSolution2">
<div class="accordion-body">
<ol style="list-style-type: decimal"><li><p>Correct answer: B. To find the weights that minimize the loss
function. The loss function quantifies the total error of the network,
we want to have the smallest error as possible, hence we minimize the
loss.</p></li>
<li><p>Correct answer: D. The weights are adjusted so that we move in the
direction of the negative gradient, so down the slope of the loss
function. We want to move towards the global minimum, so in the opposite
direction of the gradient.</p></li>
<li>
<p>Correct answer: B & D</p>
<ul><li>A. The number of samples in an epoch also increases
(<strong>incorrect</strong>, an epoch is always defined as passing
through the training data for one cycle)</li>
<li>B. The number of batches in an epoch goes down
(<strong>correct</strong>, the number of batches is the samples in an
epoch divided by the batch size)</li>
<li>C. The training progress is more jumpy, because more samples are
consulted in each update step (one batch). (<strong>incorrect</strong>,
more samples are consulted in each update step, but this makes the
progress less jumpy since you get a more accurate estimate of the loss
in the entire dataset)</li>
<li>D. The memory load (memory as in computer hardware) of the training
process is increased (<strong>correct</strong>, the data is being loaded
one batch at a time, so more samples per batch means more memory usage)</li>
</ul></li>
</ol></div>
</div>
</div>
</div>
</div>
</section><section><h2 class="section-heading" id="choose-a-loss-function-and-optimizer">5. Choose a loss function and optimizer<a class="anchor" aria-label="anchor" href="#choose-a-loss-function-and-optimizer"></a></h2>
<hr class="half-width"><div class="section level3">
<h3 id="loss-function">Loss function<a class="anchor" aria-label="anchor" href="#loss-function"></a></h3>
<p>The loss is what the neural network will be optimized on during
training, so choosing a suitable loss function is crucial for training
neural networks. In the given case we want to encourage the predicted
values to be as close as possible to the true values. This is
commonly done by using the <em>mean squared error</em> (mse) or the
<em>mean absolute error</em> (mae), both of which should work OK in this
case. Often, mse is preferred over mae because it “punishes” large
prediction errors more severely. In Keras this is implemented in the
<code>keras.losses.MeanSquaredError</code> class (see the Keras
documentation: <a href="https://keras.io/api/losses/" class="external-link uri">https://keras.io/api/losses/</a>). This can be passed to
the <code>model.compile</code> method via the <code>loss</code>
parameter by setting it to <code>mse</code>, e.g.</p>
<!--cce:skip-->
<div class="codewrapper sourceCode" id="cb13">
<h3 class="code-label">PYTHON<i aria-hidden="true" data-feather="chevron-left"></i><i aria-hidden="true" data-feather="chevron-right"></i>
</h3>
<pre class="sourceCode python" tabindex="0"><code class="sourceCode python"><span id="cb13-1"><a href="#cb13-1" tabindex="-1"></a>model.<span class="bu">compile</span>(loss<span class="op">=</span><span class="st">'mse'</span>)</span></code></pre>
</div>
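<p>To see why <em>mse</em> “punishes” large errors more than
<em>mae</em>, compare both on a small, standalone example with one large
outlier (using numpy, not part of the lesson code):</p>
<div class="codewrapper sourceCode">
<h3 class="code-label">PYTHON<i aria-hidden="true" data-feather="chevron-left"></i><i aria-hidden="true" data-feather="chevron-right"></i>
</h3>
<pre class="sourceCode python" tabindex="0"><code class="sourceCode python">import numpy as np

y_true = np.array([2.0, 3.0, 4.0, 5.0])
y_pred = np.array([2.5, 3.5, 4.5, 15.0])  # last prediction is far off

errors = y_pred - y_true
print("mae:", np.mean(np.abs(errors)))  # 2.875
print("mse:", np.mean(errors ** 2))     # 25.1875</code></pre>
</div>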
</div>
<div class="section level3">
<h3 id="optimizer">Optimizer<a class="anchor" aria-label="anchor" href="#optimizer"></a></h3>
<p>Somewhat coupled to the loss function is the <em>optimizer</em> that
we want to use. The <em>optimizer</em> here refers to the algorithm with
which the model learns to optimize the provided loss function. A
basic example for such an optimizer would be <em>stochastic gradient
descent</em>. For now, we can largely skip this step and pick one of the
most common optimizers that works well for most tasks: the <em>Adam
optimizer</em>. Similar to activation functions, the choice of optimizer
depends on the problem you are trying to solve, your model architecture
and your data. <em>Adam</em> is a good starting point though, which is
why we chose it.</p>
<!--cce:skip-->
<div class="codewrapper sourceCode" id="cb14">
<h3 class="code-label">PYTHON<i aria-hidden="true" data-feather="chevron-left"></i><i aria-hidden="true" data-feather="chevron-right"></i>
</h3>
<pre class="sourceCode python" tabindex="0"><code class="sourceCode python"><span id="cb14-1"><a href="#cb14-1" tabindex="-1"></a>model.<span class="bu">compile</span>(optimizer<span class="op">=</span><span class="st">'adam'</span>,</span>
<span id="cb14-2"><a href="#cb14-2" tabindex="-1"></a> loss<span class="op">=</span><span class="st">'mse'</span>)</span></code></pre>
</div>
</div>
<div class="section level3">
<h3 id="metrics">Metrics<a class="anchor" aria-label="anchor" href="#metrics"></a></h3>
<p>In our first example (episode 2) we plotted the progression of the
loss during training. That is indeed a good first indicator of whether
things are working alright, i.e. whether the loss is indeed decreasing
with the number of epochs as it should. However, when models become more
complicated, the loss functions often also become less intuitive. That
is why it is good practice to monitor the training process with
additional, more intuitive metrics. They are not used to optimize the
model, but are simply recorded during training.</p>
<p>With Keras, such additional metrics can be added via the
<code>metrics=[...]</code> parameter and can contain one or multiple
metrics of interest. Here we could for instance choose <code>mae</code>
(<a href="https://glosario.carpentries.org/en/#mean_absolute_error" class="external-link">mean
absolute error</a>), or the <a href="https://glosario.carpentries.org/en/#root_mean_squared_error" class="external-link"><em>root
mean squared error</em> (RMSE)</a>, which unlike the <em>mse</em> has the
same units as the predicted values. For the sake of units, we choose the
latter.</p>
<div class="codewrapper sourceCode" id="cb15">
<h3 class="code-label">PYTHON<i aria-hidden="true" data-feather="chevron-left"></i><i aria-hidden="true" data-feather="chevron-right"></i>
</h3>
<pre class="sourceCode python" tabindex="0"><code class="sourceCode python"><span id="cb15-1"><a href="#cb15-1" tabindex="-1"></a>model.<span class="bu">compile</span>(optimizer<span class="op">=</span><span class="st">'adam'</span>,</span>
<span id="cb15-2"><a href="#cb15-2" tabindex="-1"></a> loss<span class="op">=</span><span class="st">'mse'</span>,</span>
<span id="cb15-3"><a href="#cb15-3" tabindex="-1"></a> metrics<span class="op">=</span>[keras.metrics.RootMeanSquaredError()])</span></code></pre>
</div>
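<p>Note that <code>metrics</code> takes a list, so several metrics can be recorded at once. The snippet below is just an illustration of that, tracking both the RMSE and the mean absolute error; in the rest of this lesson we stick with the RMSE alone:</p>
<!--cce:skip-->
<div class="codewrapper sourceCode">
<h3 class="code-label">PYTHON<i aria-hidden="true" data-feather="chevron-left"></i><i aria-hidden="true" data-feather="chevron-right"></i>
</h3>
<pre class="sourceCode python" tabindex="0"><code class="sourceCode python"># Illustration only: record both RMSE and MAE during training
model.compile(optimizer='adam',
              loss='mse',
              metrics=[keras.metrics.RootMeanSquaredError(), 'mae'])</code></pre>
</div>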
<p>Let’s create a <code>compile_model</code> function to easily compile
the model throughout this lesson:</p>
<div class="codewrapper sourceCode" id="cb16">
<h3 class="code-label">PYTHON<i aria-hidden="true" data-feather="chevron-left"></i><i aria-hidden="true" data-feather="chevron-right"></i>
</h3>
<pre class="sourceCode python" tabindex="0"><code class="sourceCode python"><span id="cb16-1"><a href="#cb16-1" tabindex="-1"></a><span class="kw">def</span> compile_model(model):</span>
<span id="cb16-2"><a href="#cb16-2" tabindex="-1"></a> model.<span class="bu">compile</span>(optimizer<span class="op">=</span><span class="st">'adam'</span>,</span>
<span id="cb16-3"><a href="#cb16-3" tabindex="-1"></a> loss<span class="op">=</span><span class="st">'mse'</span>,</span>
<span id="cb16-4"><a href="#cb16-4" tabindex="-1"></a> metrics<span class="op">=</span>[keras.metrics.RootMeanSquaredError()])</span>
<span id="cb16-5"><a href="#cb16-5" tabindex="-1"></a>compile_model(model)</span></code></pre>
</div>
<p>With this, we complete the compilation of our network and are ready
to start training.</p>
</div>
</section><section><h2 class="section-heading" id="train-the-model">6. Train the model<a class="anchor" aria-label="anchor" href="#train-the-model"></a></h2>
<hr class="half-width"><p>Now that we created and compiled our dense neural network, we can
start training it. One additional concept we need to introduce though,
is the <code>batch_size</code>. This defines how many samples from the
training data will be used to estimate the error gradient before the
model weights are updated. Larger batches will produce better, more
accurate gradient estimates but also less frequent updates of the
weights. Here we are going to use a batch size of 32 which is a common
starting point.</p>
<div class="codewrapper sourceCode" id="cb17">
<h3 class="code-label">PYTHON<i aria-hidden="true" data-feather="chevron-left"></i><i aria-hidden="true" data-feather="chevron-right"></i>
</h3>
<pre class="sourceCode python" tabindex="0"><code class="sourceCode python"><span id="cb17-1"><a href="#cb17-1" tabindex="-1"></a>history <span class="op">=</span> model.fit(X_train, y_train,</span>
<span id="cb17-2"><a href="#cb17-2" tabindex="-1"></a> batch_size<span class="op">=</span><span class="dv">32</span>,</span>
<span id="cb17-3"><a href="#cb17-3" tabindex="-1"></a> epochs<span class="op">=</span><span class="dv">200</span>,</span>
<span id="cb17-4"><a href="#cb17-4" tabindex="-1"></a> verbose<span class="op">=</span><span class="dv">2</span>)</span></code></pre>
</div>
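<p>To make the trade-off concrete: the batch size determines how many weight updates happen per epoch. A quick back-of-the-envelope sketch, assuming <code>X_train</code> as prepared earlier in this episode:</p>
<!--cce:skip-->
<div class="codewrapper sourceCode">
<h3 class="code-label">PYTHON<i aria-hidden="true" data-feather="chevron-left"></i><i aria-hidden="true" data-feather="chevron-right"></i>
</h3>
<pre class="sourceCode python" tabindex="0"><code class="sourceCode python">import math

batch_size = 32
# Number of weight updates performed in one pass over the training data
updates_per_epoch = math.ceil(len(X_train) / batch_size)
print(updates_per_epoch)</code></pre>
</div>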
<p>We can plot the training process using the <code>history</code>
object returned from the model training. We will create a function for
it, because we will make use of this more often in this lesson!</p>
<div class="codewrapper sourceCode" id="cb18">
<h3 class="code-label">PYTHON<i aria-hidden="true" data-feather="chevron-left"></i><i aria-hidden="true" data-feather="chevron-right"></i>
</h3>
<pre class="sourceCode python" tabindex="0"><code class="sourceCode python"><span id="cb18-1"><a href="#cb18-1" tabindex="-1"></a><span class="im">import</span> seaborn <span class="im">as</span> sns</span>
<span id="cb18-2"><a href="#cb18-2" tabindex="-1"></a><span class="im">import</span> matplotlib.pyplot <span class="im">as</span> plt</span>
<span id="cb18-3"><a href="#cb18-3" tabindex="-1"></a></span>
<span id="cb18-4"><a href="#cb18-4" tabindex="-1"></a><span class="kw">def</span> plot_history(history, metrics):</span>
<span id="cb18-5"><a href="#cb18-5" tabindex="-1"></a> <span class="co">"""</span></span>
<span id="cb18-6"><a href="#cb18-6" tabindex="-1"></a><span class="co"> Plot the training history</span></span>
<span id="cb18-7"><a href="#cb18-7" tabindex="-1"></a></span>
<span id="cb18-8"><a href="#cb18-8" tabindex="-1"></a><span class="co"> Args:</span></span>
<span id="cb18-9"><a href="#cb18-9" tabindex="-1"></a><span class="co"> history (keras History object that is returned by model.fit())</span></span>
<span id="cb18-10"><a href="#cb18-10" tabindex="-1"></a><span class="co"> metrics (str, list): Metric or a list of metrics to plot</span></span>
<span id="cb18-11"><a href="#cb18-11" tabindex="-1"></a><span class="co"> """</span></span>
<span id="cb18-12"><a href="#cb18-12" tabindex="-1"></a> history_df <span class="op">=</span> pd.DataFrame.from_dict(history.history)</span>
<span id="cb18-13"><a href="#cb18-13" tabindex="-1"></a> sns.lineplot(data<span class="op">=</span>history_df[metrics])</span>
<span id="cb18-14"><a href="#cb18-14" tabindex="-1"></a> plt.xlabel(<span class="st">"epochs"</span>)</span>
<span id="cb18-15"><a href="#cb18-15" tabindex="-1"></a> plt.ylabel(<span class="st">"metric"</span>)</span>
<span id="cb18-16"><a href="#cb18-16" tabindex="-1"></a></span>
<span id="cb18-17"><a href="#cb18-17" tabindex="-1"></a>plot_history(history, <span class="st">'root_mean_squared_error'</span>)</span></code></pre>
</div>
<figure><img src="fig/03_training_history_1_rmse.png" alt="Plot of the RMSE over epochs for the trained model that shows a decreasing error metric" class="figure mx-auto d-block"></figure><p>This looks very promising! Our metric (“RMSE”) is dropping nicely and
while it maybe keeps fluctuating a bit it does end up at fairly low
<em>RMSE</em> values. But the <em>RMSE</em> is just the root
<em>mean</em> squared error, so we might want to look a bit more in
detail how well our just trained model does in predicting the sunshine
hours.</p>
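<p>As an optional aside, Keras can also report the compiled loss and metrics on a held-out set in a single call with <code>model.evaluate</code>. This is only a sketch, assuming <code>X_test</code> and <code>y_test</code> as prepared earlier in this episode:</p>
<!--cce:skip-->
<div class="codewrapper sourceCode">
<h3 class="code-label">PYTHON<i aria-hidden="true" data-feather="chevron-left"></i><i aria-hidden="true" data-feather="chevron-right"></i>
</h3>
<pre class="sourceCode python" tabindex="0"><code class="sourceCode python"># Optional: report the mse loss and RMSE metric on the test set in one call
test_loss, test_rmse = model.evaluate(X_test, y_test, verbose=0)
print(f"Test RMSE: {test_rmse:.2f}")</code></pre>
</div>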
</section><section><h2 class="section-heading" id="perform-a-predictionclassification">7. Perform a Prediction/Classification<a class="anchor" aria-label="anchor" href="#perform-a-predictionclassification"></a></h2>
<hr class="half-width"><p>Now that we have our model trained, we can make a prediction with the
model before measuring the performance of our neural network.</p>
<div class="codewrapper sourceCode" id="cb19">
<h3 class="code-label">PYTHON<i aria-hidden="true" data-feather="chevron-left"></i><i aria-hidden="true" data-feather="chevron-right"></i>
</h3>
<pre class="sourceCode python" tabindex="0"><code class="sourceCode python"><span id="cb19-1"><a href="#cb19-1" tabindex="-1"></a>y_train_predicted <span class="op">=</span> model.predict(X_train)</span>
<span id="cb19-2"><a href="#cb19-2" tabindex="-1"></a>y_test_predicted <span class="op">=</span> model.predict(X_test)</span></code></pre>
</div>
</section><section><h2 class="section-heading" id="measure-performance">8. Measure performance<a class="anchor" aria-label="anchor" href="#measure-performance"></a></h2>
<hr class="half-width"><p>There is not a single way to evaluate how a model performs. But there
are at least two very common approaches. For a <em>classification
task</em> that is to compute a <em>confusion matrix</em> for the test
set which shows how often particular classes were predicted correctly or
incorrectly.</p>
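<p>Just for reference (we will not need it for this regression task), such a confusion matrix could be computed with scikit-learn roughly as follows; the class labels below are made-up placeholder values:</p>
<!--cce:skip-->
<div class="codewrapper sourceCode">
<h3 class="code-label">PYTHON<i aria-hidden="true" data-feather="chevron-left"></i><i aria-hidden="true" data-feather="chevron-right"></i>
</h3>
<pre class="sourceCode python" tabindex="0"><code class="sourceCode python">from sklearn.metrics import confusion_matrix

# Made-up class labels, only to show the call
y_true_classes = [0, 1, 1, 0, 1]
y_pred_classes = [0, 1, 0, 0, 1]
print(confusion_matrix(y_true_classes, y_pred_classes))</code></pre>
</div>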
<p>For the present <em>regression task</em>, it makes more sense to
compare true and predicted values in a scatter plot.</p>
<p>So, let’s look at how the predicted sunshine hours compare to
their ground truth values.</p>
<div class="codewrapper sourceCode" id="cb20">
<h3 class="code-label">PYTHON<i aria-hidden="true" data-feather="chevron-left"></i><i aria-hidden="true" data-feather="chevron-right"></i>
</h3>
<pre class="sourceCode python" tabindex="0"><code class="sourceCode python"><span id="cb20-1"><a href="#cb20-1" tabindex="-1"></a><span class="co"># We define a function that we will reuse in this lesson</span></span>
<span id="cb20-2"><a href="#cb20-2" tabindex="-1"></a><span class="kw">def</span> plot_predictions(y_pred, y_true, title):</span>
<span id="cb20-3"><a href="#cb20-3" tabindex="-1"></a> plt.style.use(<span class="st">'ggplot'</span>) <span class="co"># optional, that's only to define a visual style</span></span>
<span id="cb20-4"><a href="#cb20-4" tabindex="-1"></a> plt.scatter(y_pred, y_true, s<span class="op">=</span><span class="dv">10</span>, alpha<span class="op">=</span><span class="fl">0.5</span>)</span>
<span id="cb20-5"><a href="#cb20-5" tabindex="-1"></a> plt.xlabel(<span class="st">"predicted sunshine hours"</span>)</span>
<span id="cb20-6"><a href="#cb20-6" tabindex="-1"></a> plt.ylabel(<span class="st">"true sunshine hours"</span>)</span>
<span id="cb20-7"><a href="#cb20-7" tabindex="-1"></a> plt.title(title)</span>
<span id="cb20-8"><a href="#cb20-8" tabindex="-1"></a></span>
<span id="cb20-9"><a href="#cb20-9" tabindex="-1"></a>plot_predictions(y_train_predicted, y_train, title<span class="op">=</span><span class="st">'Predictions on the training set'</span>)</span></code></pre>
</div>
<figure><img src="fig/03_regression_predictions_trainset.png" alt="Scatter plot between predictions and true sunshine hours in Basel on the train set showing a concise spread" class="figure mx-auto d-block"></figure><div class="codewrapper sourceCode" id="cb21">
<h3 class="code-label">PYTHON<i aria-hidden="true" data-feather="chevron-left"></i><i aria-hidden="true" data-feather="chevron-right"></i>
</h3>
<pre class="sourceCode python" tabindex="0"><code class="sourceCode python"><span id="cb21-1"><a href="#cb21-1" tabindex="-1"></a>plot_predictions(y_test_predicted, y_test, title<span class="op">=</span><span class="st">'Predictions on the test set'</span>)</span></code></pre>
</div>
<figure><img src="fig/03_regression_predictions_testset.png" alt="Scatter plot between predictions and true sunshine hours in Basel on the test set showing a wide spread" class="figure mx-auto d-block"></figure><div id="exercise-reflecting-on-our-results" class="callout challenge">
<div class="callout-square">
<i class="callout-icon" data-feather="zap"></i>
</div>
<div id="exercise-reflecting-on-our-results" class="callout-inner">
<h3 class="callout-title">Exercise: Reflecting on our results</h3>
<div class="callout-content">
<ul><li>Is the performance of the model as you expected (or
better/worse)?</li>
<li>Is there a notable difference between the training set and test set?
And if so, any idea why?</li>
<li>(Optional) When developing a model, you will often vary different
aspects of your model like which features you use, model parameters and
architecture. It is important to settle on a single-number evaluation
metric to compare your models.
<ul><li>What single-number evaluation metric would you choose here and
why?</li>
</ul></li>
</ul></div>
</div>
</div>
<div id="accordionSolution3" class="accordion challenge-accordion accordion-flush">
<div class="accordion-item">
<button class="accordion-button solution-button collapsed" type="button" data-bs-toggle="collapse" data-bs-target="#collapseSolution3" aria-expanded="false" aria-controls="collapseSolution3">
<h4 class="accordion-header" id="headingSolution3"> Show me the solution </h4>
</button>
<div id="collapseSolution3" class="accordion-collapse collapse" aria-labelledby="headingSolution3" data-bs-parent="#accordionSolution3">