NEW: beginner recipe on layer parameters #2914

Closed
wants to merge 10 commits into from
1 change: 1 addition & 0 deletions .ci/docker/requirements.txt
@@ -69,3 +69,4 @@ pycocotools
semilearn==0.3.2
torchao==0.0.3
segment_anything==1.0
torchinfo
259 changes: 259 additions & 0 deletions recipes_source/inspecting_model_parameters.py
@@ -0,0 +1,259 @@
# -*- coding: utf-8 -*-

"""
Inspecting Model Parameters
===========================

**Author:** `Logan Thomas <https://github.com/loganthomas>`_

.. grid:: 2

    .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn

       * How to inspect a model's parameters using ``.parameters()`` and ``.named_parameters()``
       * How to collect the trainable parameters of a model
       * How to use the ``torchinfo`` package (formerly ``torch-summary``) to print a model summary
"""

#########################################################################
# Overview
# --------
#
# When building neural networks, it's helpful to be able to inspect
# parameters (model weights) at intermediate stages of development.
#
# This can help inform model architecture decisions, like how many
# neurons to put in a subsequent layer.
# It can also be used for debugging to ensure each of the model's layers
# has the anticipated number of weights.
#
# Inspecting Parameters of a Simple Neural Network
# ------------------------------------------------
# Let's start with a simple example:
#
from torch import nn


class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits


model = NeuralNetwork()
print(model)

#########################################################################
# Layers inside a neural network are parameterized, i.e.
# have associated weights and biases that are optimized during training.
# Subclassing ``nn.Module`` automatically tracks all fields defined
# inside a model object, and makes all parameters accessible using a
# model’s ``parameters()`` or ``named_parameters()`` methods.
#
# To inspect the shapes of the parameters associated with each layer in the model,
# use ``model.parameters()``:
print([param.shape for param in model.parameters()])

#########################################################################
# Sometimes, it's more helpful to be able to have a name associated with
# the parameters of each layer. Use ``model.named_parameters()`` to access
# the parameter name in addition to the shape:
for name, param in model.named_parameters():
    print(name, param.shape)

#########################################################################
# Notice that the parameters are collected from the ``nn.Linear`` modules
# specified in the network. Because the default behavior for ``nn.Linear``
# is to include a bias term, the output shows both a ``weight`` and ``bias``
# parameter for each of the ``nn.Linear`` modules.
#
# The shapes of these parameters relate to the input shape (``in_features``)
# and output shape (``out_features``) specified in each of the model's layers.
#
# Take for example the first ``nn.Linear(28*28, 512)`` module specified:
layer = nn.Linear(28 * 28, 512)

for name, param in layer.named_parameters():
    print(name, param.size())

#########################################################################
# The first line from the printed ``model.named_parameters()`` section
# (``linear_relu_stack.0.weight torch.Size([512, 784])``) specifies
# the ``weight`` associated with this layer.
# The second line from the printed ``model.named_parameters()`` section
# (``linear_relu_stack.0.bias torch.Size([512])``) specifies
# the ``bias`` associated with this layer. Note that the statements printed by
# ``.named_parameters()`` report the shapes of the layers' **parameters** (their
# **weights** and **biases**), *not* the input and output shapes of the **layers**
# themselves. This can confuse new practitioners because the shape of each weight
# appears to invert the input shape and output shape specified for the Linear layers.
# These weights will be **transposed** during the matrix
# multiplication process when the model makes a prediction (as specified in the `nn.Linear <https://pytorch.org/docs/stable/generated/torch.nn.Linear.html>`__
# docstring).
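
#########################################################################
# To make the transpose relationship concrete, here is a minimal sketch
# (not part of the original layer definitions above) that reproduces the
# layer's forward pass by hand, assuming the default ``nn.Linear`` behavior:
import torch

x = torch.rand(1, 28 * 28)  # a single flattened 28x28 input

# The stored weight has shape (out_features, in_features) = (512, 784),
# so the forward pass computes x @ weight.T + bias, yielding shape (1, 512).
manual_output = x @ layer.weight.T + layer.bias
print(layer.weight.shape)  # torch.Size([512, 784])
print(manual_output.shape)  # torch.Size([1, 512])
print(torch.allclose(manual_output, layer(x)))  # expected: True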

#########################################################################
# There is also a helpful ``.numel()`` method that can be used to gather
# the number of elements that are in each model parameter:
for name, param in model.named_parameters():
    print(f"{name=}, {param.size()=}, {param.numel()=}")

#########################################################################
# The number of elements for each parameter is calculated by taking
# the product of the entries of its ``torch.Size``.
# ``.numel()`` can also be used to count all the parameters in a model by
# summing across all the layers' parameters:
print(f"Total model params: {sum(p.numel() for p in model.parameters()):,}")

#########################################################################
# Sometimes, only the *trainable* parameters are of interest.
# Use the ``requires_grad`` attribute to collect only those parameters
# that require a gradient to be computed (i.e. those parameters that will be optimized during model training):
print(
    f"Total model trainable params: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}"
)

#########################################################################
# Since all the model weights currently require a gradient, the number
# of trainable parameters is the same as the total number of model
# parameters. Purely for educational purposes, parameters can be frozen
# to show a difference in count. Below, the first linear layer's ``weight`` parameter is frozen
# by setting ``requires_grad=False``, which reduces the trainable
# parameter count by 401,408.
for name, param in model.named_parameters():
    if name == "linear_relu_stack.0.weight":
        param.requires_grad = False
    print(f"{name=}, {param.size()=}, {param.numel()=}, {param.requires_grad=}")
print(
    f"Total model trainable params: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}"
)
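
#########################################################################
# A common shorthand (shown here as a sketch on a fresh model instance) is to
# freeze an entire submodule at once with ``requires_grad_(False)`` instead of
# toggling individual parameters; this freezes both the ``weight`` and ``bias``
# of the first linear layer:
frozen_model = NeuralNetwork()
frozen_model.linear_relu_stack[0].requires_grad_(False)
print(
    f"Trainable params after freezing the first linear layer: "
    f"{sum(p.numel() for p in frozen_model.parameters() if p.requires_grad):,}"
)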
#########################################################################
# Inspecting Parameters of a Convolutional Neural Network
# -------------------------------------------------------
# These techniques also work for Convolutional Neural Networks:


class CNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(2, 2)),
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64 * 12 * 12, 128),
            nn.ReLU(inplace=True),
            nn.Linear(128, 10),
        )

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x


cnn_model = CNN()
print(cnn_model)
print("-" * 72)
for name, param in cnn_model.named_parameters():
    print(f"{name=}, {param.size()=}, {param.numel()=}, {param.requires_grad=}")
print("-" * 72)
print(
    f"Total model trainable params: {sum(p.numel() for p in cnn_model.parameters() if p.requires_grad):,}"
)
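
######################################################################
# As a quick sanity check (a sketch assuming 1x28x28 inputs, as in the
# MNIST-style example above), a dummy forward pass through ``features``
# shows why the first ``nn.Linear`` in the classifier expects
# ``64 * 12 * 12`` input features: two 3x3 convolutions reduce 28x28 to
# 24x24, and the 2x2 max pool halves it to 12x12 with 64 channels.
with torch.no_grad():
    dummy = torch.rand(1, 1, 28, 28)
    print(cnn_model.features(dummy).shape)  # torch.Size([1, 64, 12, 12])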

######################################################################
# As with the simple network example above, the number of elements per parameter
# is the product of the parameter size:
import numpy as np

for name, param in cnn_model.named_parameters():
    print(f"{name=}, {param.size()=}, {np.prod(param.size())=} == {param.numel()=}")

######################################################################
# For a more robust approach, consider using the `torchinfo package <https://github.com/TylerYep/torchinfo>`__ (formerly ``torch-summary``).
# This package provides information complementary to what is provided by ``print(model)`` in PyTorch,
# similar to TensorFlow's ``model.summary()`` API for visualizing a model.
#
# Notice that the trainable parameter count reported by ``torchinfo`` matches
# the manually gathered count.
import torchinfo

# If running from a notebook, use print(torchinfo.summary(model))
torchinfo.summary(model)
print("-" * 72)
print(
    f"Manually gathered model trainable params: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}"
)
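######################################################################
# When given an ``input_size``, ``torchinfo`` can also report per-layer
# output shapes alongside the parameter counts. A minimal sketch (the
# batch size of 1 used here is arbitrary):

# If running from a notebook, use print(torchinfo.summary(...))
torchinfo.summary(model, input_size=(1, 28, 28))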
######################################################################
# There is one minor, but important, difference in the way ``torchinfo`` reports the number of parameters per layer.
# Notice that the ``weight`` and ``bias`` parameter counts are **combined**
# to report on the *total* number of parameters per layer.
# For example, the first linear layer of the ``model`` created in the
# "Inspecting Parameters of a Simple Neural Network" section has a
# ``weight`` parameter with ``401,408`` elements and a ``bias`` parameter
# with ``512``. Combining these two yields a total
# of ``401,920`` (``401,408+512``) parameters for the layer -- which is
# equivalent to what the ``torchinfo`` summary showed.
#
# A similar report can be generated manually by summing parameters per layer:
from collections import defaultdict

layer_params = defaultdict(int)

for name, param in model.named_parameters():
    # Combine weight and bias together using the layer name:
    # linear_relu_stack.0 = linear_relu_stack.0.weight + linear_relu_stack.0.bias
    layer_params[name.rsplit(".", 1)[0]] += param.numel()

for name, total_params in layer_params.items():
    print(f"{name=} {total_params=:,}")

######################################################################
# These approaches work for the Convolutional Neural Network as well:

# If running from a notebook, use print(torchinfo.summary(model))
torchinfo.summary(cnn_model)
print("-" * 72)
print(
    f"Manually gathered model trainable params: {sum(p.numel() for p in cnn_model.parameters() if p.requires_grad):,}"
)
print("-" * 72)
print("Manually generated total number of parameters per layer:")
cnn_layer_params = defaultdict(int)

for name, param in cnn_model.named_parameters():
    cnn_layer_params[name.rsplit(".", 1)[0]] += param.numel()

for name, total_params in cnn_layer_params.items():
    print(f"{name=} {total_params=:,}")

######################################################################
# Conclusion
# ----------
#
# Layers inside a neural network have associated weights and biases
# that are optimized during training. These parameters (model weights)
# are made accessible using a model’s ``parameters()`` or ``named_parameters()``
# methods. Interacting with these parameters can help inform model
# architecture decisions or support model debugging.
#
# Further Reading
# ---------------
#
# * `torchinfo <https://github.com/TylerYep/torchinfo>`__: provides information complementary to what is provided by ``print(model)`` in PyTorch, similar to TensorFlow's ``model.summary()`` API.
10 changes: 7 additions & 3 deletions recipes_source/recipes/README.txt
@@ -16,12 +16,12 @@ PyTorch Recipes
Saving and loading models for inference in PyTorch
https://pytorch.org/tutorials/recipes/recipes/saving_and_loading_models_for_inference.html

5. custom_dataset_transforms_loader.py
5. custom_dataset_transforms_loader.py
Developing Custom PyTorch Dataloaders
https://pytorch.org/tutorials/recipes/recipes/custom_dataset_transforms_loader.html


6. Captum_Recipe.py
6. Captum_Recipe.py
Model Interpretability using Captum
https://pytorch.org/tutorials/recipes/recipes/Captum_Recipe.html

@@ -45,7 +45,7 @@ PyTorch Recipes
Saving and loading multiple models in one file using PyTorch
https://pytorch.org/tutorials/recipes/recipes/saving_multiple_models_in_one_file.html

12. warmstarting_model_using_parameters_from_a_different_model.py
12. warmstarting_model_using_parameters_from_a_different_model.py
Warmstarting models using parameters from different model
https://pytorch.org/tutorials/recipes/recipes/warmstarting_model_using_parameters_from_a_different_model.html

@@ -60,3 +60,7 @@ PyTorch Recipes
15. amp_recipe.py
Automatic Mixed Precision
https://pytorch.org/tutorials/recipes/amp_recipe.html

16. inspecting_model_parameters.py
Inspecting Model Parameters
https://pytorch.org/tutorials/recipes/inspecting_model_parameters.html
7 changes: 7 additions & 0 deletions recipes_source/recipes_index.rst
@@ -158,6 +158,12 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu
:link: ../recipes/recipes/swap_tensors.html
:tags: Basics

.. customcarditem::
   :header: Inspecting Model Parameters
   :card_description: Learn how to gather and inspect parameters (model weights) of your models.
   :image: ../_static/img/thumbnails/cropped/inspect-model-parameters.png
   :link: ../recipes/inspecting_model_parameters.html
   :tags: Basics

.. Interpretability

@@ -434,3 +440,4 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu
/recipes/cuda_rpc
/recipes/distributed_optim_torchscript
/recipes/mobile_interpreter
/recipes/inspecting_model_parameters