# -*- coding: utf-8 -*-

"""
Inspecting Model Parameters
===========================

**Author:** `Logan Thomas <https://github.com/loganthomas>`_

.. grid:: 2

    .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn

       * How to inspect a model's parameters using ``.parameters()`` and ``.named_parameters()``
       * How to collect the trainable parameters of a model
       * How to use the ``torchinfo`` package (formerly ``torch-summary``) to print a model summary
"""

#########################################################################
# Overview
# --------
#
# When building neural networks, it's helpful to be able to inspect
# parameters (model weights) at intermediate stages of development.
#
# This can help inform model architecture decisions, like how many
# neurons to put in a subsequent layer.
# Or, it can be used for debugging purposes to ensure each of the model's
# layers has the anticipated number of weights.
#
# Inspecting Parameters of a Simple Neural Network
# ------------------------------------------------
# Let's start with a simple example:
#
from torch import nn


class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits


model = NeuralNetwork()
print(model)

#########################################################################
# Layers inside a neural network are parameterized, i.e.,
# they have associated weights and biases that are optimized during training.
# Subclassing ``nn.Module`` automatically tracks all fields defined
# inside a model object, and makes all parameters accessible using a
# model's ``parameters()`` or ``named_parameters()`` methods.
#
# To inspect the shape of the parameters associated with each layer in the model,
# use ``model.parameters()``:
print([param.shape for param in model.parameters()])

#########################################################################
# Sometimes, it's more helpful to have a name associated with
# the parameters of each layer. Use ``model.named_parameters()`` to access
# the parameter name in addition to the shape:
for name, param in model.named_parameters():
    print(name, param.shape)

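#########################################################################
# A single named parameter can also be looked up directly (an illustrative
# addition using ``nn.Module.get_parameter``, passing one of the dotted
# names printed above):
first_weight = model.get_parameter("linear_relu_stack.0.weight")
print(first_weight.shape)  # torch.Size([512, 784])
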
#########################################################################
# Notice that the parameters are collected from the ``nn.Linear`` modules
# specified in the network. Because the default behavior for ``nn.Linear``
# is to include a bias term, the output shows both a ``weight`` and ``bias``
# parameter for each of the ``nn.Linear`` modules.
#
# The shapes of these parameters relate to the input shape (``in_features``)
# and output shape (``out_features``) specified in each of the model's layers.
#
# Take, for example, the first ``nn.Linear(28*28, 512)`` module specified:
layer = nn.Linear(28 * 28, 512)

for name, param in layer.named_parameters():
    print(name, param.size())

#########################################################################
# The first line from the printed ``model.named_parameters()`` section
# (``linear_relu_stack.0.weight torch.Size([512, 784])``) specifies
# the ``weight`` associated with this layer.
# The second line from the printed ``model.named_parameters()`` section
# (``linear_relu_stack.0.bias torch.Size([512])``) specifies
# the ``bias`` associated with this layer. The printed statements using ``.named_parameters()``
# are *not* meant to report the original shapes of the model's **layers**
# but the shape of the **weights** (and/or **biases**) of the **parameters of the layers**.
# This can cause confusion for new practitioners since the shape of the weights
# seems to invert the input shape and output shape specified for the Linear layers.
# These weights will be **transposed** during the matrix
# multiplication process when the model makes a prediction (as specified in the `nn.Linear <https://pytorch.org/docs/stable/generated/torch.nn.Linear.html>`__
# docstring).
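
#########################################################################
# As a quick sanity check (this snippet is an illustrative addition and
# assumes a random input batch), we can confirm that the stored ``weight``
# has shape ``(out_features, in_features)`` and that applying the layer is
# equivalent to multiplying by the transposed weight and adding the bias:
import torch

x = torch.randn(3, 28 * 28)  # a random batch of 3 flattened 28x28 inputs
manual_out = x @ layer.weight.T + layer.bias
print(layer.weight.shape)  # torch.Size([512, 784])
print(torch.allclose(manual_out, layer(x)))  # True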

#########################################################################
# There is also a helpful ``.numel()`` method that can be used to gather
# the number of elements that are in each model parameter:
for name, param in model.named_parameters():
    print(f"{name=}, {param.size()=}, {param.numel()=}")

#########################################################################
# The number of elements for each parameter is calculated by taking
# the product of the entries of the parameter's ``torch.Size``.
# ``.numel()`` can also be used to find the total number of parameters
# in a model by taking the sum across all the layer parameters:
print(f"Total model params: {sum(p.numel() for p in model.parameters()):,}")

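#########################################################################
# As a worked check (an illustrative addition), the same total can be
# computed by hand from the layer definitions: each ``nn.Linear`` layer
# contributes ``in_features * out_features`` weights plus ``out_features`` biases.
manual_total = (28 * 28 * 512 + 512) + (512 * 512 + 512) + (512 * 10 + 10)
print(f"{manual_total=:,}")  # matches the 669,706 total reported above
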
#########################################################################
# Sometimes, only the *trainable* parameters are of interest.
# Use the ``requires_grad`` attribute to collect only those parameters
# that require a gradient to be computed (i.e. those parameters that will be optimized during model training):
print(
    f"Total model trainable params: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}"
)

#########################################################################
# Since all the model weights currently require a gradient, the number
# of trainable parameters is the same as the total number of model
# parameters. Simply for educational purposes, parameters can be frozen
# to show a difference in count. Below, the first linear layer's ``weight`` parameters are frozen
# by setting ``requires_grad=False``, which will result in the trainable
# parameter count having 401,408 fewer parameters.
for name, param in model.named_parameters():
    if name == "linear_relu_stack.0.weight":
        param.requires_grad = False
    print(f"{name=}, {param.size()=}, {param.numel()=}, {param.requires_grad=}")
print(
    f"Total model trainable params: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}"
)
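
#########################################################################
# As a side note (an illustrative addition, not part of the original
# example), an entire submodule can also be frozen in a single call with
# ``requires_grad_(False)``. A fresh model instance is used here so the
# state of ``model`` above is left unchanged:
frozen_demo = NeuralNetwork()
frozen_demo.linear_relu_stack[0].requires_grad_(False)  # freezes weight *and* bias
print(
    f"Trainable params with the first linear layer fully frozen: "
    f"{sum(p.numel() for p in frozen_demo.parameters() if p.requires_grad):,}"
)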
#########################################################################
# Inspecting Parameters of a Convolutional Neural Network
# --------------------------------------------------------
# These techniques also work for Convolutional Neural Networks:


class CNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(2, 2)),
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64 * 12 * 12, 128),
            nn.ReLU(inplace=True),
            nn.Linear(128, 10),
        )

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x


cnn_model = CNN()
print(cnn_model)
print("-" * 72)
for name, param in cnn_model.named_parameters():
    print(f"{name=}, {param.size()=}, {param.numel()=}, {param.requires_grad=}")
print("-" * 72)
print(
    f"Total model trainable params: {sum(p.numel() for p in cnn_model.parameters() if p.requires_grad):,}"
)

######################################################################
# As with the simple network example above, the number of elements per parameter
# is the product of the entries of the parameter's size:
import numpy as np

for name, param in cnn_model.named_parameters():
    print(f"{name=}, {param.size()=}, {np.prod(param.size())=} == {param.numel()=}")

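######################################################################
# For a convolutional layer, this product has an intuitive breakdown
# (shown here as an illustrative addition): the ``weight`` of a ``Conv2d``
# has shape ``(out_channels, in_channels, kernel_height, kernel_width)``,
# and the ``bias`` has one element per output channel. For the first
# ``nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3)`` layer:
conv = cnn_model.features[0]
weight_count = (
    conv.out_channels * conv.in_channels * conv.kernel_size[0] * conv.kernel_size[1]
)
print(f"{weight_count=}")  # 32 * 1 * 3 * 3 = 288
print(f"{conv.weight.numel()=}")  # matches the formula above
print(f"{conv.bias.numel()=}")  # one bias element per output channel: 32
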
######################################################################
# For a more robust approach, consider using the `torchinfo package <https://github.com/TylerYep/torchinfo>`__ (formerly ``torch-summary``).
# This package provides information complementary to what is provided by ``print(model)`` in PyTorch,
# similar to TensorFlow's ``model.summary()`` API for visualizing a model.
#
# Notice that the number of trainable parameters reported by ``torchinfo`` matches
# the manually gathered count of trainable parameters.
import torchinfo

# If running from a notebook, use print(torchinfo.summary(model))
torchinfo.summary(model)
print("-" * 72)
print(
    f"Manually gathered model trainable params: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}"
)
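
######################################################################
# ``torchinfo.summary`` can also accept an ``input_size`` argument (an
# optional feature of the package, shown here as an illustrative addition).
# When provided, a dummy forward pass is performed and the per-layer
# output shapes are reported alongside the parameter counts:
torchinfo.summary(model, input_size=(1, 28 * 28))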
######################################################################
# There is one minor, but important, difference in the way ``torchinfo`` reports the number of parameters per layer.
# Notice that the ``weight`` and ``bias`` parameter counts are **combined**
# to report on the *total* number of parameters per layer.
# For example, the first linear layer of the ``model`` created in the
# "Inspecting Parameters of a Simple Neural Network" section has a
# ``weight`` parameter with ``401,408`` elements and a ``bias`` parameter
# with ``512`` elements. Combining these two yields a total
# of ``401,920`` (``401,408 + 512``) parameters for the layer -- which is
# equivalent to what the ``torchinfo`` summary showed.
#
# A similar report can be generated manually by summing parameters per layer:
from collections import defaultdict

layer_params = defaultdict(int)

for name, param in model.named_parameters():
    # combine weight and bias together using the layer name
    # (e.g. linear_relu_stack.0 = linear_relu_stack.0.weight + linear_relu_stack.0.bias)
    layer_params[name.rsplit(".", 1)[0]] += param.numel()

for name, total_params in layer_params.items():
    print(f"{name=} {total_params=:,}")

######################################################################
# These approaches work for the Convolutional Neural Network as well:

# If running from a notebook, use print(torchinfo.summary(cnn_model))
torchinfo.summary(cnn_model)
print("-" * 72)
print(
    f"Manually gathered model trainable params: {sum(p.numel() for p in cnn_model.parameters() if p.requires_grad):,}"
)
print("-" * 72)
print("Manually generated total number of parameters per layer:")
cnn_layer_params = defaultdict(int)

for name, param in cnn_model.named_parameters():
    cnn_layer_params[name.rsplit(".", 1)[0]] += param.numel()

for name, total_params in cnn_layer_params.items():
    print(f"{name=} {total_params=:,}")

######################################################################
# Conclusion
# ----------
#
# Layers inside a neural network have associated weights and biases
# that are optimized during training. These parameters (model weights)
# are made accessible using a model's ``parameters()`` or ``named_parameters()``
# methods. Interacting with these parameters can help inform model
# architecture decisions or support model debugging.
#
# Further Reading
# ---------------
#
# * `torchinfo <https://github.com/TylerYep/torchinfo>`__: provides information complementary to what is provided by ``print(model)`` in PyTorch, similar to TensorFlow's ``model.summary()`` API.