diff --git a/games/atari.py b/games/atari.py
index 56461328..568a2f60 100644
--- a/games/atari.py
+++ b/games/atari.py
@@ -84,7 +84,7 @@ def __init__(self):
         self.batch_size = 1024  # Number of parts of games to train on at each training step
         self.checkpoint_interval = int(1e3)  # Number of training steps before using the model for self-playing
         self.value_loss_weight = 0.25  # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze)
-        self.train_on_gpu = True if torch.cuda.is_available() else False  # Train on GPU if available
+        self.train_on_gpu = torch.cuda.is_available()  # Train on GPU if available
 
         self.optimizer = "SGD"  # "Adam" or "SGD". Paper uses SGD
         self.weight_decay = 1e-4  # L2 weights regularization
diff --git a/games/breakout.py b/games/breakout.py
index 26bbfc70..b8556efe 100644
--- a/games/breakout.py
+++ b/games/breakout.py
@@ -84,7 +84,7 @@ def __init__(self):
         self.batch_size = 16  # Number of parts of games to train on at each training step
         self.checkpoint_interval = 500  # Number of training steps before using the model for self-playing
         self.value_loss_weight = 0.25  # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze)
-        self.train_on_gpu = True if torch.cuda.is_available() else False  # Train on GPU if available
+        self.train_on_gpu = torch.cuda.is_available()  # Train on GPU if available
 
         self.optimizer = "Adam"  # "Adam" or "SGD". Paper uses SGD
         self.weight_decay = 1e-4  # L2 weights regularization
diff --git a/games/cartpole.py b/games/cartpole.py
index 0a3aab93..767af700 100644
--- a/games/cartpole.py
+++ b/games/cartpole.py
@@ -79,7 +79,7 @@ def __init__(self):
         self.batch_size = 128  # Number of parts of games to train on at each training step
         self.checkpoint_interval = 10  # Number of training steps before using the model for self-playing
         self.value_loss_weight = 1  # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze)
-        self.train_on_gpu = True if torch.cuda.is_available() else False  # Train on GPU if available
+        self.train_on_gpu = torch.cuda.is_available()  # Train on GPU if available
 
         self.optimizer = "Adam"  # "Adam" or "SGD". Paper uses SGD
         self.weight_decay = 1e-4  # L2 weights regularization
diff --git a/games/connect4.py b/games/connect4.py
index d7cfcd9e..4408261e 100644
--- a/games/connect4.py
+++ b/games/connect4.py
@@ -78,7 +78,7 @@ def __init__(self):
         self.batch_size = 64  # Number of parts of games to train on at each training step
         self.checkpoint_interval = 10  # Number of training steps before using the model for self-playing
         self.value_loss_weight = 0.25  # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze)
-        self.train_on_gpu = True if torch.cuda.is_available() else False  # Train on GPU if available
+        self.train_on_gpu = torch.cuda.is_available()  # Train on GPU if available
 
         self.optimizer = "Adam"  # "Adam" or "SGD". Paper uses SGD
         self.weight_decay = 1e-4  # L2 weights regularization
diff --git a/games/gomoku.py b/games/gomoku.py
index 150afa51..4d42a457 100644
--- a/games/gomoku.py
+++ b/games/gomoku.py
@@ -79,7 +79,7 @@ def __init__(self):
         self.batch_size = 512  # Number of parts of games to train on at each training step
         self.checkpoint_interval = 50  # Number of training steps before using the model for self-playing
         self.value_loss_weight = 1  # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze)
-        self.train_on_gpu = True if torch.cuda.is_available() else False  # Train on GPU if available
+        self.train_on_gpu = torch.cuda.is_available()  # Train on GPU if available
 
         self.optimizer = "Adam"  # "Adam" or "SGD". Paper uses SGD
         self.weight_decay = 1e-4  # L2 weights regularization
diff --git a/games/gridworld.py b/games/gridworld.py
index dfa47b91..4c4f8ba6 100644
--- a/games/gridworld.py
+++ b/games/gridworld.py
@@ -84,7 +84,7 @@ def __init__(self):
         self.batch_size = 128  # Number of parts of games to train on at each training step
         self.checkpoint_interval = 10  # Number of training steps before using the model for self-playing
         self.value_loss_weight = 1  # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze)
-        self.train_on_gpu = True if torch.cuda.is_available() else False  # Train on GPU if available
+        self.train_on_gpu = torch.cuda.is_available()  # Train on GPU if available
 
         self.optimizer = "Adam"  # "Adam" or "SGD". Paper uses SGD
         self.weight_decay = 1e-4  # L2 weights regularization
diff --git a/games/lunarlander.py b/games/lunarlander.py
index 5e9e6e6e..a3a771be 100644
--- a/games/lunarlander.py
+++ b/games/lunarlander.py
@@ -79,7 +79,7 @@ def __init__(self):
         self.batch_size = 64  # Number of parts of games to train on at each training step
         self.checkpoint_interval = 10  # Number of training steps before using the model for self-playing
         self.value_loss_weight = 1  # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze)
-        self.train_on_gpu = True if torch.cuda.is_available() else False  # Train on GPU if available
+        self.train_on_gpu = torch.cuda.is_available()  # Train on GPU if available
 
         self.optimizer = "Adam"  # "Adam" or "SGD". Paper uses SGD
         self.weight_decay = 1e-4  # L2 weights regularization
diff --git a/games/simple_grid.py b/games/simple_grid.py
index e66c254a..322872ac 100644
--- a/games/simple_grid.py
+++ b/games/simple_grid.py
@@ -78,7 +78,7 @@ def __init__(self):
         self.batch_size = 32  # Number of parts of games to train on at each training step
         self.checkpoint_interval = 10  # Number of training steps before using the model for self-playing
         self.value_loss_weight = 1  # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze)
-        self.train_on_gpu = True if torch.cuda.is_available() else False  # Train on GPU if available
+        self.train_on_gpu = torch.cuda.is_available()  # Train on GPU if available
 
         self.optimizer = "Adam"  # "Adam" or "SGD". Paper uses SGD
         self.weight_decay = 1e-4  # L2 weights regularization
diff --git a/games/tictactoe.py b/games/tictactoe.py
index 4245bd56..d7e24f44 100644
--- a/games/tictactoe.py
+++ b/games/tictactoe.py
@@ -78,7 +78,7 @@ def __init__(self):
         self.batch_size = 64  # Number of parts of games to train on at each training step
         self.checkpoint_interval = 10  # Number of training steps before using the model for self-playing
         self.value_loss_weight = 0.25  # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze)
-        self.train_on_gpu = True if torch.cuda.is_available() else False  # Train on GPU if available
+        self.train_on_gpu = torch.cuda.is_available()  # Train on GPU if available
 
         self.optimizer = "Adam"  # "Adam" or "SGD". Paper uses SGD
         self.weight_decay = 1e-4  # L2 weights regularization
diff --git a/games/twentyone.py b/games/twentyone.py
index 7860158e..4a6cd1f4 100644
--- a/games/twentyone.py
+++ b/games/twentyone.py
@@ -85,7 +85,7 @@ def __init__(self):
         self.batch_size = 64  # Number of parts of games to train on at each training step
         self.checkpoint_interval = 10  # Number of training steps before using the model for self-playing
         self.value_loss_weight = 0.25  # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze)
-        self.train_on_gpu = True if torch.cuda.is_available() else False  # Train on GPU if available
+        self.train_on_gpu = torch.cuda.is_available()  # Train on GPU if available
 
         self.optimizer = "SGD"  # "Adam" or "SGD". Paper uses SGD
         self.weight_decay = 1e-4  # L2 weights regularization
diff --git a/models.py b/models.py
index 4cb09f3a..fd985d65 100644
--- a/models.py
+++ b/models.py
@@ -219,14 +219,14 @@ def __init__(self, num_channels, stride=1):
         self.bn2 = torch.nn.BatchNorm2d(num_channels)
 
     def forward(self, x):
-        x = self.conv1(x)
-        x = self.bn1(x)
-        x = torch.nn.functional.relu(x)
-        x = self.conv2(x)
-        x = self.bn2(x)
-        x += x
-        x = torch.nn.functional.relu(x)
-        return x
+        out = self.conv1(x)  # Keep the block input in x so it can be added back below
+        out = self.bn1(out)
+        out = torch.nn.functional.relu(out)
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out += x  # Add the block input to the conv/bn output (the removed code had overwritten x and added the result to itself)
+        out = torch.nn.functional.relu(out)
+        return out
 
 
 # Downsample observations before representation network (See paper appendix Network Architecture)