Commit d78f768

Add a workaround for invalid outputs of nn.Linear on MPS (#124)
`nn.Linear` produces incorrect outputs with certain matrix sizes when using the MPS backend (pytorch/pytorch#97239). The actual issue is in the underlying `torch.nn.functional.linear` function. Work around this by using an explicit matrix multiplication whenever the MPS backend is used.
Parent commit: 5fd5f0e

File tree: 4 files changed (+22, -8 lines)
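Conceptually, the workaround sidesteps `torch.nn.functional.linear` whenever the input tensor lives on the MPS device and computes an explicit matrix multiplication plus bias add instead. A minimal sketch of the idea (the `safe_linear` helper below is illustrative only; the commit implements it as the `Linear` subclass shown in the diffs that follow):

import torch
import torch.nn.functional as F
from torch import Tensor


def safe_linear(x: Tensor, weight: Tensor, bias: Tensor) -> Tensor:
    # Illustrative helper, not part of the commit.
    if getattr(x, "is_mps", False):
        # MPS backend: avoid the affected fused kernel, multiply explicitly.
        return torch.matmul(x, weight.t()) + bias
    return F.linear(x, weight, bias)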

curated_transformers/models/pytorch/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -3,3 +3,4 @@
 from .albert.encoder import AlbertEncoder
 from .bert.encoder import BertEncoder
 from .roberta.encoder import RobertaEncoder
+from .linear import Linear

curated_transformers/models/pytorch/bert/layer.py

Lines changed: 5 additions & 6 deletions
@@ -5,6 +5,7 @@
 from .. import GeluNew
 from ..attention import AttentionMask, ScaledDotProductAttention
 from .config import BertAttentionConfig, BertLayerConfig
+from ..linear import Linear
 from ....errors import Errors


@@ -24,8 +25,8 @@ def __init__(self, config: BertAttentionConfig):

         self.dims_per_head = self.model_dim // self.num_heads
         self.attention = ScaledDotProductAttention(dropout_prob=config.dropout_prob)
-        self.input = torch.nn.Linear(self.model_dim, self.model_dim * 3)
-        self.output = torch.nn.Linear(self.model_dim, self.model_dim)
+        self.input = Linear(self.model_dim, self.model_dim * 3)
+        self.output = Linear(self.model_dim, self.model_dim)

     def _split_heads(self, x: Tensor) -> Tensor:
         """
@@ -75,10 +76,8 @@ class BertFeedForward(Module):
     def __init__(self, config: BertLayerConfig):
         super().__init__()

-        self.intermediate = torch.nn.Linear(
-            config.hidden_width, config.intermediate_width
-        )
-        self.output = torch.nn.Linear(config.intermediate_width, config.hidden_width)
+        self.intermediate = Linear(config.hidden_width, config.intermediate_width)
+        self.output = Linear(config.intermediate_width, config.hidden_width)
         if config.hidden_act == "relu":
             self.activation = torch.nn.ReLU()  # type: ignore
         elif config.hidden_act == "gelu":
curated_transformers/models/pytorch/linear.py (new file)

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+import torch
+from torch import Tensor
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class Linear(nn.Linear):
+    def forward(self, input: Tensor) -> Tensor:
+        # Work around issue with linear with the MPS backend. See:
+        # https://github.com/pytorch/pytorch/issues/97239
+        if hasattr(input, "is_mps") and input.is_mps:
+            return torch.matmul(input, self.weight.t()) + self.bias
+        else:
+            return F.linear(input, self.weight, self.bias)
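Since the new `Linear` subclasses `nn.Linear`, it is a drop-in replacement: parameter names and shapes are unchanged, and only the forward pass differs on MPS. A hypothetical usage sketch (not part of the commit; the layer sizes are made up):

import torch
from curated_transformers.models.pytorch import Linear

layer = Linear(768, 3072)
x = torch.randn(2, 16, 768)  # (batch, seq, hidden)
y = layer(x)                 # F.linear on CPU/CUDA, matmul + bias on MPS
print(y.shape)               # torch.Size([2, 16, 3072])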

curated_transformers/tests/models/test_hf_model.py

Lines changed: 2 additions & 2 deletions
@@ -72,11 +72,11 @@ def test_model_against_hf_transformers(model_config):
     Y_hf_encoder = hf_encoder(X, attention_mask=attention_mask)

     assert torch.allclose(
-        Y_encoder.last_hidden_layer_states, Y_hf_encoder.last_hidden_state
+        Y_encoder.last_hidden_layer_states, Y_hf_encoder.last_hidden_state, atol=1e-6
     )

     # Try to infer the attention mask from padding.
     Y_encoder = encoder(X)
     assert torch.allclose(
-        Y_encoder.last_hidden_layer_states, Y_hf_encoder.last_hidden_state
+        Y_encoder.last_hidden_layer_states, Y_hf_encoder.last_hidden_state, atol=1e-6
     )
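The comparison tolerance is relaxed to `atol=1e-6`, presumably because the workaround's matmul-plus-bias path is not guaranteed to be bitwise identical to the fused `torch.nn.functional.linear` path used by the Hugging Face reference model. A hypothetical illustration of the kind of difference such a tolerance absorbs (shapes are made up):

import torch
import torch.nn.functional as F

x = torch.randn(8, 256)
w = torch.randn(512, 256)
b = torch.randn(512)

fused = F.linear(x, w, b)             # single fused kernel
unfused = torch.matmul(x, w.t()) + b  # workaround path
print((fused - unfused).abs().max())  # tiny, possibly nonzero
print(torch.allclose(fused, unfused, atol=1e-6))  # True within tolerance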
