Added the offset to RoPE embedding and fixed the pre-commit pyright #799

Closed
wants to merge 8 commits into from
3 changes: 3 additions & 0 deletions .gitignore
@@ -13,3 +13,6 @@ examples/MNIST
examples/multipart_serialised.eqx
.python-version
.DS_Store
.ruff_cache
.pytest_cache
.venv
101 changes: 81 additions & 20 deletions equinox/nn/_embedding.py
@@ -118,31 +118,76 @@ class RotaryPositionalEmbedding(Module, strict=True):
```python
class TransformerBlock(eqx.Module):
rope_embeddings: RotaryPositionalEmbedding
mha_attention: MultiheadAttention

def __init__(...):
self.rope_embeddings = RotaryPositionalEmbedding(...)
def __init__(self, embedding_size, max_seq_length, num_heads, query_size):
self.rope_embeddings = RotaryPositionalEmbedding(
embedding_size, max_seq_length
)
self.mha_attention = MultiheadAttention(
num_heads=num_heads, query_size=query_size, key=jax.random.key(0)
)

def __call__(...):
def __call__(self, query, key_, value, index):
def process_heads(
query_heads: Float[Array, "seq_length num_heads qk_size"],
key_heads: Float[Array, "seq_length num_heads qk_size"],
value_heads: Float[Array, "seq_length num_heads vo_size"]
value_heads: Float[Array, "seq_length num_heads vo_size"],
index: Int[Array, ""],
) -> tuple[
Float[Array, "seq_length num_heads qk_size"],
Float[Array, "seq_length num_heads qk_size"],
Float[Array, "seq_length num_heads vo_size"]
Float[Array, "seq_length num_heads vo_size"],
]:
query_heads = jax.vmap(self.rope_embeddings,
in_axes=1,
out_axes=1)(query_heads)
key_heads = jax.vmap(self.rope_embeddings,
in_axes=1,
out_axes=1)(key_heads)
# index is the autoregressive index of the current token
rope_p = functools.partial(self.rope_embeddings, offset=index)
query_heads = jax.vmap(rope_p, in_axes=1, out_axes=1)(query_heads)
key_heads = jax.vmap(rope_p, in_axes=1, out_axes=1)(key_heads)

return query_heads, key_heads, value_heads

x = self.mha_attention(... process_heads=process_heads)
...
x = self.mha_attention(
query=query,
key_=key_,
value=value,
process_heads=functools.partial(process_heads, index=index),
)

return x

embedding_size = 32
max_seq_length = 8
seq_length = 4
num_heads = 2
query_size = 64

transformer_block = eqx.filter_jit(
TransformerBlock(embedding_size, max_seq_length, num_heads, query_size)
)

q = jnp.ones(shape=(seq_length, query_size))
k = jnp.ones(shape=(seq_length, query_size))
v = jnp.ones(shape=(seq_length, query_size))

out = transformer_block(q, k, v, jnp.array(0))
out = transformer_block(q, k, v, jnp.array(1)) # no re-JITing
```

If you're training a transformer, you likely don't want to use any offset. In
those cases, it can be helpful to use `functools.partial` like so:
```python
embedding_size = 32
max_seq_length = 8

rot_emb = RotaryPositionalEmbedding(
embedding_size=embedding_size, max_seq_length=max_seq_length
)
rot_emb = eqx.filter_jit(rot_emb)
rot_emb_no_offset = functools.partial(rot_emb, offset=jnp.array(0))

x = jnp.ones(shape=(max_seq_length, embedding_size))

assert jnp.allclose(rot_emb(x, offset=jnp.array(0)), rot_emb_no_offset(x))
```
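In a training loop this offset-free version is then typically mapped over the head axis, just as `process_heads` does in the example further above. A quick sketch, reusing `rot_emb_no_offset`, `max_seq_length`, and `embedding_size` from the snippet just shown (the head count here is arbitrary):
```python
num_heads = 2
heads = jnp.ones(shape=(max_seq_length, num_heads, embedding_size))
# Apply the same offset-free RoPE to every head independently.
rotated = jax.vmap(rot_emb_no_offset, in_axes=1, out_axes=1)(heads)
assert rotated.shape == heads.shape
```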

??? cite
@@ -161,13 +206,16 @@ def process_heads(
"""

embedding_size: int = field(static=True)
max_seq_length: int = field(static=True)
theta: float = field(static=True, default=10_000.0)

def __check_init__(self):
if self.embedding_size < 0:
raise ValueError("`embedding_size` must not be negative.")
if (self.embedding_size % 2) != 0:
raise ValueError("`embedding_size` must be even.")
if self.max_seq_length < 0:
raise ValueError("`max_seq_length` must not be negative.")

@staticmethod
def rotate_half(x: Float[Array, "seq_length embedding_size"]):
@@ -194,12 +242,14 @@ def precompute_freqs_cis(
def __call__(
self,
x: Float[Array, "seq_length embedding_size"],
offset: Int[Array, ""],
*,
key: Optional[PRNGKeyArray] = None,
) -> Float[Array, "seq_length embedding_size"]:
"""**Arguments:**

- `x`: A JAX array of shape `(seq_length, embedding_size)`.
- `offset`: The offset to apply to the positional encoding.
- `key`: Ignored; provided for compatibility with the rest of the Equinox API.
(Keyword only argument.)

@@ -208,44 +258,55 @@ def __call__(
A JAX array of shape `(seq_length, embedding_size)`, with the rotary positional
encoding applied to the input.
"""

seq_len, embedding_size = x.shape
if embedding_size != self.embedding_size:
raise ValueError(
f"x.shape[-1] must match self.embedding_size, "
f"but {x.shape[-1]} != {self.embedding_size}"
)
if seq_len > self.max_seq_length:
raise ValueError(
f"seq_len must be less than or equal to self.max_seq_length, "
f"but {seq_len} > {self.max_seq_length}"
)

with jax.ensure_compile_time_eval():
if embedding_size in internal_rope_embedding_cache:
freqs_cis = internal_rope_embedding_cache[embedding_size]
freqs_cis_seq_len, _ = freqs_cis.shape
if seq_len > freqs_cis_seq_len:
if self.max_seq_length > freqs_cis_seq_len:
freqs_cis = self.precompute_freqs_cis(
embedding_size, seq_len, self.theta
embedding_size, self.max_seq_length, self.theta
)
internal_rope_embedding_cache[embedding_size] = freqs_cis
else:
freqs_cis = freqs_cis[:seq_len]
freqs_cis = freqs_cis[: self.max_seq_length]
else:
freqs_cis = self.precompute_freqs_cis(
embedding_size, seq_len, self.theta
embedding_size, self.max_seq_length, self.theta
)
internal_rope_embedding_cache[embedding_size] = freqs_cis

freqs_cis = jax.lax.dynamic_slice_in_dim(freqs_cis, offset, seq_len)
Review comment (Owner): This looks wrong to me. If we hit the `else` branch of `if embedding_size in internal_rope_embedding_cache: ... else:` above, then we'll compute an array of length `seq_len`, which will not all be valid when sliced into here -- we'll be indexing off the end.

Reply (Author): Yes, this was a bug. The fix is to ensure that `freqs_cis` is at least as long as `seq_len + offset`.
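
A minimal sketch of that invariant, with hypothetical helper names rather than the code in this diff: the frequency table is rebuilt whenever `offset + seq_len` would run past its end, so the dynamic slice stays in bounds instead of being clamped by `dynamic_slice_in_dim` (which would silently return frequencies for the wrong positions).

```python
import jax
import jax.numpy as jnp


def rope_freqs_cis(embedding_size: int, length: int, theta: float = 10_000.0):
    # Hypothetical stand-in for precompute_freqs_cis: a complex e^(i * m * theta_k)
    # table with one column per rotated pair of dimensions.
    inv_freq = 1.0 / (theta ** (jnp.arange(0.0, embedding_size, 2.0) / embedding_size))
    angles = jnp.outer(jnp.arange(length), inv_freq)
    return jnp.exp(1j * angles)  # shape (length, embedding_size // 2)


def freqs_for_window(embedding_size, seq_len, offset, cache=None):
    # Ensure the cached table covers positions [offset, offset + seq_len).
    needed = offset + seq_len
    if cache is None or cache.shape[0] < needed:
        cache = rope_freqs_cis(embedding_size, needed)
    return jax.lax.dynamic_slice_in_dim(cache, offset, seq_len), cache


window, cache = freqs_for_window(embedding_size=8, seq_len=4, offset=3)
assert window.shape == (4, 4)
```

Here `offset` is a plain Python int so the length check can run eagerly; in `__call__` it is a traced scalar, which is why the diff precomputes the table up to `max_seq_length` at trace time rather than sizing it from the offset.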


freqs_real = jnp.tile(freqs_cis.real, (1, 2))
freqs_imag = jnp.tile(freqs_cis.imag, (1, 2))

rotate_x = self.rotate_half(x)

x_rope = (x * freqs_real) + (rotate_x * freqs_imag)
return x_rope


RotaryPositionalEmbedding.__init__.__doc__ = """**Arguments:**

- `embedding_size`: Size of the token embeddings. Must be non-negative and even.
- `theta`: The base frequency for the sinusoidal functions. It defines the rate
of oscillation for the sine and cosine waves that encode positional information
- `theta`: The base frequency for the sinusoidal functions. It defines the rate
of oscillation for the sine and cosine waves that encode positional information
into the embeddings. The larger the theta value, the slower the oscillations
and vice versa. Defaults to 10_000.0
- `max_seq_length`: The maximum sequence length for which to precompute the
positional encodings; this determines the size of the precomputed frequency
table and the longest sequence the module will accept.
"""
19 changes: 15 additions & 4 deletions tests/test_nn.py
@@ -1,3 +1,4 @@
import functools
import warnings
from typing import Union

@@ -238,8 +239,8 @@ def test_mlp_learnt_activation():
key=jrandom.PRNGKey(5678),
)
x = jnp.array([0.5, 0.7])
assert mlp.activation.negative_slope.shape == (2, 8)
assert mlp.final_activation.negative_slope.shape == (5,)
assert mlp.activation.negative_slope.shape == (2, 8) # pyright: ignore
assert mlp.final_activation.negative_slope.shape == (5,) # pyright: ignore

@eqx.filter_jit
@eqx.filter_grad
@@ -1352,13 +1353,17 @@ def test_prelu(getkey):

def test_rope_embeddings_shapes(getkey):
embedding_size = 32
rope_embeddings = eqx.nn.RotaryPositionalEmbedding(embedding_size)

n_heads = 4
seq_length = 8
query_size = 32
key_size = 32

rope_embeddings = eqx.nn.RotaryPositionalEmbedding(
embedding_size, max_seq_length=seq_length
)
rope_embeddings = functools.partial(rope_embeddings, offset=jnp.array(0))

query_heads = jax.random.normal(
key=getkey(), shape=(seq_length, n_heads, query_size)
)
@@ -1435,7 +1440,13 @@ def test_rope_embeddings_values():
seq_length, embedding_size
)

rope_embeddings = eqx.nn.RotaryPositionalEmbedding(embedding_size)
rope_embeddings = eqx.nn.RotaryPositionalEmbedding(
embedding_size, max_seq_length=seq_length
)
rope_embeddings = functools.partial(rope_embeddings, offset=jnp.array(0))
res = rope_embeddings(x)

assert jnp.allclose(res, expected_values, atol=1e-6)
res = rope_embeddings(x)

assert jnp.allclose(res, expected_values, atol=1e-6)
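
One extra check that could complement these tests (not part of this diff, and assuming the constructor and `offset` argument exactly as introduced here): applying RoPE to a suffix of the sequence with the matching offset should reproduce the tail of the un-offset output, since each token's rotation depends only on its absolute position.

```python
import equinox as eqx
import jax.numpy as jnp

embedding_size = 32
max_seq_length = 8
k = 3  # hypothetical starting position of the suffix

rope = eqx.nn.RotaryPositionalEmbedding(embedding_size, max_seq_length=max_seq_length)
x = jnp.arange(max_seq_length * embedding_size, dtype=jnp.float32).reshape(
    max_seq_length, embedding_size
)

full = rope(x, offset=jnp.array(0))  # rotate the whole sequence from position 0
tail = rope(x[k:], offset=jnp.array(k))  # rotate only the suffix, starting at position k

assert jnp.allclose(full[k:], tail, atol=1e-6)
```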