patrick-kidger · Artur-Galstyan · Aug 12, 2024 · Aug 17, 2024 · Aug 18, 2024 · Aug 18, 2024
diff --git a/.gitignore b/.gitignore
@@ -13,3 +13,6 @@ examples/MNIST
 examples/multipart_serialised.eqx
 .python-version
 .DS_Store
+.ruff_cache
+.pytest_cache
+.venv
diff --git a/equinox/nn/_embedding.py b/equinox/nn/_embedding.py
@@ -118,31 +118,50 @@ class RotaryPositionalEmbedding(Module, strict=True):
         ```python
         class TransformerBlock(eqx.Module):
             rope_embeddings: RotaryPositionalEmbedding
+            mha_attention: MultiheadAttention
 
-            def __init__(...):
-                self.rope_embeddings = RotaryPositionalEmbedding(...)
+            def __init__(self, embedding_size, ...):
+                self.rope_embeddings = RotaryPositionalEmbedding(embedding_size)
+                self.mha_attention = MultiheadAttention(...)
 
-            def __call__(...):
+            def __call__(self, query, key_, value, index):
                 def process_heads(
                     query_heads: Float[Array, "seq_length num_heads qk_size"],
                     key_heads: Float[Array, "seq_length num_heads qk_size"],
-                    value_heads: Float[Array, "seq_length num_heads vo_size"]
+                    value_heads: Float[Array, "seq_length num_heads vo_size"],
+                    index: Int[Array, ""],
                 ) -> tuple[
                     Float[Array, "seq_length num_heads qk_size"],
                     Float[Array, "seq_length num_heads qk_size"],
-                    Float[Array, "seq_length num_heads vo_size"]
+                    Float[Array, "seq_length num_heads vo_size"],
                 ]:
-                    query_heads = jax.vmap(self.rope_embeddings,
-                                           in_axes=1,
-                                           out_axes=1)(query_heads)
-                    key_heads = jax.vmap(self.rope_embeddings,
-                                         in_axes=1,
-                                         out_axes=1)(key_heads)
+                    # index: position of the first token in this call (RoPE offset)
+                    rope_p = functools.partial(self.rope_embeddings, offset=index)
+                    query_heads = jax.vmap(rope_p, in_axes=1, out_axes=1)(query_heads)
+                    key_heads = jax.vmap(rope_p, in_axes=1, out_axes=1)(key_heads)
 
                     return query_heads, key_heads, value_heads
 
-                x = self.mha_attention(... process_heads=process_heads)
-                ...
+                x = self.mha_attention(
+                    query=query,
+                    key_=key_,
+                    value=value,
+                    process_heads=functools.partial(process_heads, index=index),
+                )
+
+                return x
+
+
+        transformer_block = eqx.filter_jit(
+            TransformerBlock(embedding_size, ...)
+        )
+
+        q = jnp.ones(shape=(seq_length, query_size))
+        k = jnp.ones(shape=(seq_length, query_size))
+        v = jnp.ones(shape=(seq_length, query_size))
+
+        out = transformer_block(q, k, v, jnp.array(0))
+        out = transformer_block(q, k, v, jnp.array(1))
         ```
 
     ??? cite
@@ -194,12 +213,14 @@ def precompute_freqs_cis(
     def __call__(
         self,
         x: Float[Array, "seq_length embedding_size"],
+        offset: Int[Array, ""],
         *,
         key: Optional[PRNGKeyArray] = None,
     ) -> Float[Array, "seq_length embedding_size"]:
         """**Arguments:**
 
         - `x`: A JAX array of shape `(seq_length, embedding_size)`.
+        - `offset`: Scalar integer array; the position at which `x` starts.
         - `key`: Ignored; provided for compatibility with the rest of the Equinox API.
             (Keyword only argument.)
 
@@ -208,8 +229,8 @@ def __call__(
         A JAX array of shape `(seq_length, embedding_size)`, with the rotary positional
         encoding applied to the input.
         """
-
         seq_len, embedding_size = x.shape
+
         if embedding_size != self.embedding_size:
             raise ValueError(
                 f"x.shape[-1] must match self.embedding_size, "
@@ -233,19 +254,22 @@ def __call__(
                 )
                 internal_rope_embedding_cache[embedding_size] = freqs_cis
 
+        freqs_cis = jax.lax.dynamic_slice_in_dim(freqs_cis, offset, seq_len)
+
         freqs_real = jnp.tile(freqs_cis.real, (1, 2))
         freqs_imag = jnp.tile(freqs_cis.imag, (1, 2))
 
         rotate_x = self.rotate_half(x)
+
         x_rope = (x * freqs_real) + (rotate_x * freqs_imag)
         return x_rope
 
 
 RotaryPositionalEmbedding.__init__.__doc__ = """**Arguments:**
 
 - `embedding_size`: Size of the token embeddings. Must be non-negative and even.
-- `theta`: The base frequency for the sinusoidal functions. It defines the rate 
-   of oscillation for the sine and cosine waves that encode positional information 
+- `theta`: The base frequency for the sinusoidal functions. It defines the rate
+   of oscillation for the sine and cosine waves that encode positional information
    into the embeddings. The larger the theta value, the slower the oscillations
    and vice versa. Defaults to 10_000.0
 """
diff --git a/tests/test_nn.py b/tests/test_nn.py
@@ -1,3 +1,4 @@
+import functools
 import warnings
 from typing import Union
 
@@ -238,8 +239,8 @@ def test_mlp_learnt_activation():
         key=jrandom.PRNGKey(5678),
     )
     x = jnp.array([0.5, 0.7])
-    assert mlp.activation.negative_slope.shape == (2, 8)
-    assert mlp.final_activation.negative_slope.shape == (5,)
+    assert mlp.activation.negative_slope.shape == (2, 8)  # pyright: ignore
+    assert mlp.final_activation.negative_slope.shape == (5,)  # pyright: ignore
 
     @eqx.filter_jit
     @eqx.filter_grad
@@ -1352,13 +1353,15 @@ def test_prelu(getkey):
 
 def test_rope_embeddings_shapes(getkey):
     embedding_size = 32
-    rope_embeddings = eqx.nn.RotaryPositionalEmbedding(embedding_size)
 
     n_heads = 4
     seq_length = 8
     query_size = 32
     key_size = 32
 
+    rope_embeddings = eqx.nn.RotaryPositionalEmbedding(embedding_size)
+    rope_embeddings = functools.partial(rope_embeddings, offset=jnp.array(0))
+
     query_heads = jax.random.normal(
         key=getkey(), shape=(seq_length, n_heads, query_size)
     )
@@ -1436,6 +1439,10 @@ def test_rope_embeddings_values():
     )
 
     rope_embeddings = eqx.nn.RotaryPositionalEmbedding(embedding_size)
+    rope_embeddings = functools.partial(rope_embeddings, offset=jnp.array(0))
+    res = rope_embeddings(x)
+
+    assert jnp.allclose(res, expected_values, atol=1e-6)
     res = rope_embeddings(x)
 
     assert jnp.allclose(res, expected_values, atol=1e-6)