From 6f87cb8a1921bdcd60e5941fe8a500d2941bb948 Mon Sep 17 00:00:00 2001 From: Laxma Reddy Patlolla Date: Tue, 21 Jan 2025 11:05:45 -0800 Subject: [PATCH 01/12] Kaggle presets path update (#2052) --- keras_hub/src/models/basnet/basnet_presets.py | 16 +++++++++++++++- keras_hub/src/models/basnet/basnet_test.py | 1 - 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/keras_hub/src/models/basnet/basnet_presets.py b/keras_hub/src/models/basnet/basnet_presets.py index 3d96ab7885..1e1ee8e8ee 100644 --- a/keras_hub/src/models/basnet/basnet_presets.py +++ b/keras_hub/src/models/basnet/basnet_presets.py @@ -1,3 +1,17 @@ """BASNet model preset configurations.""" -basnet_presets = {} +basnet_presets = { + "basnet_duts": { + "metadata": { + "description": ( + "BASNet model with a 34-layer ResNet backbone, pre-trained " + "on the DUTS image dataset at a 288x288 resolution. Model " + "training was performed by Hamid Ali " + "(https://github.com/hamidriasat/BASNet)." + ), + "params": 108886792, + "path": "basnet", + }, + "kaggle_handle": "kaggle://keras/basnet/keras/base1", + }, +} diff --git a/keras_hub/src/models/basnet/basnet_test.py b/keras_hub/src/models/basnet/basnet_test.py index 4147d43c7c..b5bbe405e2 100644 --- a/keras_hub/src/models/basnet/basnet_test.py +++ b/keras_hub/src/models/basnet/basnet_test.py @@ -54,7 +54,6 @@ def test_end_to_end_model_predict(self): output = model.predict(self.images) self.assertAllEqual(output.shape, (2, 64, 64, 1)) - @pytest.mark.skip(reason="disabled until preset's been uploaded to Kaggle") @pytest.mark.extra_large def test_all_presets(self): for preset in BASNetImageSegmenter.presets: From 96b2fe5bce5fef2b6405a78b8c6e0ba14a2f2f4b Mon Sep 17 00:00:00 2001 From: Matt Watson <1389937+mattdangerw@users.noreply.github.com> Date: Tue, 21 Jan 2025 13:50:37 -0800 Subject: [PATCH 02/12] Update asserts to avoid deprecated methods (#2053) --- .../preprocessing/masked_lm_mask_generator_test.py | 2 +- keras_hub/src/models/efficientnet/cba_test.py | 4 ++-- .../efficientnet/efficientnet_backbone_test.py | 12 ++++++------ .../src/models/efficientnet/fusedmbconv_test.py | 6 +++--- keras_hub/src/models/efficientnet/mbconv_test.py | 6 +++--- keras_hub/src/tests/test_case.py | 2 +- 6 files changed, 16 insertions(+), 16 deletions(-) diff --git a/keras_hub/src/layers/preprocessing/masked_lm_mask_generator_test.py b/keras_hub/src/layers/preprocessing/masked_lm_mask_generator_test.py index f3e58d3133..6b03f54d11 100644 --- a/keras_hub/src/layers/preprocessing/masked_lm_mask_generator_test.py +++ b/keras_hub/src/layers/preprocessing/masked_lm_mask_generator_test.py @@ -148,7 +148,7 @@ def test_config(self): "vocabulary_size": self.vocabulary_size, "unselectable_token_ids": unselectable_token_ids, } - self.assertDictContainsSubset(expected_config, config) + self.assertEqual(config, {**config, **expected_config}) # Test cloned masked_lm_masker can be run. 
cloned_masked_lm_masker = MaskedLMMaskGenerator.from_config(config) diff --git a/keras_hub/src/models/efficientnet/cba_test.py b/keras_hub/src/models/efficientnet/cba_test.py index ec028b1239..e9ea31ccbe 100644 --- a/keras_hub/src/models/efficientnet/cba_test.py +++ b/keras_hub/src/models/efficientnet/cba_test.py @@ -10,7 +10,7 @@ def test_same_input_output_shapes(self): layer = CBABlock(input_filters=32, output_filters=32) output = layer(inputs) - self.assertEquals(output.shape, (1, 64, 64, 32)) + self.assertEqual(output.shape, (1, 64, 64, 32)) self.assertLen(output, 1) def test_different_input_output_shapes(self): @@ -18,5 +18,5 @@ def test_different_input_output_shapes(self): layer = CBABlock(input_filters=32, output_filters=48) output = layer(inputs) - self.assertEquals(output.shape, (1, 64, 64, 48)) + self.assertEqual(output.shape, (1, 64, 64, 48)) self.assertLen(output, 1) diff --git a/keras_hub/src/models/efficientnet/efficientnet_backbone_test.py b/keras_hub/src/models/efficientnet/efficientnet_backbone_test.py index c11e636540..1f54f71925 100644 --- a/keras_hub/src/models/efficientnet/efficientnet_backbone_test.py +++ b/keras_hub/src/models/efficientnet/efficientnet_backbone_test.py @@ -87,24 +87,24 @@ def test_feature_pyramid_outputs(self): height = width = 256 outputs = model(keras.ops.ones(shape=(batch_size, height, width, 3))) levels = ["P1", "P2", "P3", "P4", "P5"] - self.assertEquals(list(outputs.keys()), levels) - self.assertEquals( + self.assertEqual(list(outputs.keys()), levels) + self.assertEqual( outputs["P1"].shape, (batch_size, height // 2**1, width // 2**1, 24), ) - self.assertEquals( + self.assertEqual( outputs["P2"].shape, (batch_size, height // 2**2, width // 2**2, 48), ) - self.assertEquals( + self.assertEqual( outputs["P3"].shape, (batch_size, height // 2**3, width // 2**3, 64), ) - self.assertEquals( + self.assertEqual( outputs["P4"].shape, (batch_size, height // 2**4, width // 2**4, 160), ) - self.assertEquals( + self.assertEqual( outputs["P5"].shape, (batch_size, height // 2**5, width // 2**5, 1280), ) diff --git a/keras_hub/src/models/efficientnet/fusedmbconv_test.py b/keras_hub/src/models/efficientnet/fusedmbconv_test.py index b12f729ddc..a3049dc462 100644 --- a/keras_hub/src/models/efficientnet/fusedmbconv_test.py +++ b/keras_hub/src/models/efficientnet/fusedmbconv_test.py @@ -10,7 +10,7 @@ def test_same_input_output_shapes(self): layer = FusedMBConvBlock(input_filters=32, output_filters=32) output = layer(inputs) - self.assertEquals(output.shape, (1, 64, 64, 32)) + self.assertEqual(output.shape, (1, 64, 64, 32)) self.assertLen(output, 1) def test_different_input_output_shapes(self): @@ -18,7 +18,7 @@ def test_different_input_output_shapes(self): layer = FusedMBConvBlock(input_filters=32, output_filters=48) output = layer(inputs) - self.assertEquals(output.shape, (1, 64, 64, 48)) + self.assertEqual(output.shape, (1, 64, 64, 48)) self.assertLen(output, 1) def test_squeeze_excitation_ratio(self): @@ -28,5 +28,5 @@ def test_squeeze_excitation_ratio(self): ) output = layer(inputs) - self.assertEquals(output.shape, (1, 64, 64, 48)) + self.assertEqual(output.shape, (1, 64, 64, 48)) self.assertLen(output, 1) diff --git a/keras_hub/src/models/efficientnet/mbconv_test.py b/keras_hub/src/models/efficientnet/mbconv_test.py index ea92c7a9c6..b1085770d4 100644 --- a/keras_hub/src/models/efficientnet/mbconv_test.py +++ b/keras_hub/src/models/efficientnet/mbconv_test.py @@ -10,7 +10,7 @@ def test_same_input_output_shapes(self): layer = MBConvBlock(input_filters=32, 
output_filters=32) output = layer(inputs) - self.assertEquals(output.shape, (1, 64, 64, 32)) + self.assertEqual(output.shape, (1, 64, 64, 32)) self.assertLen(output, 1) def test_different_input_output_shapes(self): @@ -18,7 +18,7 @@ def test_different_input_output_shapes(self): layer = MBConvBlock(input_filters=32, output_filters=48) output = layer(inputs) - self.assertEquals(output.shape, (1, 64, 64, 48)) + self.assertEqual(output.shape, (1, 64, 64, 48)) self.assertLen(output, 1) def test_squeeze_excitation_ratio(self): @@ -26,5 +26,5 @@ def test_squeeze_excitation_ratio(self): layer = MBConvBlock(input_filters=32, output_filters=48, se_ratio=0.25) output = layer(inputs) - self.assertEquals(output.shape, (1, 64, 64, 48)) + self.assertEqual(output.shape, (1, 64, 64, 48)) self.assertLen(output, 1) diff --git a/keras_hub/src/tests/test_case.py b/keras_hub/src/tests/test_case.py index 8053fff63b..54155e0517 100644 --- a/keras_hub/src/tests/test_case.py +++ b/keras_hub/src/tests/test_case.py @@ -479,7 +479,7 @@ def run_backbone_test( # Check name maps to classname. name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", cls.__name__) name = re.sub("([a-z])([A-Z])", r"\1_\2", name).lower() - self.assertRegexpMatches(backbone.name, name) + self.assertRegex(backbone.name, name) # Check mixed precision. if run_mixed_precision_check: From 221ea6b44984bd8e42b281d9ad659123e4453496 Mon Sep 17 00:00:00 2001 From: Siva Sravana Kumar Neeli <113718461+sineeli@users.noreply.github.com> Date: Mon, 27 Jan 2025 16:47:12 -0800 Subject: [PATCH 03/12] Add `pad_to_aspect_ratio` flag to ImageConverter (#2045) * Add `pad_to_aspect_ratio` flag to ImageConverter * skip resize test with pad_to_aspect_ratio when backend set to torch * nit --- .../layers/preprocessing/image_converter.py | 10 ++++++++ .../preprocessing/image_converter_test.py | 23 ++++++++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/keras_hub/src/layers/preprocessing/image_converter.py b/keras_hub/src/layers/preprocessing/image_converter.py index db1b4b7756..2edeffb666 100644 --- a/keras_hub/src/layers/preprocessing/image_converter.py +++ b/keras_hub/src/layers/preprocessing/image_converter.py @@ -98,6 +98,7 @@ def __init__( scale=None, offset=None, crop_to_aspect_ratio=True, + pad_to_aspect_ratio=False, interpolation="bilinear", data_format=None, **kwargs, @@ -112,12 +113,19 @@ def __init__( super().__init__(**kwargs) + if crop_to_aspect_ratio and pad_to_aspect_ratio: + raise ValueError( + "Only one of 'crop_to_aspect_ratio' or 'pad_to_aspect_ratio' " + "can be True." + ) + # Create the `Resizing` layer here even if it's not being used. That # allows us to make `image_size` a settable property. 
self.resizing = keras.layers.Resizing( height=image_size[0] if image_size else None, width=image_size[1] if image_size else None, crop_to_aspect_ratio=crop_to_aspect_ratio, + pad_to_aspect_ratio=pad_to_aspect_ratio, interpolation=interpolation, data_format=data_format, dtype=self.dtype_policy, @@ -126,6 +134,7 @@ def __init__( self.scale = scale self.offset = offset self.crop_to_aspect_ratio = crop_to_aspect_ratio + self.pad_to_aspect_ratio = pad_to_aspect_ratio self.interpolation = interpolation self.data_format = standardize_data_format(data_format) @@ -182,6 +191,7 @@ def get_config(self): "offset": self.offset, "interpolation": self.interpolation, "crop_to_aspect_ratio": self.crop_to_aspect_ratio, + "pad_to_aspect_ratio": self.pad_to_aspect_ratio, } ) return config diff --git a/keras_hub/src/layers/preprocessing/image_converter_test.py b/keras_hub/src/layers/preprocessing/image_converter_test.py index d638ccf9ab..1fdc97e031 100644 --- a/keras_hub/src/layers/preprocessing/image_converter_test.py +++ b/keras_hub/src/layers/preprocessing/image_converter_test.py @@ -1,8 +1,10 @@ import os import pathlib +import keras import numpy as np import pytest +from absl.testing import parameterized from keras import ops from keras_hub.src.layers.preprocessing.image_converter import ImageConverter @@ -33,11 +35,21 @@ def test_unbatched(self): self.assertAllClose(outputs[:, :, 1], np.ones((4, 4)) * 0.301569) self.assertAllClose(outputs[:, :, 2], np.ones((4, 4)) * 0.852353) - def test_resize_batch(self): + @parameterized.parameters( + (True, False), + (False, True), + ) + @pytest.mark.skipif( + keras.config.backend() == "torch", + reason="disabled until resize is fixed for torch backend", + ) # TODO: remove skip after new release with fix of https://github.com/keras-team/keras/pull/20797 + def test_resize_batch(self, crop_to_aspect_ratio, pad_to_aspect_ratio): converter = ImageConverter( image_size=(4, 4), scale=(1.0 / 255.0, 0.8 / 255.0, 1.2 / 255.0), offset=(0.2, -0.1, 0.25), + crop_to_aspect_ratio=crop_to_aspect_ratio, + pad_to_aspect_ratio=pad_to_aspect_ratio, ) inputs = np.ones((2, 10, 10, 3)) * 128 outputs = converter(inputs) @@ -46,6 +58,15 @@ def test_resize_batch(self): self.assertAllClose(outputs[:, :, :, 1], np.ones((2, 4, 4)) * 0.301569) self.assertAllClose(outputs[:, :, :, 2], np.ones((2, 4, 4)) * 0.852353) + def test_pad_and_crop_to_aspect_ratio(self): + with self.assertRaisesRegex(ValueError, "Only one of"): + _ = ImageConverter( + image_size=(4, 4), + scale=1 / 255.0, + crop_to_aspect_ratio=True, + pad_to_aspect_ratio=True, + ) + def test_config(self): converter = ImageConverter( image_size=(12, 20), From 63863ab7bc41522c1c79a4e6cb748bd2f780b9f2 Mon Sep 17 00:00:00 2001 From: "Hongyu, Chiu" <20734616+james77777778@users.noreply.github.com> Date: Tue, 28 Jan 2025 11:10:54 +0800 Subject: [PATCH 04/12] Use Flash Attention if available (#2058) * Use Flash Attention if available * Torch's `dot_product_attention` doesn't support `bias`. 
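In short, the attention layers touched below now share one dispatch pattern:
call `ops.dot_product_attention` (which routes to Flash Attention when the
installed Keras and backend support it) and keep the explicit einsum + softmax
path as a fallback. A minimal standalone sketch of that pattern follows; it is
illustrative only (`compute_attention` and the `(batch, seq, heads, head_dim)`
tensor layout are assumptions for the sketch, not the exact library code):

    import math

    import keras
    from keras import ops


    def has_flash_attention_support():
        # Same check as the helper added to keras_hub/src/utils/keras_utils.py:
        # newer Keras versions expose `keras.config.is_flash_attention_enabled`.
        return hasattr(keras.config, "is_flash_attention_enabled")


    def compute_attention(query, key, value, attention_mask=None):
        # query/key/value: (batch, seq, num_heads, head_dim); attention_mask:
        # (batch, q_len, kv_len) with nonzero entries marking valid positions.
        inv_norm_factor = 1.0 / math.sqrt(query.shape[-1])
        if has_flash_attention_support():
            if attention_mask is not None:
                # `dot_product_attention` expects a boolean mask that
                # broadcasts over the heads axis.
                mask = ops.cast(ops.expand_dims(attention_mask, axis=1), "bool")
            else:
                mask = None
            return ops.dot_product_attention(
                query, key, value, mask=mask, scale=inv_norm_factor
            )
        # Fallback: explicit scaled dot-product attention.
        scores = ops.einsum("bquh,bkuh->buqk", query, key)
        scores = ops.multiply(scores, ops.cast(inv_norm_factor, scores.dtype))
        if attention_mask is not None:
            # Push masked-out positions to a large negative value before softmax.
            scores = ops.where(
                ops.cast(attention_mask[:, None, :, :], "bool"),
                scores,
                ops.full_like(scores, -1e9),
            )
        probs = ops.softmax(scores, axis=-1)
        return ops.einsum("buqk,bkuh->bquh", probs, value)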
--- .../src/models/falcon/falcon_attention.py | 9 ++++-- .../models/gpt_neo_x/gpt_neo_x_attention.py | 29 +++++++++++++++---- keras_hub/src/models/llama/llama_attention.py | 25 ++++++++++++++-- .../src/models/mistral/mistral_attention.py | 27 ++++++++++++++--- keras_hub/src/models/phi3/phi3_attention.py | 25 ++++++++++++++-- .../src/models/stable_diffusion_3/mmdit.py | 13 ++++----- keras_hub/src/utils/keras_utils.py | 7 +++++ 7 files changed, 110 insertions(+), 25 deletions(-) diff --git a/keras_hub/src/models/falcon/falcon_attention.py b/keras_hub/src/models/falcon/falcon_attention.py index b150a1ca89..48db9664ea 100644 --- a/keras_hub/src/models/falcon/falcon_attention.py +++ b/keras_hub/src/models/falcon/falcon_attention.py @@ -110,9 +110,11 @@ def call( attention_scores = ops.einsum("bqnh,bknh->bnqk", query, key) attention_scores = ops.add(attention_scores, alibi) - attention_scores = ( - attention_scores * self.inv_norm_factor - ) # [batch_size, num_heads, query_length, kv_length] + # [batch_size, num_heads, query_length, kv_length] + attention_scores = ops.multiply( + attention_scores, + ops.cast(self.inv_norm_factor, self.compute_dtype), + ) attention_scores = self.softmax( attention_scores, ops.expand_dims(attention_mask, 1) ) @@ -120,6 +122,7 @@ def call( attention_output = ops.einsum( "bnqk,bknh->bqnh", attention_scores, value ) + attention_output = ops.reshape( attention_output, [batch_size, seq_length, self.num_heads * self.head_dim], diff --git a/keras_hub/src/models/gpt_neo_x/gpt_neo_x_attention.py b/keras_hub/src/models/gpt_neo_x/gpt_neo_x_attention.py index 6824a641d4..a0db2c7836 100644 --- a/keras_hub/src/models/gpt_neo_x/gpt_neo_x_attention.py +++ b/keras_hub/src/models/gpt_neo_x/gpt_neo_x_attention.py @@ -1,8 +1,11 @@ +import math + import keras from keras import ops from keras_hub.src.layers.modeling.rotary_embedding import RotaryEmbedding from keras_hub.src.utils.keras_utils import clone_initializer +from keras_hub.src.utils.keras_utils import has_flash_attention_support class GPTNeoXAttention(keras.layers.Layer): @@ -58,6 +61,8 @@ def __init__( self.bias_initializer = keras.initializers.get(bias_initializer) self.max_sequence_length = max_sequence_length + self._inv_norm_factor = 1.0 / math.sqrt(self.attn_head_size) + def build(self, input_shape): self._qkv_dense = keras.layers.EinsumDense( equation="abc,cde->abde", @@ -120,14 +125,26 @@ def _masked_softmax(self, attention_scores, attention_mask=None): def _compute_attention( self, query, key, value, attention_mask=None, training=None ): - attention_scores = ops.einsum("aecd,abcd->acbe", key, query) + if has_flash_attention_support() and self.dropout == 0: + # Use `dot_product_attention` with Flash Attention support if + # available. 
+ if attention_mask is not None: + attention_mask = ops.expand_dims(attention_mask, axis=1) + attention_mask = ops.cast(attention_mask, dtype="bool") + attention_output = ops.dot_product_attention( + query, + key, + value, + mask=attention_mask, + scale=self._inv_norm_factor, + ) + return attention_output - norm_factor = ops.sqrt( - ops.convert_to_tensor(self.attn_head_size, self.compute_dtype) + attention_scores = ops.einsum("aecd,abcd->acbe", key, query) + attention_scores = ops.multiply( + attention_scores, + ops.cast(self._inv_norm_factor, self.compute_dtype), ) - - attention_scores /= norm_factor - attention_scores = self._masked_softmax( attention_scores, attention_mask ) diff --git a/keras_hub/src/models/llama/llama_attention.py b/keras_hub/src/models/llama/llama_attention.py index a8bcef9fa3..6ef8079005 100644 --- a/keras_hub/src/models/llama/llama_attention.py +++ b/keras_hub/src/models/llama/llama_attention.py @@ -1,8 +1,11 @@ +import math + import keras from keras import ops from keras_hub.src.layers.modeling.rotary_embedding import RotaryEmbedding from keras_hub.src.utils.keras_utils import clone_initializer +from keras_hub.src.utils.keras_utils import has_flash_attention_support class LlamaAttention(keras.layers.Layer): @@ -43,7 +46,7 @@ def build(self, inputs_shape): # h = head dim hidden_dim = inputs_shape[-1] head_dim = hidden_dim // self.num_query_heads - self._norm_factor = ops.sqrt(ops.cast(head_dim, self.compute_dtype)) + self._inv_norm_factor = 1.0 / math.sqrt(head_dim) self._query_dense = keras.layers.EinsumDense( equation="bqm,muh->bquh", @@ -182,9 +185,27 @@ def _masked_softmax(self, attention_scores, attention_mask=None): return self._softmax(attention_scores) def _compute_attention(self, query, key, value, attention_mask=None): + if has_flash_attention_support(): + # Use `dot_product_attention` with Flash Attention support if + # available. + if attention_mask is not None: + attention_mask = ops.expand_dims(attention_mask, axis=1) + attention_mask = ops.cast(attention_mask, dtype="bool") + attention_output = ops.dot_product_attention( + query, + key, + value, + mask=attention_mask, + scale=self._inv_norm_factor, + ) + return attention_output + attention_scores = ops.einsum(self._dot_product_equation, query, key) - attention_scores = attention_scores / self._norm_factor + attention_scores = ops.multiply( + attention_scores, + ops.cast(self._inv_norm_factor, self.compute_dtype), + ) attention_scores = self._masked_softmax( attention_scores, attention_mask ) diff --git a/keras_hub/src/models/mistral/mistral_attention.py b/keras_hub/src/models/mistral/mistral_attention.py index b0c0ecd3aa..d87a676de2 100644 --- a/keras_hub/src/models/mistral/mistral_attention.py +++ b/keras_hub/src/models/mistral/mistral_attention.py @@ -1,8 +1,11 @@ +import math + import keras from keras import ops from keras_hub.src.layers.modeling.rotary_embedding import RotaryEmbedding from keras_hub.src.utils.keras_utils import clone_initializer +from keras_hub.src.utils.keras_utils import has_flash_attention_support # This is just a self-attention layer in Mistral. 
But it can be generalized @@ -52,6 +55,7 @@ def build(self, inputs_shape): # h = head dim self._hidden_dim = inputs_shape[-1] self._head_dim = self._hidden_dim // self._num_query_heads + self._inv_norm_factor = 1.0 / math.sqrt(self._head_dim) self._query_dense = keras.layers.EinsumDense( equation="bqm,muh->bquh", @@ -192,11 +196,26 @@ def _masked_softmax(self, attention_scores, attention_mask=None): return self._softmax(attention_scores) def _compute_attention(self, query, key, value, attention_mask=None): - attention_scores = ops.einsum(self._dot_product_equation, query, key) - - norm_factor = ops.sqrt(ops.cast(self._head_dim, self.compute_dtype)) + if has_flash_attention_support(): + # Use `dot_product_attention` with Flash Attention support if + # available. + if attention_mask is not None: + attention_mask = ops.expand_dims(attention_mask, axis=1) + attention_mask = ops.cast(attention_mask, dtype="bool") + attention_output = ops.dot_product_attention( + query, + key, + value, + mask=attention_mask, + scale=self._inv_norm_factor, + ) + return attention_output - attention_scores = attention_scores / norm_factor + attention_scores = ops.einsum(self._dot_product_equation, query, key) + attention_scores = ops.multiply( + attention_scores, + ops.cast(self._inv_norm_factor, self.compute_dtype), + ) attention_scores = self._masked_softmax( attention_scores, attention_mask ) diff --git a/keras_hub/src/models/phi3/phi3_attention.py b/keras_hub/src/models/phi3/phi3_attention.py index 1c4476240d..2860799dc2 100644 --- a/keras_hub/src/models/phi3/phi3_attention.py +++ b/keras_hub/src/models/phi3/phi3_attention.py @@ -1,3 +1,5 @@ +import math + import keras from keras import ops @@ -6,6 +8,7 @@ Phi3SuScaledRotaryEmbedding, ) from keras_hub.src.utils.keras_utils import clone_initializer +from keras_hub.src.utils.keras_utils import has_flash_attention_support class Phi3Attention(keras.layers.Layer): @@ -53,7 +56,7 @@ def build(self, inputs_shape): # h = head dim hidden_dim = inputs_shape[-1] head_dim = hidden_dim // self.num_query_heads - self._norm_factor = ops.sqrt(ops.cast(head_dim, self.compute_dtype)) + self._inv_norm_factor = 1.0 / math.sqrt(head_dim) self.query_dense = keras.layers.EinsumDense( equation="bqm,muh->bquh", @@ -214,8 +217,26 @@ def _masked_softmax(self, attention_scores, attention_mask=None): return self.softmax(attention_scores) def _compute_attention(self, query, key, value, attention_mask=None): + if has_flash_attention_support(): + # Use `dot_product_attention` with Flash Attention support if + # available. 
+ if attention_mask is not None: + attention_mask = ops.expand_dims(attention_mask, axis=1) + attention_mask = ops.cast(attention_mask, dtype="bool") + attention_output = ops.dot_product_attention( + query, + key, + value, + mask=attention_mask, + scale=self._inv_norm_factor, + ) + return attention_output + attention_scores = ops.einsum("bquh,bkuh->buqk", query, key) - attention_scores = attention_scores / self._norm_factor + attention_scores = ops.multiply( + attention_scores, + ops.cast(self._inv_norm_factor, self.compute_dtype), + ) attention_scores = self._masked_softmax( attention_scores, attention_mask ) diff --git a/keras_hub/src/models/stable_diffusion_3/mmdit.py b/keras_hub/src/models/stable_diffusion_3/mmdit.py index 36cbc11d79..fc5c2d6aaa 100644 --- a/keras_hub/src/models/stable_diffusion_3/mmdit.py +++ b/keras_hub/src/models/stable_diffusion_3/mmdit.py @@ -7,6 +7,7 @@ from keras_hub.src.layers.modeling.position_embedding import PositionEmbedding from keras_hub.src.models.backbone import Backbone from keras_hub.src.utils.keras_utils import gelu_approximate +from keras_hub.src.utils.keras_utils import has_flash_attention_support from keras_hub.src.utils.keras_utils import standardize_data_format @@ -770,17 +771,14 @@ def build(self, inputs_shape, context_shape, timestep_embedding_shape): def _compute_attention(self, query, key, value): batch_size = ops.shape(query)[0] - # Use the fast path when `ops.dot_product_attention` and flash attention - # are available. - if hasattr(ops, "dot_product_attention") and hasattr( - keras.config, "is_flash_attention_enabled" - ): + if has_flash_attention_support(): + # Use `dot_product_attention` with Flash Attention support if + # available. encoded = ops.dot_product_attention( query, key, value, scale=self._inverse_sqrt_key_dim, - flash_attention=keras.config.is_flash_attention_enabled(), ) return ops.reshape( encoded, (batch_size, -1, self.num_heads * self.head_dim) @@ -793,10 +791,9 @@ def _compute_attention(self, query, key, value): probs = self.softmax(logits) probs = ops.cast(probs, self.compute_dtype) encoded = ops.einsum("BNTS,BSNH->BTNH", probs, value) - encoded = ops.reshape( + return ops.reshape( encoded, (batch_size, -1, self.num_heads * self.head_dim) ) - return encoded def call(self, inputs, context, timestep_embedding, training=None): # Compute pre-attention. 
diff --git a/keras_hub/src/utils/keras_utils.py b/keras_hub/src/utils/keras_utils.py index 29414811e4..360b030660 100644 --- a/keras_hub/src/utils/keras_utils.py +++ b/keras_hub/src/utils/keras_utils.py @@ -53,3 +53,10 @@ def standardize_data_format(data_format): f"Received: data_format={data_format}" ) return data_format + + +def has_flash_attention_support(): + if hasattr(keras.config, "is_flash_attention_enabled"): + return True + else: + return False From 8ca2076d651532f7270ffa2beba44d377b100bbf Mon Sep 17 00:00:00 2001 From: Matt Watson <1389937+mattdangerw@users.noreply.github.com> Date: Tue, 28 Jan 2025 13:56:25 -0800 Subject: [PATCH 05/12] os.make_dirs is not a thing; os.makedirs is (#2061) We were also missing any coverage for this codepath :( --- keras_hub/src/utils/preset_utils.py | 2 +- keras_hub/src/utils/preset_utils_test.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/keras_hub/src/utils/preset_utils.py b/keras_hub/src/utils/preset_utils.py index 4b4e3bc3b3..b1a295cbcc 100644 --- a/keras_hub/src/utils/preset_utils.py +++ b/keras_hub/src/utils/preset_utils.py @@ -240,7 +240,7 @@ def tf_copy_gfile_to_cache(preset, path): try: import tensorflow as tf - os.make_dirs(os.path.dirname(local_path), exist_ok=True) + os.makedirs(os.path.dirname(local_path), exist_ok=True) tf.io.gfile.copy(url, local_path) except Exception as e: # gfile.copy will leave an empty file after an error. diff --git a/keras_hub/src/utils/preset_utils_test.py b/keras_hub/src/utils/preset_utils_test.py index 787a1ea439..998dcadfa9 100644 --- a/keras_hub/src/utils/preset_utils_test.py +++ b/keras_hub/src/utils/preset_utils_test.py @@ -33,6 +33,18 @@ def test_preset_errors(self): with self.assertRaisesRegex(ValueError, "class keras_hub>BortBackbone"): BertBackbone.from_preset(preset_dir) + @pytest.mark.large + def test_tf_file_io(self): + # Load a model from Kaggle to use as a test model. + preset = "bert_tiny_en_uncased" + backbone = BertBackbone.from_preset(preset) + # Save the model on a local directory. + temp_dir = self.get_temp_dir() + local_preset_dir = os.path.join(temp_dir, "bert_preset") + backbone.save_to_preset(local_preset_dir) + # Load with "file://" which tf supports. 
+ backbone = BertBackbone.from_preset("file://" + local_preset_dir) + @pytest.mark.large def test_upload_empty_preset(self): temp_dir = self.get_temp_dir() From d71318d92e751a9d9efa725cd423ccfc325d9e70 Mon Sep 17 00:00:00 2001 From: Matt Watson <1389937+mattdangerw@users.noreply.github.com> Date: Wed, 29 Jan 2025 13:13:23 -0800 Subject: [PATCH 06/12] Update README.md (#2063) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6368571388..4a5aa35899 100644 --- a/README.md +++ b/README.md @@ -78,7 +78,7 @@ print(keras_hub.utils.decode_imagenet_predictions(preds)) Load a Bert model and fine-tune it on IMDb movie reviews: ```python -classifier = keras_hub.models.BertClassifier.from_preset( +classifier = keras_hub.models.TextClassifier.from_preset( "bert_base_en_uncased", activation="softmax", num_classes=2, From e62569965fcebe690756c06965d2b8568dddc35f Mon Sep 17 00:00:00 2001 From: balanprasanth <112931254+balanprasanth@users.noreply.github.com> Date: Thu, 30 Jan 2025 02:43:46 +0530 Subject: [PATCH 07/12] Update auto-assignment.js (#2057) update issues assignee usernames --- .github/workflows/scripts/auto-assignment.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/scripts/auto-assignment.js b/.github/workflows/scripts/auto-assignment.js index 176b305f39..d47805aca9 100644 --- a/.github/workflows/scripts/auto-assignment.js +++ b/.github/workflows/scripts/auto-assignment.js @@ -12,7 +12,7 @@ module.exports = async ({ github, context }) => { // Is this an issue? If so, assign the issue number. Otherwise, assign the PR number. if (context.payload.issue) { //assignee List for issues. - assigneesList = ["SuryanarayanaY", "sachinprasadhs"]; + assigneesList = ["mehtamansi29","sonali-kumari1", "sachinprasadhs"]; issueNumber = context.payload.issue.number; } else { //assignee List for PRs. From 6adc92f505e629b7e2a2dfe6140e7afb5a99d845 Mon Sep 17 00:00:00 2001 From: Abheesht Date: Mon, 3 Feb 2025 11:39:33 +0530 Subject: [PATCH 08/12] Remove `mask = None` (#2067) --- keras_hub/src/models/pali_gemma/pali_gemma_vit.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/keras_hub/src/models/pali_gemma/pali_gemma_vit.py b/keras_hub/src/models/pali_gemma/pali_gemma_vit.py index b217509541..621b4562de 100644 --- a/keras_hub/src/models/pali_gemma/pali_gemma_vit.py +++ b/keras_hub/src/models/pali_gemma/pali_gemma_vit.py @@ -204,9 +204,8 @@ def __init__( self.intermediate_dim = intermediate_dim def compute_attention(self, x, mask=None): - mask = None if mask is not None: - mask = ops.cast(mask, dtype=x.dtype) if mask is not None else None + mask = ops.cast(mask, dtype=x.dtype) return self.attn(x, attention_mask=mask)[0] def build(self, input_shape): From c6644576d0ad15f1f2424435f029d9a9eb801443 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 3 Feb 2025 09:26:34 -0800 Subject: [PATCH 09/12] Bump the python group with 2 updates (#2066) Bumps the python group with 2 updates: torch and torchvision. Updates `torch` from 2.5.1+cu121 to 2.6.0+cpu Updates `torchvision` from 0.20.1+cu121 to 0.21.0+cpu --- updated-dependencies: - dependency-name: torch dependency-type: direct:production update-type: version-update:semver-minor dependency-group: python - dependency-name: torchvision dependency-type: direct:production update-type: version-update:semver-minor dependency-group: python ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements-torch-cuda.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-torch-cuda.txt b/requirements-torch-cuda.txt index 2a601f0f20..a696dc2a85 100644 --- a/requirements-torch-cuda.txt +++ b/requirements-torch-cuda.txt @@ -4,8 +4,8 @@ tensorflow-text~=2.18 # Torch with cuda support. --extra-index-url https://download.pytorch.org/whl/cu121 -torch==2.5.1+cu121 -torchvision==0.20.1+cu121 +torch==2.6.0+cpu +torchvision==0.21.0+cpu # Jax cpu-only version. jax[cpu] From f157ff53e15a5d03ccd75230fa6adf8157ef250f Mon Sep 17 00:00:00 2001 From: Matt Watson <1389937+mattdangerw@users.noreply.github.com> Date: Mon, 3 Feb 2025 12:15:57 -0800 Subject: [PATCH 10/12] Make gemma inputs int32 same as other models (#2069) I don't think anything was broken because of this, but we should not have float32 inputs in this case. --- keras_hub/src/models/gemma/gemma_backbone.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/keras_hub/src/models/gemma/gemma_backbone.py b/keras_hub/src/models/gemma/gemma_backbone.py index 93dea30199..22ca535c3d 100644 --- a/keras_hub/src/models/gemma/gemma_backbone.py +++ b/keras_hub/src/models/gemma/gemma_backbone.py @@ -148,10 +148,10 @@ def __init__( # === Functional Model === token_id_input = keras.Input( - shape=(None,), dtype="float32", name="token_ids" + shape=(None,), dtype="int32", name="token_ids" ) padding_mask_input = keras.Input( - shape=(None,), dtype="float32", name="padding_mask" + shape=(None,), dtype="int32", name="padding_mask" ) x = self.token_embedding(token_id_input) x = x * ops.cast(ops.sqrt(hidden_dim), x.dtype) From 4ecbadf9917e4e90dbadb9c485a104b63effe83e Mon Sep 17 00:00:00 2001 From: Siva Sravana Kumar Neeli <113718461+sineeli@users.noreply.github.com> Date: Mon, 3 Feb 2025 15:12:18 -0800 Subject: [PATCH 11/12] Vit bug (#2070) * add missing dropout_rate argument to MLP layer * dropout layer missing as per official implementation --- keras_hub/src/models/vit/vit_layers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/keras_hub/src/models/vit/vit_layers.py b/keras_hub/src/models/vit/vit_layers.py index f3509440d5..473cb9cb66 100644 --- a/keras_hub/src/models/vit/vit_layers.py +++ b/keras_hub/src/models/vit/vit_layers.py @@ -65,6 +65,7 @@ def build(self, input_shape): def call(self, inputs): x = self.dense_1(inputs) + x = self.dropout(x) x = self.dense_2(x) out = self.dropout(x) return out @@ -257,6 +258,7 @@ def build(self, input_shape): hidden_dim=self.hidden_dim, mlp_dim=self.mlp_dim, use_bias=self.use_mlp_bias, + dropout_rate=self.dropout_rate, name="mlp", dtype=self.dtype_policy, ) From a80ea2817a1a00f30641b01b5ffb7a6a9d187f12 Mon Sep 17 00:00:00 2001 From: balanprasanth <112931254+balanprasanth@users.noreply.github.com> Date: Wed, 5 Feb 2025 23:17:45 +0530 Subject: [PATCH 12/12] Update auto-assignment.js (#2065) Update issue assignee username --- .github/workflows/scripts/auto-assignment.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/scripts/auto-assignment.js b/.github/workflows/scripts/auto-assignment.js index d47805aca9..d9e7956426 100644 --- a/.github/workflows/scripts/auto-assignment.js +++ b/.github/workflows/scripts/auto-assignment.js @@ -12,7 +12,7 @@ module.exports = async ({ github, context }) => { // Is this an issue? If so, assign the issue number. Otherwise, assign the PR number. 
if (context.payload.issue) { //assignee List for issues. - assigneesList = ["mehtamansi29","sonali-kumari1", "sachinprasadhs"]; + assigneesList = ["mehtamansi29", "sonali-kumari1", "dhantule", "sachinprasadhs"]; issueNumber = context.payload.issue.number; } else { //assignee List for PRs.