Merge branch 'keras-team:master' into port_mobilenet
pkgoogle authored Feb 5, 2025
2 parents ef9cb73 + a80ea28 commit fe34e36
Showing 25 changed files with 195 additions and 53 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/scripts/auto-assignment.js
@@ -12,7 +12,7 @@ module.exports = async ({ github, context }) => {
// Is this an issue? If so, assign the issue number. Otherwise, assign the PR number.
if (context.payload.issue) {
//assignee List for issues.
assigneesList = ["SuryanarayanaY", "sachinprasadhs"];
assigneesList = ["mehtamansi29", "sonali-kumari1", "dhantule", "sachinprasadhs"];
issueNumber = context.payload.issue.number;
} else {
//assignee List for PRs.
2 changes: 1 addition & 1 deletion README.md
@@ -78,7 +78,7 @@ print(keras_hub.utils.decode_imagenet_predictions(preds))
Load a Bert model and fine-tune it on IMDb movie reviews:

```python
classifier = keras_hub.models.BertClassifier.from_preset(
classifier = keras_hub.models.TextClassifier.from_preset(
"bert_base_en_uncased",
activation="softmax",
num_classes=2,
10 changes: 10 additions & 0 deletions keras_hub/src/layers/preprocessing/image_converter.py
@@ -98,6 +98,7 @@ def __init__(
scale=None,
offset=None,
crop_to_aspect_ratio=True,
pad_to_aspect_ratio=False,
interpolation="bilinear",
data_format=None,
**kwargs,
@@ -112,12 +113,19 @@

super().__init__(**kwargs)

if crop_to_aspect_ratio and pad_to_aspect_ratio:
raise ValueError(
"Only one of 'crop_to_aspect_ratio' or 'pad_to_aspect_ratio' "
"can be True."
)

# Create the `Resizing` layer here even if it's not being used. That
# allows us to make `image_size` a settable property.
self.resizing = keras.layers.Resizing(
height=image_size[0] if image_size else None,
width=image_size[1] if image_size else None,
crop_to_aspect_ratio=crop_to_aspect_ratio,
pad_to_aspect_ratio=pad_to_aspect_ratio,
interpolation=interpolation,
data_format=data_format,
dtype=self.dtype_policy,
@@ -126,6 +134,7 @@ def __init__(
self.scale = scale
self.offset = offset
self.crop_to_aspect_ratio = crop_to_aspect_ratio
self.pad_to_aspect_ratio = pad_to_aspect_ratio
self.interpolation = interpolation
self.data_format = standardize_data_format(data_format)

@@ -182,6 +191,7 @@ def get_config(self):
"offset": self.offset,
"interpolation": self.interpolation,
"crop_to_aspect_ratio": self.crop_to_aspect_ratio,
"pad_to_aspect_ratio": self.pad_to_aspect_ratio,
}
)
return config
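A minimal usage sketch of the new flag, grounded in the test added below (shapes and scale values here are illustrative, and the import uses the source path shown in this diff): padding to the target aspect ratio instead of center-cropping, plus the new guard that rejects enabling both behaviors at once.

```python
import numpy as np

from keras_hub.src.layers.preprocessing.image_converter import ImageConverter

# Letterbox-pad non-square images to the target aspect ratio, then resize.
converter = ImageConverter(
    image_size=(4, 4),
    scale=1.0 / 255.0,
    crop_to_aspect_ratio=False,
    pad_to_aspect_ratio=True,
)
images = np.ones((2, 10, 16, 3), dtype="float32") * 128
print(converter(images).shape)  # (2, 4, 4, 3)

# Enabling both behaviors at once now raises a ValueError.
try:
    ImageConverter(
        image_size=(4, 4),
        scale=1.0 / 255.0,
        crop_to_aspect_ratio=True,
        pad_to_aspect_ratio=True,
    )
except ValueError as err:
    print(err)  # "Only one of 'crop_to_aspect_ratio' or 'pad_to_aspect_ratio' ..."
```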
23 changes: 22 additions & 1 deletion keras_hub/src/layers/preprocessing/image_converter_test.py
@@ -1,8 +1,10 @@
import os
import pathlib

import keras
import numpy as np
import pytest
from absl.testing import parameterized
from keras import ops

from keras_hub.src.layers.preprocessing.image_converter import ImageConverter
@@ -33,11 +35,21 @@ def test_unbatched(self):
self.assertAllClose(outputs[:, :, 1], np.ones((4, 4)) * 0.301569)
self.assertAllClose(outputs[:, :, 2], np.ones((4, 4)) * 0.852353)

def test_resize_batch(self):
@parameterized.parameters(
(True, False),
(False, True),
)
@pytest.mark.skipif(
keras.config.backend() == "torch",
reason="disabled until resize is fixed for torch backend",
) # TODO: remove skip after new release with fix of https://github.com/keras-team/keras/pull/20797
def test_resize_batch(self, crop_to_aspect_ratio, pad_to_aspect_ratio):
converter = ImageConverter(
image_size=(4, 4),
scale=(1.0 / 255.0, 0.8 / 255.0, 1.2 / 255.0),
offset=(0.2, -0.1, 0.25),
crop_to_aspect_ratio=crop_to_aspect_ratio,
pad_to_aspect_ratio=pad_to_aspect_ratio,
)
inputs = np.ones((2, 10, 10, 3)) * 128
outputs = converter(inputs)
@@ -46,6 +58,15 @@ def test_resize_batch(self):
self.assertAllClose(outputs[:, :, :, 1], np.ones((2, 4, 4)) * 0.301569)
self.assertAllClose(outputs[:, :, :, 2], np.ones((2, 4, 4)) * 0.852353)

def test_pad_and_crop_to_aspect_ratio(self):
with self.assertRaisesRegex(ValueError, "Only one of"):
_ = ImageConverter(
image_size=(4, 4),
scale=1 / 255.0,
crop_to_aspect_ratio=True,
pad_to_aspect_ratio=True,
)

def test_config(self):
converter = ImageConverter(
image_size=(12, 20),
@@ -148,7 +148,7 @@ def test_config(self):
"vocabulary_size": self.vocabulary_size,
"unselectable_token_ids": unselectable_token_ids,
}
self.assertDictContainsSubset(expected_config, config)
self.assertEqual(config, {**config, **expected_config})

# Test cloned masked_lm_masker can be run.
cloned_masked_lm_masker = MaskedLMMaskGenerator.from_config(config)
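The new assertion reproduces the subset semantics of the removed `assertDictContainsSubset` helper (long deprecated in `unittest` and dropped in recent Python releases): merging the expected items back into the actual config is a no-op exactly when every expected key/value pair is already present. A small illustrative check with made-up keys:

```python
config = {"mask_selection_rate": 0.15, "mask_token_id": 103, "extra_key": 1}
expected_config = {"mask_selection_rate": 0.15, "mask_token_id": 103}

# Equivalent to the old assertDictContainsSubset(expected_config, config):
assert config == {**config, **expected_config}

# A mismatched value breaks the equality, so the assertion would fail.
assert config != {**config, **{"mask_token_id": 0}}
```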
16 changes: 15 additions & 1 deletion keras_hub/src/models/basnet/basnet_presets.py
@@ -1,3 +1,17 @@
"""BASNet model preset configurations."""

basnet_presets = {}
basnet_presets = {
"basnet_duts": {
"metadata": {
"description": (
"BASNet model with a 34-layer ResNet backbone, pre-trained "
"on the DUTS image dataset at a 288x288 resolution. Model "
"training was performed by Hamid Ali "
"(https://github.com/hamidriasat/BASNet)."
),
"params": 108886792,
"path": "basnet",
},
"kaggle_handle": "kaggle://keras/basnet/keras/base1",
},
}
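Once the preset above is published under the listed Kaggle handle, loading it would plausibly look like the sketch below. The task class name comes from the test file later in this diff; treat the `keras_hub.models` export as an assumption rather than a documented example.

```python
import keras_hub

# Load the DUTS-trained BASNet segmenter from the new preset.
segmenter = keras_hub.models.BASNetImageSegmenter.from_preset("basnet_duts")
# Per the metadata above, the weights were trained at 288x288 resolution.
```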
1 change: 0 additions & 1 deletion keras_hub/src/models/basnet/basnet_test.py
@@ -54,7 +54,6 @@ def test_end_to_end_model_predict(self):
output = model.predict(self.images)
self.assertAllEqual(output.shape, (2, 64, 64, 1))

@pytest.mark.skip(reason="disabled until preset's been uploaded to Kaggle")
@pytest.mark.extra_large
def test_all_presets(self):
for preset in BASNetImageSegmenter.presets:
4 changes: 2 additions & 2 deletions keras_hub/src/models/efficientnet/cba_test.py
@@ -10,13 +10,13 @@ def test_same_input_output_shapes(self):
layer = CBABlock(input_filters=32, output_filters=32)

output = layer(inputs)
self.assertEquals(output.shape, (1, 64, 64, 32))
self.assertEqual(output.shape, (1, 64, 64, 32))
self.assertLen(output, 1)

def test_different_input_output_shapes(self):
inputs = keras.random.normal(shape=(1, 64, 64, 32), dtype="float32")
layer = CBABlock(input_filters=32, output_filters=48)

output = layer(inputs)
self.assertEquals(output.shape, (1, 64, 64, 48))
self.assertEqual(output.shape, (1, 64, 64, 48))
self.assertLen(output, 1)
12 changes: 6 additions & 6 deletions keras_hub/src/models/efficientnet/efficientnet_backbone_test.py
@@ -87,24 +87,24 @@ def test_feature_pyramid_outputs(self):
height = width = 256
outputs = model(keras.ops.ones(shape=(batch_size, height, width, 3)))
levels = ["P1", "P2", "P3", "P4", "P5"]
self.assertEquals(list(outputs.keys()), levels)
self.assertEquals(
self.assertEqual(list(outputs.keys()), levels)
self.assertEqual(
outputs["P1"].shape,
(batch_size, height // 2**1, width // 2**1, 24),
)
self.assertEquals(
self.assertEqual(
outputs["P2"].shape,
(batch_size, height // 2**2, width // 2**2, 48),
)
self.assertEquals(
self.assertEqual(
outputs["P3"].shape,
(batch_size, height // 2**3, width // 2**3, 64),
)
self.assertEquals(
self.assertEqual(
outputs["P4"].shape,
(batch_size, height // 2**4, width // 2**4, 160),
)
self.assertEquals(
self.assertEqual(
outputs["P5"].shape,
(batch_size, height // 2**5, width // 2**5, 1280),
)
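The expected shapes in these assertions follow directly from each pyramid level's stride (level `Pk` downsamples the input by `2**k`); a quick illustrative recomputation:

```python
height = width = 256
level_channels = {"P1": 24, "P2": 48, "P3": 64, "P4": 160, "P5": 1280}
for level, channels in level_channels.items():
    stride = 2 ** int(level[1:])
    print(level, (height // stride, width // stride, channels))
# P1 (128, 128, 24), P2 (64, 64, 48), ..., P5 (8, 8, 1280)
```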
6 changes: 3 additions & 3 deletions keras_hub/src/models/efficientnet/fusedmbconv_test.py
@@ -10,15 +10,15 @@ def test_same_input_output_shapes(self):
layer = FusedMBConvBlock(input_filters=32, output_filters=32)

output = layer(inputs)
self.assertEquals(output.shape, (1, 64, 64, 32))
self.assertEqual(output.shape, (1, 64, 64, 32))
self.assertLen(output, 1)

def test_different_input_output_shapes(self):
inputs = keras.random.normal(shape=(1, 64, 64, 32), dtype="float32")
layer = FusedMBConvBlock(input_filters=32, output_filters=48)

output = layer(inputs)
self.assertEquals(output.shape, (1, 64, 64, 48))
self.assertEqual(output.shape, (1, 64, 64, 48))
self.assertLen(output, 1)

def test_squeeze_excitation_ratio(self):
@@ -28,5 +28,5 @@ def test_squeeze_excitation_ratio(self):
)

output = layer(inputs)
self.assertEquals(output.shape, (1, 64, 64, 48))
self.assertEqual(output.shape, (1, 64, 64, 48))
self.assertLen(output, 1)
6 changes: 3 additions & 3 deletions keras_hub/src/models/efficientnet/mbconv_test.py
@@ -10,21 +10,21 @@ def test_same_input_output_shapes(self):
layer = MBConvBlock(input_filters=32, output_filters=32)

output = layer(inputs)
self.assertEquals(output.shape, (1, 64, 64, 32))
self.assertEqual(output.shape, (1, 64, 64, 32))
self.assertLen(output, 1)

def test_different_input_output_shapes(self):
inputs = keras.random.normal(shape=(1, 64, 64, 32), dtype="float32")
layer = MBConvBlock(input_filters=32, output_filters=48)

output = layer(inputs)
self.assertEquals(output.shape, (1, 64, 64, 48))
self.assertEqual(output.shape, (1, 64, 64, 48))
self.assertLen(output, 1)

def test_squeeze_excitation_ratio(self):
inputs = keras.random.normal(shape=(1, 64, 64, 32), dtype="float32")
layer = MBConvBlock(input_filters=32, output_filters=48, se_ratio=0.25)

output = layer(inputs)
self.assertEquals(output.shape, (1, 64, 64, 48))
self.assertEqual(output.shape, (1, 64, 64, 48))
self.assertLen(output, 1)
9 changes: 6 additions & 3 deletions keras_hub/src/models/falcon/falcon_attention.py
@@ -110,16 +110,19 @@ def call(

attention_scores = ops.einsum("bqnh,bknh->bnqk", query, key)
attention_scores = ops.add(attention_scores, alibi)
attention_scores = (
attention_scores * self.inv_norm_factor
) # [batch_size, num_heads, query_length, kv_length]
# [batch_size, num_heads, query_length, kv_length]
attention_scores = ops.multiply(
attention_scores,
ops.cast(self.inv_norm_factor, self.compute_dtype),
)
attention_scores = self.softmax(
attention_scores, ops.expand_dims(attention_mask, 1)
)
attention_scores = self.attention_dropout(attention_scores)
attention_output = ops.einsum(
"bnqk,bknh->bqnh", attention_scores, value
)

attention_output = ops.reshape(
attention_output,
[batch_size, seq_length, self.num_heads * self.head_dim],
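The rewritten scaling computes the same values; the difference is that the Python-float `inv_norm_factor` is now cast explicitly to the layer's compute dtype instead of relying on the backend's scalar-promotion rules. A standalone sketch of the pattern (shapes, the head size, and the dtype are illustrative):

```python
import math

import numpy as np
from keras import ops

head_dim = 64
inv_norm_factor = 1.0 / math.sqrt(head_dim)
compute_dtype = "float16"

# [batch_size, num_heads, query_length, kv_length], in half precision.
attention_scores = ops.convert_to_tensor(
    np.random.rand(2, 8, 4, 4).astype(compute_dtype)
)
scaled = ops.multiply(
    attention_scores, ops.cast(inv_norm_factor, compute_dtype)
)
print(scaled.dtype)  # float16 (exact repr depends on the backend)
```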
4 changes: 2 additions & 2 deletions keras_hub/src/models/gemma/gemma_backbone.py
@@ -148,10 +148,10 @@ def __init__(

# === Functional Model ===
token_id_input = keras.Input(
shape=(None,), dtype="float32", name="token_ids"
shape=(None,), dtype="int32", name="token_ids"
)
padding_mask_input = keras.Input(
shape=(None,), dtype="float32", name="padding_mask"
shape=(None,), dtype="int32", name="padding_mask"
)
x = self.token_embedding(token_id_input)
x = x * ops.cast(ops.sqrt(hidden_dim), x.dtype)
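With the functional inputs declared as int32, callers feed integer arrays directly. A tiny illustrative sketch using the source-path import; the backbone hyperparameters below are deliberately small made-up values, not a real Gemma configuration:

```python
import numpy as np

from keras_hub.src.models.gemma.gemma_backbone import GemmaBackbone

backbone = GemmaBackbone(
    vocabulary_size=256,
    num_layers=2,
    num_query_heads=4,
    num_key_value_heads=1,
    hidden_dim=32,
    intermediate_dim=64,
    head_dim=8,
)
token_ids = np.random.randint(0, 256, size=(1, 12), dtype="int32")
padding_mask = np.ones((1, 12), dtype="int32")
hidden_states = backbone(
    {"token_ids": token_ids, "padding_mask": padding_mask}
)
print(hidden_states.shape)  # (1, 12, 32)
```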
29 changes: 23 additions & 6 deletions keras_hub/src/models/gpt_neo_x/gpt_neo_x_attention.py
@@ -1,8 +1,11 @@
import math

import keras
from keras import ops

from keras_hub.src.layers.modeling.rotary_embedding import RotaryEmbedding
from keras_hub.src.utils.keras_utils import clone_initializer
from keras_hub.src.utils.keras_utils import has_flash_attention_support


class GPTNeoXAttention(keras.layers.Layer):
@@ -58,6 +61,8 @@ def __init__(
self.bias_initializer = keras.initializers.get(bias_initializer)
self.max_sequence_length = max_sequence_length

self._inv_norm_factor = 1.0 / math.sqrt(self.attn_head_size)

def build(self, input_shape):
self._qkv_dense = keras.layers.EinsumDense(
equation="abc,cde->abde",
@@ -120,14 +125,26 @@ def _masked_softmax(self, attention_scores, attention_mask=None):
def _compute_attention(
self, query, key, value, attention_mask=None, training=None
):
attention_scores = ops.einsum("aecd,abcd->acbe", key, query)
if has_flash_attention_support() and self.dropout == 0:
# Use `dot_product_attention` with Flash Attention support if
# available.
if attention_mask is not None:
attention_mask = ops.expand_dims(attention_mask, axis=1)
attention_mask = ops.cast(attention_mask, dtype="bool")
attention_output = ops.dot_product_attention(
query,
key,
value,
mask=attention_mask,
scale=self._inv_norm_factor,
)
return attention_output

norm_factor = ops.sqrt(
ops.convert_to_tensor(self.attn_head_size, self.compute_dtype)
attention_scores = ops.einsum("aecd,abcd->acbe", key, query)
attention_scores = ops.multiply(
attention_scores,
ops.cast(self._inv_norm_factor, self.compute_dtype),
)

attention_scores /= norm_factor

attention_scores = self._masked_softmax(
attention_scores, attention_mask
)
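For reference, a standalone sketch of the `ops.dot_product_attention` call the new fast path relies on; the layer only takes this branch when `has_flash_attention_support()` reports support and dropout is zero. Shapes below are illustrative, and the op computes standard scaled dot-product attention when no fused kernel is available.

```python
import numpy as np
from keras import ops

batch, q_len, kv_len, num_heads, head_dim = 2, 4, 6, 8, 16
query = ops.convert_to_tensor(
    np.random.rand(batch, q_len, num_heads, head_dim).astype("float32")
)
key = ops.convert_to_tensor(
    np.random.rand(batch, kv_len, num_heads, head_dim).astype("float32")
)
value = ops.convert_to_tensor(
    np.random.rand(batch, kv_len, num_heads, head_dim).astype("float32")
)

# A (batch, q_len, kv_len) padding mask, expanded over the head axis and
# cast to bool, mirroring the handling in the layer above.
mask = ops.expand_dims(
    ops.convert_to_tensor(np.ones((batch, q_len, kv_len), dtype=bool)), axis=1
)

output = ops.dot_product_attention(
    query, key, value, mask=mask, scale=1.0 / np.sqrt(head_dim)
)
print(output.shape)  # (2, 4, 8, 16)
```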
25 changes: 23 additions & 2 deletions keras_hub/src/models/llama/llama_attention.py
@@ -1,8 +1,11 @@
import math

import keras
from keras import ops

from keras_hub.src.layers.modeling.rotary_embedding import RotaryEmbedding
from keras_hub.src.utils.keras_utils import clone_initializer
from keras_hub.src.utils.keras_utils import has_flash_attention_support


class LlamaAttention(keras.layers.Layer):
@@ -43,7 +46,7 @@ def build(self, inputs_shape):
# h = head dim
hidden_dim = inputs_shape[-1]
head_dim = hidden_dim // self.num_query_heads
self._norm_factor = ops.sqrt(ops.cast(head_dim, self.compute_dtype))
self._inv_norm_factor = 1.0 / math.sqrt(head_dim)

self._query_dense = keras.layers.EinsumDense(
equation="bqm,muh->bquh",
@@ -182,9 +185,27 @@ def _masked_softmax(self, attention_scores, attention_mask=None):
return self._softmax(attention_scores)

def _compute_attention(self, query, key, value, attention_mask=None):
if has_flash_attention_support():
# Use `dot_product_attention` with Flash Attention support if
# available.
if attention_mask is not None:
attention_mask = ops.expand_dims(attention_mask, axis=1)
attention_mask = ops.cast(attention_mask, dtype="bool")
attention_output = ops.dot_product_attention(
query,
key,
value,
mask=attention_mask,
scale=self._inv_norm_factor,
)
return attention_output

attention_scores = ops.einsum(self._dot_product_equation, query, key)

attention_scores = attention_scores / self._norm_factor
attention_scores = ops.multiply(
attention_scores,
ops.cast(self._inv_norm_factor, self.compute_dtype),
)
attention_scores = self._masked_softmax(
attention_scores, attention_mask
)
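The non-flash branch here is a pure refactor of the scaling: dividing by a tensor-valued `sqrt(head_dim)` and multiplying by the precomputed Python-float reciprocal produce the same scores up to floating-point noise. A quick illustrative check (head size and shapes are made up):

```python
import math

import numpy as np
from keras import ops

head_dim = 64
scores = ops.convert_to_tensor(np.random.rand(2, 4, 4, 8).astype("float32"))

old_style = scores / ops.sqrt(ops.cast(head_dim, "float32"))
new_style = ops.multiply(
    scores, ops.cast(1.0 / math.sqrt(head_dim), "float32")
)

np.testing.assert_allclose(
    ops.convert_to_numpy(old_style), ops.convert_to_numpy(new_style), rtol=1e-6
)
```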