From 6f87cb8a1921bdcd60e5941fe8a500d2941bb948 Mon Sep 17 00:00:00 2001 From: Laxma Reddy Patlolla Date: Tue, 21 Jan 2025 11:05:45 -0800 Subject: [PATCH 01/12] Kaggle presets path update (#2052) --- keras_hub/src/models/basnet/basnet_presets.py | 16 +++++++++++++++- keras_hub/src/models/basnet/basnet_test.py | 1 - 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/keras_hub/src/models/basnet/basnet_presets.py b/keras_hub/src/models/basnet/basnet_presets.py index 3d96ab7885..1e1ee8e8ee 100644 --- a/keras_hub/src/models/basnet/basnet_presets.py +++ b/keras_hub/src/models/basnet/basnet_presets.py @@ -1,3 +1,17 @@ """BASNet model preset configurations.""" -basnet_presets = {} +basnet_presets = { + "basnet_duts": { + "metadata": { + "description": ( + "BASNet model with a 34-layer ResNet backbone, pre-trained " + "on the DUTS image dataset at a 288x288 resolution. Model " + "training was performed by Hamid Ali " + "(https://github.com/hamidriasat/BASNet)." + ), + "params": 108886792, + "path": "basnet", + }, + "kaggle_handle": "kaggle://keras/basnet/keras/base1", + }, +} diff --git a/keras_hub/src/models/basnet/basnet_test.py b/keras_hub/src/models/basnet/basnet_test.py index 4147d43c7c..b5bbe405e2 100644 --- a/keras_hub/src/models/basnet/basnet_test.py +++ b/keras_hub/src/models/basnet/basnet_test.py @@ -54,7 +54,6 @@ def test_end_to_end_model_predict(self): output = model.predict(self.images) self.assertAllEqual(output.shape, (2, 64, 64, 1)) - @pytest.mark.skip(reason="disabled until preset's been uploaded to Kaggle") @pytest.mark.extra_large def test_all_presets(self): for preset in BASNetImageSegmenter.presets: From 96b2fe5bce5fef2b6405a78b8c6e0ba14a2f2f4b Mon Sep 17 00:00:00 2001 From: Matt Watson <1389937+mattdangerw@users.noreply.github.com> Date: Tue, 21 Jan 2025 13:50:37 -0800 Subject: [PATCH 02/12] Update asserts to avoid deprecated methods (#2053) --- .../preprocessing/masked_lm_mask_generator_test.py | 2 +- keras_hub/src/models/efficientnet/cba_test.py | 4 ++-- .../efficientnet/efficientnet_backbone_test.py | 12 ++++++------ .../src/models/efficientnet/fusedmbconv_test.py | 6 +++--- keras_hub/src/models/efficientnet/mbconv_test.py | 6 +++--- keras_hub/src/tests/test_case.py | 2 +- 6 files changed, 16 insertions(+), 16 deletions(-) diff --git a/keras_hub/src/layers/preprocessing/masked_lm_mask_generator_test.py b/keras_hub/src/layers/preprocessing/masked_lm_mask_generator_test.py index f3e58d3133..6b03f54d11 100644 --- a/keras_hub/src/layers/preprocessing/masked_lm_mask_generator_test.py +++ b/keras_hub/src/layers/preprocessing/masked_lm_mask_generator_test.py @@ -148,7 +148,7 @@ def test_config(self): "vocabulary_size": self.vocabulary_size, "unselectable_token_ids": unselectable_token_ids, } - self.assertDictContainsSubset(expected_config, config) + self.assertEqual(config, {**config, **expected_config}) # Test cloned masked_lm_masker can be run. 
cloned_masked_lm_masker = MaskedLMMaskGenerator.from_config(config) diff --git a/keras_hub/src/models/efficientnet/cba_test.py b/keras_hub/src/models/efficientnet/cba_test.py index ec028b1239..e9ea31ccbe 100644 --- a/keras_hub/src/models/efficientnet/cba_test.py +++ b/keras_hub/src/models/efficientnet/cba_test.py @@ -10,7 +10,7 @@ def test_same_input_output_shapes(self): layer = CBABlock(input_filters=32, output_filters=32) output = layer(inputs) - self.assertEquals(output.shape, (1, 64, 64, 32)) + self.assertEqual(output.shape, (1, 64, 64, 32)) self.assertLen(output, 1) def test_different_input_output_shapes(self): @@ -18,5 +18,5 @@ def test_different_input_output_shapes(self): layer = CBABlock(input_filters=32, output_filters=48) output = layer(inputs) - self.assertEquals(output.shape, (1, 64, 64, 48)) + self.assertEqual(output.shape, (1, 64, 64, 48)) self.assertLen(output, 1) diff --git a/keras_hub/src/models/efficientnet/efficientnet_backbone_test.py b/keras_hub/src/models/efficientnet/efficientnet_backbone_test.py index c11e636540..1f54f71925 100644 --- a/keras_hub/src/models/efficientnet/efficientnet_backbone_test.py +++ b/keras_hub/src/models/efficientnet/efficientnet_backbone_test.py @@ -87,24 +87,24 @@ def test_feature_pyramid_outputs(self): height = width = 256 outputs = model(keras.ops.ones(shape=(batch_size, height, width, 3))) levels = ["P1", "P2", "P3", "P4", "P5"] - self.assertEquals(list(outputs.keys()), levels) - self.assertEquals( + self.assertEqual(list(outputs.keys()), levels) + self.assertEqual( outputs["P1"].shape, (batch_size, height // 2**1, width // 2**1, 24), ) - self.assertEquals( + self.assertEqual( outputs["P2"].shape, (batch_size, height // 2**2, width // 2**2, 48), ) - self.assertEquals( + self.assertEqual( outputs["P3"].shape, (batch_size, height // 2**3, width // 2**3, 64), ) - self.assertEquals( + self.assertEqual( outputs["P4"].shape, (batch_size, height // 2**4, width // 2**4, 160), ) - self.assertEquals( + self.assertEqual( outputs["P5"].shape, (batch_size, height // 2**5, width // 2**5, 1280), ) diff --git a/keras_hub/src/models/efficientnet/fusedmbconv_test.py b/keras_hub/src/models/efficientnet/fusedmbconv_test.py index b12f729ddc..a3049dc462 100644 --- a/keras_hub/src/models/efficientnet/fusedmbconv_test.py +++ b/keras_hub/src/models/efficientnet/fusedmbconv_test.py @@ -10,7 +10,7 @@ def test_same_input_output_shapes(self): layer = FusedMBConvBlock(input_filters=32, output_filters=32) output = layer(inputs) - self.assertEquals(output.shape, (1, 64, 64, 32)) + self.assertEqual(output.shape, (1, 64, 64, 32)) self.assertLen(output, 1) def test_different_input_output_shapes(self): @@ -18,7 +18,7 @@ def test_different_input_output_shapes(self): layer = FusedMBConvBlock(input_filters=32, output_filters=48) output = layer(inputs) - self.assertEquals(output.shape, (1, 64, 64, 48)) + self.assertEqual(output.shape, (1, 64, 64, 48)) self.assertLen(output, 1) def test_squeeze_excitation_ratio(self): @@ -28,5 +28,5 @@ def test_squeeze_excitation_ratio(self): ) output = layer(inputs) - self.assertEquals(output.shape, (1, 64, 64, 48)) + self.assertEqual(output.shape, (1, 64, 64, 48)) self.assertLen(output, 1) diff --git a/keras_hub/src/models/efficientnet/mbconv_test.py b/keras_hub/src/models/efficientnet/mbconv_test.py index ea92c7a9c6..b1085770d4 100644 --- a/keras_hub/src/models/efficientnet/mbconv_test.py +++ b/keras_hub/src/models/efficientnet/mbconv_test.py @@ -10,7 +10,7 @@ def test_same_input_output_shapes(self): layer = MBConvBlock(input_filters=32, 
output_filters=32) output = layer(inputs) - self.assertEquals(output.shape, (1, 64, 64, 32)) + self.assertEqual(output.shape, (1, 64, 64, 32)) self.assertLen(output, 1) def test_different_input_output_shapes(self): @@ -18,7 +18,7 @@ def test_different_input_output_shapes(self): layer = MBConvBlock(input_filters=32, output_filters=48) output = layer(inputs) - self.assertEquals(output.shape, (1, 64, 64, 48)) + self.assertEqual(output.shape, (1, 64, 64, 48)) self.assertLen(output, 1) def test_squeeze_excitation_ratio(self): @@ -26,5 +26,5 @@ def test_squeeze_excitation_ratio(self): layer = MBConvBlock(input_filters=32, output_filters=48, se_ratio=0.25) output = layer(inputs) - self.assertEquals(output.shape, (1, 64, 64, 48)) + self.assertEqual(output.shape, (1, 64, 64, 48)) self.assertLen(output, 1) diff --git a/keras_hub/src/tests/test_case.py b/keras_hub/src/tests/test_case.py index 8053fff63b..54155e0517 100644 --- a/keras_hub/src/tests/test_case.py +++ b/keras_hub/src/tests/test_case.py @@ -479,7 +479,7 @@ def run_backbone_test( # Check name maps to classname. name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", cls.__name__) name = re.sub("([a-z])([A-Z])", r"\1_\2", name).lower() - self.assertRegexpMatches(backbone.name, name) + self.assertRegex(backbone.name, name) # Check mixed precision. if run_mixed_precision_check: From 221ea6b44984bd8e42b281d9ad659123e4453496 Mon Sep 17 00:00:00 2001 From: Siva Sravana Kumar Neeli <113718461+sineeli@users.noreply.github.com> Date: Mon, 27 Jan 2025 16:47:12 -0800 Subject: [PATCH 03/12] Add `pad_to_aspect_ratio` flag to ImageConverter (#2045) * Add `pad_to_aspect_ratio` flag to ImageConverter * skip resize test with pad_to_aspect_ratio when backend set to torch * nit --- .../layers/preprocessing/image_converter.py | 10 ++++++++ .../preprocessing/image_converter_test.py | 23 ++++++++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/keras_hub/src/layers/preprocessing/image_converter.py b/keras_hub/src/layers/preprocessing/image_converter.py index db1b4b7756..2edeffb666 100644 --- a/keras_hub/src/layers/preprocessing/image_converter.py +++ b/keras_hub/src/layers/preprocessing/image_converter.py @@ -98,6 +98,7 @@ def __init__( scale=None, offset=None, crop_to_aspect_ratio=True, + pad_to_aspect_ratio=False, interpolation="bilinear", data_format=None, **kwargs, @@ -112,12 +113,19 @@ def __init__( super().__init__(**kwargs) + if crop_to_aspect_ratio and pad_to_aspect_ratio: + raise ValueError( + "Only one of 'crop_to_aspect_ratio' or 'pad_to_aspect_ratio' " + "can be True." + ) + # Create the `Resizing` layer here even if it's not being used. That # allows us to make `image_size` a settable property. 
self.resizing = keras.layers.Resizing( height=image_size[0] if image_size else None, width=image_size[1] if image_size else None, crop_to_aspect_ratio=crop_to_aspect_ratio, + pad_to_aspect_ratio=pad_to_aspect_ratio, interpolation=interpolation, data_format=data_format, dtype=self.dtype_policy, @@ -126,6 +134,7 @@ def __init__( self.scale = scale self.offset = offset self.crop_to_aspect_ratio = crop_to_aspect_ratio + self.pad_to_aspect_ratio = pad_to_aspect_ratio self.interpolation = interpolation self.data_format = standardize_data_format(data_format) @@ -182,6 +191,7 @@ def get_config(self): "offset": self.offset, "interpolation": self.interpolation, "crop_to_aspect_ratio": self.crop_to_aspect_ratio, + "pad_to_aspect_ratio": self.pad_to_aspect_ratio, } ) return config diff --git a/keras_hub/src/layers/preprocessing/image_converter_test.py b/keras_hub/src/layers/preprocessing/image_converter_test.py index d638ccf9ab..1fdc97e031 100644 --- a/keras_hub/src/layers/preprocessing/image_converter_test.py +++ b/keras_hub/src/layers/preprocessing/image_converter_test.py @@ -1,8 +1,10 @@ import os import pathlib +import keras import numpy as np import pytest +from absl.testing import parameterized from keras import ops from keras_hub.src.layers.preprocessing.image_converter import ImageConverter @@ -33,11 +35,21 @@ def test_unbatched(self): self.assertAllClose(outputs[:, :, 1], np.ones((4, 4)) * 0.301569) self.assertAllClose(outputs[:, :, 2], np.ones((4, 4)) * 0.852353) - def test_resize_batch(self): + @parameterized.parameters( + (True, False), + (False, True), + ) + @pytest.mark.skipif( + keras.config.backend() == "torch", + reason="disabled until resize is fixed for torch backend", + ) # TODO: remove skip after new release with fix of https://github.com/keras-team/keras/pull/20797 + def test_resize_batch(self, crop_to_aspect_ratio, pad_to_aspect_ratio): converter = ImageConverter( image_size=(4, 4), scale=(1.0 / 255.0, 0.8 / 255.0, 1.2 / 255.0), offset=(0.2, -0.1, 0.25), + crop_to_aspect_ratio=crop_to_aspect_ratio, + pad_to_aspect_ratio=pad_to_aspect_ratio, ) inputs = np.ones((2, 10, 10, 3)) * 128 outputs = converter(inputs) @@ -46,6 +58,15 @@ def test_resize_batch(self): self.assertAllClose(outputs[:, :, :, 1], np.ones((2, 4, 4)) * 0.301569) self.assertAllClose(outputs[:, :, :, 2], np.ones((2, 4, 4)) * 0.852353) + def test_pad_and_crop_to_aspect_ratio(self): + with self.assertRaisesRegex(ValueError, "Only one of"): + _ = ImageConverter( + image_size=(4, 4), + scale=1 / 255.0, + crop_to_aspect_ratio=True, + pad_to_aspect_ratio=True, + ) + def test_config(self): converter = ImageConverter( image_size=(12, 20), From 63863ab7bc41522c1c79a4e6cb748bd2f780b9f2 Mon Sep 17 00:00:00 2001 From: "Hongyu, Chiu" <20734616+james77777778@users.noreply.github.com> Date: Tue, 28 Jan 2025 11:10:54 +0800 Subject: [PATCH 04/12] Use Flash Attention if available (#2058) * Use Flash Attention if available * Torch's `dot_product_attention` doesn't support `bias`. 
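In short, the attention layers touched below now share one dispatch pattern:
call `ops.dot_product_attention` (which routes to Flash Attention when the
installed Keras and backend support it) and keep the explicit einsum + softmax
path as a fallback. A minimal standalone sketch of that pattern follows; it is
illustrative only (`compute_attention` and the `(batch, seq, heads, head_dim)`
tensor layout are assumptions for the sketch, not the exact library code):

    import math

    import keras
    from keras import ops


    def has_flash_attention_support():
        # Same check as the helper added to keras_hub/src/utils/keras_utils.py:
        # newer Keras versions expose `keras.config.is_flash_attention_enabled`.
        return hasattr(keras.config, "is_flash_attention_enabled")


    def compute_attention(query, key, value, attention_mask=None):
        # query/key/value: (batch, seq, num_heads, head_dim); attention_mask:
        # (batch, q_len, kv_len) with nonzero entries marking valid positions.
        inv_norm_factor = 1.0 / math.sqrt(query.shape[-1])
        if has_flash_attention_support():
            if attention_mask is not None:
                # `dot_product_attention` expects a boolean mask that
                # broadcasts over the heads axis.
                mask = ops.cast(ops.expand_dims(attention_mask, axis=1), "bool")
            else:
                mask = None
            return ops.dot_product_attention(
                query, key, value, mask=mask, scale=inv_norm_factor
            )
        # Fallback: explicit scaled dot-product attention.
        scores = ops.einsum("bquh,bkuh->buqk", query, key)
        scores = ops.multiply(scores, ops.cast(inv_norm_factor, scores.dtype))
        if attention_mask is not None:
            # Push masked-out positions to a large negative value before softmax.
            scores = ops.where(
                ops.cast(attention_mask[:, None, :, :], "bool"),
                scores,
                ops.full_like(scores, -1e9),
            )
        probs = ops.softmax(scores, axis=-1)
        return ops.einsum("buqk,bkuh->bquh", probs, value)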
--- .../src/models/falcon/falcon_attention.py | 9 ++++-- .../models/gpt_neo_x/gpt_neo_x_attention.py | 29 +++++++++++++++---- keras_hub/src/models/llama/llama_attention.py | 25 ++++++++++++++-- .../src/models/mistral/mistral_attention.py | 27 ++++++++++++++--- keras_hub/src/models/phi3/phi3_attention.py | 25 ++++++++++++++-- .../src/models/stable_diffusion_3/mmdit.py | 13 ++++----- keras_hub/src/utils/keras_utils.py | 7 +++++ 7 files changed, 110 insertions(+), 25 deletions(-) diff --git a/keras_hub/src/models/falcon/falcon_attention.py b/keras_hub/src/models/falcon/falcon_attention.py index b150a1ca89..48db9664ea 100644 --- a/keras_hub/src/models/falcon/falcon_attention.py +++ b/keras_hub/src/models/falcon/falcon_attention.py @@ -110,9 +110,11 @@ def call( attention_scores = ops.einsum("bqnh,bknh->bnqk", query, key) attention_scores = ops.add(attention_scores, alibi) - attention_scores = ( - attention_scores * self.inv_norm_factor - ) # [batch_size, num_heads, query_length, kv_length] + # [batch_size, num_heads, query_length, kv_length] + attention_scores = ops.multiply( + attention_scores, + ops.cast(self.inv_norm_factor, self.compute_dtype), + ) attention_scores = self.softmax( attention_scores, ops.expand_dims(attention_mask, 1) ) @@ -120,6 +122,7 @@ def call( attention_output = ops.einsum( "bnqk,bknh->bqnh", attention_scores, value ) + attention_output = ops.reshape( attention_output, [batch_size, seq_length, self.num_heads * self.head_dim], diff --git a/keras_hub/src/models/gpt_neo_x/gpt_neo_x_attention.py b/keras_hub/src/models/gpt_neo_x/gpt_neo_x_attention.py index 6824a641d4..a0db2c7836 100644 --- a/keras_hub/src/models/gpt_neo_x/gpt_neo_x_attention.py +++ b/keras_hub/src/models/gpt_neo_x/gpt_neo_x_attention.py @@ -1,8 +1,11 @@ +import math + import keras from keras import ops from keras_hub.src.layers.modeling.rotary_embedding import RotaryEmbedding from keras_hub.src.utils.keras_utils import clone_initializer +from keras_hub.src.utils.keras_utils import has_flash_attention_support class GPTNeoXAttention(keras.layers.Layer): @@ -58,6 +61,8 @@ def __init__( self.bias_initializer = keras.initializers.get(bias_initializer) self.max_sequence_length = max_sequence_length + self._inv_norm_factor = 1.0 / math.sqrt(self.attn_head_size) + def build(self, input_shape): self._qkv_dense = keras.layers.EinsumDense( equation="abc,cde->abde", @@ -120,14 +125,26 @@ def _masked_softmax(self, attention_scores, attention_mask=None): def _compute_attention( self, query, key, value, attention_mask=None, training=None ): - attention_scores = ops.einsum("aecd,abcd->acbe", key, query) + if has_flash_attention_support() and self.dropout == 0: + # Use `dot_product_attention` with Flash Attention support if + # available. 
+ if attention_mask is not None: + attention_mask = ops.expand_dims(attention_mask, axis=1) + attention_mask = ops.cast(attention_mask, dtype="bool") + attention_output = ops.dot_product_attention( + query, + key, + value, + mask=attention_mask, + scale=self._inv_norm_factor, + ) + return attention_output - norm_factor = ops.sqrt( - ops.convert_to_tensor(self.attn_head_size, self.compute_dtype) + attention_scores = ops.einsum("aecd,abcd->acbe", key, query) + attention_scores = ops.multiply( + attention_scores, + ops.cast(self._inv_norm_factor, self.compute_dtype), ) - - attention_scores /= norm_factor - attention_scores = self._masked_softmax( attention_scores, attention_mask ) diff --git a/keras_hub/src/models/llama/llama_attention.py b/keras_hub/src/models/llama/llama_attention.py index a8bcef9fa3..6ef8079005 100644 --- a/keras_hub/src/models/llama/llama_attention.py +++ b/keras_hub/src/models/llama/llama_attention.py @@ -1,8 +1,11 @@ +import math + import keras from keras import ops from keras_hub.src.layers.modeling.rotary_embedding import RotaryEmbedding from keras_hub.src.utils.keras_utils import clone_initializer +from keras_hub.src.utils.keras_utils import has_flash_attention_support class LlamaAttention(keras.layers.Layer): @@ -43,7 +46,7 @@ def build(self, inputs_shape): # h = head dim hidden_dim = inputs_shape[-1] head_dim = hidden_dim // self.num_query_heads - self._norm_factor = ops.sqrt(ops.cast(head_dim, self.compute_dtype)) + self._inv_norm_factor = 1.0 / math.sqrt(head_dim) self._query_dense = keras.layers.EinsumDense( equation="bqm,muh->bquh", @@ -182,9 +185,27 @@ def _masked_softmax(self, attention_scores, attention_mask=None): return self._softmax(attention_scores) def _compute_attention(self, query, key, value, attention_mask=None): + if has_flash_attention_support(): + # Use `dot_product_attention` with Flash Attention support if + # available. + if attention_mask is not None: + attention_mask = ops.expand_dims(attention_mask, axis=1) + attention_mask = ops.cast(attention_mask, dtype="bool") + attention_output = ops.dot_product_attention( + query, + key, + value, + mask=attention_mask, + scale=self._inv_norm_factor, + ) + return attention_output + attention_scores = ops.einsum(self._dot_product_equation, query, key) - attention_scores = attention_scores / self._norm_factor + attention_scores = ops.multiply( + attention_scores, + ops.cast(self._inv_norm_factor, self.compute_dtype), + ) attention_scores = self._masked_softmax( attention_scores, attention_mask ) diff --git a/keras_hub/src/models/mistral/mistral_attention.py b/keras_hub/src/models/mistral/mistral_attention.py index b0c0ecd3aa..d87a676de2 100644 --- a/keras_hub/src/models/mistral/mistral_attention.py +++ b/keras_hub/src/models/mistral/mistral_attention.py @@ -1,8 +1,11 @@ +import math + import keras from keras import ops from keras_hub.src.layers.modeling.rotary_embedding import RotaryEmbedding from keras_hub.src.utils.keras_utils import clone_initializer +from keras_hub.src.utils.keras_utils import has_flash_attention_support # This is just a self-attention layer in Mistral. 
But it can be generalized @@ -52,6 +55,7 @@ def build(self, inputs_shape): # h = head dim self._hidden_dim = inputs_shape[-1] self._head_dim = self._hidden_dim // self._num_query_heads + self._inv_norm_factor = 1.0 / math.sqrt(self._head_dim) self._query_dense = keras.layers.EinsumDense( equation="bqm,muh->bquh", @@ -192,11 +196,26 @@ def _masked_softmax(self, attention_scores, attention_mask=None): return self._softmax(attention_scores) def _compute_attention(self, query, key, value, attention_mask=None): - attention_scores = ops.einsum(self._dot_product_equation, query, key) - - norm_factor = ops.sqrt(ops.cast(self._head_dim, self.compute_dtype)) + if has_flash_attention_support(): + # Use `dot_product_attention` with Flash Attention support if + # available. + if attention_mask is not None: + attention_mask = ops.expand_dims(attention_mask, axis=1) + attention_mask = ops.cast(attention_mask, dtype="bool") + attention_output = ops.dot_product_attention( + query, + key, + value, + mask=attention_mask, + scale=self._inv_norm_factor, + ) + return attention_output - attention_scores = attention_scores / norm_factor + attention_scores = ops.einsum(self._dot_product_equation, query, key) + attention_scores = ops.multiply( + attention_scores, + ops.cast(self._inv_norm_factor, self.compute_dtype), + ) attention_scores = self._masked_softmax( attention_scores, attention_mask ) diff --git a/keras_hub/src/models/phi3/phi3_attention.py b/keras_hub/src/models/phi3/phi3_attention.py index 1c4476240d..2860799dc2 100644 --- a/keras_hub/src/models/phi3/phi3_attention.py +++ b/keras_hub/src/models/phi3/phi3_attention.py @@ -1,3 +1,5 @@ +import math + import keras from keras import ops @@ -6,6 +8,7 @@ Phi3SuScaledRotaryEmbedding, ) from keras_hub.src.utils.keras_utils import clone_initializer +from keras_hub.src.utils.keras_utils import has_flash_attention_support class Phi3Attention(keras.layers.Layer): @@ -53,7 +56,7 @@ def build(self, inputs_shape): # h = head dim hidden_dim = inputs_shape[-1] head_dim = hidden_dim // self.num_query_heads - self._norm_factor = ops.sqrt(ops.cast(head_dim, self.compute_dtype)) + self._inv_norm_factor = 1.0 / math.sqrt(head_dim) self.query_dense = keras.layers.EinsumDense( equation="bqm,muh->bquh", @@ -214,8 +217,26 @@ def _masked_softmax(self, attention_scores, attention_mask=None): return self.softmax(attention_scores) def _compute_attention(self, query, key, value, attention_mask=None): + if has_flash_attention_support(): + # Use `dot_product_attention` with Flash Attention support if + # available. 
+ if attention_mask is not None: + attention_mask = ops.expand_dims(attention_mask, axis=1) + attention_mask = ops.cast(attention_mask, dtype="bool") + attention_output = ops.dot_product_attention( + query, + key, + value, + mask=attention_mask, + scale=self._inv_norm_factor, + ) + return attention_output + attention_scores = ops.einsum("bquh,bkuh->buqk", query, key) - attention_scores = attention_scores / self._norm_factor + attention_scores = ops.multiply( + attention_scores, + ops.cast(self._inv_norm_factor, self.compute_dtype), + ) attention_scores = self._masked_softmax( attention_scores, attention_mask ) diff --git a/keras_hub/src/models/stable_diffusion_3/mmdit.py b/keras_hub/src/models/stable_diffusion_3/mmdit.py index 36cbc11d79..fc5c2d6aaa 100644 --- a/keras_hub/src/models/stable_diffusion_3/mmdit.py +++ b/keras_hub/src/models/stable_diffusion_3/mmdit.py @@ -7,6 +7,7 @@ from keras_hub.src.layers.modeling.position_embedding import PositionEmbedding from keras_hub.src.models.backbone import Backbone from keras_hub.src.utils.keras_utils import gelu_approximate +from keras_hub.src.utils.keras_utils import has_flash_attention_support from keras_hub.src.utils.keras_utils import standardize_data_format @@ -770,17 +771,14 @@ def build(self, inputs_shape, context_shape, timestep_embedding_shape): def _compute_attention(self, query, key, value): batch_size = ops.shape(query)[0] - # Use the fast path when `ops.dot_product_attention` and flash attention - # are available. - if hasattr(ops, "dot_product_attention") and hasattr( - keras.config, "is_flash_attention_enabled" - ): + if has_flash_attention_support(): + # Use `dot_product_attention` with Flash Attention support if + # available. encoded = ops.dot_product_attention( query, key, value, scale=self._inverse_sqrt_key_dim, - flash_attention=keras.config.is_flash_attention_enabled(), ) return ops.reshape( encoded, (batch_size, -1, self.num_heads * self.head_dim) @@ -793,10 +791,9 @@ def _compute_attention(self, query, key, value): probs = self.softmax(logits) probs = ops.cast(probs, self.compute_dtype) encoded = ops.einsum("BNTS,BSNH->BTNH", probs, value) - encoded = ops.reshape( + return ops.reshape( encoded, (batch_size, -1, self.num_heads * self.head_dim) ) - return encoded def call(self, inputs, context, timestep_embedding, training=None): # Compute pre-attention. 
diff --git a/keras_hub/src/utils/keras_utils.py b/keras_hub/src/utils/keras_utils.py index 29414811e4..360b030660 100644 --- a/keras_hub/src/utils/keras_utils.py +++ b/keras_hub/src/utils/keras_utils.py @@ -53,3 +53,10 @@ def standardize_data_format(data_format): f"Received: data_format={data_format}" ) return data_format + + +def has_flash_attention_support(): + if hasattr(keras.config, "is_flash_attention_enabled"): + return True + else: + return False From 8ca2076d651532f7270ffa2beba44d377b100bbf Mon Sep 17 00:00:00 2001 From: Matt Watson <1389937+mattdangerw@users.noreply.github.com> Date: Tue, 28 Jan 2025 13:56:25 -0800 Subject: [PATCH 05/12] os.make_dirs is not a thing; os.makedirs is (#2061) We were also missing any coverage for this codepath :( --- keras_hub/src/utils/preset_utils.py | 2 +- keras_hub/src/utils/preset_utils_test.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/keras_hub/src/utils/preset_utils.py b/keras_hub/src/utils/preset_utils.py index 4b4e3bc3b3..b1a295cbcc 100644 --- a/keras_hub/src/utils/preset_utils.py +++ b/keras_hub/src/utils/preset_utils.py @@ -240,7 +240,7 @@ def tf_copy_gfile_to_cache(preset, path): try: import tensorflow as tf - os.make_dirs(os.path.dirname(local_path), exist_ok=True) + os.makedirs(os.path.dirname(local_path), exist_ok=True) tf.io.gfile.copy(url, local_path) except Exception as e: # gfile.copy will leave an empty file after an error. diff --git a/keras_hub/src/utils/preset_utils_test.py b/keras_hub/src/utils/preset_utils_test.py index 787a1ea439..998dcadfa9 100644 --- a/keras_hub/src/utils/preset_utils_test.py +++ b/keras_hub/src/utils/preset_utils_test.py @@ -33,6 +33,18 @@ def test_preset_errors(self): with self.assertRaisesRegex(ValueError, "class keras_hub>BortBackbone"): BertBackbone.from_preset(preset_dir) + @pytest.mark.large + def test_tf_file_io(self): + # Load a model from Kaggle to use as a test model. + preset = "bert_tiny_en_uncased" + backbone = BertBackbone.from_preset(preset) + # Save the model on a local directory. + temp_dir = self.get_temp_dir() + local_preset_dir = os.path.join(temp_dir, "bert_preset") + backbone.save_to_preset(local_preset_dir) + # Load with "file://" which tf supports. 
+ backbone = BertBackbone.from_preset("file://" + local_preset_dir) + @pytest.mark.large def test_upload_empty_preset(self): temp_dir = self.get_temp_dir() From d71318d92e751a9d9efa725cd423ccfc325d9e70 Mon Sep 17 00:00:00 2001 From: Matt Watson <1389937+mattdangerw@users.noreply.github.com> Date: Wed, 29 Jan 2025 13:13:23 -0800 Subject: [PATCH 06/12] Update README.md (#2063) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6368571388..4a5aa35899 100644 --- a/README.md +++ b/README.md @@ -78,7 +78,7 @@ print(keras_hub.utils.decode_imagenet_predictions(preds)) Load a Bert model and fine-tune it on IMDb movie reviews: ```python -classifier = keras_hub.models.BertClassifier.from_preset( +classifier = keras_hub.models.TextClassifier.from_preset( "bert_base_en_uncased", activation="softmax", num_classes=2, From e62569965fcebe690756c06965d2b8568dddc35f Mon Sep 17 00:00:00 2001 From: balanprasanth <112931254+balanprasanth@users.noreply.github.com> Date: Thu, 30 Jan 2025 02:43:46 +0530 Subject: [PATCH 07/12] Update auto-assignment.js (#2057) update issues assignee usernames --- .github/workflows/scripts/auto-assignment.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/scripts/auto-assignment.js b/.github/workflows/scripts/auto-assignment.js index 176b305f39..d47805aca9 100644 --- a/.github/workflows/scripts/auto-assignment.js +++ b/.github/workflows/scripts/auto-assignment.js @@ -12,7 +12,7 @@ module.exports = async ({ github, context }) => { // Is this an issue? If so, assign the issue number. Otherwise, assign the PR number. if (context.payload.issue) { //assignee List for issues. - assigneesList = ["SuryanarayanaY", "sachinprasadhs"]; + assigneesList = ["mehtamansi29","sonali-kumari1", "sachinprasadhs"]; issueNumber = context.payload.issue.number; } else { //assignee List for PRs. From 6adc92f505e629b7e2a2dfe6140e7afb5a99d845 Mon Sep 17 00:00:00 2001 From: Abheesht Date: Mon, 3 Feb 2025 11:39:33 +0530 Subject: [PATCH 08/12] Remove `mask = None` (#2067) --- keras_hub/src/models/pali_gemma/pali_gemma_vit.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/keras_hub/src/models/pali_gemma/pali_gemma_vit.py b/keras_hub/src/models/pali_gemma/pali_gemma_vit.py index b217509541..621b4562de 100644 --- a/keras_hub/src/models/pali_gemma/pali_gemma_vit.py +++ b/keras_hub/src/models/pali_gemma/pali_gemma_vit.py @@ -204,9 +204,8 @@ def __init__( self.intermediate_dim = intermediate_dim def compute_attention(self, x, mask=None): - mask = None if mask is not None: - mask = ops.cast(mask, dtype=x.dtype) if mask is not None else None + mask = ops.cast(mask, dtype=x.dtype) return self.attn(x, attention_mask=mask)[0] def build(self, input_shape): From c6644576d0ad15f1f2424435f029d9a9eb801443 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 3 Feb 2025 09:26:34 -0800 Subject: [PATCH 09/12] Bump the python group with 2 updates (#2066) Bumps the python group with 2 updates: torch and torchvision. Updates `torch` from 2.5.1+cu121 to 2.6.0+cpu Updates `torchvision` from 0.20.1+cu121 to 0.21.0+cpu --- updated-dependencies: - dependency-name: torch dependency-type: direct:production update-type: version-update:semver-minor dependency-group: python - dependency-name: torchvision dependency-type: direct:production update-type: version-update:semver-minor dependency-group: python ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements-torch-cuda.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-torch-cuda.txt b/requirements-torch-cuda.txt index 2a601f0f20..a696dc2a85 100644 --- a/requirements-torch-cuda.txt +++ b/requirements-torch-cuda.txt @@ -4,8 +4,8 @@ tensorflow-text~=2.18 # Torch with cuda support. --extra-index-url https://download.pytorch.org/whl/cu121 -torch==2.5.1+cu121 -torchvision==0.20.1+cu121 +torch==2.6.0+cpu +torchvision==0.21.0+cpu # Jax cpu-only version. jax[cpu] From f157ff53e15a5d03ccd75230fa6adf8157ef250f Mon Sep 17 00:00:00 2001 From: Matt Watson <1389937+mattdangerw@users.noreply.github.com> Date: Mon, 3 Feb 2025 12:15:57 -0800 Subject: [PATCH 10/12] Make gemma inputs int32 same as other models (#2069) I don't think anything was broken because of this, but we should not have float32 inputs in this case. --- keras_hub/src/models/gemma/gemma_backbone.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/keras_hub/src/models/gemma/gemma_backbone.py b/keras_hub/src/models/gemma/gemma_backbone.py index 93dea30199..22ca535c3d 100644 --- a/keras_hub/src/models/gemma/gemma_backbone.py +++ b/keras_hub/src/models/gemma/gemma_backbone.py @@ -148,10 +148,10 @@ def __init__( # === Functional Model === token_id_input = keras.Input( - shape=(None,), dtype="float32", name="token_ids" + shape=(None,), dtype="int32", name="token_ids" ) padding_mask_input = keras.Input( - shape=(None,), dtype="float32", name="padding_mask" + shape=(None,), dtype="int32", name="padding_mask" ) x = self.token_embedding(token_id_input) x = x * ops.cast(ops.sqrt(hidden_dim), x.dtype) From 4ecbadf9917e4e90dbadb9c485a104b63effe83e Mon Sep 17 00:00:00 2001 From: Siva Sravana Kumar Neeli <113718461+sineeli@users.noreply.github.com> Date: Mon, 3 Feb 2025 15:12:18 -0800 Subject: [PATCH 11/12] Vit bug (#2070) * add missing dropout_rate argument to MLP layer * dropout layer missing as per official implementation --- keras_hub/src/models/vit/vit_layers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/keras_hub/src/models/vit/vit_layers.py b/keras_hub/src/models/vit/vit_layers.py index f3509440d5..473cb9cb66 100644 --- a/keras_hub/src/models/vit/vit_layers.py +++ b/keras_hub/src/models/vit/vit_layers.py @@ -65,6 +65,7 @@ def build(self, input_shape): def call(self, inputs): x = self.dense_1(inputs) + x = self.dropout(x) x = self.dense_2(x) out = self.dropout(x) return out @@ -257,6 +258,7 @@ def build(self, input_shape): hidden_dim=self.hidden_dim, mlp_dim=self.mlp_dim, use_bias=self.use_mlp_bias, + dropout_rate=self.dropout_rate, name="mlp", dtype=self.dtype_policy, ) From a80ea2817a1a00f30641b01b5ffb7a6a9d187f12 Mon Sep 17 00:00:00 2001 From: balanprasanth <112931254+balanprasanth@users.noreply.github.com> Date: Wed, 5 Feb 2025 23:17:45 +0530 Subject: [PATCH 12/12] Update auto-assignment.js (#2065) Update issue assignee username --- .github/workflows/scripts/auto-assignment.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/scripts/auto-assignment.js b/.github/workflows/scripts/auto-assignment.js index d47805aca9..d9e7956426 100644 --- a/.github/workflows/scripts/auto-assignment.js +++ b/.github/workflows/scripts/auto-assignment.js @@ -12,7 +12,7 @@ module.exports = async ({ github, context }) => { // Is this an issue? If so, assign the issue number. Otherwise, assign the PR number. 
if (context.payload.issue) { //assignee List for issues. - assigneesList = ["mehtamansi29","sonali-kumari1", "sachinprasadhs"]; + assigneesList = ["mehtamansi29", "sonali-kumari1", "dhantule", "sachinprasadhs"]; issueNumber = context.payload.issue.number; } else { //assignee List for PRs.