From db27dbee8aacef6d3fd97a7b50a826f9f2e09e50 Mon Sep 17 00:00:00 2001
From: asingh9530 <abhinav199530singh@gmail.com>
Date: Mon, 7 Aug 2023 22:13:25 +0530
Subject: [PATCH] initial commit for text preprocessing porting

---
 .../preprocessing/text_vectorization.py       | 357 +++++++++++++++---
 keras_core/utils/__init__.py                  |   1 +
 keras_core/utils/layer_utils.py               |  58 +++
 3 files changed, 373 insertions(+), 43 deletions(-)
 create mode 100644 keras_core/utils/layer_utils.py

diff --git a/keras_core/layers/preprocessing/text_vectorization.py b/keras_core/layers/preprocessing/text_vectorization.py
index 92d8811f9..c13a30069 100644
--- a/keras_core/layers/preprocessing/text_vectorization.py
+++ b/keras_core/layers/preprocessing/text_vectorization.py
@@ -6,8 +6,34 @@
 from keras_core.saving import serialization_lib
 from keras_core.utils import backend_utils
 from keras_core.utils.module_utils import tensorflow as tf
+from keras_core.layers.preprocessing.string_lookup import StringLookup
+from keras_core.utils import layer_utils
 
 
+keras_kpl_gauge = tf.__internal__.monitoring.BoolGauge(
+    "/keras_core/layers/preprocessing",
+    "keras core preprocessing layers usage",
+    "method",
+)
+
+# String values accepted by the `standardize` argument.
+LOWER_AND_STRIP_PUNCTUATION = "lower_and_strip_punctuation"
+STRIP_PUNCTUATION = "strip_punctuation"
+LOWER = "lower"
+
+WHITESPACE = "whitespace"  # String values accepted by `split`.
+CHARACTER = "character"
+
+TF_IDF = "tf_idf"  # String values accepted by `output_mode`.
+INT = "int"
+MULTI_HOT = "multi_hot"
+COUNT = "count"
+
+# Regex matching every character that is stripped when `standardize` is
+# STRIP_PUNCTUATION or LOWER_AND_STRIP_PUNCTUATION. If an application requires
+# other stripping, a Callable should be passed into the 'standardize' arg.
+DEFAULT_STRIP_REGEX = r'[!"#$%&()\*\+,-\./:;<=>?@\[\\\]^_`{|}~\']'
+
 @keras_core_export("keras_core.layers.TextVectorization")
 class TextVectorization(Layer):
     """A preprocessing layer which maps text features to integer sequences.
@@ -230,28 +256,168 @@ def __init__(
                 "`ragged` can only be set to True with the "
                 "TensorFlow backend."
             )
-        self.layer = tf.keras.layers.TextVectorization(
+        layer_utils.validate_string_arg(
+            standardize,
+            allowable_strings=(
+                LOWER_AND_STRIP_PUNCTUATION,
+                LOWER,
+                STRIP_PUNCTUATION,
+            ),
+            layer_name="TextVectorization",
+            arg_name="standardize",
+            allow_none=True,
+            allow_callables=True,
+        )
+
+        # 'split' must be one of (None, WHITESPACE, CHARACTER, callable)
+        layer_utils.validate_string_arg(
+            split,
+            allowable_strings=(WHITESPACE, CHARACTER),
+            layer_name="TextVectorization",
+            arg_name="split",
+            allow_none=True,
+            allow_callables=True,
+        )
+
+        # Support deprecated names for output_modes.
+        if output_mode == "binary":
+            output_mode = MULTI_HOT
+        if output_mode == "tf-idf":
+            output_mode = TF_IDF
+        # 'output_mode' must be one of (None, INT, COUNT, MULTI_HOT, TF_IDF)
+        layer_utils.validate_string_arg(
+            output_mode,
+            allowable_strings=(INT, COUNT, MULTI_HOT, TF_IDF),
+            layer_name="TextVectorization",
+            arg_name="output_mode",
+            allow_none=True,
+        )
+
+        # 'ngrams' must be one of (None, int, tuple(int))
+        if not (
+            ngrams is None
+            or isinstance(ngrams, int)
+            or isinstance(ngrams, tuple)
+            and all(isinstance(item, int) for item in ngrams)
+        ):
+            raise ValueError(
+                "`ngrams` must be None, an integer, or a tuple of "
+                f"integers. Received: ngrams={ngrams}"
+            )
+
+        # 'output_sequence_length' must be one of (None, int) and is only
+        # set if output_mode is INT.
+        if output_mode == INT and not (
+            isinstance(output_sequence_length, int)
+            or (output_sequence_length is None)
+        ):
+            raise ValueError(
+                "`output_sequence_length` must be either None or an "
+                "integer when `output_mode` is 'int'. Received: "
+                f"output_sequence_length={output_sequence_length}"
+            )
+
+        if output_mode != INT and output_sequence_length is not None:
+            raise ValueError(
+                "`output_sequence_length` must not be set if `output_mode` is "
+                "not 'int'. "
+                f"Received output_sequence_length={output_sequence_length}."
+            )
+
+        if ragged and output_mode != INT:
+            raise ValueError(
+                "`ragged` must not be true if `output_mode` is "
+                f"`'int'`. Received: ragged={ragged} and "
+                f"output_mode={output_mode}"
+            )
+
+        if ragged and output_sequence_length is not None:
+            raise ValueError(
+                "`output_sequence_length` must not be set if ragged "
+                f"is True. Received: ragged={ragged} and "
+                f"output_sequence_length={output_sequence_length}"
+            )
+
+        self._max_tokens = max_tokens
+        self._standardize = standardize
+        self._split = split
+        self._ngrams_arg = ngrams
+        if isinstance(ngrams, int):
+            self._ngrams = tuple(range(1, ngrams + 1))
+        else:
+            self._ngrams = ngrams
+        self._ragged = ragged
+
+        self._output_mode = output_mode
+        self._output_sequence_length = output_sequence_length
+        self._encoding = encoding
+
+        # VocabularySavedModelSaver will clear the config vocabulary to restore
+        # the lookup table ops directly. We persist this hidden option to
+        # record the fact that we have a non-adaptable layer with a
+        # manually set vocab.
+        self._has_input_vocabulary = kwargs.pop(
+            "has_input_vocabulary", (vocabulary is not None)
+        )
+
+        vocabulary_size = kwargs.pop("vocabulary_size", None)
+
+        super().__init__(**kwargs)
+        keras_kpl_gauge.get_cell(
+            "TextVectorization"
+        ).set(True)
+
+        self._lookup_layer = StringLookup(
             max_tokens=max_tokens,
-            standardize=standardize,
-            split=split,
-            ngrams=ngrams,
-            output_mode=output_mode,
-            output_sequence_length=output_sequence_length,
-            pad_to_max_tokens=pad_to_max_tokens,
             vocabulary=vocabulary,
             idf_weights=idf_weights,
+            pad_to_max_tokens=pad_to_max_tokens,
+            mask_token="",
+            output_mode=output_mode if output_mode is not None else INT,
             sparse=sparse,
-            ragged=ragged,
+            has_input_vocabulary=self._has_input_vocabulary,
             encoding=encoding,
-            name=name,
-            **kwargs,
+            vocabulary_size=vocabulary_size,
         )
+        # self.layer = tf.keras.layers.TextVectorization(
+        #     max_tokens=max_tokens,
+        #     standardize=standardize,
+        #     split=split,
+        #     ngrams=ngrams,
+        #     output_mode=output_mode,
+        #     output_sequence_length=output_sequence_length,
+        #     pad_to_max_tokens=pad_to_max_tokens,
+        #     vocabulary=vocabulary,
+        #     idf_weights=idf_weights,
+        #     sparse=sparse,
+        #     ragged=ragged,
+        #     encoding=encoding,
+        #     name=name,
+        #     **kwargs,
+        # )
         self._convert_input_args = False
         self._allow_non_tensor_positional_args = True
         self.supports_jit = False
 
     def compute_output_shape(self, input_shape):
-        return tuple(self.layer.compute_output_shape(input_shape))
+        """Compute the output shape for inputs of shape `input_shape`."""
+        if self._output_mode == INT:
+            batch_size = input_shape[0]
+            return tf.TensorShape([batch_size, self._output_sequence_length])
+
+        # Splitting appends an axis of unknown length to the input shape.
+        if self._split is not None:
+            input_shape = tuple(input_shape) + (None,)
+        elif len(input_shape) <= 1:
+            input_shape = tuple(input_shape) + (1,)
+        return self._lookup_layer.compute_output_shape(input_shape)
+    
+    def compute_output_signature(self, input_spec):
+        """Return the `tf.TensorSpec` (shape and dtype) of the layer output."""
+        return tf.TensorSpec(
+            shape=self.compute_output_shape(input_spec.shape.as_list()),
+            dtype=tf.int64 if self._output_mode == INT else backend.floatx(),
+        )
 
     def adapt(self, data, batch_size=None, steps=None):
         """Computes a vocabulary of string terms from tokens in a dataset.
@@ -295,13 +461,13 @@ def adapt(self, data, batch_size=None, steps=None):
         self.layer.adapt(data, batch_size=batch_size, steps=steps)
 
     def update_state(self, data):
-        self.layer.update_state(data)
+        self._lookup_layer.update_state(self._preprocess(data))
 
     def finalize_state(self):
-        self.layer.finalize_state()
+        self._lookup_layer.finalize_state()
 
     def reset_state(self):
-        self.layer.reset_state()
+        self._lookup_layer.reset_state()
 
     def get_vocabulary(self, include_special_tokens=True):
         """Returns the current vocabulary of the layer.
@@ -314,9 +480,7 @@ def get_vocabulary(self, include_special_tokens=True):
                 returned vocabulary will not include any padding
                 or OOV tokens.
         """
-        return self.layer.get_vocabulary(
-            include_special_tokens=include_special_tokens
-        )
+        return self._lookup_layer.get_vocabulary(include_special_tokens)
 
     def vocabulary_size(self):
         """Gets the current size of the layer's vocabulary.
@@ -325,18 +489,42 @@ def vocabulary_size(self):
             The integer size of the vocabulary, including optional
             mask and OOV indices.
         """
-        return self.layer.vocabulary_size()
+        return self._lookup_layer.vocabulary_size()
 
     def get_config(self):
-        return self.layer.get_config()
+        """Return the layer configuration as a dict."""
+        lookup = self._lookup_layer
+        own_config = {
+            "max_tokens": lookup.max_tokens,
+            "standardize": self._standardize,
+            "split": self._split,
+            "ngrams": self._ngrams_arg,
+            "output_mode": self._output_mode,
+            "output_sequence_length": self._output_sequence_length,
+            "pad_to_max_tokens": lookup.pad_to_max_tokens,
+            "sparse": lookup.sparse,
+            "ragged": self._ragged,
+            "vocabulary": layer_utils.listify_tensors(lookup.input_vocabulary),
+            "idf_weights": layer_utils.listify_tensors(
+                lookup.input_idf_weights
+            ),
+            "encoding": self._encoding,
+            "vocabulary_size": self.vocabulary_size(),
+        }
+        # Entries defined here take precedence over the base layer config.
+        return {**super().get_config(), **own_config}
 
     @classmethod
     def from_config(cls, config):
-        if not isinstance(config["standardize"], str):
+        if config["standardize"] not in (
+            LOWER_AND_STRIP_PUNCTUATION,
+            LOWER,
+            STRIP_PUNCTUATION,
+        ):
             config["standardize"] = serialization_lib.deserialize_keras_object(
                 config["standardize"]
             )
-        if not isinstance(config["split"], str):
+        if config["split"] not in (WHITESPACE, CHARACTER):
             config["split"] = serialization_lib.deserialize_keras_object(
                 config["split"]
             )
@@ -362,39 +550,122 @@ def set_vocabulary(self, vocabulary, idf_weights=None):
                 Must be set if `output_mode` is `"tf_idf"`.
                 Should not be set otherwise.
         """
-        self.layer.set_vocabulary(vocabulary, idf_weights=idf_weights)
+        self._lookup_layer.set_vocabulary(vocabulary, idf_weights=idf_weights)
+    
+    def _preprocess(self, inputs):
+        """Standardize, split and n-gram the raw input strings.
+
+        Returns the processed `tf.string` tensor, which may be ragged or
+        dense depending on the layer configuration. Raises a `ValueError`
+        when splitting an input of rank > 1 whose last dimension is not 1,
+        or when `split` is not a supported option.
+        """
+        inputs = layer_utils.ensure_tensor(inputs, dtype=tf.string)
+        if self._standardize in (LOWER, LOWER_AND_STRIP_PUNCTUATION):
+            inputs = tf.strings.lower(inputs)
+        if self._standardize in (
+            STRIP_PUNCTUATION,
+            LOWER_AND_STRIP_PUNCTUATION,
+        ):
+            inputs = tf.strings.regex_replace(inputs, DEFAULT_STRIP_REGEX, "")
+        if callable(self._standardize):
+            inputs = self._standardize(inputs)
+
+        if self._split is not None:
+            # Squeeze a trailing axis of size 1 before splitting: squeezing
+            # a ragged tensor afterwards is more expensive.
+            if inputs.shape.rank > 1:
+                if inputs.shape[-1] != 1:
+                    raise ValueError(
+                        "When using `TextVectorization` to tokenize strings, "
+                        "the input rank must be 1 or the last shape dimension "
+                        f"must be 1. Received: inputs.shape={inputs.shape} "
+                        f"with rank={inputs.shape.rank}"
+                    )
+                inputs = tf.squeeze(inputs, axis=-1)
+            if self._split == WHITESPACE:
+                # Runs of whitespace count as one; the ends are stripped.
+                inputs = tf.strings.split(inputs)
+            elif self._split == CHARACTER:
+                inputs = tf.strings.unicode_split(inputs, "UTF-8")
+            elif callable(self._split):
+                inputs = self._split(inputs)
+            else:
+                raise ValueError(
+                    f"{self._split} is not a supported splitting. "
+                    "TextVectorization supports the following options for "
+                    "`split`: None, 'whitespace', 'character', or a Callable."
+                )
+
+        # `inputs` may be ragged or dense; tf.strings.ngrams supports both.
+        if self._ngrams is not None:
+            inputs = tf.strings.ngrams(
+                inputs, ngram_width=self._ngrams, separator=" "
+            )
+
+        return inputs
 
     def call(self, inputs):
-        if not isinstance(inputs, (tf.Tensor, np.ndarray, list, tuple)):
-            inputs = tf.convert_to_tensor(np.array(inputs))
-        outputs = self.layer.call(inputs)
+        """Vectorize a batch of strings.
+
+        Args:
+            inputs: Strings as a tensor, NumPy array, list or tuple.
+
+        Returns:
+            The preprocessed strings if `output_mode` is `None`; otherwise
+            the lookup layer output, with dense `"int"` output trimmed/padded
+            to `output_sequence_length`.
+        """
+        # The inputs are strings, so convert them with TensorFlow directly.
+        if isinstance(inputs, (list, tuple, np.ndarray)):
+            inputs = tf.convert_to_tensor(inputs)
+
+        inputs = self._preprocess(inputs)
+
+        # If we're not doing any output processing, return right away.
+        if self._output_mode is None:
+            return inputs
+
+        lookup_data = self._lookup_layer(inputs)
+
+        # Only dense "int" output needs padding/trimming. Any other output
+        # mode, and ragged output, is used as returned by the lookup layer.
+        if self._output_mode == INT and not self._ragged:
+            if layer_utils.is_ragged(lookup_data):
+                # Padding happens during the conversion to dense. With a
+                # length of None, to_tensor pads to the bounding shape.
+                shape = lookup_data.shape.as_list()
+                shape[-1] = self._output_sequence_length
+                lookup_data = lookup_data.to_tensor(
+                    default_value=0, shape=shape
+                )
+            elif self._output_sequence_length is not None:
+                # Trim, then pad using the dynamic shape because
+                # required_space_to_batch_paddings needs a known shape.
+                length = self._output_sequence_length
+                lookup_data = lookup_data[..., :length]
+                shape = tf.shape(lookup_data)
+                padded_shape = tf.concat((shape[:-1], [length]), 0)
+                padding, _ = tf.required_space_to_batch_paddings(
+                    shape, padded_shape
+                )
+                lookup_data = tf.pad(lookup_data, padding)
+
         if (
             backend.backend() != "tensorflow"
             and not backend_utils.in_tf_graph()
         ):
-            outputs = backend.convert_to_tensor(outputs)
-        return outputs
+            lookup_data = backend.convert_to_tensor(lookup_data)
+        return lookup_data
 
     def save_own_variables(self, store):
-        if hasattr(self.layer, "save_own_variables"):
-            self.layer.save_own_variables(store)
-        else:
-            self.layer._save_own_variables(store)
+        self._lookup_layer.save_own_variables(store)
 
     def load_own_variables(self, store):
-        if hasattr(self.layer, "load_own_variables"):
-            self.layer.load_own_variables(store)
-        else:
-            self.layer._load_own_variables(store)
+        self._lookup_layer.load_own_variables(store)
 
     def save_assets(self, dir_path):
-        if hasattr(self.layer, "save_assets"):
-            self.layer.save_assets(dir_path)
-        else:
-            self.layer._save_assets(dir_path)
+        self._lookup_layer.save_assets(dir_path)
 
     def load_assets(self, dir_path):
-        if hasattr(self.layer, "save_assets"):
-            self.layer.load_assets(dir_path)
-        else:
-            self.layer._load_assets(dir_path)
+        self._lookup_layer.load_assets(dir_path)
\ No newline at end of file
diff --git a/keras_core/utils/__init__.py b/keras_core/utils/__init__.py
index c1ca205f0..fe026a096 100644
--- a/keras_core/utils/__init__.py
+++ b/keras_core/utils/__init__.py
@@ -24,3 +24,4 @@
 from keras_core.utils.timeseries_dataset_utils import (
     timeseries_dataset_from_array,
 )
+from keras_core.utils.layer_utils import validate_string_arg
diff --git a/keras_core/utils/layer_utils.py b/keras_core/utils/layer_utils.py
new file mode 100644
index 000000000..d51b4ed45
--- /dev/null
+++ b/keras_core/utils/layer_utils.py
@@ -0,0 +1,58 @@
+from keras_core.utils.module_utils import tensorflow as tf
+import numpy as np
+
+def validate_string_arg(
+    input_data,
+    allowable_strings,
+    layer_name,
+    arg_name,
+    allow_none=False,
+    allow_callables=False,
+):
+    """Validates a string-based arg, raising `ValueError` if it is invalid."""
+    if allow_none and input_data is None:
+        return
+    if allow_callables and callable(input_data):
+        return
+    if isinstance(input_data, str) and input_data in allowable_strings:
+        return
+
+    # Describe every accepted value so the error tells the caller what to use.
+    allowed_args = "`None`, " if allow_none else ""
+    allowed_args += "a `Callable`, " if allow_callables else ""
+    allowed_args += f"or one of the following values: {allowable_strings}"
+    callable_note = ""
+    if allow_callables:
+        callable_note = (
+            f"If restoring a model and `{arg_name}` is a custom callable, "
+            "please ensure the callable is registered as a custom object. "
+            "See https://www.tensorflow.org/guide/keras/save_and_serialize"
+            "#registering_the_custom_object for details. "
+        )
+    raise ValueError(
+        f"Unknown value for `{arg_name}` argument of layer {layer_name}. "
+        f"{callable_note}Allowed values are: {allowed_args}. Received: "
+        f"{input_data}"
+    )
+    
+def listify_tensors(x):
+    """Return `x` as a plain list when it is a tensor or a NumPy array."""
+    # Tensors are first converted via `.numpy()`; other values pass through.
+    value = x.numpy() if tf.is_tensor(x) else x
+    if isinstance(value, np.ndarray):
+        return value.tolist()
+    return value
+
+def ensure_tensor(inputs, dtype=None):
+    """Return `inputs` as a TF tensor, cast to `dtype` when one is given."""
+    tensor_types = (tf.Tensor, tf.RaggedTensor, tf.SparseTensor)
+    if not isinstance(inputs, tensor_types):
+        inputs = tf.convert_to_tensor(inputs, dtype)
+    needs_cast = dtype is not None and inputs.dtype != dtype
+    return tf.cast(inputs, dtype) if needs_cast else inputs
+
+def is_ragged(tensor):
+    """Tell whether `tensor` is a `RaggedTensor` or a `RaggedTensorValue`."""
+    # Gather the accepted types once, then run a single isinstance check.
+    ragged_types = (tf.RaggedTensor, tf.compat.v1.ragged.RaggedTensorValue)
+    return isinstance(tensor, ragged_types)
\ No newline at end of file