From 59639bbe6e82d62c615a508d988176e48ff668fe Mon Sep 17 00:00:00 2001 From: d61h6k4 Date: Mon, 19 Jun 2023 15:37:01 +0200 Subject: [PATCH] Remove dependency on tensorflow-addons (#12514) Remove dependency on tensorflow-addons Copy tests for crf and metrics --- changelog/12514.improvement.md | 1 + poetry.lock | 96 ++------- pyproject.toml | 8 - rasa/utils/tensorflow/crf.py | 279 +++++++++++++++++++++++- rasa/utils/tensorflow/layers.py | 9 +- rasa/utils/tensorflow/metrics.py | 282 +++++++++++++++++++++++++ tests/utils/tensorflow/test_crf.py | 233 ++++++++++++++++++++ tests/utils/tensorflow/test_metrics.py | 205 ++++++++++++++++++ 8 files changed, 1019 insertions(+), 94 deletions(-) create mode 100644 changelog/12514.improvement.md create mode 100644 rasa/utils/tensorflow/metrics.py create mode 100644 tests/utils/tensorflow/test_crf.py create mode 100644 tests/utils/tensorflow/test_metrics.py diff --git a/changelog/12514.improvement.md b/changelog/12514.improvement.md new file mode 100644 index 000000000000..262b5161b53a --- /dev/null +++ b/changelog/12514.improvement.md @@ -0,0 +1 @@ +Remove tensorflow-addons from dependencies as it is now deprecated. \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index da0b875f2fa4..65fbe9dec30b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -588,18 +588,18 @@ numpy = ">=1.15.0" [[package]] name = "boto3" -version = "1.26.154" +version = "1.26.155" description = "The AWS SDK for Python" category = "main" optional = false python-versions = ">= 3.7" files = [ - {file = "boto3-1.26.154-py3-none-any.whl", hash = "sha256:ee2b3733f40f935da78bf76bc8e82af6e90841406e04605e3b2d765b50cad05e"}, - {file = "boto3-1.26.154.tar.gz", hash = "sha256:cf1067d101be538f399b685bbe6beb4bfed01095da8497d0c7fa8b8788a65c6b"}, + {file = "boto3-1.26.155-py3-none-any.whl", hash = "sha256:dd15823e8c0554d98c18584d9a6a0342c67611c1114ef61495934c2e560f632c"}, + {file = "boto3-1.26.155.tar.gz", hash = "sha256:2d4095e2029ce5ceccb25591f13e55aa5b8ba17794de09963654bd9ced45158f"}, ] [package.dependencies] -botocore = ">=1.29.154,<1.30.0" +botocore = ">=1.29.155,<1.30.0" jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.6.0,<0.7.0" @@ -608,14 +608,14 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.29.154" +version = "1.29.155" description = "Low-level, data-driven core of boto 3." 
category = "main" optional = false python-versions = ">= 3.7" files = [ - {file = "botocore-1.29.154-py3-none-any.whl", hash = "sha256:b9853f72a3c93f1aa8c9a1636911cdbec3662bca2e04e4ee00437c4f8c9fa2d4"}, - {file = "botocore-1.29.154.tar.gz", hash = "sha256:a9c7da497ac5f7d4f3e932b4442e7c32cc2936f3a4658165f1528336fc429c3d"}, + {file = "botocore-1.29.155-py3-none-any.whl", hash = "sha256:32d5da68212e10c060fd484f41df4f7048fc7731ccd16fd00e37b11b6e841142"}, + {file = "botocore-1.29.155.tar.gz", hash = "sha256:7fbb7ebba5f645c9750fe557b1ea789d40017a028cdaa2c22fcbf06d4a4d3c1d"}, ] [package.dependencies] @@ -2374,14 +2374,14 @@ files = [ [[package]] name = "importlib-metadata" -version = "6.6.0" +version = "6.7.0" description = "Read metadata from Python packages" category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "importlib_metadata-6.6.0-py3-none-any.whl", hash = "sha256:43dd286a2cd8995d5eaef7fee2066340423b818ed3fd70adf0bad5f1fac53fed"}, - {file = "importlib_metadata-6.6.0.tar.gz", hash = "sha256:92501cdf9cc66ebd3e612f1b4f0c0765dfa42f0fa38ffb319b6bd84dd675d705"}, + {file = "importlib_metadata-6.7.0-py3-none-any.whl", hash = "sha256:cb52082e659e97afc5dac71e79de97d8681de3aa07ff18578330904a9d18e5b5"}, + {file = "importlib_metadata-6.7.0.tar.gz", hash = "sha256:1aaf550d4f73e5d6783e7acb77aec43d49da8017410afae93822cc9cca98c4d4"}, ] [package.dependencies] @@ -2390,7 +2390,7 @@ zipp = ">=0.5" [package.extras] docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] perf = ["ipython"] -testing = ["flake8 (<5)", "flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)"] +testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)", "pytest-ruff"] [[package]] name = "importlib-resources" @@ -3630,14 +3630,14 @@ files = [ [[package]] name = "platformdirs" -version = "3.5.3" +version = "3.6.0" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." 
category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "platformdirs-3.5.3-py3-none-any.whl", hash = "sha256:0ade98a4895e87dc51d47151f7d2ec290365a585151d97b4d8d6312ed6132fed"}, - {file = "platformdirs-3.5.3.tar.gz", hash = "sha256:e48fabd87db8f3a7df7150a4a5ea22c546ee8bc39bc2473244730d4b56d2cc4e"}, + {file = "platformdirs-3.6.0-py3-none-any.whl", hash = "sha256:ffa199e3fbab8365778c4a10e1fbf1b9cd50707de826eb304b50e57ec0cc8d38"}, + {file = "platformdirs-3.6.0.tar.gz", hash = "sha256:57e28820ca8094678b807ff529196506d7a21e17156cb1cddb3e74cebce54640"}, ] [package.extras] @@ -4152,14 +4152,14 @@ zstd = ["zstandard"] [[package]] name = "pyparsing" -version = "3.0.9" +version = "3.1.0" description = "pyparsing module - Classes and methods to define and execute parsing grammars" category = "main" optional = false python-versions = ">=3.6.8" files = [ - {file = "pyparsing-3.0.9-py3-none-any.whl", hash = "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"}, - {file = "pyparsing-3.0.9.tar.gz", hash = "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"}, + {file = "pyparsing-3.1.0-py3-none-any.whl", hash = "sha256:d554a96d1a7d3ddaf7183104485bc19fd80543ad6ac5bdb6426719d766fb06c1"}, + {file = "pyparsing-3.1.0.tar.gz", hash = "sha256:edb662d6fe322d6e990b1594b5feaeadf806803359e3d4d42f11e295e588f0ea"}, ] [package.extras] @@ -5810,40 +5810,6 @@ termcolor = ">=1.1.0" typing-extensions = ">=3.6.6" wrapt = ">=1.11.0,<1.15" -[[package]] -name = "tensorflow-addons" -version = "0.19.0" -description = "TensorFlow Addons." -category = "main" -optional = false -python-versions = "*" -files = [ - {file = "tensorflow_addons-0.19.0-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:ca3764beba54c4ee4bb01a4294f8c2fef5c3814fd0f521dbe8beb4522545cb2d"}, - {file = "tensorflow_addons-0.19.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8f74646fe83fd6f0d84ae5e0186c85cae3dd7e6c2329c8a5db4574c144706f39"}, - {file = "tensorflow_addons-0.19.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a46016fe9a1705043e39b7dacee3b089303ecdedbf1b12eb607aa35b7d2471e3"}, - {file = "tensorflow_addons-0.19.0-cp310-cp310-win_amd64.whl", hash = "sha256:eefbdb4e0450b93fba6b393870784dad4c91189e5551e01b268aeb5fe5b04da6"}, - {file = "tensorflow_addons-0.19.0-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:a297db1af6e682277f593411d4d28b939646c2b67b8351ef0d31a30b9531fb93"}, - {file = "tensorflow_addons-0.19.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:06b673fe22c4113edabdc0dc1ef919ba0f1fb024ca39a5718ec146285c400e8f"}, - {file = "tensorflow_addons-0.19.0-cp37-cp37m-win_amd64.whl", hash = "sha256:eefb6bf6d7a31d60649d6f6e99aee172ed4f5e693a079acfb264297997de21d0"}, - {file = "tensorflow_addons-0.19.0-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:d447a3f7852810b7985c890852dbcb6454f3899100d439d5eba370a78d8bd281"}, - {file = "tensorflow_addons-0.19.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:51fefd5f496ada5dafb13c446853fa1ddeb5482a0b9074af14efe0b99903816e"}, - {file = "tensorflow_addons-0.19.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:101c3142149f16e81362cc1d0959686543cb69df79f38a3ea3c5205fbf57b28e"}, - {file = "tensorflow_addons-0.19.0-cp38-cp38-win_amd64.whl", hash = "sha256:c93602cf3b8a7bbe1fbf973b7b9f986892be34ba8b943923f09ae6cd79f0a241"}, - {file = "tensorflow_addons-0.19.0-cp39-cp39-macosx_10_14_x86_64.whl", hash = 
"sha256:fc058876dce711009227c47559b05295a5fb480748d6ec5c49386b1dc2c00167"}, - {file = "tensorflow_addons-0.19.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9334910bb6b599dd627e632a59f35ae9256bda2312b06929066a437076bf4789"}, - {file = "tensorflow_addons-0.19.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f25b029a917b635162b1f14df0263b2f79deadcd71daecd3161f69ccb1fbcea4"}, - {file = "tensorflow_addons-0.19.0-cp39-cp39-win_amd64.whl", hash = "sha256:b8f4c3a88b381bd28bba3189a0216749f9e799ae3dc4959651728e01ae20d738"}, -] - -[package.dependencies] -packaging = "*" -typeguard = ">=2.7" - -[package.extras] -tensorflow = ["tensorflow (>=2.9.0,<2.12.0)"] -tensorflow-cpu = ["tensorflow-cpu (>=2.9.0,<2.12.0)"] -tensorflow-gpu = ["tensorflow-gpu (>=2.9.0,<2.12.0)"] - [[package]] name = "tensorflow-cpu-aws" version = "2.12.0" @@ -6424,26 +6390,6 @@ PyJWT = ">=2.0.0,<3.0.0" pytz = "*" requests = ">=2.0.0" -[[package]] -name = "typeguard" -version = "4.0.0" -description = "Run-time type checker for Python" -category = "main" -optional = false -python-versions = ">=3.7.4" -files = [ - {file = "typeguard-4.0.0-py3-none-any.whl", hash = "sha256:c4a40af0ba8a41077221271b46d0a6d8d46045443e4d887887c69254ca861952"}, - {file = "typeguard-4.0.0.tar.gz", hash = "sha256:194fb3dbcb06ea9caf7088f3befee014de57961689f9c859ac5239b1ef61d987"}, -] - -[package.dependencies] -importlib-metadata = {version = ">=3.6", markers = "python_version < \"3.10\""} -typing-extensions = {version = ">=4.4.0", markers = "python_version < \"3.11\""} - -[package.extras] -doc = ["packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] -test = ["mypy (>=1.2.0)", "pytest (>=7)"] - [[package]] name = "typer" version = "0.7.0" @@ -6870,14 +6816,14 @@ requests-toolbelt = "*" [[package]] name = "websocket-client" -version = "1.5.3" +version = "1.6.0" description = "WebSocket client for Python with low level API options" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "websocket-client-1.5.3.tar.gz", hash = "sha256:b96f3bce3e54e3486ebe6504bc22bd4c140392bd2eb71764db29be8f2639aa65"}, - {file = "websocket_client-1.5.3-py3-none-any.whl", hash = "sha256:3566f8467cd350874c4913816355642a4942f6c1ed1e9406e3d42fae6d6c072a"}, + {file = "websocket-client-1.6.0.tar.gz", hash = "sha256:e84c7eafc66aade6d1967a51dfd219aabdf81d15b9705196e11fd81f48666b78"}, + {file = "websocket_client-1.6.0-py3-none-any.whl", hash = "sha256:72d7802608745b0a212f79b478642473bd825777d8637b6c8c421bf167790d4f"}, ] [package.extras] @@ -7215,4 +7161,4 @@ transformers = ["sentencepiece", "transformers"] [metadata] lock-version = "2.0" python-versions = ">=3.8,<3.11" -content-hash = "96a65a2de5328f7b4a7517a772b6292a9b2541613bfa2a30d384718e6ed43acf" +content-hash = "9c2b5b76db4e055b464d9ba532645a4c996326cdc2f85343527d03343f7531ab" diff --git a/pyproject.toml b/pyproject.toml index 675e4e00ea28..3630622b121c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -204,14 +204,6 @@ optional = true version = "<1.10.10" optional = true -[[tool.poetry.dependencies.tensorflow-addons]] -version = ">=0.18,<0.20" -markers = "sys_platform != 'linux' or (platform_machine != 'arm64' and platform_machine != 'aarch64')" - -[[tool.poetry.dependencies.tensorflow-addons]] -version = "0.19.0" -markers = "sys_platform == 'linux' and (platform_machine == 'arm64' or platform_machine == 'aarch64')" - [tool.poetry.extras] spacy = [ "spacy",] jieba = [ "jieba",] diff --git a/rasa/utils/tensorflow/crf.py b/rasa/utils/tensorflow/crf.py index 
68a1e5bc3298..1318eedd9c3b 100644 --- a/rasa/utils/tensorflow/crf.py +++ b/rasa/utils/tensorflow/crf.py @@ -1,20 +1,17 @@ import tensorflow as tf from tensorflow import TensorShape - -from tensorflow_addons.utils.types import TensorLike -from typeguard import typechecked -from typing import Tuple, Any, List, Union +from tensorflow.types.experimental import TensorLike +from typing import Tuple, Any, List, Union, Optional # original code taken from -# https://github.com/tensorflow/addons/blob/master/tensorflow_addons/text/crf.py +# https://github.com/tensorflow/addons/blob/b8cab7fd61af4f697a1cdae4f51c37c346b9c6f0/tensorflow_addons/text/crf.py # (modified to our neeeds) class CrfDecodeForwardRnnCell(tf.keras.layers.AbstractRNNCell): """Computes the forward decoding in a linear-chain CRF.""" - @typechecked def __init__(self, transition_params: TensorLike, **kwargs: Any) -> None: """Initialize the CrfDecodeForwardRnnCell. @@ -218,3 +215,273 @@ def _multi_seq_fn() -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: return _multi_seq_fn() return tf.cond(tf.equal(tf.shape(potentials)[1], 1), _single_seq_fn, _multi_seq_fn) + + +def crf_unary_score( + tag_indices: TensorLike, sequence_lengths: TensorLike, inputs: TensorLike +) -> tf.Tensor: + """Computes the unary scores of tag sequences. + + Args: + tag_indices: A [batch_size, max_seq_len] matrix of tag indices. + sequence_lengths: A [batch_size] vector of true sequence lengths. + inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials. + Returns: + unary_scores: A [batch_size] vector of unary scores. + """ + tag_indices = tf.cast(tag_indices, dtype=tf.int32) + sequence_lengths = tf.cast(sequence_lengths, dtype=tf.int32) + + batch_size = tf.shape(inputs)[0] + max_seq_len = tf.shape(inputs)[1] + num_tags = tf.shape(inputs)[2] + + flattened_inputs = tf.reshape(inputs, [-1]) + + offsets = tf.expand_dims(tf.range(batch_size) * max_seq_len * num_tags, 1) + offsets += tf.expand_dims(tf.range(max_seq_len) * num_tags, 0) + # Use int32 or int64 based on tag_indices' dtype. + if tag_indices.dtype == tf.int64: + offsets = tf.cast(offsets, tf.int64) + flattened_tag_indices = tf.reshape(offsets + tag_indices, [-1]) + + unary_scores = tf.reshape( + tf.gather(flattened_inputs, flattened_tag_indices), [batch_size, max_seq_len] + ) + + masks = tf.sequence_mask( + sequence_lengths, maxlen=tf.shape(tag_indices)[1], dtype=unary_scores.dtype + ) + + unary_scores = tf.reduce_sum(unary_scores * masks, 1) + return unary_scores + + +def crf_binary_score( + tag_indices: TensorLike, sequence_lengths: TensorLike, transition_params: TensorLike +) -> tf.Tensor: + """Computes the binary scores of tag sequences. + + Args: + tag_indices: A [batch_size, max_seq_len] matrix of tag indices. + sequence_lengths: A [batch_size] vector of true sequence lengths. + transition_params: A [num_tags, num_tags] matrix of binary potentials. + Returns: + binary_scores: A [batch_size] vector of binary scores. + """ + tag_indices = tf.cast(tag_indices, dtype=tf.int32) + sequence_lengths = tf.cast(sequence_lengths, dtype=tf.int32) + + num_tags = tf.shape(transition_params)[0] + num_transitions = tf.shape(tag_indices)[1] - 1 + + # Truncate by one on each side of the sequence to get the start and end + # indices of each transition. + start_tag_indices = tf.slice(tag_indices, [0, 0], [-1, num_transitions]) + end_tag_indices = tf.slice(tag_indices, [0, 1], [-1, num_transitions]) + + # Encode the indices in a flattened representation. 
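+ # For example, with num_tags = 3 a transition from tag 1 to tag 2 is looked
+ # up at flattened index 1 * 3 + 2 = 5, i.e. the row-major position of
+ # transition_params[1, 2] once the matrix is reshaped below.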
+ flattened_transition_indices = start_tag_indices * num_tags + end_tag_indices + flattened_transition_params = tf.reshape(transition_params, [-1]) + + # Get the binary scores based on the flattened representation. + binary_scores = tf.gather(flattened_transition_params, flattened_transition_indices) + + masks = tf.sequence_mask( + sequence_lengths, maxlen=tf.shape(tag_indices)[1], dtype=binary_scores.dtype + ) + truncated_masks = tf.slice(masks, [0, 1], [-1, -1]) + binary_scores = tf.reduce_sum(binary_scores * truncated_masks, 1) + return binary_scores + + +def crf_sequence_score( + inputs: TensorLike, + tag_indices: TensorLike, + sequence_lengths: TensorLike, + transition_params: TensorLike, +) -> tf.Tensor: + """Computes the unnormalized score for a tag sequence. + + Args: + inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials + to use as input to the CRF layer. + tag_indices: A [batch_size, max_seq_len] matrix of tag indices for which + we compute the unnormalized score. + sequence_lengths: A [batch_size] vector of true sequence lengths. + transition_params: A [num_tags, num_tags] transition matrix. + Returns: + sequence_scores: A [batch_size] vector of unnormalized sequence scores. + """ + tag_indices = tf.cast(tag_indices, dtype=tf.int32) + sequence_lengths = tf.cast(sequence_lengths, dtype=tf.int32) + + # If max_seq_len is 1, we skip the score calculation and simply gather the + # unary potentials of the single tag. + def _single_seq_fn() -> TensorLike: + batch_size = tf.shape(inputs, out_type=tf.int32)[0] + batch_inds = tf.reshape(tf.range(batch_size), [-1, 1]) + indices = tf.concat([batch_inds, tf.zeros_like(batch_inds)], axis=1) + + tag_inds = tf.gather_nd(tag_indices, indices) + tag_inds = tf.reshape(tag_inds, [-1, 1]) + indices = tf.concat([indices, tag_inds], axis=1) + + sequence_scores = tf.gather_nd(inputs, indices) + + sequence_scores = tf.where( + tf.less_equal(sequence_lengths, 0), + tf.zeros_like(sequence_scores), + sequence_scores, + ) + return sequence_scores + + def _multi_seq_fn() -> TensorLike: + # Compute the scores of the given tag sequence. + unary_scores = crf_unary_score(tag_indices, sequence_lengths, inputs) + binary_scores = crf_binary_score( + tag_indices, sequence_lengths, transition_params + ) + sequence_scores = unary_scores + binary_scores + return sequence_scores + + return tf.cond(tf.equal(tf.shape(inputs)[1], 1), _single_seq_fn, _multi_seq_fn) + + +def crf_forward( + inputs: TensorLike, + state: TensorLike, + transition_params: TensorLike, + sequence_lengths: TensorLike, +) -> tf.Tensor: + """Computes the alpha values in a linear-chain CRF. + + See http://www.cs.columbia.edu/~mcollins/fb.pdf for reference. + + Args: + inputs: A [batch_size, num_tags] matrix of unary potentials. + state: A [batch_size, num_tags] matrix containing the previous alpha + values. + transition_params: A [num_tags, num_tags] matrix of binary potentials. + This matrix is expanded into a [1, num_tags, num_tags] in preparation + for the broadcast summation occurring within the cell. + sequence_lengths: A [batch_size] vector of true sequence lengths. + + Returns: + new_alphas: A [batch_size, num_tags] matrix containing the + new alpha values. 
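+
+ The recursion evaluated by the scan below is, in log space,
+ alpha_t(j) = inputs_t(j) + logsumexp_i(alpha_{t-1}(i) + transition_params[i, j]),
+ the standard forward-algorithm update for a linear-chain CRF.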
+ """ + sequence_lengths = tf.cast(sequence_lengths, dtype=tf.int32) + + last_index = tf.maximum( + tf.constant(0, dtype=sequence_lengths.dtype), sequence_lengths - 1 + ) + inputs = tf.transpose(inputs, [1, 0, 2]) + transition_params = tf.expand_dims(transition_params, 0) + + def _scan_fn(_state: TensorLike, _inputs: TensorLike) -> TensorLike: + _state = tf.expand_dims(_state, 2) + transition_scores = _state + transition_params + new_alphas = _inputs + tf.reduce_logsumexp(transition_scores, [1]) + return new_alphas + + all_alphas = tf.transpose(tf.scan(_scan_fn, inputs, state), [1, 0, 2]) + # add first state for sequences of length 1 + all_alphas = tf.concat([tf.expand_dims(state, 1), all_alphas], 1) + + idxs = tf.stack([tf.range(tf.shape(last_index)[0]), last_index], axis=1) + return tf.gather_nd(all_alphas, idxs) + + +def crf_log_norm( + inputs: TensorLike, sequence_lengths: TensorLike, transition_params: TensorLike +) -> tf.Tensor: + """Computes the normalization for a CRF. + + Args: + inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials + to use as input to the CRF layer. + sequence_lengths: A [batch_size] vector of true sequence lengths. + transition_params: A [num_tags, num_tags] transition matrix. + Returns: + log_norm: A [batch_size] vector of normalizers for a CRF. + """ + sequence_lengths = tf.cast(sequence_lengths, dtype=tf.int32) + # Split up the first and rest of the inputs in preparation for the forward + # algorithm. + first_input = tf.slice(inputs, [0, 0, 0], [-1, 1, -1]) + first_input = tf.squeeze(first_input, [1]) + + # If max_seq_len is 1, we skip the algorithm and simply reduce_logsumexp + # over the "initial state" (the unary potentials). + def _single_seq_fn() -> TensorLike: + log_norm = tf.reduce_logsumexp(first_input, [1]) + # Mask `log_norm` of the sequences with length <= zero. + log_norm = tf.where( + tf.less_equal(sequence_lengths, 0), tf.zeros_like(log_norm), log_norm + ) + return log_norm + + def _multi_seq_fn() -> TensorLike: + """Forward computation of alpha values.""" + rest_of_input = tf.slice(inputs, [0, 1, 0], [-1, -1, -1]) + # Compute the alpha values in the forward algorithm in order to get the + # partition function. + + alphas = crf_forward( + rest_of_input, first_input, transition_params, sequence_lengths + ) + log_norm = tf.reduce_logsumexp(alphas, [1]) + # Mask `log_norm` of the sequences with length <= zero. + log_norm = tf.where( + tf.less_equal(sequence_lengths, 0), tf.zeros_like(log_norm), log_norm + ) + return log_norm + + return tf.cond(tf.equal(tf.shape(inputs)[1], 1), _single_seq_fn, _multi_seq_fn) + + +def crf_log_likelihood( + inputs: TensorLike, + tag_indices: TensorLike, + sequence_lengths: TensorLike, + transition_params: Optional[TensorLike] = None, +) -> Tuple[tf.Tensor, tf.Tensor]: + """Computes the log-likelihood of tag sequences in a CRF. + + Args: + inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials + to use as input to the CRF layer. + tag_indices: A [batch_size, max_seq_len] matrix of tag indices for which + we compute the log-likelihood. + sequence_lengths: A [batch_size] vector of true sequence lengths. + transition_params: A [num_tags, num_tags] transition matrix, + if available. + Returns: + log_likelihood: A [batch_size] `Tensor` containing the log-likelihood of + each example, given the sequence of tag indices. + transition_params: A [num_tags, num_tags] transition matrix. This is + either provided by the caller or created in this function. 
+ """ + inputs = tf.convert_to_tensor(inputs) + + num_tags = inputs.shape[2] + + # cast type to handle different types + tag_indices = tf.cast(tag_indices, dtype=tf.int32) + sequence_lengths = tf.cast(sequence_lengths, dtype=tf.int32) + + if transition_params is None: + initializer = tf.keras.initializers.GlorotUniform() + transition_params = tf.Variable( + initializer([num_tags, num_tags]), "transitions" + ) + transition_params = tf.cast(transition_params, inputs.dtype) + sequence_scores = crf_sequence_score( + inputs, tag_indices, sequence_lengths, transition_params + ) + log_norm = crf_log_norm(inputs, sequence_lengths, transition_params) + + # Normalize the scores to get the log-likelihood per example. + log_likelihood = sequence_scores - log_norm + return log_likelihood, transition_params diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py index 78501aebe4dc..6ba29ec2a32f 100644 --- a/rasa/utils/tensorflow/layers.py +++ b/rasa/utils/tensorflow/layers.py @@ -1,7 +1,6 @@ import logging from typing import List, Optional, Text, Tuple, Callable, Union, Any import tensorflow as tf -import tensorflow_addons as tfa # TODO: The following is not (yet) available via tf.keras from keras.utils.control_flow_util import smart_cond @@ -21,13 +20,13 @@ from rasa.shared.nlu.constants import FEATURE_TYPE_SENTENCE, FEATURE_TYPE_SEQUENCE from rasa.shared.nlu.constants import TEXT, INTENT, ACTION_NAME, ACTION_TEXT +from rasa.utils.tensorflow.metrics import F1Score from rasa.utils.tensorflow.exceptions import TFLayerConfigException import rasa.utils.tensorflow.layers_utils as layers_utils +from rasa.utils.tensorflow.crf import crf_log_likelihood logger = logging.getLogger(__name__) -# https://github.com/tensorflow/addons#gpu-and-cpu-custom-ops-1 -tfa.options.TF_ADDONS_PY_OPS = True POSSIBLE_ATTRIBUTES = [ TEXT, @@ -590,7 +589,7 @@ def __init__( self.num_tags = num_tags self.scale_loss = scale_loss self.transition_regularizer = tf.keras.regularizers.l2(reg_lambda) - self.f1_score_metric = tfa.metrics.F1Score( + self.f1_score_metric = F1Score( num_classes=num_tags - 1, # `0` prediction is not a prediction average="micro", ) @@ -653,7 +652,7 @@ def loss( given the sequence of tag indices. """ - log_likelihood, _ = tfa.text.crf.crf_log_likelihood( + log_likelihood, _ = crf_log_likelihood( logits, tag_indices, sequence_lengths, self.transition_params ) loss = -log_likelihood diff --git a/rasa/utils/tensorflow/metrics.py b/rasa/utils/tensorflow/metrics.py new file mode 100644 index 000000000000..7face21ff2b2 --- /dev/null +++ b/rasa/utils/tensorflow/metrics.py @@ -0,0 +1,282 @@ +import tensorflow as tf +from tensorflow.keras import backend as K +from tensorflow.types.experimental import TensorLike +from typing import Any, Dict, Optional + + +# original code taken from +# https://github.com/tensorflow/addons/blob/f30df4322b5580b3e5946530a60f7126035dd73b/tensorflow_addons/metrics/f_scores.py +# (modified to our neeeds) + + +class FBetaScore(tf.keras.metrics.Metric): + r"""Computes F-Beta score. + + It is the weighted harmonic mean of precision + and recall. Output range is `[0, 1]`. Works for + both multi-class and multi-label classification. + + $$ + F_{\beta} = (1 + \beta^2) * \frac{\textrm{precision} * \textrm{recall}} + {(\beta^2 \cdot \textrm{precision}) + \textrm{recall}} + $$ + + Args: + num_classes: Number of unique classes in the dataset. + average: Type of averaging to be performed on data. + Acceptable values are `None`, `micro`, `macro` and + `weighted`. Default value is None. 
+ beta: Determines the weight of precision and recall + in harmonic mean. Determines the weight given to the + precision and recall. Default value is 1. + threshold: Elements of `y_pred` greater than threshold are + converted to be 1, and the rest 0. If threshold is + None, the argmax is converted to 1, and the rest 0. + name: (Optional) String name of the metric instance. + dtype: (Optional) Data type of the metric result. + + Returns: + F-Beta Score: float. + + Raises: + ValueError: If the `average` has values other than + `[None, 'micro', 'macro', 'weighted']`. + + ValueError: If the `beta` value is less than or equal + to 0. + + `average` parameter behavior: + + None: Scores for each class are returned. + + micro: True positivies, false positives and + false negatives are computed globally. + + macro: True positivies, false positives and + false negatives are computed for each class + and their unweighted mean is returned. + + weighted: Metrics are computed for each class + and returns the mean weighted by the + number of true instances in each class. + + Usage: + + >>> metric = tfa.metrics.FBetaScore(num_classes=3, beta=2.0, threshold=0.5) + >>> y_true = np.array([[1, 1, 1], + ... [1, 0, 0], + ... [1, 1, 0]], np.int32) + >>> y_pred = np.array([[0.2, 0.6, 0.7], + ... [0.2, 0.6, 0.6], + ... [0.6, 0.8, 0.0]], np.float32) + >>> metric.update_state(y_true, y_pred) + >>> result = metric.result() + >>> result.numpy() + array([0.3846154 , 0.90909094, 0.8333334 ], dtype=float32) + """ + + def __init__( + self, + num_classes: TensorLike, + average: Optional[str] = None, + beta: TensorLike = 1.0, + threshold: Optional[TensorLike] = None, + name: str = "fbeta_score", + dtype: Any = None, + **kwargs: Any, + ) -> None: + super().__init__(name=name, dtype=dtype) + + if average not in (None, "micro", "macro", "weighted"): + raise ValueError( + "Unknown average type. 
Acceptable values " + "are: [None, 'micro', 'macro', 'weighted']" + ) + + if not isinstance(beta, float): + raise TypeError("The value of beta should be a python float") + + if beta <= 0.0: + raise ValueError("beta value should be greater than zero") + + if threshold is not None: + if not isinstance(threshold, float): + raise TypeError("The value of threshold should be a python float") + if threshold > 1.0 or threshold <= 0.0: + raise ValueError("threshold should be between 0 and 1") + + self.num_classes = num_classes + self.average = average + self.beta = beta + self.threshold = threshold + self.axis = None + self.init_shape = [] + + if self.average != "micro": + self.axis = 0 + self.init_shape = [self.num_classes] + + def _zero_wt_init(name: Any) -> Any: + return self.add_weight( + name, shape=self.init_shape, initializer="zeros", dtype=self.dtype + ) + + self.true_positives = _zero_wt_init("true_positives") + self.false_positives = _zero_wt_init("false_positives") + self.false_negatives = _zero_wt_init("false_negatives") + self.weights_intermediate = _zero_wt_init("weights_intermediate") + + def update_state( + self, + y_true: TensorLike, + y_pred: TensorLike, + sample_weight: Optional[TensorLike] = None, + ) -> None: + if self.threshold is None: + threshold = tf.reduce_max(y_pred, axis=-1, keepdims=True) + # make sure [0, 0, 0] doesn't become [1, 1, 1] + # Use abs(x) > eps, instead of x != 0 to check for zero + y_pred = tf.logical_and(y_pred >= threshold, tf.abs(y_pred) > 1e-12) + else: + y_pred = y_pred > self.threshold + + y_true = tf.cast(y_true, self.dtype) + y_pred = tf.cast(y_pred, self.dtype) + + def _weighted_sum( + val: TensorLike, sample_weight: Optional[TensorLike] + ) -> TensorLike: + if sample_weight is not None: + val = tf.math.multiply(val, tf.expand_dims(sample_weight, 1)) + return tf.reduce_sum(val, axis=self.axis) + + self.true_positives.assign_add(_weighted_sum(y_pred * y_true, sample_weight)) + self.false_positives.assign_add( + _weighted_sum(y_pred * (1 - y_true), sample_weight) + ) + self.false_negatives.assign_add( + _weighted_sum((1 - y_pred) * y_true, sample_weight) + ) + self.weights_intermediate.assign_add(_weighted_sum(y_true, sample_weight)) + + def result(self) -> TensorLike: + precision = tf.math.divide_no_nan( + self.true_positives, self.true_positives + self.false_positives + ) + recall = tf.math.divide_no_nan( + self.true_positives, self.true_positives + self.false_negatives + ) + + mul_value = precision * recall + add_value = (tf.math.square(self.beta) * precision) + recall + mean = tf.math.divide_no_nan(mul_value, add_value) + f1_score = mean * (1 + tf.math.square(self.beta)) + + if self.average == "weighted": + weights = tf.math.divide_no_nan( + self.weights_intermediate, tf.reduce_sum(self.weights_intermediate) + ) + f1_score = tf.reduce_sum(f1_score * weights) + + elif self.average is not None: # [micro, macro] + f1_score = tf.reduce_mean(f1_score) + + return f1_score + + def get_config(self) -> Dict[str, Any]: + """Returns the serializable config of the metric.""" + + config = { + "num_classes": self.num_classes, + "average": self.average, + "beta": self.beta, + "threshold": self.threshold, + } + + base_config = super().get_config() + return {**base_config, **config} + + def reset_state(self) -> None: + reset_value = tf.zeros(self.init_shape, dtype=self.dtype) + K.batch_set_value([(v, reset_value) for v in self.variables]) + + def reset_states(self) -> None: + # Backwards compatibility alias of `reset_state`. 
New classes should + # only implement `reset_state`. + # Required in Tensorflow < 2.5.0 + return self.reset_state() + + +class F1Score(FBetaScore): + r"""Computes F-1 Score. + + It is the harmonic mean of precision and recall. + Output range is `[0, 1]`. Works for both multi-class + and multi-label classification. + + $$ + F_1 = 2 \cdot \frac{\textrm{precision} \cdot \textrm{recall}}{\textrm{precision} + + \textrm{recall}} + $$ + + Args: + num_classes: Number of unique classes in the dataset. + average: Type of averaging to be performed on data. + Acceptable values are `None`, `micro`, `macro` + and `weighted`. Default value is None. + threshold: Elements of `y_pred` above threshold are + considered to be 1, and the rest 0. If threshold is + None, the argmax is converted to 1, and the rest 0. + name: (Optional) String name of the metric instance. + dtype: (Optional) Data type of the metric result. + + Returns: + F-1 Score: float. + + Raises: + ValueError: If the `average` has values other than + [None, 'micro', 'macro', 'weighted']. + + `average` parameter behavior: + None: Scores for each class are returned + + micro: True positivies, false positives and + false negatives are computed globally. + + macro: True positivies, false positives and + false negatives are computed for each class + and their unweighted mean is returned. + + weighted: Metrics are computed for each class + and returns the mean weighted by the + number of true instances in each class. + + Usage: + + >>> metric = tfa.metrics.F1Score(num_classes=3, threshold=0.5) + >>> y_true = np.array([[1, 1, 1], + ... [1, 0, 0], + ... [1, 1, 0]], np.int32) + >>> y_pred = np.array([[0.2, 0.6, 0.7], + ... [0.2, 0.6, 0.6], + ... [0.6, 0.8, 0.0]], np.float32) + >>> metric.update_state(y_true, y_pred) + >>> result = metric.result() + >>> result.numpy() + array([0.5 , 0.8 , 0.6666667], dtype=float32) + """ + + def __init__( + self, + num_classes: TensorLike, + average: str = None, + threshold: Optional[TensorLike] = None, + name: str = "f1_score", + dtype: Any = None, + ): + super().__init__(num_classes, average, 1.0, threshold, name=name, dtype=dtype) + + def get_config(self) -> Dict[str, Any]: + base_config = super().get_config() + del base_config["beta"] + return base_config diff --git a/tests/utils/tensorflow/test_crf.py b/tests/utils/tensorflow/test_crf.py new file mode 100644 index 000000000000..593327f2bc5d --- /dev/null +++ b/tests/utils/tensorflow/test_crf.py @@ -0,0 +1,233 @@ +"""Tests for CRF.""" + +# original code taken from +# https://github.com/tensorflow/addons/blob/master/tensorflow_addons/text/tests/crf_test.py +# (modified to our neeeds) + +import itertools + +import pytest +import numpy as np +import tensorflow as tf + +from rasa.utils.tensorflow.crf import ( + crf_sequence_score, + crf_unary_score, + crf_binary_score, + crf_log_norm, + crf_log_likelihood, +) + + +def calculate_sequence_score(inputs, transition_params, tag_indices, sequence_lengths): + expected_unary_score = sum( + inputs[i][tag_indices[i]] for i in range(sequence_lengths) + ) + expected_binary_score = sum( + transition_params[tag_indices[i], tag_indices[i + 1]] + for i in range(sequence_lengths - 1) + ) + return expected_unary_score + expected_binary_score + + +def brute_force_decode(sequence_lengths, inputs, transition_params): + num_words = inputs.shape[0] + num_tags = inputs.shape[1] + + all_sequence_scores = [] + all_sequences = [] + + tag_indices_iterator = itertools.product(range(num_tags), repeat=sequence_lengths) + inputs = 
tf.expand_dims(inputs, 0) + sequence_lengths = tf.expand_dims(sequence_lengths, 0) + transition_params = tf.constant(transition_params) + + # Compare the dynamic program with brute force computation. + for tag_indices in tag_indices_iterator: + tag_indices = list(tag_indices) + tag_indices.extend([0] * (num_words - sequence_lengths)) + all_sequences.append(tag_indices) + sequence_score = crf_sequence_score( + inputs=inputs, + tag_indices=tf.expand_dims(tag_indices, 0), + sequence_lengths=sequence_lengths, + transition_params=transition_params, + ) + sequence_score = tf.squeeze(sequence_score, [0]) + all_sequence_scores.append(sequence_score) + + expected_max_sequence_index = np.argmax(all_sequence_scores) + expected_max_sequence = all_sequences[expected_max_sequence_index] + expected_max_score = all_sequence_scores[expected_max_sequence_index] + return expected_max_sequence, expected_max_score + + +@pytest.mark.parametrize("dtype", [np.float16, np.float32]) +def test_crf_sequence_score(dtype): + transition_params = np.array([[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=dtype) + # Test both the length-1 and regular cases. + sequence_lengths_list = [ + np.array(3, dtype=np.int32), + np.array(1, dtype=np.int32), + ] + inputs_list = [ + np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], dtype=dtype), + np.array([[4, 5, -3]], dtype=dtype), + ] + tag_indices_list = [ + np.array([1, 2, 1, 0], dtype=np.int32), + np.array([1], dtype=np.int32), + ] + for sequence_lengths, inputs, tag_indices in zip( + sequence_lengths_list, inputs_list, tag_indices_list + ): + sequence_score = crf_sequence_score( + inputs=tf.expand_dims(inputs, 0), + tag_indices=tf.expand_dims(tag_indices, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + transition_params=tf.constant(transition_params), + ) + sequence_score = tf.squeeze(sequence_score, [0]) + + expected_sequence_score = calculate_sequence_score( + inputs, transition_params, tag_indices, sequence_lengths + ) + np.testing.assert_allclose(sequence_score, expected_sequence_score) + + +@pytest.mark.parametrize("dtype", [np.float16, np.float32]) +def test_crf_unary_score(dtype): + inputs = np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], dtype=dtype) + for dtype in (np.int32, np.int64): + tag_indices = np.array([1, 2, 1, 0], dtype=dtype) + sequence_lengths = np.array(3, dtype=np.int32) + unary_score = crf_unary_score( + tag_indices=tf.expand_dims(tag_indices, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + inputs=tf.expand_dims(inputs, 0), + ) + unary_score = tf.squeeze(unary_score, [0]) + expected_unary_score = sum( + inputs[i][tag_indices[i]] for i in range(sequence_lengths) + ) + np.testing.assert_allclose(unary_score, expected_unary_score) + + +@pytest.mark.parametrize("dtype", [np.float16, np.float32]) +def test_crf_binary_score(dtype): + tag_indices = np.array([1, 2, 1, 0], dtype=np.int32) + transition_params = np.array([[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=dtype) + sequence_lengths = np.array(3, dtype=np.int32) + binary_score = crf_binary_score( + tag_indices=tf.expand_dims(tag_indices, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + transition_params=tf.constant(transition_params), + ) + binary_score = tf.squeeze(binary_score, [0]) + expected_binary_score = sum( + transition_params[tag_indices[i], tag_indices[i + 1]] + for i in range(sequence_lengths - 1) + ) + np.testing.assert_allclose(binary_score, expected_binary_score) + + +@pytest.mark.parametrize("dtype", [np.float16, np.float32]) +def 
test_crf_log_norm(dtype): + transition_params = np.array([[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=dtype) + # Test both the length-1 and regular cases. + sequence_lengths_list = [ + np.array(3, dtype=np.int32), + np.array(1, dtype=np.int64), + ] + inputs_list = [ + np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], dtype=dtype), + np.array([[3, -1, 3]], dtype=dtype), + ] + tag_indices_list = [ + np.array([1, 2, 1, 0], dtype=np.int32), + np.array([2], dtype=np.int32), + ] + + for sequence_lengths, inputs, tag_indices in zip( + sequence_lengths_list, inputs_list, tag_indices_list + ): + num_words = inputs.shape[0] + num_tags = inputs.shape[1] + all_sequence_scores = [] + + # Compare the dynamic program with brute force computation. + for tag_indices in itertools.product(range(num_tags), repeat=sequence_lengths): + tag_indices = list(tag_indices) + tag_indices.extend([0] * (num_words - sequence_lengths)) + all_sequence_scores.append( + crf_sequence_score( + inputs=tf.expand_dims(inputs, 0), + tag_indices=tf.expand_dims(tag_indices, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + transition_params=tf.constant(transition_params), + ) + ) + + brute_force_log_norm = tf.reduce_logsumexp(all_sequence_scores) + log_norm = crf_log_norm( + inputs=tf.expand_dims(inputs, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + transition_params=tf.constant(transition_params), + ) + log_norm = tf.squeeze(log_norm, [0]) + + np.testing.assert_allclose(log_norm, brute_force_log_norm) + + +@pytest.mark.parametrize("dtype", [np.float16, np.float32]) +def test_crf_log_norm_zero_seq_length(dtype): + """Test `crf_log_norm` when `sequence_lengths` contains one or more + zeros.""" + inputs = tf.constant(np.ones([2, 10, 5], dtype=dtype)) + transition_params = tf.constant(np.ones([5, 5], dtype=dtype)) + sequence_lengths = tf.constant(np.zeros([2], dtype=np.int32)) + expected_log_norm = np.zeros([2], dtype=dtype) + log_norm = crf_log_norm(inputs, sequence_lengths, transition_params) + np.testing.assert_allclose(log_norm, expected_log_norm) + + +@pytest.mark.parametrize("dtype", [np.float32]) +def test_crf_log_likelihood(dtype): + inputs = np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], dtype=dtype) + transition_params = np.array([[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=dtype) + sequence_lengths = np.array(3, dtype=np.int32) + + num_words = inputs.shape[0] + num_tags = inputs.shape[1] + all_sequence_log_likelihoods = [] + + # Make sure all probabilities sum to 1. 
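+ # Since exp(log_likelihood) summed over every possible tag sequence must be 1,
+ # the logsumexp of all sequence log-likelihoods below should be ~0.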
+ for tag_indices in itertools.product(range(num_tags), repeat=sequence_lengths): + tag_indices = list(tag_indices) + tag_indices.extend([0] * (num_words - sequence_lengths)) + sequence_log_likelihood, _ = crf_log_likelihood( + inputs=tf.expand_dims(inputs, 0), + tag_indices=tf.expand_dims(tag_indices, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + transition_params=tf.constant(transition_params), + ) + all_sequence_log_likelihoods.append(sequence_log_likelihood) + total_log_likelihood = tf.reduce_logsumexp(all_sequence_log_likelihoods) + np.testing.assert_allclose(total_log_likelihood, 0.0, rtol=1e-6, atol=1e-6) + + # check if `transition_params = None` raises an error + crf_log_likelihood( + inputs=tf.expand_dims(inputs, 0), + tag_indices=tf.expand_dims(tag_indices, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + ) + + +def test_different_dtype(): + inputs = np.ones([16, 20, 5], dtype=np.float32) + tags = tf.convert_to_tensor(np.ones([16, 20], dtype=np.int64)) + seq_lens = np.ones([16], dtype=np.int64) * 20 + + loss, _ = crf_log_likelihood( + inputs=inputs, tag_indices=tags, sequence_lengths=seq_lens + ) diff --git a/tests/utils/tensorflow/test_metrics.py b/tests/utils/tensorflow/test_metrics.py new file mode 100644 index 000000000000..9d6ffb5c2e4a --- /dev/null +++ b/tests/utils/tensorflow/test_metrics.py @@ -0,0 +1,205 @@ +"""Tests F beta metrics.""" + +# original code taken from +# https://github.com/tensorflow/addons/blob/master/tensorflow_addons/metrics/tests/f_scores_test.py +# (modified to our neeeds) + +import numpy as np +import pytest +import tensorflow as tf +from rasa.utils.tensorflow.metrics import FBetaScore, F1Score + + +def test_config_fbeta(): + fbeta_obj = FBetaScore(num_classes=3, beta=0.5, threshold=0.3, average=None) + assert fbeta_obj.beta == 0.5 + assert fbeta_obj.average is None + assert fbeta_obj.threshold == 0.3 + assert fbeta_obj.num_classes == 3 + assert fbeta_obj.dtype == tf.float32 + + # Check save and restore config + fbeta_obj2 = FBetaScore.from_config(fbeta_obj.get_config()) + assert fbeta_obj2.beta == 0.5 + assert fbeta_obj2.average is None + assert fbeta_obj2.threshold == 0.3 + assert fbeta_obj2.num_classes == 3 + assert fbeta_obj2.dtype == tf.float32 + + +def _test_tf(avg, beta, act, pred, sample_weights, threshold): + act = tf.constant(act, tf.float32) + pred = tf.constant(pred, tf.float32) + + fbeta = FBetaScore(3, avg, beta, threshold) + fbeta.update_state(act, pred, sample_weights) + return fbeta.result().numpy() + + +def _test_fbeta_score(actuals, preds, sample_weights, avg, beta_val, result, threshold): + tf_score = _test_tf(avg, beta_val, actuals, preds, sample_weights, threshold) + np.testing.assert_allclose(tf_score, result, atol=1e-7, rtol=1e-6) + + +def test_fbeta_perfect_score(): + preds = [[0.7, 0.7, 0.7], [1, 0, 0], [0.9, 0.8, 0]] + actuals = [[1, 1, 1], [1, 0, 0], [1, 1, 0]] + + for avg_val in ["micro", "macro", "weighted"]: + for beta in [0.5, 1.0, 2.0]: + _test_fbeta_score(actuals, preds, None, avg_val, beta, 1.0, 0.66) + + +def test_fbeta_worst_score(): + preds = [[0.7, 0.7, 0.7], [1, 0, 0], [0.9, 0.8, 0]] + actuals = [[0, 0, 0], [0, 1, 0], [0, 0, 1]] + + for avg_val in ["micro", "macro", "weighted"]: + for beta in [0.5, 1.0, 2.0]: + _test_fbeta_score(actuals, preds, None, avg_val, beta, 0.0, 0.66) + + +@pytest.mark.parametrize( + "avg_val, beta, result", + [ + (None, 0.5, [0.71428573, 0.5, 0.833334]), + (None, 1.0, [0.8, 0.5, 0.6666667]), + (None, 2.0, [0.9090904, 0.5, 0.555556]), + ("micro", 0.5, 
0.6666667), + ("micro", 1.0, 0.6666667), + ("micro", 2.0, 0.6666667), + ("macro", 0.5, 0.6825397), + ("macro", 1.0, 0.6555555), + ("macro", 2.0, 0.6548822), + ("weighted", 0.5, 0.6825397), + ("weighted", 1.0, 0.6555555), + ("weighted", 2.0, 0.6548822), + ], +) +def test_fbeta_random_score(avg_val, beta, result): + preds = [[0.7, 0.7, 0.7], [1, 0, 0], [0.9, 0.8, 0]] + actuals = [[0, 0, 1], [1, 1, 0], [1, 1, 1]] + _test_fbeta_score(actuals, preds, None, avg_val, beta, result, 0.66) + + +@pytest.mark.parametrize( + "avg_val, beta, result", + [ + (None, 0.5, [0.9090904, 0.555556, 1.0]), + (None, 1.0, [0.8, 0.6666667, 1.0]), + (None, 2.0, [0.71428573, 0.833334, 1.0]), + ("micro", 0.5, 0.833334), + ("micro", 1.0, 0.833334), + ("micro", 2.0, 0.833334), + ("macro", 0.5, 0.821549), + ("macro", 1.0, 0.822222), + ("macro", 2.0, 0.849206), + ("weighted", 0.5, 0.880471), + ("weighted", 1.0, 0.844445), + ("weighted", 2.0, 0.829365), + ], +) +def test_fbeta_random_score_none(avg_val, beta, result): + preds = [ + [0.9, 0.1, 0], + [0.2, 0.6, 0.2], + [0, 0, 1], + [0.4, 0.3, 0.3], + [0, 0.9, 0.1], + [0, 0, 1], + ] + actuals = [[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 0, 0], [1, 0, 0], [0, 0, 1]] + _test_fbeta_score(actuals, preds, None, avg_val, beta, result, None) + + +@pytest.mark.parametrize( + "avg_val, beta, sample_weights, result", + [ + (None, 0.5, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], [0.909091, 0.555556, 1.0]), + (None, 0.5, [1.0, 0.0, 1.0, 1.0, 0.0, 1.0], [1.0, 0.0, 1.0]), + (None, 0.5, [0.5, 1.0, 1.0, 1.0, 0.5, 1.0], [0.9375, 0.714286, 1.0]), + (None, 1.0, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], [0.8, 0.666667, 1.0]), + (None, 1.0, [1.0, 0.0, 1.0, 1.0, 0.0, 1.0], [1.0, 0.0, 1.0]), + (None, 1.0, [0.5, 1.0, 1.0, 1.0, 0.5, 1.0], [0.857143, 0.8, 1.0]), + (None, 2.0, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], [0.714286, 0.833333, 1.0]), + (None, 2.0, [1.0, 0.0, 1.0, 1.0, 0.0, 1.0], [1.0, 0.0, 1.0]), + (None, 2.0, [0.5, 1.0, 1.0, 1.0, 0.5, 1.0], [0.789474, 0.909091, 1.0]), + ("micro", 0.5, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 0.833333), + ("micro", 0.5, [1.0, 0.0, 1.0, 1.0, 0.0, 1.0], 1.0), + ("micro", 0.5, [0.5, 1.0, 1.0, 1.0, 0.5, 1.0], 0.9), + ("micro", 1.0, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 0.833333), + ("micro", 1.0, [1.0, 0.0, 1.0, 1.0, 0.0, 1.0], 1.0), + ("micro", 1.0, [0.5, 1.0, 1.0, 1.0, 0.5, 1.0], 0.9), + ("micro", 2.0, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 0.833333), + ("micro", 2.0, [1.0, 0.0, 1.0, 1.0, 0.0, 1.0], 1.0), + ("micro", 2.0, [0.5, 1.0, 1.0, 1.0, 0.5, 1.0], 0.9), + ("macro", 0.5, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 0.821549), + ("macro", 0.5, [1.0, 0.0, 1.0, 1.0, 0.0, 1.0], 0.666667), + ("macro", 0.5, [0.5, 1.0, 1.0, 1.0, 0.5, 1.0], 0.883929), + ("macro", 1.0, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 0.822222), + ("macro", 1.0, [1.0, 0.0, 1.0, 1.0, 0.0, 1.0], 0.666667), + ("macro", 1.0, [0.5, 1.0, 1.0, 1.0, 0.5, 1.0], 0.885714), + ("macro", 2.0, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 0.849206), + ("macro", 2.0, [1.0, 0.0, 1.0, 1.0, 0.0, 1.0], 0.666667), + ("macro", 2.0, [0.5, 1.0, 1.0, 1.0, 0.5, 1.0], 0.899522), + ("weighted", 0.5, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 0.880471), + ("weighted", 0.5, [1.0, 0.0, 1.0, 1.0, 0.0, 1.0], 1.0), + ("weighted", 0.5, [0.5, 1.0, 1.0, 1.0, 0.5, 1.0], 0.917857), + ("weighted", 1.0, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 0.844444), + ("weighted", 1.0, [1.0, 0.0, 1.0, 1.0, 0.0, 1.0], 1.0), + ("weighted", 1.0, [0.5, 1.0, 1.0, 1.0, 0.5, 1.0], 0.902857), + ("weighted", 2.0, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 0.829365), + ("weighted", 2.0, [1.0, 0.0, 1.0, 1.0, 0.0, 1.0], 1.0), + ("weighted", 2.0, [0.5, 1.0, 1.0, 1.0, 0.5, 
1.0], 0.897608), + ], +) +def test_fbeta_weighted_random_score_none(avg_val, beta, sample_weights, result): + preds = [ + [0.9, 0.1, 0], + [0.2, 0.6, 0.2], + [0, 0, 1], + [0.4, 0.3, 0.3], + [0, 0.9, 0.1], + [0, 0, 1], + ] + actuals = [[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 0, 0], [1, 0, 0], [0, 0, 1]] + _test_fbeta_score(actuals, preds, sample_weights, avg_val, beta, result, None) + + +def test_eq(): + f1 = F1Score(3) + fbeta = FBetaScore(3, beta=1.0) + + preds = [ + [0.9, 0.1, 0], + [0.2, 0.6, 0.2], + [0, 0, 1], + [0.4, 0.3, 0.3], + [0, 0.9, 0.1], + [0, 0, 1], + ] + actuals = [[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 0, 0], [1, 0, 0], [0, 0, 1]] + + fbeta.update_state(actuals, preds) + f1.update_state(actuals, preds) + np.testing.assert_allclose(fbeta.result().numpy(), f1.result().numpy()) + + +def test_sample_eq(): + f1 = F1Score(3) + f1_weighted = F1Score(3) + + preds = [ + [0.9, 0.1, 0], + [0.2, 0.6, 0.2], + [0, 0, 1], + [0.4, 0.3, 0.3], + [0, 0.9, 0.1], + [0, 0, 1], + ] + actuals = [[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 0, 0], [1, 0, 0], [0, 0, 1]] + sample_weights = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] + + f1.update_state(actuals, preds) + f1_weighted(actuals, preds, sample_weights) + np.testing.assert_allclose(f1.result().numpy(), f1_weighted.result().numpy())
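
A minimal usage sketch of the in-tree replacements for the removed
tensorflow-addons APIs (not part of the patch itself; the values are taken from
the F1Score docstring example and the test_different_dtype shapes above):

    import numpy as np

    from rasa.utils.tensorflow.crf import crf_log_likelihood
    from rasa.utils.tensorflow.metrics import F1Score

    # Replacement for tfa.metrics.F1Score.
    metric = F1Score(num_classes=3, threshold=0.5)
    y_true = np.array([[1, 1, 1], [1, 0, 0], [1, 1, 0]], np.int32)
    y_pred = np.array([[0.2, 0.6, 0.7], [0.2, 0.6, 0.6], [0.6, 0.8, 0.0]], np.float32)
    metric.update_state(y_true, y_pred)
    print(metric.result().numpy())  # ~[0.5, 0.8, 0.6666667]

    # Replacement for tfa.text.crf.crf_log_likelihood.
    inputs = np.ones([16, 20, 5], dtype=np.float32)  # [batch, max_seq_len, num_tags]
    tags = np.ones([16, 20], dtype=np.int64)         # [batch, max_seq_len]
    seq_lens = np.ones([16], dtype=np.int64) * 20    # [batch] true lengths
    log_likelihood, transition_params = crf_log_likelihood(inputs, tags, seq_lens)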