From f54f4632b708d52741c9e7e73292ea9a488b41b1 Mon Sep 17 00:00:00 2001
From: ElenaKhaustova <157851531+ElenaKhaustova@users.noreply.github.com>
Date: Thu, 18 Jul 2024 14:39:05 +0100
Subject: [PATCH] Pretty printing dataset (#3987)

* Implemented basic __repr__

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Updated __repr__

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Removed __str__

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Updated _describe() for CachedDataset

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Made pretty_repr protected

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Reverted width parameter to default

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Updated printing params

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Updated printing width

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Disable sorting

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Disable sorting

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Updated test_str_representation

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Updated cached dataset tests

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Updated data catalog tests

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Updated core tests

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Updated versioned dataset tests

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Updated tests for lambda dataset

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Updated tests for memory dataset

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Set width to maxsize

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Removed top-level keys sorting

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Updated tests

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Updated release notes

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Decoupled describe from pretty printing

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Returned old __str__ to avoid a breaking change

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Updated tests

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

* Replaced deprecation comment with TODO

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>

---------

Signed-off-by: Elena Khaustova <ymax70rus@gmail.com>
Co-authored-by: Nok Lam Chan <nok.lam.chan@quantumblack.com>
---
 RELEASE.md                      |  1 +
 kedro/io/cached_dataset.py      | 10 +++++++---
 kedro/io/core.py                | 21 +++++++++++++++++++++
 tests/io/test_cached_dataset.py |  8 ++++++++
 tests/io/test_core.py           | 15 +++++++++++++--
 tests/io/test_lambda_dataset.py | 25 +++++++++++++++++++++++++
 tests/io/test_memory_dataset.py | 28 ++++++++++++++++++++++++++--
 7 files changed, 101 insertions(+), 7 deletions(-)

diff --git a/RELEASE.md b/RELEASE.md
index bab921cd3b..d714f50ddb 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -9,6 +9,7 @@
 * Fixed a bug in the `DataCatalog` `shallow_copy()` method to ensure it returns the type of the used catalog and doesn't cast it to `DataCatalog`.
 * Implemented key completion support for accessing datasets in the `DataCatalog`.
 * Made [kedro-telemetry](https://github.com/kedro-org/kedro-plugins/tree/main/kedro-telemetry) a core dependency.
+* Implemented dataset pretty printing.
 
 ## Breaking changes to the API
 
diff --git a/kedro/io/cached_dataset.py b/kedro/io/cached_dataset.py
index a284f2aed0..2ca0bac28b 100644
--- a/kedro/io/cached_dataset.py
+++ b/kedro/io/cached_dataset.py
@@ -93,10 +93,14 @@ def _from_config(config: dict, version: Version | None) -> AbstractDataset:
         return AbstractDataset.from_config("_cached", config)
 
     def _describe(self) -> dict[str, Any]:
-        return {
-            "dataset": self._dataset._describe(),
-            "cache": self._cache._describe(),
+        return {"dataset": self._dataset._describe(), "cache": self._cache._describe()}
+
+    def __repr__(self) -> str:
+        """Build the repr from each wrapped object's own ``_pretty_repr`` output."""
+        dataset, cache = self._dataset, self._cache
+        dataset_repr = dataset._pretty_repr(dataset._describe())
+        cache_repr = cache._pretty_repr(cache._describe())  # cache's own class
+        return self._pretty_repr({"dataset": dataset_repr, "cache": cache_repr})
 
     def _load(self) -> Any:
         data = self._cache.load() if self._cache.exists() else self._dataset.load()
diff --git a/kedro/io/core.py b/kedro/io/core.py
index ff388a50ed..53c2000e35 100644
--- a/kedro/io/core.py
+++ b/kedro/io/core.py
@@ -7,7 +7,9 @@
 import copy
 import logging
 import os
+import pprint
 import re
+import sys
 import warnings
 from collections import namedtuple
 from datetime import datetime, timezone
@@ -228,6 +230,7 @@ def save(self, data: _DI) -> None:
             raise DatasetError(message) from exc
 
     def __str__(self) -> str:
+        # TODO: Replace with __repr__ implementation in 0.20.0 release.
         def _to_str(obj: Any, is_root: bool = False) -> str:
             """Returns a string representation where
             1. The root level (i.e. the Dataset.__init__ arguments) are
@@ -254,6 +257,24 @@ def _to_str(obj: Any, is_root: bool = False) -> str:
 
         return f"{type(self).__name__}({_to_str(self._describe(), True)})"
 
+    def _pretty_repr(self, object_description: dict[str, Any]) -> str:
+        """Format ``object_description`` as ``module.Class(key=value, ...)``.
+
+        Entries whose value is ``None`` are omitted and key order is preserved.
+        """
+        formatter = pprint.PrettyPrinter(
+            sort_dicts=False, compact=True, depth=2, width=sys.maxsize
+        )
+        rendered = [
+            f"{key}={formatter.pformat(value)}"
+            for key, value in object_description.items()
+            if value is not None
+        ]
+        return f"{type(self).__module__}.{type(self).__name__}({', '.join(rendered)})"
+
+    def __repr__(self) -> str:  # module-qualified repr built from _describe()
+        return self._pretty_repr(self._describe())
+
     @abc.abstractmethod
     def _load(self) -> _DO:
         raise NotImplementedError(
diff --git a/tests/io/test_cached_dataset.py b/tests/io/test_cached_dataset.py
index f49820f11d..3305291545 100644
--- a/tests/io/test_cached_dataset.py
+++ b/tests/io/test_cached_dataset.py
@@ -125,6 +125,14 @@ def test_pickle(self, cached_ds, caplog):
         _ = pickle.dumps(cached_ds)
         assert caplog.records[0].message == f"{cached_ds}: clearing cache to pickle."
 
+    def test_repr(self):
+        # Nested reprs are strings, so pprint wraps them in an extra quote layer.
+        assert repr(CachedDataset(MemoryDataset(42))) == (
+            """kedro.io.cached_dataset.CachedDataset("""
+            """dataset="kedro.io.memory_dataset.MemoryDataset(data='<int>')", """
+            """cache='kedro.io.memory_dataset.MemoryDataset()')"""
+        )
+
     def test_str(self):
         assert (
             str(CachedDataset(MemoryDataset(42))) == "CachedDataset(cache={}, "
diff --git a/tests/io/test_core.py b/tests/io/test_core.py
index 175a869f62..c2f07a106c 100644
--- a/tests/io/test_core.py
+++ b/tests/io/test_core.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import pprint
 import shutil
 from decimal import Decimal
 from fractions import Fraction
@@ -206,11 +207,21 @@ def dummy_data():
 class TestCoreFunctions:
     @pytest.mark.parametrize("var", [1, True] + FALSE_BUILTINS)
     def test_str_representation(self, var):
-        filepath = "."
-        assert str(MyDataset(var=var)) == f"MyDataset(filepath={filepath}, var={var})"
+        var_str = pprint.pformat(var)
+        filepath_str = pprint.pformat(PurePosixPath("."))
+        assert str(MyDataset(var=var)) == f"MyDataset(filepath=., var={var})"
+        assert (
+            repr(MyDataset(var=var))
+            == f"tests.io.test_core.MyDataset(filepath={filepath_str}, var={var_str})"
+        )
 
     def test_str_representation_none(self):
         assert str(MyDataset()) == "MyDataset(filepath=.)"
+        filepath_str = pprint.pformat(PurePosixPath("."))
+        assert (
+            repr(MyDataset())
+            == f"tests.io.test_core.MyDataset(filepath={filepath_str})"
+        )
 
     def test_get_filepath_str(self):
         path = get_filepath_str(PurePosixPath("example.com/test.csv"), "http")
diff --git a/tests/io/test_lambda_dataset.py b/tests/io/test_lambda_dataset.py
index cc2bd07cc6..a3072af451 100644
--- a/tests/io/test_lambda_dataset.py
+++ b/tests/io/test_lambda_dataset.py
@@ -31,16 +31,32 @@ def _dummy_release():
     assert "LambdaDataset(load=<tests.io.test_lambda_dataset._dummy_load>)" in str(
         LambdaDataset(_dummy_load, None)
     )
+    assert (
+        "kedro.io.lambda_dataset.LambdaDataset(load='<tests.io.test_lambda_dataset._dummy_load>')"
+        in repr(LambdaDataset(_dummy_load, None))
+    )
     assert "LambdaDataset(save=<tests.io.test_lambda_dataset._dummy_save>)" in str(
         LambdaDataset(None, _dummy_save)
     )
+    assert (
+        "kedro.io.lambda_dataset.LambdaDataset(save='<tests.io.test_lambda_dataset._dummy_save>')"
+        in repr(LambdaDataset(None, _dummy_save))
+    )
     assert "LambdaDataset(exists=<tests.io.test_lambda_dataset._dummy_exists>)" in str(
         LambdaDataset(None, None, _dummy_exists)
     )
+    assert (
+        "kedro.io.lambda_dataset.LambdaDataset(exists='<tests.io.test_lambda_dataset._dummy_exists>')"
+        in repr(LambdaDataset(None, None, _dummy_exists))
+    )
     assert (
         "LambdaDataset(release=<tests.io.test_lambda_dataset._dummy_release>)"
         in str(LambdaDataset(None, None, None, _dummy_release))
     )
+    assert (
+        "kedro.io.lambda_dataset.LambdaDataset(release='<tests.io.test_lambda_dataset._dummy_release>')"
+        in repr(LambdaDataset(None, None, None, _dummy_release))
+    )
 
     # __init__ keys alphabetically sorted, None values not shown
     expected = (
@@ -51,6 +67,15 @@ def _dummy_release():
     actual = str(LambdaDataset(_dummy_load, _dummy_save, _dummy_exists, None))
     assert actual == expected
 
+    # __init__ keys remain in the provided order, None values not shown
+    expected = (
+        "kedro.io.lambda_dataset.LambdaDataset(load='<tests.io.test_lambda_dataset._dummy_load>', "
+        "save='<tests.io.test_lambda_dataset._dummy_save>', "
+        "exists='<tests.io.test_lambda_dataset._dummy_exists>')"
+    )
+    actual = repr(LambdaDataset(_dummy_load, _dummy_save, _dummy_exists, None))
+    assert actual == expected
+
 
 def test_ephemeral_attribute(mocked_dataset):
     assert mocked_dataset._EPHEMERAL is False
diff --git a/tests/io/test_memory_dataset.py b/tests/io/test_memory_dataset.py
index e2b0fd2d83..c2dbe56925 100644
--- a/tests/io/test_memory_dataset.py
+++ b/tests/io/test_memory_dataset.py
@@ -141,8 +141,14 @@ def test_saving_none(self):
     @pytest.mark.parametrize(
         "input_data,expected",
         [
-            ("dummy_dataframe", "MemoryDataset(data=<DataFrame>)"),
-            ("dummy_numpy_array", "MemoryDataset(data=<ndarray>)"),
+            (
+                "dummy_dataframe",
+                "MemoryDataset(data=<DataFrame>)",
+            ),
+            (
+                "dummy_numpy_array",
+                "MemoryDataset(data=<ndarray>)",
+            ),
         ],
         indirect=["input_data"],
     )
@@ -150,6 +156,24 @@ def test_str_representation(self, memory_dataset, input_data, expected):
         """Test string representation of the dataset"""
         assert expected in str(memory_dataset)
 
+    @pytest.mark.parametrize(
+        "input_data,expected",
+        [
+            (
+                "dummy_dataframe",
+                "kedro.io.memory_dataset.MemoryDataset(data='<DataFrame>')",
+            ),
+            (
+                "dummy_numpy_array",
+                "kedro.io.memory_dataset.MemoryDataset(data='<ndarray>')",
+            ),
+        ],
+        indirect=["input_data"],
+    )
+    def test_repr_representation(self, memory_dataset, input_data, expected):
+        """Test ``repr()`` output of the dataset"""
+        assert expected in repr(memory_dataset)
+
     def test_exists(self, new_data):
         """Test `exists` method invocation"""
         dataset = MemoryDataset()