From f54f4632b708d52741c9e7e73292ea9a488b41b1 Mon Sep 17 00:00:00 2001 From: ElenaKhaustova <157851531+ElenaKhaustova@users.noreply.github.com> Date: Thu, 18 Jul 2024 14:39:05 +0100 Subject: [PATCH] Pretty printing dataset (#3987) * Implemented basic __repr__ Signed-off-by: Elena Khaustova * Updated __repr__ Signed-off-by: Elena Khaustova * Removed __str__ Signed-off-by: Elena Khaustova * Updated _describe() for CachedDataset Signed-off-by: Elena Khaustova * Made pretty_repr protected Signed-off-by: Elena Khaustova * Reverted width parameter to default Signed-off-by: Elena Khaustova * Updated printing params Signed-off-by: Elena Khaustova * Updated printing width Signed-off-by: Elena Khaustova * Disable sorting Signed-off-by: Elena Khaustova * Disable sorting Signed-off-by: Elena Khaustova * Updated test_str_representation Signed-off-by: Elena Khaustova * Updated cached dataset tests Signed-off-by: Elena Khaustova * Updated data catalog tests Signed-off-by: Elena Khaustova * Updated core tests Signed-off-by: Elena Khaustova * Updated versioned dataset tests Signed-off-by: Elena Khaustova * Updated tests for lambda dataset Signed-off-by: Elena Khaustova * Updated tests for memory dataset Signed-off-by: Elena Khaustova * Set width to maxsize Signed-off-by: Elena Khaustova * Removed top-level keys sorting Signed-off-by: Elena Khaustova * Updated tests Signed-off-by: Elena Khaustova * Updated release notes Signed-off-by: Elena Khaustova * Decoupled describe from pretty printing Signed-off-by: Elena Khaustova * Returned old __str__ to avoid a breaking change Signed-off-by: Elena Khaustova * Updated tests Signed-off-by: Elena Khaustova * Replaced deprecation comment with TODO Signed-off-by: Elena Khaustova --------- Signed-off-by: Elena Khaustova Co-authored-by: Nok Lam Chan --- RELEASE.md | 1 + kedro/io/cached_dataset.py | 10 +++++++--- kedro/io/core.py | 21 +++++++++++++++++++++ tests/io/test_cached_dataset.py | 8 ++++++++ tests/io/test_core.py | 15 +++++++++++++-- 
tests/io/test_lambda_dataset.py | 25 +++++++++++++++++++++++++ tests/io/test_memory_dataset.py | 28 ++++++++++++++++++++++++++-- 7 files changed, 101 insertions(+), 7 deletions(-) diff --git a/RELEASE.md b/RELEASE.md index bab921cd3b..d714f50ddb 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -9,6 +9,7 @@ * Fixed a bug in the `DataCatalog` `shallow_copy()` method to ensure it returns the type of the used catalog and doesn't cast it to `DataCatalog`. * Implemented key completion support for accessing datasets in the `DataCatalog`. * Made [kedro-telemetry](https://github.com/kedro-org/kedro-plugins/tree/main/kedro-telemetry) a core dependency. +* Implemented dataset pretty printing. ## Breaking changes to the API diff --git a/kedro/io/cached_dataset.py b/kedro/io/cached_dataset.py index a284f2aed0..2ca0bac28b 100644 --- a/kedro/io/cached_dataset.py +++ b/kedro/io/cached_dataset.py @@ -93,10 +93,14 @@ def _from_config(config: dict, version: Version | None) -> AbstractDataset: return AbstractDataset.from_config("_cached", config) def _describe(self) -> dict[str, Any]: - return { - "dataset": self._dataset._describe(), - "cache": self._cache._describe(), + return {"dataset": self._dataset._describe(), "cache": self._cache._describe()} + + def __repr__(self) -> str: + object_description = { + "dataset": self._dataset._pretty_repr(self._dataset._describe()), + "cache": self._cache._pretty_repr(self._cache._describe()), } + return self._pretty_repr(object_description) def _load(self) -> Any: data = self._cache.load() if self._cache.exists() else self._dataset.load() diff --git a/kedro/io/core.py b/kedro/io/core.py index ff388a50ed..53c2000e35 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -7,7 +7,9 @@ import copy import logging import os +import pprint import re +import sys import warnings from collections import namedtuple from datetime import datetime, timezone @@ -228,6 +230,7 @@ def save(self, data: _DI) -> None: raise DatasetError(message) from exc def 
__str__(self) -> str: + # TODO: Replace with __repr__ implementation in 0.20.0 release. def _to_str(obj: Any, is_root: bool = False) -> str: """Returns a string representation where 1. The root level (i.e. the Dataset.__init__ arguments) are @@ -254,6 +257,24 @@ def _to_str(obj: Any, is_root: bool = False) -> str: return f"{type(self).__name__}({_to_str(self._describe(), True)})" + def _pretty_repr(self, object_description: dict[str, Any]) -> str: + str_keys = [] + for arg_name, arg_descr in object_description.items(): + if arg_descr is not None: + descr = pprint.pformat( + arg_descr, + sort_dicts=False, + compact=True, + depth=2, + width=sys.maxsize, + ) + str_keys.append(f"{arg_name}={descr}") + + return f"{type(self).__module__}.{type(self).__name__}({', '.join(str_keys)})" + + def __repr__(self) -> str: + return self._pretty_repr(self._describe()) + @abc.abstractmethod def _load(self) -> _DO: raise NotImplementedError( diff --git a/tests/io/test_cached_dataset.py b/tests/io/test_cached_dataset.py index f49820f11d..3305291545 100644 --- a/tests/io/test_cached_dataset.py +++ b/tests/io/test_cached_dataset.py @@ -125,6 +125,14 @@ def test_pickle(self, cached_ds, caplog): _ = pickle.dumps(cached_ds) assert caplog.records[0].message == f"{cached_ds}: clearing cache to pickle." 
+ def test_repr(self): + assert ( + repr(CachedDataset(MemoryDataset(42))) + == """kedro.io.cached_dataset.CachedDataset(""" + """dataset="kedro.io.memory_dataset.MemoryDataset(data='')", """ + """cache='kedro.io.memory_dataset.MemoryDataset()')""" + ) + def test_str(self): assert ( str(CachedDataset(MemoryDataset(42))) == "CachedDataset(cache={}, " diff --git a/tests/io/test_core.py b/tests/io/test_core.py index 175a869f62..c2f07a106c 100644 --- a/tests/io/test_core.py +++ b/tests/io/test_core.py @@ -1,5 +1,6 @@ from __future__ import annotations +import pprint import shutil from decimal import Decimal from fractions import Fraction @@ -206,11 +207,21 @@ def dummy_data(): class TestCoreFunctions: @pytest.mark.parametrize("var", [1, True] + FALSE_BUILTINS) def test_str_representation(self, var): - filepath = "." - assert str(MyDataset(var=var)) == f"MyDataset(filepath={filepath}, var={var})" + var_str = pprint.pformat(var) + filepath_str = pprint.pformat(PurePosixPath(".")) + assert str(MyDataset(var=var)) == f"MyDataset(filepath=., var={var})" + assert ( + repr(MyDataset(var=var)) + == f"tests.io.test_core.MyDataset(filepath={filepath_str}, var={var_str})" + ) def test_str_representation_none(self): assert str(MyDataset()) == "MyDataset(filepath=.)" + filepath_str = pprint.pformat(PurePosixPath(".")) + assert ( + repr(MyDataset()) + == f"tests.io.test_core.MyDataset(filepath={filepath_str})" + ) def test_get_filepath_str(self): path = get_filepath_str(PurePosixPath("example.com/test.csv"), "http") diff --git a/tests/io/test_lambda_dataset.py b/tests/io/test_lambda_dataset.py index cc2bd07cc6..a3072af451 100644 --- a/tests/io/test_lambda_dataset.py +++ b/tests/io/test_lambda_dataset.py @@ -31,16 +31,32 @@ def _dummy_release(): assert "LambdaDataset(load=)" in str( LambdaDataset(_dummy_load, None) ) + assert ( + "kedro.io.lambda_dataset.LambdaDataset(load='')" + in repr(LambdaDataset(_dummy_load, None)) + ) assert "LambdaDataset(save=)" in str( LambdaDataset(None, 
_dummy_save) ) + assert ( + "kedro.io.lambda_dataset.LambdaDataset(save='')" + in repr(LambdaDataset(None, _dummy_save)) + ) assert "LambdaDataset(exists=)" in str( LambdaDataset(None, None, _dummy_exists) ) + assert ( + "kedro.io.lambda_dataset.LambdaDataset(exists='')" + in repr(LambdaDataset(None, None, _dummy_exists)) + ) assert ( "LambdaDataset(release=)" in str(LambdaDataset(None, None, None, _dummy_release)) ) + assert ( + "kedro.io.lambda_dataset.LambdaDataset(release='')" + in repr(LambdaDataset(None, None, None, _dummy_release)) + ) # __init__ keys alphabetically sorted, None values not shown expected = ( @@ -51,6 +67,15 @@ def _dummy_release(): actual = str(LambdaDataset(_dummy_load, _dummy_save, _dummy_exists, None)) assert actual == expected + # __init__ keys remain in the provided order, None values not shown + expected = ( + "kedro.io.lambda_dataset.LambdaDataset(load='', " + "save='', " + "exists='')" + ) + actual = repr(LambdaDataset(_dummy_load, _dummy_save, _dummy_exists, None)) + assert actual == expected + def test_ephemeral_attribute(mocked_dataset): assert mocked_dataset._EPHEMERAL is False diff --git a/tests/io/test_memory_dataset.py b/tests/io/test_memory_dataset.py index e2b0fd2d83..c2dbe56925 100644 --- a/tests/io/test_memory_dataset.py +++ b/tests/io/test_memory_dataset.py @@ -141,8 +141,14 @@ def test_saving_none(self): @pytest.mark.parametrize( "input_data,expected", [ - ("dummy_dataframe", "MemoryDataset(data=)"), - ("dummy_numpy_array", "MemoryDataset(data=)"), + ( + "dummy_dataframe", + "MemoryDataset(data=)", + ), + ( + "dummy_numpy_array", + "MemoryDataset(data=)", + ), ], indirect=["input_data"], ) @@ -150,6 +156,24 @@ def test_str_representation(self, memory_dataset, input_data, expected): """Test string representation of the dataset""" assert expected in str(memory_dataset) + @pytest.mark.parametrize( + "input_data,expected", + [ + ( + "dummy_dataframe", + "kedro.io.memory_dataset.MemoryDataset(data='')", + ), + ( + 
"dummy_numpy_array", + "kedro.io.memory_dataset.MemoryDataset(data='')", + ), + ], + indirect=["input_data"], + ) + def test_repr_representation(self, memory_dataset, input_data, expected): + """Test repr representation of the dataset""" + assert expected in repr(memory_dataset) + def test_exists(self, new_data): """Test `exists` method invocation""" dataset = MemoryDataset()