def linkcode_resolve(domain, info):
    """Resolve the GitHub URL of the source code for a documented Python object.

    This is the hook looked up by ``sphinx.ext.linkcode`` to build the
    "[source]" link next to each documented object.

    Args:
        domain: Sphinx domain of the documented object. Only ``"py"`` is
            supported; any other value yields ``None``.
        info: Mapping describing the object. The ``"module"`` key holds the
            dotted name of an already-imported module and ``"fullname"`` the
            dotted attribute path of the object inside that module.

    Returns:
        A URL pointing at the object's line range in the ``kedro-plugins``
        repository, or ``None`` when the object cannot be located or its
        source file lives outside the ``kedro_datasets`` package.
    """
    if domain != "py":
        return None

    try:
        # Walk from the module down the dotted attribute path to the object.
        obj = sys.modules[info["module"]]
        for attr in info["fullname"].split("."):
            obj = getattr(obj, attr)
        # Look through decorators so the link targets the real implementation.
        obj = inspect.unwrap(obj)

        filename = inspect.getsourcefile(obj)
        source, lineno = inspect.getsourcelines(obj)
        relpath = os.path.relpath(
            filename, start=os.path.dirname(kedro_datasets.__file__)
        )
    except (KeyError, ImportError, AttributeError, TypeError, OSError, ValueError):
        # Best effort: objects without retrievable source simply get no link.
        return None

    # A path that climbs out of the package (e.g. a member inherited from
    # another distribution) has no counterpart under kedro_datasets/ in the
    # repository, so a link built from it would be dead.
    if relpath.split(os.sep)[0] == os.pardir:
        return None

    # URLs always use forward slashes, whatever the build platform's separator.
    relpath = relpath.replace(os.sep, "/")
    last_line = lineno + len(source) - 1

    # GitHub highlights a line range with the "#L<start>-L<end>" fragment.
    return (
        "https://github.com/kedro-org/kedro-plugins/blob/main/"
        f"kedro-datasets/kedro_datasets/{relpath}#L{lineno}-L{last_line}"
    )
"load_args={'rechunk': True}" in str(ds) assert "load_args={'rechunk': True}" in str(ds_versioned) - @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8") def test_save_and_load(self, versioned_csv_dataset, dummy_dataframe): """Test that saved and reloaded data matches the original one for the versioned dataset.""" @@ -212,7 +209,6 @@ def test_save_and_load(self, versioned_csv_dataset, dummy_dataframe): reloaded_df = versioned_csv_dataset.load() assert_frame_equal(dummy_dataframe, reloaded_df) - @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8") def test_multiple_loads(self, versioned_csv_dataset, dummy_dataframe, filepath_csv): """Test that if a new version is created mid-run, by an external system, it won't be loaded in the current run.""" @@ -236,7 +232,6 @@ def test_multiple_loads(self, versioned_csv_dataset, dummy_dataframe, filepath_c ds_new.resolve_load_version() == v_new ) # new version is discoverable by a new instance - @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8") def test_multiple_saves(self, dummy_dataframe, filepath_csv): """Test multiple cycles of save followed by load for the same dataset""" ds_versioned = CSVDataset(filepath=filepath_csv, version=Version(None, None)) @@ -259,7 +254,6 @@ def test_multiple_saves(self, dummy_dataframe, filepath_csv): ds_new = CSVDataset(filepath=filepath_csv, version=Version(None, None)) assert ds_new.resolve_load_version() == second_load_version - @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8") def test_release_instance_cache(self, dummy_dataframe, filepath_csv): """Test that cache invalidation does not affect other instances""" ds_a = CSVDataset(filepath=filepath_csv, version=Version(None, None)) @@ -288,14 +282,12 @@ def test_no_versions(self, versioned_csv_dataset): with pytest.raises(DatasetError, match=pattern): versioned_csv_dataset.load() - @pytest.mark.xfail(sys.platform == "win32", 
reason="file encoding is not UTF-8") def test_exists(self, versioned_csv_dataset, dummy_dataframe): """Test `exists` method invocation for versioned dataset.""" assert not versioned_csv_dataset.exists() versioned_csv_dataset.save(dummy_dataframe) assert versioned_csv_dataset.exists() - @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8") def test_prevent_overwrite(self, versioned_csv_dataset, dummy_dataframe): """Check the error when attempting to override the dataset if the corresponding CSV file for a given save version already exists.""" @@ -307,7 +299,6 @@ def test_prevent_overwrite(self, versioned_csv_dataset, dummy_dataframe): with pytest.raises(DatasetError, match=pattern): versioned_csv_dataset.save(dummy_dataframe) - @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8") @pytest.mark.parametrize( "load_version", ["2019-01-01T23.59.59.999Z"], indirect=True ) @@ -334,7 +325,6 @@ def test_http_filesystem_no_versioning(self): filepath="https://example.com/file.csv", version=Version(None, None) ) - @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8") def test_versioning_existing_dataset( self, csv_dataset, versioned_csv_dataset, dummy_dataframe ):