diff --git a/benchmarks/table/column_operations_polars.py b/benchmarks/table/column_operations_polars.py new file mode 100644 index 000000000..c452edca5 --- /dev/null +++ b/benchmarks/table/column_operations_polars.py @@ -0,0 +1,50 @@ +from timeit import timeit + +from safeds.data.tabular.containers import ExperimentalTable + +from benchmarks.table.utils import create_synthetic_table_polars + +REPETITIONS = 10 + + +def _run_remove_columns_with_missing_values() -> None: + table.remove_columns_with_missing_values()._lazy_frame.collect() + + +def _run_remove_non_numeric_columns() -> None: + table.remove_non_numeric_columns()._lazy_frame.collect() + + +def _run_summarize_statistics() -> None: + table.summarize_statistics()._lazy_frame.collect() + + +if __name__ == "__main__": + # Create a synthetic Table + table = create_synthetic_table_polars(100, 5000) + + # Run the benchmarks + timings: dict[str, float] = { + "remove_columns_with_missing_values": timeit( + _run_remove_columns_with_missing_values, + number=REPETITIONS, + ), + "remove_non_numeric_columns": timeit( + _run_remove_non_numeric_columns, + number=REPETITIONS, + ), + "summarize_statistics": timeit( + _run_summarize_statistics, + number=REPETITIONS, + ), + } + + # Print the timings + print( + ExperimentalTable( + { + "method": list(timings.keys()), + "timing": list(timings.values()), + } + ) + ) diff --git a/benchmarks/table/row_operations_polars.py b/benchmarks/table/row_operations_polars.py index 9fc014cb8..403bfb80d 100644 --- a/benchmarks/table/row_operations_polars.py +++ b/benchmarks/table/row_operations_polars.py @@ -1,6 +1,8 @@ from timeit import timeit -from safeds.data.tabular.containers import Table +import polars as pl + +from safeds.data.tabular.containers import ExperimentalTable from benchmarks.table.utils import create_synthetic_table_polars @@ -15,14 +17,18 @@ def _run_remove_rows_with_missing_values() -> None: table.remove_rows_with_missing_values()._lazy_frame.collect() -# def 
_run_remove_rows_with_outliers() -> None: -# table.remove_rows_with_outliers() +def _run_remove_rows_with_outliers() -> None: + table.remove_rows_with_outliers() def _run_remove_rows() -> None: table.remove_rows(lambda row: row.get_value("column_0") % 2 == 0)._lazy_frame.collect() +def _run_remove_rows_by_column() -> None: + table.remove_rows_by_column("column_0", lambda cell: cell % 2 == 0)._lazy_frame.collect() + + def _run_shuffle_rows() -> None: table.shuffle_rows()._lazy_frame.collect() @@ -63,14 +69,18 @@ def _run_transform_column() -> None: _run_remove_rows_with_missing_values, number=REPETITIONS, ), - # "remove_rows_with_outliers": timeit( - # _run_remove_rows_with_outliers, - # number=REPETITIONS, - # ), + "remove_rows_with_outliers": timeit( + _run_remove_rows_with_outliers, + number=REPETITIONS, + ), "remove_rows": timeit( _run_remove_rows, number=REPETITIONS, ), + "remove_rows_by_column": timeit( + _run_remove_rows_by_column, + number=REPETITIONS, + ), "shuffle_rows": timeit( _run_shuffle_rows, number=REPETITIONS, @@ -98,11 +108,14 @@ def _run_transform_column() -> None: } # Print the timings - print( - Table( - { - "method": list(timings.keys()), - "timing": list(timings.values()), - } + with pl.Config( + tbl_rows=-1, + ): + print( + ExperimentalTable( + { + "method": list(timings.keys()), + "timing": list(timings.values()), + } + ) ) - ) diff --git a/benchmarks/table/utils/create_synthetic_table.py b/benchmarks/table/utils/create_synthetic_table.py index d1ad47d6e..9c201a098 100644 --- a/benchmarks/table/utils/create_synthetic_table.py +++ b/benchmarks/table/utils/create_synthetic_table.py @@ -10,7 +10,8 @@ def create_synthetic_table( min_value: int = 0, max_value: int = 1000, ) -> Table: - """Create a synthetic Table with random numerical data. + """ + Create a synthetic Table with random numerical data. 
Parameters ---------- diff --git a/benchmarks/table/utils/create_synthetic_table_polars.py b/benchmarks/table/utils/create_synthetic_table_polars.py index d1425c851..34a354b13 100644 --- a/benchmarks/table/utils/create_synthetic_table_polars.py +++ b/benchmarks/table/utils/create_synthetic_table_polars.py @@ -10,7 +10,8 @@ def create_synthetic_table_polars( min_value: int = 0, max_value: int = 1000, ) -> ExperimentalTable: - """Create a synthetic Table with random numerical data. + """ + Create a synthetic Table with random numerical data. Parameters ---------- diff --git a/poetry.lock b/poetry.lock index e20acf8c6..86d2ff99c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2268,17 +2268,17 @@ testing = ["pytest", "pytest-benchmark"] [[package]] name = "polars" -version = "0.20.24" +version = "0.20.25" description = "Blazingly fast DataFrame library" optional = false python-versions = ">=3.8" files = [ - {file = "polars-0.20.24-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:a2c7282e0c81f038c9800ec4e1d97fe53dcacbba9632baf31a633e8bf12caab3"}, - {file = "polars-0.20.24-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:02587e12435e583693351c4757cf571b90165ceb53b031e891aadf2c816cc59d"}, - {file = "polars-0.20.24-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dfbb129941dd0cfa05f0fb5ef1cde341fed336b4dfcb81c3bef6f3f6b899cb17"}, - {file = "polars-0.20.24-cp38-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:9921df98cee040903d35aef2c7237182240451e1ad413116a82e1e166d8fe943"}, - {file = "polars-0.20.24-cp38-abi3-win_amd64.whl", hash = "sha256:dc0fb1169d3d0b286793421a6919c6a9f06235b9f93c1e00f01f199e038d3681"}, - {file = "polars-0.20.24.tar.gz", hash = "sha256:a0c11f3b5e756bab7ba164ed73104c96fa9c861efce157fe8991b3eafeb4b0b8"}, + {file = "polars-0.20.25-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:126e3b7d9394e4b23b4cc48919b7188203feeeb35d861ad808f281eaa06d76e2"}, + {file = "polars-0.20.25-cp38-abi3-macosx_11_0_arm64.whl", hash = 
"sha256:3bda62b681726538714a1159638ab7c9eeca6b8633fd778d84810c3e13b9c7e3"}, + {file = "polars-0.20.25-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:62c8826e81c759f07bf5c0ae00f57a537644ae05fe68737185666b8ad8430664"}, + {file = "polars-0.20.25-cp38-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:0fb5e7a4a9831fba742f1c706e01656607089b6362a5e6f8d579b134a99795ce"}, + {file = "polars-0.20.25-cp38-abi3-win_amd64.whl", hash = "sha256:9eaeb9080c853e11b207d191025e0ba8fd59ea06a36c22d410a48f2f124e18cd"}, + {file = "polars-0.20.25.tar.gz", hash = "sha256:4308d63f956874bac9ae040bdd6d62b2992d0b1e1349301bc7a3b59458189108"}, ] [package.dependencies] @@ -3789,4 +3789,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = "^3.11,<3.13" -content-hash = "e8918188c27818e6491dbdfdbf92304c2d3bce38f3f1a6e01ebeebd09418d809" +content-hash = "84af08810fc4597a0076fb879faf198ec8935a359927460982b02da7bc4a71c9" diff --git a/pyproject.toml b/pyproject.toml index ad6eb4148..706aee510 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ matplotlib = "^3.6.3" openpyxl = "^3.1.2" pandas = "^2.0.0" pillow = ">=9.5,<11.0" -polars = {extras = ["numpy", "pyarrow"], version = "^0.20.24"} +polars = {extras = ["numpy", "pyarrow"], version = "^0.20.25"} scikit-learn = "^1.2.0" seaborn = "^0.13.0" statsmodels = "^0.14.1" diff --git a/src/resources/from_json_file_2.json b/src/resources/from_json_file_2.json new file mode 100644 index 000000000..5cd814134 --- /dev/null +++ b/src/resources/from_json_file_2.json @@ -0,0 +1,6 @@ +{ + "columns": [ + { "name": "a", "datatype": "Int64", "bit_settings": "", "values": [1, 2, 3] }, + { "name": "b", "datatype": "Int64", "bit_settings": "", "values": [4, 5, 6] } + ] +} diff --git a/src/resources/from_parquet_file.parquet b/src/resources/from_parquet_file.parquet new file mode 100644 index 000000000..75234fd67 Binary files /dev/null and b/src/resources/from_parquet_file.parquet differ diff --git 
a/src/resources/to_excel_file.xlsx b/src/resources/to_excel_file.xlsx index d28e5f48f..42a6af394 100644 Binary files a/src/resources/to_excel_file.xlsx and b/src/resources/to_excel_file.xlsx differ diff --git a/src/resources/to_json_file_2.json b/src/resources/to_json_file_2.json new file mode 100644 index 000000000..5cd814134 --- /dev/null +++ b/src/resources/to_json_file_2.json @@ -0,0 +1,6 @@ +{ + "columns": [ + { "name": "a", "datatype": "Int64", "bit_settings": "", "values": [1, 2, 3] }, + { "name": "b", "datatype": "Int64", "bit_settings": "", "values": [4, 5, 6] } + ] +} diff --git a/src/resources/to_parquet_file.parquet b/src/resources/to_parquet_file.parquet new file mode 100644 index 000000000..75234fd67 Binary files /dev/null and b/src/resources/to_parquet_file.parquet differ diff --git a/src/safeds/_config/__init__.py b/src/safeds/_config/__init__.py index bd7b5188a..a4db1a36f 100644 --- a/src/safeds/_config/__init__.py +++ b/src/safeds/_config/__init__.py @@ -5,17 +5,19 @@ import apipkg if TYPE_CHECKING: - from ._device import _get_device, _init_default_device + from ._torch import _get_device, _init_default_device, _set_default_device apipkg.initpkg( __name__, { - "_get_device": "._device:_get_device", - "_init_default_device": "._device:_init_default_device", + "_get_device": "._torch:_get_device", + "_init_default_device": "._torch:_init_default_device", + "_set_default_device": "._torch:_set_default_device", }, ) __all__ = [ "_get_device", "_init_default_device", + "_set_default_device", ] diff --git a/src/safeds/_config/_polars.py b/src/safeds/_config/_polars.py new file mode 100644 index 000000000..02b595994 --- /dev/null +++ b/src/safeds/_config/_polars.py @@ -0,0 +1,17 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import polars as pl + + +def _get_polars_config() -> pl.Config: + import polars as pl + + return pl.Config( + float_precision=5, + tbl_cell_numeric_alignment="RIGHT", + 
tbl_formatting="ASCII_FULL_CONDENSED", + tbl_hide_dataframe_shape=True, + ) diff --git a/src/safeds/_config/_device.py b/src/safeds/_config/_torch.py similarity index 88% rename from src/safeds/_config/_device.py rename to src/safeds/_config/_torch.py index 3fc1db282..63b79ea80 100644 --- a/src/safeds/_config/_device.py +++ b/src/safeds/_config/_torch.py @@ -18,7 +18,7 @@ def _get_device() -> Device: def _init_default_device() -> None: import torch - global _default_device + global _default_device # noqa: PLW0603 if _default_device is None: _default_device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") @@ -28,7 +28,7 @@ def _init_default_device() -> None: def _set_default_device(device: Device) -> None: # This changes all future tensors, but not any tensor that already exists - global _default_device + global _default_device # noqa: PLW0603 _default_device = device _init_default_device() diff --git a/src/safeds/_utils/__init__.py b/src/safeds/_utils/__init__.py index 3d445c97c..f50d26ed5 100644 --- a/src/safeds/_utils/__init__.py +++ b/src/safeds/_utils/__init__.py @@ -7,16 +7,22 @@ if TYPE_CHECKING: from ._file_io import _check_and_normalize_file_path from ._hashing import _structural_hash + from ._plotting import _figure_to_image + from ._random import _get_random_seed apipkg.initpkg( __name__, { "_check_and_normalize_file_path": "._file_io:_check_and_normalize_file_path", "_structural_hash": "._hashing:_structural_hash", + "_figure_to_image": "._plotting:_figure_to_image", + "_get_random_seed": "._random:_get_random_seed", }, ) __all__ = [ "_check_and_normalize_file_path", "_structural_hash", + "_figure_to_image", + "_get_random_seed", ] diff --git a/src/safeds/_utils/_plotting.py b/src/safeds/_utils/_plotting.py new file mode 100644 index 000000000..6b80c74f0 --- /dev/null +++ b/src/safeds/_utils/_plotting.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +import io +from typing import TYPE_CHECKING + +from 
safeds.data.image.containers import Image + +if TYPE_CHECKING: + import matplotlib.pyplot as plt + + +def _figure_to_image(figure: plt.Figure) -> Image: + """ + Store the figure as an image and closes it. + + Parameters + ---------- + figure: + The figure to store. + + Returns + ------- + image: + The figure as an image. + """ + import matplotlib.pyplot as plt + + buffer = io.BytesIO() + figure.savefig(buffer, format="png") + plt.close(figure) # Prevents the figure from being displayed directly + buffer.seek(0) + return Image.from_bytes(buffer.read()) diff --git a/src/safeds/data/labeled/containers/__init__.py b/src/safeds/data/labeled/containers/__init__.py index e6237ec24..8eed70294 100644 --- a/src/safeds/data/labeled/containers/__init__.py +++ b/src/safeds/data/labeled/containers/__init__.py @@ -5,6 +5,7 @@ import apipkg if TYPE_CHECKING: + from ._experimental_tabular_dataset import ExperimentalTabularDataset from ._image_dataset import ImageDataset from ._tabular_dataset import TabularDataset from ._time_series_dataset import TimeSeriesDataset @@ -12,6 +13,7 @@ apipkg.initpkg( __name__, { + "ExperimentalTabularDataset": "._experimental_tabular_dataset:ExperimentalTabularDataset", "ImageDataset": "._image_dataset:ImageDataset", "TabularDataset": "._tabular_dataset:TabularDataset", "TimeSeriesDataset": "._time_series_dataset:TimeSeriesDataset", @@ -19,6 +21,7 @@ ) __all__ = [ + "ExperimentalTabularDataset", "ImageDataset", "TabularDataset", "TimeSeriesDataset", diff --git a/src/safeds/data/labeled/containers/_experimental_tabular_dataset.py b/src/safeds/data/labeled/containers/_experimental_tabular_dataset.py new file mode 100644 index 000000000..b6711045a --- /dev/null +++ b/src/safeds/data/labeled/containers/_experimental_tabular_dataset.py @@ -0,0 +1,155 @@ +from __future__ import annotations + +import sys +from typing import TYPE_CHECKING + +from safeds._utils import _structural_hash + +if TYPE_CHECKING: + from safeds.data.tabular.containers import 
ExperimentalColumn, ExperimentalTable + + +class ExperimentalTabularDataset: + """ + A dataset containing tabular data. It can be used to train machine learning models. + + Columns in a tabular dataset are divided into three categories: + + * The target column is the column that a model should predict. + * Feature columns are columns that a model should use to make predictions. + * Extra columns are columns that are neither feature nor target. They can be used to provide additional context, + like an ID column. + + Feature columns are implicitly defined as all columns except the target and extra columns. If no extra columns + are specified, all columns except the target column are used as features. + + Parameters + ---------- + data: + The data. + target_name: + Name of the target column. + extra_names: + Names of the columns that are neither features nor target. If None, no extra columns are used, i.e. all but + the target column are used as features. + + Raises + ------ + KeyError + If a column name is not found in the data. + ValueError + If the target column is also an extra column. + ValueError + If no feature columns remains. + + Examples + -------- + >>> from safeds.data.labeled.containers import TabularDataset + >>> dataset = TabularDataset( + ... {"id": [1, 2, 3], "feature": [4, 5, 6], "target": [1, 2, 3]}, + ... target_name="target", + ... extra_names=["id"] + ... 
) + """ + + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + def __init__( + self, + data: ExperimentalTable, + target_name: str, + extra_names: list[str] | None = None, + ): + # Preprocess inputs + if extra_names is None: + extra_names = [] + + # Derive feature names + non_feature_names = {target_name, *extra_names} # perf: Comprehensions evaluate their condition every iteration + feature_names = [name for name in data.column_names if name not in non_feature_names] + + # Validate inputs + if target_name in extra_names: + raise ValueError(f"Column '{target_name}' cannot be both target and extra.") + if len(feature_names) == 0: + raise ValueError("At least one feature column must remain.") + + # Set attributes + self._table: ExperimentalTable = data + self._features: ExperimentalTable = data.remove_columns_except(feature_names) + self._target: ExperimentalColumn = data.get_column(target_name) + self._extras: ExperimentalTable = data.remove_columns_except(extra_names) + + def __eq__(self, other: object) -> bool: + if not isinstance(other, ExperimentalTabularDataset): + return NotImplemented + if self is other: + return True + return self.target == other.target and self.features == other.features and self._extras == other._extras + + def __hash__(self) -> int: + return _structural_hash(self.target, self.features, self._extras) + + def __repr__(self) -> str: + return self._table.__repr__() + + def __sizeof__(self) -> int: + return sys.getsizeof(self._target) + sys.getsizeof(self._features) + sys.getsizeof(self._extras) + + def __str__(self) -> str: + return self._table.__str__() + + # ------------------------------------------------------------------------------------------------------------------ + # Properties + # 
------------------------------------------------------------------------------------------------------------------ + + @property + def features(self) -> ExperimentalTable: + """The feature columns of the tabular dataset.""" + return self._features + + @property + def target(self) -> ExperimentalColumn: + """The target column of the tabular dataset.""" + return self._target + + @property + def extras(self) -> ExperimentalTable: + """ + Additional columns of the tabular dataset that are neither features nor target. + + These can be used to store additional information about instances, such as IDs. + """ + return self._extras + + # ------------------------------------------------------------------------------------------------------------------ + # Conversion + # ------------------------------------------------------------------------------------------------------------------ + + def to_table(self) -> ExperimentalTable: + """ + Return a table containing all columns of the tabular dataset. + + Returns + ------- + table: + A table containing all columns of the tabular dataset. + """ + return self._table + + # ------------------------------------------------------------------------------------------------------------------ + # IPython integration + # ------------------------------------------------------------------------------------------------------------------ + + def _repr_html_(self) -> str: + """ + Return a compact HTML representation of the tabular dataset for IPython. + + Returns + ------- + html: + The generated HTML. 
+ """ + return self._table._repr_html_() diff --git a/src/safeds/data/labeled/containers/_tabular_dataset.py b/src/safeds/data/labeled/containers/_tabular_dataset.py index 77dfc8c54..938e62469 100644 --- a/src/safeds/data/labeled/containers/_tabular_dataset.py +++ b/src/safeds/data/labeled/containers/_tabular_dataset.py @@ -3,7 +3,7 @@ import sys from typing import TYPE_CHECKING -from safeds._config import _init_default_device, _get_device +from safeds._config import _get_device, _init_default_device from safeds._utils import _structural_hash from safeds.data.tabular.containers import Column, Table @@ -196,7 +196,8 @@ def _into_dataloader_with_classes(self, batch_size: int, num_of_classes: int) -> dataset=_create_dataset( torch.Tensor(self.features._data.values).to(_get_device()), torch.nn.functional.one_hot( - torch.LongTensor(self.target._data).to(_get_device()), num_classes=num_of_classes + torch.LongTensor(self.target._data).to(_get_device()), + num_classes=num_of_classes, ), ), batch_size=batch_size, diff --git a/src/safeds/data/labeled/containers/_time_series_dataset.py b/src/safeds/data/labeled/containers/_time_series_dataset.py index 529f789d4..1603d21f6 100644 --- a/src/safeds/data/labeled/containers/_time_series_dataset.py +++ b/src/safeds/data/labeled/containers/_time_series_dataset.py @@ -6,7 +6,7 @@ from safeds._config import _init_default_device from safeds._utils import _structural_hash from safeds.data.tabular.containers import Column, Table -from safeds.exceptions import OutOfBoundsError, ClosedBound +from safeds.exceptions import ClosedBound, OutOfBoundsError if TYPE_CHECKING: from collections.abc import Mapping, Sequence diff --git a/src/safeds/data/tabular/containers/_experimental_cell.py b/src/safeds/data/tabular/containers/_experimental_cell.py index c5fe6ce57..7c77046e5 100644 --- a/src/safeds/data/tabular/containers/_experimental_cell.py +++ b/src/safeds/data/tabular/containers/_experimental_cell.py @@ -1,7 +1,10 @@ from __future__ import 
annotations from abc import ABC, abstractmethod -from typing import Any, Generic, TypeVar +from typing import TYPE_CHECKING, Any, Generic, TypeVar + +if TYPE_CHECKING: + import polars as pl T = TypeVar("T") P = TypeVar("P") @@ -9,7 +12,11 @@ class ExperimentalCell(ABC, Generic[T]): - """A cell is a single value in a table.""" + """ + A single value in a table. + + This class cannot be instantiated directly. It is only used for arguments of callbacks. + """ # ------------------------------------------------------------------------------------------------------------------ # Dunder methods @@ -65,6 +72,12 @@ def __ne__(self, other: object) -> ExperimentalCell[bool]: # type: ignore[overr @abstractmethod def __abs__(self) -> ExperimentalCell[R]: ... + @abstractmethod + def __ceil__(self) -> ExperimentalCell[R]: ... + + @abstractmethod + def __floor__(self) -> ExperimentalCell[R]: ... + @abstractmethod def __neg__(self) -> ExperimentalCell[R]: ... @@ -121,10 +134,557 @@ def __hash__(self) -> int: ... @abstractmethod def __sizeof__(self) -> int: ... + # ------------------------------------------------------------------------------------------------------------------ + # Boolean operations + # ------------------------------------------------------------------------------------------------------------------ + + def not_(self) -> ExperimentalCell[bool]: + """ + Negate a boolean. This is equivalent to the `~` operator. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [True, False]) + >>> column.transform(lambda cell: cell.not_()) + +---------+ + | example | + | --- | + | bool | + +=========+ + | false | + | true | + +---------+ + + >>> column.transform(lambda cell: ~cell) + +---------+ + | example | + | --- | + | bool | + +=========+ + | false | + | true | + +---------+ + """ + return self.__invert__() + + def and_(self, other: bool | ExperimentalCell[bool]) -> ExperimentalCell[bool]: + """ + Perform a boolean AND operation. This is equivalent to the `&` operator. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [True, False]) + >>> column.transform(lambda cell: cell.and_(False)) + +---------+ + | example | + | --- | + | bool | + +=========+ + | false | + | false | + +---------+ + + >>> column.transform(lambda cell: cell & False) + +---------+ + | example | + | --- | + | bool | + +=========+ + | false | + | false | + +---------+ + """ + return self.__and__(other) + + def or_(self, other: bool | ExperimentalCell[bool]) -> ExperimentalCell[bool]: + """ + Perform a boolean OR operation. This is equivalent to the `|` operator. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [True, False]) + >>> column.transform(lambda cell: cell.or_(True)) + +---------+ + | example | + | --- | + | bool | + +=========+ + | true | + | true | + +---------+ + + >>> column.transform(lambda cell: cell | True) + +---------+ + | example | + | --- | + | bool | + +=========+ + | true | + | true | + +---------+ + """ + return self.__or__(other) + + def xor(self, other: bool | ExperimentalCell[bool]) -> ExperimentalCell[bool]: + """ + Perform a boolean XOR operation. This is equivalent to the `^` operator. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [True, False]) + >>> column.transform(lambda cell: cell.xor(True)) + +---------+ + | example | + | --- | + | bool | + +=========+ + | false | + | true | + +---------+ + + >>> column.transform(lambda cell: cell ^ True) + +---------+ + | example | + | --- | + | bool | + +=========+ + | false | + | true | + +---------+ + """ + return self.__xor__(other) + + # ------------------------------------------------------------------------------------------------------------------ + # Numeric operations + # ------------------------------------------------------------------------------------------------------------------ + + def abs(self) -> ExperimentalCell[R]: + """ + Get the absolute value. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [1, -2]) + >>> column.transform(lambda cell: cell.abs()) + +---------+ + | example | + | --- | + | i64 | + +=========+ + | 1 | + | 2 | + +---------+ + """ + return self.__abs__() + + def ceil(self) -> ExperimentalCell[R]: + """ + Round up to the nearest integer. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [1.1, 2.9]) + >>> column.transform(lambda cell: cell.ceil()) + +---------+ + | example | + | --- | + | f64 | + +=========+ + | 2.00000 | + | 3.00000 | + +---------+ + """ + return self.__ceil__() + + def floor(self) -> ExperimentalCell[R]: + """ + Round down to the nearest integer. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [1.1, 2.9]) + >>> column.transform(lambda cell: cell.floor()) + +---------+ + | example | + | --- | + | f64 | + +=========+ + | 1.00000 | + | 2.00000 | + +---------+ + """ + return self.__floor__() + + def neg(self) -> ExperimentalCell[R]: + """ + Negate the value. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [1, -2]) + >>> column.transform(lambda cell: cell.neg()) + +---------+ + | example | + | --- | + | i64 | + +=========+ + | -1 | + | 2 | + +---------+ + """ + return self.__neg__() + + def add(self, other: Any) -> ExperimentalCell[R]: + """ + Add a value. This is equivalent to the `+` operator. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [1, 2]) + >>> column.transform(lambda cell: cell.add(3)) + +---------+ + | example | + | --- | + | i64 | + +=========+ + | 4 | + | 5 | + +---------+ + + >>> column.transform(lambda cell: cell + 3) + +---------+ + | example | + | --- | + | i64 | + +=========+ + | 4 | + | 5 | + +---------+ + """ + return self.__add__(other) + + def mod(self, other: Any) -> ExperimentalCell[R]: + """ + Perform a modulo operation. This is equivalent to the `%` operator. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [5, 6]) + >>> column.transform(lambda cell: cell.mod(3)) + +---------+ + | example | + | --- | + | i64 | + +=========+ + | 2 | + | 0 | + +---------+ + + >>> column.transform(lambda cell: cell % 3) + +---------+ + | example | + | --- | + | i64 | + +=========+ + | 2 | + | 0 | + +---------+ + """ + return self.__mod__(other) + + def mul(self, other: Any) -> ExperimentalCell[R]: + """ + Multiply by a value. 
This is equivalent to the `*` operator. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [2, 3]) + >>> column.transform(lambda cell: cell.mul(4)) + +---------+ + | example | + | --- | + | i64 | + +=========+ + | 8 | + | 12 | + +---------+ + + >>> column.transform(lambda cell: cell * 4) + +---------+ + | example | + | --- | + | i64 | + +=========+ + | 8 | + | 12 | + +---------+ + """ + return self.__mul__(other) + + def pow(self, other: float | ExperimentalCell[P]) -> ExperimentalCell[R]: + """ + Raise to a power. This is equivalent to the `**` operator. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [2, 3]) + >>> column.transform(lambda cell: cell.pow(3)) + +---------+ + | example | + | --- | + | i64 | + +=========+ + | 8 | + | 27 | + +---------+ + + >>> column.transform(lambda cell: cell ** 3) + +---------+ + | example | + | --- | + | i64 | + +=========+ + | 8 | + | 27 | + +---------+ + """ + return self.__pow__(other) + + def sub(self, other: Any) -> ExperimentalCell[R]: + """ + Subtract a value. This is equivalent to the `-` operator. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [5, 6]) + >>> column.transform(lambda cell: cell.sub(3)) + +---------+ + | example | + | --- | + | i64 | + +=========+ + | 2 | + | 3 | + +---------+ + + >>> column.transform(lambda cell: cell - 3) + +---------+ + | example | + | --- | + | i64 | + +=========+ + | 2 | + | 3 | + +---------+ + """ + return self.__sub__(other) + + def div(self, other: Any) -> ExperimentalCell[R]: + """ + Divide by a value. This is equivalent to the `/` operator. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [6, 8]) + >>> column.transform(lambda cell: cell.div(2)) + +---------+ + | example | + | --- | + | f64 | + +=========+ + | 3.00000 | + | 4.00000 | + +---------+ + + >>> column.transform(lambda cell: cell / 2) + +---------+ + | example | + | --- | + | f64 | + +=========+ + | 3.00000 | + | 4.00000 | + +---------+ + """ + return self.__truediv__(other) + + # ------------------------------------------------------------------------------------------------------------------ + # Comparison operations + # ------------------------------------------------------------------------------------------------------------------ + + def eq(self, other: Any) -> ExperimentalCell[bool]: + """ + Check if equal to a value. This is equivalent to the `==` operator. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [1, 2]) + >>> column.transform(lambda cell: cell.eq(2)) + +---------+ + | example | + | --- | + | bool | + +=========+ + | false | + | true | + +---------+ + + >>> column.transform(lambda cell: cell == 2) + +---------+ + | example | + | --- | + | bool | + +=========+ + | false | + | true | + +---------+ + """ + return self.__eq__(other) + + def ge(self, other: Any) -> ExperimentalCell[bool]: + """ + Check if greater than or equal to a value. This is equivalent to the `>=` operator. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [1, 2]) + >>> column.transform(lambda cell: cell.ge(2)) + +---------+ + | example | + | --- | + | bool | + +=========+ + | false | + | true | + +---------+ + + >>> column.transform(lambda cell: cell >= 2) + +---------+ + | example | + | --- | + | bool | + +=========+ + | false | + | true | + +---------+ + """ + return self.__ge__(other) + + def gt(self, other: Any) -> ExperimentalCell[bool]: + """ + Check if greater than a value. This is equivalent to the `>` operator. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [1, 2]) + >>> column.transform(lambda cell: cell.gt(2)) + +---------+ + | example | + | --- | + | bool | + +=========+ + | false | + | false | + +---------+ + + >>> column.transform(lambda cell: cell > 2) + +---------+ + | example | + | --- | + | bool | + +=========+ + | false | + | false | + +---------+ + """ + return self.__gt__(other) + + def le(self, other: Any) -> ExperimentalCell[bool]: + """ + Check if less than or equal to a value. This is equivalent to the `<=` operator. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [1, 2]) + >>> column.transform(lambda cell: cell.le(2)) + +---------+ + | example | + | --- | + | bool | + +=========+ + | true | + | true | + +---------+ + + >>> column.transform(lambda cell: cell <= 2) + +---------+ + | example | + | --- | + | bool | + +=========+ + | true | + | true | + +---------+ + """ + return self.__le__(other) + + def lt(self, other: Any) -> ExperimentalCell[bool]: + """ + Check if less than a value. This is equivalent to the `<` operator. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [1, 2]) + >>> column.transform(lambda cell: cell.lt(2)) + +---------+ + | example | + | --- | + | bool | + +=========+ + | true | + | false | + +---------+ + + >>> column.transform(lambda cell: cell < 2) + +---------+ + | example | + | --- | + | bool | + +=========+ + | true | + | false | + +---------+ + """ + return self.__lt__(other) + # ------------------------------------------------------------------------------------------------------------------ # Internal # ------------------------------------------------------------------------------------------------------------------ + @property + @abstractmethod + def _polars_expression(self) -> pl.Expr | pl.Series: + """The Polars expression that corresponds to this cell.""" + @abstractmethod def _equals(self, other: object) -> bool: """ diff --git a/src/safeds/data/tabular/containers/_experimental_column.py b/src/safeds/data/tabular/containers/_experimental_column.py index 3f2533880..65a85ac20 100644 --- a/src/safeds/data/tabular/containers/_experimental_column.py +++ b/src/safeds/data/tabular/containers/_experimental_column.py @@ -26,7 +26,7 @@ class ExperimentalColumn(Sequence[T]): """ - A column is a named, one-dimensional collection of homogeneous values. + A named, one-dimensional collection of homogeneous values. 
Parameters ---------- @@ -39,7 +39,15 @@ class ExperimentalColumn(Sequence[T]): -------- >>> from safeds.data.tabular.containers import ExperimentalColumn >>> ExperimentalColumn("test", [1, 2, 3]) - Column('test', [1, 2, 3]) + +------+ + | test | + | --- | + | i64 | + +======+ + | 1 | + | 2 | + | 3 | + +------+ """ # ------------------------------------------------------------------------------------------------------------------ @@ -97,20 +105,13 @@ def __len__(self) -> int: return self.number_of_rows def __repr__(self) -> str: - import polars as pl - - if self.number_of_rows <= 50: - data = self._series.to_list() - else: - data = f"[{', '.join(self._series.slice(0, 50).cast(pl.String).to_list())}, ...]" - - return f"Column({self.name!r}, {data})" + return self.to_table().__repr__() def __sizeof__(self) -> int: return self._series.estimated_size() def __str__(self) -> str: - return self.__repr__() + return self.to_table().__str__() # ------------------------------------------------------------------------------------------------------------------ # Properties @@ -150,6 +151,24 @@ def type(self) -> ExperimentalDataType: # Value operations # ------------------------------------------------------------------------------------------------------------------ + def get_distinct_values(self) -> list[T]: + """ + Return the distinct values in the column. + + Returns + ------- + distinct_values: + The distinct values in the column. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("test", [1, 2, 3, 2]) + >>> column.get_distinct_values() + [1, 2, 3] + """ + return self._series.unique().sort().to_list() + def get_value(self, index: int) -> T: """ Return the column value at specified index. Indexing starts at 0. 
@@ -358,7 +377,15 @@ def rename(self, new_name: str) -> ExperimentalColumn[T]: >>> from safeds.data.tabular.containers import ExperimentalColumn >>> column = ExperimentalColumn("test", [1, 2, 3]) >>> column.rename("new_name") - Column('new_name', [1, 2, 3]) + +----------+ + | new_name | + | --- | + | i64 | + +==========+ + | 1 | + | 2 | + | 3 | + +----------+ """ return self._from_polars_series(self._series.rename(new_name)) @@ -386,7 +413,15 @@ def transform( >>> from safeds.data.tabular.containers import ExperimentalColumn >>> column = ExperimentalColumn("test", [1, 2, 3]) >>> column.transform(lambda cell: 2 * cell) - Column('test', [2, 4, 6]) + +------+ + | test | + | --- | + | i64 | + +======+ + | 2 | + | 4 | + | 6 | + +------+ """ result = transformer(_VectorizedCell(self)) if not isinstance(result, _VectorizedCell): @@ -412,26 +447,29 @@ def summarize_statistics(self) -> ExperimentalTable: >>> from safeds.data.tabular.containers import ExperimentalColumn >>> column = ExperimentalColumn("a", [1, 3]) >>> column.summarize_statistics() - shape: (10, 2) - ┌──────────────────────┬──────────┐ - │ metric ┆ a │ - │ --- ┆ --- │ - │ str ┆ f64 │ - ╞══════════════════════╪══════════╡ - │ min ┆ 1.0 │ - │ max ┆ 3.0 │ - │ mean ┆ 2.0 │ - │ median ┆ 2.0 │ - │ standard deviation ┆ 1.414214 │ - │ distinct value count ┆ 2.0 │ - │ missing value count ┆ 0.0 │ - │ missing value ratio ┆ 0.0 │ - │ idness ┆ 1.0 │ - │ stability ┆ 0.5 │ - └──────────────────────┴──────────┘ + +----------------------+--------------------+ + | metric | a | + | --- | --- | + | str | str | + +===========================================+ + | min | 1 | + | max | 3 | + | mean | 2.0 | + | median | 2.0 | + | standard deviation | 1.4142135623730951 | + | distinct value count | 2 | + | idness | 1.0 | + | missing value ratio | 0.0 | + | stability | 0.5 | + +----------------------+--------------------+ """ from ._experimental_table import ExperimentalTable + # TODO: turn this around (call table method, implement in 
table; allows parallelization) + mean = self.mean() or "-" + median = self.median() or "-" + standard_deviation = self.standard_deviation() or "-" + return ExperimentalTable( { "metric": [ @@ -441,24 +479,22 @@ def summarize_statistics(self) -> ExperimentalTable: "median", "standard deviation", "distinct value count", - "missing value count", - "missing value ratio", "idness", + "missing value ratio", "stability", ], self.name: [ - self.min(), - self.max(), - self.mean(), - self.median(), - self.standard_deviation(), - self.distinct_value_count(), - self.missing_value_count(), - self.missing_value_ratio(), - self.idness(), - self.stability(), + str(self.min()), + str(self.max()), + str(mean), + str(median), + str(standard_deviation), + str(self.distinct_value_count()), + str(self.idness()), + str(self.missing_value_ratio()), + str(self.stability()), ], - } + }, ) def correlation_with(self, other: ExperimentalColumn) -> float: @@ -494,7 +530,7 @@ def correlation_with(self, other: ExperimentalColumn) -> float: """ import polars as pl - return pl.DataFrame({"a": self._series, "b": other._series}).corr()["a"][1] + return pl.DataFrame({"a": self._series, "b": other._series}).corr().item(row=1, column="a") def distinct_value_count(self) -> int: """ @@ -682,7 +718,14 @@ def mode(self) -> ExperimentalColumn[T]: >>> from safeds.data.tabular.containers import ExperimentalColumn >>> column = ExperimentalColumn("test", [3, 1, 2, 1, 3]) >>> column.mode() - Column('test', [1, 3]) + +------+ + | test | + | --- | + | i64 | + +======+ + | 1 | + | 3 | + +------+ """ return self._from_polars_series(self._series.mode().sort()) @@ -716,7 +759,7 @@ def stability(self) -> float: return mode_count / non_missing.len() - def standard_deviation(self) -> float: + def standard_deviation(self) -> float | None: """ Return the standard deviation of the values in the column. 
@@ -725,7 +768,8 @@ def standard_deviation(self) -> float: Returns ------- standard_deviation: - The standard deviation of the values in the column. + The standard deviation of the values in the column. If no standard deviation can be calculated due to the + type of the column, None is returned. Examples -------- @@ -734,9 +778,14 @@ def standard_deviation(self) -> float: >>> column.standard_deviation() 1.0 """ - return self._series.std() + from polars.exceptions import InvalidOperationError - def variance(self) -> float: + try: + return self._series.std() + except InvalidOperationError: + return None + + def variance(self) -> float | None: """ Return the variance of the values in the column. @@ -745,7 +794,8 @@ def variance(self) -> float: Returns ------- variance: - The variance of the values in the column. + The variance of the values in the column. If no variance can be calculated due to the type of the column, + None is returned. Examples -------- @@ -754,7 +804,12 @@ def variance(self) -> float: >>> column.variance() 1.0 """ - return self._series.var() + from polars.exceptions import InvalidOperationError + + try: + return self._series.var() + except InvalidOperationError: + return None # ------------------------------------------------------------------------------------------------------------------ # Export @@ -792,20 +847,19 @@ def to_table(self) -> ExperimentalTable: >>> from safeds.data.tabular.containers import ExperimentalColumn >>> column = ExperimentalColumn("test", [1, 2, 3]) >>> column.to_table() - shape: (3, 1) - ┌──────┐ - │ test │ - │ --- │ - │ i64 │ - ╞══════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └──────┘ + +------+ + | test | + | --- | + | i64 | + +======+ + | 1 | + | 2 | + | 3 | + +------+ """ from ._experimental_table import ExperimentalTable - return ExperimentalTable._from_polars_dataframe(self._series.to_frame()) + return ExperimentalTable._from_polars_data_frame(self._series.to_frame()) def temporary_to_old_column(self) -> Column: """ diff --git 
a/src/safeds/data/tabular/containers/_experimental_lazy_cell.py b/src/safeds/data/tabular/containers/_experimental_lazy_cell.py index 88b2e7276..d3b5e56cf 100644 --- a/src/safeds/data/tabular/containers/_experimental_lazy_cell.py +++ b/src/safeds/data/tabular/containers/_experimental_lazy_cell.py @@ -16,7 +16,7 @@ class _LazyCell(ExperimentalCell[T]): """ - A cell is a single value in a table. + A single value in a table. This implementation only builds an expression that will be evaluated when needed. """ @@ -82,6 +82,12 @@ def __ne__(self, other: object) -> ExperimentalCell[bool]: # type: ignore[overr def __abs__(self) -> ExperimentalCell[R]: return _wrap(self._expression.__abs__()) + def __ceil__(self) -> ExperimentalCell[R]: + return _wrap(self._expression.ceil()) + + def __floor__(self) -> ExperimentalCell[R]: + return _wrap(self._expression.floor()) + def __neg__(self) -> ExperimentalCell[R]: return _wrap(self._expression.__neg__()) @@ -164,6 +170,10 @@ def __sizeof__(self) -> int: # Internal # ------------------------------------------------------------------------------------------------------------------ + @property + def _polars_expression(self) -> pl.Expr: + return self._expression + def _equals(self, other: object) -> bool: if not isinstance(other, _LazyCell): return NotImplemented diff --git a/src/safeds/data/tabular/containers/_experimental_lazy_vectorized_row.py b/src/safeds/data/tabular/containers/_experimental_lazy_vectorized_row.py index 3b328dd07..7f29db99c 100644 --- a/src/safeds/data/tabular/containers/_experimental_lazy_vectorized_row.py +++ b/src/safeds/data/tabular/containers/_experimental_lazy_vectorized_row.py @@ -16,7 +16,7 @@ class _LazyVectorizedRow(ExperimentalRow): """ - A row is a one-dimensional collection of named, heterogeneous values. + A one-dimensional collection of named, heterogeneous values. This implementation treats an entire table as a row, where each column is a "cell" in the row. 
This greatly speeds up operations on the row. diff --git a/src/safeds/data/tabular/containers/_experimental_row.py b/src/safeds/data/tabular/containers/_experimental_row.py index 92c4aaf02..49a1deb8b 100644 --- a/src/safeds/data/tabular/containers/_experimental_row.py +++ b/src/safeds/data/tabular/containers/_experimental_row.py @@ -12,7 +12,11 @@ class ExperimentalRow(ABC, Mapping[str, Any]): - """A row is a one-dimensional collection of named, heterogeneous values.""" + """ + A one-dimensional collection of named, heterogeneous values. + + This class cannot be instantiated directly. It is only used for arguments of callbacks. + """ # ------------------------------------------------------------------------------------------------------------------ # Dunder methods diff --git a/src/safeds/data/tabular/containers/_experimental_table.py b/src/safeds/data/tabular/containers/_experimental_table.py index 4c515a2d9..7fd7d700f 100644 --- a/src/safeds/data/tabular/containers/_experimental_table.py +++ b/src/safeds/data/tabular/containers/_experimental_table.py @@ -2,8 +2,10 @@ from typing import TYPE_CHECKING, Any, Literal +from safeds._config._polars import _get_polars_config from safeds._utils import _check_and_normalize_file_path, _structural_hash from safeds._utils._random import _get_random_seed +from safeds.data.labeled.containers import ExperimentalTabularDataset from safeds.data.tabular.plotting._experimental_table_plotter import ExperimentalTablePlotter from safeds.data.tabular.typing._experimental_polars_data_type import _PolarsDataType from safeds.data.tabular.typing._experimental_polars_schema import _PolarsSchema @@ -18,17 +20,18 @@ from ._experimental_column import ExperimentalColumn from ._experimental_lazy_cell import _LazyCell from ._experimental_lazy_vectorized_row import _LazyVectorizedRow -from ._experimental_vectorized_cell import _VectorizedCell from ._table import Table if TYPE_CHECKING: from collections.abc import Callable, Mapping, Sequence from 
pathlib import Path - from polars import DataFrame, LazyFrame + import polars as pl - from safeds.data.labeled.containers import TabularDataset - from safeds.data.tabular.transformation import InvertibleTableTransformer, TableTransformer + from safeds.data.tabular.transformation import ( + ExperimentalInvertibleTableTransformer, + ExperimentalTableTransformer, + ) from safeds.data.tabular.typing import ExperimentalSchema from safeds.data.tabular.typing._experimental_data_type import ExperimentalDataType @@ -38,7 +41,7 @@ class ExperimentalTable: """ - A table is a two-dimensional collection of data. It can either be seen as a list of rows or as a list of columns. + A two-dimensional collection of data. It can either be seen as a list of rows or as a list of columns. To create a `Table` call the constructor or use one of the following static methods: @@ -57,7 +60,7 @@ class ExperimentalTable: Raises ------ - ColumnLengthMismatchError + ValueError If columns have different lengths. Examples @@ -72,7 +75,45 @@ class ExperimentalTable: @staticmethod def from_columns(columns: ExperimentalColumn | list[ExperimentalColumn]) -> ExperimentalTable: - raise NotImplementedError + """ + Create a table from a list of columns. + + Parameters + ---------- + columns: + The columns. + + Returns + ------- + table: + The created table. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn, ExperimentalTable + >>> a = ExperimentalColumn("a", [1, 2, 3]) + >>> b = ExperimentalColumn("b", [4, 5, 6]) + >>> ExperimentalTable.from_columns([a, b]) + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 4 | + | 2 | 5 | + | 3 | 6 | + +-----+-----+ + """ + import polars as pl + + # TODO: raises + + if isinstance(columns, ExperimentalColumn): + columns = [columns] + + return ExperimentalTable._from_polars_lazy_frame( + pl.LazyFrame([column._series for column in columns]), + ) @staticmethod def from_csv_file(path: str | Path) -> ExperimentalTable: @@ -100,15 +141,14 @@ def from_csv_file(path: str | Path) -> ExperimentalTable: -------- >>> from safeds.data.tabular.containers import ExperimentalTable >>> ExperimentalTable.from_csv_file("./src/resources/from_csv_file.csv") - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 2 ┆ 1 │ - │ 0 ┆ 0 ┆ 7 │ - └─────┴─────┴─────┘ + +-----+-----+-----+ + | a | b | c | + | --- | --- | --- | + | i64 | i64 | i64 | + +=================+ + | 1 | 2 | 1 | + | 0 | 0 | 7 | + +-----+-----+-----+ """ import polars as pl @@ -132,7 +172,7 @@ def from_dict(data: dict[str, list[Any]]) -> ExperimentalTable: Raises ------ - ColumnLengthMismatchError + ValueError If columns have different lengths. 
Examples @@ -140,39 +180,112 @@ def from_dict(data: dict[str, list[Any]]) -> ExperimentalTable: >>> from safeds.data.tabular.containers import ExperimentalTable >>> data = {'a': [1, 2, 3], 'b': [4, 5, 6]} >>> ExperimentalTable.from_dict(data) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 5 │ - │ 3 ┆ 6 │ - └─────┴─────┘ + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 4 | + | 2 | 5 | + | 3 | 6 | + +-----+-----+ """ return ExperimentalTable(data) @staticmethod def from_json_file(path: str | Path) -> ExperimentalTable: - raise NotImplementedError + """ + Create a table from a JSON file. + + Parameters + ---------- + path: + The path to the JSON file. If the file extension is omitted, it is assumed to be ".json". + + Returns + ------- + table: + The created table. + + Raises + ------ + FileNotFoundError + If no file exists at the given path. + ValueError + If the path has an extension that is not ".json". + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> ExperimentalTable.from_json_file("./src/resources/from_json_file_2.json") + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 4 | + | 2 | 5 | + | 3 | 6 | + +-----+-----+ + """ + import polars as pl + + path = _check_and_normalize_file_path(path, ".json", [".json"], check_if_file_exists=True) + return ExperimentalTable._from_polars_data_frame(pl.read_json(path)) @staticmethod def from_parquet_file(path: str | Path) -> ExperimentalTable: - raise NotImplementedError + """ + Create a table from a Parquet file. + + Parameters + ---------- + path: + The path to the Parquet file. If the file extension is omitted, it is assumed to be ".parquet". + + Returns + ------- + table: + The created table. + + Raises + ------ + FileNotFoundError + If no file exists at the given path. + ValueError + If the path has an extension that is not ".parquet". 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> ExperimentalTable.from_parquet_file("./src/resources/from_parquet_file.parquet") + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 4 | + | 2 | 5 | + | 3 | 6 | + +-----+-----+ + """ + import polars as pl + + path = _check_and_normalize_file_path(path, ".parquet", [".parquet"], check_if_file_exists=True) + return ExperimentalTable._from_polars_lazy_frame(pl.scan_parquet(path)) @staticmethod - def _from_polars_dataframe(data: DataFrame) -> ExperimentalTable: + def _from_polars_data_frame(data: pl.DataFrame) -> ExperimentalTable: result = object.__new__(ExperimentalTable) result._lazy_frame = data.lazy() - result._data_frame = data + result.__data_frame_cache = data return result @staticmethod - def _from_polars_lazy_frame(data: LazyFrame) -> ExperimentalTable: + def _from_polars_lazy_frame(data: pl.LazyFrame) -> ExperimentalTable: result = object.__new__(ExperimentalTable) result._lazy_frame = data - result._data_frame = None + result.__data_frame_cache = None return result # ------------------------------------------------------------------------------------------------------------------ @@ -197,7 +310,7 @@ def __init__(self, data: Mapping[str, Sequence[Any]] | None = None) -> None: # Implementation self._lazy_frame: pl.LazyFrame = pl.LazyFrame(data) - self._data_frame: pl.DataFrame | None = None + self.__data_frame_cache: pl.DataFrame | None = None def __eq__(self, other: object) -> bool: if not isinstance(other, ExperimentalTable): @@ -205,38 +318,33 @@ def __eq__(self, other: object) -> bool: if self is other: return True - if self._data_frame is None: - self._data_frame = self._lazy_frame.collect() - if other._data_frame is None: - other._data_frame = other._lazy_frame.collect() - return self._data_frame.frame_equal(other._data_frame) def __hash__(self) -> int: return _structural_hash(self.schema, self.number_of_rows) def 
__repr__(self) -> str: - if self._data_frame is None: - self._data_frame = self._lazy_frame.collect() - - return self._data_frame.__repr__() + with _get_polars_config(): + return self._data_frame.__repr__() def __sizeof__(self) -> int: - if self._data_frame is None: - self._data_frame = self._lazy_frame.collect() - return self._data_frame.estimated_size() def __str__(self) -> str: - if self._data_frame is None: - self._data_frame = self._lazy_frame.collect() - - return self._data_frame.__str__() + with _get_polars_config(): + return self._data_frame.__str__() # ------------------------------------------------------------------------------------------------------------------ # Properties # ------------------------------------------------------------------------------------------------------------------ + @property + def _data_frame(self) -> pl.DataFrame: + if self.__data_frame_cache is None: + self.__data_frame_cache = self._lazy_frame.collect() + + return self.__data_frame_cache + @property def column_names(self) -> list[str]: """ @@ -270,7 +378,7 @@ def number_of_rows(self) -> int: """ The number of rows in the table. - Note that this operation must fully load the data into memory, which can be expensive. + **Note:** This operation must fully load the data into memory, which can be expensive. 
Examples -------- @@ -279,17 +387,16 @@ def number_of_rows(self) -> int: >>> table.number_of_rows 3 """ - if self._data_frame is None: - self._data_frame = self._lazy_frame.collect() - return self._data_frame.height @property def plot(self) -> ExperimentalTablePlotter: + """The plotter for the table.""" return ExperimentalTablePlotter(self) @property def schema(self) -> ExperimentalSchema: + """The schema of the table.""" return _PolarsSchema(self._lazy_frame.schema) # ------------------------------------------------------------------------------------------------------------------ @@ -300,62 +407,366 @@ def add_columns( self, columns: ExperimentalColumn | list[ExperimentalColumn], ) -> ExperimentalTable: - raise NotImplementedError + """ + Return a new table with additional columns. + + **Notes:** + + * The original table is not modified. + * This operation must fully load the data into memory, which can be expensive. - def compute_column( + Parameters + ---------- + columns: + The columns to add. + + Returns + ------- + new_table: + The table with the additional columns. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn, ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3]}) + >>> new_column = ExperimentalColumn("b", [4, 5, 6]) + >>> table.add_columns(new_column) + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 4 | + | 2 | 5 | + | 3 | 6 | + +-----+-----+ + """ + if isinstance(columns, ExperimentalColumn): + columns = [columns] + + if len(columns) == 0: + return self + + return ExperimentalTable._from_polars_data_frame( + self._data_frame.hstack([column._series for column in columns]), + ) + + def add_computed_column( self, name: str, computer: Callable[[ExperimentalRow], ExperimentalCell], ) -> ExperimentalTable: + """ + Return a new table with an additional computed column. + + **Note:** The original table is not modified. 
+ + Parameters + ---------- + name: + The name of the new column. + computer: + The function that computes the values of the new column. + + Returns + ------- + new_table: + The table with the computed column. + + Raises + ------ + ValueError + If the column name already exists. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.add_computed_column("c", lambda row: row.get_value("a") + row.get_value("b")) + +-----+-----+-----+ + | a | b | c | + | --- | --- | --- | + | i64 | i64 | i64 | + +=================+ + | 1 | 4 | 5 | + | 2 | 5 | 7 | + | 3 | 6 | 9 | + +-----+-----+-----+ + """ if self.has_column(name): raise DuplicateColumnNameError(name) computed_column = computer(_LazyVectorizedRow(self)) - if not isinstance(computed_column, _LazyCell): - raise TypeError("The computer must return a cell.") return self._from_polars_lazy_frame( - self._lazy_frame.with_columns(name, computed_column._expression), + self._lazy_frame.with_columns(computed_column._polars_expression.alias(name)), ) def get_column(self, name: str) -> ExperimentalColumn: - if self._data_frame is None: - self._data_frame = self._lazy_frame.collect() + """ + Get a column from the table. + + **Note:** This operation must fully load the data into memory, which can be expensive. + + Parameters + ---------- + name: + The name of the column. + + Returns + ------- + column: + The column. + + Raises + ------ + KeyError + If the column does not exist. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.get_column("a") + +-----+ + | a | + | --- | + | i64 | + +=====+ + | 1 | + | 2 | + | 3 | + +-----+ + """ + if not self.has_column(name): + raise UnknownColumnNameError([name]) return ExperimentalColumn._from_polars_series(self._data_frame.get_column(name)) def get_column_type(self, name: str) -> ExperimentalDataType: + """ + Get the data type of a column. + + Parameters + ---------- + name: + The name of the column. + + Returns + ------- + type: + The data type of the column. + + Raises + ------ + KeyError + If the column does not exist. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.get_column_type("a") + Int64 + """ + if not self.has_column(name): + raise UnknownColumnNameError([name]) + return _PolarsDataType(self._lazy_frame.schema[name]) def has_column(self, name: str) -> bool: + """ + Check if the table has a column with a specific name. + + Parameters + ---------- + name: + The name of the column. + + Returns + ------- + has_column: + Whether the table has a column with the specified name. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.has_column("a") + True + """ return name in self.column_names - def remove_columns(self, names: str | list[str]) -> ExperimentalTable: + def remove_columns( + self, + names: str | list[str], + /, + ) -> ExperimentalTable: + """ + Return a new table without the specified columns. + + **Note:** The original table is not modified. + + Parameters + ---------- + names: + The names of the columns to remove. + + Returns + ------- + new_table: + The table with the columns removed. 
+ + Raises + ------ + KeyError + If a column does not exist. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.remove_columns("a") + +-----+ + | b | + | --- | + | i64 | + +=====+ + | 4 | + | 5 | + | 6 | + +-----+ + """ if isinstance(names, str): names = [names] + # TODO: raises? + return ExperimentalTable._from_polars_lazy_frame( self._lazy_frame.drop(names), ) - def remove_columns_except(self, names: str | list[str]) -> ExperimentalTable: + def remove_columns_except( + self, + names: str | list[str], + /, + ) -> ExperimentalTable: + """ + Return a new table with only the specified columns. + + Parameters + ---------- + names: + The names of the columns to keep. + + Returns + ------- + new_table: + The table with only the specified columns. + + Raises + ------ + KeyError + If a column does not exist. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.remove_columns_except("a") + +-----+ + | a | + | --- | + | i64 | + +=====+ + | 1 | + | 2 | + | 3 | + +-----+ + """ if isinstance(names, str): names = [names] - names_set = set(names) - return self.remove_columns([name for name in self.column_names if name not in names_set]) + # TODO: raises? + + return ExperimentalTable._from_polars_lazy_frame( + self._lazy_frame.select(names), + ) def remove_columns_with_missing_values(self) -> ExperimentalTable: - raise NotImplementedError + """ + Return a new table without columns that contain missing values. - def remove_columns_with_non_numeric_values(self) -> ExperimentalTable: - raise NotImplementedError + **Notes:** + + * The original table is not modified. + * This operation must fully load the data into memory, which can be expensive. + + Returns + ------- + new_table: + The table without columns containing missing values. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, None]}) + >>> table.remove_columns_with_missing_values() + +-----+ + | a | + | --- | + | i64 | + +=====+ + | 1 | + | 2 | + | 3 | + +-----+ + """ + import polars as pl + + return ExperimentalTable._from_polars_lazy_frame( + pl.LazyFrame( + [series for series in self._data_frame.get_columns() if series.null_count() == 0], + ), + ) + + def remove_non_numeric_columns(self) -> ExperimentalTable: + """ + Return a new table without non-numeric columns. + + **Note:** The original table is not modified. + + Returns + ------- + new_table: + The table without non-numeric columns. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3], "b": ["4", "5", "6"]}) + >>> table.remove_non_numeric_columns() + +-----+ + | a | + | --- | + | i64 | + +=====+ + | 1 | + | 2 | + | 3 | + +-----+ + """ + import polars.selectors as cs + + return ExperimentalTable._from_polars_lazy_frame( + self._lazy_frame.select(cs.numeric()), + ) def rename_column(self, old_name: str, new_name: str) -> ExperimentalTable: """ Return a new table with a column renamed. - Note that the original table is not modified. + **Note:** The original table is not modified. Parameters ---------- @@ -369,23 +780,29 @@ def rename_column(self, old_name: str, new_name: str) -> ExperimentalTable: new_table: The table with the column renamed. + Raises + ------ + KeyError + If no column with the old name exists. 
+ Examples -------- >>> from safeds.data.tabular.containers import ExperimentalTable >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> table.rename_column("a", "A") - shape: (3, 2) - ┌─────┬─────┐ - │ A ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 5 │ - │ 3 ┆ 6 │ - └─────┴─────┘ + >>> table.rename_column("a", "c") + +-----+-----+ + | c | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 4 | + | 2 | 5 | + | 3 | 6 | + +-----+-----+ """ - # TODO: raises? + if not self.has_column(old_name): + raise UnknownColumnNameError([old_name]) + return ExperimentalTable._from_polars_lazy_frame( self._lazy_frame.rename({old_name: new_name}), ) @@ -395,28 +812,92 @@ def replace_column( old_name: str, new_columns: ExperimentalColumn | list[ExperimentalColumn], ) -> ExperimentalTable: + """ + Return a new table with a column replaced by zero or more columns. + + **Note:** The original table is not modified. + + Parameters + ---------- + old_name: + The name of the column to replace. + new_columns: + The new column or columns. + + Returns + ------- + new_table: + The table with the column replaced. + + Raises + ------ + KeyError + If no column with the old name exists. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn, ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.replace_column("a", []) + +-----+ + | b | + | --- | + | i64 | + +=====+ + | 4 | + | 5 | + | 6 | + +-----+ + + >>> column1 = ExperimentalColumn("c", [7, 8, 9]) + >>> table.replace_column("a", column1) + +-----+-----+ + | c | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 7 | 4 | + | 8 | 5 | + | 9 | 6 | + +-----+-----+ + + >>> column2 = ExperimentalColumn("d", [10, 11, 12]) + >>> table.replace_column("a", [column1, column2]) + +-----+-----+-----+ + | c | d | b | + | --- | --- | --- | + | i64 | i64 | i64 | + +=================+ + | 7 | 10 | 4 | + | 8 | 11 | 5 | + | 9 | 12 | 6 | + +-----+-----+-----+ + """ + if not self.has_column(old_name): + raise UnknownColumnNameError([old_name]) + if isinstance(new_columns, ExperimentalColumn): new_columns = [new_columns] if len(new_columns) == 0: return self.remove_columns(old_name) - if self._data_frame is None: - self._data_frame = self._lazy_frame.collect() - - new_frame = self._data_frame - index = new_frame.get_column_index(old_name) - if len(new_columns) == 1: - return ExperimentalTable._from_polars_dataframe( - new_frame.replace_column(index, new_columns[0]._series), + new_column = new_columns[0] + return ExperimentalTable._from_polars_lazy_frame( + self._lazy_frame.with_columns(new_column._series.alias(old_name)).rename({old_name: new_column.name}), ) - prefix = new_frame.select(self.column_names[:index]) - suffix = new_frame.select(self.column_names[index + 1 :]) + import polars as pl - return ExperimentalTable._from_polars_dataframe( - prefix.hstack([column._series for column in new_columns]).hstack(suffix), + index = self.column_names.index(old_name) + + return ExperimentalTable._from_polars_lazy_frame( + self._lazy_frame.select( + *[pl.col(name) for name in self.column_names[:index]], + *[column._series for column in 
new_columns], + *[pl.col(name) for name in self.column_names[index + 1 :]], + ), ) def transform_column( @@ -424,16 +905,53 @@ def transform_column( name: str, transformer: Callable[[ExperimentalCell], ExperimentalCell], ) -> ExperimentalTable: + """ + Return a new table with a column transformed. + + **Note:** The original table is not modified. + + Parameters + ---------- + name: + The name of the column to transform. + + transformer: + The function that transforms the column. + + Returns + ------- + new_table: + The table with the transformed column. + + Raises + ------ + KeyError + If no column with the specified name exists. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.transform_column("a", lambda cell: cell + 1) + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 2 | 4 | + | 3 | 5 | + | 4 | 6 | + +-----+-----+ + """ if not self.has_column(name): raise UnknownColumnNameError([name]) # TODO: in the error, compute similar column names - transformed_column = transformer(_VectorizedCell(self.get_column(name))) - if not isinstance(transformed_column, _VectorizedCell): - raise TypeError("The transformer must return a cell.") + import polars as pl + + transformed_column = transformer(_LazyCell(pl.col(name))) - return self.replace_column( - name, - ExperimentalColumn._from_polars_series(transformed_column._series), + return ExperimentalTable._from_polars_lazy_frame( + self._lazy_frame.with_columns(transformed_column._polars_expression), ) # ------------------------------------------------------------------------------------------------------------------ @@ -444,42 +962,70 @@ def transform_column( def remove_duplicate_rows(self) -> ExperimentalTable: """ - Remove duplicate rows from the table. + Return a new table without duplicate rows. + + **Note:** The original table is not modified. 
Returns ------- - filtered_table: + new_table: The table without duplicate rows. Examples -------- >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"a": [1, 2, 2], "b": [4, 5, 5]}) - >>> table.remove_duplicate_rows() - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 5 │ - └─────┴─────┘ + >>> table = ExperimentalTable({"a": [1, 2, 2], "b": [4, 5, 5]}) + >>> table.remove_duplicate_rows() + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 4 | + | 2 | 5 | + +-----+-----+ + """ + return ExperimentalTable._from_polars_lazy_frame( + self._lazy_frame.unique(maintain_order=True), + ) + + def remove_rows( + self, + query: Callable[[ExperimentalRow], ExperimentalCell[bool]], + ) -> ExperimentalTable: + """ + Return a new table without rows that satisfy a condition. + + **Note:** The original table is not modified. + + Parameters + ---------- + query: + The function that determines which rows to remove. + + Returns + ------- + new_table: + The table without the specified rows. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.remove_rows(lambda row: row.get_value("a") == 2) + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 4 | + | 3 | 6 | + +-----+-----+ """ - return ExperimentalTable._from_polars_lazy_frame( - self._lazy_frame.unique(maintain_order=True), - ) - - def remove_rows( - self, - query: Callable[[ExperimentalRow], ExperimentalCell[bool]], - ) -> ExperimentalTable: mask = query(_LazyVectorizedRow(self)) - if not isinstance(mask, _LazyCell): - raise TypeError("The query must return a boolean cell.") return ExperimentalTable._from_polars_lazy_frame( - self._lazy_frame.filter(mask._expression), + self._lazy_frame.filter(~mask._polars_expression), ) def remove_rows_by_column( @@ -487,25 +1033,70 @@ def remove_rows_by_column( name: str, query: Callable[[ExperimentalCell], ExperimentalCell[bool]], ) -> ExperimentalTable: - raise NotImplementedError + """ + Return a new table without rows that satisfy a condition on a specific column. + + **Note:** The original table is not modified. + + Parameters + ---------- + name: + The name of the column. + query: + The function that determines which rows to remove. + + Returns + ------- + new_table: + The table without the specified rows. + + Raises + ------ + KeyError + If the column does not exist. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.remove_rows_by_column("a", lambda cell: cell == 2) + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 4 | + | 3 | 6 | + +-----+-----+ + """ + import polars as pl + + if not self.has_column(name): + raise UnknownColumnNameError([name]) + + mask = query(_LazyCell(pl.col(name))) + + return ExperimentalTable._from_polars_lazy_frame( + self._lazy_frame.filter(~mask._polars_expression), + ) def remove_rows_with_missing_values( self, - subset_names: list[str] | None = None, + column_names: list[str] | None = None, ) -> ExperimentalTable: """ - Remove rows with missing values from the table. + Return a new table without rows containing missing values in the specified columns. - Note that the original table is not modified. + **Note:** The original table is not modified. Parameters ---------- - subset_names: + column_names: Names of the columns to consider. If None, all columns are considered. Returns ------- - filtered_table: + new_table: The table without rows containing missing values in the specified columns. 
Examples @@ -513,30 +1104,119 @@ def remove_rows_with_missing_values( >>> from safeds.data.tabular.containers import ExperimentalTable >>> table = ExperimentalTable({"a": [1, None, 3], "b": [4, 5, None]}) >>> table.remove_rows_with_missing_values() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 4 │ - └─────┴─────┘ + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 4 | + +-----+-----+ """ return ExperimentalTable._from_polars_lazy_frame( - self._lazy_frame.drop_nulls(subset=subset_names), + self._lazy_frame.drop_nulls(subset=column_names), ) def remove_rows_with_outliers( self, - subset_names: list[str] | None = None, + column_names: list[str] | None = None, + *, + z_score_threshold: float = 3, ) -> ExperimentalTable: - raise NotImplementedError + """ + Return a new table without rows containing outliers in the specified columns. + + Whether a data point is an outlier in a column is determined by its z-score. The z-score the distance of the + data point from the mean of the column divided by the standard deviation of the column. If the z-score is + greater than the given threshold, the data point is considered an outlier. Missing values are ignored during the + calculation of the z-score. + + The z-score is only defined for numeric columns. Non-numeric columns are ignored, even if they are specified in + `column_names`. + + **Notes:** + + * The original table is not modified. + * This operation must fully load the data into memory, which can be expensive. + + Parameters + ---------- + column_names: + Names of the columns to consider. If None, all numeric columns are considered. + z_score_threshold: + The z-score threshold for detecting outliers. + + Returns + ------- + new_table: + The table without rows containing outliers in the specified columns. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable( + ... 
{ + ... "a": [1, 2, 3, 4, 5, 6, 1000, None], + ... "b": [1, 2, 3, 4, 5, 6, 7, 8], + ... } + ... ) + >>> table.remove_rows_with_outliers(z_score_threshold=2) + +------+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +============+ + | 1 | 1 | + | 2 | 2 | + | 3 | 3 | + | 4 | 4 | + | 5 | 5 | + | 6 | 6 | + | null | 8 | + +------+-----+ + """ + if column_names is None: + column_names = self.column_names + + import polars as pl + import polars.selectors as cs + + non_outlier_mask = pl.all_horizontal( + self._data_frame.select(cs.numeric() & cs.by_name(column_names)).select( + pl.all().is_null() | (((pl.all() - pl.all().mean()) / pl.all().std()).abs() <= z_score_threshold), + ), + ) + + return ExperimentalTable._from_polars_lazy_frame( + self._lazy_frame.filter(non_outlier_mask), + ) def shuffle_rows(self) -> ExperimentalTable: - if self._data_frame is None: - self._data_frame = self._lazy_frame.collect() + """ + Return a new table with the rows shuffled. + + **Note:** The original table is not modified. + + Returns + ------- + new_table: + The table with the rows shuffled. - return ExperimentalTable._from_polars_dataframe( + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.shuffle_rows() + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 3 | 6 | + | 2 | 5 | + | 1 | 4 | + +-----+-----+ + """ + return ExperimentalTable._from_polars_data_frame( self._data_frame.sample( fraction=1, shuffle=True, @@ -545,6 +1225,46 @@ def shuffle_rows(self) -> ExperimentalTable: ) def slice_rows(self, start: int = 0, length: int | None = None) -> ExperimentalTable: + """ + Return a new table with a slice of rows. + + **Note:** The original table is not modified. + + Parameters + ---------- + start: + The start index of the slice. + length: + The length of the slice. If None, the slice contains all rows starting from `start`. 
+ + Returns + ------- + new_table: + The table with the slice of rows. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.slice_rows(start=1) + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 2 | 5 | + | 3 | 6 | + +-----+-----+ + + >>> table.slice_rows(start=1, length=1) + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 2 | 5 | + +-----+-----+ + """ return ExperimentalTable._from_polars_lazy_frame( self._lazy_frame.slice(start, length), ) @@ -555,13 +1275,43 @@ def sort_rows( *, descending: bool = False, ) -> ExperimentalTable: + """ + Return a new table with the rows sorted. + + **Note:** The original table is not modified. + + Parameters + ---------- + key_selector: + The function that selects the key to sort by. + descending: + Whether to sort in descending order. + + Returns + ------- + new_table: + The table with the rows sorted. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [2, 1, 3], "b": [1, 1, 2]}) + >>> table.sort_rows(lambda row: row.get_value("a") - row.get_value("b")) + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 1 | + | 2 | 1 | + | 3 | 2 | + +-----+-----+ + """ key = key_selector(_LazyVectorizedRow(self)) - if not isinstance(key, _LazyCell): - raise TypeError("The key selector must return a cell.") return ExperimentalTable._from_polars_lazy_frame( self._lazy_frame.sort( - key._expression, + key._polars_expression, descending=descending, maintain_order=True, ), @@ -573,6 +1323,46 @@ def sort_rows_by_column( *, descending: bool = False, ) -> ExperimentalTable: + """ + Return a new table with the rows sorted by a specific column. + + **Note:** The original table is not modified. + + Parameters + ---------- + name: + The name of the column to sort by. 
+ descending: + Whether to sort in descending order. + + Returns + ------- + new_table: + The table with the rows sorted by the specified column. + + Raises + ------ + KeyError + If the column does not exist. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [2, 1, 3], "b": [1, 1, 2]}) + >>> table.sort_rows_by_column("a") + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 1 | + | 2 | 1 | + | 3 | 2 | + +-----+-----+ + """ + if not self.has_column(name): + raise UnknownColumnNameError([name]) + return ExperimentalTable._from_polars_lazy_frame( self._lazy_frame.sort( name, @@ -587,6 +1377,58 @@ def split_rows( *, shuffle: bool = True, ) -> tuple[ExperimentalTable, ExperimentalTable]: + """ + Create two tables by splitting the rows of the current table. + + The first table contains a percentage of the rows specified by `percentage_in_first`, and the second table + contains the remaining rows. + + **Note:** The original table is not modified. + + Parameters + ---------- + percentage_in_first: + The percentage of rows to include in the first table. Must be between 0 and 1. + shuffle: + Whether to shuffle the rows before splitting. + + Returns + ------- + first_table: + The first table. + second_table: + The second table. + + Raises + ------ + ValueError + If `percentage_in_first` is not between 0 and 1. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3, 4, 5], "b": [6, 7, 8, 9, 10]}) + >>> first_table, second_table = table.split_rows(0.6) + >>> first_table + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 6 | + | 4 | 9 | + | 3 | 8 | + +-----+-----+ + >>> second_table + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 5 | 10 | + | 2 | 7 | + +-----+-----+ + """ if percentage_in_first < 0 or percentage_in_first > 1: raise OutOfBoundsError( actual=percentage_in_first, @@ -608,30 +1450,231 @@ def split_rows( # ------------------------------------------------------------------------------------------------------------------ def add_table_as_columns(self, other: ExperimentalTable) -> ExperimentalTable: - raise NotImplementedError + """ + Return a new table with the columns of another table added. + + **Notes:** + + * The original tables are not modified. + * This operation must fully load the data into memory, which can be expensive. + + Parameters + ---------- + other: + The table to add as columns. + + Returns + ------- + new_table: + The table with the columns added. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table1 = ExperimentalTable({"a": [1, 2, 3]}) + >>> table2 = ExperimentalTable({"b": [4, 5, 6]}) + >>> table1.add_table_as_columns(table2) + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 4 | + | 2 | 5 | + | 3 | 6 | + +-----+-----+ + """ + # TODO: raises? + + return ExperimentalTable._from_polars_data_frame( + self._data_frame.hstack(other._data_frame), + ) def add_table_as_rows(self, other: ExperimentalTable) -> ExperimentalTable: - raise NotImplementedError + """ + Return a new table with the rows of another table added. + + **Notes:** + + * The original tables are not modified. 
+ * This operation must fully load the data into memory, which can be expensive. + + Parameters + ---------- + other: + The table to add as rows. + + Returns + ------- + new_table: + The table with the rows added. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table1 = ExperimentalTable({"a": [1, 2, 3]}) + >>> table2 = ExperimentalTable({"a": [4, 5, 6]}) + >>> table1.add_table_as_rows(table2) + +-----+ + | a | + | --- | + | i64 | + +=====+ + | 1 | + | 2 | + | 3 | + | 4 | + | 5 | + | 6 | + +-----+ + """ + # TODO: raises? + + return ExperimentalTable._from_polars_data_frame( + self._data_frame.vstack(other._data_frame), + ) + + def inverse_transform_table(self, fitted_transformer: ExperimentalInvertibleTableTransformer) -> ExperimentalTable: + """ + Return a new table inverse-transformed by a **fitted, invertible** transformer. + + **Notes:** + + * The original table is not modified. + * Depending on the transformer, this operation might fully load the data into memory, which can be expensive. + + Parameters + ---------- + fitted_transformer: + The fitted, invertible transformer to apply. + + Returns + ------- + new_table: + The inverse-transformed table. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> from safeds.data.tabular.transformation import ExperimentalRangeScaler + >>> table = ExperimentalTable({"a": [1, 2, 3]}) + >>> transformer, transformed_table = ExperimentalRangeScaler(min_=0, max_=1).fit_and_transform(table, ["a"]) + >>> transformed_table.inverse_transform_table(transformer) + +---------+ + | a | + | --- | + | f64 | + +=========+ + | 1.00000 | + | 2.00000 | + | 3.00000 | + +---------+ + """ + return fitted_transformer.inverse_transform(self) + + def transform_table(self, fitted_transformer: ExperimentalTableTransformer) -> ExperimentalTable: + """ + Return a new table transformed by a **fitted** transformer. 
+ + **Notes:** + + * The original table is not modified. + * Depending on the transformer, this operation might fully load the data into memory, which can be expensive. + + + Parameters + ---------- + fitted_transformer: + The fitted transformer to apply. - def inverse_transform_table(self, fitted_transformer: InvertibleTableTransformer) -> ExperimentalTable: - raise NotImplementedError + Returns + ------- + new_table: + The transformed table. - def transform_table(self, fitted_transformer: TableTransformer) -> ExperimentalTable: - raise NotImplementedError + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> from safeds.data.tabular.transformation import ExperimentalRangeScaler + >>> table = ExperimentalTable({"a": [1, 2, 3]}) + >>> transformer = ExperimentalRangeScaler(min_=0, max_=1).fit(table, ["a"]) + >>> table.transform_table(transformer) + +---------+ + | a | + | --- | + | f64 | + +=========+ + | 0.00000 | + | 0.50000 | + | 1.00000 | + +---------+ + """ + return fitted_transformer.transform(self) # ------------------------------------------------------------------------------------------------------------------ # Statistics # ------------------------------------------------------------------------------------------------------------------ def summarize_statistics(self) -> ExperimentalTable: - raise NotImplementedError + """ + Return a table with important statistics about this table. + + Returns + ------- + statistics: + The table with statistics. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 3]}) + >>> table.summarize_statistics() + +----------------------+--------------------+ + | metric | a | + | --- | --- | + | str | str | + +===========================================+ + | min | 1 | + | max | 3 | + | mean | 2.0 | + | median | 2.0 | + | standard deviation | 1.4142135623730951 | + | distinct value count | 2 | + | idness | 1.0 | + | missing value ratio | 0.0 | + | stability | 0.5 | + +----------------------+--------------------+ + """ + if self.number_of_columns == 0: + return ExperimentalTable() + + head = self.get_column(self.column_names[0]).summarize_statistics() + tail = [self.get_column(name).summarize_statistics().get_column(name)._series for name in self.column_names[1:]] + + return ExperimentalTable._from_polars_data_frame( + head._lazy_frame.collect().hstack(tail, in_place=True), + ) # ------------------------------------------------------------------------------------------------------------------ # Export # ------------------------------------------------------------------------------------------------------------------ def to_columns(self) -> list[ExperimentalColumn]: - raise NotImplementedError + """ + Return the data of the table as a list of columns. + + Returns + ------- + columns: + List of columns. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> columns = table.to_columns() + """ + return [ExperimentalColumn._from_polars_series(column) for column in self._data_frame.get_columns()] def to_csv_file(self, path: str | Path) -> None: """ @@ -677,9 +1720,6 @@ def to_dict(self) -> dict[str, list[Any]]: >>> table.to_dict() {'a': [1, 2, 3], 'b': [4, 5, 6]} """ - if self._data_frame is None: - self._data_frame = self._lazy_frame.collect() - return self._data_frame.to_dict(as_series=False) def to_json_file( @@ -694,7 +1734,7 @@ def to_json_file( If the file and/or the parent directories do not exist, they will be created. If the file exists already, it will be overwritten. - Note that this operation must fully load the data into memory, which can be expensive. + **Note:** This operation must fully load the data into memory, which can be expensive. Parameters ---------- @@ -714,28 +1754,50 @@ def to_json_file( -------- >>> from safeds.data.tabular.containers import ExperimentalTable >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> table.to_json_file("./src/resources/to_json_file.json") + >>> table.to_json_file("./src/resources/to_json_file_2.json") """ path = _check_and_normalize_file_path(path, ".json", [".json"]) path.parent.mkdir(parents=True, exist_ok=True) # Write JSON to file - if self._data_frame is None: - self._data_frame = self._lazy_frame.collect() - self._data_frame.write_json(path, row_oriented=(orientation == "row")) def to_parquet_file(self, path: str | Path) -> None: - raise NotImplementedError + """ + Write the table to a Parquet file. + + If the file and/or the parent directories do not exist, they will be created. If the file exists already, it + will be overwritten. + + Parameters + ---------- + path: + The path to the Parquet file. If the file extension is omitted, it is assumed to be ".parquet". 
- def to_tabular_dataset(self, target_name: str, extra_names: list[str] | None = None) -> TabularDataset: + Raises + ------ + ValueError + If the path has an extension that is not ".parquet". + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.to_parquet_file("./src/resources/to_parquet_file.parquet") + """ + path = _check_and_normalize_file_path(path, ".parquet", [".parquet"]) + path.parent.mkdir(parents=True, exist_ok=True) + + self._lazy_frame.sink_parquet(path) + + def to_tabular_dataset(self, target_name: str, extra_names: list[str] | None = None) -> ExperimentalTabularDataset: """ Return a new `TabularDataset` with columns marked as a target, feature, or extra. * The target column is the column that a model should predict. * Feature columns are columns that a model should use to make predictions. * Extra columns are columns that are neither feature nor target. They can be used to provide additional context, - like an ID or name column. + like an ID column. Feature columns are implicitly defined as all columns except the target and extra columns. If no extra columns are specified, all columns except the target column are used as features. @@ -763,13 +1825,16 @@ def to_tabular_dataset(self, target_name: str, extra_names: list[str] | None = N Examples -------- >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"item": ["apple", "milk", "beer"], "price": [1.10, 1.19, 1.79], "amount_bought": [74, 72, 51]}) + >>> table = ExperimentalTable( + ... { + ... "item": ["apple", "milk", "beer"], + ... "price": [1.10, 1.19, 1.79], + ... "amount_bought": [74, 72, 51], + ... } + ... 
) >>> dataset = table.to_tabular_dataset(target_name="amount_bought", extra_names=["item"]) """ - from safeds.data.labeled.containers import TabularDataset - - # TODO: more efficient implementation - return TabularDataset(self.temporary_to_old_table(), target_name, extra_names) + return ExperimentalTabularDataset(self, target_name, extra_names) def temporary_to_old_table(self) -> Table: """ @@ -786,9 +1851,6 @@ def temporary_to_old_table(self) -> Table: >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) >>> old_table = table.temporary_to_old_table() """ - if self._data_frame is None: - self._data_frame = self._lazy_frame.collect() - return Table._from_pandas_dataframe(self._data_frame.to_pandas()) # ------------------------------------------------------------------------------------------------------------------ @@ -807,7 +1869,7 @@ def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True): # The specification of the dataframe interchange protocol can be found [here](https://data-apis.org/dataframe-protocol/latest/index.html). - Note that this operation must fully load the data into memory, which can be expensive. + **Note:** This operation must fully load the data into memory, which can be expensive. Parameters ---------- @@ -822,9 +1884,6 @@ def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True): # dataframe: A dataframe object that conforms to the dataframe interchange protocol. """ - if self._data_frame is None: - self._data_frame = self._lazy_frame.collect() - return self._data_frame.__dataframe__(allow_copy=allow_copy) # ------------------------------------------------------------------------------------------------------------------ @@ -835,14 +1894,11 @@ def _repr_html_(self) -> str: """ Return a compact HTML representation of the table for IPython. - Note that this operation must fully load the data into memory, which can be expensive. 
+ **Note:** This operation must fully load the data into memory, which can be expensive. Returns ------- html: The generated HTML. """ - if self._data_frame is None: - self._data_frame = self._lazy_frame.collect() - return self._data_frame._repr_html_() diff --git a/src/safeds/data/tabular/containers/_experimental_vectorized_cell.py b/src/safeds/data/tabular/containers/_experimental_vectorized_cell.py index 34ee1893f..b648cc2ab 100644 --- a/src/safeds/data/tabular/containers/_experimental_vectorized_cell.py +++ b/src/safeds/data/tabular/containers/_experimental_vectorized_cell.py @@ -21,7 +21,7 @@ class _VectorizedCell(ExperimentalCell[T]): """ - A cell is a single value in a table. + A single value in a table. This implementation treats an entire column as a cell. This greatly speeds up operations on the cell. """ @@ -126,6 +126,12 @@ def __ne__(self, other: object) -> ExperimentalCell[bool]: # type: ignore[overr def __abs__(self) -> ExperimentalCell[R]: return _wrap(self._series.__abs__()) + def __ceil__(self) -> ExperimentalCell[R]: + return _wrap(self._series.ceil()) + + def __floor__(self) -> ExperimentalCell[R]: + return _wrap(self._series.floor()) + def __neg__(self) -> ExperimentalCell[R]: return _wrap(self._series.__neg__()) @@ -220,6 +226,10 @@ def type(self) -> ExperimentalDataType: # Internal # ------------------------------------------------------------------------------------------------------------------ + @property + def _polars_expression(self) -> pl.Series: + return self._series + def _equals(self, other: object) -> bool: if not isinstance(other, _VectorizedCell): return NotImplemented diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index 67aa79e0c..91bbbf347 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -7,7 +7,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, TypeVar -from safeds._config import 
_init_default_device, _get_device +from safeds._config import _get_device, _init_default_device from safeds._utils import _structural_hash from safeds.data.image.containers import Image from safeds.data.tabular.typing import ColumnType, Schema diff --git a/src/safeds/data/tabular/plotting/_experimental_column_plotter.py b/src/safeds/data/tabular/plotting/_experimental_column_plotter.py index 866229992..3402cd6d4 100644 --- a/src/safeds/data/tabular/plotting/_experimental_column_plotter.py +++ b/src/safeds/data/tabular/plotting/_experimental_column_plotter.py @@ -2,20 +2,135 @@ from typing import TYPE_CHECKING +from safeds._utils import _figure_to_image +from safeds.exceptions import NonNumericColumnError + if TYPE_CHECKING: from safeds.data.image.containers import Image from safeds.data.tabular.containers import ExperimentalColumn class ExperimentalColumnPlotter: + """ + A class that contains plotting methods for a column. + + Parameters + ---------- + column: + The column to plot. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("test", [1, 2, 3]) + >>> plotter = column.plot + """ + def __init__(self, column: ExperimentalColumn): - self.column: ExperimentalColumn = column + self._column: ExperimentalColumn = column def box_plot(self) -> Image: - raise NotImplementedError + """ + Create a box plot for the values in the column. This is only possible for numeric columns. + + Returns + ------- + box_plot: + The box plot as an image. + + Raises + ------ + TypeError + If the column is not numeric. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("test", [1, 2, 3]) + >>> boxplot = column.plot.box_plot() + """ + if not self._column.is_numeric: + raise NonNumericColumnError(f"{self._column.name} is of type {self._column.type}.") + + import matplotlib.pyplot as plt + + fig, ax = plt.subplots() + plot = ax.boxplot( + self._column._series, + patch_artist=True, + ) + plt.setp(plot["boxes"], facecolor="lightsteelblue") + plt.setp(plot["medians"], color="red") + + ax.set(title=self._column.name) + ax.set_xticks([]) + ax.yaxis.grid(visible=True) + fig.tight_layout() + + return _figure_to_image(fig) + + def histogram(self, *, number_of_bins: int = 10) -> Image: + """ + Create a histogram for the values in the column. + + Parameters + ---------- + number_of_bins: + The number of bins to use in the histogram. Default is 10. + + Returns + ------- + histogram: + The plot as an image. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("test", [1, 2, 3]) + >>> histogram = column.plot.histogram() + """ + return self._column.to_table().plot.histograms(number_of_bins=number_of_bins) + + def lag_plot(self, lag: int) -> Image: + """ + Create a lag plot for the values in the column. + + Parameters + ---------- + lag: + The amount of lag. + + Returns + ------- + lag_plot: + The plot as an image. + + Raises + ------ + TypeError + If the column is not numeric. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("values", [1, 2, 3, 4]) + >>> image = column.plot.lag_plot(2) + """ + if not self._column.is_numeric: + raise NonNumericColumnError("This time series target contains non-numerical columns.") + + import matplotlib.pyplot as plt - def histogram(self) -> Image: - raise NotImplementedError + fig, ax = plt.subplots() + series = self._column._series + ax.scatter( + x=series.slice(0, len(self._column) - lag), + y=series.slice(lag), + ) + ax.set( + xlabel="y(t)", + ylabel=f"y(t + {lag})", + ) + fig.tight_layout() - def lag_plot(self) -> Image: - raise NotImplementedError + return _figure_to_image(fig) diff --git a/src/safeds/data/tabular/plotting/_experimental_table_plotter.py b/src/safeds/data/tabular/plotting/_experimental_table_plotter.py index 7c8ca79be..16c8e5916 100644 --- a/src/safeds/data/tabular/plotting/_experimental_table_plotter.py +++ b/src/safeds/data/tabular/plotting/_experimental_table_plotter.py @@ -1,7 +1,11 @@ from __future__ import annotations +import warnings from typing import TYPE_CHECKING +from safeds._utils import _figure_to_image +from safeds.exceptions import NonNumericColumnError, UnknownColumnNameError + if TYPE_CHECKING: from safeds.data.image.containers import Image from safeds.data.tabular.containers import ExperimentalTable @@ -9,21 +13,325 @@ class ExperimentalTablePlotter: def __init__(self, table: ExperimentalTable): - self.table: ExperimentalTable = table + self._table: ExperimentalTable = table def box_plots(self) -> Image: - raise NotImplementedError + """ + Plot a boxplot for every numerical column. + + Returns + ------- + plot: + The plot as an image. + + Raises + ------ + NonNumericColumnError + If the table contains only non-numerical columns. 
+
+        Examples
+        --------
+        >>> from safeds.data.tabular.containers import ExperimentalTable
+        >>> table = ExperimentalTable({"a": [1, 2], "b": [3, 42]})
+        >>> image = table.plot.box_plots()
+        """
+        # TODO: implement using matplotlib and polars
+        import matplotlib.pyplot as plt
+        import seaborn as sns
+
+        numerical_table = self._table.remove_non_numeric_columns()
+        if numerical_table.number_of_columns == 0:
+            raise NonNumericColumnError("This table contains only non-numerical columns.")
+        col_wrap = min(numerical_table.number_of_columns, 3)
+
+        data = numerical_table._lazy_frame.melt(value_vars=numerical_table.column_names).collect()
+        grid = sns.FacetGrid(data, col="variable", col_wrap=col_wrap, sharex=False, sharey=False)
+        with warnings.catch_warnings():
+            warnings.filterwarnings(
+                "ignore",
+                message="Using the boxplot function without specifying `order` is likely to produce an incorrect plot.",
+            )
+            grid.map(sns.boxplot, "variable", "value")
+        grid.set_xlabels("")
+        grid.set_ylabels("")
+        grid.set_titles("{col_name}")
+        for axes in grid.axes.flat:
+            axes.set_xticks([])
+        plt.tight_layout()
+        fig = grid.fig
+
+        return _figure_to_image(fig)
 
     def correlation_heatmap(self) -> Image:
-        raise NotImplementedError
+        """
+        Plot a correlation heatmap for all numerical columns of this `Table`.
+
+        Returns
+        -------
+        plot:
+            The plot as an image.
+
+        Examples
+        --------
+        >>> from safeds.data.tabular.containers import ExperimentalTable
+        >>> table = ExperimentalTable({"temperature": [10, 15, 20, 25, 30], "sales": [54, 74, 90, 206, 210]})
+        >>> image = table.plot.correlation_heatmap()
+        """
+        # TODO: implement using matplotlib and polars
+        # https://stackoverflow.com/questions/33282368/plotting-a-2d-heatmap
+        import matplotlib.pyplot as plt
+        import seaborn as sns
+
+        only_numerical = self._table.remove_non_numeric_columns()
+
+        if self._table.number_of_rows == 0:
+            warnings.warn(
+                "An empty table has been used. A correlation heatmap on an empty table will show nothing.",
+                stacklevel=2,
+            )
+
+            with warnings.catch_warnings():
+                warnings.filterwarnings(
+                    "ignore",
+                    message=(
+                        "Attempting to set identical low and high (xlims|ylims) makes transformation singular;"
+                        " automatically expanding."
+                    ),
+                )
+                fig = plt.figure()
+                sns.heatmap(
+                    data=only_numerical._data_frame.corr(),
+                    vmin=-1,
+                    vmax=1,
+                    xticklabels=only_numerical.column_names,
+                    yticklabels=only_numerical.column_names,
+                    cmap="vlag",
+                )
+                plt.tight_layout()
+        else:
+            fig = plt.figure()
+            sns.heatmap(
+                data=only_numerical._data_frame.corr(),
+                vmin=-1,
+                vmax=1,
+                xticklabels=only_numerical.column_names,
+                yticklabels=only_numerical.column_names,
+                cmap="vlag",
+            )
+            plt.tight_layout()
+
+        return _figure_to_image(fig)
 
     def histograms(self, *, number_of_bins: int = 10) -> Image:
-        raise NotImplementedError
+        """
+        Plot a histogram for every column.
+
+        Parameters
+        ----------
+        number_of_bins:
+            The number of bins to use in the histogram. Default is 10.
+
+        Returns
+        -------
+        plot:
+            The plot as an image.
+ + Examples + -------- + >>> from safeds.data.tabular.containers import Table + >>> table = Table({"a": [2, 3, 5, 1], "b": [54, 74, 90, 2014]}) + >>> image = table.plot_histograms() + """ + # TODO: implement using polars + import matplotlib.pyplot as plt + import numpy as np + import pandas as pd + + n_cols = min(3, self._table.number_of_columns) + n_rows = 1 + (self._table.number_of_columns - 1) // n_cols + + if n_cols == 1 and n_rows == 1: + fig, axs = plt.subplots(1, 1, tight_layout=True) + one_col = True + else: + fig, axs = plt.subplots(n_rows, n_cols, tight_layout=True, figsize=(n_cols * 3, n_rows * 3)) + one_col = False + + col_names = self._table.column_names + for col_name, ax in zip(col_names, axs.flatten() if not one_col else [axs], strict=False): + np_col = np.array(self._table.get_column(col_name)) + bins = min(number_of_bins, len(pd.unique(np_col))) + + ax.set_title(col_name) + ax.set_xlabel("") + ax.set_ylabel("") + + if self._table.get_column(col_name).type.is_numeric: + np_col = np_col[~np.isnan(np_col)] + + if bins < len(pd.unique(np_col)): + min_val = np.min(np_col) + max_val = np.max(np_col) + hist, bin_edges = np.histogram(self._table.get_column(col_name), bins, range=(min_val, max_val)) + + bars = np.array([]) + for i in range(len(hist)): + bars = np.append(bars, f"{round(bin_edges[i], 2)}-{round(bin_edges[i + 1], 2)}") + + ax.bar(bars, hist, edgecolor="black") + ax.set_xticks(np.arange(len(hist)), bars, rotation=45, horizontalalignment="right") + continue + + np_col = np_col.astype(str) + unique_values = np.unique(np_col) + hist = np.array([np.sum(np_col == value) for value in unique_values]) + ax.bar(unique_values, hist, edgecolor="black") + ax.set_xticks(np.arange(len(unique_values)), unique_values, rotation=45, horizontalalignment="right") + + for i in range(len(col_names), n_rows * n_cols): + fig.delaxes(axs.flatten()[i]) # Remove empty subplots + + return _figure_to_image(fig) def line_plot(self, x_name: str, y_name: str) -> Image: - 
raise NotImplementedError + """ + Create a line plot for two columns in the table. + + Parameters + ---------- + x_name: + The name of the column to be plotted on the x-axis. + y_name: + The name of the column to be plotted on the y-axis. + + Returns + ------- + line_plot: + The plot as an image. + + Raises + ------ + KeyError + If a column does not exist. + TypeError + If a column is not numeric. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": [2, 3, 4, 5, 6], + ... } + ... ) + >>> image = table.plot.line_plot("a", "b") + """ + # TODO: extract validation + missing_columns = [] + if not self._table.has_column(x_name): + missing_columns.append(x_name) + if not self._table.has_column(y_name): + missing_columns.append(y_name) + if missing_columns: + raise UnknownColumnNameError(missing_columns) + + # TODO: pass list of columns names + if not self._table.get_column(x_name).is_numeric: + raise NonNumericColumnError(x_name) + if not self._table.get_column(y_name).is_numeric: + raise NonNumericColumnError(y_name) + + import matplotlib.pyplot as plt + + fig, ax = plt.subplots() + ax.plot( + self._table.get_column(x_name)._series, + self._table.get_column(y_name)._series, + ) + ax.set( + xlabel=x_name, + ylabel=y_name, + ) + ax.set_xticks(ax.get_xticks()) + ax.set_xticklabels( + ax.get_xticklabels(), + rotation=45, + horizontalalignment="right", + ) # rotate the labels of the x Axis to prevent the chance of overlapping of the labels + fig.tight_layout() + + return _figure_to_image(fig) def scatter_plot(self, x_name: str, y_name: str) -> Image: - raise NotImplementedError + """ + Create a scatter plot for two columns in the table. + + Parameters + ---------- + x_name: + The name of the column to be plotted on the x-axis. + y_name: + The name of the column to be plotted on the y-axis. + + Returns + ------- + scatter_plot: + The plot as an image. 
+ + Raises + ------ + KeyError + If a column does not exist. + TypeError + If a column is not numeric. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": [2, 3, 4, 5, 6], + ... } + ... ) + >>> image = table.plot.scatter_plot("a", "b") + """ + # TODO: merge with line_plot? + # TODO: extract validation + missing_columns = [] + if not self._table.has_column(x_name): + missing_columns.append(x_name) + if not self._table.has_column(y_name): + missing_columns.append(y_name) + if missing_columns: + raise UnknownColumnNameError(missing_columns) + + # TODO: pass list of columns names + if not self._table.get_column(x_name).is_numeric: + raise NonNumericColumnError(x_name) + if not self._table.get_column(y_name).is_numeric: + raise NonNumericColumnError(y_name) + + import matplotlib.pyplot as plt + + fig, ax = plt.subplots() + ax.scatter( + x=self._table.get_column(x_name)._series, + y=self._table.get_column(y_name)._series, + ) + ax.set( + xlabel=x_name, + ylabel=y_name, + ) + ax.set_xticks(ax.get_xticks()) + ax.set_xticklabels( + ax.get_xticklabels(), + rotation=45, + horizontalalignment="right", + ) # rotate the labels of the x Axis to prevent the chance of overlapping of the labels + fig.tight_layout() + + return _figure_to_image(fig) # TODO: equivalent to Column.plot_compare_columns that takes a list of column names (index_plot)? 
diff --git a/src/safeds/data/tabular/transformation/__init__.py b/src/safeds/data/tabular/transformation/__init__.py index 3ae5fc572..098af9adf 100644 --- a/src/safeds/data/tabular/transformation/__init__.py +++ b/src/safeds/data/tabular/transformation/__init__.py @@ -6,6 +6,14 @@ if TYPE_CHECKING: from ._discretizer import Discretizer + from ._experimental_discretizer import ExperimentalDiscretizer + from ._experimental_invertible_table_transformer import ExperimentalInvertibleTableTransformer + from ._experimental_label_encoder import ExperimentalLabelEncoder + from ._experimental_one_hot_encoder import ExperimentalOneHotEncoder + from ._experimental_range_scaler import ExperimentalRangeScaler + from ._experimental_simple_imputer import ExperimentalSimpleImputer + from ._experimental_standard_scaler import ExperimentalStandardScaler + from ._experimental_table_transformer import ExperimentalTableTransformer from ._imputer import Imputer from ._label_encoder import LabelEncoder from ._one_hot_encoder import OneHotEncoder @@ -17,6 +25,14 @@ __name__, { "Discretizer": "._discretizer:Discretizer", + "ExperimentalDiscretizer": "._experimental_discretizer:ExperimentalDiscretizer", + "ExperimentalInvertibleTableTransformer": "._experimental_invertible_table_transformer:ExperimentalInvertibleTableTransformer", + "ExperimentalLabelEncoder": "._experimental_label_encoder:ExperimentalLabelEncoder", + "ExperimentalOneHotEncoder": "._experimental_one_hot_encoder:ExperimentalOneHotEncoder", + "ExperimentalRangeScaler": "._experimental_range_scaler:ExperimentalRangeScaler", + "ExperimentalSimpleImputer": "._experimental_simple_imputer:ExperimentalSimpleImputer", + "ExperimentalStandardScaler": "._experimental_standard_scaler:ExperimentalStandardScaler", + "ExperimentalTableTransformer": "._experimental_table_transformer:ExperimentalTableTransformer", "Imputer": "._imputer:Imputer", "InvertibleTableTransformer": "._table_transformer:InvertibleTableTransformer", "LabelEncoder": 
"._label_encoder:LabelEncoder", @@ -29,6 +45,14 @@ __all__ = [ "Discretizer", + "ExperimentalDiscretizer", + "ExperimentalInvertibleTableTransformer", + "ExperimentalLabelEncoder", + "ExperimentalOneHotEncoder", + "ExperimentalRangeScaler", + "ExperimentalSimpleImputer", + "ExperimentalStandardScaler", + "ExperimentalTableTransformer", "Imputer", "InvertibleTableTransformer", "LabelEncoder", diff --git a/src/safeds/data/tabular/transformation/_experimental_discretizer.py b/src/safeds/data/tabular/transformation/_experimental_discretizer.py new file mode 100644 index 000000000..ea3485831 --- /dev/null +++ b/src/safeds/data/tabular/transformation/_experimental_discretizer.py @@ -0,0 +1,215 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from safeds.data.tabular.containers import ExperimentalTable +from safeds.exceptions import ( + ClosedBound, + NonNumericColumnError, + OutOfBoundsError, + TransformerNotFittedError, + UnknownColumnNameError, +) + +from ._experimental_table_transformer import ExperimentalTableTransformer + +if TYPE_CHECKING: + from sklearn.preprocessing import KBinsDiscretizer as sk_KBinsDiscretizer + + +class ExperimentalDiscretizer(ExperimentalTableTransformer): + """ + The Discretizer bins continuous data into intervals. + + Parameters + ---------- + number_of_bins: + The number of bins to be created. + + Raises + ------ + OutOfBoundsError + If the given number_of_bins is less than 2. + """ + + def __init__(self, number_of_bins: int = 5): + self._column_names: list[str] | None = None + self._wrapped_transformer: sk_KBinsDiscretizer | None = None + + if number_of_bins < 2: + raise OutOfBoundsError(number_of_bins, name="number_of_bins", lower_bound=ClosedBound(2)) + self._number_of_bins = number_of_bins + + def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> ExperimentalDiscretizer: + """ + Learn a transformation for a set of columns in a table. + + This transformer is not modified. 
+ + Parameters + ---------- + table: + The table used to fit the transformer. + column_names: + The list of columns from the table used to fit the transformer. If `None`, all columns are used. + + Returns + ------- + fitted_transformer: + The fitted transformer. + + Raises + ------ + ValueError + If the table is empty. + NonNumericColumnError + If one of the columns, that should be fitted is non-numeric. + UnknownColumnNameError + If one of the columns, that should be fitted is not in the table. + """ + from sklearn.preprocessing import KBinsDiscretizer as sk_KBinsDiscretizer + + if table.number_of_rows == 0: + raise ValueError("The Discretizer cannot be fitted because the table contains 0 rows") + + if column_names is None: + column_names = table.column_names + else: + missing_columns = set(column_names) - set(table.column_names) + if len(missing_columns) > 0: + raise UnknownColumnNameError( + sorted( + missing_columns, + key={val: ix for ix, val in enumerate(column_names)}.__getitem__, + ), + ) + + for column in column_names: + if not table.get_column(column).type.is_numeric: + raise NonNumericColumnError(f"{column} is of type {table.get_column(column).type}.") + + wrapped_transformer = sk_KBinsDiscretizer(n_bins=self._number_of_bins, encode="ordinal") + wrapped_transformer.set_output(transform="polars") + wrapped_transformer.fit( + table.remove_columns_except(column_names)._data_frame, + ) + + result = ExperimentalDiscretizer(self._number_of_bins) + result._wrapped_transformer = wrapped_transformer + result._column_names = column_names + + return result + + def transform(self, table: ExperimentalTable) -> ExperimentalTable: + """ + Apply the learned transformation to a table. + + The table is not modified. + + Parameters + ---------- + table: + The table to which the learned transformation is applied. + + Returns + ------- + transformed_table: + The transformed table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. 
+ ValueError + If the table is empty. + UnknownColumnNameError + If one of the columns, that should be transformed is not in the table. + NonNumericColumnError + If one of the columns, that should be fitted is non-numeric. + """ + # Transformer has not been fitted yet + if self._wrapped_transformer is None or self._column_names is None: + raise TransformerNotFittedError + + if table.number_of_rows == 0: + raise ValueError("The table cannot be transformed because it contains 0 rows") + + # Input table does not contain all columns used to fit the transformer + missing_columns = set(self._column_names) - set(table.column_names) + if len(missing_columns) > 0: + raise UnknownColumnNameError( + sorted( + missing_columns, + key={val: ix for ix, val in enumerate(self._column_names)}.__getitem__, + ), + ) + + for column in self._column_names: + if not table.get_column(column).type.is_numeric: + raise NonNumericColumnError(f"{column} is of type {table.get_column(column).type}.") + + new_data = self._wrapped_transformer.transform( + table.remove_columns_except(self._column_names)._data_frame, + ) + return ExperimentalTable._from_polars_lazy_frame( + table._lazy_frame.update(new_data.lazy()), + ) + + @property + def is_fitted(self) -> bool: + """Whether the transformer is fitted.""" + return self._wrapped_transformer is not None + + def get_names_of_added_columns(self) -> list[str]: + """ + Get the names of all new columns that have been added by the Discretizer. + + Returns + ------- + added_columns: + A list of names of the added columns, ordered as they will appear in the table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + if not self.is_fitted: + raise TransformerNotFittedError + return [] + + def get_names_of_changed_columns(self) -> list[str]: + """ + Get the names of all columns that may have been changed by the Discretizer. 
+ + Returns + ------- + changed_columns: + The list of (potentially) changed column names, as passed to fit. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + if self._column_names is None: + raise TransformerNotFittedError + return self._column_names + + def get_names_of_removed_columns(self) -> list[str]: + """ + Get the names of all columns that have been removed by the Discretizer. + + Returns + ------- + removed_columns: + A list of names of the removed columns, ordered as they appear in the table the Discretizer was fitted on. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + if not self.is_fitted: + raise TransformerNotFittedError + return [] diff --git a/src/safeds/data/tabular/transformation/_experimental_invertible_table_transformer.py b/src/safeds/data/tabular/transformation/_experimental_invertible_table_transformer.py new file mode 100644 index 000000000..9e240c050 --- /dev/null +++ b/src/safeds/data/tabular/transformation/_experimental_invertible_table_transformer.py @@ -0,0 +1,36 @@ +from __future__ import annotations + +from abc import abstractmethod +from typing import TYPE_CHECKING + +from ._experimental_table_transformer import ExperimentalTableTransformer + +if TYPE_CHECKING: + from safeds.data.tabular.containers import ExperimentalTable + + +class ExperimentalInvertibleTableTransformer(ExperimentalTableTransformer): + """A `TableTransformer` that can also undo the learned transformation after it has been applied.""" + + @abstractmethod + def inverse_transform(self, transformed_table: ExperimentalTable) -> ExperimentalTable: + """ + Undo the learned transformation. + + The table is not modified. + + Parameters + ---------- + transformed_table: + The table to be transformed back to the original version. + + Returns + ------- + original_table: + The original table. 
+ + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ diff --git a/src/safeds/data/tabular/transformation/_experimental_label_encoder.py b/src/safeds/data/tabular/transformation/_experimental_label_encoder.py new file mode 100644 index 000000000..a556260aa --- /dev/null +++ b/src/safeds/data/tabular/transformation/_experimental_label_encoder.py @@ -0,0 +1,246 @@ +from __future__ import annotations + +import warnings +from typing import TYPE_CHECKING + +from safeds.data.tabular.containers import ExperimentalTable +from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, UnknownColumnNameError + +from ._experimental_invertible_table_transformer import ExperimentalInvertibleTableTransformer + +if TYPE_CHECKING: + from sklearn.preprocessing import OrdinalEncoder as sk_OrdinalEncoder + + +class ExperimentalLabelEncoder(ExperimentalInvertibleTableTransformer): + """The LabelEncoder encodes one or more given columns into labels.""" + + def __init__(self) -> None: + self._wrapped_transformer: sk_OrdinalEncoder | None = None + self._column_names: list[str] | None = None + + def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> ExperimentalLabelEncoder: + """ + Learn a transformation for a set of columns in a table. + + This transformer is not modified. + + Parameters + ---------- + table: + The table used to fit the transformer. + column_names: + The list of columns from the table used to fit the transformer. If `None`, all columns are used. + + Returns + ------- + fitted_transformer: + The fitted transformer. + + Raises + ------ + UnknownColumnNameError + If column_names contain a column name that is missing in the table. + ValueError + If the table contains 0 rows. 
+        """
+        from sklearn.preprocessing import OrdinalEncoder as sk_OrdinalEncoder
+
+        if column_names is None:
+            column_names = table.column_names
+        else:
+            missing_columns = sorted(set(column_names) - set(table.column_names))
+            if len(missing_columns) > 0:
+                raise UnknownColumnNameError(missing_columns)
+
+        if table.number_of_rows == 0:
+            raise ValueError("The LabelEncoder cannot be fitted because the table contains 0 rows")
+
+        if table.remove_columns_except(column_names).remove_non_numeric_columns().number_of_columns > 0:
+            warnings.warn(
+                "The columns"
+                f" {table.remove_columns_except(column_names).remove_non_numeric_columns().column_names} contain"
+                " numerical data. The LabelEncoder is designed to encode non-numerical values into numerical values",
+                UserWarning,
+                stacklevel=2,
+            )
+
+        # TODO: use polars Enum type instead:
+        #  my_enum = pl.Enum(['A', 'B', 'C']) <-- create this from the given order
+        #  my_data = pl.Series(['A', 'A', 'B'], dtype=my_enum)
+        wrapped_transformer = sk_OrdinalEncoder()
+        wrapped_transformer.set_output(transform="polars")
+        wrapped_transformer.fit(
+            table.remove_columns_except(column_names)._data_frame,
+        )
+
+        result = ExperimentalLabelEncoder()
+        result._wrapped_transformer = wrapped_transformer
+        result._column_names = column_names
+
+        return result
+
+    def transform(self, table: ExperimentalTable) -> ExperimentalTable:
+        """
+        Apply the learned transformation to a table.
+
+        The table is not modified.
+
+        Parameters
+        ----------
+        table:
+            The table to which the learned transformation is applied.
+
+        Returns
+        -------
+        transformed_table:
+            The transformed table.
+
+        Raises
+        ------
+        TransformerNotFittedError
+            If the transformer has not been fitted yet.
+        UnknownColumnNameError
+            If the input table does not contain all columns used to fit the transformer.
+        ValueError
+            If the table contains 0 rows.
+ """ + # Transformer has not been fitted yet + if self._wrapped_transformer is None or self._column_names is None: + raise TransformerNotFittedError + + # Input table does not contain all columns used to fit the transformer + missing_columns = sorted(set(self._column_names) - set(table.column_names)) + if len(missing_columns) > 0: + raise UnknownColumnNameError(missing_columns) + + if table.number_of_rows == 0: + raise ValueError("The LabelEncoder cannot transform the table because it contains 0 rows") + + new_data = self._wrapped_transformer.transform( + table.remove_columns_except(self._column_names)._data_frame, + ) + return ExperimentalTable._from_polars_lazy_frame( + table._lazy_frame.update(new_data.lazy()), + ) + + def inverse_transform(self, transformed_table: ExperimentalTable) -> ExperimentalTable: + """ + Undo the learned transformation. + + The table is not modified. + + Parameters + ---------- + transformed_table: + The table to be transformed back to the original version. + + Returns + ------- + original_table: + The original table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + UnknownColumnNameError + If the input table does not contain all columns used to fit the transformer. + NonNumericColumnError + If the specified columns of the input table contain non-numerical data. + ValueError + If the table contains 0 rows. 
+ """ + # Transformer has not been fitted yet + if self._wrapped_transformer is None or self._column_names is None: + raise TransformerNotFittedError + + missing_columns = sorted(set(self._column_names) - set(transformed_table.column_names)) + if len(missing_columns) > 0: + raise UnknownColumnNameError(missing_columns) + + if transformed_table.number_of_rows == 0: + raise ValueError("The LabelEncoder cannot inverse transform the table because it contains 0 rows") + + if transformed_table.remove_columns_except( + self._column_names, + ).remove_non_numeric_columns().number_of_columns < len(self._column_names): + raise NonNumericColumnError( + str( + sorted( + set(self._column_names) + - set( + transformed_table.remove_columns_except(self._column_names) + .remove_non_numeric_columns() + .column_names, + ), + ), + ), + ) + + new_data = self._wrapped_transformer.inverse_transform( + transformed_table.remove_columns_except(self._column_names)._data_frame, + ) + return ExperimentalTable._from_polars_lazy_frame( + transformed_table._lazy_frame.update(new_data.lazy()), + ) + + @property + def is_fitted(self) -> bool: + """Whether the transformer is fitted.""" + return self._wrapped_transformer is not None + + def get_names_of_added_columns(self) -> list[str]: + """ + Get the names of all new columns that have been added by the LabelEncoder. + + Returns + ------- + added_columns: + A list of names of the added columns, ordered as they will appear in the table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + if not self.is_fitted: + raise TransformerNotFittedError + return [] + + # (Must implement abstract method, cannot instantiate class otherwise.) + def get_names_of_changed_columns(self) -> list[str]: + """ + Get the names of all columns that may have been changed by the LabelEncoder. + + Returns + ------- + changed_columns: + The list of (potentially) changed column names, as passed to fit. 
+ + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + if self._column_names is None: + raise TransformerNotFittedError + return self._column_names + + def get_names_of_removed_columns(self) -> list[str]: + """ + Get the names of all columns that have been removed by the LabelEncoder. + + Returns + ------- + removed_columns: + A list of names of the removed columns, ordered as they appear in the table the LabelEncoder was fitted on. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + if not self.is_fitted: + raise TransformerNotFittedError + return [] diff --git a/src/safeds/data/tabular/transformation/_experimental_one_hot_encoder.py b/src/safeds/data/tabular/transformation/_experimental_one_hot_encoder.py new file mode 100644 index 000000000..a11cf5798 --- /dev/null +++ b/src/safeds/data/tabular/transformation/_experimental_one_hot_encoder.py @@ -0,0 +1,381 @@ +from __future__ import annotations + +import warnings +from collections import Counter +from typing import Any + +from safeds.data.tabular.containers import ExperimentalColumn, ExperimentalTable +from safeds.exceptions import ( + NonNumericColumnError, + TransformerNotFittedError, + UnknownColumnNameError, + ValueNotPresentWhenFittedError, +) + +from ._experimental_invertible_table_transformer import ExperimentalInvertibleTableTransformer + + +class ExperimentalOneHotEncoder(ExperimentalInvertibleTableTransformer): + """ + A way to deal with categorical features that is particularly useful for unordered (i.e. nominal) data. + + It replaces a column with a set of columns, each representing a unique value in the original column. The value of + each new column is 1 if the original column had that value, and 0 otherwise. 
Take the following table as an + example: + + | col1 | + |------| + | "a" | + | "b" | + | "c" | + | "a" | + + The one-hot encoding of this table is: + + | col1__a | col1__b | col1__c | + |---------|---------|---------| + | 1 | 0 | 0 | + | 0 | 1 | 0 | + | 0 | 0 | 1 | + | 1 | 0 | 0 | + + The name "one-hot" comes from the fact that each row has exactly one 1 in it, and the rest of the values are 0s. + One-hot encoding is closely related to dummy variable / indicator variables, which are used in statistics. + + Examples + -------- + >>> from safeds.data.tabular.containers import Table + >>> from safeds.data.tabular.transformation import OneHotEncoder + >>> table = Table({"col1": ["a", "b", "c", "a"]}) + >>> transformer = OneHotEncoder() + >>> transformer.fit_and_transform(table, ["col1"])[1] + col1__a col1__b col1__c + 0 1.0 0.0 0.0 + 1 0.0 1.0 0.0 + 2 0.0 0.0 1.0 + 3 1.0 0.0 0.0 + """ + + def __init__(self) -> None: + # Maps each old column to (list of) new columns created from it: + self._column_names: dict[str, list[str]] | None = None + # Maps concrete values (tuples of old column and value) to corresponding new column names: + self._value_to_column: dict[tuple[str, Any], str] | None = None + # Maps nan values (str of old column) to corresponding new column name + self._value_to_column_nans: dict[str, str] | None = None + + def __hash__(self) -> int: + return super().__hash__() + + def __eq__(self, other: object) -> bool: + if not isinstance(other, ExperimentalOneHotEncoder): + return NotImplemented + return ( + self._column_names == other._column_names + and self._value_to_column == other._value_to_column + and self._value_to_column_nans == other._value_to_column_nans + ) + + def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> ExperimentalOneHotEncoder: + """ + Learn a transformation for a set of columns in a table. + + This transformer is not modified. + + Parameters + ---------- + table: + The table used to fit the transformer. 
+ column_names: + The list of columns from the table used to fit the transformer. If `None`, all columns are used. + + Returns + ------- + fitted_transformer: + The fitted transformer. + + Raises + ------ + UnknownColumnNameError + If column_names contain a column name that is missing in the table. + ValueError + If the table contains 0 rows. + """ + import numpy as np + + if column_names is None: + column_names = table.column_names + else: + missing_columns = sorted(set(column_names) - set(table.column_names)) + if len(missing_columns) > 0: + raise UnknownColumnNameError(missing_columns) + + if table.number_of_rows == 0: + raise ValueError("The OneHotEncoder cannot be fitted because the table contains 0 rows") + + if table.remove_columns_except(column_names).remove_non_numeric_columns().number_of_columns > 0: + warnings.warn( + "The columns" + f" {table.remove_columns_except(column_names).remove_non_numeric_columns().column_names} contain" + " numerical data. The OneHotEncoder is designed to encode non-numerical values into numerical values", + UserWarning, + stacklevel=2, + ) + + result = ExperimentalOneHotEncoder() + + result._column_names = {} + result._value_to_column = {} + result._value_to_column_nans = {} + + # Keep track of number of occurrences of column names; + # initially all old column names appear exactly once: + name_counter = Counter(table.column_names) + + # Iterate through all columns to-be-changed: + for column in column_names: + result._column_names[column] = [] + for element in table.get_column(column).get_distinct_values(): + base_name = f"{column}__{element}" + name_counter[base_name] += 1 + new_column_name = base_name + # Check if newly created name matches some other existing column name: + if name_counter[base_name] > 1: + new_column_name += f"#{name_counter[base_name]}" + # Update dictionary entries: + result._column_names[column] += [new_column_name] + if isinstance(element, float) and np.isnan(element): + 
result._value_to_column_nans[column] = new_column_name
+                else:
+                    result._value_to_column[(column, element)] = new_column_name
+
+        return result
+
+    def transform(self, table: ExperimentalTable) -> ExperimentalTable:
+        """
+        Apply the learned transformation to a table.
+
+        The table is not modified.
+
+        Parameters
+        ----------
+        table:
+            The table to which the learned transformation is applied.
+
+        Returns
+        -------
+        transformed_table:
+            The transformed table.
+
+        Raises
+        ------
+        TransformerNotFittedError
+            If the transformer has not been fitted yet.
+        UnknownColumnNameError
+            If the input table does not contain all columns used to fit the transformer.
+        ValueError
+            If the table contains 0 rows.
+        ValueNotPresentWhenFittedError
+            If a column in the to-be-transformed table contains a new value that was not already present in the table the OneHotEncoder was fitted on.
+        """
+        import numpy as np
+
+        # Transformer has not been fitted yet
+        if self._column_names is None or self._value_to_column is None or self._value_to_column_nans is None:
+            raise TransformerNotFittedError
+
+        # Input table does not contain all columns used to fit the transformer
+        missing_columns = sorted(set(self._column_names.keys()) - set(table.column_names))
+        if len(missing_columns) > 0:
+            raise UnknownColumnNameError(missing_columns)
+
+        if table.number_of_rows == 0:
+            raise ValueError("The OneHotEncoder cannot transform the table because it contains 0 rows")
+
+        encoded_values = {}
+        for new_column_name in self._value_to_column.values():
+            encoded_values[new_column_name] = [0.0 for _ in range(table.number_of_rows)]
+        for new_column_name in self._value_to_column_nans.values():
+            encoded_values[new_column_name] = [0.0 for _ in range(table.number_of_rows)]
+
+        values_not_present_when_fitted = []
+        for old_column_name in self._column_names:
+            for i in range(table.number_of_rows):
+                value = table.get_column(old_column_name).get_value(i)
+                try:
+                    if isinstance(value, float) and np.isnan(value):
+                        
new_column_name = self._value_to_column_nans[old_column_name] + else: + new_column_name = self._value_to_column[(old_column_name, value)] + encoded_values[new_column_name][i] = 1.0 + except KeyError: + # This happens when a column in the to-be-transformed table contains a new value that was not + # already present in the table the OneHotEncoder was fitted on. + values_not_present_when_fitted.append((value, old_column_name)) + + for new_column in self._column_names[old_column_name]: + table = table.add_columns([ExperimentalColumn(new_column, encoded_values[new_column])]) + + if len(values_not_present_when_fitted) > 0: + raise ValueNotPresentWhenFittedError(values_not_present_when_fitted) + + # New columns may not be sorted: + column_names = [] + for name in table.column_names: + if name not in self._column_names: + column_names.append(name) + else: + column_names.extend( + [f_name for f_name in self._value_to_column.values() if f_name.startswith(name)] + + [f_name for f_name in self._value_to_column_nans.values() if f_name.startswith(name)], + ) + + # Drop old, non-encoded columns: + # (Don't do this earlier - we need the old column nams for sorting, + # plus we need to prevent the table from possibly having 0 columns temporarily.) + return table.remove_columns(list(self._column_names.keys())) + + def inverse_transform(self, transformed_table: ExperimentalTable) -> ExperimentalTable: + """ + Undo the learned transformation. + + The table is not modified. + + Parameters + ---------- + transformed_table: + The table to be transformed back to the original version. + + Returns + ------- + table: + The original table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + UnknownColumnNameError + If the input table does not contain all columns used to fit the transformer. + NonNumericColumnError + If the transformed columns of the input table contain non-numerical data. + ValueError + If the table contains 0 rows. 
+ """ + # Transformer has not been fitted yet + if self._column_names is None or self._value_to_column is None or self._value_to_column_nans is None: + raise TransformerNotFittedError + + _transformed_column_names = [item for sublist in self._column_names.values() for item in sublist] + + missing_columns = sorted(set(_transformed_column_names) - set(transformed_table.column_names)) + if len(missing_columns) > 0: + raise UnknownColumnNameError(missing_columns) + + if transformed_table.number_of_rows == 0: + raise ValueError("The OneHotEncoder cannot inverse transform the table because it contains 0 rows") + + if transformed_table.remove_columns_except( + _transformed_column_names, + ).remove_non_numeric_columns().number_of_columns < len(_transformed_column_names): + raise NonNumericColumnError( + str( + sorted( + set(_transformed_column_names) + - set( + transformed_table.remove_columns_except(_transformed_column_names) + .remove_non_numeric_columns() + .column_names, + ), + ), + ), + ) + + original_columns = {} + for original_column_name in self._column_names: + original_columns[original_column_name] = [None for _ in range(transformed_table.number_of_rows)] + + for original_column_name, value in self._value_to_column: + constructed_column = self._value_to_column[(original_column_name, value)] + for i in range(transformed_table.number_of_rows): + if transformed_table.get_column(constructed_column)[i] == 1.0: + original_columns[original_column_name][i] = value + + for original_column_name in self._value_to_column_nans: + constructed_column = self._value_to_column_nans[original_column_name] + for i in range(transformed_table.number_of_rows): + if transformed_table.get_column(constructed_column)[i] == 1.0: + original_columns[original_column_name][i] = None + + table = transformed_table + + for column_name, encoded_column in original_columns.items(): + table = table.add_columns(ExperimentalColumn(column_name, encoded_column)) + + # Drop old column names: + table = 
table.remove_columns(list(self._value_to_column.values())) + return table.remove_columns(list(self._value_to_column_nans.values())) + + @property + def is_fitted(self) -> bool: + """Whether the transformer is fitted.""" + return ( + self._column_names is not None + and self._value_to_column is not None + and self._value_to_column_nans is not None + ) + + def get_names_of_added_columns(self) -> list[str]: + """ + Get the names of all new columns that have been added by the OneHotEncoder. + + Returns + ------- + added_columns: + A list of names of the added columns, ordered as they will appear in the table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + if self._column_names is None: + raise TransformerNotFittedError + return [name for column_names in self._column_names.values() for name in column_names] + + # (Must implement abstract method, cannot instantiate class otherwise.) + def get_names_of_changed_columns(self) -> list[str]: + """ + Get the names of all columns that have been changed by the OneHotEncoder (none). + + Returns + ------- + changed_columns: + The empty list. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + if not self.is_fitted: + raise TransformerNotFittedError + return [] + + def get_names_of_removed_columns(self) -> list[str]: + """ + Get the names of all columns that have been removed by the OneHotEncoder. + + Returns + ------- + removed_columns: + A list of names of the removed columns, ordered as they appear in the table the OneHotEncoder was fitted on. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. 
+ """ + if self._column_names is None: + raise TransformerNotFittedError + return list(self._column_names.keys()) diff --git a/src/safeds/data/tabular/transformation/_experimental_range_scaler.py b/src/safeds/data/tabular/transformation/_experimental_range_scaler.py new file mode 100644 index 000000000..7d708b721 --- /dev/null +++ b/src/safeds/data/tabular/transformation/_experimental_range_scaler.py @@ -0,0 +1,295 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from safeds.data.tabular.containers import ExperimentalTable +from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, UnknownColumnNameError + +from ._experimental_invertible_table_transformer import ExperimentalInvertibleTableTransformer + +if TYPE_CHECKING: + from sklearn.preprocessing import MinMaxScaler as sk_MinMaxScaler + + +class ExperimentalRangeScaler(ExperimentalInvertibleTableTransformer): + """ + The RangeScaler transforms column values by scaling each value to a given range. + + Parameters + ---------- + min_: + The minimum of the new range after the transformation + max_: + The maximum of the new range after the transformation + + Raises + ------ + ValueError + If the given minimum is greater or equal to the given maximum + """ + + def __init__(self, min_: float = 0.0, max_: float = 1.0): + self._column_names: list[str] | None = None + self._wrapped_transformer: sk_MinMaxScaler | None = None + if min_ >= max_: + raise ValueError('Parameter "maximum" must be higher than parameter "minimum".') + self._minimum = min_ + self._maximum = max_ + + def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> ExperimentalRangeScaler: + """ + Learn a transformation for a set of columns in a table. + + This transformer is not modified. + + Parameters + ---------- + table: + The table used to fit the transformer. + column_names: + The list of columns from the table used to fit the transformer. If `None`, all columns are used. 
+ + Returns + ------- + fitted_transformer: + The fitted transformer. + + Raises + ------ + UnknownColumnNameError + If column_names contain a column name that is missing in the table. + NonNumericColumnError + If at least one of the specified columns in the table contains non-numerical data. + ValueError + If the table contains 0 rows. + """ + from sklearn.preprocessing import MinMaxScaler as sk_MinMaxScaler + + if column_names is None: + column_names = table.column_names + else: + missing_columns = sorted(set(column_names) - set(table.column_names)) + if len(missing_columns) > 0: + raise UnknownColumnNameError(missing_columns) + + if table.number_of_rows == 0: + raise ValueError("The RangeScaler cannot be fitted because the table contains 0 rows") + + if ( + table.remove_columns_except(column_names).remove_non_numeric_columns().number_of_columns + < table.remove_columns_except(column_names).number_of_columns + ): + raise NonNumericColumnError( + str( + sorted( + set(table.remove_columns_except(column_names).column_names) + - set( + table.remove_columns_except(column_names).remove_non_numeric_columns().column_names, + ), + ), + ), + ) + + wrapped_transformer = sk_MinMaxScaler((self._minimum, self._maximum)) + wrapped_transformer.set_output(transform="polars") + wrapped_transformer.fit( + table.remove_columns_except(column_names)._data_frame, + ) + + result = ExperimentalRangeScaler() + result._wrapped_transformer = wrapped_transformer + result._column_names = column_names + + return result + + def transform(self, table: ExperimentalTable) -> ExperimentalTable: + """ + Apply the learned transformation to a table. + + The table is not modified. + + Parameters + ---------- + table: + The table to which the learned transformation is applied. + + Returns + ------- + transformed_table: + The transformed table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. 
+ UnknownColumnNameError + If the input table does not contain all columns used to fit the transformer. + NonNumericColumnError + If at least one of the columns in the input table that is used to fit contains non-numerical data. + ValueError + If the table contains 0 rows. + """ + # Transformer has not been fitted yet + if self._wrapped_transformer is None or self._column_names is None: + raise TransformerNotFittedError + + # Input table does not contain all columns used to fit the transformer + missing_columns = sorted(set(self._column_names) - set(table.column_names)) + if len(missing_columns) > 0: + raise UnknownColumnNameError(missing_columns) + + if table.number_of_rows == 0: + raise ValueError("The RangeScaler cannot transform the table because it contains 0 rows") + + if ( + table.remove_columns_except(self._column_names).remove_non_numeric_columns().number_of_columns + < table.remove_columns_except(self._column_names).number_of_columns + ): + raise NonNumericColumnError( + str( + sorted( + set(table.remove_columns_except(self._column_names).column_names) + - set( + table.remove_columns_except(self._column_names).remove_non_numeric_columns().column_names, + ), + ), + ), + ) + + new_data = self._wrapped_transformer.transform( + table.remove_columns_except(self._column_names)._data_frame, + ) + return ExperimentalTable._from_polars_lazy_frame( + table._lazy_frame.update(new_data.lazy()), + ) + + def inverse_transform(self, transformed_table: ExperimentalTable) -> ExperimentalTable: + """ + Undo the learned transformation. + + The table is not modified. + + Parameters + ---------- + transformed_table: + The table to be transformed back to the original version. + + Returns + ------- + original_table: + The original table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + UnknownColumnNameError + If the input table does not contain all columns used to fit the transformer. 
+ NonNumericColumnError + If the transformed columns of the input table contain non-numerical data. + ValueError + If the table contains 0 rows. + """ + # Transformer has not been fitted yet + if self._wrapped_transformer is None or self._column_names is None: + raise TransformerNotFittedError + + missing_columns = sorted(set(self._column_names) - set(transformed_table.column_names)) + if len(missing_columns) > 0: + raise UnknownColumnNameError(missing_columns) + + if transformed_table.number_of_rows == 0: + raise ValueError("The RangeScaler cannot transform the table because it contains 0 rows") + + if ( + transformed_table.remove_columns_except(self._column_names).remove_non_numeric_columns().number_of_columns + < transformed_table.remove_columns_except(self._column_names).number_of_columns + ): + raise NonNumericColumnError( + str( + sorted( + set(transformed_table.remove_columns_except(self._column_names).column_names) + - set( + transformed_table.remove_columns_except(self._column_names) + .remove_non_numeric_columns() + .column_names, + ), + ), + ), + ) + + import polars as pl + + new_data = pl.DataFrame( + self._wrapped_transformer.inverse_transform( + transformed_table.remove_columns_except(self._column_names)._data_frame, + ) + ) + + name_mapping = dict(zip(new_data.columns, self._column_names, strict=True)) + + new_data = new_data.rename(name_mapping) + + return ExperimentalTable._from_polars_data_frame( + transformed_table._data_frame.update(new_data), + ) + + @property + def is_fitted(self) -> bool: + """Whether the transformer is fitted.""" + return self._wrapped_transformer is not None + + def get_names_of_added_columns(self) -> list[str]: + """ + Get the names of all new columns that have been added by the RangeScaler. + + Returns + ------- + added_columns: + A list of names of the added columns, ordered as they will appear in the table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. 
+ """ + if not self.is_fitted: + raise TransformerNotFittedError + return [] + + # (Must implement abstract method, cannot instantiate class otherwise.) + def get_names_of_changed_columns(self) -> list[str]: + """ + Get the names of all columns that may have been changed by the RangeScaler. + + Returns + ------- + changed_columns: + The list of (potentially) changed column names, as passed to fit. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + if self._column_names is None: + raise TransformerNotFittedError + return self._column_names + + def get_names_of_removed_columns(self) -> list[str]: + """ + Get the names of all columns that have been removed by the RangeScaler. + + Returns + ------- + removed_columns: + A list of names of the removed columns, ordered as they appear in the table the RangeScaler was fitted on. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + if not self.is_fitted: + raise TransformerNotFittedError + return [] diff --git a/src/safeds/data/tabular/transformation/_experimental_simple_imputer.py b/src/safeds/data/tabular/transformation/_experimental_simple_imputer.py new file mode 100644 index 000000000..d75c87acf --- /dev/null +++ b/src/safeds/data/tabular/transformation/_experimental_simple_imputer.py @@ -0,0 +1,386 @@ +from __future__ import annotations + +import sys +import warnings +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Any + +import pandas as pd + +from safeds._utils import _structural_hash +from safeds.data.tabular.containers import ExperimentalTable +from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, UnknownColumnNameError + +from ._experimental_table_transformer import ExperimentalTableTransformer + +if TYPE_CHECKING: + from sklearn.impute import SimpleImputer as sk_SimpleImputer + + +class ExperimentalSimpleImputer(ExperimentalTableTransformer): + """ + Replace missing 
values using the given strategy. + + Parameters + ---------- + strategy: + How to replace missing values. + value_to_replace: + The value that should be replaced. + + Examples + -------- + >>> from safeds.data.tabular.containers import Column, Table + >>> from safeds.data.tabular.transformation import Imputer + >>> + >>> table = Table.from_columns( + ... [ + ... Column("a", [1, 3, None]), + ... Column("b", [None, 2, 3]), + ... ], + ... ) + >>> transformer = Imputer(Imputer.Strategy.Constant(0)) + >>> transformed_table = transformer.fit_and_transform(table) + """ + + class Strategy(ABC): + """Various strategies to replace missing values. Use the static methods to create instances of this class.""" + + @abstractmethod + def __eq__(self, other: object) -> bool: + pass # pragma: no cover + + @abstractmethod + def __hash__(self) -> int: + pass # pragma: no cover + + @abstractmethod + def _apply(self, imputer: sk_SimpleImputer) -> None: + """ + Set the imputer strategy of the given imputer. + + Parameters + ---------- + imputer: + The imputer to augment. + """ + + @staticmethod + def Constant(value: Any) -> ExperimentalSimpleImputer.Strategy: # noqa: N802 + """ + Replace missing values with the given constant value. + + Parameters + ---------- + value: + The value to replace missing values. 
+ """ + return _Constant(value) # pragma: no cover + + @staticmethod + def Mean() -> ExperimentalSimpleImputer.Strategy: # noqa: N802 + """Replace missing values with the mean of each column.""" + return _Mean() # pragma: no cover + + @staticmethod + def Median() -> ExperimentalSimpleImputer.Strategy: # noqa: N802 + """Replace missing values with the median of each column.""" + return _Median() # pragma: no cover + + @staticmethod + def Mode() -> ExperimentalSimpleImputer.Strategy: # noqa: N802 + """Replace missing values with the mode of each column.""" + return _Mode() # pragma: no cover + + def __init__(self, strategy: ExperimentalSimpleImputer.Strategy, *, value_to_replace: float | str | None = None): + if value_to_replace is None: + value_to_replace = pd.NA + + self._strategy = strategy + self._value_to_replace = value_to_replace + + self._wrapped_transformer: sk_SimpleImputer | None = None + self._column_names: list[str] | None = None + + @property + def strategy(self) -> ExperimentalSimpleImputer.Strategy: + """The strategy used to replace missing values.""" + return self._strategy + + @property + def value_to_replace(self) -> Any: + """The value that should be replaced.""" + return self._value_to_replace + + @property + def is_fitted(self) -> bool: + """Whether the transformer is fitted.""" + return self._wrapped_transformer is not None + + def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> ExperimentalSimpleImputer: + """ + Learn a transformation for a set of columns in a table. + + This transformer is not modified. + + Parameters + ---------- + table: + The table used to fit the transformer. + column_names: + The list of columns from the table used to fit the transformer. If `None`, all columns are used. + + Returns + ------- + fitted_transformer: + The fitted transformer. 
+ + Raises + ------ + UnknownColumnNameError + If column_names contain a column name that is missing in the table + ValueError + If the table contains 0 rows + NonNumericColumnError + If the strategy is set to either Mean or Median and the specified columns of the table contain non-numerical data. + """ + from sklearn.impute import SimpleImputer as sk_SimpleImputer + + if column_names is None: + column_names = table.column_names + else: + missing_columns = sorted(set(column_names) - set(table.column_names)) + if len(missing_columns) > 0: + raise UnknownColumnNameError(missing_columns) + + if table.number_of_rows == 0: + raise ValueError("The Imputer cannot be fitted because the table contains 0 rows") + + if (isinstance(self._strategy, _Mean | _Median)) and table.remove_columns_except( + column_names, + ).remove_non_numeric_columns().number_of_columns < len( + column_names, + ): + raise NonNumericColumnError( + str( + sorted( + set(table.remove_columns_except(column_names).column_names) + - set( + table.remove_columns_except(column_names).remove_non_numeric_columns().column_names, + ), + ), + ), + ) + + if isinstance(self._strategy, _Mode): + multiple_most_frequent = {} + for name in column_names: + if len(table.get_column(name).mode()) > 1: + multiple_most_frequent[name] = table.get_column(name).mode() + if len(multiple_most_frequent) > 0: + warnings.warn( + "There are multiple most frequent values in a column given to the Imputer.\nThe lowest values" + " are being chosen in this cases. 
The following columns have multiple most frequent" + f" values:\n{multiple_most_frequent}", + UserWarning, + stacklevel=2, + ) + + wrapped_transformer = sk_SimpleImputer(missing_values=self._value_to_replace) + self._strategy._apply(wrapped_transformer) + wrapped_transformer.set_output(transform="polars") + wrapped_transformer.fit( + table.remove_columns_except(column_names)._data_frame, + ) + + result = ExperimentalSimpleImputer(self._strategy) + result._wrapped_transformer = wrapped_transformer + result._column_names = column_names + + return result + + def transform(self, table: ExperimentalTable) -> ExperimentalTable: + """ + Apply the learned transformation to a table. + + The table is not modified. + + Parameters + ---------- + table: + The table to which the learned transformation is applied. + + Returns + ------- + transformed_table: + The transformed table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + UnknownColumnNameError + If the input table does not contain all columns used to fit the transformer. + ValueError + If the table contains 0 rows. + """ + # Transformer has not been fitted yet + if self._wrapped_transformer is None or self._column_names is None: + raise TransformerNotFittedError + + # Input table does not contain all columns used to fit the transformer + missing_columns = sorted(set(self._column_names) - set(table.column_names)) + if len(missing_columns) > 0: + raise UnknownColumnNameError(missing_columns) + + if table.number_of_rows == 0: + raise ValueError("The Imputer cannot transform the table because it contains 0 rows") + + new_data = self._wrapped_transformer.transform(table.remove_columns_except(self._column_names)._data_frame) + return ExperimentalTable._from_polars_lazy_frame( + table._lazy_frame.update(new_data.lazy()), + ) + + def get_names_of_added_columns(self) -> list[str]: + """ + Get the names of all new columns that have been added by the Imputer. 
+ + Returns + ------- + added_columns: + A list of names of the added columns, ordered as they will appear in the table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + if not self.is_fitted: + raise TransformerNotFittedError + return [] + + def get_names_of_changed_columns(self) -> list[str]: + """ + Get the names of all columns that may have been changed by the Imputer. + + Returns + ------- + changed_columns: + The list of (potentially) changed column names, as passed to fit. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + if self._column_names is None: + raise TransformerNotFittedError + return self._column_names + + def get_names_of_removed_columns(self) -> list[str]: + """ + Get the names of all columns that have been removed by the Imputer. + + Returns + ------- + removed_columns: + A list of names of the removed columns, ordered as they appear in the table the Imputer was fitted on. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. 
+ """ + if not self.is_fitted: + raise TransformerNotFittedError + return [] + + +# ---------------------------------------------------------------------------------------------------------------------- +# Imputation strategies +# ---------------------------------------------------------------------------------------------------------------------- + + +class _Constant(ExperimentalSimpleImputer.Strategy): + def __init__(self, value: Any): + self._value = value + + @property + def value(self) -> Any: + return self._value + + def __eq__(self, other: object) -> bool: + if not isinstance(other, _Constant): + return NotImplemented + if self is other: + return True + return self._value == other._value + + def __hash__(self) -> int: + return _structural_hash(str(self)) + + def __sizeof__(self) -> int: + return sys.getsizeof(self._value) + + def __str__(self) -> str: + return f"Constant({self._value})" + + def _apply(self, imputer: sk_SimpleImputer) -> None: + imputer.strategy = "constant" + imputer.fill_value = self._value + + +class _Mean(ExperimentalSimpleImputer.Strategy): + def __eq__(self, other: object) -> bool: + if not isinstance(other, _Mean): + return NotImplemented + return True + + def __hash__(self) -> int: + return _structural_hash(str(self)) + + def __str__(self) -> str: + return "Mean" + + def _apply(self, imputer: sk_SimpleImputer) -> None: + imputer.strategy = "mean" + + +class _Median(ExperimentalSimpleImputer.Strategy): + def __eq__(self, other: object) -> bool: + if not isinstance(other, _Median): + return NotImplemented + return True + + def __hash__(self) -> int: + return _structural_hash(str(self)) + + def __str__(self) -> str: + return "Median" + + def _apply(self, imputer: sk_SimpleImputer) -> None: + imputer.strategy = "median" + + +class _Mode(ExperimentalSimpleImputer.Strategy): + def __eq__(self, other: object) -> bool: + if not isinstance(other, _Mode): + return NotImplemented + return True + + def __hash__(self) -> int: + return 
_structural_hash(str(self)) + + def __str__(self) -> str: + return "Mode" + + def _apply(self, imputer: sk_SimpleImputer) -> None: + imputer.strategy = "most_frequent" + + +# Override the methods with classes, so they can be used in `isinstance` calls. Unlike methods, classes define a type. +# This is needed for the DSL, where imputer strategies are variants of an enum. +ExperimentalSimpleImputer.Strategy.Constant = _Constant # type: ignore[method-assign] +ExperimentalSimpleImputer.Strategy.Mean = _Mean # type: ignore[method-assign] +ExperimentalSimpleImputer.Strategy.Median = _Median # type: ignore[method-assign] +ExperimentalSimpleImputer.Strategy.Mode = _Mode # type: ignore[method-assign] diff --git a/src/safeds/data/tabular/transformation/_experimental_standard_scaler.py b/src/safeds/data/tabular/transformation/_experimental_standard_scaler.py new file mode 100644 index 000000000..c3176eb81 --- /dev/null +++ b/src/safeds/data/tabular/transformation/_experimental_standard_scaler.py @@ -0,0 +1,268 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from safeds.data.tabular.containers import ExperimentalTable +from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, UnknownColumnNameError + +from ._experimental_invertible_table_transformer import ExperimentalInvertibleTableTransformer + +if TYPE_CHECKING: + from sklearn.preprocessing import StandardScaler as sk_StandardScaler + + +class ExperimentalStandardScaler(ExperimentalInvertibleTableTransformer): + """The StandardScaler transforms column values to a range by removing the mean and scaling to unit variance.""" + + def __init__(self) -> None: + self._column_names: list[str] | None = None + self._wrapped_transformer: sk_StandardScaler | None = None + + def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> ExperimentalStandardScaler: + """ + Learn a transformation for a set of columns in a table. + + This transformer is not modified. 
+ + Parameters + ---------- + table: + The table used to fit the transformer. + column_names: + The list of columns from the table used to fit the transformer. If `None`, all columns are used. + + Returns + ------- + fitted_transformer: + The fitted transformer. + + Raises + ------ + UnknownColumnNameError + If column_names contain a column name that is missing in the table. + NonNumericColumnError + If at least one of the specified columns in the table contains non-numerical data. + ValueError + If the table contains 0 rows. + """ + from sklearn.preprocessing import StandardScaler as sk_StandardScaler + + if column_names is None: + column_names = table.column_names + else: + missing_columns = sorted(set(column_names) - set(table.column_names)) + if len(missing_columns) > 0: + raise UnknownColumnNameError(missing_columns) + + if table.number_of_rows == 0: + raise ValueError("The StandardScaler cannot be fitted because the table contains 0 rows") + + if ( + table.remove_columns_except(column_names).remove_non_numeric_columns().number_of_columns + < table.remove_columns_except(column_names).number_of_columns + ): + raise NonNumericColumnError( + str( + sorted( + set(table.remove_columns_except(column_names).column_names) + - set( + table.remove_columns_except(column_names).remove_non_numeric_columns().column_names, + ), + ), + ), + ) + + wrapped_transformer = sk_StandardScaler() + wrapped_transformer.set_output(transform="polars") + wrapped_transformer.fit( + table.remove_columns_except(column_names)._data_frame, + ) + + result = ExperimentalStandardScaler() + result._wrapped_transformer = wrapped_transformer + result._column_names = column_names + + return result + + def transform(self, table: ExperimentalTable) -> ExperimentalTable: + """ + Apply the learned transformation to a table. + + The table is not modified. + + Parameters + ---------- + table: + The table to which the learned transformation is applied. 
+ + Returns + ------- + transformed_table: + The transformed table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + UnknownColumnNameError + If the input table does not contain all columns used to fit the transformer. + NonNumericColumnError + If at least one of the columns in the input table that is used to fit contains non-numerical data. + ValueError + If the table contains 0 rows. + """ + # Transformer has not been fitted yet + if self._wrapped_transformer is None or self._column_names is None: + raise TransformerNotFittedError + + # Input table does not contain all columns used to fit the transformer + missing_columns = sorted(set(self._column_names) - set(table.column_names)) + if len(missing_columns) > 0: + raise UnknownColumnNameError(missing_columns) + + if table.number_of_rows == 0: + raise ValueError("The StandardScaler cannot transform the table because it contains 0 rows") + + if ( + table.remove_columns_except(self._column_names).remove_non_numeric_columns().number_of_columns + < table.remove_columns_except(self._column_names).number_of_columns + ): + raise NonNumericColumnError( + str( + sorted( + set(table.remove_columns_except(self._column_names).column_names) + - set( + table.remove_columns_except(self._column_names).remove_non_numeric_columns().column_names, + ), + ), + ), + ) + + new_data = self._wrapped_transformer.transform( + table.remove_columns_except(self._column_names)._data_frame, + ) + return ExperimentalTable._from_polars_lazy_frame( + table._lazy_frame.update(new_data.lazy()), + ) + + def inverse_transform(self, transformed_table: ExperimentalTable) -> ExperimentalTable: + """ + Undo the learned transformation. + + The table is not modified. + + Parameters + ---------- + transformed_table: + The table to be transformed back to the original version. + + Returns + ------- + original_table: + The original table. 
+ + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + UnknownColumnNameError + If the input table does not contain all columns used to fit the transformer. + NonNumericColumnError + If the transformed columns of the input table contain non-numerical data. + ValueError + If the table contains 0 rows. + """ + # Transformer has not been fitted yet + if self._wrapped_transformer is None or self._column_names is None: + raise TransformerNotFittedError + + missing_columns = sorted(set(self._column_names) - set(transformed_table.column_names)) + if len(missing_columns) > 0: + raise UnknownColumnNameError(missing_columns) + + if transformed_table.number_of_rows == 0: + raise ValueError("The StandardScaler cannot transform the table because it contains 0 rows") + + if ( + transformed_table.remove_columns_except(self._column_names).remove_non_numeric_columns().number_of_columns + < transformed_table.remove_columns_except(self._column_names).number_of_columns + ): + raise NonNumericColumnError( + str( + sorted( + set(transformed_table.remove_columns_except(self._column_names).column_names) + - set( + transformed_table.remove_columns_except(self._column_names) + .remove_non_numeric_columns() + .column_names, + ), + ), + ), + ) + + new_data = self._wrapped_transformer.inverse_transform( + transformed_table.remove_columns_except(self._column_names)._data_frame, + ) + return ExperimentalTable._from_polars_data_frame( + transformed_table._data_frame.update(new_data), + ) + + @property + def is_fitted(self) -> bool: + """Whether the transformer is fitted.""" + return self._wrapped_transformer is not None + + def get_names_of_added_columns(self) -> list[str]: + """ + Get the names of all new columns that have been added by the StandardScaler. + + Returns + ------- + added_columns: + A list of names of the added columns, ordered as they will appear in the table. 
+ + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + if not self.is_fitted: + raise TransformerNotFittedError + return [] + + # (Must implement abstract method, cannot instantiate class otherwise.) + def get_names_of_changed_columns(self) -> list[str]: + """ + Get the names of all columns that may have been changed by the StandardScaler. + + Returns + ------- + changed_columns: + The list of (potentially) changed column names, as passed to fit. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + if self._column_names is None: + raise TransformerNotFittedError + return self._column_names + + def get_names_of_removed_columns(self) -> list[str]: + """ + Get the names of all columns that have been removed by the StandardScaler. + + Returns + ------- + removed_columns: + A list of names of the removed columns, ordered as they appear in the table the StandardScaler was fitted on. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. 
+ """ + if not self.is_fitted: + raise TransformerNotFittedError + return [] diff --git a/src/safeds/data/tabular/transformation/_experimental_table_transformer.py b/src/safeds/data/tabular/transformation/_experimental_table_transformer.py new file mode 100644 index 000000000..ed30ae728 --- /dev/null +++ b/src/safeds/data/tabular/transformation/_experimental_table_transformer.py @@ -0,0 +1,161 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Self + +from safeds._utils import _structural_hash + +if TYPE_CHECKING: + from safeds.data.tabular.containers import ExperimentalTable + + +class ExperimentalTableTransformer(ABC): + """Learn a transformation for a set of columns in a `Table` and transform another `Table` with the same columns.""" + + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + def __hash__(self) -> int: + """ + Return a deterministic hash value for a table transformer. + + Returns + ------- + hash: + The hash value. 
+ """ + added = self.get_names_of_added_columns() if self.is_fitted else [] + changed = self.get_names_of_changed_columns() if self.is_fitted else [] + removed = self.get_names_of_removed_columns() if self.is_fitted else [] + return _structural_hash(self.__class__.__qualname__, self.is_fitted, added, changed, removed) + + # ------------------------------------------------------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------------------------------------------------------ + + @property + @abstractmethod + def is_fitted(self) -> bool: + """Whether the transformer is fitted.""" + + # ------------------------------------------------------------------------------------------------------------------ + # Methods + # ------------------------------------------------------------------------------------------------------------------ + + @abstractmethod + def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> Self: + """ + Learn a transformation for a set of columns in a table. + + This transformer is not modified. + + Parameters + ---------- + table: + The table used to fit the transformer. + column_names: + The list of columns from the table used to fit the transformer. If `None`, all columns are used. + + Returns + ------- + fitted_transformer: + The fitted transformer. + """ + + @abstractmethod + def transform(self, table: ExperimentalTable) -> ExperimentalTable: + """ + Apply the learned transformation to a table. + + The table is not modified. + + Parameters + ---------- + table: + The table to which the learned transformation is applied. + + Returns + ------- + transformed_table: + The transformed table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. 
+ """ + + @abstractmethod + def get_names_of_added_columns(self) -> list[str]: + """ + Get the names of all new columns that have been added by the transformer. + + Returns + ------- + added_columns: + A list of names of the added columns, ordered as they will appear in the table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + + @abstractmethod + def get_names_of_changed_columns(self) -> list[str]: + """ + Get the names of all columns that have been changed by the transformer. + + Returns + ------- + changed_columns: + A list of names of changed columns, ordered as they appear in the table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + + @abstractmethod + def get_names_of_removed_columns(self) -> list[str]: + """ + Get the names of all columns that have been removed by the transformer. + + Returns + ------- + removed_columns: + A list of names of the removed columns, ordered as they appear in the table the transformer was fitted on. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + + def fit_and_transform( + self, table: ExperimentalTable, column_names: list[str] | None = None + ) -> tuple[Self, ExperimentalTable]: + """ + Learn a transformation for a set of columns in a table and apply the learned transformation to the same table. + + Neither the transformer nor the table are modified. + + Parameters + ---------- + table: + The table used to fit the transformer. The transformer is then applied to this table. + column_names: + The list of columns from the table used to fit the transformer. If `None`, all columns are used. + + Returns + ------- + fitted_transformer: + The fitted transformer. + transformed_table: + The transformed table. 
+ """ + fitted_transformer = self.fit(table, column_names) + transformed_table = fitted_transformer.transform(table) + return fitted_transformer, transformed_table diff --git a/src/safeds/exceptions/_data.py b/src/safeds/exceptions/_data.py index ec37ae31a..a13542537 100644 --- a/src/safeds/exceptions/_data.py +++ b/src/safeds/exceptions/_data.py @@ -53,7 +53,7 @@ def __init__(self, column_info: str, help_msg: str | None = None) -> None: ) -class DuplicateColumnNameError(Exception): +class DuplicateColumnNameError(ValueError): """ Exception raised for trying to modify a table resulting in a duplicate column name. @@ -120,7 +120,7 @@ def __init__(self, expected_size: str, actual_size: str): super().__init__(f"Expected a column of size {expected_size} but got column of size {actual_size}.") -class ColumnLengthMismatchError(Exception): +class ColumnLengthMismatchError(ValueError): """Exception raised when the lengths of two or more columns do not match.""" def __init__(self, column_info: str): diff --git a/src/safeds/ml/classical/_util_sklearn.py b/src/safeds/ml/classical/_util_sklearn.py index 12025d585..df8ec362a 100644 --- a/src/safeds/ml/classical/_util_sklearn.py +++ b/src/safeds/ml/classical/_util_sklearn.py @@ -1,8 +1,8 @@ import warnings from typing import Any -from safeds.data.labeled.containers import TabularDataset -from safeds.data.tabular.containers import Table +from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset +from safeds.data.tabular.containers import ExperimentalColumn, ExperimentalTable, Table from safeds.exceptions import ( DatasetMissesDataError, DatasetMissesFeaturesError, @@ -15,8 +15,7 @@ ) -# noinspection PyProtectedMember -def fit(model: Any, tabular_dataset: TabularDataset) -> None: +def fit(model: Any, tabular_dataset: TabularDataset | ExperimentalTabularDataset) -> None: """ Fit a model for a given tabular dataset. 
@@ -46,9 +45,14 @@ def fit(model: Any, tabular_dataset: TabularDataset) -> None: if tabular_dataset._table.number_of_rows == 0: raise DatasetMissesDataError - non_numerical_column_names = set(tabular_dataset.features.column_names) - set( - tabular_dataset.features.remove_columns_with_non_numerical_values().column_names, - ) + if isinstance(tabular_dataset, TabularDataset): + non_numerical_column_names = set(tabular_dataset.features.column_names) - set( + tabular_dataset.features.remove_columns_with_non_numerical_values().column_names, + ) + else: # pragma: no cover + non_numerical_column_names = set(tabular_dataset.features.column_names) - set( + tabular_dataset.features.remove_non_numeric_columns().column_names, + ) if len(non_numerical_column_names) != 0: raise NonNumericColumnError( str(non_numerical_column_names), @@ -68,16 +72,27 @@ def fit(model: Any, tabular_dataset: TabularDataset) -> None: ) try: - model.fit( - tabular_dataset.features._data, - tabular_dataset.target._data, - ) + if isinstance(tabular_dataset, TabularDataset): + model.fit( + tabular_dataset.features._data, + tabular_dataset.target._data, + ) + else: # pragma: no cover + model.fit( + tabular_dataset.features._data_frame, + tabular_dataset.target._series, + ) except ValueError as exception: raise LearningError(str(exception)) from exception # noinspection PyProtectedMember -def predict(model: Any, dataset: Table, feature_names: list[str] | None, target_name: str | None) -> TabularDataset: +def predict( + model: Any, + dataset: Table | ExperimentalTable | ExperimentalTabularDataset, + feature_names: list[str] | None, + target_name: str | None, +) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. 
@@ -115,57 +130,111 @@ def predict(model: Any, dataset: Table, feature_names: list[str] | None, target_ # Validation if model is None or target_name is None or feature_names is None: raise ModelNotFittedError - missing_feature_names = [feature_name for feature_name in feature_names if not dataset.has_column(feature_name)] - if missing_feature_names: - raise DatasetMissesFeaturesError(missing_feature_names) - if isinstance(dataset, TabularDataset): - dataset = dataset.features # Cast to Table type, so Python will call the right methods... + if isinstance(dataset, ExperimentalTabularDataset): # pragma: no cover + dataset = dataset.features - if dataset.number_of_rows == 0: - raise DatasetMissesDataError + if isinstance(dataset, Table): + missing_feature_names = [feature_name for feature_name in feature_names if not dataset.has_column(feature_name)] + if missing_feature_names: + raise DatasetMissesFeaturesError(missing_feature_names) - non_numerical_column_names = set(dataset.keep_only_columns(feature_names).column_names) - set( - dataset.keep_only_columns(feature_names).remove_columns_with_non_numerical_values().column_names, - ) - if len(non_numerical_column_names) != 0: - raise NonNumericColumnError( - str(non_numerical_column_names), - "You can use the LabelEncoder or OneHotEncoder to transform your non-numerical data to numerical" - " data.\nThe OneHotEncoder should be used if you work with nominal data. 
If your data contains too many" - " different values\nor is ordinal, you should use the LabelEncoder.\n", - ) + if dataset.number_of_rows == 0: + raise DatasetMissesDataError - null_containing_column_names = set(dataset.keep_only_columns(feature_names).column_names) - set( - dataset.keep_only_columns(feature_names).remove_columns_with_missing_values().column_names, - ) - if len(null_containing_column_names) != 0: - raise MissingValuesColumnError( - str(null_containing_column_names), - "You can use the Imputer to replace the missing values based on different strategies.\nIf you want to" - " remove the missing values entirely you can use the method `Table.remove_rows_with_missing_values`.", + non_numerical_column_names = set(dataset.keep_only_columns(feature_names).column_names) - set( + dataset.keep_only_columns(feature_names).remove_columns_with_non_numerical_values().column_names, ) - - dataset_df = dataset.keep_only_columns(feature_names)._data - dataset_df.columns = feature_names - - result_set = dataset._data.reset_index(drop=True) - result_set.columns = dataset.column_names - - try: - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", message="X does not have valid feature names") - predicted_target_vector = model.predict(dataset_df.values) - result_set[target_name] = predicted_target_vector - - extra_names = [ - column_name - for column_name in dataset.column_names - if column_name != target_name and column_name not in feature_names - ] - - return Table._from_pandas_dataframe(result_set).to_tabular_dataset( - target_name=target_name, - extra_names=extra_names, + if len(non_numerical_column_names) != 0: + raise NonNumericColumnError( + str(non_numerical_column_names), + "You can use the LabelEncoder or OneHotEncoder to transform your non-numerical data to numerical" + " data.\nThe OneHotEncoder should be used if you work with nominal data. 
If your data contains too many" + " different values\nor is ordinal, you should use the LabelEncoder.\n", + ) + + null_containing_column_names = set(dataset.keep_only_columns(feature_names).column_names) - set( + dataset.keep_only_columns(feature_names).remove_columns_with_missing_values().column_names, ) - except ValueError as exception: - raise PredictionError(str(exception)) from exception + if len(null_containing_column_names) != 0: + raise MissingValuesColumnError( + str(null_containing_column_names), + "You can use the Imputer to replace the missing values based on different strategies.\nIf you want to" + " remove the missing values entirely you can use the method `Table.remove_rows_with_missing_values`.", + ) + + dataset_df = dataset.keep_only_columns(feature_names)._data + dataset_df.columns = feature_names + + result_set = dataset._data.reset_index(drop=True) + result_set.columns = dataset.column_names + + try: + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="X does not have valid feature names") + predicted_target_vector = model.predict(dataset_df.values) + result_set[target_name] = predicted_target_vector + + extra_names = [ + column_name + for column_name in dataset.column_names + if column_name != target_name and column_name not in feature_names + ] + + return Table._from_pandas_dataframe(result_set).to_tabular_dataset( + target_name=target_name, + extra_names=extra_names, + ) + except ValueError as exception: + raise PredictionError(str(exception)) from exception + elif isinstance(dataset, ExperimentalTable): # pragma: no cover + missing_feature_names = [feature_name for feature_name in feature_names if not dataset.has_column(feature_name)] + if missing_feature_names: + raise DatasetMissesFeaturesError(missing_feature_names) + + if dataset.number_of_rows == 0: + raise DatasetMissesDataError + + non_numerical_column_names_2 = set(dataset.column_names) - set( + dataset.remove_non_numeric_columns().column_names, + ) + if 
len(non_numerical_column_names_2) != 0: + raise NonNumericColumnError( + str(non_numerical_column_names_2), + "You can use the LabelEncoder or OneHotEncoder to transform your non-numerical data to numerical" + " data.\nThe OneHotEncoder should be used if you work with nominal data. If your data contains too many" + " different values\nor is ordinal, you should use the LabelEncoder.\n", + ) + + null_containing_column_names_2 = set(dataset.column_names) - set( + dataset.remove_columns_with_missing_values().column_names, + ) + if len(null_containing_column_names_2) != 0: + raise MissingValuesColumnError( + str(null_containing_column_names_2), + "You can use the Imputer to replace the missing values based on different strategies.\nIf you want to" + " remove the missing values entirely you can use the method `Table.remove_rows_with_missing_values`.", + ) + + dataset_df = dataset.remove_columns_except(feature_names) + + try: + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="X does not have valid feature names") + predicted_target_vector = model.predict(dataset_df._data_frame) + output = dataset.remove_columns(target_name).add_columns( + ExperimentalColumn(target_name, predicted_target_vector), + ) + + extra_names = [ + column_name + for column_name in dataset.column_names + if column_name != target_name and column_name not in feature_names + ] + + return TabularDataset( + output.to_dict(), + target_name=target_name, + extra_names=extra_names, + ) + except ValueError as exception: + raise PredictionError(str(exception)) from exception diff --git a/src/safeds/ml/classical/classification/_ada_boost.py b/src/safeds/ml/classical/classification/_ada_boost.py index d251e542c..20c1b2304 100644 --- a/src/safeds/ml/classical/classification/_ada_boost.py +++ b/src/safeds/ml/classical/classification/_ada_boost.py @@ -12,8 +12,8 @@ from sklearn.base import ClassifierMixin from sklearn.ensemble import AdaBoostClassifier as sk_AdaBoostClassifier - from 
safeds.data.labeled.containers import TabularDataset - from safeds.data.tabular.containers import Table + from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset + from safeds.data.tabular.containers import ExperimentalTable, Table class AdaBoostClassifier(Classifier): @@ -109,7 +109,7 @@ def learning_rate(self) -> float: """ return self._learning_rate - def fit(self, training_set: TabularDataset) -> AdaBoostClassifier: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> AdaBoostClassifier: """ Create a copy of this classifier and fit it with the given training data. @@ -152,7 +152,7 @@ def fit(self, training_set: TabularDataset) -> AdaBoostClassifier: return result - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. diff --git a/src/safeds/ml/classical/classification/_classifier.py b/src/safeds/ml/classical/classification/_classifier.py index c9a05cff3..614428092 100644 --- a/src/safeds/ml/classical/classification/_classifier.py +++ b/src/safeds/ml/classical/classification/_classifier.py @@ -4,8 +4,8 @@ from typing import TYPE_CHECKING from safeds._utils import _structural_hash -from safeds.data.labeled.containers import TabularDataset -from safeds.data.tabular.containers import Table +from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset +from safeds.data.tabular.containers import ExperimentalTable, Table from safeds.exceptions import PlainTableError if TYPE_CHECKING: @@ -29,7 +29,7 @@ def __hash__(self) -> int: return _structural_hash(self.__class__.__qualname__, self.is_fitted) @abstractmethod - def fit(self, training_set: TabularDataset) -> Classifier: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> Classifier: """ Create a copy of 
this classifier and fit it with the given training data. @@ -52,7 +52,10 @@ def fit(self, training_set: TabularDataset) -> Classifier: """ @abstractmethod - def predict(self, dataset: Table) -> TabularDataset: + def predict( + self, + dataset: Table | ExperimentalTable | ExperimentalTabularDataset, + ) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. @@ -96,7 +99,11 @@ def _get_sklearn_classifier(self) -> ClassifierMixin: # Metrics # ------------------------------------------------------------------------------------------------------------------ - def summarize_metrics(self, validation_or_test_set: TabularDataset, positive_class: Any) -> Table: + def summarize_metrics( + self, + validation_or_test_set: TabularDataset | ExperimentalTabularDataset, + positive_class: Any, + ) -> Table: """ Summarize the classifier's metrics on the given data. @@ -129,7 +136,7 @@ def summarize_metrics(self, validation_or_test_set: TabularDataset, positive_cla }, ) - def accuracy(self, validation_or_test_set: TabularDataset) -> float: + def accuracy(self, validation_or_test_set: TabularDataset | ExperimentalTabularDataset) -> float: """ Compute the accuracy of the classifier on the given data. 
@@ -153,12 +160,20 @@ def accuracy(self, validation_or_test_set: TabularDataset) -> float: if not isinstance(validation_or_test_set, TabularDataset) and isinstance(validation_or_test_set, Table): raise PlainTableError - expected_values = validation_or_test_set.target - predicted_values = self.predict(validation_or_test_set.features).target + if isinstance(validation_or_test_set, TabularDataset): + expected_values = validation_or_test_set.target + else: # pragma: no cover + expected_values = validation_or_test_set.target._series + predicted_values = self.predict(validation_or_test_set.features).target._data - return sk_accuracy_score(expected_values._data, predicted_values._data) + # TODO: more efficient implementation using polars + return sk_accuracy_score(expected_values._data, predicted_values) - def precision(self, validation_or_test_set: TabularDataset, positive_class: Any) -> float: + def precision( + self, + validation_or_test_set: TabularDataset | ExperimentalTabularDataset, + positive_class: Any, + ) -> float: """ Compute the classifier's precision on the given data. @@ -184,6 +199,7 @@ def precision(self, validation_or_test_set: TabularDataset, positive_class: Any) n_true_positives = 0 n_false_positives = 0 + # TODO: more efficient implementation using polars for expected_value, predicted_value in zip(expected_values, predicted_values, strict=True): if predicted_value == positive_class: if expected_value == positive_class: @@ -195,7 +211,7 @@ def precision(self, validation_or_test_set: TabularDataset, positive_class: Any) return 1.0 return n_true_positives / (n_true_positives + n_false_positives) - def recall(self, validation_or_test_set: TabularDataset, positive_class: Any) -> float: + def recall(self, validation_or_test_set: TabularDataset | ExperimentalTabularDataset, positive_class: Any) -> float: """ Compute the classifier's recall on the given data. 
@@ -221,6 +237,7 @@ def recall(self, validation_or_test_set: TabularDataset, positive_class: Any) -> n_true_positives = 0 n_false_negatives = 0 + # TODO: more efficient implementation using polars for expected_value, predicted_value in zip(expected_values, predicted_values, strict=True): if predicted_value == positive_class: if expected_value == positive_class: @@ -232,7 +249,11 @@ def recall(self, validation_or_test_set: TabularDataset, positive_class: Any) -> return 1.0 return n_true_positives / (n_true_positives + n_false_negatives) - def f1_score(self, validation_or_test_set: TabularDataset, positive_class: Any) -> float: + def f1_score( + self, + validation_or_test_set: TabularDataset | ExperimentalTabularDataset, + positive_class: Any, + ) -> float: """ Compute the classifier's $F_1$-score on the given data. @@ -259,6 +280,7 @@ def f1_score(self, validation_or_test_set: TabularDataset, positive_class: Any) n_false_negatives = 0 n_false_positives = 0 + # TODO: more efficient implementation using polars for expected_value, predicted_value in zip(expected_values, predicted_values, strict=True): if predicted_value == positive_class: if expected_value == positive_class: diff --git a/src/safeds/ml/classical/classification/_decision_tree.py b/src/safeds/ml/classical/classification/_decision_tree.py index ca7cd8d5b..e9a43466c 100644 --- a/src/safeds/ml/classical/classification/_decision_tree.py +++ b/src/safeds/ml/classical/classification/_decision_tree.py @@ -12,8 +12,8 @@ from sklearn.base import ClassifierMixin from sklearn.tree import DecisionTreeClassifier as sk_DecisionTreeClassifier - from safeds.data.labeled.containers import TabularDataset - from safeds.data.tabular.containers import Table + from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset + from safeds.data.tabular.containers import ExperimentalTable, Table class DecisionTreeClassifier(Classifier): @@ -77,7 +77,7 @@ def minimum_number_of_samples_in_leaves(self) -> int: 
"""The minimum number of samples that must remain in the leaves of the tree.""" return self._minimum_number_of_samples_in_leaves - def fit(self, training_set: TabularDataset) -> DecisionTreeClassifier: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> DecisionTreeClassifier: """ Create a copy of this classifier and fit it with the given training data. @@ -119,7 +119,7 @@ def fit(self, training_set: TabularDataset) -> DecisionTreeClassifier: return result - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. diff --git a/src/safeds/ml/classical/classification/_gradient_boosting.py b/src/safeds/ml/classical/classification/_gradient_boosting.py index 56545d345..869c77028 100644 --- a/src/safeds/ml/classical/classification/_gradient_boosting.py +++ b/src/safeds/ml/classical/classification/_gradient_boosting.py @@ -12,8 +12,8 @@ from sklearn.base import ClassifierMixin from sklearn.ensemble import GradientBoostingClassifier as sk_GradientBoostingClassifier - from safeds.data.labeled.containers import TabularDataset - from safeds.data.tabular.containers import Table + from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset + from safeds.data.tabular.containers import ExperimentalTable, Table class GradientBoostingClassifier(Classifier): @@ -84,7 +84,7 @@ def learning_rate(self) -> float: """ return self._learning_rate - def fit(self, training_set: TabularDataset) -> GradientBoostingClassifier: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> GradientBoostingClassifier: """ Create a copy of this classifier and fit it with the given training data. 
@@ -123,7 +123,7 @@ def fit(self, training_set: TabularDataset) -> GradientBoostingClassifier: return result - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. diff --git a/src/safeds/ml/classical/classification/_k_nearest_neighbors.py b/src/safeds/ml/classical/classification/_k_nearest_neighbors.py index 82c6cf920..974a4f9a5 100644 --- a/src/safeds/ml/classical/classification/_k_nearest_neighbors.py +++ b/src/safeds/ml/classical/classification/_k_nearest_neighbors.py @@ -3,8 +3,8 @@ from typing import TYPE_CHECKING from safeds._utils import _structural_hash -from safeds.data.labeled.containers import TabularDataset -from safeds.data.tabular.containers import Table +from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset +from safeds.data.tabular.containers import ExperimentalTable, Table from safeds.exceptions import ClosedBound, DatasetMissesDataError, OutOfBoundsError, PlainTableError from safeds.ml.classical._util_sklearn import fit, predict @@ -64,7 +64,7 @@ def number_of_neighbors(self) -> int: """ return self._number_of_neighbors - def fit(self, training_set: TabularDataset) -> KNearestNeighborsClassifier: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> KNearestNeighborsClassifier: """ Create a copy of this classifier and fit it with the given training data. @@ -116,7 +116,7 @@ def fit(self, training_set: TabularDataset) -> KNearestNeighborsClassifier: return result - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. 
diff --git a/src/safeds/ml/classical/classification/_logistic_regression.py b/src/safeds/ml/classical/classification/_logistic_regression.py index c3e0b09d0..22a6bcd00 100644 --- a/src/safeds/ml/classical/classification/_logistic_regression.py +++ b/src/safeds/ml/classical/classification/_logistic_regression.py @@ -11,8 +11,8 @@ from sklearn.base import ClassifierMixin from sklearn.linear_model import LogisticRegression as sk_LogisticRegression - from safeds.data.labeled.containers import TabularDataset - from safeds.data.tabular.containers import Table + from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset + from safeds.data.tabular.containers import ExperimentalTable, Table class LogisticRegressionClassifier(Classifier): @@ -27,7 +27,7 @@ def __init__(self) -> None: self._feature_names: list[str] | None = None self._target_name: str | None = None - def fit(self, training_set: TabularDataset) -> LogisticRegressionClassifier: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> LogisticRegressionClassifier: """ Create a copy of this classifier and fit it with the given training data. @@ -66,7 +66,7 @@ def fit(self, training_set: TabularDataset) -> LogisticRegressionClassifier: return result - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. 
diff --git a/src/safeds/ml/classical/classification/_random_forest.py b/src/safeds/ml/classical/classification/_random_forest.py index 567106c3d..ed5bb2681 100644 --- a/src/safeds/ml/classical/classification/_random_forest.py +++ b/src/safeds/ml/classical/classification/_random_forest.py @@ -12,8 +12,8 @@ from sklearn.base import ClassifierMixin from sklearn.ensemble import RandomForestClassifier as sk_RandomForestClassifier - from safeds.data.labeled.containers import TabularDataset - from safeds.data.tabular.containers import Table + from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset + from safeds.data.tabular.containers import ExperimentalTable, Table class RandomForestClassifier(Classifier): @@ -93,7 +93,7 @@ def minimum_number_of_samples_in_leaves(self) -> int: """The minimum number of samples that must remain in the leaves of each tree.""" return self._minimum_number_of_samples_in_leaves - def fit(self, training_set: TabularDataset) -> RandomForestClassifier: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> RandomForestClassifier: """ Create a copy of this classifier and fit it with the given training data. @@ -136,7 +136,7 @@ def fit(self, training_set: TabularDataset) -> RandomForestClassifier: return result - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. 
diff --git a/src/safeds/ml/classical/classification/_support_vector_machine.py b/src/safeds/ml/classical/classification/_support_vector_machine.py index 6890ebfd7..33499d782 100644 --- a/src/safeds/ml/classical/classification/_support_vector_machine.py +++ b/src/safeds/ml/classical/classification/_support_vector_machine.py @@ -13,15 +13,15 @@ from sklearn.base import ClassifierMixin from sklearn.svm import SVC as sk_SVC # noqa: N811 - from safeds.data.labeled.containers import TabularDataset - from safeds.data.tabular.containers import Table + from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset + from safeds.data.tabular.containers import ExperimentalTable, Table class SupportVectorMachineKernel(ABC): """The abstract base class of the different subclasses supported by the `Kernel`.""" @abstractmethod - def _get_sklearn_arguments(self) -> dict[str, Any]: + def _get_sklearn_arguments(self) -> dict[str, Any]: # TODO: use apply pattern (imputer strategy) instead """Return the arguments to pass to scikit-learn.""" @abstractmethod @@ -188,7 +188,7 @@ def __eq__(self, other: object) -> bool: __hash__ = SupportVectorMachineKernel.__hash__ - def fit(self, training_set: TabularDataset) -> SupportVectorMachineClassifier: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> SupportVectorMachineClassifier: """ Create a copy of this classifier and fit it with the given training data. @@ -227,7 +227,7 @@ def fit(self, training_set: TabularDataset) -> SupportVectorMachineClassifier: return result - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. 
diff --git a/src/safeds/ml/classical/regression/_ada_boost.py b/src/safeds/ml/classical/regression/_ada_boost.py index dd27e266d..3b85df127 100644 --- a/src/safeds/ml/classical/regression/_ada_boost.py +++ b/src/safeds/ml/classical/regression/_ada_boost.py @@ -12,8 +12,8 @@ from sklearn.base import RegressorMixin from sklearn.ensemble import AdaBoostRegressor as sk_AdaBoostRegressor - from safeds.data.labeled.containers import TabularDataset - from safeds.data.tabular.containers import Table + from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset + from safeds.data.tabular.containers import ExperimentalTable, Table class AdaBoostRegressor(Regressor): @@ -109,7 +109,7 @@ def learning_rate(self) -> float: """ return self._learning_rate - def fit(self, training_set: TabularDataset) -> AdaBoostRegressor: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> AdaBoostRegressor: """ Create a copy of this regressor and fit it with the given training data. @@ -152,7 +152,7 @@ def fit(self, training_set: TabularDataset) -> AdaBoostRegressor: return result - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. 
diff --git a/src/safeds/ml/classical/regression/_decision_tree.py b/src/safeds/ml/classical/regression/_decision_tree.py index d8a066973..33c40d1e6 100644 --- a/src/safeds/ml/classical/regression/_decision_tree.py +++ b/src/safeds/ml/classical/regression/_decision_tree.py @@ -12,8 +12,8 @@ from sklearn.base import RegressorMixin from sklearn.tree import DecisionTreeRegressor as sk_DecisionTreeRegressor - from safeds.data.labeled.containers import TabularDataset - from safeds.data.tabular.containers import Table + from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset + from safeds.data.tabular.containers import ExperimentalTable, Table class DecisionTreeRegressor(Regressor): @@ -77,7 +77,7 @@ def minimum_number_of_samples_in_leaves(self) -> int: """The minimum number of samples that must remain in the leaves of the tree.""" return self._minimum_number_of_samples_in_leaves - def fit(self, training_set: TabularDataset) -> DecisionTreeRegressor: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> DecisionTreeRegressor: """ Create a copy of this regressor and fit it with the given training data. @@ -119,7 +119,7 @@ def fit(self, training_set: TabularDataset) -> DecisionTreeRegressor: return result - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. 
diff --git a/src/safeds/ml/classical/regression/_elastic_net_regression.py b/src/safeds/ml/classical/regression/_elastic_net_regression.py index 125f49e7a..45b3069f4 100644 --- a/src/safeds/ml/classical/regression/_elastic_net_regression.py +++ b/src/safeds/ml/classical/regression/_elastic_net_regression.py @@ -14,8 +14,8 @@ from sklearn.base import RegressorMixin from sklearn.linear_model import ElasticNet as sk_ElasticNet - from safeds.data.labeled.containers import TabularDataset - from safeds.data.tabular.containers import Table + from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset + from safeds.data.tabular.containers import ExperimentalTable, Table class ElasticNetRegressor(Regressor): @@ -114,7 +114,7 @@ def lasso_ratio(self) -> float: """ return self._lasso_ratio - def fit(self, training_set: TabularDataset) -> ElasticNetRegressor: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> ElasticNetRegressor: """ Create a copy of this regressor and fit it with the given training data. @@ -153,7 +153,7 @@ def fit(self, training_set: TabularDataset) -> ElasticNetRegressor: return result - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. 
diff --git a/src/safeds/ml/classical/regression/_gradient_boosting.py b/src/safeds/ml/classical/regression/_gradient_boosting.py index 34ec419ab..4cf46bc97 100644 --- a/src/safeds/ml/classical/regression/_gradient_boosting.py +++ b/src/safeds/ml/classical/regression/_gradient_boosting.py @@ -12,8 +12,8 @@ from sklearn.base import RegressorMixin from sklearn.ensemble import GradientBoostingRegressor as sk_GradientBoostingRegressor - from safeds.data.labeled.containers import TabularDataset - from safeds.data.tabular.containers import Table + from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset + from safeds.data.tabular.containers import ExperimentalTable, Table class GradientBoostingRegressor(Regressor): @@ -84,7 +84,7 @@ def learning_rate(self) -> float: """ return self._learning_rate - def fit(self, training_set: TabularDataset) -> GradientBoostingRegressor: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> GradientBoostingRegressor: """ Create a copy of this regressor and fit it with the given training data. @@ -123,7 +123,7 @@ def fit(self, training_set: TabularDataset) -> GradientBoostingRegressor: return result - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. 
diff --git a/src/safeds/ml/classical/regression/_k_nearest_neighbors.py b/src/safeds/ml/classical/regression/_k_nearest_neighbors.py index 8a96b3a62..aa6198de3 100644 --- a/src/safeds/ml/classical/regression/_k_nearest_neighbors.py +++ b/src/safeds/ml/classical/regression/_k_nearest_neighbors.py @@ -3,8 +3,8 @@ from typing import TYPE_CHECKING from safeds._utils import _structural_hash -from safeds.data.labeled.containers import TabularDataset -from safeds.data.tabular.containers import Table +from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset +from safeds.data.tabular.containers import ExperimentalTable, Table from safeds.exceptions import ClosedBound, DatasetMissesDataError, OutOfBoundsError, PlainTableError from safeds.ml.classical._util_sklearn import fit, predict @@ -64,7 +64,7 @@ def number_of_neighbors(self) -> int: """ return self._number_of_neighbors - def fit(self, training_set: TabularDataset) -> KNearestNeighborsRegressor: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> KNearestNeighborsRegressor: """ Create a copy of this regressor and fit it with the given training data. @@ -118,7 +118,7 @@ def fit(self, training_set: TabularDataset) -> KNearestNeighborsRegressor: return result - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. 
diff --git a/src/safeds/ml/classical/regression/_lasso_regression.py b/src/safeds/ml/classical/regression/_lasso_regression.py index e912c4aba..2a74cc244 100644 --- a/src/safeds/ml/classical/regression/_lasso_regression.py +++ b/src/safeds/ml/classical/regression/_lasso_regression.py @@ -13,8 +13,8 @@ from sklearn.base import RegressorMixin from sklearn.linear_model import Lasso as sk_Lasso - from safeds.data.labeled.containers import TabularDataset - from safeds.data.tabular.containers import Table + from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset + from safeds.data.tabular.containers import ExperimentalTable, Table class LassoRegressor(Regressor): @@ -68,7 +68,7 @@ def alpha(self) -> float: """ return self._alpha - def fit(self, training_set: TabularDataset) -> LassoRegressor: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> LassoRegressor: """ Create a copy of this regressor and fit it with the given training data. @@ -107,7 +107,7 @@ def fit(self, training_set: TabularDataset) -> LassoRegressor: return result - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. 
diff --git a/src/safeds/ml/classical/regression/_linear_regression.py b/src/safeds/ml/classical/regression/_linear_regression.py index 101fec7a5..8c9d5db4d 100644 --- a/src/safeds/ml/classical/regression/_linear_regression.py +++ b/src/safeds/ml/classical/regression/_linear_regression.py @@ -11,10 +11,11 @@ from sklearn.base import RegressorMixin from sklearn.linear_model import LinearRegression as sk_LinearRegression - from safeds.data.labeled.containers import TabularDataset - from safeds.data.tabular.containers import Table + from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset + from safeds.data.tabular.containers import ExperimentalTable, Table +# TODO: rename to linear regressor class LinearRegressionRegressor(Regressor): """Linear regression.""" @@ -27,7 +28,7 @@ def __init__(self) -> None: self._feature_names: list[str] | None = None self._target_name: str | None = None - def fit(self, training_set: TabularDataset) -> LinearRegressionRegressor: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> LinearRegressionRegressor: """ Create a copy of this regressor and fit it with the given training data. @@ -66,7 +67,7 @@ def fit(self, training_set: TabularDataset) -> LinearRegressionRegressor: return result - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. 
diff --git a/src/safeds/ml/classical/regression/_random_forest.py b/src/safeds/ml/classical/regression/_random_forest.py index 1d807d3b9..2d4a8ad98 100644 --- a/src/safeds/ml/classical/regression/_random_forest.py +++ b/src/safeds/ml/classical/regression/_random_forest.py @@ -12,8 +12,8 @@ from sklearn.base import RegressorMixin from sklearn.ensemble import RandomForestRegressor as sk_RandomForestRegressor - from safeds.data.labeled.containers import TabularDataset - from safeds.data.tabular.containers import Table + from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset + from safeds.data.tabular.containers import ExperimentalTable, Table class RandomForestRegressor(Regressor): @@ -93,7 +93,7 @@ def minimum_number_of_samples_in_leaves(self) -> int: """The minimum number of samples that must remain in the leaves of each tree.""" return self._minimum_number_of_samples_in_leaves - def fit(self, training_set: TabularDataset) -> RandomForestRegressor: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> RandomForestRegressor: """ Create a copy of this regressor and fit it with the given training data. @@ -136,7 +136,7 @@ def fit(self, training_set: TabularDataset) -> RandomForestRegressor: return result - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. 
diff --git a/src/safeds/ml/classical/regression/_regressor.py b/src/safeds/ml/classical/regression/_regressor.py index 1779bbb0e..d1ac75c2c 100644 --- a/src/safeds/ml/classical/regression/_regressor.py +++ b/src/safeds/ml/classical/regression/_regressor.py @@ -4,8 +4,8 @@ from typing import TYPE_CHECKING from safeds._utils import _structural_hash -from safeds.data.labeled.containers import TabularDataset -from safeds.data.tabular.containers import Column, Table +from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset +from safeds.data.tabular.containers import Column, ExperimentalColumn, ExperimentalTable, Table from safeds.exceptions import ColumnLengthMismatchError, PlainTableError if TYPE_CHECKING: @@ -27,7 +27,7 @@ def __hash__(self) -> int: return _structural_hash(self.__class__.__qualname__, self.is_fitted) @abstractmethod - def fit(self, training_set: TabularDataset) -> Regressor: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> Regressor: """ Create a copy of this regressor and fit it with the given training data. @@ -50,7 +50,7 @@ def fit(self, training_set: TabularDataset) -> Regressor: """ @abstractmethod - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. @@ -94,7 +94,7 @@ def _get_sklearn_regressor(self) -> RegressorMixin: # Metrics # ------------------------------------------------------------------------------------------------------------------ - def summarize_metrics(self, validation_or_test_set: TabularDataset) -> Table: + def summarize_metrics(self, validation_or_test_set: TabularDataset | ExperimentalTabularDataset) -> Table: """ Summarize the regressor's metrics on the given data. 
@@ -123,7 +123,7 @@ def summarize_metrics(self, validation_or_test_set: TabularDataset) -> Table: }, ) - def mean_absolute_error(self, validation_or_test_set: TabularDataset) -> float: + def mean_absolute_error(self, validation_or_test_set: TabularDataset | ExperimentalTabularDataset) -> float: """ Compute the mean absolute error (MAE) of the regressor on the given data. @@ -146,14 +146,24 @@ def mean_absolute_error(self, validation_or_test_set: TabularDataset) -> float: if not isinstance(validation_or_test_set, TabularDataset) and isinstance(validation_or_test_set, Table): raise PlainTableError - expected = validation_or_test_set.target - predicted = self.predict(validation_or_test_set.features).target - _check_metrics_preconditions(predicted, expected) - return sk_mean_absolute_error(expected._data, predicted._data) + if isinstance(validation_or_test_set, TabularDataset): + expected = validation_or_test_set.target + predicted = self.predict(validation_or_test_set.features).target + + # TODO: more efficient implementation using polars + _check_metrics_preconditions(predicted, expected) + return sk_mean_absolute_error(expected._data, predicted._data) + elif isinstance(validation_or_test_set, ExperimentalTabularDataset): # pragma: no cover + expected_2 = validation_or_test_set.target + predicted_2 = self.predict(validation_or_test_set.features).target + + # TODO: more efficient implementation using polars + _check_metrics_preconditions_experimental(predicted_2, expected_2) + return sk_mean_absolute_error(expected_2._series, predicted_2._data) # noinspection PyProtectedMember - def mean_squared_error(self, validation_or_test_set: TabularDataset) -> float: + def mean_squared_error(self, validation_or_test_set: TabularDataset | ExperimentalTabularDataset) -> float: """ Compute the mean squared error (MSE) on the given data. 
@@ -176,14 +186,23 @@ def mean_squared_error(self, validation_or_test_set: TabularDataset) -> float: if not isinstance(validation_or_test_set, TabularDataset) and isinstance(validation_or_test_set, Table): raise PlainTableError - expected = validation_or_test_set.target - predicted = self.predict(validation_or_test_set.features).target - _check_metrics_preconditions(predicted, expected) - return sk_mean_squared_error(expected._data, predicted._data) + if isinstance(validation_or_test_set, TabularDataset): + expected = validation_or_test_set.target + predicted = self.predict(validation_or_test_set.features).target + + # TODO: more efficient implementation using polars + _check_metrics_preconditions(predicted, expected) + return sk_mean_squared_error(expected._data, predicted._data) + elif isinstance(validation_or_test_set, ExperimentalTabularDataset): # pragma: no cover + expected_2 = validation_or_test_set.target + predicted_2 = self.predict(validation_or_test_set.features).target + + # TODO: more efficient implementation using polars + _check_metrics_preconditions_experimental(predicted_2, expected_2) + return sk_mean_squared_error(expected_2._series, predicted_2._data) -# noinspection PyProtectedMember def _check_metrics_preconditions(actual: Column, expected: Column) -> None: if not actual.type.is_numeric(): raise TypeError(f"Column 'actual' is not numerical but {actual.type}.") @@ -194,3 +213,20 @@ def _check_metrics_preconditions(actual: Column, expected: Column) -> None: raise ColumnLengthMismatchError( "\n".join([f"{column.name}: {column._data.size}" for column in [actual, expected]]), ) + + +def _check_metrics_preconditions_experimental(actual: Column, expected: ExperimentalColumn) -> None: # pragma: no cover + if not actual.type.is_numeric(): + raise TypeError(f"Column 'actual' is not numerical but {actual.type}.") + if not expected.type.is_numeric: + raise TypeError(f"Column 'expected' is not numerical but {expected.type}.") + + if actual.number_of_rows 
!= expected.number_of_rows: + raise ColumnLengthMismatchError( + "\n".join( + [ + f"{actual.name}: {actual.number_of_rows}", + f"{expected.name}: {expected.number_of_rows}", + ], + ), + ) diff --git a/src/safeds/ml/classical/regression/_ridge_regression.py b/src/safeds/ml/classical/regression/_ridge_regression.py index de1d5cfc1..9a9b8f706 100644 --- a/src/safeds/ml/classical/regression/_ridge_regression.py +++ b/src/safeds/ml/classical/regression/_ridge_regression.py @@ -13,8 +13,8 @@ from sklearn.base import RegressorMixin from sklearn.linear_model import Ridge as sk_Ridge - from safeds.data.labeled.containers import TabularDataset - from safeds.data.tabular.containers import Table + from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset + from safeds.data.tabular.containers import ExperimentalTable, Table class RidgeRegressor(Regressor): @@ -69,7 +69,7 @@ def alpha(self) -> float: """ return self._alpha - def fit(self, training_set: TabularDataset) -> RidgeRegressor: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> RidgeRegressor: """ Create a copy of this regressor and fit it with the given training data. @@ -108,7 +108,7 @@ def fit(self, training_set: TabularDataset) -> RidgeRegressor: return result - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. 
diff --git a/src/safeds/ml/classical/regression/_support_vector_machine.py b/src/safeds/ml/classical/regression/_support_vector_machine.py index c1425530c..56627e978 100644 --- a/src/safeds/ml/classical/regression/_support_vector_machine.py +++ b/src/safeds/ml/classical/regression/_support_vector_machine.py @@ -13,15 +13,15 @@ from sklearn.base import RegressorMixin from sklearn.svm import SVC as sk_SVR # noqa: N811 - from safeds.data.labeled.containers import TabularDataset - from safeds.data.tabular.containers import Table + from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset + from safeds.data.tabular.containers import ExperimentalTable, Table class SupportVectorMachineKernel(ABC): """The abstract base class of the different subclasses supported by the `Kernel`.""" @abstractmethod - def _get_sklearn_arguments(self) -> dict[str, Any]: + def _get_sklearn_arguments(self) -> dict[str, Any]: # TODO: use apply pattern (imputer strategy) instead """Return the arguments to pass to scikit-learn.""" @abstractmethod @@ -188,7 +188,7 @@ def __eq__(self, other: object) -> bool: __hash__ = SupportVectorMachineKernel.__hash__ - def fit(self, training_set: TabularDataset) -> SupportVectorMachineRegressor: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> SupportVectorMachineRegressor: """ Create a copy of this regressor and fit it with the given training data. @@ -227,7 +227,7 @@ def fit(self, training_set: TabularDataset) -> SupportVectorMachineRegressor: return result - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. 
diff --git a/tests/helpers/_devices.py b/tests/helpers/_devices.py index b371a85f7..54b043e0e 100644 --- a/tests/helpers/_devices.py +++ b/tests/helpers/_devices.py @@ -1,10 +1,8 @@ import pytest import torch +from safeds._config import _init_default_device, _set_default_device from torch.types import Device -from safeds._config import _init_default_device -from safeds._config._device import _set_default_device - _init_default_device() device_cpu = torch.device("cpu") diff --git a/tests/safeds/_config/test_device.py b/tests/safeds/_config/test_torch.py similarity index 82% rename from tests/safeds/_config/test_device.py rename to tests/safeds/_config/test_torch.py index d99997757..9a2d7e008 100644 --- a/tests/safeds/_config/test_device.py +++ b/tests/safeds/_config/test_torch.py @@ -1,10 +1,9 @@ import pytest import torch +from safeds._config import _get_device, _init_default_device, _set_default_device from torch.types import Device -from safeds._config import _get_device, _init_default_device -from safeds._config._device import _set_default_device -from tests.helpers import get_devices, get_devices_ids, configure_test_with_device, device_cuda, device_cpu +from tests.helpers import configure_test_with_device, device_cpu, device_cuda, get_devices, get_devices_ids from tests.helpers._devices import _skip_if_device_not_available diff --git a/tests/safeds/data/image/containers/test_image.py b/tests/safeds/data/image/containers/test_image.py index 34d5bb37d..5cecde78a 100644 --- a/tests/safeds/data/image/containers/test_image.py +++ b/tests/safeds/data/image/containers/test_image.py @@ -7,7 +7,6 @@ import PIL.Image import pytest import torch - from safeds._config import _get_device from safeds.data.image.containers import Image from safeds.data.image.typing import ImageSize @@ -18,6 +17,8 @@ from tests.helpers import ( configure_test_with_device, + device_cpu, + device_cuda, get_devices, get_devices_ids, grayscale_jpg_id, @@ -41,8 +42,6 @@ white_square_jpg_path, 
white_square_png_id, white_square_png_path, - device_cpu, - device_cuda, )