diff --git a/modin/polars/base.py b/modin/polars/base.py index 9de6935b03e..10aceddde68 100644 --- a/modin/polars/base.py +++ b/modin/polars/base.py @@ -205,25 +205,6 @@ def to_arrow(self): """ return polars.from_pandas(self._query_compiler.to_pandas()).to_arrow() - def to_dict( - self, *, as_series: bool = True - ) -> dict[str, "Series"] | dict[str, list[Any]]: - """ - Convert the DataFrame to a dictionary representation. - - Args: - as_series: Whether to convert the columns to Series. - - Returns: - Dictionary representation of the DataFrame. - """ - if as_series: - return {name: self[name] for name in self.columns} - else: - return polars.from_pandas(self._query_compiler.to_pandas()).to_dict( - as_series=as_series - ) - def to_jax(self, device=None): """ Convert the DataFrame to JAX format. @@ -238,20 +219,6 @@ def to_jax(self, device=None): device=device ) - def to_list(self, *, use_pyarrow: bool | None = None) -> list[Any]: - """ - Convert the DataFrame to a list representation. - - Args: - use_pyarrow: Whether to use PyArrow for conversion. - - Returns: - List representation of the DataFrame. - """ - return polars.from_pandas(self._query_compiler.to_pandas()).to_list( - use_pyarrow=use_pyarrow - ) - def to_numpy( self, *, @@ -313,15 +280,6 @@ def cast(self, dtypes, *, strict: bool = True) -> "BasePolarsDataset": # TODO: support strict return self.__constructor__(_query_compiler=self._query_compiler.astype(dtypes)) - def copy(self): - """ - Copy the DataFrame. - - Returns: - Copied DataFrame. - """ - return self.__constructor__(_query_compiler=self._query_compiler.copy()) - def clone(self) -> "BasePolarsDataset": """ Clone the DataFrame. @@ -345,8 +303,6 @@ def drop_nulls(self, subset=None): _query_compiler=self._query_compiler.dropna(subset=subset, how="any") ) - drop_nans = drop_nulls - def explode(self, columns: str, *more_columns: str) -> "BasePolarsDataset": """ Explode the given columns to long format. 
@@ -528,9 +484,6 @@ def sample( def shift(self, n: int = 1, *, fill_value=None) -> "DataFrame": raise NotImplementedError("not yet") - def shift_and_fill(self, fill_value=None, *, n: int = 1) -> "DataFrame": - return self.shift(n=n, fill_value=fill_value) - def shrink_to_fit(self) -> "DataFrame": """ Shrink the DataFrame to fit in memory. @@ -605,8 +558,6 @@ def tail(self, n: int = 5) -> "DataFrame": _query_compiler=self._query_compiler.getitem_row_array(slice(-n, None)) ) - take_every = gather_every - def to_dummies( self, columns: str | Sequence[str] | None = None, @@ -706,3 +657,12 @@ def equals(self, other: "BasePolarsDataset", *, null_equal: bool = True) -> bool @property def plot(self): return polars.from_pandas(self._query_compiler.to_pandas()).plot + + def count(self): + """ + Get the number of non-null values in each column. + + Returns: + DataFrame with the counts. + """ + return self.__constructor__(_query_compiler=self._query_compiler.count(axis=0)) diff --git a/modin/polars/dataframe.py b/modin/polars/dataframe.py index b2589776bfa..bd1ee978196 100644 --- a/modin/polars/dataframe.py +++ b/modin/polars/dataframe.py @@ -129,9 +129,8 @@ def _set_columns(self, new_columns): Args: new_columns: New columns to set. """ - new_query_compiler = self.__constructor__( - self.to_pandas().rename(columns=new_columns) - )._query_compiler + new_query_compiler = self._query_compiler.copy() + new_query_compiler.columns = new_columns self._query_compiler = new_query_compiler columns = property(_get_columns, _set_columns) @@ -220,24 +219,6 @@ def __repr__(self): """ return repr(polars.from_pandas(self._query_compiler.to_pandas())) - def copy(self): - """ - Copy the DataFrame. - - Returns: - Copied DataFrame. - """ - return self.__constructor__(_query_compiler=self._query_compiler.copy()) - - def count(self): - """ - Get the number of non-null values in each column. - - Returns: - DataFrame with the counts. 
- """ - return self.__constructor__(_query_compiler=self._query_compiler.count(axis=0)) - def max(self, axis=None): """ Get the maximum value in each column. @@ -278,9 +259,12 @@ def _convert_non_numeric_to_null(self): ] if len(non_numeric_cols) > 0: return self.__constructor__( - self.to_pandas().assign( - **{c: None for c in non_numeric_cols} - ) # .astype(self._query_compiler.dtypes) + _query_compiler=self._query_compiler.write_items( + slice(None), + [self.columns.index(c) for c in non_numeric_cols], + pandas.NA, + need_columns_reindex=False, + ).astype({c: self._query_compiler.dtypes[c] for c in non_numeric_cols}) ) return self.copy() @@ -309,6 +293,19 @@ def mean(self, *, axis=None, null_strategy="ignore"): ignore_nulls=True if null_strategy == "ignore" else False ) + def median(self) -> "DataFrame": + """ + Get the median of each column. + + Returns: + DataFrame with the median of each column. + """ + return self.__constructor__( + _query_compiler=self._convert_non_numeric_to_null()._query_compiler.median( + 0 + ) + ) + def mean_horizontal(self, *, ignore_nulls: bool = True): """ Get the mean of each row. @@ -571,8 +568,6 @@ def group_by( return GroupBy(self, *by, maintain_order=maintain_order, **named_by) - groupby = group_by - def drop(self, *columns, strict: bool = True) -> "DataFrame": """ Drop the given columns. @@ -629,8 +624,6 @@ def get_column_index(self, name: str) -> int: """ return self.columns.index(name) - find_column_index_by_name = get_column_index - def get_columns(self) -> list["Series"]: """ Get the columns of the DataFrame. @@ -657,19 +650,6 @@ def group_by_dynamic( ): raise NotImplementedError("not yet") - def group_by_rolling( - self, - index_column, - *, - every, - period, - closed, - offset, - by, - check_sorted, - ): - raise NotImplementedError("not yet") - def hstack(self, columns, *, inplace: bool = False) -> "DataFrame": """ Stack the given columns horizontally. 
@@ -992,25 +972,6 @@ def rename(self, mapping: dict[str, str] | callable) -> "DataFrame": new_obj.columns = new_columns return new_obj - def replace(self, column: str, new_column: "Series") -> "DataFrame": - """ - Replace the column with the new column. - - Args: - column: Column to replace. - new_column: New column to replace with. - - Returns: - DataFrame with the column replaced. - """ - new_loc = self.get_column_index(column) - self._query_compiler = self._query_compiler.drop([column]).insert( - new_loc, - column, - new_column._query_compiler, - ) - return self - def replace_column(self, index: int, column: "Series") -> "DataFrame": """ Replace the column at the given index with the new column. @@ -1029,8 +990,6 @@ def replace_column(self, index: int, column: "Series") -> "DataFrame": ) return self - replace_at_idx = replace_column - def reverse(self) -> "DataFrame": """ Reverse the DataFrame. @@ -1296,8 +1255,6 @@ def map_rows( _query_compiler=self._query_compiler.apply(function, axis=1) ) - apply = map_rows - def corr(self, **kwargs: Any) -> "DataFrame": """ Compute the correlation of the DataFrame. @@ -1307,18 +1264,6 @@ def corr(self, **kwargs: Any) -> "DataFrame": """ return self.__constructor__(_query_compiler=self._query_compiler.corr(**kwargs)) - def frame_equal(self, other: "DataFrame", *, null_equal: bool = True) -> bool: - """ - Check if the DataFrame is equal to another DataFrame. - - Args: - other: DataFrame to compare with. - - Returns: - Whether the DataFrames are equal. - """ - return self.equals(other, null_equal=null_equal) - def lazy(self) -> "LazyFrame": """ Convert the DataFrame to a lazy DataFrame. @@ -1352,3 +1297,139 @@ def serialize(self, file=None) -> str | None: Serialized DataFrame. """ return polars.from_pandas(self._query_compiler.to_pandas()).serialize(file) + + @property + def style(self): + """ + Create a Great Table for styling. + + Returns: + GreatTable object. 
+ """ + return self._to_polars().style + + def to_dict( + self, *, as_series: bool = True + ) -> dict[str, "Series"] | dict[str, list[Any]]: + """ + Convert the DataFrame to a dictionary representation. + + Args: + as_series: Whether to convert the columns to Series. + + Returns: + Dictionary representation of the DataFrame. + """ + if as_series: + return {name: self[name] for name in self.columns} + else: + return polars.from_pandas(self._query_compiler.to_pandas()).to_dict( + as_series=as_series + ) + + def to_dicts(self) -> list[dict[str, Any]]: + """ + Convert the DataFrame to a list of dictionaries. + + Returns: + List of dictionaries. + """ + return self._to_polars().to_dicts() + + def to_init_repr(self, n: int = 1000) -> str: + """ + Get the string representation of the DataFrame for initialization. + + Returns: + String representation of the DataFrame for initialization. + """ + return self._to_polars().to_init_repr(n) + + def to_struct(self, name: str = "") -> "Series": + """ + Convert the DataFrame to a struct. + + Args: + name: Name of the struct. + + Returns: + Series representation of the DataFrame as a struct. + """ + raise NotImplementedError("not yet") + + def unpivot( + self, + on, + *, + index, + variable_name: str | None = None, + value_name: str | None = None, + ) -> "DataFrame": + """ + Unpivot a DataFrame from wide to long format. + + Args: + on: Columns to unpivot. + index: Columns to keep. + variable_name: Name of the variable column. + value_name: Name of the value column. + + Returns: + Unpivoted DataFrame. 
+ """ + return self.__constructor__( + _query_compiler=self._query_compiler.melt( + on=on, + index=index, + var_name=variable_name, + value_name=value_name, + ) + ) + + write_avro = write_clipboard = write_csv = write_database = write_delta = ( + write_excel + ) = write_ipc = write_ipc_stream = write_json = write_ndjson = write_parquet = ( + write_parquet_partitioned + ) = lambda *args, **kwargs: (_ for _ in ()).throw(NotImplementedError("not yet")) + + def clear(self, n: int = 0) -> "DataFrame": + """ + Create an empty (n=0) or null filled (n>0) DataFrame. + + Args: + n: Number of rows to create. + + Returns: + Empty or null filled DataFrame. + """ + return self.__constructor__(polars.DataFrame(schema=self.schema).clear(n=n)) + + def collect_schema(self) -> dict[str, str]: + """ + Collect the schema of the DataFrame. + + Returns: + Dictionary of the schema. + """ + return self.schema + + def fold(self, operation: callable) -> "Series": + """ + Fold the DataFrame. + + Args: + operation: Operation to fold the DataFrame with. + + Returns: + Series with the folded DataFrame. 
+ """ + raise NotImplementedError("not yet") + + def hash_rows( + self, + seed: int = 0, + seed_1: int | None = None, + seed_2: int | None = None, + seed_3: int | None = None, + ) -> "Series": + raise NotImplementedError("not yet") diff --git a/modin/polars/series.py b/modin/polars/series.py index 91edb7b6687..5eba917da10 100644 --- a/modin/polars/series.py +++ b/modin/polars/series.py @@ -20,6 +20,7 @@ import numpy as np import pandas import polars +from polars._utils.various import no_default from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler from modin.pandas import Series as ModinPandasSeries @@ -242,8 +243,6 @@ def dtype(self) -> polars.datatypes.DataType: pandas.Series().astype(self._query_compiler.dtypes.iloc[0]) ).dtype - inner_dtype = None - @property def name(self) -> str: """ @@ -770,8 +769,6 @@ def replace( self.to_pandas().apply(lambda x: mapping.get(x, default)) ) - map_dict = replace - def pct_change(self, n: int = 1) -> "Series": """ Calculate the percentage change. @@ -849,8 +846,6 @@ def rolling_map( .apply(function) ) - rolling_apply = rolling_map - def rolling_max( self, window_size: int, @@ -1202,15 +1197,6 @@ def has_nulls(self) -> bool: has_validity = has_nulls - def is_boolean(self) -> bool: - """ - Check if the data type is boolean. - - Returns: - True if the data type is boolean, False otherwise. - """ - return self.dtype == polars.datatypes.Boolean - def is_finite(self) -> "Series": """ Check if the values are finite. @@ -1229,17 +1215,6 @@ def is_first_distinct(self) -> "Series": """ raise NotImplementedError("not yet") - is_first = is_first_distinct - - def is_float(self) -> bool: - """ - Check if the data type is float. - - Returns: - True if the data type is float, False otherwise. - """ - return self.dtype in polars.datatypes.FLOAT_DTYPES - def is_in(self, other: "Series" | list[Any]) -> "Series": """ Check if the values are in the other Series. 
@@ -1261,23 +1236,6 @@ def is_infinite(self) -> "Series": """ return self.__eq__(np.inf) - def is_integer(self, signed: bool | None = None) -> bool: - """ - Check if the data type is integer. - - Args: - signed: Signed integer. - - Returns: - True if the data type is integer, False otherwise. - """ - if signed is None: - return self.dtype in polars.datatypes.INTEGER_DTYPES - elif signed: - return self.dtype in polars.datatypes.SIGNED_INTEGER_DTYPES - else: - return self.dtype in polars.datatypes.UNSIGNED_INTEGER_DTYPES - def is_last_distinct(self) -> "Series": """ Check if the values are the last occurrence. @@ -1287,8 +1245,6 @@ def is_last_distinct(self) -> "Series": """ raise NotImplementedError("not yet") - is_last = is_last_distinct - def is_nan(self) -> "Series": """ Check if the values are NaN. @@ -1325,15 +1281,6 @@ def is_null(self) -> "Series": """ return self.is_nan() - def is_numeric(self) -> bool: - """ - Check if the data type is numeric. - - Returns: - True if the data type is numeric, False otherwise. - """ - return self.dtype in polars.datatypes.NUMERIC_DTYPES - def is_sorted( self, *, @@ -1355,27 +1302,6 @@ def is_sorted( else self.to_pandas().is_monotonic_decreasing ) - def is_temporal(self, excluding=None) -> bool: - """ - Check if the data type is temporal. - - Args: - excluding: Excluding data types. - - Returns: - True if the data type is temporal, False otherwise. - """ - return self.dtype in polars.datatypes.DATETIME_DTYPES - - def is_utf8(self) -> bool: - """ - Check if the data type is UTF-8. - - Returns: - True if the data type is UTF-8, False otherwise. - """ - return self.dtype == polars.String - def len(self) -> int: """ Get the length of the values. @@ -1551,30 +1477,6 @@ def clip(self, lower_bound=None, upper_bound=None) -> "Series": values=self.to_pandas().clip(lower_bound, upper_bound) ) - def clip_max(self, upper_bound) -> "Series": - """ - Clip the maximum values. - - Args: - upper_bound: Upper bound. 
- - Returns: - Clipped maximum values Series. - """ - return self.clip(lower_bound=None, upper_bound=upper_bound) - - def clip_min(self, lower_bound) -> "Series": - """ - Clip the minimum values. - - Args: - lower_bound: Lower bound. - - Returns: - Clipped minimum values Series. - """ - return self.clip(lower_bound=lower_bound, upper_bound=None) - def cut( self, breaks: Sequence[float], @@ -1793,19 +1695,6 @@ def set(self, filter: "Series", value: int | float | str | bool | None) -> "Seri """ raise NotImplementedError("not yet") - def set_at_idx(self, indices, values) -> "Series": - """ - Set values by indices. - - Args: - indices: Indices. - value: Value. - - Returns: - Set Series. - """ - raise NotImplementedError("not yet") - def shrink_dtype(self) -> "Series": """ Shrink the data type. @@ -1827,11 +1716,6 @@ def shuffle(self, seed: int | None = None) -> "Series": """ raise NotImplementedError("not yet") - take = gather - - def view(self, *, ignore_nulls: bool = False): - raise NotImplementedError("not yet") - def zip_with(self, mask: "Series", other: "Series") -> "Series": """ Zip the Series with another Series. @@ -1867,8 +1751,6 @@ def map_elements( """ return self.__constructor__(values=self.to_pandas().apply(function)) - apply = map_elements - def reinterpret(self, *, signed: bool = True) -> "Series": """ Reinterpret the data type of the series as signed or unsigned. @@ -1881,18 +1763,6 @@ def reinterpret(self, *, signed: bool = True) -> "Series": """ raise NotImplementedError("not yet") - def series_equal(self, other: "Series", *, null_equal: bool = True) -> bool: - """ - Check if the Series are equal. - - Args: - other: Other Series. - - Returns: - True if the Series are equal, False otherwise. - """ - return self.equals(other, null_equal=null_equal) - def set_sorted(self, *, descending: bool = False) -> "Series": """ Set the Series as sorted. 
@@ -1942,3 +1812,336 @@ def dt(self): # TODO: implement dt object # https://docs.pola.rs/api/python/stable/reference/series/temporal.html raise NotImplementedError("not yet") + + def __len__(self) -> int: + """ + Get the length of the Series. + """ + return self.len() + + def __matmul__(self, other) -> "Series": + """ + Matrix multiplication. + + Args: + other: Other Series. + + Returns: + Matrix multiplication Series. + """ + raise NotImplementedError("not yet") + + def __radd__(self, other) -> "Series": + """ + Right addition. + + Args: + other: Other Series. + + Returns: + Added Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.radd(other, axis=0) + ) + + def __rand__(self, other) -> "Series": + """ + Right and. + + Args: + other: Other Series. + + Returns: + And Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.__rand__(other, axis=0) + ) + + def __rfloordiv__(self, other) -> "Series": + """ + Right floor division. + + Args: + other: Other Series. + + Returns: + Floored Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.rfloordiv(other, axis=0) + ) + + def __rmatmul__(self, other) -> "Series": + """ + Right matrix multiplication. + + Args: + other: Other Series. + + Returns: + Matrix multiplication Series. + """ + raise NotImplementedError("not yet") + + def __rmod__(self, other) -> "Series": + """ + Right modulo. + + Args: + other: Other Series. + + Returns: + Modulo Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.rmod(other, axis=0) + ) + + def __rmul__(self, other) -> "Series": + """ + Right multiplication. + + Args: + other: Other Series. + + Returns: + Multiplied Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.rmul(other, axis=0) + ) + + def __ror__(self, other) -> "Series": + """ + Right or. + + Args: + other: Other Series. + + Returns: + Or Series. 
+ """ + return self.__constructor__( + _query_compiler=self._query_compiler.__ror__(other, axis=0) + ) + + def __rpow__(self, other) -> "Series": + """ + Right power. + + Args: + other: Other Series. + + Returns: + Powered Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.rpow(other, axis=0) + ) + + def __rsub__(self, other) -> "Series": + """ + Right subtraction. + + Args: + other: Other Series. + + Returns: + Subtracted Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.rsub(other, axis=0) + ) + + def __rtruediv__(self, other) -> "Series": + """ + Right true division. + + Args: + other: Other Series. + + Returns: + Divided Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.rtruediv(other, axis=0) + ) + + def __rxor__(self, other) -> "Series": + """ + Right xor. + + Args: + other: Other Series. + + Returns: + Xor Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.__rxor__(other, axis=0) + ) + + def eq(self, other) -> "Series": + """ + Check if the values are equal to the other Series. + + Args: + other: Other Series. + + Returns: + Boolean Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.eq(other._query_compiler) + ) + + def eq_missing(self, other) -> "Series": + """ + Check if the values are equal to the other Series, including missing values. + + Args: + other: Other Series. + + Returns: + Boolean Series. + """ + raise NotImplementedError("not yet") + + def ge(self, other) -> "Series": + """ + Check if the values are greater than or equal to the other Series. + + Args: + other: Other Series. + + Returns: + Boolean Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.ge(other._query_compiler) + ) + + def gt(self, other) -> "Series": + """ + Check if the values are greater than the other Series. + + Args: + other: Other Series. + + Returns: + Boolean Series. 
+ """ + return self.__constructor__( + _query_compiler=self._query_compiler.gt(other._query_compiler) + ) + + def le(self, other) -> "Series": + """ + Check if the values are less than or equal to the other Series. + + Args: + other: Other Series. + + Returns: + Boolean Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.le(other._query_compiler) + ) + + def lt(self, other) -> "Series": + """ + Check if the values are less than the other Series. + + Args: + other: Other Series. + + Returns: + Boolean Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.lt(other._query_compiler) + ) + + def n_unique(self) -> int: + """ + Get the number of unique values. + + Returns: + Number of unique values. + """ + return self._query_compiler.nunique().to_pandas().squeeze(axis=None) + + def ne(self, other) -> "Series": + """ + Check if the values are not equal to the other Series. + + Args: + other: Other Series. + + Returns: + Boolean Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.ne(other._query_compiler) + ) + + def ne_missing(self, other) -> "Series": + """ + Check if the values are not equal to the other Series, including missing values. + + Args: + other: Other Series. + + Returns: + Boolean Series. + """ + raise NotImplementedError("not yet") + + def pow(self, exponent) -> "Series": + """ + Raise the values to the power of the exponent. + + Args: + exponent: Exponent. + + Returns: + Powered Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.pow(exponent, axis=0) + ) + + def replace_strict(self, old, new=no_default, *, default=no_default, return_dtype=None) -> "Series": + """ + Replace values strictly. + + Args: + old: Old values. + new: New values. + default: Default value. + + Returns: + Replaced Series. + """ + raise NotImplementedError("not yet") + + def to_list(self) -> list: + """ + Convert the Series to a list. 
+
+        Returns:
+            List representation of the Series.
+        """
+        return self._to_polars().to_list()
+
+    def drop_nans(self) -> "Series":
+        """
+        Drop NaN values.
+
+        Returns:
+            Series without NaN values.
+        """
+        return self.__constructor__(_query_compiler=self._query_compiler.dropna(how="any"))