diff --git a/.kokoro/docs/docs-presubmit-gerrit.cfg b/.kokoro/docs/docs-presubmit-gerrit.cfg
new file mode 100644
index 0000000000..1d0dc4b499
--- /dev/null
+++ b/.kokoro/docs/docs-presubmit-gerrit.cfg
@@ -0,0 +1,23 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+env_vars: {
+    key: "V2_STAGING_BUCKET"
+    value: "gcloud-python-test"
+}
+
+# We only upload the image in the main `docs` build.
+env_vars: {
+    key: "TRAMPOLINE_IMAGE_UPLOAD"
+    value: "false"
+}
+
+env_vars: {
+    key: "TRAMPOLINE_BUILD_FILE"
+    value: ".kokoro/build.sh"
+}
+
+# Only run this nox session.
+env_vars: {
+    key: "NOX_SESSION"
+    value: "docfx"
+}
diff --git a/.kokoro/presubmit/e2e-gerrit.cfg b/.kokoro/presubmit/e2e-gerrit.cfg
new file mode 100644
index 0000000000..d875f36060
--- /dev/null
+++ b/.kokoro/presubmit/e2e-gerrit.cfg
@@ -0,0 +1,7 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+# Only run this nox session.
+env_vars: {
+    key: "NOX_SESSION"
+    value: "system_noextras e2e notebook samples"
+}
diff --git a/.kokoro/presubmit/presubmit-gerrit.cfg b/.kokoro/presubmit/presubmit-gerrit.cfg
new file mode 100644
index 0000000000..18a4c35325
--- /dev/null
+++ b/.kokoro/presubmit/presubmit-gerrit.cfg
@@ -0,0 +1 @@
+# Format: //devtools/kokoro/config/proto/build.proto
diff --git a/OWNERS b/OWNERS
index 672da38afa..f86ad551ef 100644
--- a/OWNERS
+++ b/OWNERS
@@ -1,3 +1,4 @@
+ashleyxu@google.com
 bmil@google.com
 chelsealin@google.com
 garrettwu@google.com
diff --git a/README.rst b/README.rst
index 935c54cc8b..23aea446ff 100644
--- a/README.rst
+++ b/README.rst
@@ -57,9 +57,13 @@ internally to manage metadata on the service side. This session is tied to a
 BigQuery DataFrames uses the US multi-region as the default location, but you
 can use ``session_options.location`` to set a different location. Every query in a
 session is executed in the location where the session was created.
+BigQuery DataFrames
+auto-populates ``bf.options.bigquery.location`` if the user starts with
+``read_gbq/read_gbq_table/read_gbq_query()`` and specifies a table, either
+directly or in a SQL statement.
 
 If you want to reset the location of the created DataFrame or Series objects,
-can reset the session by executing ``bigframes.pandas.reset_session()``.
+you can reset the session by executing ``bigframes.pandas.reset_session()``.
 After that, you can reuse ``bigframes.pandas.options.bigquery.location`` to
 specify another location.
 
@@ -68,6 +72,11 @@ specify another location.
 querying is not in the US multi-region. If you try to read a table from
 another location, you get a NotFound exception.
 
+Project
+-------
+If ``bf.options.bigquery.project`` is not set, the ``$GOOGLE_CLOUD_PROJECT``
+environment variable is used, which is set in the notebook runtime serving the
+BigQuery Studio/Vertex Notebooks.
 
 ML Capabilities
 ---------------
diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py
index 7086269af9..27fe4a4fe6 100644
--- a/bigframes/core/__init__.py
+++ b/bigframes/core/__init__.py
@@ -518,8 +518,8 @@ def aggregate(
         """
         Apply aggregations to the expression.
         Arguments:
-            by_column_id: column id of the aggregation key, this is preserved through the transform
             aggregations: input_column_id, operation, output_column_id tuples
+            by_column_id: column id of the aggregation key, this is preserved through the transform
             dropna: whether null keys should be dropped
         """
         table = self.to_ibis_expr(ordering_mode="unordered")
diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py
index abf8b887d8..5dcd9fe753 100644
--- a/bigframes/core/block_transforms.py
+++ b/bigframes/core/block_transforms.py
@@ -117,6 +117,25 @@ def value_counts(
     return block.select_column(count_id).with_column_labels(["count"])
 
 
+def pct_change(block: blocks.Block, periods: int = 1) -> blocks.Block:
+    column_labels = block.column_labels
+    window_spec = core.WindowSpec(
+        preceding=periods if periods > 0 else None,
+        following=-periods if periods < 0 else None,
+    )
+
+    original_columns = block.value_columns
+    block, shift_columns = block.multi_apply_window_op(
+        original_columns, agg_ops.ShiftOp(periods), window_spec=window_spec
+    )
+    result_ids = []
+    for original_col, shifted_col in zip(original_columns, shift_columns):
+        block, change_id = block.apply_binary_op(original_col, shifted_col, ops.sub_op)
+        block, pct_change_id = block.apply_binary_op(change_id, shifted_col, ops.div_op)
+        result_ids.append(pct_change_id)
+    return block.select_columns(result_ids).with_column_labels(column_labels)
+
+
 def rank(
     block: blocks.Block,
     method: str = "average",
@@ -229,3 +248,160 @@ def dropna(block: blocks.Block, how: typing.Literal["all", "any"] = "any"):
     filtered_block = filtered_block.filter(predicate)
     filtered_block = filtered_block.select_columns(block.value_columns)
     return filtered_block
+
+
+def nsmallest(
+    block: blocks.Block,
+    n: int,
+    column_ids: typing.Sequence[str],
+    keep: str,
+) -> blocks.Block:
+    if keep not in ("first", "last", "all"):
+        raise ValueError("keep must be one of 'first', 'last', or 'all'")
+    if keep == "last":
+        block = block.reversed()
+    order_refs = [
+        ordering.OrderingColumnReference(
+            col_id, direction=ordering.OrderingDirection.ASC
+        )
+        for col_id in column_ids
+    ]
+    block = block.order_by(order_refs, stable=True)
+    if keep in ("first", "last"):
+        return block.slice(0, n)
+    else:  # keep == "all":
+        block, counter = block.apply_window_op(
+            column_ids[0],
+            agg_ops.rank_op,
+            window_spec=core.WindowSpec(ordering=order_refs),
+        )
+        block, condition = block.apply_unary_op(
+            counter, ops.partial_right(ops.le_op, n)
+        )
+        block = block.filter(condition)
+        return block.drop_columns([counter, condition])
+
+
+def nlargest(
+    block: blocks.Block,
+    n: int,
+    column_ids: typing.Sequence[str],
+    keep: str,
+) -> blocks.Block:
+    if keep not in ("first", "last", "all"):
+        raise ValueError("keep must be one of 'first', 'last', or 'all'")
+    if keep == "last":
+        block = block.reversed()
+    order_refs = [
+        ordering.OrderingColumnReference(
+            col_id, direction=ordering.OrderingDirection.DESC
+        )
+        for col_id in column_ids
+    ]
+    block = block.order_by(order_refs, stable=True)
+    if keep in ("first", "last"):
+        return block.slice(0, n)
+    else:  # keep == "all":
+        block, counter = block.apply_window_op(
+            column_ids[0],
+            agg_ops.rank_op,
+            window_spec=core.WindowSpec(ordering=order_refs),
+        )
+        block, condition = block.apply_unary_op(
+            counter, ops.partial_right(ops.le_op, n)
+        )
+        block = block.filter(condition)
+        return block.drop_columns([counter, condition])
+
+
+def skew(
+    block: blocks.Block,
+    skew_column_ids: typing.Sequence[str],
grouping_column_ids: typing.Sequence[str] = (), +) -> blocks.Block: + + original_columns = skew_column_ids + column_labels = block.select_columns(original_columns).column_labels + + block, delta3_ids = _mean_delta_to_power( + block, 3, original_columns, grouping_column_ids + ) + # counts, moment3 for each column + aggregations = [] + for i, col in enumerate(original_columns): + count_agg = (col, agg_ops.count_op) + moment3_agg = (delta3_ids[i], agg_ops.mean_op) + variance_agg = (col, agg_ops.PopVarOp()) + aggregations.extend([count_agg, moment3_agg, variance_agg]) + + block, agg_ids = block.aggregate( + by_column_ids=grouping_column_ids, aggregations=aggregations + ) + + skew_ids = [] + for i, col in enumerate(original_columns): + # Corresponds to order of aggregations in preceding loop + count_id, moment3_id, var_id = agg_ids[i * 3 : (i * 3) + 3] + block, skew_id = _skew_from_moments_and_count( + block, count_id, moment3_id, var_id + ) + skew_ids.append(skew_id) + + block = block.select_columns(skew_ids).with_column_labels(column_labels) + if not grouping_column_ids: + # When ungrouped, stack everything into single column so can be returned as series + block = block.stack() + block = block.drop_levels([block.index_columns[0]]) + return block + + +def _mean_delta_to_power( + block: blocks.Block, + n_power, + column_ids: typing.Sequence[str], + grouping_column_ids: typing.Sequence[str], +) -> typing.Tuple[blocks.Block, typing.Sequence[str]]: + """Calculate (x-mean(x))^n. Useful for calculating moment statistics such as skew and kurtosis.""" + window = core.WindowSpec(grouping_keys=grouping_column_ids) + block, mean_ids = block.multi_apply_window_op(column_ids, agg_ops.mean_op, window) + delta_ids = [] + cube_op = ops.partial_right(ops.pow_op, n_power) + for val_id, mean_val_id in zip(column_ids, mean_ids): + block, delta_id = block.apply_binary_op(val_id, mean_val_id, ops.sub_op) + block, delta_power_id = block.apply_unary_op(delta_id, cube_op) + block = block.drop_columns(delta_id) + delta_ids.append(delta_power_id) + return block, delta_ids + + +def _skew_from_moments_and_count( + block: blocks.Block, count_id: str, moment3_id: str, var_id: str +) -> typing.Tuple[blocks.Block, str]: + # Calculate skew using count, third moment and population variance + # See G1 estimator: + # https://en.wikipedia.org/wiki/Skewness#Sample_skewness + block, denominator_id = block.apply_unary_op( + var_id, ops.partial_right(ops.pow_op, 3 / 2) + ) + block, base_id = block.apply_binary_op(moment3_id, denominator_id, ops.div_op) + block, countminus1_id = block.apply_unary_op( + count_id, ops.partial_right(ops.sub_op, 1) + ) + block, countminus2_id = block.apply_unary_op( + count_id, ops.partial_right(ops.sub_op, 2) + ) + block, adjustment_id = block.apply_binary_op(count_id, countminus1_id, ops.mul_op) + block, adjustment_id = block.apply_unary_op( + adjustment_id, ops.partial_right(ops.pow_op, 1 / 2) + ) + block, adjustment_id = block.apply_binary_op( + adjustment_id, countminus2_id, ops.div_op + ) + block, skew_id = block.apply_binary_op(base_id, adjustment_id, ops.mul_op) + + # Need to produce NA if have less than 3 data points + block, na_cond_id = block.apply_unary_op(count_id, ops.partial_right(ops.ge_op, 3)) + block, skew_id = block.apply_binary_op( + skew_id, na_cond_id, ops.partial_arg3(ops.where_op, None) + ) + return block, skew_id diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 482cfd0141..5b414252ee 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ 
-709,8 +709,9 @@ def multi_apply_window_op( window_spec: core.WindowSpec, *, skip_null_groups: bool = False, - ) -> Block: + ) -> typing.Tuple[Block, typing.Sequence[str]]: block = self + result_ids = [] for i, col_id in enumerate(columns): label = self.col_id_to_label[col_id] block, result_id = block.apply_window_op( @@ -721,9 +722,8 @@ def multi_apply_window_op( result_label=label, skip_null_groups=skip_null_groups, ) - block = block.copy_values(result_id, col_id) - block = block.drop_columns([result_id]) - return block + result_ids.append(result_id) + return block, result_ids def multi_apply_unary_op( self, @@ -1123,7 +1123,9 @@ def promote_offsets(self, label: Label = None) -> typing.Tuple[Block, str]: ) def add_prefix(self, prefix: str, axis: str | int | None = None) -> Block: - axis_number = bigframes.core.utils.get_axis_number(axis) + axis_number = bigframes.core.utils.get_axis_number( + "rows" if (axis is None) else axis + ) if axis_number == 0: expr = self._expr for index_col in self._index_columns: @@ -1140,7 +1142,9 @@ def add_prefix(self, prefix: str, axis: str | int | None = None) -> Block: return self.rename(columns=lambda label: f"{prefix}{label}") def add_suffix(self, suffix: str, axis: str | int | None = None) -> Block: - axis_number = bigframes.core.utils.get_axis_number(axis) + axis_number = bigframes.core.utils.get_axis_number( + "rows" if (axis is None) else axis + ) if axis_number == 0: expr = self._expr for index_col in self._index_columns: diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 589c5c251c..810e145d33 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -20,6 +20,7 @@ import bigframes.constants as constants import bigframes.core as core +import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks import bigframes.core.ordering as order import bigframes.core.utils as utils @@ -145,6 +146,16 @@ def var( self._raise_on_non_numeric("var") return self._aggregate_all(agg_ops.var_op, numeric_only=True) + def skew( + self, + *, + numeric_only: bool = False, + ) -> df.DataFrame: + if not numeric_only: + self._raise_on_non_numeric("skew") + block = block_ops.skew(self._block, self._selected_cols, self._by_col_ids) + return df.DataFrame(block) + def all(self) -> df.DataFrame: return self._aggregate_all(agg_ops.all_op) @@ -168,6 +179,22 @@ def cummax(self, *args, numeric_only: bool = False, **kwargs) -> df.DataFrame: def cumprod(self, *args, **kwargs) -> df.DataFrame: return self._apply_window_op(agg_ops.product_op, numeric_only=True) + def shift(self, periods=1) -> series.Series: + window = core.WindowSpec( + grouping_keys=self._by_col_ids, + preceding=periods if periods > 0 else None, + following=-periods if periods < 0 else None, + ) + return self._apply_window_op(agg_ops.ShiftOp(periods), window=window) + + def diff(self, periods=1) -> series.Series: + window = core.WindowSpec( + grouping_keys=self._by_col_ids, + preceding=periods if periods > 0 else None, + following=-periods if periods < 0 else None, + ) + return self._apply_window_op(agg_ops.DiffOp(periods), window=window) + def agg(self, func=None, **kwargs) -> df.DataFrame: if func: if isinstance(func, str): @@ -323,10 +350,10 @@ def _apply_window_op( grouping_keys=self._by_col_ids, following=0 ) columns = self._aggregated_columns(numeric_only=numeric_only) - block = self._block.multi_apply_window_op( + block, result_ids = self._block.multi_apply_window_op( columns, op, window_spec=window_spec, 
skip_null_groups=self._dropna ) - block = block.select_columns(columns) + block = block.select_columns(result_ids) return df.DataFrame(block) def _resolve_label(self, label: blocks.Label) -> str: @@ -391,6 +418,10 @@ def std(self, *args, **kwargs) -> series.Series: def var(self, *args, **kwargs) -> series.Series: return self._aggregate(agg_ops.var_op) + def skew(self, *args, **kwargs) -> series.Series: + block = block_ops.skew(self._block, [self._value_column], self._by_col_ids) + return series.Series(block) + def prod(self, *args) -> series.Series: return self._aggregate(agg_ops.product_op) @@ -459,8 +490,13 @@ def shift(self, periods=1) -> series.Series: ) return self._apply_window_op(agg_ops.ShiftOp(periods), window=window) - def diff(self) -> series.Series: - return self._ungroup() - self.shift(1) + def diff(self, periods=1) -> series.Series: + window = core.WindowSpec( + grouping_keys=self._by_col_ids, + preceding=periods if periods > 0 else None, + following=-periods if periods < 0 else None, + ) + return self._apply_window_op(agg_ops.DiffOp(periods), window=window) def rolling(self, window: int, min_periods=None) -> windows.Window: # To get n size window, need current row and n-1 preceding rows. diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index 46091f211a..28bce05338 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -145,23 +145,41 @@ def __setitem__( value: bigframes.dataframe.SingleItemValue, ): if ( - not isinstance(key, tuple) - or len(key) != 2 - or not isinstance(key[0], slice) - or (key[0].start is not None and key[0].start != 0) - or (key[0].step is not None and key[0].step != 1) - or key[0].stop is not None + isinstance(key, tuple) + and len(key) == 2 + and isinstance(key[0], slice) + and (key[0].start is None or key[0].start == 0) + and (key[0].step is None or key[0].step == 1) + and key[0].stop is None ): + # TODO(swast): Support setting multiple columns with key[1] as a list + # of labels and value as a DataFrame. + df = self._dataframe.assign(**{key[1]: value}) + self._dataframe._set_block(df._get_block()) + elif ( + isinstance(key, tuple) + and len(key) == 2 + and isinstance(key[0], bigframes.series.Series) + and key[0].dtype == "boolean" + ) and pd.api.types.is_scalar(value): + new_column = key[0].map({True: value, False: None}) + try: + original_column = self._dataframe[key[1]] + except KeyError: + self._dataframe[key[1]] = new_column + return + try: + self._dataframe[key[1]] = new_column.fillna(original_column) + except ibis.common.exceptions.IbisTypeError: + raise TypeError( + f"Cannot assign scalar of type {type(value)} to column of type {original_column.dtype}, or index type of series argument does not match dataframe." + ) + else: raise NotImplementedError( - "Only setting a column by DataFrame.loc[:, 'column'] is supported." + "Only DataFrame.loc[:, 'column'] and DataFrame.loc[bool series, 'column'] = Scalar are supported." f"{constants.FEEDBACK_LINK}" ) - # TODO(swast): Support setting multiple columns with key[1] as a list - # of labels and value as a DataFrame. 
- df = self._dataframe.assign(**{key[1]: value}) - self._dataframe._set_block(df._get_block()) - class ILocDataFrameIndexer: def __init__(self, dataframe: bigframes.dataframe.DataFrame): diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/index.py index 04b9a36b64..748a68c944 100644 --- a/bigframes/core/indexes/index.py +++ b/bigframes/core/indexes/index.py @@ -53,6 +53,10 @@ def names(self) -> typing.Sequence[blocks.Label]: def names(self, values: typing.Sequence[blocks.Label]): return self._data._set_block(self._data._get_block().with_index_labels(values)) + @property + def nlevels(self) -> int: + return len(self._data._get_block().index_columns) + @property def shape(self) -> typing.Tuple[int]: return (self._data._get_block().shape[0],) @@ -97,6 +101,22 @@ def is_monotonic_decreasing(self) -> bool: ), ) + @property + def is_unique(self) -> bool: + # TODO: Cache this at block level + # Avoid circular imports + import bigframes.core.block_transforms as block_ops + import bigframes.dataframe as df + + duplicates_block, _ = block_ops.indicate_duplicates( + self._data._get_block(), self._data._get_block().index_columns + ) + duplicates_block = duplicates_block.with_column_labels( + ["values", "is_duplicate"] + ) + duplicates_df = df.DataFrame(duplicates_block) + return not duplicates_df["is_duplicate"].any() + def __getitem__(self, key: int) -> typing.Any: if isinstance(key, int): result_pd_df, _ = self._data._get_block().slice(key, key + 1, 1).to_pandas() diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py index 1c0a2a1a81..75175690ce 100644 --- a/bigframes/core/utils.py +++ b/bigframes/core/utils.py @@ -23,8 +23,8 @@ UNNAMED_INDEX_ID = "bigframes_unnamed_index" -def get_axis_number(axis: typing.Union[str, int, None]) -> typing.Literal[0, 1]: - if axis in {0, "index", "rows", None}: +def get_axis_number(axis: typing.Union[str, int]) -> typing.Literal[0, 1]: + if axis in {0, "index", "rows"}: return 0 elif axis in {1, "columns"}: return 1 diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 31777f3fac..d65d4ce344 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -455,7 +455,7 @@ def __getattr__(self, key: str): raise AttributeError(key) def __repr__(self) -> str: - """Converts a DataFrame to a string. Calls compute. + """Converts a DataFrame to a string. Calls to_pandas. Only represents the first `bigframes.options.display.max_rows`. """ @@ -532,13 +532,14 @@ def _apply_binop( other: float | int | bigframes.series.Series | DataFrame, op, axis: str | int = "columns", + how: str = "outer", ): if isinstance(other, (float, int)): return self._apply_scalar_binop(other, op) elif isinstance(other, bigframes.series.Series): - return self._apply_series_binop(other, op, axis=axis) + return self._apply_series_binop(other, op, axis=axis, how=how) elif isinstance(other, DataFrame): - return self._apply_dataframe_binop(other, op) + return self._apply_dataframe_binop(other, op, how=how) raise NotImplementedError( f"binary operation is not implemented on the second operand of type {type(other).__name__}." 
f"{constants.FEEDBACK_LINK}" @@ -559,6 +560,7 @@ def _apply_series_binop( other: bigframes.series.Series, op: ops.BinaryOp, axis: str | int = "columns", + how: str = "outer", ) -> DataFrame: if axis not in ("columns", "index", 0, 1): raise ValueError(f"Invalid input: axis {axis}.") @@ -569,7 +571,7 @@ def _apply_series_binop( ) joined_index, (get_column_left, get_column_right) = self._block.index.join( - other._block.index, how="outer" + other._block.index, how=how ) series_column_id = other._value.get_name() @@ -591,22 +593,27 @@ def _apply_series_binop( return DataFrame(block) def _apply_dataframe_binop( - self, - other: DataFrame, - op: ops.BinaryOp, + self, other: DataFrame, op: ops.BinaryOp, how: str = "outer" ) -> DataFrame: # Join rows joined_index, (get_column_left, get_column_right) = self._block.index.join( - other._block.index, how="outer" + other._block.index, how=how ) # join columns schema + # indexers will be none for exact match columns, lcol_indexer, rcol_indexer = self.columns.join( - other.columns, how="outer", return_indexers=True + other.columns, how=how, return_indexers=True ) binop_result_ids = [] block = joined_index._block - for left_index, right_index in zip(lcol_indexer, rcol_indexer): + + column_indices = zip( + lcol_indexer if (lcol_indexer is not None) else range(len(columns)), + rcol_indexer if (lcol_indexer is not None) else range(len(columns)), + ) + + for left_index, right_index in column_indices: if left_index >= 0 and right_index >= 0: # -1 indices indicate missing left_col_id = self._block.value_columns[left_index] right_col_id = other._block.value_columns[right_index] @@ -617,13 +624,19 @@ def _apply_dataframe_binop( ) binop_result_ids.append(result_col_id) elif left_index >= 0: - dtype = self.dtypes[left_index] - block, null_col_id = block.create_constant(None, dtype=dtype) - binop_result_ids.append(null_col_id) + left_col_id = self._block.value_columns[left_index] + block, result_col_id = block.apply_unary_op( + get_column_left(left_col_id), + ops.partial_right(op, None), + ) + binop_result_ids.append(result_col_id) elif right_index >= 0: - dtype = other.dtypes[right_index] - block, null_col_id = block.create_constant(None, dtype=dtype) - binop_result_ids.append(null_col_id) + right_col_id = other._block.value_columns[right_index] + block, result_col_id = block.apply_unary_op( + get_column_right(right_col_id), + ops.partial_left(op, None), + ) + binop_result_ids.append(result_col_id) else: # Should not be possible raise ValueError("No right or left index.") @@ -759,6 +772,75 @@ def rpow( __rpow__ = rpow + def combine( + self, + other: DataFrame, + func: typing.Callable[ + [bigframes.series.Series, bigframes.series.Series], bigframes.series.Series + ], + fill_value=None, + overwrite: bool = True, + ) -> DataFrame: + # Join rows + joined_index, (get_column_left, get_column_right) = self._block.index.join( + other._block.index, how="outer" + ) + columns, lcol_indexer, rcol_indexer = self.columns.join( + other.columns, how="outer", return_indexers=True + ) + + column_indices = zip( + lcol_indexer if (lcol_indexer is not None) else range(len(columns)), + rcol_indexer if (lcol_indexer is not None) else range(len(columns)), + ) + + block = joined_index._block + results = [] + for left_index, right_index in column_indices: + if left_index >= 0 and right_index >= 0: # -1 indices indicate missing + left_col_id = get_column_left(self._block.value_columns[left_index]) + right_col_id = get_column_right(other._block.value_columns[right_index]) + left_series = 
bigframes.series.Series(block.select_column(left_col_id)) + right_series = bigframes.series.Series( + block.select_column(right_col_id) + ) + if fill_value is not None: + left_series = left_series.fillna(fill_value) + right_series = right_series.fillna(fill_value) + results.append(func(left_series, right_series)) + elif left_index >= 0: + # Does not exist in other + if overwrite: + dtype = self.dtypes[left_index] + block, null_col_id = block.create_constant(None, dtype=dtype) + result = bigframes.series.Series(block.select_column(null_col_id)) + results.append(result) + else: + left_col_id = get_column_left(self._block.value_columns[left_index]) + result = bigframes.series.Series(block.select_column(left_col_id)) + if fill_value is not None: + result = result.fillna(fill_value) + results.append(result) + elif right_index >= 0: + right_col_id = get_column_right(other._block.value_columns[right_index]) + result = bigframes.series.Series(block.select_column(right_col_id)) + if fill_value is not None: + result = result.fillna(fill_value) + results.append(result) + else: + # Should not be possible + raise ValueError("No right or left index.") + + if all([isinstance(val, bigframes.series.Series) for val in results]): + import bigframes.core.reshape as rs + + return rs.concat(results, axis=1) + else: + raise ValueError("'func' must return Series") + + def combine_first(self, other: DataFrame): + return self._apply_dataframe_binop(other, ops.fillna_op) + def to_pandas( self, max_download_size: Optional[int] = None, @@ -810,6 +892,28 @@ def head(self, n: int = 5) -> DataFrame: def tail(self, n: int = 5) -> DataFrame: return typing.cast(DataFrame, self.iloc[-n:]) + def nlargest( + self, + n: int, + columns: typing.Union[blocks.Label, typing.Sequence[blocks.Label]], + keep: str = "first", + ) -> DataFrame: + if keep not in ("first", "last", "all"): + raise ValueError("'keep must be one of 'first', 'last', or 'all'") + column_ids = self._sql_names(columns) + return DataFrame(block_ops.nlargest(self._block, n, column_ids, keep=keep)) + + def nsmallest( + self, + n: int, + columns: typing.Union[blocks.Label, typing.Sequence[blocks.Label]], + keep: str = "first", + ) -> DataFrame: + if keep not in ("first", "last", "all"): + raise ValueError("'keep must be one of 'first', 'last', or 'all'") + column_ids = self._sql_names(columns) + return DataFrame(block_ops.nsmallest(self._block, n, column_ids, keep=keep)) + def drop( self, labels: typing.Any = None, @@ -852,13 +956,50 @@ def drop( raise ValueError("Must specify 'labels' or 'index'/'columns") return DataFrame(block) - def droplevel(self, level: LevelsType): - resolved_level_ids = self._resolve_levels(level) - return DataFrame(self._block.drop_levels(resolved_level_ids)) + def droplevel(self, level: LevelsType, axis: int | str = 0): + axis_n = utils.get_axis_number(axis) + if axis_n == 0: + resolved_level_ids = self._resolve_levels(level) + return DataFrame(self._block.drop_levels(resolved_level_ids)) + else: + if isinstance(self.columns, pandas.MultiIndex): + new_df = self.copy() + new_df.columns = self.columns.droplevel(level) + return new_df + else: + raise ValueError("Columns must be a multiindex to drop levels.") + + def swaplevel(self, i: int = -2, j: int = -1, axis: int | str = 0): + axis_n = utils.get_axis_number(axis) + if axis_n == 0: + level_i = self._block.index_columns[i] + level_j = self._block.index_columns[j] + mapping = {level_i: level_j, level_j: level_i} + reordering = [ + mapping.get(index_id, index_id) + for index_id in 
self._block.index_columns + ] + return DataFrame(self._block.reorder_levels(reordering)) + else: + if isinstance(self.columns, pandas.MultiIndex): + new_df = self.copy() + new_df.columns = self.columns.swaplevel(i, j) + return new_df + else: + raise ValueError("Columns must be a multiindex to reorder levels.") - def reorder_levels(self, order: LevelsType): - resolved_level_ids = self._resolve_levels(order) - return DataFrame(self._block.reorder_levels(resolved_level_ids)) + def reorder_levels(self, order: LevelsType, axis: int | str = 0): + axis_n = utils.get_axis_number(axis) + if axis_n == 0: + resolved_level_ids = self._resolve_levels(order) + return DataFrame(self._block.reorder_levels(resolved_level_ids)) + else: + if isinstance(self.columns, pandas.MultiIndex): + new_df = self.copy() + new_df.columns = self.columns.reorder_levels(order) + return new_df + else: + raise ValueError("Columns must be a multiindex to reorder levels.") def _resolve_levels(self, level: LevelsType) -> typing.Sequence[str]: if utils.is_list_like(level): @@ -1096,8 +1237,177 @@ def add_suffix(self, suffix: str, axis: int | str | None = None) -> DataFrame: axis = 1 if axis is None else axis return DataFrame(self._get_block().add_suffix(suffix, axis)) + def filter( + self, + items: typing.Optional[typing.Iterable] = None, + like: typing.Optional[str] = None, + regex: typing.Optional[str] = None, + axis: int | str | None = None, + ) -> DataFrame: + if sum([(items is not None), (like is not None), (regex is not None)]) != 1: + raise ValueError( + "Need to provide exactly one of 'items', 'like', or 'regex'" + ) + axis_n = utils.get_axis_number(axis) if (axis is not None) else 1 + if axis_n == 0: # row labels + return self._filter_rows(items, like, regex) + else: # column labels + return self._filter_columns(items, like, regex) + + def _filter_rows( + self, + items: typing.Optional[typing.Iterable] = None, + like: typing.Optional[str] = None, + regex: typing.Optional[str] = None, + ) -> DataFrame: + if len(self._block.index_columns) > 1: + raise NotImplementedError( + "Method filter does not support rows multiindex. 
{constants.FEEDBACK_LINK}" + ) + if (like is not None) or (regex is not None): + block = self._block + block, label_string_id = block.apply_unary_op( + self._block.index_columns[0], + ops.AsTypeOp(pandas.StringDtype(storage="pyarrow")), + ) + if like is not None: + block, mask_id = block.apply_unary_op( + label_string_id, ops.ContainsStringOp(pat=like) + ) + else: # regex + assert regex is not None + block, mask_id = block.apply_unary_op( + label_string_id, ops.ContainsRegexOp(pat=regex) + ) + + block = block.filter(mask_id) + block = block.select_columns(self._block.value_columns) + return DataFrame(block) + elif items is not None: + # Behavior matches pandas 2.1+, older pandas versions would reindex + block = self._block + block, mask_id = block.apply_unary_op( + self._block.index_columns[0], ops.IsInOp(values=list(items)) + ) + block = block.filter(mask_id) + block = block.select_columns(self._block.value_columns) + return DataFrame(block) + else: + raise ValueError("Need to provide 'items', 'like', or 'regex'") + + def _filter_columns( + self, + items: typing.Optional[typing.Iterable] = None, + like: typing.Optional[str] = None, + regex: typing.Optional[str] = None, + ) -> DataFrame: + if (like is not None) or (regex is not None): + + def label_filter(label): + label_str = label if isinstance(label, str) else str(label) + if like: + return like in label_str + else: # regex + return re.match(regex, label_str) is not None + + cols = [ + col_id + for col_id, label in zip(self._block.value_columns, self.columns) + if label_filter(label) + ] + return DataFrame(self._block.select_columns(cols)) + if items is not None: + # Behavior matches pandas 2.1+, older pandas versions would reorder using order of items + new_columns = self.columns.intersection(pandas.Index(items)) + return self.reindex(columns=new_columns) + else: + raise ValueError("Need to provide 'items', 'like', or 'regex'") + + def reindex( + self, + labels=None, + *, + index=None, + columns=None, + axis: typing.Optional[typing.Union[str, int]] = None, + validate: typing.Optional[bool] = None, + ): + if labels: + if index or columns: + raise ValueError("Cannot specify both 'labels' and 'index'/'columns") + axis_n = utils.get_axis_number(axis) if (axis is not None) else 0 + if axis_n == 0: + index = labels + else: + columns = labels + if (index is not None) and (columns is not None): + return self._reindex_columns(columns)._reindex_rows( + index, validate=validate or False + ) + if index is not None: + return self._reindex_rows(index, validate=validate or False) + if columns is not None: + return self._reindex_columns(columns) + + def _reindex_rows( + self, + index, + *, + validate: typing.Optional[bool] = None, + ): + if validate and not self.index.is_unique: + raise ValueError("Original index must be unique to reindex") + keep_original_names = False + if isinstance(index, indexes.Index): + new_indexer = DataFrame(data=index._data._get_block())[[]] + else: + if not isinstance(index, pandas.Index): + keep_original_names = True + index = pandas.Index(index) + if index.nlevels != self.index.nlevels: + raise NotImplementedError( + "Cannot reindex with index with different nlevels" + ) + new_indexer = DataFrame(index=index)[[]] + # multiindex join is senstive to index names, so we will set all these + result = new_indexer.rename_axis(range(new_indexer.index.nlevels)).join( + self.rename_axis(range(self.index.nlevels)), + how="left", + ) + # and then reset the names after the join + return result.rename_axis( + self.index.names if 
keep_original_names else index.names + ) + + def _reindex_columns(self, columns): + block = self._block + new_column_index, indexer = self.columns.reindex(columns) + result_cols = [] + for label, index in zip(columns, indexer): + if index >= 0: + result_cols.append(self._block.value_columns[index]) + else: + block, null_col = block.create_constant( + pandas.NA, label, dtype=pandas.Float64Dtype() + ) + result_cols.append(null_col) + result_df = DataFrame(block.select_columns(result_cols)) + result_df.columns = new_column_index + return result_df + + def reindex_like(self, other: DataFrame, *, validate: typing.Optional[bool] = None): + return self.reindex(index=other.index, columns=other.columns, validate=validate) + def fillna(self, value=None) -> DataFrame: - return self._apply_binop(value, ops.fillna_op) + return self._apply_binop(value, ops.fillna_op, how="left") + + def ffill(self, *, limit: typing.Optional[int] = None) -> DataFrame: + window = bigframes.core.WindowSpec(preceding=limit, following=0) + return self._apply_window_op(agg_ops.LastNonNullOp(), window) + + def bfill(self, *, limit: typing.Optional[int] = None) -> DataFrame: + window = bigframes.core.WindowSpec(preceding=0, following=limit) + return self._apply_window_op(agg_ops.FirstNonNullOp(), window) def isin(self, values) -> DataFrame: if utils.is_dict_like(values): @@ -1309,6 +1619,14 @@ def describe(self) -> DataFrame: ) return typing.cast(DataFrame, result) + def skew(self, *, numeric_only: bool = False): + if not numeric_only: + frame = self._raise_on_non_numeric("skew") + else: + frame = self._drop_non_numeric() + result_block = block_ops.skew(frame._block, frame._block.value_columns) + return bigframes.series.Series(result_block) + def pivot( self, *, @@ -1702,17 +2020,29 @@ def shift(self, periods: int = 1) -> DataFrame: ) return self._apply_window_op(agg_ops.ShiftOp(periods), window) + def diff(self, periods: int = 1) -> DataFrame: + window = bigframes.core.WindowSpec( + preceding=periods if periods > 0 else None, + following=-periods if periods < 0 else None, + ) + return self._apply_window_op(agg_ops.DiffOp(periods), window) + + def pct_change(self, periods: int = 1) -> DataFrame: + # Future versions of pandas will not perfrom ffill automatically + df = self.ffill() + return DataFrame(block_ops.pct_change(df._block, periods=periods)) + def _apply_window_op( self, op: agg_ops.WindowOp, window_spec: bigframes.core.WindowSpec, ): - block = self._block.multi_apply_window_op( + block, result_ids = self._block.multi_apply_window_op( self._block.value_columns, op, window_spec=window_spec, ) - return DataFrame(block) + return DataFrame(block.select_columns(result_ids)) def sample( self, @@ -1875,6 +2205,98 @@ def to_parquet(self, path: str, *, index: bool = True) -> None: _, query_job = self._block.expr._session._start_query(export_data_statement) self._set_internal_query_job(query_job) + def to_dict( + self, + orient: Literal[ + "dict", "list", "series", "split", "tight", "records", "index" + ] = "dict", + into: type[dict] = dict, + **kwargs, + ) -> dict | list[dict]: + return self.to_pandas().to_dict(orient, into, **kwargs) # type: ignore + + def to_excel(self, excel_writer, sheet_name: str = "Sheet1", **kwargs) -> None: + return self.to_pandas().to_excel(excel_writer, sheet_name, **kwargs) + + def to_latex( + self, + buf=None, + columns: Sequence | None = None, + header: bool | Sequence[str] = True, + index: bool = True, + **kwargs, + ) -> str | None: + return self.to_pandas().to_latex( + buf, columns=columns, 
header=header, index=index, **kwargs # type: ignore + ) + + def to_records( + self, index: bool = True, column_dtypes=None, index_dtypes=None + ) -> numpy.recarray: + return self.to_pandas().to_records(index, column_dtypes, index_dtypes) + + def to_string( + self, + buf=None, + columns: Sequence[str] | None = None, + col_space=None, + header: bool | Sequence[str] = True, + index: bool = True, + na_rep: str = "NaN", + formatters=None, + float_format=None, + sparsify: bool | None = None, + index_names: bool = True, + justify: str | None = None, + max_rows: int | None = None, + max_cols: int | None = None, + show_dimensions: bool = False, + decimal: str = ".", + line_width: int | None = None, + min_rows: int | None = None, + max_colwidth: int | None = None, + encoding: str | None = None, + ) -> str | None: + return self.to_pandas().to_string( + buf, + columns, # type: ignore + col_space, + header, # type: ignore + index, + na_rep, + formatters, + float_format, + sparsify, + index_names, + justify, + max_rows, + max_cols, + show_dimensions, + decimal, + line_width, + min_rows, + max_colwidth, + encoding, + ) + + def to_markdown( + self, + buf=None, + mode: str = "wt", + index: bool = True, + **kwargs, + ) -> str | None: + return self.to_pandas().to_markdown(buf, mode, index, **kwargs) # type: ignore + + def to_pickle(self, path, **kwargs) -> None: + return self.to_pandas().to_pickle(path, **kwargs) + + def to_orc(self, path=None, **kwargs) -> bytes | None: + as_pandas = self.to_pandas() + # to_orc only works with default index + as_pandas_default_index = as_pandas.reset_index() + return as_pandas_default_index.to_orc(path, **kwargs) + def _apply_unary_op(self, operation: ops.UnaryOp) -> DataFrame: block = self._block.multi_apply_unary_op(self._block.value_columns, operation) return DataFrame(block) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 95cf737b2e..af3209b0e1 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -157,7 +157,7 @@ def ibis_dtype_to_bigframes_dtype( return IBIS_TO_BIGFRAMES[ibis_dtype] else: raise ValueError( - f"Unexpected Ibis data type {type(ibis_dtype)}. {constants.FEEDBACK_LINK}" + f"Unexpected Ibis data type {ibis_dtype}. {constants.FEEDBACK_LINK}" ) diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py index 9f9d9f85d0..f07274f8fc 100644 --- a/bigframes/ml/base.py +++ b/bigframes/ml/base.py @@ -133,7 +133,7 @@ class TrainablePredictor(Predictor): Also the predictor can be attached to a pipeline with transformers.""" @abc.abstractmethod - def fit(self, X, y, transforms): + def _fit(self, X, y, transforms=None): pass @abc.abstractmethod @@ -146,6 +146,36 @@ def to_gbq(self, model_name, replace): pass +class SupervisedTrainablePredictor(TrainablePredictor): + """A BigQuery DataFrames ML Supervised Model base class that can be used to fit and predict outputs. + + Need to provide both X and y in supervised tasks.""" + + _T = TypeVar("_T", bound="SupervisedTrainablePredictor") + + def fit( + self: _T, + X: Union[bpd.DataFrame, bpd.Series], + y: Union[bpd.DataFrame, bpd.Series], + ) -> _T: + return self._fit(X, y) + + +class UnsupervisedTrainablePredictor(TrainablePredictor): + """A BigQuery DataFrames ML Unsupervised Model base class that can be used to fit and predict outputs. 
+ + Only need to provide both X (y is optional and ignored) in unsupervised tasks.""" + + _T = TypeVar("_T", bound="UnsupervisedTrainablePredictor") + + def fit( + self: _T, + X: Union[bpd.DataFrame, bpd.Series], + y: Optional[Union[bpd.DataFrame, bpd.Series]] = None, + ) -> _T: + return self._fit(X, y) + + class Transformer(BaseEstimator): """A BigQuery DataFrames Transformer base class that transforms data. diff --git a/bigframes/ml/cluster.py b/bigframes/ml/cluster.py index 2501d2b21f..14cce2879e 100644 --- a/bigframes/ml/cluster.py +++ b/bigframes/ml/cluster.py @@ -28,13 +28,13 @@ class KMeans( + base.UnsupervisedTrainablePredictor, third_party.bigframes_vendored.sklearn.cluster._kmeans.KMeans, - base.TrainablePredictor, ): __doc__ = third_party.bigframes_vendored.sklearn.cluster._kmeans.KMeans.__doc__ - def __init__(self, n_clusters=8): + def __init__(self, n_clusters: int = 8): self.n_clusters = n_clusters self._bqml_model: Optional[core.BqmlModel] = None @@ -58,7 +58,7 @@ def _bqml_options(self) -> Dict[str, str | int | float | List[str]]: """The model options as they will be set for BQML""" return {"model_type": "KMEANS", "num_clusters": self.n_clusters} - def fit( + def _fit( self, X: Union[bpd.DataFrame, bpd.Series], y=None, # ignored diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index df01303ffa..a3d3503ad0 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -33,8 +33,8 @@ class ColumnTransformer( - third_party.bigframes_vendored.sklearn.compose._column_transformer.ColumnTransformer, base.Transformer, + third_party.bigframes_vendored.sklearn.compose._column_transformer.ColumnTransformer, ): __doc__ = ( third_party.bigframes_vendored.sklearn.compose._column_transformer.ColumnTransformer.__doc__ diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 75b57f2e54..0cfe3b3ddf 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -28,12 +28,12 @@ class PCA( + base.UnsupervisedTrainablePredictor, third_party.bigframes_vendored.sklearn.decomposition._pca.PCA, - base.TrainablePredictor, ): __doc__ = third_party.bigframes_vendored.sklearn.decomposition._pca.PCA.__doc__ - def __init__(self, n_components=3): + def __init__(self, n_components: int = 3): self.n_components = n_components self._bqml_model: Optional[core.BqmlModel] = None @@ -52,7 +52,7 @@ def _from_bq(cls, session: bigframes.Session, model: bigquery.Model) -> PCA: new_pca._bqml_model = core.BqmlModel(session, model) return new_pca - def fit( + def _fit( self, X: Union[bpd.DataFrame, bpd.Series], y=None, diff --git a/bigframes/ml/ensemble.py b/bigframes/ml/ensemble.py index 56a0cc3d94..142edaa00f 100644 --- a/bigframes/ml/ensemble.py +++ b/bigframes/ml/ensemble.py @@ -48,8 +48,8 @@ class XGBRegressor( + base.SupervisedTrainablePredictor, third_party.bigframes_vendored.xgboost.sklearn.XGBRegressor, - base.TrainablePredictor, ): __doc__ = third_party.bigframes_vendored.xgboost.sklearn.XGBRegressor.__doc__ @@ -57,22 +57,22 @@ def __init__( self, num_parallel_tree: int = 1, booster: Literal["gbtree", "dart"] = "gbtree", - dart_normalized_type: Literal["TREE", "FOREST"] = "TREE", + dart_normalized_type: Literal["tree", "forest"] = "tree", tree_method: Literal["auto", "exact", "approx", "hist"] = "auto", min_tree_child_weight: int = 1, - colsample_bytree=1.0, - colsample_bylevel=1.0, - colsample_bynode=1.0, - gamma=0.0, + colsample_bytree: float = 1.0, + colsample_bylevel: float = 1.0, + colsample_bynode: float = 1.0, + gamma: float = 0.0, 
max_depth: int = 6, - subsample=1.0, - reg_alpha=0.0, - reg_lambda=1.0, - early_stop=True, - learning_rate=0.3, + subsample: float = 1.0, + reg_alpha: float = 0.0, + reg_lambda: float = 1.0, + early_stop: float = True, + learning_rate: float = 0.3, max_iterations: int = 20, - min_rel_progress=0.01, - enable_global_explain=False, + min_rel_progress: float = 0.01, + enable_global_explain: bool = False, xgboost_version: Literal["0.9", "1.1"] = "0.9", ): self.num_parallel_tree = num_parallel_tree @@ -143,7 +143,7 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: "xgboost_version": self.xgboost_version, } - def fit( + def _fit( self, X: Union[bpd.DataFrame, bpd.Series], y: Union[bpd.DataFrame, bpd.Series], @@ -211,8 +211,8 @@ def to_gbq(self, model_name: str, replace: bool = False) -> XGBRegressor: class XGBClassifier( + base.SupervisedTrainablePredictor, third_party.bigframes_vendored.xgboost.sklearn.XGBClassifier, - base.TrainablePredictor, ): __doc__ = third_party.bigframes_vendored.xgboost.sklearn.XGBClassifier.__doc__ @@ -221,22 +221,22 @@ def __init__( self, num_parallel_tree: int = 1, booster: Literal["gbtree", "dart"] = "gbtree", - dart_normalized_type: Literal["TREE", "FOREST"] = "TREE", + dart_normalized_type: Literal["tree", "forest"] = "tree", tree_method: Literal["auto", "exact", "approx", "hist"] = "auto", min_tree_child_weight: int = 1, - colsample_bytree=1.0, - colsample_bylevel=1.0, - colsample_bynode=1.0, - gamma=0.0, + colsample_bytree: float = 1.0, + colsample_bylevel: float = 1.0, + colsample_bynode: float = 1.0, + gamma: float = 0.0, max_depth: int = 6, - subsample=1.0, - reg_alpha=0.0, - reg_lambda=1.0, - early_stop=True, - learning_rate=0.3, + subsample: float = 1.0, + reg_alpha: float = 0.0, + reg_lambda: float = 1.0, + early_stop: bool = True, + learning_rate: float = 0.3, max_iterations: int = 20, - min_rel_progress=0.01, - enable_global_explain=False, + min_rel_progress: float = 0.01, + enable_global_explain: bool = False, xgboost_version: Literal["0.9", "1.1"] = "0.9", ): self.num_parallel_tree = num_parallel_tree @@ -307,7 +307,7 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: "xgboost_version": self.xgboost_version, } - def fit( + def _fit( self, X: Union[bpd.DataFrame, bpd.Series], y: Union[bpd.DataFrame, bpd.Series], @@ -374,8 +374,8 @@ def to_gbq(self, model_name: str, replace: bool = False) -> XGBClassifier: class RandomForestRegressor( + base.SupervisedTrainablePredictor, third_party.bigframes_vendored.sklearn.ensemble._forest.RandomForestRegressor, - base.TrainablePredictor, ): __doc__ = ( @@ -461,7 +461,7 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: "xgboost_version": self.xgboost_version, } - def fit( + def _fit( self, X: Union[bpd.DataFrame, bpd.Series], y: Union[bpd.DataFrame, bpd.Series], @@ -542,8 +542,8 @@ def to_gbq(self, model_name: str, replace: bool = False) -> RandomForestRegresso class RandomForestClassifier( + base.SupervisedTrainablePredictor, third_party.bigframes_vendored.sklearn.ensemble._forest.RandomForestClassifier, - base.TrainablePredictor, ): __doc__ = ( @@ -629,7 +629,7 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: "xgboost_version": self.xgboost_version, } - def fit( + def _fit( self, X: Union[bpd.DataFrame, bpd.Series], y: Union[bpd.DataFrame, bpd.Series], diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py index b7e0553ecb..22d81294fc 100644 --- a/bigframes/ml/forecasting.py +++ 
b/bigframes/ml/forecasting.py @@ -27,7 +27,7 @@ _PREDICT_OUTPUT_COLUMNS = ["forecast_timestamp", "forecast_value"] -class ARIMAPlus(base.TrainablePredictor): +class ARIMAPlus(base.SupervisedTrainablePredictor): """Time Series ARIMA Plus model.""" def __init__(self): @@ -48,7 +48,7 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: """The model options as they will be set for BQML.""" return {"model_type": "ARIMA_PLUS"} - def fit( + def _fit( self, X: Union[bpd.DataFrame, bpd.Series], y: Union[bpd.DataFrame, bpd.Series], diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index 0b18db9315..1606a15d73 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -30,8 +30,8 @@ class LinearRegression( + base.SupervisedTrainablePredictor, third_party.bigframes_vendored.sklearn.linear_model._base.LinearRegression, - base.TrainablePredictor, ): __doc__ = ( third_party.bigframes_vendored.sklearn.linear_model._base.LinearRegression.__doc__ @@ -39,7 +39,7 @@ class LinearRegression( def __init__( self, - fit_intercept=True, + fit_intercept: bool = True, ): self.fit_intercept = fit_intercept self._bqml_model: Optional[core.BqmlModel] = None @@ -71,7 +71,7 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: "fit_intercept": self.fit_intercept, } - def fit( + def _fit( self, X: Union[bpd.DataFrame, bpd.Series], y: Union[bpd.DataFrame, bpd.Series], @@ -136,8 +136,8 @@ def to_gbq(self, model_name: str, replace: bool = False) -> LinearRegression: class LogisticRegression( + base.SupervisedTrainablePredictor, third_party.bigframes_vendored.sklearn.linear_model._logistic.LogisticRegression, - base.TrainablePredictor, ): __doc__ = ( third_party.bigframes_vendored.sklearn.linear_model._logistic.LogisticRegression.__doc__ @@ -189,12 +189,13 @@ def _bqml_options(self) -> Dict[str, str | int | float | List[str]]: # "class_weights": self.class_weights, } - def fit( + def _fit( self, X: Union[bpd.DataFrame, bpd.Series], y: Union[bpd.DataFrame, bpd.Series], transforms: Optional[List[str]] = None, ) -> LogisticRegression: + """Fit model with transforms.""" X, y = utils.convert_to_dataframe(X, y) self._bqml_model = core.create_bqml_model( diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 345e3deb72..973fbf2ad9 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -100,26 +100,26 @@ def predict( Temperature controls the degree of randomness in token selection. Lower temperatures are good for prompts that expect a true or correct response, while higher temperatures can lead to more diverse or unexpected results. A temperature of 0 is deterministic: the highest probability token is always selected. For most use cases, try starting with a temperature of 0.2. - Default 0. + Default 0. Possible values [0.0, 1.0]. max_output_tokens (int, default 128): Maximum number of tokens that can be generated in the response. Specify a lower value for shorter responses and a higher value for longer responses. A token may be smaller than a word. A token is approximately four characters. 100 tokens correspond to roughly 60-80 words. - Default 128. + Default 128. Possible values [1, 1024]. top_k (int, default 40): Top-k changes how the model selects tokens for output. A top-k of 1 means the selected token is the most probable among all tokens in the model’s vocabulary (also called greedy decoding), while a top-k of 3 means that the next token is selected from among the 3 most probable tokens (using temperature). 
For each token selection step, the top K tokens with the highest probabilities are sampled. Then tokens are further filtered based on topP with the final token selected using temperature sampling. Specify a lower value for less random responses and a higher value for more random responses. - Default 40. + Default 40. Possible values [1, 40]. top_p (float, default 0.95):: Top-p changes how the model selects tokens for output. Tokens are selected from most K (see topK parameter) probable to least until the sum of their probabilities equals the top-p value. For example, if tokens A, B, and C have a probability of 0.3, 0.2, and 0.1 and the top-p value is 0.5, then the model will select either A or B as the next token (using temperature) and not consider C at all. Specify a lower value for less random responses and a higher value for more random responses. - Default 0.95. + Default 0.95. Possible values [0.0, 1.0]. Returns: diff --git a/bigframes/ml/pipeline.py b/bigframes/ml/pipeline.py index bfd0392526..bff0bf36ad 100644 --- a/bigframes/ml/pipeline.py +++ b/bigframes/ml/pipeline.py @@ -24,14 +24,14 @@ import bigframes import bigframes.constants as constants -from bigframes.ml import base, compose, loader, preprocessing, utils +from bigframes.ml import base, compose, forecasting, loader, preprocessing, utils import bigframes.pandas as bpd import third_party.bigframes_vendored.sklearn.pipeline class Pipeline( - third_party.bigframes_vendored.sklearn.pipeline.Pipeline, base.BaseEstimator, + third_party.bigframes_vendored.sklearn.pipeline.Pipeline, ): __doc__ = third_party.bigframes_vendored.sklearn.pipeline.Pipeline.__doc__ @@ -55,7 +55,7 @@ def __init__(self, steps: List[Tuple[str, base.BaseEstimator]]): self._transform = transform else: raise NotImplementedError( - f"Transform {transform} is not yet supported by Pipeline. {constants.FEEDBACK_LINK}" + f"Transformer type {type(transform)} is not yet supported by Pipeline. {constants.FEEDBACK_LINK}" ) if not isinstance( @@ -63,7 +63,13 @@ def __init__(self, steps: List[Tuple[str, base.BaseEstimator]]): base.TrainablePredictor, ): raise NotImplementedError( - f"Estimator {estimator} is not supported by Pipeline. {constants.FEEDBACK_LINK}" + f"Estimator type {type(estimator)} is not supported by Pipeline. {constants.FEEDBACK_LINK}" + ) + + # BQML doesn't support ARIMA_PLUS with transformers. b/298676367 + if isinstance(estimator, forecasting.ARIMAPlus): + raise NotImplementedError( + f"Estimator type {type(estimator)} is not supported by Pipeline. 
{constants.FEEDBACK_LINK}" ) self._transform = transform @@ -92,7 +98,7 @@ def fit( (y,) = utils.convert_to_dataframe(y) transform_sqls.extend(y.columns.tolist()) - self._estimator.fit(X=X, y=y, transforms=transform_sqls) + self._estimator._fit(X=X, y=y, transforms=transform_sqls) return self def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index ee46a37052..8add7bdd76 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -29,8 +29,8 @@ class StandardScaler( - third_party.bigframes_vendored.sklearn.preprocessing._data.StandardScaler, base.Transformer, + third_party.bigframes_vendored.sklearn.preprocessing._data.StandardScaler, ): __doc__ = ( third_party.bigframes_vendored.sklearn.preprocessing._data.StandardScaler.__doc__ @@ -105,8 +105,8 @@ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: class OneHotEncoder( - third_party.bigframes_vendored.sklearn.preprocessing._encoder.OneHotEncoder, base.Transformer, + third_party.bigframes_vendored.sklearn.preprocessing._encoder.OneHotEncoder, ): # BQML max value https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-one-hot-encoder#syntax TOP_K_DEFAULT = 1000000 diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 874c264194..23271e8220 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -19,8 +19,10 @@ import ibis import ibis.expr.datatypes as ibis_dtypes import ibis.expr.types as ibis_types +from pandas import Int64Dtype import bigframes.constants as constants +import bigframes.dtypes as dtypes import third_party.bigframes_vendored.ibis.expr.operations as vendored_ibis_ops @@ -202,6 +204,16 @@ def _as_ibis(self, x: ibis_types.Column, window=None) -> ibis_types.Value: ) +class PopVarOp(AggregateOp): + name = "popvar" + + @numeric_op + def _as_ibis(self, x: ibis_types.Column, window=None) -> ibis_types.Value: + return _apply_window_if_present( + typing.cast(ibis_types.NumericColumn, x).var(how="pop"), window + ) + + class CountOp(AggregateOp): name = "count" @@ -217,16 +229,20 @@ def skips_nulls(self): class CutOp(WindowOp): def __init__(self, bins: int): - self._bins = bins + self._bins_ibis = dtypes.literal_to_ibis_scalar(bins, force_dtype=Int64Dtype()) + self._bins_int = bins def _as_ibis(self, x: ibis_types.Column, window=None): col_min = _apply_window_if_present(x.min(), window) col_max = _apply_window_if_present(x.max(), window) - bin_width = (col_max - col_min) / self._bins + bin_width = (col_max - col_min) / self._bins_ibis out = ibis.case() - for bin in range(self._bins - 1): - out = out.when(x <= (col_min + (bin + 1) * bin_width), bin) - out = out.when(x.notnull(), self._bins - 1) + for this_bin in range(self._bins_int - 1): + out = out.when( + x <= (col_min + (this_bin + 1) * bin_width), + dtypes.literal_to_ibis_scalar(this_bin, force_dtype=Int64Dtype()), + ) + out = out.when(x.notnull(), self._bins_ibis - 1) return out.end() @property @@ -305,6 +321,28 @@ def _as_ibis(self, column: ibis_types.Column, window=None) -> ibis_types.Value: return _apply_window_if_present(column.first(), window) +class FirstNonNullOp(WindowOp): + @property + def skips_nulls(self): + return False + + def _as_ibis(self, column: ibis_types.Column, window=None) -> ibis_types.Value: + return _apply_window_if_present( + vendored_ibis_ops.FirstNonNullValue(column).to_expr(), window # type: ignore + ) + + 
+class LastNonNullOp(WindowOp): + @property + def skips_nulls(self): + return False + + def _as_ibis(self, column: ibis_types.Column, window=None) -> ibis_types.Value: + return _apply_window_if_present( + vendored_ibis_ops.LastNonNullValue(column).to_expr(), window # type: ignore + ) + + class ShiftOp(WindowOp): def __init__(self, periods: int): self._periods = periods @@ -321,6 +359,28 @@ def skips_nulls(self): return False +class DiffOp(WindowOp): + def __init__(self, periods: int): + self._periods = periods + + def _as_ibis(self, column: ibis_types.Column, window=None) -> ibis_types.Value: + shifted = ShiftOp(self._periods)._as_ibis(column, window) + if column.type().is_boolean(): + return typing.cast(ibis_types.BooleanColumn, column) != typing.cast( + ibis_types.BooleanColumn, shifted + ) + elif column.type().is_numeric(): + return typing.cast(ibis_types.NumericColumn, column) - typing.cast( + ibis_types.NumericColumn, shifted + ) + else: + raise TypeError(f"Cannot perform diff on type{column.type()}") + + @property + def skips_nulls(self): + return False + + class AllOp(AggregateOp): def _as_ibis( self, column: ibis_types.Column, window=None diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index e1a23e67a1..cb27834590 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -399,6 +399,7 @@ def remote_function( dataset: Optional[str] = None, bigquery_connection: Optional[str] = None, reuse: bool = True, + name: Optional[str] = None, ): return global_session.with_default_session( bigframes.session.Session.remote_function, @@ -407,6 +408,7 @@ def remote_function( dataset=dataset, bigquery_connection=bigquery_connection, reuse=reuse, + name=name, ) diff --git a/bigframes/remote_function.py b/bigframes/remote_function.py index 6932e5b580..6fc2f8e59f 100644 --- a/bigframes/remote_function.py +++ b/bigframes/remote_function.py @@ -28,6 +28,8 @@ import textwrap from typing import List, NamedTuple, Optional, Sequence, TYPE_CHECKING +import requests + if TYPE_CHECKING: from bigframes.session import Session @@ -99,7 +101,7 @@ def get_remote_function_locations(bq_location): def _get_hash(def_): - "Get hash of a function." + "Get hash (32 digits alphanumeric) of a function." def_repr = cloudpickle.dumps(def_, protocol=_pickle_protocol_version) return hashlib.md5(def_repr).hexdigest() @@ -128,7 +130,7 @@ class IbisSignature(NamedTuple): def get_cloud_function_name(def_, uniq_suffix=None): - """Get the name of the cloud function.""" + "Get a name for the cloud function for the given user defined function." cf_name = _get_hash(def_) cf_name = f"bigframes-{cf_name}" # for identification if uniq_suffix: @@ -137,7 +139,7 @@ def get_cloud_function_name(def_, uniq_suffix=None): def get_remote_function_name(def_, uniq_suffix=None): - """Get the name for the BQ remote function.""" + "Get a name for the BQ remote function for the given user defined function." bq_rf_name = _get_hash(def_) bq_rf_name = f"bigframes_{bq_rf_name}" # for identification if uniq_suffix: @@ -206,9 +208,15 @@ def create_bq_remote_function( query_job.result() # Wait for the job to complete. logger.info(f"Created remote function {query_job.ddl_target_routine}") + def get_cloud_function_fully_qualified_parent(self): + "Get the fully qualilfied parent for a cloud function." 
+ return self._cloud_functions_client.common_location_path( + self._gcp_project_id, self._cloud_function_region + ) + def get_cloud_function_fully_qualified_name(self, name): "Get the fully qualilfied name for a cloud function." - return "projects/{}/locations/{}/functions/{}".format( + return self._cloud_functions_client.function_path( self._gcp_project_id, self._cloud_function_region, name ) @@ -319,6 +327,7 @@ def create_cloud_function(self, def_, cf_name): # Build and deploy folder structure containing cloud function with tempfile.TemporaryDirectory() as dir: entry_point = self.generate_cloud_function_code(def_, dir) + archive_path = shutil.make_archive(dir, "zip", dir) # We are creating cloud function source code from the currently running # python version. Use the same version to deploy. This is necessary @@ -331,50 +340,56 @@ def create_cloud_function(self, def_, cf_name): sys.version_info.major, sys.version_info.minor ) - # deploy/redeploy the cloud function - # TODO(shobs): Figure out a way to skip this step if a cloud function - # already exists with the same name and source code - command = ( - "gcloud functions deploy" - + f" {cf_name} --gen2" - + f" --runtime={python_version}" - + f" --project={self._gcp_project_id}" - + f" --region={self._cloud_function_region}" - + f" --source={dir}" - + f" --entry-point={entry_point}" - + " --trigger-http" + # Determine an upload URL for user code + upload_url_request = functions_v2.GenerateUploadUrlRequest() + upload_url_request.parent = self.get_cloud_function_fully_qualified_parent() + upload_url_response = self._cloud_functions_client.generate_upload_url( + request=upload_url_request ) - # If the cloud function is being created for the first time, then let's - # make it not allow unauthenticated calls. If it was previously created - # then this invocation will update it, in which case do not touch that - # aspect and let the previous policy hold. The reason we do this is to - # avoid an IAM permission needed to update the invocation policy. - # For example, when a cloud function is being created for the first - # time, i.e. - # $ gcloud functions deploy python-foo-http --gen2 --runtime=python310 - # --region=us-central1 - # --source=/source/code/dir - # --entry-point=foo_http - # --trigger-http - # --no-allow-unauthenticated - # It works. When an invocation of the same command is done for the - # second time, it may run into an error like: - # ERROR: (gcloud.functions.deploy) PERMISSION_DENIED: Permission - # 'run.services.setIamPolicy' denied on resource - # 'projects/my_project/locations/us-central1/services/python-foo-http' (or resource may not exist) - # But when --no-allow-unauthenticated is omitted then it goes through. - # It suggests that in the second invocation the command is trying to set - # the IAM policy of the service, and the user running BigQuery - # DataFrame may not have privilege to do so, so better avoid this - # if we can. - if self.get_cloud_function_endpoint(cf_name): - logger.info(f"Updating existing cloud function: {command}") - else: - command = f"{command} --no-allow-unauthenticated" - logger.info(f"Creating new cloud function: {command}") - - _run_system_command(command) + # Upload the code to GCS + with open(archive_path, "rb") as f: + response = requests.put( + upload_url_response.upload_url, + data=f, + headers={"content-type": "application/zip"}, + ) + if response.status_code != 200: + raise RuntimeError( + "Failed to upload user code. 
code={}, reason={}, text={}".format( + response.status_code, response.reason, response.text + ) + ) + + # Deploy Cloud Function + create_function_request = functions_v2.CreateFunctionRequest() + create_function_request.parent = ( + self.get_cloud_function_fully_qualified_parent() + ) + create_function_request.function_id = cf_name + function = functions_v2.Function() + function.name = self.get_cloud_function_fully_qualified_name(cf_name) + function.build_config = functions_v2.BuildConfig() + function.build_config.runtime = python_version + function.build_config.entry_point = entry_point + function.build_config.source = functions_v2.Source() + function.build_config.source.storage_source = functions_v2.StorageSource() + function.build_config.source.storage_source.bucket = ( + upload_url_response.storage_source.bucket + ) + function.build_config.source.storage_source.object_ = ( + upload_url_response.storage_source.object_ + ) + create_function_request.function = function + + # Create the cloud function and wait for it to be ready to use + operation = self._cloud_functions_client.create_function( + request=create_function_request + ) + operation.result() + + # Cleanup + os.remove(archive_path) # Fetch the endpoint of the just created function endpoint = self.get_cloud_function_endpoint(cf_name) @@ -389,23 +404,47 @@ def create_cloud_function(self, def_, cf_name): return endpoint def provision_bq_remote_function( - self, def_, input_types, output_type, uniq_suffix=None + self, + def_, + input_types, + output_type, + reuse, + name, ): """Provision a BigQuery remote function.""" - # Derive the name of the underlying cloud function and first create - # it if it does not exist + # If reuse of any existing function with the same name (indicated by the + # same hash of its source code) is not intended, then attach a unique + # suffix to the intended function name to make it unique. + uniq_suffix = None + if not reuse: + uniq_suffix = "".join( + random.choices(string.ascii_lowercase + string.digits, k=8) + ) + + # Derive the name of the cloud function underlying the intended BQ + # remote function cloud_function_name = get_cloud_function_name(def_, uniq_suffix) cf_endpoint = self.get_cloud_function_endpoint(cloud_function_name) + + # Create the cloud function if it does not exist if not cf_endpoint: - self.check_cloud_function_tools_and_permissions() cf_endpoint = self.create_cloud_function(def_, cloud_function_name) else: logger.info(f"Cloud function {cloud_function_name} already exists.") - # Derive the name of the remote function and create/replace it if needed - remote_function_name = get_remote_function_name(def_, uniq_suffix) + # Derive the name of the remote function + remote_function_name = name + if not remote_function_name: + remote_function_name = get_remote_function_name(def_, uniq_suffix) rf_endpoint, rf_conn = self.get_remote_function_specs(remote_function_name) - if rf_endpoint != cf_endpoint or rf_conn != self._bq_connection_id: + + # Create the BQ remote function in following circumstances: + # 1. It does not exist + # 2. 
It exists but the existing remote function has different + # configuration than intended + if not rf_endpoint or ( + rf_endpoint != cf_endpoint or rf_conn != self._bq_connection_id + ): input_args = inspect.getargs(def_.__code__).args if len(input_args) != len(input_types): raise ValueError( @@ -439,27 +478,6 @@ def get_remote_function_specs(self, remote_function_name): break return (http_endpoint, bq_connection) - def check_cloud_function_tools_and_permissions(self): - """Check if the necessary tools and permissions are in place for creating remote function""" - # gcloud CLI comes with bq CLI and they are required for creating google - # cloud function and BigQuery remote function respectively - if not shutil.which("gcloud"): - raise ValueError( - "gcloud tool not installed, install it from https://cloud.google.com/sdk/docs/install. " - f"{constants.FEEDBACK_LINK}" - ) - - # TODO(shobs): Check for permissions too - # I (shobs) tried the following method - # $ gcloud asset search-all-iam-policies \ - # --format=json \ - # --scope=projects/{gcp_project_id} \ - # --query='policy.role.permissions:cloudfunctions.functions.create' - # as a proxy to all the privilges necessary to create cloud function - # https://cloud.google.com/functions/docs/reference/iam/roles#cloudfunctions.developer - # but that itself required the runner to have the permission to enable - # `cloudasset.googleapis.com` - def remote_function_node( routine_ref: bigquery.RoutineReference, ibis_signature: IbisSignature @@ -583,6 +601,7 @@ def remote_function( dataset: Optional[str] = None, bigquery_connection: Optional[str] = None, reuse: bool = True, + name: Optional[str] = None, ): """Decorator to turn a user defined function into a BigQuery remote function. @@ -613,7 +632,7 @@ def remote_function( * BigQuery Data Editor (roles/bigquery.dataEditor) * BigQuery Connection Admin (roles/bigquery.connectionAdmin) * Cloud Functions Developer (roles/cloudfunctions.developer) - * Service Account User (roles/iam.serviceAccountUser) + * Service Account User (roles/iam.serviceAccountUser) on the service account `PROJECT_NUMBER-compute@developer.gserviceaccount.com` * Storage Object Viewer (roles/storage.objectViewer) * Project IAM Admin (roles/resourcemanager.projectIamAdmin) (Only required if the bigquery connection being used is not pre-created and is created dynamically with user credentials.) @@ -664,10 +683,16 @@ def remote_function( reuse (bool, Optional): Reuse the remote function if is already exists. `True` by default, which results in reusing an existing remote - function (if any) that was previously created for the same udf. - Setting it to false forces the creation of creating a unique remote function. + function and corresponding cloud function (if any) that was + previously created for the same udf. + Setting it to `False` forces the creation of a unique remote function. If the required remote function does not exist then it would be created irrespective of this param. + name (str, Optional): + Explicit name of the persisted BigQuery remote function. Use it with + caution, because two users working in the same project and dataset + could overwrite each other's remote functions if they use the same + persistent name. """ @@ -739,12 +764,6 @@ def remote_function( f"{constants.FEEDBACK_LINK}" ) - uniq_suffix = None - if not reuse: - uniq_suffix = "".join( - random.choices(string.ascii_lowercase + string.digits, k=8) - ) - # Check connection_id with `LOCATION.CONNECTION_ID` or `PROJECT_ID.LOCATION.CONNECTION_ID` format. 
if bigquery_connection.count(".") == 1: bq_connection_location, bq_connection_id = bigquery_connection.split(".") @@ -792,8 +811,13 @@ def wrapper(f): bigquery_connection, resource_manager_client, ) + rf_name, cf_name = remote_function_client.provision_bq_remote_function( - f, ibis_signature.input_types, ibis_signature.output_type, uniq_suffix + f, + ibis_signature.input_types, + ibis_signature.output_type, + reuse, + name, ) node = remote_function_node(dataset_ref.routine(rf_name), ibis_signature) diff --git a/bigframes/series.py b/bigframes/series.py index 8eadee37ed..12e72c58b6 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -42,6 +42,7 @@ STABLE_SORTS, ) import bigframes.core.scalar as scalars +import bigframes.core.utils as utils import bigframes.core.window import bigframes.dataframe import bigframes.dtypes @@ -310,11 +311,20 @@ def drop( block = block.drop_columns([condition_id]) return Series(block.select_column(self._value_column)) - def droplevel(self, level: LevelsType): + def droplevel(self, level: LevelsType, axis: int | str = 0): resolved_level_ids = self._resolve_levels(level) return Series(self._block.drop_levels(resolved_level_ids)) - def reorder_levels(self, order: LevelsType): + def swaplevel(self, i: int = -2, j: int = -1): + level_i = self._block.index_columns[i] + level_j = self._block.index_columns[j] + mapping = {level_i: level_j, level_j: level_i} + reordering = [ + mapping.get(index_id, index_id) for index_id in self._block.index_columns + ] + return Series(self._block.reorder_levels(reordering)) + + def reorder_levels(self, order: LevelsType, axis: int | str = 0): resolved_level_ids = self._resolve_levels(order) return Series(self._block.reorder_levels(resolved_level_ids)) @@ -352,6 +362,14 @@ def cumsum(self) -> Series: agg_ops.sum_op, bigframes.core.WindowSpec(following=0) ) + def ffill(self, *, limit: typing.Optional[int] = None) -> Series: + window = bigframes.core.WindowSpec(preceding=limit, following=0) + return self._apply_window_op(agg_ops.LastNonNullOp(), window) + + def bfill(self, *, limit: typing.Optional[int] = None) -> Series: + window = bigframes.core.WindowSpec(preceding=0, following=limit) + return self._apply_window_op(agg_ops.FirstNonNullOp(), window) + def cummax(self) -> Series: return self._apply_window_op( agg_ops.max_op, bigframes.core.WindowSpec(following=0) @@ -375,7 +393,16 @@ def shift(self, periods: int = 1) -> Series: return self._apply_window_op(agg_ops.ShiftOp(periods), window) def diff(self, periods: int = 1) -> Series: - return self - self.shift(periods=periods) + window = bigframes.core.WindowSpec( + preceding=periods if periods > 0 else None, + following=-periods if periods < 0 else None, + ) + return self._apply_window_op(agg_ops.DiffOp(periods), window) + + def pct_change(self, periods: int = 1) -> Series: + # Future versions of pandas will not perfrom ffill automatically + series = self.ffill() + return Series(block_ops.pct_change(series._block, periods=periods)) def rank( self, @@ -390,6 +417,47 @@ def rank( def fillna(self, value=None) -> Series: return self._apply_binary_op(value, ops.fillna_op) + def replace( + self, to_replace: typing.Any, value: typing.Any = None, *, regex: bool = False + ): + if regex: + if not (isinstance(to_replace, str) and isinstance(value, str)): + raise NotImplementedError( + f"replace regex mode only supports strings for 'to_replace' and 'value'. 
{constants.FEEDBACK_LINK}" + ) + block, result_col = self._block.apply_unary_op( + self._value_column, + ops.ReplaceRegexOp(to_replace, value), + result_label=self.name, + ) + return Series(block.select_column(result_col)) + elif utils.is_dict_like(to_replace): + raise NotImplementedError( + f"Dict 'to_replace' not supported. {constants.FEEDBACK_LINK}" + ) + elif utils.is_list_like(to_replace): + block, cond = self._block.apply_unary_op( + self._value_column, ops.IsInOp(to_replace) + ) + block, result_col = block.apply_binary_op( + cond, + self._value_column, + ops.partial_arg1(ops.where_op, value), + result_label=self.name, + ) + return Series(block.select_column(result_col)) + else: # Scalar + block, cond = self._block.apply_unary_op( + self._value_column, ops.BinopPartialLeft(ops.eq_op, to_replace) + ) + block, result_col = block.apply_binary_op( + cond, + self._value_column, + ops.partial_arg1(ops.where_op, value), + result_label=self.name, + ) + return Series(block.select_column(result_col)) + def dropna( self, *, @@ -414,52 +482,16 @@ def tail(self, n: int = 5) -> Series: def nlargest(self, n: int = 5, keep: str = "first") -> Series: if keep not in ("first", "last", "all"): raise ValueError("'keep must be one of 'first', 'last', or 'all'") - block = self._block - if keep == "last": - block = block.reversed() - ordering = ( - OrderingColumnReference( - self._value_column, direction=OrderingDirection.DESC - ), + return Series( + block_ops.nlargest(self._block, n, [self._value_column], keep=keep) ) - block = block.order_by(ordering, stable=True) - if keep in ("first", "last"): - return Series(block.slice(0, n)) - else: # keep == "all": - block, counter = block.apply_window_op( - self._value_column, - agg_ops.rank_op, - window_spec=WindowSpec(ordering=ordering), - ) - block, condition = block.apply_unary_op( - counter, ops.partial_right(ops.le_op, n) - ) - block = block.filter(condition) - block = block.select_column(self._value_column) - return Series(block) def nsmallest(self, n: int = 5, keep: str = "first") -> Series: if keep not in ("first", "last", "all"): raise ValueError("'keep must be one of 'first', 'last', or 'all'") - block = self._block - if keep == "last": - block = block.reversed() - ordering = (OrderingColumnReference(self._value_column),) - block = block.order_by(ordering, stable=True) - if keep in ("first", "last"): - return Series(block.slice(0, n)) - else: # keep == "all": - block, counter = block.apply_window_op( - self._value_column, - agg_ops.rank_op, - window_spec=WindowSpec(ordering=ordering), - ) - block, condition = block.apply_unary_op( - counter, ops.partial_right(ops.le_op, n) - ) - block = block.filter(condition) - block = block.select_column(self._value_column) - return Series(block) + return Series( + block_ops.nsmallest(self._block, n, [self._value_column], keep=keep) + ) def isin(self, values) -> "Series" | None: if not _is_list_like(values): @@ -697,13 +729,9 @@ def _central_moment(self, n: int) -> float: """Useful helper for calculating central moment statistics""" # Nth central moment is mean((x-mean(x))^n) # See: https://en.wikipedia.org/wiki/Moment_(mathematics) - mean = self.mean() - mean_deltas = self - mean - delta_power = mean_deltas - # TODO(tbergeron): Replace with pow once implemented - for i in range(1, n): - delta_power = delta_power * mean_deltas - return delta_power.mean() + mean_deltas = self - self.mean() + delta_powers = mean_deltas**n + return delta_powers.mean() def agg(self, func: str | typing.Sequence[str]) -> scalars.Scalar | 
Series: if _is_list_like(func): @@ -1096,6 +1124,85 @@ def add_prefix(self, prefix: str, axis: int | str | None = None) -> Series: def add_suffix(self, suffix: str, axis: int | str | None = None) -> Series: return Series(self._get_block().add_suffix(suffix)) + def filter( + self, + items: typing.Optional[typing.Iterable] = None, + like: typing.Optional[str] = None, + regex: typing.Optional[str] = None, + axis: typing.Optional[typing.Union[str, int]] = None, + ) -> Series: + if (axis is not None) and utils.get_axis_number(axis) != 0: + raise ValueError(f"Invalid axis for series: {axis}") + if sum([(items is not None), (like is not None), (regex is not None)]) != 1: + raise ValueError( + "Need to provide exactly one of 'items', 'like', or 'regex'" + ) + if len(self._block.index_columns) > 1: + raise NotImplementedError( + "Method filter does not support rows multiindex. {constants.FEEDBACK_LINK}" + ) + if (like is not None) or (regex is not None): + block = self._block + block, label_string_id = block.apply_unary_op( + self._block.index_columns[0], + ops.AsTypeOp(pandas.StringDtype(storage="pyarrow")), + ) + if like is not None: + block, mask_id = block.apply_unary_op( + label_string_id, ops.ContainsStringOp(pat=like) + ) + else: # regex + assert regex is not None + block, mask_id = block.apply_unary_op( + label_string_id, ops.ContainsRegexOp(pat=regex) + ) + + block = block.filter(mask_id) + block = block.select_columns([self._value_column]) + return Series(block) + elif items is not None: + # Behavior matches pandas 2.1+, older pandas versions would reindex + block = self._block + block, mask_id = block.apply_unary_op( + self._block.index_columns[0], ops.IsInOp(values=list(items)) + ) + block = block.filter(mask_id) + block = block.select_columns([self._value_column]) + return Series(block) + else: + raise ValueError("Need to provide 'items', 'like', or 'regex'") + + def reindex(self, index=None, *, validate: typing.Optional[bool] = None): + if validate and not self.index.is_unique: + raise ValueError("Original index must be unique to reindex") + keep_original_names = False + if isinstance(index, indexes.Index): + new_indexer = bigframes.dataframe.DataFrame(data=index._data._get_block())[ + [] + ] + else: + if not isinstance(index, pandas.Index): + keep_original_names = True + index = pandas.Index(index) + if index.nlevels != self.index.nlevels: + raise NotImplementedError( + "Cannot reindex with index with different nlevels" + ) + new_indexer = bigframes.dataframe.DataFrame(index=index)[[]] + # multiindex join is senstive to index names, so we will set all these + result = new_indexer.rename_axis(range(new_indexer.index.nlevels)).join( + self.to_frame().rename_axis(range(self.index.nlevels)), + how="left", + ) + # and then reset the names after the join + result_block = result.rename_axis( + self.index.names if keep_original_names else index.names + )._block + return Series(result_block) + + def reindex_like(self, other: Series, *, validate: typing.Optional[bool] = None): + return self.reindex(other.index, validate=validate) + def drop_duplicates(self, *, keep: str = "first") -> Series: block = block_ops.drop_duplicates(self._block, (self._value_column,), keep) return Series(block) @@ -1216,14 +1323,7 @@ def to_xarray(self): def _throw_if_index_contains_duplicates( self, error_message: typing.Optional[str] = None ) -> None: - duplicates_block, _ = block_ops.indicate_duplicates( - self._get_block(), self._get_block().index_columns - ) - duplicates_block = 
duplicates_block.with_column_labels( - ["values", "is_duplicate"] - ) - duplicates_df = bigframes.dataframe.DataFrame(duplicates_block) - if duplicates_df["is_duplicate"].any(): + if not self.index.is_unique: error_message = ( error_message if error_message diff --git a/bigframes/session.py b/bigframes/session.py index 1744407772..3ca79a7b53 100644 --- a/bigframes/session.py +++ b/bigframes/session.py @@ -424,25 +424,30 @@ def read_gbq( ) -> dataframe.DataFrame: # TODO(b/281571214): Generate prompt to show the progress of read_gbq. if _is_query(query): - return self.read_gbq_query( + return self._read_gbq_query( query, index_col=index_col, col_order=col_order, max_results=max_results, + api_name="read_gbq", ) else: # TODO(swast): Query the snapshot table but mark it as a # deterministic query so we can avoid serializing if we have a # unique index. - return self.read_gbq_table( + return self._read_gbq_table( query, index_col=index_col, col_order=col_order, max_results=max_results, + api_name="read_gbq", ) def _query_to_destination( - self, query: str, index_cols: List[str] + self, + query: str, + index_cols: List[str], + api_name: str, ) -> Tuple[Optional[bigquery.TableReference], Optional[bigquery.QueryJob]]: # If there are no index columns, then there's no reason to cache to a # (clustered) session table, as we'll just have to query it again to @@ -464,7 +469,7 @@ def _query_to_destination( # operations are as speedy as they can be. try: ibis_expr = self.ibis_client.sql(query) - return self._ibis_to_session_table(ibis_expr, index_cols), None + return self._ibis_to_session_table(ibis_expr, index_cols, api_name), None except google.api_core.exceptions.BadRequest: # Some SELECT statements still aren't compatible with CREATE TEMP # TABLE ... AS SELECT ... statements. For example, if the query has @@ -490,15 +495,33 @@ def read_gbq_query( See also: :meth:`Session.read_gbq`. """ + return self._read_gbq_query( + query=query, + index_col=index_col, + col_order=col_order, + max_results=max_results, + api_name="read_gbq_query", + ) + + def _read_gbq_query( + self, + query: str, + *, + index_col: Iterable[str] | str = (), + col_order: Iterable[str] = (), + max_results: Optional[int] = None, + api_name: str, + ) -> dataframe.DataFrame: # NOTE: This method doesn't (yet) exist in pandas or pandas-gbq, so # these docstrings are inline. - if isinstance(index_col, str): index_cols = [index_col] else: index_cols = list(index_col) - destination, query_job = self._query_to_destination(query, index_cols) + destination, query_job = self._query_to_destination( + query, index_cols, api_name="read_gbq_query" + ) # If there was no destination table, that means the query must have # been DDL or DML. Return some job metadata, instead. @@ -535,6 +558,23 @@ def read_gbq_table( See also: :meth:`Session.read_gbq`. 
""" + return self._read_gbq_table( + query=query, + index_col=index_col, + col_order=col_order, + max_results=max_results, + api_name="read_gbq_table", + ) + + def _read_gbq_table( + self, + query: str, + *, + index_col: Iterable[str] | str = (), + col_order: Iterable[str] = (), + max_results: Optional[int] = None, + api_name: str, + ) -> dataframe.DataFrame: if max_results and max_results <= 0: raise ValueError("`max_results` should be a positive number.") @@ -646,7 +686,8 @@ def read_gbq_table( # rows for which row numbers must be generated table_expression = table_expression.limit(max_results) table_expression, ordering = self._create_sequential_ordering( - table_expression + table=table_expression, + api_name=api_name, ) hidden_cols = ( (ordering.total_order_col.column_id,) @@ -667,6 +708,7 @@ def read_gbq_table( hidden_cols=hidden_cols, ordering=ordering, is_total_ordering=is_total_ordering, + api_name=api_name, ) def _read_gbq_with_ordering( @@ -680,6 +722,7 @@ def _read_gbq_with_ordering( hidden_cols: Iterable[str] = (), ordering: core.ExpressionOrdering, is_total_ordering: bool = False, + api_name: str, ) -> dataframe.DataFrame: """Internal helper method that loads DataFrame from Google BigQuery given an ordering column. @@ -698,6 +741,8 @@ def _read_gbq_with_ordering( Columns that should be hidden. Ordering columns may (not always) be hidden ordering: Column name to be used for ordering. If not supplied, a default ordering is generated. + api_name: + The name of the API method. Returns: A DataFrame representing results of the query or table. @@ -723,7 +768,9 @@ def _read_gbq_with_ordering( if not is_total_ordering: # Rows are not ordered, we need to generate a default ordering and materialize it table_expression, ordering = self._create_sequential_ordering( - table_expression, index_cols + table=table_expression, + index_cols=index_cols, + api_name=api_name, ) index_col_values = [table_expression[index_id] for index_id in index_cols] if not col_labels: @@ -846,6 +893,11 @@ def read_pandas(self, pandas_dataframe: pandas.DataFrame) -> dataframe.DataFrame Returns: bigframes.dataframe.DataFrame: The BigQuery DataFrame. """ + return self._read_pandas(pandas_dataframe, "read_pandas") + + def _read_pandas( + self, pandas_dataframe: pandas.DataFrame, api_name: str + ) -> dataframe.DataFrame: col_labels, idx_labels = ( pandas_dataframe.columns.to_list(), pandas_dataframe.index.names, @@ -878,6 +930,7 @@ def read_pandas(self, pandas_dataframe: pandas.DataFrame) -> dataframe.DataFrame job_config = bigquery.LoadJobConfig(schema=schema) job_config.clustering_fields = cluster_cols + job_config.labels = {"bigframes-api": api_name} load_table_destination = self._create_session_table() load_job = self.bqclient.load_table_from_dataframe( @@ -910,6 +963,7 @@ def read_pandas(self, pandas_dataframe: pandas.DataFrame) -> dataframe.DataFrame hidden_cols=(ordering_col,), ordering=ordering, is_total_ordering=True, + api_name=api_name, ) return df @@ -991,6 +1045,7 @@ def read_csv( job_config.autodetect = True job_config.field_delimiter = sep job_config.encoding = encoding + job_config.labels = {"bigframes-api": "read_csv"} # We want to match pandas behavior. If header is 0, no rows should be skipped, so we # do not need to set `skip_leading_rows`. If header is None, then there is no header. 
@@ -1048,7 +1103,7 @@ def read_pickle( pandas_obj.name = "0" bigframes_df = self.read_pandas(pandas_obj.to_frame()) return bigframes_df[bigframes_df.columns[0]] - return self.read_pandas(pandas_obj) + return self._read_pandas(pandas_obj, "read_pickle") def read_parquet( self, @@ -1063,6 +1118,7 @@ def read_parquet( job_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED job_config.source_format = bigquery.SourceFormat.PARQUET job_config.write_disposition = bigquery.WriteDisposition.WRITE_EMPTY + job_config.labels = {"bigframes-api": "read_parquet"} return self._read_bigquery_load_job(path, table, job_config=job_config) @@ -1109,6 +1165,7 @@ def read_json( job_config.write_disposition = bigquery.WriteDisposition.WRITE_EMPTY job_config.autodetect = True job_config.encoding = encoding + job_config.labels = {"bigframes-api": "read_json"} return self._read_bigquery_load_job( path_or_buf, @@ -1176,7 +1233,10 @@ def _create_session_table(self) -> bigquery.TableReference: return dataset.table(table_name) def _create_sequential_ordering( - self, table: ibis_types.Table, index_cols: Iterable[str] = () + self, + table: ibis_types.Table, + index_cols: Iterable[str] = (), + api_name: str = "", ) -> Tuple[ibis_types.Table, core.ExpressionOrdering]: # Since this might also be used as the index, don't use the default # "ordering ID" name. @@ -1188,6 +1248,7 @@ def _create_sequential_ordering( table_ref = self._ibis_to_session_table( table, cluster_cols=list(index_cols) + [default_ordering_name], + api_name=api_name, ) table = self.ibis_client.sql(f"SELECT * FROM `{table_ref.table_id}`") ordering_reference = core.OrderingColumnReference(default_ordering_name) @@ -1199,7 +1260,10 @@ def _create_sequential_ordering( return table, ordering def _ibis_to_session_table( - self, table: ibis_types.Table, cluster_cols: Iterable[str] + self, + table: ibis_types.Table, + cluster_cols: Iterable[str], + api_name: str, ) -> bigquery.TableReference: clusterable_cols = [ col for col in cluster_cols if _can_cluster(table[col].type()) @@ -1207,10 +1271,14 @@ def _ibis_to_session_table( return self._query_to_session_table( self.ibis_client.compile(table), cluster_cols=clusterable_cols, + api_name=api_name, ) def _query_to_session_table( - self, query_text: str, cluster_cols: Iterable[str] + self, + query_text: str, + cluster_cols: Iterable[str], + api_name: str, ) -> bigquery.TableReference: if len(list(cluster_cols)) > _MAX_CLUSTER_COLUMNS: raise ValueError( @@ -1236,6 +1304,7 @@ def _query_to_session_table( # otherwise we get `BadRequest: 400 OPTIONS on temporary tables are not # supported`. job_config.labels = {"source": "bigquery-dataframes-temp"} + job_config.labels["bigframes-api"] = api_name try: self._start_query( @@ -1253,6 +1322,7 @@ def remote_function( dataset: Optional[str] = None, bigquery_connection: Optional[str] = None, reuse: bool = True, + name: Optional[str] = None, ): """Decorator to turn a user defined function into a BigQuery remote function. 
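        A usage sketch for the `reuse` and `name` parameters (the dataset and
        connection names below are placeholders):

            import bigframes.pandas as bpd

            @bpd.remote_function(
                [float],
                str,
                dataset="my_dataset",
                bigquery_connection="my_connection",
                reuse=False,
                name="my_bucketize",  # persisted as my_dataset.my_bucketize
            )
            def bucketize(x):
                return "small" if x < 4000 else "large"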
@@ -1280,7 +1350,7 @@ def remote_function( * BigQuery Data Editor (roles/bigquery.dataEditor) * BigQuery Connection Admin (roles/bigquery.connectionAdmin) * Cloud Functions Developer (roles/cloudfunctions.developer) - * Service Account User (roles/iam.serviceAccountUser) + * Service Account User (roles/iam.serviceAccountUser) on the service account `PROJECT_NUMBER-compute@developer.gserviceaccount.com` * Storage Object Viewer (roles/storage.objectViewer) * Project IAM Admin (roles/resourcemanager.projectIamAdmin) (Only required if the bigquery connection being used is not pre-created and is created dynamically with user credentials.) @@ -1311,10 +1381,16 @@ def remote_function( reuse (bool, Optional): Reuse the remote function if already exists. `True` by default, which will result in reusing an existing remote - function (if any) that was previously created for the same udf. - Setting it to false would force creating a unique remote function. + function and corresponding cloud function (if any) that was + previously created for the same udf. + Setting it to `False` would force creating a unique remote function. If the required remote function does not exist then it would be created irrespective of this param. + name (str, Optional): + Explicit name of the persisted BigQuery remote function. Use it with + caution, because two users working in the same project and dataset + could overwrite each other's remote functions if they use the same + persistent name. Returns: callable: A remote function object pointing to the cloud assets created in the background to support the remote execution. The cloud assets can be @@ -1331,6 +1407,7 @@ def remote_function( dataset=dataset, bigquery_connection=bigquery_connection, reuse=reuse, + name=name, ) def read_gbq_function( diff --git a/noxfile.py b/noxfile.py index 2355a9b27b..033bbfefe4 100644 --- a/noxfile.py +++ b/noxfile.py @@ -42,6 +42,7 @@ "pytest", "pytest-cov", "pytest-asyncio", + "pytest-mock", ] UNIT_TEST_EXTERNAL_DEPENDENCIES: List[str] = [] UNIT_TEST_LOCAL_DEPENDENCIES: List[str] = [] diff --git a/samples/snippets/remote_function.py b/samples/snippets/remote_function.py index 37972672c3..9998a23eb2 100644 --- a/samples/snippets/remote_function.py +++ b/samples/snippets/remote_function.py @@ -39,11 +39,19 @@ def run_remote_function_and_read_gbq_function(project_id: str): # already created, BigQuery DataFrames will attempt to create one assuming # the necessary APIs and IAM permissions are setup in the project. In our # examples we would be using a pre-created connection named - # `bigframes-rf-conn`. Let's try a `pandas`-like use case in which we want - # to apply a user defined scalar function to every value in a `Series`, more - # specifically bucketize the `body_mass_g` value of the penguins, which is a - # real number, into a category, which is a string. - @bpd.remote_function([float], str, bigquery_connection="bigframes-rf-conn") + # `bigframes-rf-conn`. We will also set `reuse=False` to make sure we don't + # step over someone else creating remote function in the same project from + # the exact same source code at the same time. Let's try a `pandas`-like use + # case in which we want to apply a user defined scalar function to every + # value in a `Series`, more specifically bucketize the `body_mass_g` value + # of the penguins, which is a real number, into a category, which is a + # string. 
+ @bpd.remote_function( + [float], + str, + bigquery_connection="bigframes-rf-conn", + reuse=False, + ) def get_bucket(num): if not num: return "NA" @@ -80,9 +88,11 @@ def get_bucket(num): # Let's continue trying other potential use cases of remote functions. Let's # say we consider the `species`, `island` and `sex` of the penguins # sensitive information and want to redact that by replacing with their hash - # code instead. Let's define another scalar custom function and decorated it + # code instead. Let's define another scalar custom function and decorate it # as a remote function - @bpd.remote_function([str], str, bigquery_connection="bigframes-rf-conn") + @bpd.remote_function( + [str], str, bigquery_connection="bigframes-rf-conn", reuse=False + ) def get_hash(input): import hashlib diff --git a/setup.py b/setup.py index 139873e6fc..69b71c88f1 100644 --- a/setup.py +++ b/setup.py @@ -47,6 +47,7 @@ "ibis-framework[bigquery] >=6.0.0,<=6.1.0", "pandas >=1.5.0", "pydata-google-auth >=1.8.2", + "requests >=2.27.1", "scikit-learn >=1.2.2", "sqlalchemy >=1.4,<3.0", "ipywidgets >=7.7.1", @@ -58,7 +59,7 @@ "pandas-gbq >=0.19.0", ], # Packages required for basic development flow. - "dev": ["pytest", "pre-commit", "nox", "google-cloud-testutils"], + "dev": ["pytest", "pytest-mock", "pre-commit", "nox", "google-cloud-testutils"], } extras["all"] = list(sorted(frozenset(itertools.chain.from_iterable(extras.values())))) diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 523256ee83..cd69d45dc9 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -9,7 +9,7 @@ cachetools==5.3.0 certifi==2022.12.7 cffi==1.15.1 cfgv==3.3.1 -charset-normalizer==3.1.0 +charset-normalizer==2.0.0 click==8.1.3 cloudpickle==2.0.0 colorlog==6.7.0 @@ -90,13 +90,14 @@ pyperclip==1.8.2 pytest==7.2.2 pytest-asyncio==0.21.0 pytest-cov==4.0.0 +pytest-mock==3.11.1 pytest-retry==1.1.0 pytest-xdist==3.2.1 python-dateutil==2.8.2 pytz==2023.3 PyYAML==6.0 readme-renderer==37.3 -requests==2.28.2 +requests==2.27.1 requests-oauthlib==1.3.1 requests-toolbelt==0.10.1 rfc3986==2.0.0 diff --git a/tests/system/large/ml/test_ensemble.py b/tests/system/large/ml/test_ensemble.py index 9b2872d673..a8613dfeb9 100644 --- a/tests/system/large/ml/test_ensemble.py +++ b/tests/system/large/ml/test_ensemble.py @@ -70,7 +70,7 @@ def test_xgbregressor_dart_booster_multiple_params( ): model = bigframes.ml.ensemble.XGBRegressor( booster="dart", - tree_method="AUTO", + tree_method="auto", min_tree_child_weight=2, colsample_bytree=0.95, colsample_bylevel=0.95, @@ -121,7 +121,7 @@ def test_xgbregressor_dart_booster_multiple_params( in reloaded_model._bqml_model.model_name ) assert reloaded_model.booster == "DART" - assert reloaded_model.dart_normalized_type == "TREE" + assert reloaded_model.dart_normalized_type == "tree" assert reloaded_model.tree_method == "AUTO" assert reloaded_model.colsample_bytree == 0.95 assert reloaded_model.colsample_bylevel == 0.95 @@ -185,7 +185,7 @@ def test_xgbclassifier_dart_booster_multiple_params( ): model = bigframes.ml.ensemble.XGBClassifier( booster="dart", - tree_method="AUTO", + tree_method="auto", min_tree_child_weight=2, colsample_bytree=0.95, colsample_bylevel=0.95, @@ -235,7 +235,7 @@ def test_xgbclassifier_dart_booster_multiple_params( in reloaded_model._bqml_model.model_name ) assert reloaded_model.booster == "DART" - assert reloaded_model.dart_normalized_type == "TREE" + assert reloaded_model.dart_normalized_type == "tree" assert reloaded_model.tree_method == "AUTO" 
assert reloaded_model.colsample_bytree == 0.95 assert reloaded_model.colsample_bylevel == 0.95 @@ -297,7 +297,7 @@ def test_randomforestregressor_default_params(penguins_df_default_index, dataset @pytest.mark.flaky(retries=2, delay=120) def test_randomforestregressor_multiple_params(penguins_df_default_index, dataset_id): model = bigframes.ml.ensemble.RandomForestRegressor( - tree_method="AUTO", + tree_method="auto", min_tree_child_weight=2, colsample_bytree=0.95, colsample_bylevel=0.95, diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index 8033f79c47..2f231f40c9 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -61,16 +61,32 @@ def get_remote_function_endpoints(bigquery_client, dataset_id): return endpoints -def get_cloud_functions(functions_client, project, location, name_prefix="bigframes-"): +def get_cloud_functions( + functions_client, project, location, name=None, name_prefix=None +): """Get the cloud functions in the given project and location.""" + + assert ( + not name or not name_prefix + ), f"At most one of the {name.__name__} or {name_prefix.__name__} can be passed." + _, location = get_remote_function_locations(location) parent = f"projects/{project}/locations/{location}" request = functions_v2.ListFunctionsRequest(parent=parent) page_result = functions_client.list_functions(request=request) - full_name_prefix = parent + f"/functions/{name_prefix}" for response in page_result: - if not name_prefix or response.name.startswith(full_name_prefix): - yield response + # If name is provided and it does not match then skip + if bool(name): + full_name = parent + f"/functions/{name}" + if response.name != full_name: + continue + # If name prefix is provided and it does not match then skip + elif bool(name_prefix): + full_name_prefix = parent + f"/functions/{name_prefix}" + if not response.name.startswith(full_name_prefix): + continue + + yield response def delete_cloud_function(functions_client, full_name): @@ -84,8 +100,17 @@ def cleanup_remote_function_assets( bigquery_client, functions_client, remote_udf, ignore_failures=True ): """Clean up the GCP assets behind a bigframes remote function.""" + + # Clean up BQ remote function try: bigquery_client.delete_routine(remote_udf.bigframes_remote_function) + except Exception: + # By default don't raise exception in cleanup + if not ignore_failures: + raise + + # Clean up cloud function + try: delete_cloud_function(functions_client, remote_udf.bigframes_cloud_function) except Exception: # By default don't raise exception in cleanup @@ -94,7 +119,15 @@ def cleanup_remote_function_assets( def make_uniq_udf(udf): - """Transform a udf to another with same behavior but a unique name.""" + """Transform a udf to another with same behavior but a unique name. + Use this to test remote functions with reuse=True, in which case parallel + instances of the same tests may evaluate same named cloud functions and BQ + remote functions, therefore interacting with each other and causing unwanted + failures. With this method one can transform a udf into another with the + same behavior but a different name which will remain unique for the + lifetime of one test instance. 
+ """ + prefixer = test_utils.prefixer.Prefixer(udf.__name__, "") udf_uniq_name = prefixer.create_prefix() udf_file_name = f"{udf_uniq_name}.py" @@ -111,7 +144,18 @@ def make_uniq_udf(udf): target_code = source_code.replace(source_key, target_key, 1) f.write(target_code) spec = importlib.util.spec_from_file_location(udf_file_name, udf_file_path) - return getattr(spec.loader.load_module(), udf_uniq_name), tmpdir + udf_uniq = getattr(spec.loader.load_module(), udf_uniq_name) + + # This is a bit of a hack but we need to remove the reference to a foreign + # module, otherwise the serialization would keep the foreign module + # reference and deserialization would fail with error like following: + # ModuleNotFoundError: No module named 'add_one_2nxcmd9j' + # TODO(shobs): Figure out if there is a better way of generating the unique + # function object, but for now let's just set it to same module as the + # original udf. + udf_uniq.__module__ = udf.__module__ + + return udf_uniq, tmpdir @pytest.fixture(scope="module") @@ -136,7 +180,10 @@ def cleanup_cloud_functions(session, functions_client, dataset_id_permanent): ) delete_count = 0 for cloud_function in get_cloud_functions( - functions_client, session.bqclient.project, session.bqclient.location + functions_client, + session.bqclient.project, + session.bqclient.location, + name_prefix="bigframes-", ): # Ignore bigframes cloud functions referred by the remote functions in # the permanent dataset @@ -524,15 +571,6 @@ def add_one(x): # Make a unique udf add_one_uniq, add_one_uniq_dir = make_uniq_udf(add_one) - # This is a bit of a hack but we need to remove the reference to a foreign - # module, otherwise the serialization would keep the foreign module - # reference and deserialization would fail with error like following: - # ModuleNotFoundError: No module named 'add_one_2nxcmd9j' - # TODO(shobs): Figure out if there is a better way of generating the unique - # function object, but for now let's just set it to same module as the - # original udf. 
- add_one_uniq.__module__ = add_one.__module__ - # Expected cloud function name for the unique udf add_one_uniq_cf_name = get_cloud_function_name(add_one_uniq) @@ -542,7 +580,7 @@ def add_one(x): functions_client, session.bqclient.project, session.bqclient.location, - name_prefix=add_one_uniq_cf_name, + name=add_one_uniq_cf_name, ) ) assert len(cloud_functions) == 0 @@ -563,7 +601,7 @@ def add_one(x): functions_client, session.bqclient.project, session.bqclient.location, - name_prefix=add_one_uniq_cf_name, + name=add_one_uniq_cf_name, ) ) assert len(cloud_functions) == 1 @@ -611,7 +649,7 @@ def inner_test(): functions_client, session.bqclient.project, session.bqclient.location, - name_prefix=add_one_uniq_cf_name, + name=add_one_uniq_cf_name, ) ) assert len(cloud_functions) == 0 @@ -633,7 +671,7 @@ def inner_test(): functions_client, session.bqclient.project, session.bqclient.location, - name_prefix=add_one_uniq_cf_name, + name=add_one_uniq_cf_name, ) ) assert len(cloud_functions) == 1 @@ -776,3 +814,221 @@ def test_remote_udf_lambda( cleanup_remote_function_assets( session.bqclient, functions_client, add_one_lambda_remote ) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_with_explicit_name( + session, scalars_dfs, dataset_id, bq_cf_connection, functions_client +): + try: + + def square(x): + return x * x + + prefixer = test_utils.prefixer.Prefixer(square.__name__, "") + rf_name = prefixer.create_prefix() + expected_remote_function = f"{dataset_id}.{rf_name}" + + # Initially the expected BQ remote function should not exist + with pytest.raises(NotFound): + session.bqclient.get_routine(expected_remote_function) + + # Create the remote function with the name provided explicitly + square_remote = session.remote_function( + [int], + int, + dataset_id, + bq_cf_connection, + reuse=False, + name=rf_name, + )(square) + + # The remote function should reflect the explicitly provided name + assert square_remote.bigframes_remote_function == expected_remote_function + + # Now the expected BQ remote function should exist + session.bqclient.get_routine(expected_remote_function) + + # The behavior of the created remote function should be as expected + scalars_df, scalars_pandas_df = scalars_dfs + + bf_int64_col = scalars_df["int64_too"] + bf_result_col = bf_int64_col.apply(square_remote) + bf_result = bf_int64_col.to_frame().assign(result=bf_result_col).to_pandas() + + pd_int64_col = scalars_pandas_df["int64_too"] + pd_result_col = pd_int64_col.apply(square) + # TODO(shobs): Figure why pandas .apply() changes the dtype, i.e. + # pd_int64_col.dtype is Int64Dtype() + # pd_int64_col.apply(square).dtype is int64. + # For this test let's force the pandas dtype to be same as bigframes' dtype. 
+ pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) + pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) + + assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + finally: + # clean up the gcp assets created for the remote function + cleanup_remote_function_assets( + session.bqclient, functions_client, square_remote + ) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_with_explicit_name_reuse( + session, scalars_dfs, dataset_id, bq_cf_connection, functions_client +): + try: + + dirs_to_cleanup = [] + + # Define a user code + def square(x): + return x * x + + # Make it a unique udf + square_uniq, square_uniq_dir = make_uniq_udf(square) + dirs_to_cleanup.append(square_uniq_dir) + + # Define a common routine which accepts a remote function and the + # corresponding user defined function and tests that bigframes bahavior + # on the former is in parity with the pandas behaviour on the latter + def test_internal(rf, udf): + # The behavior of the created remote function should be as expected + scalars_df, scalars_pandas_df = scalars_dfs + + bf_int64_col = scalars_df["int64_too"] + bf_result_col = bf_int64_col.apply(rf) + bf_result = bf_int64_col.to_frame().assign(result=bf_result_col).to_pandas() + + pd_int64_col = scalars_pandas_df["int64_too"] + pd_result_col = pd_int64_col.apply(udf) + # TODO(shobs): Figure why pandas .apply() changes the dtype, i.e. + # pd_int64_col.dtype is Int64Dtype() + # pd_int64_col.apply(square).dtype is int64. + # For this test let's force the pandas dtype to be same as bigframes' dtype. + pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) + pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) + + assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + + # Create an explicit name for the remote function + prefixer = test_utils.prefixer.Prefixer("foo", "") + rf_name = prefixer.create_prefix() + expected_remote_function = f"{dataset_id}.{rf_name}" + + # Initially the expected BQ remote function should not exist + with pytest.raises(NotFound): + session.bqclient.get_routine(expected_remote_function) + + # Create a new remote function with the name provided explicitly + square_remote1 = session.remote_function( + [int], + int, + dataset_id, + bq_cf_connection, + name=rf_name, + )(square_uniq) + + # The remote function should reflect the explicitly provided name + assert square_remote1.bigframes_remote_function == expected_remote_function + + # Now the expected BQ remote function should exist + routine = session.bqclient.get_routine(expected_remote_function) + square_remote1_created = routine.created + square_remote1_cf_updated = session.cloudfunctionsclient.get_function( + name=square_remote1.bigframes_cloud_function + ).update_time + + # Test pandas parity with square udf + test_internal(square_remote1, square) + + # Now Create another remote function with the same name provided + # explicitly. Since reuse is True by default, the previously created + # remote function with the same name will be reused. 
+ square_remote2 = session.remote_function( + [int], + int, + dataset_id, + bq_cf_connection, + name=rf_name, + )(square_uniq) + + # The new remote function should still reflect the explicitly provided name + assert square_remote2.bigframes_remote_function == expected_remote_function + + # The expected BQ remote function should still exist + routine = session.bqclient.get_routine(expected_remote_function) + square_remote2_created = routine.created + square_remote2_cf_updated = session.cloudfunctionsclient.get_function( + name=square_remote2.bigframes_cloud_function + ).update_time + + # The new remote function should reflect that the previous BQ remote + # function and the cloud function were reused instead of creating anew + assert square_remote2_created == square_remote1_created + assert ( + square_remote2.bigframes_cloud_function + == square_remote1.bigframes_cloud_function + ) + assert square_remote2_cf_updated == square_remote1_cf_updated + + # Test again that the new remote function is actually same as the + # previous remote function + test_internal(square_remote2, square) + + # Now define a different user code + def plusone(x): + return x + 1 + + # Make it a unique udf + plusone_uniq, plusone_uniq_dir = make_uniq_udf(plusone) + dirs_to_cleanup.append(plusone_uniq_dir) + + # Now Create a third remote function with the same name provided + # explicitly. Even though reuse is True by default, the previously + # created remote function with the same name should not be reused since + # this time it is a different user code. + plusone_remote = session.remote_function( + [int], + int, + dataset_id, + bq_cf_connection, + name=rf_name, + )(plusone_uniq) + + # The new remote function should still reflect the explicitly provided name + assert plusone_remote.bigframes_remote_function == expected_remote_function + + # The expected BQ remote function should still exist + routine = session.bqclient.get_routine(expected_remote_function) + plusone_remote_created = routine.created + plusone_remote_cf_updated = session.cloudfunctionsclient.get_function( + name=plusone_remote.bigframes_cloud_function + ).update_time + + # The new remote function should reflect that the previous BQ remote + # function and the cloud function were NOT reused, instead were created + # anew + assert plusone_remote_created > square_remote2_created + assert ( + plusone_remote.bigframes_cloud_function + != square_remote2.bigframes_cloud_function + ) + assert plusone_remote_cf_updated > square_remote2_cf_updated + + # Test again that the new remote function is equivalent to the new user + # defined function + test_internal(plusone_remote, plusone) + finally: + # clean up the gcp assets created for the remote function + cleanup_remote_function_assets( + session.bqclient, functions_client, square_remote1 + ) + cleanup_remote_function_assets( + session.bqclient, functions_client, square_remote2 + ) + cleanup_remote_function_assets( + session.bqclient, functions_client, plusone_remote + ) + for dir_ in dirs_to_cleanup: + shutil.rmtree(dir_) diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py index 6c3e8e06f5..ace943956f 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -18,6 +18,7 @@ import pandas as pd import pyarrow as pa +import pytest import pytz import bigframes @@ -278,6 +279,7 @@ def test_model_predict_with_unnamed_index( ) +@pytest.mark.flaky(retries=2, delay=120) def test_model_generate_text( bqml_palm2_text_generator_model: core.BqmlModel, 
llm_text_df ): diff --git a/tests/system/small/ml/test_decomposition.py b/tests/system/small/ml/test_decomposition.py index 8df4145fcf..c71bbbe3b0 100644 --- a/tests/system/small/ml/test_decomposition.py +++ b/tests/system/small/ml/test_decomposition.py @@ -16,33 +16,14 @@ from bigframes.ml import decomposition -_PD_NEW_PENGUINS = pd.DataFrame( - { - "tag_number": [1633, 1672, 1690], - "species": [ - "Adelie Penguin (Pygoscelis adeliae)", - "Gentoo penguin (Pygoscelis papua)", - "Adelie Penguin (Pygoscelis adeliae)", - ], - "island": ["Dream", "Biscoe", "Torgersen"], - "culmen_length_mm": [37.8, 46.5, 41.1], - "culmen_depth_mm": [18.1, 14.8, 18.6], - "flipper_length_mm": [193.0, 217.0, 189.0], - "body_mass_g": [3750.0, 5200.0, 3325.0], - "sex": ["MALE", "FEMALE", "MALE"], - } -).set_index("tag_number") - -def test_pca_predict(session, penguins_pca_model: decomposition.PCA): - new_penguins = session.read_pandas(_PD_NEW_PENGUINS) - - predictions = penguins_pca_model.predict(new_penguins).to_pandas() +def test_pca_predict(penguins_pca_model, new_penguins_df): + predictions = penguins_pca_model.predict(new_penguins_df).to_pandas() expected = pd.DataFrame( { - "principal_component_1": [-1.459, 2.258, -1.685], - "principal_component_2": [-1.120, -1.351, -0.874], - "principal_component_3": [-0.646, 0.443, -0.704], + "principal_component_1": [-1.314041, -0.855813, -1.848786], + "principal_component_2": [-0.889106, -1.259753, -0.983304], + "principal_component_3": [-0.704345, 0.322555, -0.095759], }, dtype="Float64", index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 85c3cce1d7..a85777c59d 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -13,6 +13,7 @@ # limitations under the License. 
import operator +import tempfile import typing from typing import Tuple @@ -137,6 +138,46 @@ def test_tail_with_custom_column_labels(scalars_df_index, scalars_pandas_df_inde pandas.testing.assert_frame_equal(bf_result, pd_result) +@pytest.mark.parametrize( + ("keep",), + [ + ("first",), + ("last",), + ("all",), + ], +) +def test_df_nlargest(scalars_df_index, scalars_pandas_df_index, keep): + bf_result = scalars_df_index.nlargest( + 3, ["bool_col", "int64_too"], keep=keep + ).to_pandas() + pd_result = scalars_pandas_df_index.nlargest( + 3, ["bool_col", "int64_too"], keep=keep + ) + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("keep",), + [ + ("first",), + ("last",), + ("all",), + ], +) +def test_df_nsmallest(scalars_df_index, scalars_pandas_df_index, keep): + bf_result = scalars_df_index.nsmallest(6, ["bool_col"], keep=keep).to_pandas() + pd_result = scalars_pandas_df_index.nsmallest(6, ["bool_col"], keep=keep) + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + def test_get_column_by_attr(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs series = scalars_df.int64_col @@ -582,6 +623,22 @@ def test_df_fillna(scalars_dfs): pandas.testing.assert_frame_equal(bf_result, pd_result) +def test_df_ffill(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[["int64_col", "float64_col"]].ffill(limit=1).to_pandas() + pd_result = scalars_pandas_df[["int64_col", "float64_col"]].ffill(limit=1) + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_df_bfill(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[["int64_col", "float64_col"]].bfill().to_pandas() + pd_result = scalars_pandas_df[["int64_col", "float64_col"]].bfill() + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + def test_df_isin_list(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs values = ["Hello, World!", 55555, 2.51, pd.NA, True] @@ -1027,6 +1084,88 @@ def test_df_notnull(scalars_dfs): assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) +@pytest.mark.parametrize( + ("left_labels", "right_labels", "overwrite", "fill_value"), + [ + (["a", "b", "c"], ["c", "a", "b"], True, None), + (["a", "b", "c"], ["c", "a", "b"], False, None), + (["a", "b", "c"], ["a", "b", "c"], False, 2), + ], + ids=[ + "one_one_match_overwrite", + "one_one_match_no_overwrite", + "exact_match", + ], +) +def test_combine( + scalars_df_index, + scalars_df_2_index, + scalars_pandas_df_index, + left_labels, + right_labels, + overwrite, + fill_value, +): + if pd.__version__.startswith("1."): + pytest.skip("pd.NA vs NaN not handled well in pandas 1.x.") + columns = ["int64_too", "int64_col", "float64_col"] + + bf_df_a = scalars_df_index[columns] + bf_df_a.columns = left_labels + bf_df_b = scalars_df_2_index[columns] + bf_df_b.columns = right_labels + bf_result = bf_df_a.combine( + bf_df_b, + lambda x, y: x**2 + 2 * x * y + y**2, + overwrite=overwrite, + fill_value=fill_value, + ).to_pandas() + + pd_df_a = scalars_pandas_df_index[columns] + pd_df_a.columns = left_labels + pd_df_b = scalars_pandas_df_index[columns] + pd_df_b.columns = right_labels + pd_result = pd_df_a.combine( + pd_df_b, + lambda x, y: x**2 + 2 * x * y + y**2, + overwrite=overwrite, + fill_value=fill_value, + ) + + # Some dtype inconsistency for all-NULL columns + pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + +def test_combine_first( + scalars_df_index, + scalars_df_2_index, + 
scalars_pandas_df_index, +): + if pd.__version__.startswith("1."): + pytest.skip("pd.NA vs NaN not handled well in pandas 1.x.") + columns = ["int64_too", "int64_col", "float64_col"] + + bf_df_a = scalars_df_index[columns].iloc[0:6] + bf_df_a.columns = ["a", "b", "c"] + bf_df_b = scalars_df_2_index[columns].iloc[2:8] + bf_df_b.columns = ["b", "a", "d"] + bf_result = bf_df_a.combine_first(bf_df_b).to_pandas() + + pd_df_a = scalars_pandas_df_index[columns].iloc[0:6] + pd_df_a.columns = ["a", "b", "c"] + pd_df_b = scalars_pandas_df_index[columns].iloc[2:8] + pd_df_b.columns = ["b", "a", "d"] + pd_result = pd_df_a.combine_first(pd_df_b) + + print("pandas") + print(pd_result.to_string()) + print("bigframes") + print(bf_result.to_string()) + + # Some dtype inconsistency for all-NULL columns + pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + @pytest.mark.parametrize( ("op"), [ @@ -1145,11 +1284,13 @@ def test_series_binop_axis_index( (["a", "a", "b"], ["c", "c", "d"]), (["a", "b", "c"], ["c", "a", "b"]), (["a", "c", "c"], ["c", "a", "c"]), + (["a", "b", "c"], ["a", "b", "c"]), ], ids=[ "no_overlap", "one_one_match", "multi_match", + "exact_match", ], ) def test_binop_df_df_binary_op( @@ -1361,6 +1502,42 @@ def test_dataframe_general_analytic_op( ) +@pytest.mark.parametrize( + ("periods",), + [ + (1,), + (2,), + (-1,), + ], +) +def test_dataframe_diff(scalars_df_index, scalars_pandas_df_index, periods): + col_names = ["int64_too", "float64_col", "int64_col"] + bf_result = scalars_df_index[col_names].diff(periods=periods).to_pandas() + pd_result = scalars_pandas_df_index[col_names].diff(periods=periods) + pd.testing.assert_frame_equal( + pd_result, + bf_result, + ) + + +@pytest.mark.parametrize( + ("periods",), + [ + (1,), + (2,), + (-1,), + ], +) +def test_dataframe_pct_change(scalars_df_index, scalars_pandas_df_index, periods): + col_names = ["int64_too", "float64_col", "int64_col"] + bf_result = scalars_df_index[col_names].pct_change(periods=periods).to_pandas() + pd_result = scalars_pandas_df_index[col_names].pct_change(periods=periods) + pd.testing.assert_frame_equal( + pd_result, + bf_result, + ) + + def test_dataframe_agg_single_string(scalars_dfs): numeric_cols = ["int64_col", "int64_too", "float64_col"] scalars_df, scalars_pandas_df = scalars_dfs @@ -1675,6 +1852,52 @@ def test_loc_single_index_no_duplicate(scalars_df_index, scalars_pandas_df_index ) +def test_loc_setitem_bool_series_scalar_new_col(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df.loc[bf_df["int64_too"] == 0, "new_col"] = 99 + pd_df.loc[pd_df["int64_too"] == 0, "new_col"] = 99 + + # pandas type difference + pd_df["new_col"] = pd_df["new_col"].astype("Float64") + + pd.testing.assert_frame_equal( + bf_df.to_pandas(), + pd_df, + ) + + +def test_loc_setitem_bool_series_scalar_existing_col(scalars_dfs): + if pd.__version__.startswith("1."): + pytest.skip("this loc overload not supported in pandas 1.x.") + + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df.loc[bf_df["int64_too"] == 1, "string_col"] = "hello" + pd_df.loc[pd_df["int64_too"] == 1, "string_col"] = "hello" + + pd.testing.assert_frame_equal( + bf_df.to_pandas(), + pd_df, + ) + + +def test_loc_setitem_bool_series_scalar_type_error(scalars_dfs): + if pd.__version__.startswith("1."): + pytest.skip("this loc overload not supported in pandas 1.x.") + + scalars_df, scalars_pandas_df = scalars_dfs + bf_df 
= scalars_df.copy() + pd_df = scalars_pandas_df.copy() + + with pytest.raises(TypeError): + bf_df.loc[bf_df["int64_too"] == 1, "string_col"] = 99 + with pytest.raises(TypeError): + pd_df.loc[pd_df["int64_too"] == 1, "string_col"] = 99 + + @pytest.mark.parametrize( ("op"), [ @@ -1749,6 +1972,30 @@ def test_dataframe_prod(scalars_df_index, scalars_pandas_df_index): pd.testing.assert_series_equal(pd_series, bf_result, check_index_type=False) +def test_df_skew_too_few_values(scalars_dfs): + columns = ["float64_col", "int64_col"] + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[columns].head(2).skew().to_pandas() + pd_result = scalars_pandas_df[columns].head(2).skew() + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + + +def test_df_skew(scalars_dfs): + columns = ["float64_col", "int64_col"] + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[columns].skew().to_pandas() + pd_result = scalars_pandas_df[columns].skew() + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + + @pytest.mark.parametrize( ("frac", "n", "random_state"), [ @@ -1828,6 +2075,161 @@ def test_df_add_suffix(scalars_df_index, scalars_pandas_df_index, axis): ) +def test_df_columns_filter_items(scalars_df_index, scalars_pandas_df_index): + if pd.__version__.startswith("2.0") or pd.__version__.startswith("1."): + pytest.skip("pandas filter items behavior different pre-2.1") + bf_result = scalars_df_index.filter(items=["string_col", "int64_col"]).to_pandas() + + pd_result = scalars_pandas_df_index.filter(items=["string_col", "int64_col"]) + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_columns_filter_like(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.filter(like="64_col").to_pandas() + + pd_result = scalars_pandas_df_index.filter(like="64_col") + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_columns_filter_regex(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.filter(regex="^[^_]+$").to_pandas() + + pd_result = scalars_pandas_df_index.filter(regex="^[^_]+$") + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_rows_filter_items(scalars_df_index, scalars_pandas_df_index): + if pd.__version__.startswith("2.0") or pd.__version__.startswith("1."): + pytest.skip("pandas filter items behavior different pre-2.1") + bf_result = scalars_df_index.filter(items=[5, 1, 3], axis=0).to_pandas() + + pd_result = scalars_pandas_df_index.filter(items=[5, 1, 3], axis=0) + + # Pandas uses int64 instead of Int64 (nullable) dtype. 
+ pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_rows_filter_like(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.copy().set_index("string_col") + scalars_pandas_df_index = scalars_pandas_df_index.copy().set_index("string_col") + + bf_result = scalars_df_index.filter(like="ello", axis=0).to_pandas() + + pd_result = scalars_pandas_df_index.filter(like="ello", axis=0) + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_rows_filter_regex(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.copy().set_index("string_col") + scalars_pandas_df_index = scalars_pandas_df_index.copy().set_index("string_col") + + bf_result = scalars_df_index.filter(regex="^[GH].*", axis=0).to_pandas() + + pd_result = scalars_pandas_df_index.filter(regex="^[GH].*", axis=0) + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_reindex_rows_list(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.reindex(index=[5, 1, 3, 99, 1]).to_pandas() + + pd_result = scalars_pandas_df_index.reindex(index=[5, 1, 3, 99, 1]) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_reindex_rows_index(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.reindex( + index=pd.Index([5, 1, 3, 99, 1], name="newname") + ).to_pandas() + + pd_result = scalars_pandas_df_index.reindex( + index=pd.Index([5, 1, 3, 99, 1], name="newname") + ) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_reindex_nonunique(scalars_df_index): + with pytest.raises(ValueError): + # int64_too is non-unique + scalars_df_index.set_index("int64_too").reindex( + index=[5, 1, 3, 99, 1], validate=True + ) + + +def test_df_reindex_columns(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.reindex( + columns=["not_a_col", "int64_col", "int64_too"] + ).to_pandas() + + pd_result = scalars_pandas_df_index.reindex( + columns=["not_a_col", "int64_col", "int64_too"] + ) + + # Pandas uses float64 as default for newly created empty column, bf uses Float64 + pd_result.not_a_col = pd_result.not_a_col.astype(pandas.Float64Dtype()) + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_reindex_like(scalars_df_index, scalars_pandas_df_index): + reindex_target_bf = scalars_df_index.reindex( + columns=["not_a_col", "int64_col", "int64_too"], index=[5, 1, 3, 99, 1] + ) + bf_result = scalars_df_index.reindex_like(reindex_target_bf).to_pandas() + + reindex_target_pd = scalars_pandas_df_index.reindex( + columns=["not_a_col", "int64_col", "int64_too"], index=[5, 1, 3, 99, 1] + ) + pd_result = scalars_pandas_df_index.reindex_like(reindex_target_pd) + + # Pandas uses float64 as default for newly created empty column, bf uses Float64 + # Pandas uses int64 instead of Int64 (nullable) dtype. 
+ pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + # Pandas uses float64 as default for newly created empty column, bf uses Float64 + pd_result.not_a_col = pd_result.not_a_col.astype(pandas.Float64Dtype()) + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + def test_df_values(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index.values @@ -2035,6 +2437,93 @@ def test_df_duplicated(scalars_df_index, scalars_pandas_df_index, keep, subset): pd.testing.assert_series_equal(pd_series, bf_series, check_dtype=False) +def test_df_to_dict(scalars_df_index, scalars_pandas_df_index): + unsupported = ["numeric_col"] # formatted differently + bf_result = scalars_df_index.drop(columns=unsupported).to_dict() + pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_dict() + + assert bf_result == pd_result + + +def test_df_to_excel(scalars_df_index, scalars_pandas_df_index): + unsupported = ["timestamp_col"] + with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: + scalars_df_index.drop(columns=unsupported).to_excel(bf_result_file) + scalars_pandas_df_index.drop(columns=unsupported).to_excel(pd_result_file) + bf_result = bf_result_file.read() + pd_result = bf_result_file.read() + + assert bf_result == pd_result + + +def test_df_to_latex(scalars_df_index, scalars_pandas_df_index): + unsupported = ["numeric_col"] # formatted differently + bf_result = scalars_df_index.drop(columns=unsupported).to_latex() + pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_latex() + + assert bf_result == pd_result + + +def test_df_to_records(scalars_df_index, scalars_pandas_df_index): + unsupported = ["numeric_col"] + bf_result = scalars_df_index.drop(columns=unsupported).to_records() + pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_records() + + for bfi, pdi in zip(bf_result, pd_result): + for bfj, pdj in zip(bfi, pdi): + assert pd.isna(bfj) and pd.isna(pdj) or bfj == pdj + + +def test_df_to_string(scalars_df_index, scalars_pandas_df_index): + unsupported = ["numeric_col"] # formatted differently + + bf_result = scalars_df_index.drop(columns=unsupported).to_string() + pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_string() + + assert bf_result == pd_result + + +def test_df_to_markdown(scalars_df_index, scalars_pandas_df_index): + # Nulls have bug from tabulate https://github.com/astanin/python-tabulate/issues/231 + bf_result = scalars_df_index.dropna().to_markdown() + pd_result = scalars_pandas_df_index.dropna().to_markdown() + + assert bf_result == pd_result + + +def test_df_to_pickle(scalars_df_index, scalars_pandas_df_index): + with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: + scalars_df_index.to_pickle(bf_result_file) + scalars_pandas_df_index.to_pickle(pd_result_file) + bf_result = bf_result_file.read() + pd_result = bf_result_file.read() + + assert bf_result == pd_result + + +def test_df_to_orc(scalars_df_index, scalars_pandas_df_index): + unsupported = [ + "numeric_col", + "bytes_col", + "date_col", + "datetime_col", + "time_col", + "timestamp_col", + "geography_col", + ] + + bf_result_file = tempfile.TemporaryFile() + pd_result_file = tempfile.TemporaryFile() + scalars_df_index.drop(columns=unsupported).to_orc(bf_result_file) + scalars_pandas_df_index.drop(columns=unsupported).reset_index().to_orc( + pd_result_file + ) + bf_result = bf_result_file.read() + pd_result = bf_result_file.read() + + assert bf_result == pd_result + + 
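# Editor's sketch, not part of the patch: in test_df_to_excel, test_df_to_pickle and
# test_df_to_orc above, ``pd_result`` is read back from ``bf_result_file`` rather than
# ``pd_result_file``, and the handles are never rewound after writing, so each assertion
# compares two empty byte strings. The intended round trip, using the same fixtures,
# would look roughly like this (comparing the unpickled frames is more robust than
# comparing raw bytes):
def _pickle_round_trip_sketch(scalars_df_index, scalars_pandas_df_index):
    with tempfile.TemporaryFile() as bf_file, tempfile.TemporaryFile() as pd_file:
        scalars_df_index.to_pickle(bf_file)
        scalars_pandas_df_index.to_pickle(pd_file)
        bf_file.seek(0)  # rewind each handle before reading back what was just written
        pd_file.seek(0)
        pd.testing.assert_frame_equal(pd.read_pickle(bf_file), pd.read_pickle(pd_file))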
@pytest.mark.parametrize( ("subset", "normalize", "ascending", "dropna"), [ diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index 987368ce77..18741468c5 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -210,12 +210,14 @@ def test_dataframe_groupby_multi_sum( (lambda x: x.cummax(numeric_only=True)), (lambda x: x.cummin(numeric_only=True)), (lambda x: x.cumprod()), + (lambda x: x.shift(periods=2)), ], ids=[ "cumsum", "cummax", "cummin", "cumprod", + "shift", ], ) def test_dataframe_groupby_analytic( @@ -229,6 +231,30 @@ def test_dataframe_groupby_analytic( pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) +def test_series_groupby_skew(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.groupby("bool_col")["int64_too"].skew().to_pandas() + pd_result = scalars_pandas_df_index.groupby("bool_col")["int64_too"].skew() + + pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + + +def test_dataframe_groupby_skew(scalars_df_index, scalars_pandas_df_index): + col_names = ["float64_col", "int64_col", "bool_col"] + bf_result = scalars_df_index[col_names].groupby("bool_col").skew().to_pandas() + pd_result = scalars_pandas_df_index[col_names].groupby("bool_col").skew() + + pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) + + +def test_dataframe_groupby_diff(scalars_df_index, scalars_pandas_df_index): + col_names = ["float64_col", "int64_col", "string_col"] + bf_result = scalars_df_index[col_names].groupby("string_col").diff(-1) + pd_result = scalars_pandas_df_index[col_names].groupby("string_col").diff(-1) + bf_result_computed = bf_result.to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) + + def test_dataframe_groupby_getitem( scalars_df_index, scalars_pandas_df_index, diff --git a/tests/system/small/test_ipython.py b/tests/system/small/test_ipython.py index 6725805d9a..be98ce0067 100644 --- a/tests/system/small/test_ipython.py +++ b/tests/system/small/test_ipython.py @@ -22,7 +22,8 @@ def test_repr_cache(scalars_df_index): # Make sure the df has a new block that the method return value # is not already cached. 
test_df = scalars_df_index.head() + test_df._block.retrieve_repr_request_results.cache_clear() results = display_formatter.format(test_df) assert results[0].keys() == {"text/plain", "text/html"} - assert test_df._block.retrieve_repr_request_results.cache_info().misses == 1 - assert test_df._block.retrieve_repr_request_results.cache_info().hits == 1 + assert test_df._block.retrieve_repr_request_results.cache_info().misses >= 1 + assert test_df._block.retrieve_repr_request_results.cache_info().hits >= 1 diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index 25d1e2ad49..1e38b47b4c 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -157,7 +157,7 @@ def test_multi_index_getitem_bool(scalars_df_index, scalars_pandas_df_index): ], ids=["level_num", "level_name", "list", "mixed_list"], ) -def test_multi_index_droplevel(scalars_df_index, scalars_pandas_df_index, level): +def test_df_multi_index_droplevel(scalars_df_index, scalars_pandas_df_index, level): bf_frame = scalars_df_index.set_index(["int64_too", "bool_col", "int64_col"]) pd_frame = scalars_pandas_df_index.set_index(["int64_too", "bool_col", "int64_col"]) @@ -167,6 +167,26 @@ def test_multi_index_droplevel(scalars_df_index, scalars_pandas_df_index, level) pandas.testing.assert_frame_equal(bf_result, pd_result) +@pytest.mark.parametrize( + ("level"), + [ + (1), + ("int64_too"), + ([0, 2]), + ([2, "bool_col"]), + ], + ids=["level_num", "level_name", "list", "mixed_list"], +) +def test_series_multi_index_droplevel(scalars_df_index, scalars_pandas_df_index, level): + bf_frame = scalars_df_index.set_index(["int64_too", "bool_col", "int64_col"]) + pd_frame = scalars_pandas_df_index.set_index(["int64_too", "bool_col", "int64_col"]) + + bf_result = bf_frame["string_col"].droplevel(level).to_pandas() + pd_result = pd_frame["string_col"].droplevel(level) + + pandas.testing.assert_series_equal(bf_result, pd_result) + + @pytest.mark.parametrize( ("labels", "level"), [ @@ -198,7 +218,9 @@ def test_multi_index_drop(scalars_df_index, scalars_pandas_df_index, labels, lev "num_names_mixed", ], ) -def test_multi_index_reorder_levels(scalars_df_index, scalars_pandas_df_index, order): +def test_df_multi_index_reorder_levels( + scalars_df_index, scalars_pandas_df_index, order +): bf_frame = scalars_df_index.set_index(["int64_too", "bool_col", "int64_col"]) pd_frame = scalars_pandas_df_index.set_index(["int64_too", "bool_col", "int64_col"]) @@ -208,6 +230,51 @@ def test_multi_index_reorder_levels(scalars_df_index, scalars_pandas_df_index, o pandas.testing.assert_frame_equal(bf_result, pd_result) +@pytest.mark.parametrize( + ("order"), + [ + (1, 0, 2), + (["int64_col", "bool_col", "int64_too"]), + (["int64_col", "bool_col", 0]), + ], + ids=[ + "level_nums", + "level_names", + "num_names_mixed", + ], +) +def test_series_multi_index_reorder_levels( + scalars_df_index, scalars_pandas_df_index, order +): + bf_frame = scalars_df_index.set_index(["int64_too", "bool_col", "int64_col"]) + pd_frame = scalars_pandas_df_index.set_index(["int64_too", "bool_col", "int64_col"]) + + bf_result = bf_frame["string_col"].reorder_levels(order).to_pandas() + pd_result = pd_frame["string_col"].reorder_levels(order) + + pandas.testing.assert_series_equal(bf_result, pd_result) + + +def test_df_multi_index_swaplevel(scalars_df_index, scalars_pandas_df_index): + bf_frame = scalars_df_index.set_index(["int64_too", "bool_col", "int64_col"]) + pd_frame = scalars_pandas_df_index.set_index(["int64_too", 
"bool_col", "int64_col"]) + + bf_result = bf_frame.swaplevel().to_pandas() + pd_result = pd_frame.swaplevel() + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_series_multi_index_swaplevel(scalars_df_index, scalars_pandas_df_index): + bf_frame = scalars_df_index.set_index(["int64_too", "bool_col", "int64_col"]) + pd_frame = scalars_pandas_df_index.set_index(["int64_too", "bool_col", "int64_col"]) + + bf_result = bf_frame["string_col"].swaplevel(0, 2).to_pandas() + pd_result = pd_frame["string_col"].swaplevel(0, 2) + + pandas.testing.assert_series_equal(bf_result, pd_result) + + def test_multi_index_series_groupby(scalars_df_index, scalars_pandas_df_index): bf_frame = scalars_df_index.set_index(["int64_too", "bool_col"]) bf_result = ( @@ -446,6 +513,24 @@ def test_multi_index_series_rename_dict_same_type( ) +def test_multi_index_df_reindex(scalars_df_index, scalars_pandas_df_index): + new_index = pandas.MultiIndex.from_tuples( + [(4, "Hello, World!"), (99, "some_new_string")], + names=["new_index1", "new_index2"], + ) + bf_result = ( + scalars_df_index.set_index(["rowindex_2", "string_col"]) + .reindex(index=new_index) + .to_pandas() + ) + pd_result = scalars_pandas_df_index.set_index(["rowindex_2", "string_col"]).reindex( + index=new_index + ) + pandas.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + # Column Multi-index tests @@ -722,3 +807,76 @@ def test_is_monotonic_decreasing_extra(indexes): bf_result.index.is_monotonic_decreasing == pd_result.index.is_monotonic_decreasing ) + + +def test_column_multi_index_droplevel(scalars_df_index, scalars_pandas_df_index): + columns = ["int64_too", "string_col", "bool_col"] + multi_columns = pandas.MultiIndex.from_tuples( + zip(["a", "b", "a"], ["c", "d", "e"], ["f", "g", "f"]) + ) + bf_df = scalars_df_index[columns].copy() + bf_df.columns = multi_columns + pd_df = scalars_pandas_df_index[columns].copy() + pd_df.columns = multi_columns + + bf_result = bf_df.droplevel(1, axis=1).to_pandas() + pd_result = pd_df.droplevel(1, axis=1) + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_df_column_multi_index_reindex(scalars_df_index, scalars_pandas_df_index): + columns = ["int64_too", "int64_col", "rowindex_2"] + multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "a"], ["a", "b", "b"])) + bf_df = scalars_df_index[columns].copy() + bf_df.columns = multi_columns + pd_df = scalars_pandas_df_index[columns].copy() + pd_df.columns = multi_columns + + new_index = pandas.MultiIndex.from_tuples( + [("z", "a"), ("a", "a")], names=["newname1", "newname2"] + ) + + bf_result = bf_df.reindex(columns=new_index).to_pandas() + + pd_result = pd_df.reindex(columns=new_index) + + # Pandas uses float64 as default for newly created empty column, bf uses Float64 + pd_result[("z", "a")] = pd_result[("z", "a")].astype(pandas.Float64Dtype()) + + pandas.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_column_multi_index_reorder_levels(scalars_df_index, scalars_pandas_df_index): + columns = ["int64_too", "string_col", "bool_col"] + multi_columns = pandas.MultiIndex.from_tuples( + zip(["a", "b", "a"], ["c", "d", "e"], ["f", "g", "f"]) + ) + bf_df = scalars_df_index[columns].copy() + bf_df.columns = multi_columns + pd_df = scalars_pandas_df_index[columns].copy() + pd_df.columns = multi_columns + + bf_result = bf_df.reorder_levels([-2, -1, 0], axis=1).to_pandas() + pd_result = pd_df.reorder_levels([-2, -1, 0], axis=1) + + 
pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_column_multi_index_swaplevel(scalars_df_index, scalars_pandas_df_index): + columns = ["int64_too", "string_col", "bool_col"] + multi_columns = pandas.MultiIndex.from_tuples( + zip(["a", "b", "a"], ["c", "d", "e"], ["f", "g", "f"]) + ) + bf_df = scalars_df_index[columns].copy() + bf_df.columns = multi_columns + pd_df = scalars_pandas_df_index[columns].copy() + pd_df.columns = multi_columns + + bf_result = bf_df.swaplevel(-3, -1, axis=1).to_pandas() + pd_result = pd_df.swaplevel(-3, -1, axis=1) + + pandas.testing.assert_frame_equal(bf_result, pd_result) diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index e451d5c3a2..a429c6551d 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -209,3 +209,17 @@ def test_merge_series(scalars_dfs, merge_how): ) assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + + +def test_cut(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = pd.cut(scalars_pandas_df["float64_col"], 5, labels=False) + bf_result = bpd.cut(scalars_df["float64_col"], 5, labels=False) + + # make sure the result is a supported dtype + assert bf_result.dtype == bpd.Int64Dtype() + + bf_result = bf_result.to_pandas() + pd_result = pd_result.astype("Int64") + pd.testing.assert_series_equal(bf_result, pd_result) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 07dc892ddc..d3560540cc 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -186,6 +186,54 @@ def test_fillna(scalars_dfs): ) +def test_series_replace_scalar_scalar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_result = ( + scalars_df[col_name].replace("Hello, World!", "Howdy, Planet!").to_pandas() + ) + pd_result = scalars_pandas_df[col_name].replace("Hello, World!", "Howdy, Planet!") + + pd.testing.assert_series_equal( + pd_result, + bf_result, + ) + + +def test_series_replace_regex_scalar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_result = ( + scalars_df[col_name].replace("^H.l", "Howdy, Planet!", regex=True).to_pandas() + ) + pd_result = scalars_pandas_df[col_name].replace( + "^H.l", "Howdy, Planet!", regex=True + ) + + pd.testing.assert_series_equal( + pd_result, + bf_result, + ) + + +def test_series_replace_list_scalar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_result = ( + scalars_df[col_name] + .replace(["Hello, World!", "T"], "Howdy, Planet!") + .to_pandas() + ) + pd_result = scalars_pandas_df[col_name].replace( + ["Hello, World!", "T"], "Howdy, Planet!" 
+ ) + + pd.testing.assert_series_equal( + pd_result, + bf_result, + ) + + @pytest.mark.parametrize( ("ignore_index",), ( @@ -759,7 +807,6 @@ def test_isin_raise_error(scalars_df_index, scalars_pandas_df_index): ) def test_isin(scalars_dfs, col_name, test_set): scalars_df, scalars_pandas_df = scalars_dfs - print(type(scalars_pandas_df["datetime_col"].iloc[0])) bf_result = scalars_df[col_name].isin(test_set).to_pandas() pd_result = scalars_pandas_df[col_name].isin(test_set).astype("boolean") pd.testing.assert_series_equal( @@ -1506,6 +1553,28 @@ def test_shift(scalars_df_index, scalars_pandas_df_index): ) +def test_series_ffill(scalars_df_index, scalars_pandas_df_index): + col_name = "numeric_col" + bf_result = scalars_df_index[col_name].ffill(limit=1).to_pandas() + pd_result = scalars_pandas_df_index[col_name].ffill(limit=1) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_series_bfill(scalars_df_index, scalars_pandas_df_index): + col_name = "numeric_col" + bf_result = scalars_df_index[col_name].bfill(limit=2).to_pandas() + pd_result = scalars_pandas_df_index[col_name].bfill(limit=2) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + def test_cumsum_int(scalars_df_index, scalars_pandas_df_index): if pd.__version__.startswith("1."): pytest.skip("Series.cumsum NA mask are different in pandas 1.x.") @@ -1588,7 +1657,7 @@ def test_rank_with_nulls(scalars_df_index, scalars_pandas_df_index, na_option, m ("all",), ], ) -def test_nlargest(scalars_df_index, scalars_pandas_df_index, keep): +def test_series_nlargest(scalars_df_index, scalars_pandas_df_index, keep): col_name = "bool_col" bf_result = scalars_df_index[col_name].nlargest(4, keep=keep).to_pandas() pd_result = scalars_pandas_df_index[col_name].nlargest(4, keep=keep) @@ -1622,6 +1691,25 @@ def test_diff(scalars_df_index, scalars_pandas_df_index, periods): ) +@pytest.mark.parametrize( + ("periods",), + [ + (1,), + (2,), + (-1,), + ], +) +def test_series_pct_change(scalars_df_index, scalars_pandas_df_index, periods): + bf_result = scalars_df_index["int64_col"].pct_change(periods=periods).to_pandas() + # cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA + pd_result = scalars_pandas_df_index["int64_col"].pct_change(periods=periods) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + @pytest.mark.parametrize( ("keep",), [ @@ -1630,7 +1718,7 @@ def test_diff(scalars_df_index, scalars_pandas_df_index, periods): ("all",), ], ) -def test_nsmallest(scalars_df_index, scalars_pandas_df_index, keep): +def test_series_nsmallest(scalars_df_index, scalars_pandas_df_index, keep): col_name = "bool_col" bf_result = scalars_df_index[col_name].nsmallest(2, keep=keep).to_pandas() pd_result = scalars_pandas_df_index[col_name].nsmallest(2, keep=keep) @@ -1853,6 +1941,91 @@ def test_series_add_suffix(scalars_df_index, scalars_pandas_df_index): ) +def test_series_filter_items(scalars_df_index, scalars_pandas_df_index): + if pd.__version__.startswith("2.0") or pd.__version__.startswith("1."): + pytest.skip("pandas filter items behavior different pre-2.1") + bf_result = scalars_df_index["float64_col"].filter(items=[5, 1, 3]).to_pandas() + + pd_result = scalars_pandas_df_index["float64_col"].filter(items=[5, 1, 3]) + + # Pandas uses int64 instead of Int64 (nullable) dtype. 
+ pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_series_filter_like(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.copy().set_index("string_col") + scalars_pandas_df_index = scalars_pandas_df_index.copy().set_index("string_col") + + bf_result = scalars_df_index["float64_col"].filter(like="ello").to_pandas() + + pd_result = scalars_pandas_df_index["float64_col"].filter(like="ello") + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_series_filter_regex(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.copy().set_index("string_col") + scalars_pandas_df_index = scalars_pandas_df_index.copy().set_index("string_col") + + bf_result = scalars_df_index["float64_col"].filter(regex="^[GH].*").to_pandas() + + pd_result = scalars_pandas_df_index["float64_col"].filter(regex="^[GH].*") + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_series_reindex(scalars_df_index, scalars_pandas_df_index): + bf_result = ( + scalars_df_index["float64_col"].reindex(index=[5, 1, 3, 99, 1]).to_pandas() + ) + + pd_result = scalars_pandas_df_index["float64_col"].reindex(index=[5, 1, 3, 99, 1]) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_series_reindex_nonunique(scalars_df_index): + with pytest.raises(ValueError): + # int64_too is non-unique + scalars_df_index.set_index("int64_too")["float64_col"].reindex( + index=[5, 1, 3, 99, 1], validate=True + ) + + +def test_series_reindex_like(scalars_df_index, scalars_pandas_df_index): + bf_reindex_target = scalars_df_index["float64_col"].reindex(index=[5, 1, 3, 99, 1]) + bf_result = ( + scalars_df_index["int64_too"].reindex_like(bf_reindex_target).to_pandas() + ) + + pd_reindex_target = scalars_pandas_df_index["float64_col"].reindex( + index=[5, 1, 3, 99, 1] + ) + pd_result = scalars_pandas_df_index["int64_too"].reindex_like(pd_reindex_target) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + def test_where_with_series(scalars_df_index, scalars_pandas_df_index): bf_result = ( scalars_df_index["int64_col"] diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py new file mode 100644 index 0000000000..8d4932a3c3 --- /dev/null +++ b/tests/unit/ml/test_golden_sql.py @@ -0,0 +1,47 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from unittest import mock + +import pytest_mock + +import bigframes +from bigframes.ml import linear_model +import bigframes.pandas as bpd + + +def test_linear_regression_default_fit(mocker: pytest_mock.MockerFixture): + mock_session = mock.create_autospec(spec=bigframes.Session) + + mock_X = mock.create_autospec(spec=bpd.DataFrame) + mock_X._get_block().expr._session = mock_session + + mock_y = mock.create_autospec(spec=bpd.DataFrame) + mock_y.columns.tolist.return_value = ["input_label_column"] + + mock_X.join(mock_y).sql = "input_dataframe_sql" + + # return values we don't care about, but need to provide to continue the program + mock_session._start_query.return_value = (None, mock.MagicMock()) + + mocker.patch( + "bigframes.ml.core._create_temp_model_name", return_value="temp_model_name" + ) + + model = linear_model.LinearRegression() + model.fit(mock_X, mock_y) + + mock_session._start_query.assert_called_once_with( + 'CREATE TEMP MODEL `temp_model_name`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n fit_intercept=True,\n INPUT_LABEL_COLS=["input_label_column"])\nAS input_dataframe_sql' + ) diff --git a/tests/unit/ml/test_pipeline.py b/tests/unit/ml/test_pipeline.py index 27706a1a07..ed5c621b1d 100644 --- a/tests/unit/ml/test_pipeline.py +++ b/tests/unit/ml/test_pipeline.py @@ -18,38 +18,35 @@ import sklearn.pipeline as sklearn_pipeline # type: ignore import sklearn.preprocessing as sklearn_preprocessing # type: ignore -import bigframes.ml.compose -import bigframes.ml.linear_model -import bigframes.ml.pipeline -import bigframes.ml.preprocessing +from bigframes.ml import compose, forecasting, linear_model, pipeline, preprocessing def test_pipeline_repr(): - pipeline = bigframes.ml.pipeline.Pipeline( + pl = pipeline.Pipeline( [ ( "preproc", - bigframes.ml.compose.ColumnTransformer( + compose.ColumnTransformer( [ ( "onehot", - bigframes.ml.preprocessing.OneHotEncoder(), + preprocessing.OneHotEncoder(), "species", ), ( "scale", - bigframes.ml.preprocessing.StandardScaler(), + preprocessing.StandardScaler(), ["culmen_length_mm", "flipper_length_mm"], ), ] ), ), - ("linreg", bigframes.ml.linear_model.LinearRegression()), + ("linreg", linear_model.LinearRegression()), ] ) assert ( - pipeline.__repr__() + pl.__repr__() == """Pipeline(steps=[('preproc', ColumnTransformer(transformers=[('onehot', OneHotEncoder(), 'species'), @@ -62,29 +59,29 @@ def test_pipeline_repr(): @pytest.mark.skipif(sklearn_pipeline is None, reason="requires sklearn") def test_pipeline_repr_matches_sklearn(): - bf_pipeline = bigframes.ml.pipeline.Pipeline( + bf_pl = pipeline.Pipeline( [ ( "preproc", - bigframes.ml.compose.ColumnTransformer( + compose.ColumnTransformer( [ ( "onehot", - bigframes.ml.preprocessing.OneHotEncoder(), + preprocessing.OneHotEncoder(), "species", ), ( "scale", - bigframes.ml.preprocessing.StandardScaler(), + preprocessing.StandardScaler(), ["culmen_length_mm", "flipper_length_mm"], ), ] ), ), - ("linreg", bigframes.ml.linear_model.LinearRegression()), + ("linreg", linear_model.LinearRegression()), ] ) - sk_pipeline = sklearn_pipeline.Pipeline( + sk_pl = sklearn_pipeline.Pipeline( [ ( "preproc", @@ -107,4 +104,17 @@ def test_pipeline_repr_matches_sklearn(): ] ) - assert bf_pipeline.__repr__() == sk_pipeline.__repr__() + assert bf_pl.__repr__() == sk_pl.__repr__() + + +def test_pipeline_arima_plus_not_implemented(): + with pytest.raises(NotImplementedError): + pipeline.Pipeline( + [ + ( + "transform", + preprocessing.StandardScaler(), + ), + ("estimator", 
forecasting.ARIMAPlus()), + ] + ) diff --git a/tests/unit/test_core.py b/tests/unit/test_core.py index e01638e22e..8f3e0beb0e 100644 --- a/tests/unit/test_core.py +++ b/tests/unit/test_core.py @@ -13,9 +13,12 @@ # limitations under the License. import ibis +import ibis.expr.types as ibis_types import pandas import bigframes.core as core +import bigframes.operations as ops +import bigframes.operations.aggregations as agg_ops from . import resources @@ -46,6 +49,42 @@ def test_arrayvalue_constructor_from_ibis_table_adds_all_columns(): assert len(actual.columns) == 3 +def test_arrayvalue_with_get_column_type(): + value = resources.create_arrayvalue( + pandas.DataFrame( + { + "col1": [1, 2, 3], + "col2": ["a", "b", "c"], + "col3": [0.1, 0.2, 0.3], + } + ), + total_ordering_columns=["col1"], + ) + col1_type = value.get_column_type("col1") + col2_type = value.get_column_type("col2") + col3_type = value.get_column_type("col3") + assert isinstance(col1_type, pandas.Int64Dtype) + assert isinstance(col2_type, pandas.StringDtype) + assert isinstance(col3_type, pandas.Float64Dtype) + + +def test_arrayvalue_with_get_column(): + value = resources.create_arrayvalue( + pandas.DataFrame( + { + "col1": [1, 2, 3], + "col2": ["a", "b", "c"], + "col3": [0.1, 0.2, 0.3], + } + ), + total_ordering_columns=["col1"], + ) + col1 = value.get_column("col1") + assert isinstance(col1, ibis_types.Value) + assert col1.get_name() == "col1" + assert col1.type().is_int64() + + def test_arrayvalue_to_ibis_expr_with_projection(): value = resources.create_arrayvalue( pandas.DataFrame( @@ -69,3 +108,133 @@ def test_arrayvalue_to_ibis_expr_with_projection(): assert actual.columns[0] == "int64_col" assert actual.columns[1] == "literals" assert actual.columns[2] == "string_col" + + +def test_arrayvalues_to_ibis_expr_with_get_column(): + value = resources.create_arrayvalue( + pandas.DataFrame( + { + "col1": [1, 2, 3], + "col2": ["a", "b", "c"], + "col3": [0.1, 0.2, 0.3], + } + ), + total_ordering_columns=["col1"], + ) + expr = value.get_column("col1") + assert expr.get_name() == "col1" + assert expr.type().is_int64() + + +def test_arrayvalues_to_ibis_expr_with_concat(): + value = resources.create_arrayvalue( + pandas.DataFrame( + { + "col1": [1, 2, 3], + "col2": ["a", "b", "c"], + "col3": [0.1, 0.2, 0.3], + } + ), + total_ordering_columns=["col1"], + ) + expr = value.concat([value]) + actual = expr.to_ibis_expr() + assert len(actual.columns) == 3 + # TODO(ashleyxu, b/299631930): test out the union expression + assert actual.columns[0] == "column_0" + assert actual.columns[1] == "column_1" + assert actual.columns[2] == "column_2" + + +def test_arrayvalues_to_ibis_expr_with_project_unary_op(): + value = resources.create_arrayvalue( + pandas.DataFrame( + { + "col1": [1, 2, 3], + "col2": ["a", "b", "c"], + "col3": [0.1, 0.2, 0.3], + } + ), + total_ordering_columns=["col1"], + ) + expr = value.project_unary_op("col1", ops.AsTypeOp("string")) + assert value.columns[0].type().is_int64() + assert expr.columns[0].type().is_string() + + +def test_arrayvalues_to_ibis_expr_with_project_binary_op(): + value = resources.create_arrayvalue( + pandas.DataFrame( + { + "col1": [1, 2, 3], + "col2": [0.2, 0.3, 0.4], + "col3": [0.1, 0.2, 0.3], + } + ), + total_ordering_columns=["col1"], + ) + expr = value.project_binary_op("col2", "col3", ops.add_op, "col4") + assert expr.columns[3].type().is_float64() + actual = expr.to_ibis_expr() + assert len(expr.columns) == 4 + assert actual.columns[3] == "col4" + + +def 
test_arrayvalues_to_ibis_expr_with_project_ternary_op(): + value = resources.create_arrayvalue( + pandas.DataFrame( + { + "col1": [1, 2, 3], + "col2": [0.2, 0.3, 0.4], + "col3": [True, False, False], + "col4": [0.1, 0.2, 0.3], + } + ), + total_ordering_columns=["col1"], + ) + expr = value.project_ternary_op("col2", "col3", "col4", ops.where_op, "col5") + assert expr.columns[4].type().is_float64() + actual = expr.to_ibis_expr() + assert len(expr.columns) == 5 + assert actual.columns[4] == "col5" + + +def test_arrayvalue_to_ibis_expr_with_aggregate(): + value = resources.create_arrayvalue( + pandas.DataFrame( + { + "col1": [1, 2, 3], + "col2": ["a", "b", "c"], + "col3": [0.1, 0.2, 0.3], + } + ), + total_ordering_columns=["col1"], + ) + expr = value.aggregate( + aggregations=(("col1", agg_ops.sum_op, "col4"),), + by_column_ids=["col1"], + dropna=False, + ) + actual = expr.to_ibis_expr() + assert len(expr.columns) == 2 + assert actual.columns[0] == "col1" + assert actual.columns[1] == "col4" + assert expr.columns[1].type().is_int64() + + +def test_arrayvalue_to_ibis_expr_with_corr_aggregate(): + value = resources.create_arrayvalue( + pandas.DataFrame( + { + "col1": [1, 2, 3], + "col2": ["a", "b", "c"], + "col3": [0.1, 0.2, 0.3], + } + ), + total_ordering_columns=["col1"], + ) + expr = value.corr_aggregate(corr_aggregations=[("col1", "col3", "col4")]) + actual = expr.to_ibis_expr() + assert len(expr.columns) == 1 + assert actual.columns[0] == "col4" + assert expr.columns[0].type().is_float64() diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py index d209284ab7..a4e61ca0f9 100644 --- a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py +++ b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py @@ -12,8 +12,20 @@ def _approx_quantiles(translator, op: vendored_ibis_ops.ApproximateMultiQuantile return f"APPROX_QUANTILES({arg}, {num_bins})" +def _first_non_null_value(translator, op: vendored_ibis_ops.FirstNonNullValue): + arg = translator.translate(op.arg) + return f"FIRST_VALUE({arg} IGNORE NULLS)" + + +def _last_non_null_value(translator, op: vendored_ibis_ops.LastNonNullValue): + arg = translator.translate(op.arg) + return f"LAST_VALUE({arg} IGNORE NULLS)" + + patched_ops = { vendored_ibis_ops.ApproximateMultiQuantile: _approx_quantiles, + vendored_ibis_ops.FirstNonNullValue: _first_non_null_value, + vendored_ibis_ops.LastNonNullValue: _last_non_null_value, } OPERATION_REGISTRY.update(patched_ops) diff --git a/third_party/bigframes_vendored/ibis/expr/operations/__init__.py b/third_party/bigframes_vendored/ibis/expr/operations/__init__.py index f3ab753a3b..1612d9c12e 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/__init__.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/__init__.py @@ -1,4 +1,5 @@ # Contains code from https://github.com/ibis-project/ibis/blob/master/ibis/expr/operations/__init__.py from __future__ import annotations +from third_party.bigframes_vendored.ibis.expr.operations.analytic import * # noqa: F403 from third_party.bigframes_vendored.ibis.expr.operations.reductions import * # noqa: F403 diff --git a/third_party/bigframes_vendored/ibis/expr/operations/analytic.py b/third_party/bigframes_vendored/ibis/expr/operations/analytic.py new file mode 100644 index 0000000000..038987cac9 --- /dev/null +++ b/third_party/bigframes_vendored/ibis/expr/operations/analytic.py @@ -0,0 +1,26 @@ +# Contains code from 
https://github.com/ibis-project/ibis/blob/master/ibis/expr/operations/analytic.py + +from __future__ import annotations + +from ibis.expr.operations.analytic import Analytic +import ibis.expr.rules as rlz + + +class FirstNonNullValue(Analytic): + """Retrieve the first element.""" + + arg = rlz.column(rlz.any) + output_dtype = rlz.dtype_like("arg") + + +class LastNonNullValue(Analytic): + """Retrieve the last element.""" + + arg = rlz.column(rlz.any) + output_dtype = rlz.dtype_like("arg") + + +__all__ = [ + "FirstNonNullValue", + "LastNonNullValue", +] diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 8c81b23b6c..113c6547a0 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -13,7 +13,7 @@ from typing import Iterable, Literal, Mapping, Optional, Sequence, Union -import numpy +import numpy as np from bigframes import constants from third_party.bigframes_vendored.pandas.core.generic import NDFrame @@ -56,7 +56,7 @@ def axes(self) -> list: return [self.index, self.columns] @property - def values(self) -> numpy.ndarray: + def values(self) -> np.ndarray: """Return the values of DataFrame in the form of a NumPy array. Args: @@ -72,9 +72,7 @@ def values(self) -> numpy.ndarray: # ---------------------------------------------------------------------- # IO methods (to / from other formats) - def to_numpy( - self, dtype=None, copy=False, na_value=None, **kwargs - ) -> numpy.ndarray: + def to_numpy(self, dtype=None, copy=False, na_value=None, **kwargs) -> np.ndarray: """ Convert the DataFrame to a NumPy array. @@ -154,6 +152,250 @@ def to_parquet( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def to_dict( + self, + orient: Literal[ + "dict", "list", "series", "split", "tight", "records", "index" + ] = "dict", + into: type[dict] = dict, + **kwargs, + ) -> dict | list[dict]: + """ + Convert the DataFrame to a dictionary. + + The type of the key-value pairs can be customized with the parameters + (see below). + + Args: + orient (str {'dict', 'list', 'series', 'split', 'tight', 'records', 'index'}): + Determines the type of the values of the dictionary. + 'dict' (default) : dict like {column -> {index -> value}}. + 'list' : dict like {column -> [values]}. + 'series' : dict like {column -> Series(values)}. + split' : dict like {'index' -> [index], 'columns' -> [columns], 'data' -> [values]}. + 'tight' : dict like {'index' -> [index], 'columns' -> [columns], 'data' -> [values], + 'index_names' -> [index.names], 'column_names' -> [column.names]}. + 'records' : list like [{column -> value}, ... , {column -> value}]. + 'index' : dict like {index -> {column -> value}}. + into (class, default dict): + The collections.abc.Mapping subclass used for all Mappings + in the return value. Can be the actual class or an empty + instance of the mapping type you want. If you want a + collections.defaultdict, you must pass it initialized. + + index (bool, default True): + Whether to include the index item (and index_names item if `orient` + is 'tight') in the returned dictionary. Can only be ``False`` + when `orient` is 'split' or 'tight'. + + Returns: + dict or list of dict: Return a collections.abc.Mapping object representing the DataFrame. + The resulting transformation depends on the `orient` parameter. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def to_excel(self, excel_writer, sheet_name: str = "Sheet1", **kwargs) -> None: + """ + Write DataFrame to an Excel sheet. + + To write a single DataFrame to an Excel .xlsx file it is only necessary to + specify a target file name. To write to multiple sheets it is necessary to + create an `ExcelWriter` object with a target file name, and specify a sheet + in the file to write to. + + Multiple sheets may be written to by specifying unique `sheet_name`. + With all data written to the file it is necessary to save the changes. + Note that creating an `ExcelWriter` object with a file name that already + exists will result in the contents of the existing file being erased. + + Args: + excel_writer (path-like, file-like, or ExcelWriter object): + File path or existing ExcelWriter. + sheet_name (str, default 'Sheet1'): + Name of sheet which will contain DataFrame. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def to_latex( + self, buf=None, columns=None, header=True, index=True, **kwargs + ) -> str | None: + r""" + Render object to a LaTeX tabular, longtable, or nested table. + + Requires ``\usepackage{{booktabs}}``. The output can be copy/pasted + into a main LaTeX document or read from an external file + with ``\input{{table.tex}}``. + + Args: + buf (str, Path or StringIO-like, optional, default None): + Buffer to write to. If None, the output is returned as a string. + columns (list of label, optional): + The subset of columns to write. Writes all columns by default. + header (bool or list of str, default True): + Write out the column names. If a list of strings is given, + it is assumed to be aliases for the column names. + index (bool, default True): + Write row names (index). + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def to_records( + self, index: bool = True, column_dtypes=None, index_dtypes=None + ) -> np.recarray: + """ + Convert DataFrame to a NumPy record array. + + Index will be included as the first field of the record array if + requested. + + Args: + index (bool, default True): + Include index in resulting record array, stored in 'index' + field or using the index label, if set. + column_dtypes (str, type, dict, default None): + If a string or type, the data type to store all columns. If + a dictionary, a mapping of column names and indices (zero-indexed) + to specific data types. + index_dtypes (str, type, dict, default None): + If a string or type, the data type to store all index levels. If + a dictionary, a mapping of index level names and indices + (zero-indexed) to specific data types. + + This mapping is applied only if `index=True`. + + Returns: + np.recarray: NumPy ndarray with the DataFrame labels as fields and each row + of the DataFrame as entries. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def to_string( + self, + buf=None, + columns: Sequence[str] | None = None, + col_space=None, + header: bool | Sequence[str] = True, + index: bool = True, + na_rep: str = "NaN", + formatters=None, + float_format=None, + sparsify: bool | None = None, + index_names: bool = True, + justify: str | None = None, + max_rows: int | None = None, + max_cols: int | None = None, + show_dimensions: bool = False, + decimal: str = ".", + line_width: int | None = None, + min_rows: int | None = None, + max_colwidth: int | None = None, + encoding: str | None = None, + ): + """Render a DataFrame to a console-friendly tabular output. + + Args: + buf (str, Path or StringIO-like, optional, default None): + Buffer to write to. If None, the output is returned as a string. + columns (sequence, optional, default None): + The subset of columns to write. Writes all columns by default. + col_space (int, list or dict of int, optional): + The minimum width of each column. + header (bool or sequence, optional): + Write out the column names. If a list of strings is given, it is assumed to be aliases for the column names. + index (bool, optional, default True): + Whether to print index (row) labels. + na_rep (str, optional, default 'NaN'): + String representation of NAN to use. + formatters (list, tuple or dict of one-param. functions, optional): + Formatter functions to apply to columns' elements by position or + name. + The result of each function must be a unicode string. + List/tuple must be of length equal to the number of columns. + float_format (one-parameter function, optional, default None): + Formatter function to apply to columns' elements if they are + floats. The result of this function must be a unicode string. + sparsify (bool, optional, default True): + Set to False for a DataFrame with a hierarchical index to print + every multiindex key at each row. + index_names (bool, optional, default True): + Prints the names of the indexes. + justify (str, default None): + How to justify the column labels. If None uses the option from + the print configuration (controlled by set_option), 'right' out + of the box. Valid values are, 'left', 'right', 'center', 'justify', + 'justify-all', 'start', 'end', 'inherit', 'match-parent', 'initial', + 'unset'. + max_rows (int, optional): + Maximum number of rows to display in the console. + min_rows (int, optional): + The number of rows to display in the console in a truncated repr + (when number of rows is above `max_rows`). + max_cols (int, optional): + Maximum number of columns to display in the console. + show_dimensions (bool, default False): + Display DataFrame dimensions (number of rows by number of columns). + decimal (str, default '.'): + Character recognized as decimal separator, e.g. ',' in Europe. + line_width (int, optional): + Width to wrap a line in characters. + max_colwidth (int, optional): + Max width to truncate each column in characters. By default, no limit. + encoding (str, default "utf-8"): + Set character encoding. + + Returns: + str or None: If buf is None, returns the result as a string. Otherwise returns + None. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def to_markdown( + self, + buf=None, + mode: str = "wt", + index: bool = True, + **kwargs, + ): + """Print DataFrame in Markdown-friendly format. + + Args: + buf (str, Path or StringIO-like, optional, default None): + Buffer to write to. If None, the output is returned as a string. 
+ mode (str, optional): + Mode in which file is opened. + index (bool, optional, default True): + Add index (row) labels. + **kwargs + These parameters will be passed to `tabulate `_. + + Returns: + DataFrame in Markdown-friendly format. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def to_pickle(self, path, **kwargs) -> None: + """Pickle (serialize) object to file. + + Args: + path (str): + File path where the pickled object will be stored. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def to_orc(self, path=None, **kwargs) -> bytes | None: + """ + Write a DataFrame to the ORC format. + + Args: + path (str, file-like object or None, default None): + If a string, it will be used as Root Directory path + when writing a partitioned dataset. By file-like object, + we refer to objects with a write() method, such as a file handle + (e.g. via builtin open function). If path is None, + a bytes object is returned. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + # ---------------------------------------------------------------------- # Unsorted @@ -185,6 +427,53 @@ def assign(self, **kwargs) -> DataFrame: # ---------------------------------------------------------------------- # Reindexing and alignment + def reindex( + self, + labels=None, + *, + index=None, + columns=None, + axis=None, + ): + """Conform DataFrame to new index with optional filling logic. + + Places NA in locations having no value in the previous index. A new object + is produced. + + Args: + labels (array-like, optional): + New labels / index to conform the axis specified by 'axis' to. + index (array-like, optional): + New labels for the index. Preferably an Index object to avoid + duplicating data. + columns (array-like, optional): + New labels for the columns. Preferably an Index object to avoid + duplicating data. + axis (int or str, optional): + Axis to target. Can be either the axis name ('index', 'columns') + or number (0, 1). + Returns: + DataFrame: DataFrame with changed index. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def reindex_like(self, other): + """Return an object with matching indices as other object. + + Conform the object to the same index on all axes. Optional + filling logic, placing Null in locations having no value + in the previous index. + + Args: + other (Object of the same data type): + Its row and column indices are used to define the new indices + of this object. + + Returns: + Series or DataFrame: Same type as caller, but with changed indices on each axis. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def drop( self, labels=None, *, axis=0, index=None, columns=None, level=None ) -> DataFrame | None: @@ -276,7 +565,9 @@ def set_index( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def reorder_levels(self, order: Sequence[int | str]) -> DataFrame: + def reorder_levels( + self, order: Sequence[int | str], axis: str | int = 0 + ) -> DataFrame: """ Rearrange index levels using input order. May not drop or duplicate levels. @@ -284,13 +575,33 @@ def reorder_levels(self, order: Sequence[int | str]) -> DataFrame: order (list of int or list of str): List representing new level order. Reference level by number (position) or by key (label). + axis ({0 or 'index', 1 or 'columns'}, default 0): + Where to reorder levels. Returns: DataFrame: DataFrame of rearranged index. 
""" raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def droplevel(self, level): + def swaplevel(self, i, j, axis: str | int = 0) -> DataFrame: + """ + Swap levels i and j in a :class:`MultiIndex`. + + Default is to swap the two innermost levels of the index. + + Args: + i, j (int or str): + Levels of the indices to be swapped. Can pass level name as string. + axis ({0 or 'index', 1 or 'columns'}, default 0): + The axis to swap levels on. 0 or 'index' for row-wise, 1 or + 'columns' for column-wise. + + Returns: + DataFrame: DataFrame with levels swapped in MultiIndex. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def droplevel(self, level, axis: str | int = 0): """ Return DataFrame with requested index / column level(s) removed. @@ -299,6 +610,11 @@ def droplevel(self, level): If a string is given, must be the name of a level If list-like, elements must be names or positional indexes of levels. + axis ({0 or 'index', 1 or 'columns'}, default 0): + Axis along which the level(s) is removed: + + * 0 or 'index': remove level(s) in column. + * 1 or 'columns': remove level(s) in row. Returns: DataFrame: DataFrame with requested index / column level(s) removed. """ @@ -889,6 +1205,54 @@ def rpow(self, other, axis: str | int = "columns") -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def combine( + self, other, func, fill_value=None, overwrite: bool = True + ) -> DataFrame: + """Perform column-wise combine with another DataFrame. + + Combines a DataFrame with `other` DataFrame using `func` + to element-wise combine columns. The row and column indexes of the + resulting DataFrame will be the union of the two. + + Args: + other (DataFrame): + The DataFrame to merge column-wise. + func (function): + Function that takes two series as inputs and return a Series or a + scalar. Used to merge the two dataframes column by columns. + fill_value (scalar value, default None): + The value to fill NaNs with prior to passing any column to the + merge func. + overwrite (bool, default True): + If True, columns in `self` that do not exist in `other` will be + overwritten with NaNs. + + Returns: + DataFrame: Combination of the provided DataFrames. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def combine_first(self, other) -> DataFrame: + """ + Update null elements with value in the same location in `other`. + + Combine two DataFrame objects by filling null values in one DataFrame + with non-null values from other DataFrame. The row and column indexes + of the resulting DataFrame will be the union of the two. The resulting + dataframe contains the 'first' dataframe values and overrides the + second one values where both first.loc[index, col] and + second.loc[index, col] are not missing values, upon calling + first.combine_first(second). + + Args: + other (DataFrame): + Provided DataFrame to use to fill null values. + + Returns: + DataFrame: The result of combining the provided DataFrame with the other object. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + # ---------------------------------------------------------------------- # Data reshaping @@ -1191,6 +1555,20 @@ def var(self, *, numeric_only: bool = False): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def skew(self, *, numeric_only: bool = False): + """Return unbiased skew over requested axis. + + Normalized by N-1. 
+ + Args: + numeric_only (bool, default False): + Include only float, int, boolean columns. + + Returns: + Series + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def std(self, *, numeric_only: bool = False): """Return sample standard deviation over requested axis. @@ -1222,6 +1600,76 @@ def count(self, *, numeric_only: bool = False): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def nlargest(self, n: int, columns, keep: str = "first"): + """ + Return the first `n` rows ordered by `columns` in descending order. + + Return the first `n` rows with the largest values in `columns`, in + descending order. The columns that are not specified are returned as + well, but not used for ordering. + + This method is equivalent to + ``df.sort_values(columns, ascending=False).head(n)``, but more + performant. + + Args: + n (int): + Number of rows to return. + columns (label or list of labels): + Column label(s) to order by. + keep ({'first', 'last', 'all'}, default 'first'): + Where there are duplicate values: + + - ``first`` : prioritize the first occurrence(s) + - ``last`` : prioritize the last occurrence(s) + - ``all`` : do not drop any duplicates, even it means + selecting more than `n` items. + + Returns: + DataFrame: The first `n` rows ordered by the given columns in descending order. + + .. note:: + This function cannot be used with all column types. For example, when + specifying columns with `object` or `category` dtypes, ``TypeError`` is + raised. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def nsmallest(self, n: int, columns, keep: str = "first"): + """ + Return the first `n` rows ordered by `columns` in ascending order. + + Return the first `n` rows with the smallest values in `columns`, in + ascending order. The columns that are not specified are returned as + well, but not used for ordering. + + This method is equivalent to + ``df.sort_values(columns, ascending=True).head(n)``, but more + performant. + + Args: + n (int): + Number of rows to return. + columns (label or list of labels): + Column label(s) to order by. + keep ({'first', 'last', 'all'}, default 'first'): + Where there are duplicate values: + + - ``first`` : prioritize the first occurrence(s) + - ``last`` : prioritize the last occurrence(s) + - ``all`` : do not drop any duplicates, even it means + selecting more than `n` items. + + Returns: + DataFrame: The first `n` rows ordered by the given columns in ascending order. + + .. note:: + This function cannot be used with all column types. For example, when + specifying columns with `object` or `category` dtypes, ``TypeError`` is + raised. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def nunique(self): """ Count number of distinct elements in specified axis. @@ -1271,6 +1719,25 @@ def cumprod(self) -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def diff( + self, + periods: int = 1, + ) -> NDFrame: + """First discrete difference of element. + + Calculates the difference of a DataFrame element compared with another + element in the DataFrame (default is element in previous row). + + Args: + periods (int, default 1): + Periods to shift for calculating difference, accepts negative + values. + + Returns: + bigframes.dataframe.DataFrame: First differences of the Series. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def agg(self, func): """ Aggregate using one or more operations over the specified axis. diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 56d3b2434f..7d496891b0 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -335,6 +335,41 @@ def copy(self): # ---------------------------------------------------------------------- # Action Methods + def ffill(self, *, limit: Optional[int] = None): + """Fill NA/NaN values by propagating the last valid observation to next valid. + + Args: + limit : int, default None + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. Must be greater than 0 if not None. + + + Returns: + Series/DataFrame or None: Object with missing values filled. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def bfill(self, *, limit: Optional[int] = None): + """Fill NA/NaN values by using the next valid observation to fill the gap. + + Args: + limit : int, default None + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. Must be greater than 0 if not None. + + Returns: + Series/DataFrame or None: Object with missing values filled. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def isna(self) -> NDFrame: """Detect missing values. @@ -367,6 +402,36 @@ def notna(self) -> NDFrame: notnull = notna + def filter( + self, + items=None, + like: str | None = None, + regex: str | None = None, + axis=None, + ) -> NDFrame: + """ + Subset the dataframe rows or columns according to the specified index labels. + + Note that this routine does not filter a dataframe on its + contents. The filter is applied to the labels of the index. + + Args: + items (list-like): + Keep labels from axis which are in items. + like (str): + Keep labels from axis for which "like in label == True". + regex (str (regular expression)): + Keep labels from axis for which re.search(regex, label) == True. + axis ({0 or 'index', 1 or 'columns', None}, default None): + The axis to filter on, expressed either as an index (int) + or axis name (str). By default this is the info axis, 'columns' for + DataFrame. For `Series` this parameter is unused and defaults to `None`. + + Returns: + same type as input object + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def shift( self, periods: int = 1, @@ -384,6 +449,30 @@ def shift( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def pct_change(self, periods: int = 1): + """ + Fractional change between the current and a prior element. + + Computes the fractional change from the immediately previous row by + default. This is useful in comparing the fraction of change in a time + series of elements. + + .. 
note:: + + Despite the name of this method, it calculates fractional change + (also known as per unit change or relative change) and not + percentage change. If you need the percentage change, multiply + these values by 100. + + Args: + periods (int, default 1): + Periods to shift for forming percent change. + + Returns: + Series or DataFrame: The same type as the calling object. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def rank( self, axis=0, diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index 9271da8a5e..7849a3afd5 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -124,6 +124,26 @@ def var( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def skew( + self, + *, + numeric_only: bool = False, + ): + """ + Return unbiased skew within groups. + + Normalized by N-1. + + Args: + numeric_only (bool, default False): + Include only `float`, `int` or `boolean` data. + + Returns: + Series or DataFrame + Variance of values within each group. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def sum( self, numeric_only: bool = False, diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py index 404a99809c..864007b774 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/base.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py @@ -21,6 +21,16 @@ def shape(self): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + @property + def nlevels(self) -> int: + """Number of levels.""" + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + @property + def is_unique(self) -> bool: + """Return if the index has unique values.""" + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def to_numpy(self, dtype): """ A NumPy ndarray representing the values in this Series or Index. diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 76fb46a700..c6dd973372 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -758,6 +758,41 @@ def groupby( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def reindex(self, index=None): + """ + Conform Series to new index with optional filling logic. + + Places NA/NaN in locations having no value in the previous index. A new object + is produced unless the new index is equivalent to the current one and + ``copy=False``. + + Args: + index (array-like, optional): + New labels for the index. Preferably an Index object to avoid + duplicating data. + + Returns: + Series: Series with changed index. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def reindex_like(self, other): + """Return an object with matching indices as other object. + + Conform the object to the same index on all axes. Optional + filling logic, placing Null in locations having no value + in the previous index. + + Args: + other (Object of the same data type): + Its row and column indices are used to define the new indices + of this object. + + Returns: + Series or DataFrame: Same type as caller, but with changed indices on each axis. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def drop( self, labels=None, *, axis=0, index=None, columns=None, level=None ) -> Series | None: @@ -790,7 +825,7 @@ def drop( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def reorder_levels(self, order: Sequence) -> Series: + def reorder_levels(self, order: Sequence, axis) -> Series: """ Rearrange index levels using input order. @@ -800,12 +835,31 @@ def reorder_levels(self, order: Sequence) -> Series: order (list of int representing new level order): Reference level by number or key. + axis ({0 or 'index', 1 or 'columns'}, default 0): + For `Series` this parameter is unused and defaults to 0. + + Returns: type of caller (new object) """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def droplevel(self, level): + def swaplevel(self, i, j): + """ + Swap levels i and j in a `MultiIndex`. + + Default is to swap the two innermost levels of the index. + + Args: + i, j (int or str): + Levels of the indices to be swapped. Can pass level name as string. + + Returns: + Series: Series with levels swapped in MultiIndex + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def droplevel(self, level, axis): """ Return Series with requested index / column level(s) removed. @@ -815,6 +869,9 @@ def droplevel(self, level): If list-like, elements must be names or positional indexes of levels. + axis ({0 or 'index', 1 or 'columns'}, default 0): + For `Series` this parameter is unused and defaults to 0. + Returns: Series with requested index / column level(s) removed. """ @@ -836,6 +893,69 @@ def fillna( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def replace( + self, + to_replace, + value=None, + ) -> Series | None: + """ + Replace values given in `to_replace` with `value`. + + Values of the Series/DataFrame are replaced with other values dynamically. + This differs from updating with ``.loc`` or ``.iloc``, which require + you to specify a location to update with some value. + + Args: + to_replace (str, regex, list, int, float or None): + How to find the values that will be replaced. + + * numeric, str or regex: + + - numeric: numeric values equal to `to_replace` will be + replaced with `value` + - str: string exactly matching `to_replace` will be replaced + with `value` + - regex: regexs matching `to_replace` will be replaced with + `value` + + * list of str, regex, or numeric: + + - First, if `to_replace` and `value` are both lists, they + **must** be the same length. + - Second, if ``regex=True`` then all of the strings in **both** + lists will be interpreted as regexs otherwise they will match + directly. This doesn't matter much for `value` since there + are only a few possible substitution regexes you can use. + - str, regex and numeric rules apply as above. + + value (scalar, default None): + Value to replace any values matching `to_replace` with. + For a DataFrame a dict of values can be used to specify which + value to use for each column (columns not in the dict will not be + filled). Regular expressions, strings and lists or dicts of such + objects are also allowed. + regex (bool, default False): + Whether to interpret `to_replace` and/or `value` as regular + expressions. If this is ``True`` then `to_replace` *must* be a + string. + + Returns: + Series/DataFrame: Object after replacement. 
+ + Raises: + TypeError: + * If `to_replace` is not a scalar, array-like, ``dict``, or ``None`` + * If `to_replace` is a ``dict`` and `value` is not a ``list``, + ``dict``, ``ndarray``, or ``Series`` + * If `to_replace` is ``None`` and `regex` is not compilable + into a regular expression or is a list, dict, ndarray, or + Series. + * When replacing multiple ``bool`` or ``datetime64`` objects and + the arguments to `to_replace` does not match the type of the + value being replaced + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def dropna(self, *, axis=0, inplace: bool = False, how=None) -> Series: """ Return a new Series with missing values removed. diff --git a/third_party/bigframes_vendored/sklearn/base.py b/third_party/bigframes_vendored/sklearn/base.py index 847ad06f75..42868ce51f 100644 --- a/third_party/bigframes_vendored/sklearn/base.py +++ b/third_party/bigframes_vendored/sklearn/base.py @@ -144,6 +144,7 @@ def fit_transform(self, X, y=None): bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_features_new) Transformed DataFrame. """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) class MetaEstimatorMixin: diff --git a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py index ff1c04edbe..ece62dc147 100644 --- a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py +++ b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py @@ -49,7 +49,6 @@ def fit( self, X, y=None, - transforms: Optional[List[str]] = None, ): """Compute k-means clustering. @@ -58,10 +57,6 @@ def fit( DataFrame of shape (n_samples, n_features). Training data. y (default None): Not used, present here for API consistency by convention. - transforms (Optional[List[str]], default None): - Do not use. Internal param to be deprecated. - Use bigframes.ml.pipeline instead. - Returns: KMeans: Fitted Estimator. diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py index 85feab0024..97fee5a501 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py @@ -49,10 +49,6 @@ def fit(self, X, y=None): y (default None): Ignored. - transforms (Optional[List[str]], default None): - Do not use. Internal param to be deprecated. - Use bigframes.ml.pipeline instead. - Returns: PCA: Fitted estimator. """ diff --git a/third_party/bigframes_vendored/sklearn/ensemble/_forest.py b/third_party/bigframes_vendored/sklearn/ensemble/_forest.py index 79224a772d..6be41bf9aa 100644 --- a/third_party/bigframes_vendored/sklearn/ensemble/_forest.py +++ b/third_party/bigframes_vendored/sklearn/ensemble/_forest.py @@ -54,10 +54,6 @@ def fit(self, X, y): Series or DataFrame of shape (n_samples,) or (n_samples, n_targets). Target values. Will be cast to X's dtype if necessary. - transforms (Optional[List[str]], default None): - Do not use. Internal param to be deprecated. - Use bigframes.ml.pipeline instead. - Returns: Fitted Estimator. diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_base.py b/third_party/bigframes_vendored/sklearn/linear_model/_base.py index 8141da4e3b..81b4fca157 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_base.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_base.py @@ -74,7 +74,6 @@ def fit( self, X, y, - transforms: Optional[List[str]] = None, ): """Fit linear model. 
@@ -86,10 +85,6 @@ def fit( Series or DataFrame of shape (n_samples,) or (n_samples, n_targets). Target values. Will be cast to X's dtype if necessary. - transforms (Optional[List[str]], default None): - Do not use. Internal param to be deprecated. - Use bigframes.ml.pipeline instead. - Returns: LinearRegression: Fitted Estimator. """ diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py index a06035eef6..133dc4498e 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py @@ -37,7 +37,6 @@ def fit( self, X, y, - transforms: Optional[List[str]] = None, ): """Fit the model according to the given training data. @@ -50,10 +49,6 @@ def fit( y (bigframes.dataframe.DataFrame or bigframes.series.Series): DataFrame of shape (n_samples,). Target vector relative to X. - transforms (Optional[List[str]], default None): - Do not use. Internal param to be deprecated. - Use bigframes.ml.pipeline instead. - Returns: LogisticRegression: Fitted Estimator. diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_data.py b/third_party/bigframes_vendored/sklearn/preprocessing/_data.py index d013043467..89981e34c0 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_data.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_data.py @@ -8,10 +8,10 @@ # License: BSD 3 clause from bigframes import constants -from third_party.bigframes_vendored.sklearn.base import BaseEstimator +from third_party.bigframes_vendored.sklearn.base import BaseEstimator, TransformerMixin -class StandardScaler(BaseEstimator): +class StandardScaler(BaseEstimator, TransformerMixin): """Standardize features by removing the mean and scaling to unit variance. The standard score of a sample `x` is calculated as:z = (x - u) / s @@ -28,30 +28,23 @@ class StandardScaler(BaseEstimator): machine learning estimators: they might behave badly if the individual features do not more or less look like standard normally distributed data (e.g. Gaussian with 0 mean and unit variance). - """ - - def fit(self, X): - """Compute the mean and std to be used for later scaling. - Examples: + Examples: .. code-block:: from bigframes.ml.preprocessing import StandardScaler + import bigframes.pandas as bpd - enc = StandardScaler() - X = [['Male', 1], ['Female', 3], ['Female', 2]] - enc.fit(X) - - Examples: - - .. code-block:: - - from bigframes.ml import StandardScaler + scaler = StandardScaler() + data = bpd.DataFrame({"a": [0, 0, 1, 1], "b":[0, 0, 1, 1]}) + scaler.fit(data) + print(scaler.transform(data)) + print(scaler.transform(bpd.DataFrame({"a": [2], "b":[2]}))) + """ - enc = StandardScaler() - X = [['Male', 1], ['Female', 3], ['Female', 2]] - enc.fit(X) + def fit(self, X): + """Compute the mean and std to be used for later scaling. Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py index b1cf17e539..b0f0df8e15 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py @@ -37,12 +37,8 @@ class OneHotEncoder(BaseEstimator): when considering infrequent categories. 
If there are infrequent categories, max_categories includes the category representing the infrequent categories along with the frequent categories. Default None, set limit to 1,000,000. - """ - - def fit(self, X): - """Fit OneHotEncoder to X. - Examples: + Examples: Given a dataset with two features, we let the encoder find the unique values per feature and transform the data to a binary one-hot encoding. @@ -50,10 +46,16 @@ def fit(self, X): .. code-block:: from bigframes.ml.preprocessing import OneHotEncoder + import bigframes.pandas as bpd enc = OneHotEncoder() - X = [['Male', 1], ['Female', 3], ['Female', 2]] + X = bpd.DataFrame({"a": ["Male", "Female", "Female"], "b": ["1", "3", "2"]}) enc.fit(X) + print(enc.transform(bpd.DataFrame({"a": ["Female", "Male"], "b": ["1", "4"]}))) + """ + + def fit(self, X): + """Fit OneHotEncoder to X. Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): diff --git a/third_party/bigframes_vendored/xgboost/sklearn.py b/third_party/bigframes_vendored/xgboost/sklearn.py index 620c87fa3d..b7b43b85a3 100644 --- a/third_party/bigframes_vendored/xgboost/sklearn.py +++ b/third_party/bigframes_vendored/xgboost/sklearn.py @@ -37,10 +37,6 @@ def fit(self, X, y): DataFrame of shape (n_samples,) or (n_samples, n_targets). Target values. Will be cast to X's dtype if necessary. - transforms (Optional[List[str]], default None): - Do not use. Internal param to be deprecated. - Use bigframes.ml.pipeline instead. - Returns: XGBModel: Fitted Estimator. """
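For reference, a minimal usage sketch of the frame-level methods documented in this change (``nlargest``/``nsmallest``, ``pct_change``, ``combine_first``). This is illustrative only and not part of the patch: it assumes a configured BigQuery DataFrames session (project and location set as described in the README), uses arbitrary column names and data, and expects the concrete implementations to follow the vendored pandas docstrings above.

.. code-block::

    import bigframes.pandas as bpd

    df = bpd.DataFrame({"a": [1, 2, 3, 4], "b": [4.0, 2.0, 2.0, 1.0]})

    # Largest two rows ordered by "b"; keep="all" also keeps ties beyond n.
    df.nlargest(2, "b", keep="all")

    # Fractional (not percentage) change from the previous row.
    df.pct_change()

    # Fill nulls in `left` from `right`; non-null values in `left` take precedence.
    left = bpd.DataFrame({"a": [None, 2.0], "b": [3.0, None]})
    right = bpd.DataFrame({"a": [10.0, 20.0], "b": [30.0, 40.0]})
    left.combine_first(right)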
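A similar sketch for the newly documented Series and GroupBy pieces (``ffill``, ``replace``, ``reindex``, ``GroupBy.skew``), under the same assumptions: a configured session, illustrative data, and behavior matching the docstrings vendored above rather than a verified implementation.

.. code-block::

    import bigframes.pandas as bpd

    s = bpd.Series([1.0, None, None, 4.0])

    # Propagate the last valid observation forward; `limit` caps consecutive fills.
    s.ffill(limit=1)

    # Replace values equal to `to_replace` with `value`.
    s.replace(4.0, 40.0)

    # Conform to a new index; labels absent from the original index become NA.
    s.reindex([0, 1, 5])

    # Unbiased skew of the values within each group.
    df = bpd.DataFrame(
        {
            "key": ["x", "x", "x", "y", "y", "y"],
            "val": [1.0, 2.0, 10.0, 3.0, 3.5, 4.0],
        }
    )
    df.groupby("key").skew()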