Skip to content

Commit 56be8b0

Browse files
ueshin authored and HyukjinKwon committed
[SPARK-51983][PS] Prepare the test environment for pandas API on Spark with ANSI mode enabled
### What changes were proposed in this pull request? Prepares the test environment for pandas API on Spark with ANSI mode enabled. - Remove forcibly disabling ANSI mode in tests - Add a new option `compute.ansi_mode_support` to keep the current behavior (default `False`) - eventually it should be `True` by default - Skip the failed tests affected by ANSI mode - Make `pyspark-pandas` tests run in the nightly Non-ANSI test to also run skipped tests with Non-ANSI mode ### Why are the changes needed? Currently pandas API on Spark doesn't support ANSI mode and show warnings if it's enabled. ```py >>> import pyspark.pandas as ps >>> ps.range(10) ...: PandasAPIOnSparkAdviceWarning: The config 'spark.sql.ansi.enabled' is set to True. This can cause unexpected behavior from pandas API on Spark since pandas API on Spark follows the behavior of pandas, not SQL. warnings.warn(message, PandasAPIOnSparkAdviceWarning) ... ``` Now ANSI mode is enabled by default, pandas API on Spark should also support it. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? The existing tests should pass. ### Was this patch authored or co-authored using generative AI tooling? Closes #50779 from ueshin/issues/SPARK-51983/pandas_ansi_test. Authored-by: Takuya Ueshin <[email protected]> Signed-off-by: Hyukjin Kwon <[email protected]>
1 parent b957d7f commit 56be8b0

27 files changed

+149
-14
lines changed

.github/workflows/build_non_ansi.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ jobs:
4646
"build": "true",
4747
"docs": "true",
4848
"pyspark": "true",
49+
"pyspark-pandas": "true",
4950
"sparkr": "true",
5051
"tpcds-1g": "true",
5152
"docker-integration-tests": "true",

python/docs/source/tutorial/pandas_on_spark/options.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,10 @@ compute.isin_limit 80 'compute.isin_limit' set
319319
better performance.
320320
compute.pandas_fallback False 'compute.pandas_fallback' sets whether or not to
321321
fallback automatically to Pandas' implementation.
322+
compute.ansi_mode_support False 'compute.ansi_mode_support' sets whether or not to
323+
support the ANSI mode of the underlying Spark. If
324+
False, pandas API on Spark may hit unexpected results
325+
or errors. The default is False.
322326
plotting.max_rows 1000 'plotting.max_rows' sets the visual limit on top-n-
323327
based plots such as `plot.bar` and `plot.pie`. If it
324328
is set to 1000, the first 1000 data points will be

python/pyspark/pandas/config.py

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,10 @@
2020
"""
2121
from contextlib import contextmanager
2222
import json
23-
from typing import Any, Callable, Dict, Iterator, List, Tuple, Union
23+
from typing import Any, Callable, Dict, Iterator, List, Tuple, Union, Optional
2424

2525
from pyspark._globals import _NoValue, _NoValueType
26+
from pyspark.sql.session import SparkSession
2627
from pyspark.pandas.utils import default_session
2728

2829

@@ -268,6 +269,17 @@ def validate(self, v: Any) -> None:
268269
default=False,
269270
types=bool,
270271
),
272+
Option(
273+
key="compute.ansi_mode_support",
274+
doc=(
275+
"'compute.ansi_mode_support' sets whether or not to support the ANSI mode of "
276+
"the underlying Spark. "
277+
"If False, pandas API on Spark may hit unexpected results or errors. "
278+
"The default is False."
279+
),
280+
default=False,
281+
types=bool,
282+
),
271283
Option(
272284
key="plotting.max_rows",
273285
doc=(
@@ -351,7 +363,12 @@ def show_options() -> None:
351363
print(row_format.format("=" * 31, "=" * 23, "=" * 53))
352364

353365

354-
def get_option(key: str, default: Union[Any, _NoValueType] = _NoValue) -> Any:
366+
def get_option(
367+
key: str,
368+
default: Union[Any, _NoValueType] = _NoValue,
369+
*,
370+
spark_session: Optional[SparkSession] = None,
371+
) -> Any:
355372
"""
356373
Retrieves the value of the specified option.
357374
@@ -361,6 +378,9 @@ def get_option(key: str, default: Union[Any, _NoValueType] = _NoValue) -> Any:
361378
The key which should match a single option.
362379
default : object
363380
The default value if the option is not set yet. The value should be JSON serializable.
381+
spark_session : :class:`SparkSession`, optional
382+
The explicit :class:`SparkSession` object to get the option.
383+
If not specified, the default session will be used.
364384
365385
Returns
366386
-------
@@ -374,12 +394,12 @@ def get_option(key: str, default: Union[Any, _NoValueType] = _NoValue) -> Any:
374394
if default is _NoValue:
375395
default = _options_dict[key].default
376396
_options_dict[key].validate(default)
377-
spark_session = default_session()
397+
spark_session = spark_session or default_session()
378398

379399
return json.loads(spark_session.conf.get(_key_format(key), default=json.dumps(default)))
380400

381401

382-
def set_option(key: str, value: Any) -> None:
402+
def set_option(key: str, value: Any, *, spark_session: Optional[SparkSession] = None) -> None:
383403
"""
384404
Sets the value of the specified option.
385405
@@ -389,19 +409,22 @@ def set_option(key: str, value: Any) -> None:
389409
The key which should match a single option.
390410
value : object
391411
New value of option. The value should be JSON serializable.
412+
spark_session : :class:`SparkSession`, optional
413+
The explicit :class:`SparkSession` object to set the option.
414+
If not specified, the default session will be used.
392415
393416
Returns
394417
-------
395418
None
396419
"""
397420
_check_option(key)
398421
_options_dict[key].validate(value)
399-
spark_session = default_session()
422+
spark_session = spark_session or default_session()
400423

401424
spark_session.conf.set(_key_format(key), json.dumps(value))
402425

403426

404-
def reset_option(key: str) -> None:
427+
def reset_option(key: str, *, spark_session: Optional[SparkSession] = None) -> None:
405428
"""
406429
Reset one option to their default value.
407430
@@ -411,13 +434,17 @@ def reset_option(key: str) -> None:
411434
----------
412435
key : str
413436
If specified only option will be reset.
437+
spark_session : :class:`SparkSession`, optional
438+
The explicit :class:`SparkSession` object to reset the option.
439+
If not specified, the default session will be used.
414440
415441
Returns
416442
-------
417443
None
418444
"""
419445
_check_option(key)
420-
default_session().conf.unset(_key_format(key))
446+
spark_session = spark_session or default_session()
447+
spark_session.conf.unset(_key_format(key))
421448

422449

423450
@contextmanager

python/pyspark/pandas/frame.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13778,11 +13778,32 @@ def _test() -> None:
1377813778
import uuid
1377913779
from pyspark.sql import SparkSession
1378013780
import pyspark.pandas.frame
13781+
from pyspark.testing.utils import is_ansi_mode_test
1378113782

1378213783
os.chdir(os.environ["SPARK_HOME"])
1378313784

1378413785
globs = pyspark.pandas.frame.__dict__.copy()
1378513786
globs["ps"] = pyspark.pandas
13787+
13788+
if is_ansi_mode_test:
13789+
del pyspark.pandas.frame.DataFrame.add.__doc__
13790+
del pyspark.pandas.frame.DataFrame.div.__doc__
13791+
del pyspark.pandas.frame.DataFrame.floordiv.__doc__
13792+
del pyspark.pandas.frame.DataFrame.melt.__doc__
13793+
del pyspark.pandas.frame.DataFrame.mod.__doc__
13794+
del pyspark.pandas.frame.DataFrame.mul.__doc__
13795+
del pyspark.pandas.frame.DataFrame.pow.__doc__
13796+
del pyspark.pandas.frame.DataFrame.radd.__doc__
13797+
del pyspark.pandas.frame.DataFrame.rdiv.__doc__
13798+
del pyspark.pandas.frame.DataFrame.rfloordiv.__doc__
13799+
del pyspark.pandas.frame.DataFrame.rmod.__doc__
13800+
del pyspark.pandas.frame.DataFrame.rmul.__doc__
13801+
del pyspark.pandas.frame.DataFrame.rpow.__doc__
13802+
del pyspark.pandas.frame.DataFrame.rsub.__doc__
13803+
del pyspark.pandas.frame.DataFrame.rtruediv.__doc__
13804+
del pyspark.pandas.frame.DataFrame.sub.__doc__
13805+
del pyspark.pandas.frame.DataFrame.truediv.__doc__
13806+
1378613807
spark = (
1378713808
SparkSession.builder.master("local[4]").appName("pyspark.pandas.frame tests").getOrCreate()
1378813809
)

python/pyspark/pandas/groupby.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4595,12 +4595,17 @@ def _test() -> None:
45954595
import numpy
45964596
from pyspark.sql import SparkSession
45974597
import pyspark.pandas.groupby
4598+
from pyspark.testing.utils import is_ansi_mode_test
45984599

45994600
os.chdir(os.environ["SPARK_HOME"])
46004601

46014602
globs = pyspark.pandas.groupby.__dict__.copy()
46024603
globs["np"] = numpy
46034604
globs["ps"] = pyspark.pandas
4605+
4606+
if is_ansi_mode_test:
4607+
del pyspark.pandas.groupby.DataFrameGroupBy.corr.__doc__
4608+
46044609
spark = (
46054610
SparkSession.builder.master("local[4]")
46064611
.appName("pyspark.pandas.groupby tests")

python/pyspark/pandas/namespace.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3874,6 +3874,7 @@ def _test() -> None:
38743874
from pyspark.sql import SparkSession
38753875
import pyspark.pandas.namespace
38763876
from pandas.util.version import Version
3877+
from pyspark.testing.utils import is_ansi_mode_test
38773878

38783879
os.chdir(os.environ["SPARK_HOME"])
38793880

@@ -3886,6 +3887,11 @@ def _test() -> None:
38863887
globs = pyspark.pandas.namespace.__dict__.copy()
38873888
globs["ps"] = pyspark.pandas
38883889
globs["sf"] = F
3890+
3891+
if is_ansi_mode_test:
3892+
del pyspark.pandas.namespace.melt.__doc__
3893+
del pyspark.pandas.namespace.to_numeric.__doc__
3894+
38893895
spark = (
38903896
SparkSession.builder.master("local[4]")
38913897
.appName("pyspark.pandas.namespace tests")

python/pyspark/pandas/series.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7336,11 +7336,16 @@ def _test() -> None:
73367336
import sys
73377337
from pyspark.sql import SparkSession
73387338
import pyspark.pandas.series
7339+
from pyspark.testing.utils import is_ansi_mode_test
73397340

73407341
os.chdir(os.environ["SPARK_HOME"])
73417342

73427343
globs = pyspark.pandas.series.__dict__.copy()
73437344
globs["ps"] = pyspark.pandas
7345+
7346+
if is_ansi_mode_test:
7347+
del pyspark.pandas.series.Series.autocorr.__doc__
7348+
73447349
spark = (
73457350
SparkSession.builder.master("local[4]").appName("pyspark.pandas.series tests").getOrCreate()
73467351
)

python/pyspark/pandas/tests/computation/test_corr.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,11 @@
2222
from pyspark import pandas as ps
2323
from pyspark.testing.pandasutils import PandasOnSparkTestCase, SPARK_CONF_ARROW_ENABLED
2424
from pyspark.testing.sqlutils import SQLTestUtils
25+
from pyspark.testing.utils import is_ansi_mode_test, ansi_mode_not_supported_message
2526

2627

2728
class FrameCorrMixin:
29+
@unittest.skipIf(is_ansi_mode_test, ansi_mode_not_supported_message)
2830
def test_dataframe_corr(self):
2931
pdf = pd.DataFrame(
3032
index=[

python/pyspark/pandas/tests/computation/test_missing_data.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from pyspark import pandas as ps
2424
from pyspark.testing.pandasutils import PandasOnSparkTestCase
2525
from pyspark.testing.sqlutils import SQLTestUtils
26+
from pyspark.testing.utils import is_ansi_mode_test, ansi_mode_not_supported_message
2627

2728

2829
# This file contains test cases for 'Missing data handling'
@@ -273,6 +274,7 @@ def test_fillna(self):
273274
pdf.fillna({("x", "a"): -1, ("x", "b"): -2, ("y", "c"): -5}),
274275
)
275276

277+
@unittest.skipIf(is_ansi_mode_test, ansi_mode_not_supported_message)
276278
def test_replace(self):
277279
pdf = pd.DataFrame(
278280
{

python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from pyspark import pandas as ps
2626
from pyspark.pandas import option_context
2727
from pyspark.testing.pandasutils import PandasOnSparkTestCase
28+
from pyspark.testing.utils import is_ansi_mode_test, ansi_mode_not_supported_message
2829
from pyspark.pandas.tests.data_type_ops.testing_utils import OpsTestBase
2930
from pyspark.pandas.typedef.typehints import (
3031
extension_float_dtypes_available,
@@ -99,6 +100,7 @@ def test_mul(self):
99100
else:
100101
self.assertRaises(TypeError, lambda: b_psser * psser)
101102

103+
@unittest.skipIf(is_ansi_mode_test, ansi_mode_not_supported_message)
102104
def test_truediv(self):
103105
pdf, psdf = self.pdf, self.psdf
104106

@@ -114,6 +116,7 @@ def test_truediv(self):
114116
for col in self.non_numeric_df_cols:
115117
self.assertRaises(TypeError, lambda: b_psser / psdf[col])
116118

119+
@unittest.skipIf(is_ansi_mode_test, ansi_mode_not_supported_message)
117120
def test_floordiv(self):
118121
pdf, psdf = self.pdf, self.psdf
119122

@@ -134,6 +137,7 @@ def test_floordiv(self):
134137
for col in self.non_numeric_df_cols:
135138
self.assertRaises(TypeError, lambda: b_psser // psdf[col])
136139

140+
@unittest.skipIf(is_ansi_mode_test, ansi_mode_not_supported_message)
137141
def test_mod(self):
138142
pdf, psdf = self.pdf, self.psdf
139143

@@ -233,6 +237,7 @@ def test_rpow(self):
233237
self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) ** b_psser)
234238
self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) ** b_psser)
235239

240+
@unittest.skipIf(is_ansi_mode_test, ansi_mode_not_supported_message)
236241
def test_rmod(self):
237242
psdf = self.psdf
238243

python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
from pyspark import pandas as ps
2323
from pyspark.testing.pandasutils import PandasOnSparkTestCase
24+
from pyspark.testing.utils import is_ansi_mode_test, ansi_mode_not_supported_message
2425
from pyspark.pandas.tests.data_type_ops.testing_utils import OpsTestBase
2526

2627

@@ -40,6 +41,7 @@ def float_pser(self):
4041
def float_psser(self):
4142
return ps.from_pandas(self.float_pser)
4243

44+
@unittest.skipIf(is_ansi_mode_test, ansi_mode_not_supported_message)
4345
def test_add(self):
4446
pdf, psdf = self.pdf, self.psdf
4547
for col in self.numeric_df_cols:
@@ -57,6 +59,7 @@ def test_add(self):
5759
else:
5860
self.assertRaises(TypeError, lambda: psser + psdf[n_col])
5961

62+
@unittest.skipIf(is_ansi_mode_test, ansi_mode_not_supported_message)
6063
def test_sub(self):
6164
pdf, psdf = self.pdf, self.psdf
6265
for col in self.numeric_df_cols:

python/pyspark/pandas/tests/data_type_ops/test_num_mod.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222

2323
from pyspark import pandas as ps
2424
from pyspark.testing.pandasutils import PandasOnSparkTestCase
25+
from pyspark.testing.utils import is_ansi_mode_test, ansi_mode_not_supported_message
2526
from pyspark.pandas.tests.data_type_ops.testing_utils import OpsTestBase
2627

2728

@@ -34,6 +35,7 @@ def float_pser(self):
3435
def float_psser(self):
3536
return ps.from_pandas(self.float_pser)
3637

38+
@unittest.skipIf(is_ansi_mode_test, ansi_mode_not_supported_message)
3739
def test_mod(self):
3840
pdf, psdf = self.pdf, self.psdf
3941
for col in self.numeric_df_cols:

python/pyspark/pandas/tests/data_type_ops/test_num_mul_div.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222

2323
from pyspark import pandas as ps
2424
from pyspark.testing.pandasutils import PandasOnSparkTestCase
25+
from pyspark.testing.utils import is_ansi_mode_test, ansi_mode_not_supported_message
2526
from pyspark.pandas.tests.data_type_ops.testing_utils import OpsTestBase
2627

2728

@@ -34,6 +35,7 @@ def float_pser(self):
3435
def float_psser(self):
3536
return ps.from_pandas(self.float_pser)
3637

38+
@unittest.skipIf(is_ansi_mode_test, ansi_mode_not_supported_message)
3739
def test_mul(self):
3840
pdf, psdf = self.pdf, self.psdf
3941
for col in self.numeric_df_cols:

python/pyspark/pandas/tests/data_type_ops/test_num_reverse.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222

2323
from pyspark import pandas as ps
2424
from pyspark.testing.pandasutils import PandasOnSparkTestCase
25+
from pyspark.testing.utils import is_ansi_mode_test, ansi_mode_not_supported_message
2526
from pyspark.pandas.tests.data_type_ops.testing_utils import OpsTestBase
2627

2728

@@ -41,6 +42,7 @@ def float_pser(self):
4142
def float_psser(self):
4243
return ps.from_pandas(self.float_pser)
4344

45+
@unittest.skipIf(is_ansi_mode_test, ansi_mode_not_supported_message)
4446
def test_radd(self):
4547
pdf, psdf = self.pdf, self.psdf
4648
for col in self.numeric_df_cols:
@@ -53,6 +55,7 @@ def test_radd(self):
5355
self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) + psser)
5456
self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) + psser)
5557

58+
@unittest.skipIf(is_ansi_mode_test, ansi_mode_not_supported_message)
5659
def test_rsub(self):
5760
pdf, psdf = self.pdf, self.psdf
5861
for col in self.numeric_df_cols:
@@ -65,6 +68,7 @@ def test_rsub(self):
6568
self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) - psser)
6669
self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) - psser)
6770

71+
@unittest.skipIf(is_ansi_mode_test, ansi_mode_not_supported_message)
6872
def test_rmul(self):
6973
pdf, psdf = self.pdf, self.psdf
7074
for col in self.numeric_df_cols:
@@ -113,6 +117,7 @@ def test_rpow(self):
113117
self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) ** psser)
114118
self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) ** psser)
115119

120+
@unittest.skipIf(is_ansi_mode_test, ansi_mode_not_supported_message)
116121
def test_rmod(self):
117122
pdf, psdf = self.pdf, self.psdf
118123
for col in self.numeric_df_cols:

0 commit comments

Comments
 (0)