diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index b79797fa86431..8056e6f3393e9 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -21,7 +21,7 @@ PyArrow will become a required dependency with pandas 3.0 `PyArrow `_ will become a required dependency of pandas starting with pandas 3.0. This decision was made based on -`PDEP 12 `_. +`PDEP 10 `_. This will enable more changes that are hugely beneficial to pandas users, including but not limited to: @@ -41,7 +41,9 @@ Avoid NumPy object dtype for strings by default Previously, all strings were stored in columns with NumPy object dtype. This release introduces an option ``future.infer_string`` that infers all -strings as PyArrow backed strings with dtype ``pd.ArrowDtype(pa.string())`` instead. +strings as PyArrow backed strings with dtype ``"string[pyarrow_numpy]"`` instead. +This is a new string dtype implementation that follows NumPy semantics in comparison +operations and will return ``np.nan`` as the missing value indicator. This option only works if PyArrow is installed. PyArrow backed strings have a significantly reduced memory footprint and provide a big performance improvement compared to NumPy object (:issue:`54430`). diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 2681115bbdcfb..859cb8e5ebead 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2682,11 +2682,9 @@ def maybe_convert_objects(ndarray[object] objects, elif seen.str_: if using_pyarrow_string_dtype() and is_string_array(objects, skipna=True): - import pyarrow as pa + from pandas.core.arrays.string_ import StringDtype - from pandas.core.dtypes.dtypes import ArrowDtype - - dtype = ArrowDtype(pa.string()) + dtype = StringDtype(storage="pyarrow_numpy") return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) seen.object_ = True diff --git a/pandas/core/construction.py b/pandas/core/construction.py index f733ba3b445fd..aaac0dc73486f 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -51,10 +51,7 @@ is_object_dtype, pandas_dtype, ) -from pandas.core.dtypes.dtypes import ( - ArrowDtype, - NumpyEADtype, -) +from pandas.core.dtypes.dtypes import NumpyEADtype from pandas.core.dtypes.generic import ( ABCDataFrame, ABCExtensionArray, @@ -595,9 +592,9 @@ def sanitize_array( if data.dtype == object: subarr = maybe_infer_to_datetimelike(data) elif data.dtype.kind == "U" and using_pyarrow_string_dtype(): - import pyarrow as pa + from pandas.core.arrays.string_ import StringDtype - dtype = ArrowDtype(pa.string()) + dtype = StringDtype(storage="pyarrow_numpy") subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype) if subarr is data and copy: diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index aa228191adc62..1d5db123068e2 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -799,10 +799,9 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]: dtype = _dtype_obj if using_pyarrow_string_dtype(): - import pyarrow as pa + from pandas.core.arrays.string_ import StringDtype - pa_dtype = pa.string() - dtype = ArrowDtype(pa_dtype) + dtype = StringDtype(storage="pyarrow_numpy") elif isinstance(val, (np.datetime64, dt.datetime)): try: diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index f9f9e6d053b5f..5c8873d4324e0 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -32,10 +32,7 @@ is_named_tuple, is_object_dtype, ) -from pandas.core.dtypes.dtypes import ( - ArrowDtype, - ExtensionDtype, -) +from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ( ABCDataFrame, ABCSeries, @@ -379,10 +376,9 @@ def ndarray_to_mgr( nb = new_block_2d(values, placement=bp, refs=refs) block_values = [nb] elif dtype is None and values.dtype.kind == "U" and using_pyarrow_string_dtype(): - import pyarrow as pa + dtype = StringDtype(storage="pyarrow_numpy") obj_columns = list(values) - dtype = ArrowDtype(pa.string()) block_values = [ new_block( dtype.construct_array_type()._from_sequence(data, dtype=dtype), diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 27316b3ab0af0..915595833468d 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -28,4 +28,4 @@ def _arrow_dtype_mapping() -> dict: def arrow_string_types_mapper() -> Callable: pa = import_optional_dependency("pyarrow") - return {pa.string(): pd.ArrowDtype(pa.string())}.get + return {pa.string(): pd.StringDtype(storage="pyarrow_numpy")}.get diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 89c3f7bbc4f84..fb0354ef9df6c 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -68,7 +68,6 @@ ) from pandas.core.dtypes.missing import array_equivalent -import pandas as pd from pandas import ( DataFrame, DatetimeIndex, @@ -3224,9 +3223,7 @@ def read( values = self.read_array("values", start=start, stop=stop) result = Series(values, index=index, name=self.name, copy=False) if using_pyarrow_string_dtype() and is_string_array(values, skipna=True): - import pyarrow as pa - - result = result.astype(pd.ArrowDtype(pa.string())) + result = result.astype("string[pyarrow_numpy]") return result # error: Signature of "write" incompatible with supertype "Fixed" @@ -3296,9 +3293,7 @@ def read( columns = items[items.get_indexer(blk_items)] df = DataFrame(values.T, columns=columns, index=axes[1], copy=False) if using_pyarrow_string_dtype() and is_string_array(values, skipna=True): - import pyarrow as pa - - df = df.astype(pd.ArrowDtype(pa.string())) + df = df.astype("string[pyarrow_numpy]") dfs.append(df) if len(dfs) > 0: @@ -4686,9 +4681,7 @@ def read( values, # type: ignore[arg-type] skipna=True, ): - import pyarrow as pa - - df = df.astype(pd.ArrowDtype(pa.string())) + df = df.astype("string[pyarrow_numpy]") frames.append(df) if len(frames) == 1: diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 63cddb7f192e6..a8ab3ce1ba014 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2685,8 +2685,8 @@ def test_construct_with_strings_and_none(self): def test_frame_string_inference(self): # GH#54430 - pa = pytest.importorskip("pyarrow") - dtype = pd.ArrowDtype(pa.string()) + pytest.importorskip("pyarrow") + dtype = "string[pyarrow_numpy]" expected = DataFrame( {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) ) @@ -2720,8 +2720,8 @@ def test_frame_string_inference(self): def test_frame_string_inference_array_string_dtype(self): # GH#54496 - pa = pytest.importorskip("pyarrow") - dtype = pd.ArrowDtype(pa.string()) + pytest.importorskip("pyarrow") + dtype = "string[pyarrow_numpy]" expected = DataFrame( {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) ) diff --git a/pandas/tests/indexes/base_class/test_constructors.py b/pandas/tests/indexes/base_class/test_constructors.py index 638124ac20e06..60abbfc441e8e 100644 --- a/pandas/tests/indexes/base_class/test_constructors.py +++ b/pandas/tests/indexes/base_class/test_constructors.py @@ -46,8 +46,8 @@ def test_construct_empty_tuples(self, tuple_list): def test_index_string_inference(self): # GH#54430 - pa = pytest.importorskip("pyarrow") - dtype = pd.ArrowDtype(pa.string()) + pytest.importorskip("pyarrow") + dtype = "string[pyarrow_numpy]" expected = Index(["a", "b"], dtype=dtype) with pd.option_context("future.infer_string", True): ser = Index(["a", "b"]) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 4ee9e1e2d1598..ca3ce6ba34515 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -2095,7 +2095,7 @@ def test_pyarrow_engine_lines_false(): def test_json_roundtrip_string_inference(orient): - pa = pytest.importorskip("pyarrow") + pytest.importorskip("pyarrow") df = DataFrame( [["a", "b"], ["c", "d"]], index=["row 1", "row 2"], columns=["col 1", "col 2"] ) @@ -2104,9 +2104,9 @@ def test_json_roundtrip_string_inference(orient): result = read_json(StringIO(out)) expected = DataFrame( [["a", "b"], ["c", "d"]], - dtype=pd.ArrowDtype(pa.string()), - index=pd.Index(["row 1", "row 2"], dtype=pd.ArrowDtype(pa.string())), - columns=pd.Index(["col 1", "col 2"], dtype=pd.ArrowDtype(pa.string())), + dtype="string[pyarrow_numpy]", + index=pd.Index(["row 1", "row 2"], dtype="string[pyarrow_numpy]"), + columns=pd.Index(["col 1", "col 2"], dtype="string[pyarrow_numpy]"), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 1c0f0939029ff..8494c5d58171b 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -542,8 +542,8 @@ def test_ea_int_avoid_overflow(all_parsers): def test_string_inference(all_parsers): # GH#54430 - pa = pytest.importorskip("pyarrow") - dtype = pd.ArrowDtype(pa.string()) + pytest.importorskip("pyarrow") + dtype = "string[pyarrow_numpy]" data = """a,b x,1 diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py index 72fc2361c5053..9b4590ca3f01f 100644 --- a/pandas/tests/io/pytables/test_read.py +++ b/pandas/tests/io/pytables/test_read.py @@ -392,7 +392,7 @@ def test_read_py2_hdf_file_in_py3(datapath): def test_read_infer_string(tmp_path, setup_path): # GH#54431 - pa = pytest.importorskip("pyarrow") + pytest.importorskip("pyarrow") df = DataFrame({"a": ["a", "b", None]}) path = tmp_path / setup_path df.to_hdf(path, key="data", format="table") @@ -400,7 +400,7 @@ def test_read_infer_string(tmp_path, setup_path): result = read_hdf(path, key="data", mode="r") expected = DataFrame( {"a": ["a", "b", None]}, - dtype=pd.ArrowDtype(pa.string()), - columns=Index(["a"], dtype=pd.ArrowDtype(pa.string())), + dtype="string[pyarrow_numpy]", + columns=Index(["a"], dtype="string[pyarrow_numpy]"), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index a0fee6751bf53..cf43203466ef4 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -222,14 +222,10 @@ def test_invalid_dtype_backend(self): def test_string_inference(self, tmp_path): # GH#54431 - import pyarrow as pa - path = tmp_path / "test_string_inference.p" df = pd.DataFrame(data={"a": ["x", "y"]}) df.to_feather(path) with pd.option_context("future.infer_string", True): result = read_feather(path) - expected = pd.DataFrame( - data={"a": ["x", "y"]}, dtype=pd.ArrowDtype(pa.string()) - ) + expected = pd.DataFrame(data={"a": ["x", "y"]}, dtype="string[pyarrow_numpy]") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index c2d791ba24c87..d90f803f1e607 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -426,7 +426,7 @@ def test_string_inference(tmp_path): result = read_orc(path) expected = pd.DataFrame( data={"a": ["x", "y"]}, - dtype=pd.ArrowDtype(pa.string()), - columns=pd.Index(["a"], dtype=pd.ArrowDtype(pa.string())), + dtype="string[pyarrow_numpy]", + columns=pd.Index(["a"], dtype="string[pyarrow_numpy]"), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 9182e4c4e7674..6ca71b9507322 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1113,8 +1113,6 @@ def test_df_attrs_persistence(self, tmp_path, pa): def test_string_inference(self, tmp_path, pa): # GH#54431 - import pyarrow as pa - path = tmp_path / "test_string_inference.p" df = pd.DataFrame(data={"a": ["x", "y"]}, index=["a", "b"]) df.to_parquet(path, engine="pyarrow") @@ -1122,8 +1120,8 @@ def test_string_inference(self, tmp_path, pa): result = read_parquet(path, engine="pyarrow") expected = pd.DataFrame( data={"a": ["x", "y"]}, - dtype=pd.ArrowDtype(pa.string()), - index=pd.Index(["a", "b"], dtype=pd.ArrowDtype(pa.string())), + dtype="string[pyarrow_numpy]", + index=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 02736406e109b..2f446e6b8c81d 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2946,7 +2946,7 @@ def test_read_sql_dtype_backend_table(self, string_storage, func): def test_read_sql_string_inference(self): # GH#54430 - pa = pytest.importorskip("pyarrow") + pytest.importorskip("pyarrow") table = "test" df = DataFrame({"a": ["x", "y"]}) df.to_sql(table, con=self.conn, index=False, if_exists="replace") @@ -2954,7 +2954,7 @@ def test_read_sql_string_inference(self): with pd.option_context("future.infer_string", True): result = read_sql_table(table, self.conn) - dtype = pd.ArrowDtype(pa.string()) + dtype = "string[pyarrow_numpy]" expected = DataFrame( {"a": ["x", "y"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) ) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 97bd8633954d8..ef734e9664844 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -2077,8 +2077,8 @@ def test_series_from_index_dtype_equal_does_not_copy(self): def test_series_string_inference(self): # GH#54430 - pa = pytest.importorskip("pyarrow") - dtype = pd.ArrowDtype(pa.string()) + pytest.importorskip("pyarrow") + dtype = "string[pyarrow_numpy]" expected = Series(["a", "b"], dtype=dtype) with pd.option_context("future.infer_string", True): ser = Series(["a", "b"]) @@ -2092,8 +2092,8 @@ def test_series_string_inference(self): @pytest.mark.parametrize("na_value", [None, np.nan, pd.NA]) def test_series_string_with_na_inference(self, na_value): # GH#54430 - pa = pytest.importorskip("pyarrow") - dtype = pd.ArrowDtype(pa.string()) + pytest.importorskip("pyarrow") + dtype = "string[pyarrow_numpy]" expected = Series(["a", na_value], dtype=dtype) with pd.option_context("future.infer_string", True): ser = Series(["a", na_value]) @@ -2101,16 +2101,16 @@ def test_series_string_with_na_inference(self, na_value): def test_series_string_inference_scalar(self): # GH#54430 - pa = pytest.importorskip("pyarrow") - expected = Series("a", index=[1], dtype=pd.ArrowDtype(pa.string())) + pytest.importorskip("pyarrow") + expected = Series("a", index=[1], dtype="string[pyarrow_numpy]") with pd.option_context("future.infer_string", True): ser = Series("a", index=[1]) tm.assert_series_equal(ser, expected) def test_series_string_inference_array_string_dtype(self): # GH#54496 - pa = pytest.importorskip("pyarrow") - expected = Series(["a", "b"], dtype=pd.ArrowDtype(pa.string())) + pytest.importorskip("pyarrow") + expected = Series(["a", "b"], dtype="string[pyarrow_numpy]") with pd.option_context("future.infer_string", True): ser = Series(np.array(["a", "b"])) tm.assert_series_equal(ser, expected)