Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Infer strings as pyarrow_numpy backed strings #54720

Merged
merged 2 commits into from
Aug 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ PyArrow will become a required dependency with pandas 3.0

`PyArrow <https://arrow.apache.org/docs/python/index.html>`_ will become a required
dependency of pandas starting with pandas 3.0. This decision was made based on
`PDEP 12 <https://pandas.pydata.org/pdeps/0010-required-pyarrow-dependency.html>`_.
`PDEP 10 <https://pandas.pydata.org/pdeps/0010-required-pyarrow-dependency.html>`_.

This will enable more changes that are hugely beneficial to pandas users, including
but not limited to:
Expand All @@ -41,7 +41,9 @@ Avoid NumPy object dtype for strings by default

Previously, all strings were stored in columns with NumPy object dtype.
This release introduces an option ``future.infer_string`` that infers all
strings as PyArrow backed strings with dtype ``pd.ArrowDtype(pa.string())`` instead.
strings as PyArrow-backed strings with dtype ``"string[pyarrow_numpy]"`` instead.
This is a new string dtype implementation that follows NumPy semantics in comparison
operations and uses ``np.nan`` as the missing value indicator.
This option only works if PyArrow is installed. PyArrow-backed strings have a
significantly reduced memory footprint and provide a big performance improvement
compared to NumPy object dtype (:issue:`54430`).
Expand Down
6 changes: 2 additions & 4 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2682,11 +2682,9 @@ def maybe_convert_objects(ndarray[object] objects,

elif seen.str_:
if using_pyarrow_string_dtype() and is_string_array(objects, skipna=True):
import pyarrow as pa
from pandas.core.arrays.string_ import StringDtype

from pandas.core.dtypes.dtypes import ArrowDtype

dtype = ArrowDtype(pa.string())
dtype = StringDtype(storage="pyarrow_numpy")
return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)

seen.object_ = True
Expand Down
9 changes: 3 additions & 6 deletions pandas/core/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,7 @@
is_object_dtype,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import (
ArrowDtype,
NumpyEADtype,
)
from pandas.core.dtypes.dtypes import NumpyEADtype
from pandas.core.dtypes.generic import (
ABCDataFrame,
ABCExtensionArray,
Expand Down Expand Up @@ -595,9 +592,9 @@ def sanitize_array(
if data.dtype == object:
subarr = maybe_infer_to_datetimelike(data)
elif data.dtype.kind == "U" and using_pyarrow_string_dtype():
import pyarrow as pa
from pandas.core.arrays.string_ import StringDtype

dtype = ArrowDtype(pa.string())
dtype = StringDtype(storage="pyarrow_numpy")
subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype)

if subarr is data and copy:
Expand Down
5 changes: 2 additions & 3 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -799,10 +799,9 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]:

dtype = _dtype_obj
if using_pyarrow_string_dtype():
import pyarrow as pa
from pandas.core.arrays.string_ import StringDtype

pa_dtype = pa.string()
dtype = ArrowDtype(pa_dtype)
dtype = StringDtype(storage="pyarrow_numpy")

elif isinstance(val, (np.datetime64, dt.datetime)):
try:
Expand Down
8 changes: 2 additions & 6 deletions pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,7 @@
is_named_tuple,
is_object_dtype,
)
from pandas.core.dtypes.dtypes import (
ArrowDtype,
ExtensionDtype,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.generic import (
ABCDataFrame,
ABCSeries,
Expand Down Expand Up @@ -379,10 +376,9 @@ def ndarray_to_mgr(
nb = new_block_2d(values, placement=bp, refs=refs)
block_values = [nb]
elif dtype is None and values.dtype.kind == "U" and using_pyarrow_string_dtype():
import pyarrow as pa
dtype = StringDtype(storage="pyarrow_numpy")

obj_columns = list(values)
dtype = ArrowDtype(pa.string())
block_values = [
new_block(
dtype.construct_array_type()._from_sequence(data, dtype=dtype),
Expand Down
2 changes: 1 addition & 1 deletion pandas/io/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,4 @@ def _arrow_dtype_mapping() -> dict:
def arrow_string_types_mapper() -> Callable:
pa = import_optional_dependency("pyarrow")

return {pa.string(): pd.ArrowDtype(pa.string())}.get
return {pa.string(): pd.StringDtype(storage="pyarrow_numpy")}.get
13 changes: 3 additions & 10 deletions pandas/io/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@
)
from pandas.core.dtypes.missing import array_equivalent

import pandas as pd
from pandas import (
DataFrame,
DatetimeIndex,
Expand Down Expand Up @@ -3224,9 +3223,7 @@ def read(
values = self.read_array("values", start=start, stop=stop)
result = Series(values, index=index, name=self.name, copy=False)
if using_pyarrow_string_dtype() and is_string_array(values, skipna=True):
import pyarrow as pa

result = result.astype(pd.ArrowDtype(pa.string()))
result = result.astype("string[pyarrow_numpy]")
return result

# error: Signature of "write" incompatible with supertype "Fixed"
Expand Down Expand Up @@ -3296,9 +3293,7 @@ def read(
columns = items[items.get_indexer(blk_items)]
df = DataFrame(values.T, columns=columns, index=axes[1], copy=False)
if using_pyarrow_string_dtype() and is_string_array(values, skipna=True):
import pyarrow as pa

df = df.astype(pd.ArrowDtype(pa.string()))
df = df.astype("string[pyarrow_numpy]")
dfs.append(df)

if len(dfs) > 0:
Expand Down Expand Up @@ -4686,9 +4681,7 @@ def read(
values, # type: ignore[arg-type]
skipna=True,
):
import pyarrow as pa

df = df.astype(pd.ArrowDtype(pa.string()))
df = df.astype("string[pyarrow_numpy]")
frames.append(df)

if len(frames) == 1:
Expand Down
8 changes: 4 additions & 4 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -2685,8 +2685,8 @@ def test_construct_with_strings_and_none(self):

def test_frame_string_inference(self):
# GH#54430
pa = pytest.importorskip("pyarrow")
dtype = pd.ArrowDtype(pa.string())
pytest.importorskip("pyarrow")
dtype = "string[pyarrow_numpy]"
expected = DataFrame(
{"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
)
Expand Down Expand Up @@ -2720,8 +2720,8 @@ def test_frame_string_inference(self):

def test_frame_string_inference_array_string_dtype(self):
# GH#54496
pa = pytest.importorskip("pyarrow")
dtype = pd.ArrowDtype(pa.string())
pytest.importorskip("pyarrow")
dtype = "string[pyarrow_numpy]"
expected = DataFrame(
{"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
)
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/indexes/base_class/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ def test_construct_empty_tuples(self, tuple_list):

def test_index_string_inference(self):
# GH#54430
pa = pytest.importorskip("pyarrow")
dtype = pd.ArrowDtype(pa.string())
pytest.importorskip("pyarrow")
dtype = "string[pyarrow_numpy]"
expected = Index(["a", "b"], dtype=dtype)
with pd.option_context("future.infer_string", True):
ser = Index(["a", "b"])
Expand Down
8 changes: 4 additions & 4 deletions pandas/tests/io/json/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -2095,7 +2095,7 @@ def test_pyarrow_engine_lines_false():


def test_json_roundtrip_string_inference(orient):
pa = pytest.importorskip("pyarrow")
pytest.importorskip("pyarrow")
df = DataFrame(
[["a", "b"], ["c", "d"]], index=["row 1", "row 2"], columns=["col 1", "col 2"]
)
Expand All @@ -2104,9 +2104,9 @@ def test_json_roundtrip_string_inference(orient):
result = read_json(StringIO(out))
expected = DataFrame(
[["a", "b"], ["c", "d"]],
dtype=pd.ArrowDtype(pa.string()),
index=pd.Index(["row 1", "row 2"], dtype=pd.ArrowDtype(pa.string())),
columns=pd.Index(["col 1", "col 2"], dtype=pd.ArrowDtype(pa.string())),
dtype="string[pyarrow_numpy]",
index=pd.Index(["row 1", "row 2"], dtype="string[pyarrow_numpy]"),
columns=pd.Index(["col 1", "col 2"], dtype="string[pyarrow_numpy]"),
)
tm.assert_frame_equal(result, expected)

Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/io/parser/dtypes/test_dtypes_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -542,8 +542,8 @@ def test_ea_int_avoid_overflow(all_parsers):

def test_string_inference(all_parsers):
# GH#54430
pa = pytest.importorskip("pyarrow")
dtype = pd.ArrowDtype(pa.string())
pytest.importorskip("pyarrow")
dtype = "string[pyarrow_numpy]"

data = """a,b
x,1
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/io/pytables/test_read.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,15 +392,15 @@ def test_read_py2_hdf_file_in_py3(datapath):

def test_read_infer_string(tmp_path, setup_path):
# GH#54431
pa = pytest.importorskip("pyarrow")
pytest.importorskip("pyarrow")
df = DataFrame({"a": ["a", "b", None]})
path = tmp_path / setup_path
df.to_hdf(path, key="data", format="table")
with pd.option_context("future.infer_string", True):
result = read_hdf(path, key="data", mode="r")
expected = DataFrame(
{"a": ["a", "b", None]},
dtype=pd.ArrowDtype(pa.string()),
columns=Index(["a"], dtype=pd.ArrowDtype(pa.string())),
dtype="string[pyarrow_numpy]",
columns=Index(["a"], dtype="string[pyarrow_numpy]"),
)
tm.assert_frame_equal(result, expected)
6 changes: 1 addition & 5 deletions pandas/tests/io/test_feather.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,14 +222,10 @@ def test_invalid_dtype_backend(self):

def test_string_inference(self, tmp_path):
# GH#54431
import pyarrow as pa

path = tmp_path / "test_string_inference.p"
df = pd.DataFrame(data={"a": ["x", "y"]})
df.to_feather(path)
with pd.option_context("future.infer_string", True):
result = read_feather(path)
expected = pd.DataFrame(
data={"a": ["x", "y"]}, dtype=pd.ArrowDtype(pa.string())
)
expected = pd.DataFrame(data={"a": ["x", "y"]}, dtype="string[pyarrow_numpy]")
tm.assert_frame_equal(result, expected)
4 changes: 2 additions & 2 deletions pandas/tests/io/test_orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,7 +426,7 @@ def test_string_inference(tmp_path):
result = read_orc(path)
expected = pd.DataFrame(
data={"a": ["x", "y"]},
dtype=pd.ArrowDtype(pa.string()),
columns=pd.Index(["a"], dtype=pd.ArrowDtype(pa.string())),
dtype="string[pyarrow_numpy]",
columns=pd.Index(["a"], dtype="string[pyarrow_numpy]"),
)
tm.assert_frame_equal(result, expected)
6 changes: 2 additions & 4 deletions pandas/tests/io/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -1113,17 +1113,15 @@ def test_df_attrs_persistence(self, tmp_path, pa):

def test_string_inference(self, tmp_path, pa):
# GH#54431
import pyarrow as pa

path = tmp_path / "test_string_inference.p"
df = pd.DataFrame(data={"a": ["x", "y"]}, index=["a", "b"])
df.to_parquet(path, engine="pyarrow")
with pd.option_context("future.infer_string", True):
result = read_parquet(path, engine="pyarrow")
expected = pd.DataFrame(
data={"a": ["x", "y"]},
dtype=pd.ArrowDtype(pa.string()),
index=pd.Index(["a", "b"], dtype=pd.ArrowDtype(pa.string())),
dtype="string[pyarrow_numpy]",
index=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"),
)
tm.assert_frame_equal(result, expected)

Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/io/test_sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -2946,15 +2946,15 @@ def test_read_sql_dtype_backend_table(self, string_storage, func):

def test_read_sql_string_inference(self):
# GH#54430
pa = pytest.importorskip("pyarrow")
pytest.importorskip("pyarrow")
table = "test"
df = DataFrame({"a": ["x", "y"]})
df.to_sql(table, con=self.conn, index=False, if_exists="replace")

with pd.option_context("future.infer_string", True):
result = read_sql_table(table, self.conn)

dtype = pd.ArrowDtype(pa.string())
dtype = "string[pyarrow_numpy]"
expected = DataFrame(
{"a": ["x", "y"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
)
Expand Down
16 changes: 8 additions & 8 deletions pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -2077,8 +2077,8 @@ def test_series_from_index_dtype_equal_does_not_copy(self):

def test_series_string_inference(self):
# GH#54430
pa = pytest.importorskip("pyarrow")
dtype = pd.ArrowDtype(pa.string())
pytest.importorskip("pyarrow")
dtype = "string[pyarrow_numpy]"
expected = Series(["a", "b"], dtype=dtype)
with pd.option_context("future.infer_string", True):
ser = Series(["a", "b"])
Expand All @@ -2092,25 +2092,25 @@ def test_series_string_inference(self):
@pytest.mark.parametrize("na_value", [None, np.nan, pd.NA])
def test_series_string_with_na_inference(self, na_value):
# GH#54430
pa = pytest.importorskip("pyarrow")
dtype = pd.ArrowDtype(pa.string())
pytest.importorskip("pyarrow")
dtype = "string[pyarrow_numpy]"
expected = Series(["a", na_value], dtype=dtype)
with pd.option_context("future.infer_string", True):
ser = Series(["a", na_value])
tm.assert_series_equal(ser, expected)

def test_series_string_inference_scalar(self):
# GH#54430
pa = pytest.importorskip("pyarrow")
expected = Series("a", index=[1], dtype=pd.ArrowDtype(pa.string()))
pytest.importorskip("pyarrow")
expected = Series("a", index=[1], dtype="string[pyarrow_numpy]")
with pd.option_context("future.infer_string", True):
ser = Series("a", index=[1])
tm.assert_series_equal(ser, expected)

def test_series_string_inference_array_string_dtype(self):
# GH#54496
pa = pytest.importorskip("pyarrow")
expected = Series(["a", "b"], dtype=pd.ArrowDtype(pa.string()))
pytest.importorskip("pyarrow")
expected = Series(["a", "b"], dtype="string[pyarrow_numpy]")
with pd.option_context("future.infer_string", True):
ser = Series(np.array(["a", "b"]))
tm.assert_series_equal(ser, expected)
Expand Down