From 001eef26f4393d854666d74b5cdf26b9376197e6 Mon Sep 17 00:00:00 2001 From: Jay Ahn Date: Sun, 15 Sep 2024 17:32:30 -0400 Subject: [PATCH 1/3] use passed pyarrow dtype when reading json --- pandas/io/json/_json.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index d077b9e0c4568..8e2362cdd6467 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -32,6 +32,7 @@ from pandas.core.dtypes.common import ( ensure_str, is_string_dtype, + pandas_dtype, ) from pandas.core.dtypes.dtypes import PeriodDtype @@ -939,7 +940,19 @@ def read(self) -> DataFrame | Series: with self: if self.engine == "pyarrow": pyarrow_json = import_optional_dependency("pyarrow.json") - pa_table = pyarrow_json.read_json(self.data) + if isinstance(self.dtype, dict): + pa = import_optional_dependency("pyarrow") + fields = [ + (field, pandas_dtype(dtype).pyarrow_dtype) + for field, dtype in self.dtype.items() + ] + schema = pa.schema(fields) + pa_table = pyarrow_json.read_json( + self.data, + parse_options=pyarrow_json.ParseOptions(explicit_schema=schema), + ) + else: + pa_table = pyarrow_json.read_json(self.data) mapping: type[ArrowDtype] | None | Callable if self.dtype_backend == "pyarrow": From f0a90f2eb7f0e47fde6b4392b8a451ae76140126 Mon Sep 17 00:00:00 2001 From: Jay Ahn Date: Sun, 15 Sep 2024 17:35:41 -0400 Subject: [PATCH 2/3] add test case for read_json with explicit pyarrow dtype --- pandas/tests/io/json/test_pandas.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 1c54232b8b510..db618fdd85cac 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -18,6 +18,7 @@ import pandas as pd from pandas import ( NA, + ArrowDtype, DataFrame, DatetimeIndex, Index, @@ -2163,7 +2164,7 @@ def test_read_json_dtype_backend( if dtype_backend == "pyarrow": pa = pytest.importorskip("pyarrow") - string_dtype = pd.ArrowDtype(pa.string()) + string_dtype = ArrowDtype(pa.string()) else: string_dtype = pd.StringDtype(string_storage) @@ -2286,3 +2287,25 @@ def test_read_json_lines_rangeindex(): result = read_json(StringIO(data), lines=True).index expected = RangeIndex(2) tm.assert_index_equal(result, expected, exact=True) + + +def test_read_json_pyarrow_dtype(datapath): + dtype = {"a": "int32[pyarrow]", "b": "int64[pyarrow]"} + + df = read_json( + datapath("io", "json", "data", "line_delimited.json"), + dtype=dtype, + lines=True, + engine="pyarrow", + dtype_backend="pyarrow", + ) + + result = df.dtypes + expected = Series( + [ + ArrowDtype.construct_from_string("int32[pyarrow]"), + ArrowDtype.construct_from_string("int64[pyarrow]"), + ], + index=["a", "b"], + ) + tm.assert_series_equal(result, expected) From 19dde443d9da4bd8212d8959224691bd5d005b65 Mon Sep 17 00:00:00 2001 From: Jay Ahn Date: Sun, 15 Sep 2024 17:45:55 -0400 Subject: [PATCH 3/3] Add an entry to a doc --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 89a1c388b3ba1..a46ba45dcc4d5 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -629,6 +629,7 @@ I/O - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`) - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`) +- Bug in :meth:`read_json` ignoring the given ``dtype`` when ``engine="pyarrow"`` (:issue:`59516`) - Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`) - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`) - Bug in :meth:`read_stata` where extreme value integers were incorrectly interpreted as missing for format versions 111 and prior (:issue:`58130`)