Skip to content

Commit 005de65

Browse files
weiji14 and seisman authored
pyarrow: Support date32[day] and date64[ms] dtypes in pandas objects (#2845)
* Convert pyarrow date32/date64 dtypes to np.datetime64

  Handle date columns in pandas.DataFrame with pyarrow dtypes like date32[day][pyarrow] or date64[ms][pyarrow] by modifying the vectors_to_arrays conversion function. Added some parametrized unit tests to test_info.py to ensure this works.

* Handle Python lists without dtype attr and use as_c_contiguous

  Need to handle Python lists that don't have the dtype attribute, unlike pandas.Series objects. Also ensure that we return a C-contiguous array.

* Add doctest to check that date32/date64 are converted to datetime64

  Ensure that pyarrow date32 and date64 dtypes are converted to numpy.datetime64 dtype. Added pyarrow dependency to ci_doctests.yaml. Also changed from using `"date" in vec_dtype` to `vec_dtype.startswith("date")`.

* Refactor to use pygmt.helpers.testing.skip_if_no

* Document that PyArrow date32/date64 dtypes are now supported in PyGMT

* Refactor to use dict mapping instead of if-then

---------

Co-authored-by: Dongdong Tian <[email protected]>
1 parent 20054a1 commit 005de65

File tree

4 files changed

+46
-6
lines changed

4 files changed

+46
-6
lines changed

.github/workflows/ci_doctests.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ jobs:
5858
contextily
5959
geopandas
6060
ipython
61+
pyarrow
6162
rioxarray
6263
build
6364
make

doc/install.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,8 +112,8 @@ The following are optional dependencies:
112112
If you have `PyArrow <https://arrow.apache.org/docs/python/index.html>`__
113113
installed, PyGMT does have some initial support for ``pandas.Series`` and
114114
``pandas.DataFrame`` objects with Apache Arrow-backed arrays. Specifically,
115-
only uint/int/float dtypes are supported for now. Support for datetime and
116-
string Arrow dtypes are still working in progress. For more details, see
115+
only uint/int/float and date32/date64 dtypes are supported for now. Support
116+
for string Arrow dtypes is still a work in progress. For more details, see
117117
`issue #2800 <https://github.com/GenericMappingTools/pygmt/issues/2800>`__.
118118

119119
Installing GMT and other dependencies

pygmt/clib/conversion.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -162,11 +162,42 @@ def vectors_to_arrays(vectors):
162162
True
163163
>>> all(isinstance(i, np.ndarray) for i in arrays)
164164
True
165+
165166
>>> data = [[1, 2], (3, 4), range(5, 7)]
166167
>>> all(isinstance(i, np.ndarray) for i in vectors_to_arrays(data))
167168
True
169+
170+
>>> import datetime
171+
>>> import pytest
172+
>>> pa = pytest.importorskip("pyarrow")
173+
>>> vectors = [
174+
... pd.Series(
175+
... data=[datetime.date(2020, 1, 1), datetime.date(2021, 12, 31)],
176+
... dtype="date32[day][pyarrow]",
177+
... ),
178+
... pd.Series(
179+
... data=[datetime.date(2022, 1, 1), datetime.date(2023, 12, 31)],
180+
... dtype="date64[ms][pyarrow]",
181+
... ),
182+
... ]
183+
>>> arrays = vectors_to_arrays(vectors)
184+
>>> all(a.flags.c_contiguous for a in arrays)
185+
True
186+
>>> all(isinstance(a, np.ndarray) for a in arrays)
187+
True
188+
>>> all(isinstance(a.dtype, np.dtypes.DateTime64DType) for a in arrays)
189+
True
168190
"""
169-
arrays = [as_c_contiguous(np.asarray(i)) for i in vectors]
191+
dtypes = {
192+
"date32[day][pyarrow]": np.datetime64,
193+
"date64[ms][pyarrow]": np.datetime64,
194+
}
195+
arrays = []
196+
for vector in vectors:
197+
vec_dtype = str(getattr(vector, "dtype", ""))
198+
array = np.asarray(a=vector, dtype=dtypes.get(vec_dtype, None))
199+
arrays.append(as_c_contiguous(array))
200+
170201
return arrays
171202

172203

pygmt/tests/test_info.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -119,14 +119,22 @@ def test_info_numpy_array_time_column():
119119
assert output == expected_output
120120

121121

122-
def test_info_pandas_dataframe_time_column():
122+
@pytest.mark.parametrize(
123+
"dtype",
124+
[
125+
"datetime64[ns]",
126+
pytest.param("date32[day][pyarrow]", marks=skip_if_no(package="pyarrow")),
127+
pytest.param("date64[ms][pyarrow]", marks=skip_if_no(package="pyarrow")),
128+
],
129+
)
130+
def test_info_pandas_dataframe_date_column(dtype):
123131
"""
124-
Make sure info works on pandas.DataFrame inputs with a time column.
132+
Make sure info works on pandas.DataFrame inputs with a date column.
125133
"""
126134
table = pd.DataFrame(
127135
data={
128136
"z": [10, 13, 12, 15, 14],
129-
"time": pd.date_range(start="2020-01-01", periods=5),
137+
"date": pd.date_range(start="2020-01-01", periods=5).astype(dtype=dtype),
130138
}
131139
)
132140
output = info(data=table)

0 commit comments

Comments
 (0)