-
Notifications
You must be signed in to change notification settings - Fork 218
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
clib.conversion: Deal with np.object dtype in vectors_to_arrays and deprecate the array_to_datetime function #3507
base: main
Are you sure you want to change the base?
Changes from all commits
3661e54
20b9215
83673cf
56a0841
6338cde
1864556
54160bf
f4e1a5f
ed3be20
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -18,7 +18,6 @@ | |||||||||||||||||
import pandas as pd | ||||||||||||||||||
import xarray as xr | ||||||||||||||||||
from pygmt.clib.conversion import ( | ||||||||||||||||||
array_to_datetime, | ||||||||||||||||||
dataarray_to_matrix, | ||||||||||||||||||
sequence_to_ctypes_array, | ||||||||||||||||||
strings_to_ctypes_array, | ||||||||||||||||||
|
@@ -854,22 +853,13 @@ def _check_dtype_and_dim(self, array, ndim): | |||||||||||||||||
""" | ||||||||||||||||||
# Check that the array has the given number of dimensions | ||||||||||||||||||
if array.ndim != ndim: | ||||||||||||||||||
raise GMTInvalidInput( | ||||||||||||||||||
f"Expected a numpy {ndim}-D array, got {array.ndim}-D." | ||||||||||||||||||
) | ||||||||||||||||||
msg = f"Expected a numpy {ndim}-D array, got {array.ndim}-D." | ||||||||||||||||||
raise GMTInvalidInput(msg) | ||||||||||||||||||
|
||||||||||||||||||
# Check that the array has a valid/known data type | ||||||||||||||||||
if array.dtype.type not in DTYPES: | ||||||||||||||||||
try: | ||||||||||||||||||
if array.dtype.type is np.object_: | ||||||||||||||||||
# Try to convert unknown object type to np.datetime64 | ||||||||||||||||||
array = array_to_datetime(array) | ||||||||||||||||||
else: | ||||||||||||||||||
raise ValueError | ||||||||||||||||||
except ValueError as e: | ||||||||||||||||||
raise GMTInvalidInput( | ||||||||||||||||||
f"Unsupported numpy data type '{array.dtype.type}'." | ||||||||||||||||||
) from e | ||||||||||||||||||
msg = f"Unsupported numpy data type '{array.dtype.type}'." | ||||||||||||||||||
raise GMTInvalidInput(msg) | ||||||||||||||||||
return self[DTYPES[array.dtype.type]] | ||||||||||||||||||
|
||||||||||||||||||
def put_vector(self, dataset, column, vector): | ||||||||||||||||||
|
@@ -917,7 +907,7 @@ def put_vector(self, dataset, column, vector): | |||||||||||||||||
gmt_type = self._check_dtype_and_dim(vector, ndim=1) | ||||||||||||||||||
if gmt_type in {self["GMT_TEXT"], self["GMT_DATETIME"]}: | ||||||||||||||||||
if gmt_type == self["GMT_DATETIME"]: | ||||||||||||||||||
vector = np.datetime_as_string(array_to_datetime(vector)) | ||||||||||||||||||
vector = np.datetime_as_string(vector) | ||||||||||||||||||
vector_pointer = strings_to_ctypes_array(vector) | ||||||||||||||||||
else: | ||||||||||||||||||
vector_pointer = vector.ctypes.data_as(ctp.c_void_p) | ||||||||||||||||||
|
@@ -1388,7 +1378,7 @@ def virtualfile_from_vectors(self, *vectors): | |||||||||||||||||
# Assumes that first 2 columns contains coordinates like longitude | ||||||||||||||||||
# latitude, or datetime string types. | ||||||||||||||||||
for col, array in enumerate(arrays[2:]): | ||||||||||||||||||
if pd.api.types.is_string_dtype(array.dtype): | ||||||||||||||||||
if array.dtype.type == np.str_: | ||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we'll need to check if this can handle There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Both can be converted to the numpy string dtype by the
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The main idea of this PR is to let For any special dtypes that we know how to convert it to numpy dtype, we can maintain a mapping dictionary, just like what you did to support pyarrow's date32[day] and date64[ms] in #2845: pygmt/pygmt/clib/conversion.py Lines 208 to 211 in c2e429c
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In 83673cf, I've moved most of the doctests into a separate test file There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Based on the tests below, I think we should add the entry In [1]: import pandas as pd
In [2]: x = pd.Series(["abc", "12345"])
In [3]: x.dtype
Out[3]: dtype('O')
In [4]: str(x.dtype)
Out[4]: 'object'
In [5]: x = pd.Series(["abc", "12345"], dtype="string")
In [6]: x.dtype
Out[6]: string[python]
In [7]: str(x.dtype)
Out[7]: 'string'
In [8]: x = pd.Series(["abc", "12345"], dtype="string[pyarrow]")
In [9]: x.dtype
Out[9]: string[pyarrow]
In [10]: str(x.dtype)
Out[10]: 'string'
In [11]: import pyarrow as pa
In [12]: x = pa.array(["abc", "defghi"])
In [13]: x.type
Out[13]: DataType(string)
In [14]: str(x.type)
Out[14]: 'string' There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In PR #2774 and #2774, we only checked if PyGMT supports pandas with the pyarrow backend, but didn't check if the original pyarrow arrays works. For example, for a pyarrow
|
||||||||||||||||||
columns = col + 2 | ||||||||||||||||||
break | ||||||||||||||||||
|
||||||||||||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
""" | ||
Test the functions in the clib.conversion module. | ||
""" | ||
|
||
import datetime | ||
import importlib | ||
|
||
import numpy as np | ||
import numpy.testing as npt | ||
import pandas as pd | ||
import pytest | ||
from pygmt.clib.conversion import vectors_to_arrays | ||
from pygmt.clib.session import DTYPES | ||
|
||
_HAS_PYARROW = bool(importlib.util.find_spec("pyarrow")) | ||
|
||
|
||
def _check_arrays(arrays): | ||
""" | ||
A helper function to check the results of vectors_to_arrays. | ||
|
||
- Check if all arrays are C-contiguous | ||
- Check if all arrays are numpy arrays | ||
- Check if all arrays are 1-D | ||
""" | ||
# Check if all arrays are C-contiguous | ||
assert all(i.flags.c_contiguous for i in arrays) | ||
# Check if all arrays are numpy arrays | ||
assert all(isinstance(i, np.ndarray) for i in arrays) | ||
# Check if all arrays are 1-D | ||
assert all(i.ndim == 1 for i in arrays) | ||
# Check if all numpy dtypes can be recognized by GMT | ||
assert all(i.dtype.type in DTYPES for i in arrays) | ||
|
||
|
||
@pytest.mark.parametrize( | ||
"vectors", | ||
[ | ||
pytest.param([[1, 2], (3, 4), range(5, 7)], id="python_objects"), | ||
pytest.param( | ||
[np.array([1, 2]), np.array([3, 4]), np.array(range(5, 7))], | ||
id="numpy_arrays", | ||
), | ||
pytest.param([[1, 2], np.array([3, 4]), range(5, 7)], id="mixed"), | ||
pytest.param([1, 2, 3.0], id="scalars"), | ||
], | ||
) | ||
def test_vectors_to_arrays(vectors): | ||
""" | ||
Test the vectors_to_arrays function for various input types. | ||
""" | ||
arrays = vectors_to_arrays(vectors) | ||
_check_arrays(arrays) | ||
|
||
|
||
def test_vectors_to_arrays_not_c_contiguous(): | ||
""" | ||
Test the vectors_to_arrays function with numpy arrays that are not C-contiguous. | ||
""" | ||
data = np.array([[1, 2], [3, 4], [5, 6]]) | ||
vectors = [data[:, 0], data[:, 1]] | ||
assert all(not i.flags.c_contiguous for i in vectors) | ||
arrays = vectors_to_arrays(vectors) | ||
_check_arrays(arrays) | ||
|
||
|
||
def test_vectors_to_arrays_pandas_nan(): | ||
""" | ||
Test the vectors_to_arrays function with pandas Series containing NaNs. | ||
""" | ||
vectors = [pd.Series(data=[0, 4, pd.NA, 8, 6], dtype=pd.Int32Dtype())] | ||
arrays = vectors_to_arrays(vectors) | ||
npt.assert_equal(arrays[0], np.array([0, 4, np.nan, 8, 6], dtype=np.float64)) | ||
_check_arrays(arrays) | ||
|
||
|
||
def test_vectors_to_arrays_pandas_string(): | ||
""" | ||
Test the vectors_to_arrays function with pandas Series containing datetime64. | ||
""" | ||
vectors = [ | ||
pd.Series(["abc", "defhig"]), | ||
pd.Series(["abcdef", "123456"], dtype="string"), | ||
] | ||
arrays = vectors_to_arrays(vectors) | ||
assert all(i.dtype.type == np.str_ for i in arrays) | ||
_check_arrays(arrays) | ||
|
||
|
||
@pytest.mark.skipif(not _HAS_PYARROW, reason="pyarrow is not installed.") | ||
def test_vectors_to_arrays_pyarrow_datetime(): | ||
""" | ||
Test the vectors_to_arrays function with pyarrow arrays containing datetime64. | ||
""" | ||
vectors = [ | ||
pd.Series( | ||
data=[datetime.date(2020, 1, 1), datetime.date(2021, 12, 31)], | ||
dtype="date32[day][pyarrow]", | ||
), | ||
pd.Series( | ||
data=[datetime.date(2022, 1, 1), datetime.date(2023, 12, 31)], | ||
dtype="date64[ms][pyarrow]", | ||
), | ||
] | ||
arrays = vectors_to_arrays(vectors) | ||
assert all(i.dtype.type == np.datetime64 for i in arrays) | ||
_check_arrays(arrays) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
After this PR,
array_to_datetime
is no longer used, but I still want to keep this function so that we know what kinds of datetime formats thatnp.asarray(array, dtype=np.datetime64)
can support.