Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

clib.conversion: Deal with np.object dtype in vectors_to_arrays and deprecate the array_to_datetime function #3507

Draft
wants to merge 9 commits into
base: main
Choose a base branch
from
52 changes: 16 additions & 36 deletions pygmt/clib/conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,46 +168,13 @@ def vectors_to_arrays(vectors: Sequence[Any]) -> list[np.ndarray]:
True
>>> all(isinstance(i, np.ndarray) for i in arrays)
True

>>> data = [[1, 2], (3, 4), range(5, 7)]
>>> all(isinstance(i, np.ndarray) for i in vectors_to_arrays(data))
True

>>> # Sequence of scalars are converted to 1-D arrays
>>> data = vectors_to_arrays([1, 2, 3.0])
>>> data
[array([1]), array([2]), array([3.])]
>>> [i.ndim for i in data] # Check that they are 1-D arrays
[1, 1, 1]

>>> series = pd.Series(data=[0, 4, pd.NA, 8, 6], dtype=pd.Int32Dtype())
>>> vectors_to_arrays([series])
[array([ 0., 4., nan, 8., 6.])]

>>> import datetime
>>> import pytest
>>> pa = pytest.importorskip("pyarrow")
>>> vectors = [
... pd.Series(
... data=[datetime.date(2020, 1, 1), datetime.date(2021, 12, 31)],
... dtype="date32[day][pyarrow]",
... ),
... pd.Series(
... data=[datetime.date(2022, 1, 1), datetime.date(2023, 12, 31)],
... dtype="date64[ms][pyarrow]",
... ),
... ]
>>> arrays = vectors_to_arrays(vectors)
>>> all(a.flags.c_contiguous for a in arrays)
True
>>> all(isinstance(a, np.ndarray) for a in arrays)
True
>>> all(isinstance(a.dtype, np.dtypes.DateTime64DType) for a in arrays)
>>> all(i.ndim == 1 for i in arrays)
True
"""
dtypes = {
"date32[day][pyarrow]": np.datetime64,
"date64[ms][pyarrow]": np.datetime64,
"string": np.str_,
}
arrays = []
for vector in vectors:
Expand All @@ -222,8 +189,16 @@ def vectors_to_arrays(vectors: Sequence[Any]) -> list[np.ndarray]:
# we can remove the workaround in PyGMT v0.17.0.
array = np.ascontiguousarray(vector.astype(float))
else:
vec_dtype = str(getattr(vector, "dtype", ""))
# NumPy/Pandas uses "dtype" and PyArrow uses "type".
vec_dtype = str(getattr(vector, "dtype", getattr(vector, "type", "")))
array = np.ascontiguousarray(vector, dtype=dtypes.get(vec_dtype))
# Convert np.object_ to np.datetime64 or np.str_.
# If fails, then the array can't be recognized.
if array.dtype.type == np.object_:
try:
array = np.ascontiguousarray(array, dtype=np.datetime64)
except ValueError:
array = np.ascontiguousarray(array, dtype=np.str_)
arrays.append(array)
return arrays

Expand Down Expand Up @@ -313,6 +288,11 @@ def array_to_datetime(array: Sequence[Any]) -> np.ndarray:

If the input array is not in legal datetime formats, raise a ValueError exception.

.. deprecated:: 0.14.0
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After this PR, array_to_datetime is no longer used, but I still want to keep this function so that we know what kinds of datetime formats that np.asarray(array, dtype=np.datetime64) can support.


The function is no longer used in the PyGMT project, but we keep this function
to document the supported datetime types.

Parameters
----------
array
Expand Down
22 changes: 6 additions & 16 deletions pygmt/clib/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
import pandas as pd
import xarray as xr
from pygmt.clib.conversion import (
array_to_datetime,
dataarray_to_matrix,
sequence_to_ctypes_array,
strings_to_ctypes_array,
Expand Down Expand Up @@ -854,22 +853,13 @@ def _check_dtype_and_dim(self, array, ndim):
"""
# Check that the array has the given number of dimensions
if array.ndim != ndim:
raise GMTInvalidInput(
f"Expected a numpy {ndim}-D array, got {array.ndim}-D."
)
msg = f"Expected a numpy {ndim}-D array, got {array.ndim}-D."
raise GMTInvalidInput(msg)

# Check that the array has a valid/known data type
if array.dtype.type not in DTYPES:
try:
if array.dtype.type is np.object_:
# Try to convert unknown object type to np.datetime64
array = array_to_datetime(array)
else:
raise ValueError
except ValueError as e:
raise GMTInvalidInput(
f"Unsupported numpy data type '{array.dtype.type}'."
) from e
msg = f"Unsupported numpy data type '{array.dtype.type}'."
raise GMTInvalidInput(msg)
return self[DTYPES[array.dtype.type]]

def put_vector(self, dataset, column, vector):
Expand Down Expand Up @@ -917,7 +907,7 @@ def put_vector(self, dataset, column, vector):
gmt_type = self._check_dtype_and_dim(vector, ndim=1)
if gmt_type in {self["GMT_TEXT"], self["GMT_DATETIME"]}:
if gmt_type == self["GMT_DATETIME"]:
vector = np.datetime_as_string(array_to_datetime(vector))
vector = np.datetime_as_string(vector)
vector_pointer = strings_to_ctypes_array(vector)
else:
vector_pointer = vector.ctypes.data_as(ctp.c_void_p)
Expand Down Expand Up @@ -1388,7 +1378,7 @@ def virtualfile_from_vectors(self, *vectors):
# Assumes that first 2 columns contains coordinates like longitude
# latitude, or datetime string types.
for col, array in enumerate(arrays[2:]):
if pd.api.types.is_string_dtype(array.dtype):
if array.dtype.type == np.str_:
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pd.api.types.is_string_dtype was added in PR #684. Before PR #684, we used np.issubdtype(array.dtype, np.str_), which was added in #520. Any specific reason that we should use np.issubdtype(array.dtype, np.str_), not array.dtype.type == np.str_?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we'll need to check if this can handle pandas.StringDtype and pyarrow.StringArray (xref #2933).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Both can be converted to the numpy string dtype by the vectors_to_arrays method, so in virtualfile_from_vectors, we only need to check np.str_:

In [1]: import numpy as np

In [2]: import pandas as pd

In [3]: import pyarrow as pa

In [4]: x = pd.Series(["abc", "defghi"], dtype="string")

In [5]: np.asarray(x)
Out[5]: array(['abc', 'defghi'], dtype=object)

In [6]: np.asarray(x, dtype=str)
Out[6]: array(['abc', 'defghi'], dtype='<U6')

In [7]: y = pa.array(["abc", "defghi"])

In [8]: np.asarray(y)
Out[8]: array(['abc', 'defghi'], dtype=object)

In [9]: np.asarray(y, dtype=str)
Out[9]: array(['abc', 'defghi'], dtype='<U6')

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The main idea of this PR is to let vectors_to_arrays do the conversion work from any dtypes (including pd.StringDtype and pa.StringArray) into the numpy dtypes, so that the virtualfile_from_vectors only need to care about how to map numpy dtypes into GMT data types.

For any special dtypes that we know how to convert it to numpy dtype, we can maintain a mapping dictionary, just like what you did to support pyarrow's date32[day] and date64[ms] in #2845:

dtypes = {
"date32[day][pyarrow]": np.datetime64,
"date64[ms][pyarrow]": np.datetime64,
}

Copy link
Member Author

@seisman seisman Oct 14, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In 83673cf, I've moved most of the doctests into a separate test file pygmt/tests/test_clib_vectors_to_arrays.py. A test test_vectors_to_arrays_pandas_string is added to check that the function can handle pd.StringDtype correctly.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For any special dtypes that we know how to convert it to numpy dtype, we can maintain a mapping dictionary, just like what you did to support pyarrow's date32[day] and date64[ms] in #2845:

dtypes = {
"date32[day][pyarrow]": np.datetime64,
"date64[ms][pyarrow]": np.datetime64,
}

Based on the tests below, I think we should add the entry "string": np.str_ to the dictionary:

In [1]: import pandas as pd

In [2]: x = pd.Series(["abc", "12345"])

In [3]: x.dtype
Out[3]: dtype('O')

In [4]: str(x.dtype)
Out[4]: 'object'

In [5]: x = pd.Series(["abc", "12345"], dtype="string")

In [6]: x.dtype
Out[6]: string[python]

In [7]: str(x.dtype)
Out[7]: 'string'

In [8]: x = pd.Series(["abc", "12345"], dtype="string[pyarrow]")

In [9]: x.dtype
Out[9]: string[pyarrow]

In [10]: str(x.dtype)
Out[10]: 'string'

In [11]: import pyarrow as pa

In [12]: x = pa.array(["abc", "defghi"])

In [13]: x.type
Out[13]: DataType(string)

In [14]: str(x.type)
Out[14]: 'string'

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In PR #2774 and #2774, we only checked if PyGMT supports pandas with the pyarrow backend, but didn't check if the original pyarrow arrays works. For example, for a pyarrow date32 array, we need to check array.type rather than array.dtype:

In [1]: import datetime

In [2]: import pyarrow as pa

In [3]: x = pa.array([datetime.date(2020, 1, 1), datetime.date(2021, 12, 31)])

In [4]: str(x.type)
Out[4]: 'date32[day]'

columns = col + 2
break

Expand Down
107 changes: 107 additions & 0 deletions pygmt/tests/test_clib_vectors_to_arrays.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
"""
Test the functions in the clib.conversion module.
"""

import datetime
import importlib
import importlib.util

import numpy as np
import numpy.testing as npt
import pandas as pd
import pytest

from pygmt.clib.conversion import vectors_to_arrays
from pygmt.clib.session import DTYPES

_HAS_PYARROW = bool(importlib.util.find_spec("pyarrow"))


def _check_arrays(arrays):
    """
    Helper that validates the output of vectors_to_arrays.

    Every element must be a C-contiguous, 1-D numpy array whose dtype maps to
    a GMT data type.
    """
    for array in arrays:
        # Must be an actual numpy array.
        assert isinstance(array, np.ndarray)
        # Memory layout must be C-contiguous.
        assert array.flags.c_contiguous
        # Vectors are expected to be one-dimensional.
        assert array.ndim == 1
        # The numpy dtype must be one that GMT can recognize.
        assert array.dtype.type in DTYPES


@pytest.mark.parametrize(
    "vectors",
    [
        [[1, 2], (3, 4), range(5, 7)],
        [np.array([1, 2]), np.array([3, 4]), np.array(range(5, 7))],
        [[1, 2], np.array([3, 4]), range(5, 7)],
        [1, 2, 3.0],
    ],
    ids=["python_objects", "numpy_arrays", "mixed", "scalars"],
)
def test_vectors_to_arrays(vectors):
    """
    Test the vectors_to_arrays function for various input types.
    """
    # Conversion must succeed and yield GMT-compatible numpy arrays.
    _check_arrays(vectors_to_arrays(vectors))


def test_vectors_to_arrays_not_c_contiguous():
    """
    Test the vectors_to_arrays function with numpy arrays that are not C-contiguous.
    """
    matrix = np.array([[1, 2], [3, 4], [5, 6]])
    # Column slices of a 2-D array are strided views, not C-contiguous.
    columns = [matrix[:, 0], matrix[:, 1]]
    # Sanity check: the inputs really are non-contiguous.
    assert not any(col.flags.c_contiguous for col in columns)
    # The conversion must still produce C-contiguous 1-D arrays.
    _check_arrays(vectors_to_arrays(columns))


def test_vectors_to_arrays_pandas_nan():
    """
    Test the vectors_to_arrays function with pandas Series containing NaNs.
    """
    series = pd.Series(data=[0, 4, pd.NA, 8, 6], dtype=pd.Int32Dtype())
    arrays = vectors_to_arrays([series])
    # pd.NA must become np.nan, which forces a float64 result array.
    expected = np.array([0, 4, np.nan, 8, 6], dtype=np.float64)
    npt.assert_equal(arrays[0], expected)
    _check_arrays(arrays)


def test_vectors_to_arrays_pandas_string():
    """
    Test the vectors_to_arrays function with pandas Series of strings.

    Covers both the default object-dtype Series and the dedicated pandas
    string dtype; both must be converted to the numpy string dtype (np.str_).
    """
    vectors = [
        pd.Series(["abc", "defhig"]),
        pd.Series(["abcdef", "123456"], dtype="string"),
    ]
    arrays = vectors_to_arrays(vectors)
    assert all(i.dtype.type == np.str_ for i in arrays)
    _check_arrays(arrays)


@pytest.mark.skipif(not _HAS_PYARROW, reason="pyarrow is not installed.")
def test_vectors_to_arrays_pyarrow_datetime():
    """
    Test the vectors_to_arrays function with pandas Series backed by pyarrow
    date dtypes.
    """
    date32 = pd.Series(
        data=[datetime.date(2020, 1, 1), datetime.date(2021, 12, 31)],
        dtype="date32[day][pyarrow]",
    )
    date64 = pd.Series(
        data=[datetime.date(2022, 1, 1), datetime.date(2023, 12, 31)],
        dtype="date64[ms][pyarrow]",
    )
    arrays = vectors_to_arrays([date32, date64])
    # Both pyarrow date dtypes must be converted to np.datetime64.
    assert all(array.dtype.type == np.datetime64 for array in arrays)
    _check_arrays(arrays)
Loading