Skip to content

Commit 2ddbb0e

Browse files
weiji14 and seisman authored
Wrap GMT_Put_Strings to pass str columns into GMT C API directly (#520)
Used to insert 1D numpy arrays of string type from PyGMT directly into GMT via the C API.

* Set valid GMT data mode as GMT_IS_OUTPUT
* Try passing "GMT_IS_VECTOR" family type to put_strings
* Do `put_vector` x and y before `put_strings` s, dim is 2 only
* Try using ctypes.POINTER in argtypes of c_put_strings
* Pass strings using "GMT_IS_VECTOR|GMT_IS_DUPLICATE"
* Add test for passing in one string column to virtualfile_from_vectors
* Fix test_plot_datetime by not passing first two columns into put_strings
* Refactor virtualfile_from_vectors to handle any number of string type columns
* Test for put_strings failing to increase code coverage
* Expect failures for tests using GMT_Put_Strings on GMT < 6.1.1
* Concatenate last string columns instead of allowing arbitrary positions
* Test variable length strings
* Replace gmt info with convert in test_virtualfile_from_vectors_str_cols

Co-authored-by: Dongdong Tian <[email protected]>
1 parent 62580f1 commit 2ddbb0e

File tree

3 files changed

+208
-12
lines changed

3 files changed

+208
-12
lines changed

pygmt/clib/session.py

+97-12
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@
4444
"GMT_IS_SURFACE",
4545
]
4646

47+
METHODS = ["GMT_IS_DUPLICATE", "GMT_IS_REFERENCE"]
48+
4749
MODES = ["GMT_CONTAINER_ONLY", "GMT_IS_OUTPUT"]
4850

4951
REGISTRATIONS = ["GMT_GRID_PIXEL_REG", "GMT_GRID_NODE_REG"]
@@ -235,7 +237,7 @@ def __getitem__(self, name):
235237
value = c_get_enum(session, name.encode())
236238

237239
if value is None or value == -99999:
238-
raise GMTCLibError("Constant '{}' doesn't exits in libgmt.".format(name))
240+
raise GMTCLibError(f"Constant '{name}' doesn't exist in libgmt.")
239241

240242
return value
241243

@@ -733,7 +735,7 @@ def put_vector(self, dataset, column, vector):
733735
"""
734736
Attach a numpy 1D array as a column on a GMT dataset.
735737
736-
Use this functions to attach numpy array data to a GMT dataset and pass
738+
Use this function to attach numpy array data to a GMT dataset and pass
737739
it to GMT modules. Wraps ``GMT_Put_Vector``.
738740
739741
The dataset must be created by :meth:`~gmt.clib.Session.create_data`
@@ -793,11 +795,72 @@ def put_vector(self, dataset, column, vector):
793795
)
794796
)
795797

798+
def put_strings(self, dataset, family, strings):
    """
    Attach a numpy 1D array of dtype str as a column on a GMT dataset.

    Thin wrapper around ``GMT_Put_Strings``: the strings are encoded to
    bytes, copied into a newly allocated ctypes array of ``char *`` and
    handed to the GMT C API together with the dataset pointer.

    The dataset must be created by :meth:`~gmt.clib.Session.create_data`
    first.

    .. warning::
        The numpy array must be C contiguous in memory. If it comes from a
        column slice of a 2d array, for example, you will have to make a
        copy. Use :func:`numpy.ascontiguousarray` to make sure your vector
        is contiguous (it won't copy if it already is).

    Parameters
    ----------
    dataset : :class:`ctypes.c_void_p`
        The ctypes void pointer to a ``GMT_Dataset``. Create it with
        :meth:`~gmt.clib.Session.create_data`.
    family : str
        The family type of the dataset. Can be either ``GMT_IS_VECTOR`` or
        ``GMT_IS_MATRIX``, optionally combined with one of the ``METHODS``
        modifiers (e.g. ``"GMT_IS_VECTOR|GMT_IS_DUPLICATE"``).
    strings : numpy 1d-array
        The array that will be attached to the dataset. Must be a 1d C
        contiguous array.

    Raises
    ------
    GMTCLibError
        If given invalid input or ``GMT_Put_Strings`` exits with status !=
        0.

    """
    gmt_put_strings = self.get_libgmt_func(
        "GMT_Put_Strings",
        argtypes=[
            ctp.c_void_p,
            ctp.c_uint,
            ctp.c_void_p,
            ctp.POINTER(ctp.c_char_p),
        ],
        restype=ctp.c_int,
    )

    # Build a C array with one ``char *`` slot per string and fill it with
    # the bytes-encoded values.
    # NOTE(review): this array is local to the call, so callers presumably
    # need the GMT_IS_DUPLICATE modifier for GMT to keep its own copy.
    char_pointer_array = ctp.c_char_p * len(strings)
    c_strings = char_pointer_array()
    c_strings[:] = np.char.encode(strings)

    family_flag = self._parse_constant(
        family, valid=FAMILIES, valid_modifiers=METHODS
    )

    return_code = gmt_put_strings(
        self.session_pointer, family_flag, dataset, c_strings
    )
    if return_code != 0:
        raise GMTCLibError(
            f"Failed to put strings of type {strings.dtype} into dataset"
        )
796859
def put_matrix(self, dataset, matrix, pad=0):
797860
"""
798861
Attach a numpy 2D array to a GMT dataset.
799862
800-
Use this functions to attach numpy array data to a GMT dataset and pass
863+
Use this function to attach numpy array data to a GMT dataset and pass
801864
it to GMT modules. Wraps ``GMT_Put_Matrix``.
802865
803866
The dataset must be created by :meth:`~gmt.clib.Session.create_data`
@@ -1002,9 +1065,7 @@ def open_virtual_file(self, family, geometry, direction, data):
10021065
family_int = self._parse_constant(family, valid=FAMILIES, valid_modifiers=VIAS)
10031066
geometry_int = self._parse_constant(geometry, valid=GEOMETRIES)
10041067
direction_int = self._parse_constant(
1005-
direction,
1006-
valid=["GMT_IN", "GMT_OUT"],
1007-
valid_modifiers=["GMT_IS_REFERENCE", "GMT_IS_DUPLICATE"],
1068+
direction, valid=["GMT_IN", "GMT_OUT"], valid_modifiers=METHODS,
10081069
)
10091070

10101071
buff = ctp.create_string_buffer(self["GMT_VF_LEN"])
@@ -1079,14 +1140,23 @@ def virtualfile_from_vectors(self, *vectors):
10791140
10801141
"""
10811142
# Conversion to a C-contiguous array needs to be done here and not in
1082-
# put_matrix because we need to maintain a reference to the copy while
1083-
# it is being used by the C API. Otherwise, the array would be garbage
1084-
# collected and the memory freed. Creating it in this context manager
1085-
# guarantees that the copy will be around until the virtual file is
1086-
# closed. The conversion is implicit in vectors_to_arrays.
1143+
# put_vector or put_strings because we need to maintain a reference to
1144+
# the copy while it is being used by the C API. Otherwise, the array
1145+
# would be garbage collected and the memory freed. Creating it in this
1146+
# context manager guarantees that the copy will be around until the
1147+
# virtual file is closed. The conversion is implicit in
1148+
# vectors_to_arrays.
10871149
arrays = vectors_to_arrays(vectors)
10881150

10891151
columns = len(arrays)
1152+
# Find arrays that are of string dtype from column 3 onwards
1153+
# Assumes that first 2 columns contains coordinates like longitude
1154+
# latitude, or datetime string types.
1155+
for col, array in enumerate(arrays[2:]):
1156+
if np.issubdtype(array.dtype, np.str_):
1157+
columns = col + 2
1158+
break
1159+
10901160
rows = len(arrays[0])
10911161
if not all(len(i) == rows for i in arrays):
10921162
raise GMTInvalidInput("All arrays must have same size.")
@@ -1098,9 +1168,24 @@ def virtualfile_from_vectors(self, *vectors):
10981168
family, geometry, mode="GMT_CONTAINER_ONLY", dim=[columns, rows, 1, 0]
10991169
)
11001170

1101-
for col, array in enumerate(arrays):
1171+
# Use put_vector for columns with numerical type data
1172+
for col, array in enumerate(arrays[:columns]):
11021173
self.put_vector(dataset, column=col, vector=array)
11031174

1175+
# Use put_strings for last column(s) with string type data
1176+
# Have to use modifier "GMT_IS_DUPLICATE" to duplicate the strings
1177+
string_arrays = arrays[columns:]
1178+
if string_arrays:
1179+
if len(string_arrays) == 1:
1180+
strings = string_arrays[0]
1181+
elif len(string_arrays) > 1:
1182+
strings = np.apply_along_axis(
1183+
func1d=" ".join, axis=0, arr=string_arrays
1184+
)
1185+
self.put_strings(
1186+
dataset, family="GMT_IS_VECTOR|GMT_IS_DUPLICATE", strings=strings
1187+
)
1188+
11041189
with self.open_virtual_file(
11051190
family, geometry, "GMT_IN|GMT_IS_REFERENCE", dataset
11061191
) as vfile:

pygmt/tests/test_clib.py

+44
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@
2727

2828
TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
2929

30+
with clib.Session() as _lib:
31+
gmt_version = Version(_lib.info["version"])
32+
3033

3134
@contextmanager
3235
def mock(session, func, returns=None, mock_func=None):
@@ -399,6 +402,47 @@ def test_virtualfile_from_vectors():
399402
assert output == expected
400403

401404

405+
@pytest.mark.xfail(
    condition=gmt_version < Version("6.1.1"),
    reason="GMT_Put_Strings only works for GMT 6.1.1 and above",
)
def test_virtualfile_from_vectors_one_string_column():
    """
    Test passing in one column with string dtype into virtual file dataset.

    Two integer columns and one string column are written through the
    virtual file and read back via ``convert``; the output must be the
    tab-separated rows of the three inputs.
    """
    size = 5
    x = np.arange(size, dtype=np.int32)
    y = np.arange(size, size * 2, 1, dtype=np.int32)
    # Use ``np.str_`` rather than ``np.str``: the latter was only an alias of
    # the builtin ``str`` and has been removed from recent NumPy releases.
    strings = np.array(["a", "bc", "defg", "hijklmn", "opqrst"], dtype=np.str_)
    with clib.Session() as lib:
        with lib.virtualfile_from_vectors(x, y, strings) as vfile:
            with GMTTempFile() as outfile:
                lib.call_module("convert", f"{vfile} ->{outfile.name}")
                output = outfile.read(keep_tabs=True)
        expected = "".join(f"{i}\t{j}\t{k}\n" for i, j, k in zip(x, y, strings))
        assert output == expected
422+
423+
424+
@pytest.mark.xfail(
    condition=gmt_version < Version("6.1.1"),
    reason="GMT_Put_Strings only works for GMT 6.1.1 and above",
)
def test_virtualfile_from_vectors_two_string_columns():
    """
    Test passing in two columns of string dtype into virtual file dataset.

    The expected output joins the two string columns with a single space
    after the two tab-separated numeric columns.
    """
    size = 5
    x = np.arange(size, dtype=np.int32)
    y = np.arange(size, size * 2, 1, dtype=np.int32)
    # Use ``np.str_`` rather than ``np.str``: the latter was only an alias of
    # the builtin ``str`` and has been removed from recent NumPy releases.
    strings1 = np.array(["a", "bc", "def", "ghij", "klmno"], dtype=np.str_)
    strings2 = np.array(["pqrst", "uvwx", "yz!", "@#", "$"], dtype=np.str_)
    with clib.Session() as lib:
        with lib.virtualfile_from_vectors(x, y, strings1, strings2) as vfile:
            with GMTTempFile() as outfile:
                lib.call_module("convert", f"{vfile} ->{outfile.name}")
                output = outfile.read(keep_tabs=True)
        expected = "".join(
            f"{h}\t{i}\t{j} {k}\n" for h, i, j, k in zip(x, y, strings1, strings2)
        )
        assert output == expected
444+
445+
402446
def test_virtualfile_from_vectors_transpose():
403447
"Test transforming matrix columns to virtual file dataset"
404448
dtypes = "float32 float64 int32 int64 uint32 uint64".split()

pygmt/tests/test_clib_put_strings.py

+67
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
"""
2+
Test the functions that put string data into GMT.
3+
"""
4+
import numpy as np
5+
import numpy.testing as npt
6+
import pytest
7+
from packaging.version import Version
8+
9+
from .. import clib
10+
from ..exceptions import GMTCLibError
11+
from ..helpers import GMTTempFile
12+
13+
with clib.Session() as _lib:
14+
gmt_version = Version(_lib.info["version"])
15+
16+
17+
@pytest.mark.xfail(
    condition=gmt_version < Version("6.1.1"),
    reason="GMT_Put_Strings only works for GMT 6.1.1 and above",
)
def test_put_strings():
    """
    Check that assigning a numpy array of dtype str to a dataset works.

    Two integer vectors and one string array are attached to a dataset,
    the dataset is written to a temporary file, and the file contents are
    compared against the inputs.
    """
    with clib.Session() as lib:
        dataset = lib.create_data(
            family="GMT_IS_DATASET|GMT_VIA_VECTOR",
            geometry="GMT_IS_POINT",
            mode="GMT_CONTAINER_ONLY",
            dim=[2, 5, 1, 0],  # columns, rows, layers, dtype
        )
        x = np.array([1, 2, 3, 4, 5], dtype=np.int32)
        y = np.array([6, 7, 8, 9, 10], dtype=np.int32)
        # Use ``np.str_`` rather than ``np.str``: the latter was only an alias
        # of the builtin ``str`` and has been removed from recent NumPy
        # releases.
        strings = np.array(["a", "bc", "defg", "hijklmn", "opqrst"], dtype=np.str_)
        lib.put_vector(dataset, column=lib["GMT_X"], vector=x)
        lib.put_vector(dataset, column=lib["GMT_Y"], vector=y)
        lib.put_strings(
            dataset, family="GMT_IS_VECTOR|GMT_IS_DUPLICATE", strings=strings
        )
        # Turns out wesn doesn't matter for Datasets
        wesn = [0] * 6
        # Save the data to a file to see if it's being accessed correctly
        with GMTTempFile() as tmp_file:
            lib.write_data(
                "GMT_IS_VECTOR",
                "GMT_IS_POINT",
                "GMT_WRITE_SET",
                wesn,
                tmp_file.name,
                dataset,
            )
            # Load the data and check that it's correct
            newx, newy, newstrings = tmp_file.loadtxt(
                unpack=True, dtype=[("x", np.int32), ("y", np.int32), ("text", "<U7")]
            )
            npt.assert_array_equal(newx, x)
            npt.assert_array_equal(newy, y)
            npt.assert_array_equal(newstrings, strings)
57+
58+
59+
def test_put_strings_fails():
    """
    Check that put_strings raises an exception if return code is not zero.

    Passing ``dataset=None`` is expected to make ``GMT_Put_Strings`` fail,
    which must surface as a :class:`GMTCLibError`.
    """
    with clib.Session() as lib:
        with pytest.raises(GMTCLibError):
            lib.put_strings(
                dataset=None,
                family="GMT_IS_VECTOR|GMT_IS_DUPLICATE",
                # ``np.str_`` replaces the removed ``np.str`` alias.
                strings=np.empty(shape=(3,), dtype=np.str_),
            )

0 commit comments

Comments
 (0)