Skip to content

Commit 9d4abf9

Browse files
committed
GMT_DATASET: Return an empty DataFrame if the file has no data
1 parent dd8e0cd commit 9d4abf9

File tree

2 files changed

+90
-4
lines changed

2 files changed

+90
-4
lines changed

pygmt/datatypes/dataset.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ class _GMT_DATASET(ctp.Structure): # noqa: N801
1313
"""
1414
GMT dataset structure for holding multiple tables (files).
1515
16-
This class is only meant for internal use by PyGMT and is not exposed to users.
17-
See the GMT source code gmt_resources.h for the original C struct definitions.
16+
This class is only meant for internal use and is not exposed to users. See the GMT
17+
source code ``gmt_resources.h`` for the original C struct definitions.
1818
1919
Examples
2020
--------
@@ -151,6 +151,8 @@ def to_dataframe(self) -> pd.DataFrame:
151151
the same. The same column in all segments of all tables is concatenated. The
152152
trailing text column is also concatenated as a single string column.
153153
154+
If the object has no data, an empty DataFrame will be returned.
155+
154156
Returns
155157
-------
156158
df
@@ -185,8 +187,8 @@ def to_dataframe(self) -> pd.DataFrame:
185187
>>> df.dtypes.to_list()
186188
[dtype('float64'), dtype('float64'), dtype('float64'), string[python]]
187189
"""
188-
# Deal with numeric columns
189190
vectors = []
191+
# Deal with numeric columns
190192
for icol in range(self.n_columns):
191193
colvector = []
192194
for itbl in range(self.n_tables):
@@ -211,5 +213,5 @@ def to_dataframe(self) -> pd.DataFrame:
211213
pd.Series(data=np.char.decode(textvector), dtype=pd.StringDtype())
212214
)
213215

214-
df = pd.concat(objs=vectors, axis=1)
216+
df = pd.concat(objs=vectors, axis=1) if vectors else pd.DataFrame()
215217
return df

pygmt/tests/test_datatypes_dataset.py

+84
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
"""
2+
Tests for GMT_DATASET data type.
3+
"""
4+
5+
from pathlib import Path
6+
7+
import pandas as pd
8+
from pygmt.clib import Session
9+
from pygmt.helpers import GMTTempFile
10+
11+
12+
def dataframe_from_pandas(filepath_or_buffer, sep=r"\s+", comment="#"):
13+
"""
14+
Read tabular data as a pandas.DataFrame object using pandas.read_csv().
15+
16+
The parameters have the same meaning as in ``pandas.read_csv()``.
17+
"""
18+
try:
19+
df = pd.read_csv(filepath_or_buffer, sep=sep, comment=comment, header=None)
20+
except pd.errors.EmptyDataError:
21+
# Return an empty DataFrame if the file has no data
22+
return pd.DataFrame()
23+
24+
# By default, pandas reads text strings with whitespaces as multiple columns, but
25+
# GMT concatenates all trailing text as a single string column. Need to find all
26+
# string columns (with dtype="object") and combine them into a single string column.
27+
string_columns = df.select_dtypes(include=["object"]).columns
28+
if len(string_columns) > 1:
29+
df[string_columns[0]] = df[string_columns].apply(lambda x: " ".join(x), axis=1)
30+
df = df.drop(string_columns[1:], axis=1)
31+
# Convert 'object' to 'string' type
32+
df = df.convert_dtypes(
33+
convert_string=True,
34+
convert_integer=False,
35+
convert_boolean=False,
36+
convert_floating=False,
37+
)
38+
return df
39+
40+
41+
def dataframe_from_gmt(fname):
42+
"""
43+
Read tabular data as a pandas.DataFrame using a GMT virtual file.
44+
"""
45+
with Session() as lib:
46+
with lib.virtualfile_out(kind="dataset") as vouttbl:
47+
lib.call_module("read", f"{fname} {vouttbl} -Td")
48+
df = lib.virtualfile_to_dataset(vfname=vouttbl)
49+
return df
50+
51+
52+
def test_dataset(benchmark):
53+
"""
54+
Test the basic functionality of GMT_DATASET.
55+
"""
56+
with GMTTempFile(suffix=".txt") as tmpfile:
57+
with Path(tmpfile.name).open(mode="w") as fp:
58+
print(">", file=fp)
59+
print("1.0 2.0 3.0 TEXT1 TEXT23", file=fp)
60+
print("4.0 5.0 6.0 TEXT4 TEXT567", file=fp)
61+
print(">", file=fp)
62+
print("7.0 8.0 9.0 TEXT8 TEXT90", file=fp)
63+
print("10.0 11.0 12.0 TEXT123 TEXT456789", file=fp)
64+
65+
# The normal version is:
66+
# df = dataframe_from_gmt(tmpfile.name)
67+
# but we want to benchmark the GMT_DATASET->DataFrame conversion.
68+
df = benchmark(dataframe_from_gmt, tmpfile.name) # The benchmark version
69+
expected_df = dataframe_from_pandas(tmpfile.name, comment=">")
70+
pd.testing.assert_frame_equal(df, expected_df)
71+
72+
73+
def test_dataset_empty():
74+
"""
75+
Make sure that an empty DataFrame is returned if a file has no data.
76+
"""
77+
with GMTTempFile(suffix=".txt") as tmpfile:
78+
with Path(tmpfile.name).open(mode="w") as fp:
79+
print("# This is a comment line.", file=fp)
80+
81+
df = dataframe_from_gmt(tmpfile.name)
82+
assert df.empty # Empty DataFrame
83+
expected_df = dataframe_from_pandas(tmpfile.name)
84+
pd.testing.assert_frame_equal(df, expected_df)

0 commit comments

Comments
 (0)