GMT_DATASET: Return an empty DataFrame if the file has no data

seisman · seisman · commit 9d4abf9892e3 · 2024-03-21T13:40:13.000+08:00
diff --git a/pygmt/datatypes/dataset.py b/pygmt/datatypes/dataset.py
@@ -13,8 +13,8 @@ class _GMT_DATASET(ctp.Structure):  # noqa: N801
     """
     GMT dataset structure for holding multiple tables (files).
 
-    This class is only meant for internal use by PyGMT and is not exposed to users.
-    See the GMT source code gmt_resources.h for the original C struct definitions.
+    This class is only meant for internal use and is not exposed to users. See the GMT
+    source code ``gmt_resources.h`` for the original C struct definitions.
 
     Examples
     --------
@@ -151,6 +151,8 @@ def to_dataframe(self) -> pd.DataFrame:
         the same. The same column in all segments of all tables are concatenated. The
         trailing text column is also concatenated as a single string column.
 
+        If the object has no data, an empty DataFrame will be returned.
+
         Returns
         -------
         df
@@ -185,8 +187,8 @@ def to_dataframe(self) -> pd.DataFrame:
         >>> df.dtypes.to_list()
         [dtype('float64'), dtype('float64'), dtype('float64'), string[python]]
         """
-        # Deal with numeric columns
         vectors = []
+        # Deal with numeric columns
         for icol in range(self.n_columns):
             colvector = []
             for itbl in range(self.n_tables):
@@ -211,5 +213,5 @@ def to_dataframe(self) -> pd.DataFrame:
                 pd.Series(data=np.char.decode(textvector), dtype=pd.StringDtype())
             )
 
-        df = pd.concat(objs=vectors, axis=1)
+        df = pd.concat(objs=vectors, axis=1) if vectors else pd.DataFrame()
         return df
diff --git a/pygmt/tests/test_datatypes_dataset.py b/pygmt/tests/test_datatypes_dataset.py
@@ -0,0 +1,84 @@
+"""
+Tests for GMT_DATASET data type.
+"""
+
+from pathlib import Path
+
+import pandas as pd
+from pygmt.clib import Session
+from pygmt.helpers import GMTTempFile
+
+
+def dataframe_from_pandas(filepath_or_buffer, sep=r"\s+", comment="#"):
+    """
+    Read tabular data as a pandas.DataFrame object using pandas.read_csv().
+
+    The parameters have the same meaning as in ``pandas.read_csv()``.
+    """
+    try:
+        df = pd.read_csv(filepath_or_buffer, sep=sep, comment=comment, header=None)
+    except pd.errors.EmptyDataError:
+        # Return an empty DataFrame if the file has no data
+        return pd.DataFrame()
+
+    # By default, pandas reads text strings with whitespaces as multiple columns, but
+    # GMT concatenates all trailing text as a single string column. Need to find all
+    # string columns (with dtype="object") and combine them into a single string column.
+    string_columns = df.select_dtypes(include=["object"]).columns
+    if len(string_columns) > 1:
+        df[string_columns[0]] = df[string_columns].apply(lambda x: " ".join(x), axis=1)
+        df = df.drop(string_columns[1:], axis=1)
+    # Convert 'object' to 'string' type
+    df = df.convert_dtypes(
+        convert_string=True,
+        convert_integer=False,
+        convert_boolean=False,
+        convert_floating=False,
+    )
+    return df
+
+
+def dataframe_from_gmt(fname):
+    """
+    Read tabular data as a pandas.DataFrame using a GMT virtual file.
+    """
+    with Session() as lib:
+        with lib.virtualfile_out(kind="dataset") as vouttbl:
+            lib.call_module("read", f"{fname} {vouttbl} -Td")
+            df = lib.virtualfile_to_dataset(vfname=vouttbl)
+            return df
+
+
+def test_dataset(benchmark):
+    """
+    Test the basic functionality of GMT_DATASET.
+    """
+    with GMTTempFile(suffix=".txt") as tmpfile:
+        with Path(tmpfile.name).open(mode="w") as fp:
+            print(">", file=fp)
+            print("1.0 2.0 3.0 TEXT1 TEXT23", file=fp)
+            print("4.0 5.0 6.0 TEXT4 TEXT567", file=fp)
+            print(">", file=fp)
+            print("7.0 8.0 9.0 TEXT8 TEXT90", file=fp)
+            print("10.0 11.0 12.0 TEXT123 TEXT456789", file=fp)
+
+        # The normal version is:
+        #   df = dataframe_from_gmt(tmpfile.name)
+        # but we want to benchmark the GMT_DATASET->DataFrame conversion.
+        df = benchmark(dataframe_from_gmt, tmpfile.name)  # The benchmark version
+        expected_df = dataframe_from_pandas(tmpfile.name, comment=">")
+        pd.testing.assert_frame_equal(df, expected_df)
+
+
+def test_dataset_empty():
+    """
+    Make sure that an empty DataFrame is returned if a file has no data.
+    """
+    with GMTTempFile(suffix=".txt") as tmpfile:
+        with Path(tmpfile.name).open(mode="w") as fp:
+            print("# This is a comment line.", file=fp)
+
+        df = dataframe_from_gmt(tmpfile.name)
+        assert df.empty  # Empty DataFrame
+        expected_df = dataframe_from_pandas(tmpfile.name)
+        pd.testing.assert_frame_equal(df, expected_df)