clib: Add virtualfile_to_dataset method for converting virtualfile to a dataset (#3083)

seisman · weiji14 · web-flow · commit 0b46aade4cc9 · 2024-03-11T19:17:10.000+08:00
Co-authored-by: Wei Ji &lt;23487320+weiji14@users.noreply.github.com&gt;
diff --git a/doc/api/index.rst b/doc/api/index.rst
@@ -283,8 +283,8 @@ the :meth:`~pygmt.clib.Session.call_module` method:
 
 Passing memory blocks between Python data objects (e.g. :class:`numpy.ndarray`,
 :class:`pandas.Series`, :class:`xarray.DataArray`, etc) and GMT happens through
-*virtual files*. These methods are context managers that automate the
-conversion of Python variables to GMT virtual files:
+*virtual files*. These methods are context managers that automate the conversion of
+Python objects to and from GMT virtual files:
 
 .. autosummary::
     :toctree: generated
@@ -294,6 +294,7 @@ conversion of Python variables to GMT virtual files:
     clib.Session.virtualfile_from_grid
     clib.Session.virtualfile_in
     clib.Session.virtualfile_out
+    clib.Session.virtualfile_to_dataset
 
 Low level access (these are mostly used by the :mod:`pygmt.clib` package):
 
diff --git a/pygmt/clib/session.py b/pygmt/clib/session.py
@@ -1738,6 +1738,127 @@ def read_virtualfile(
         dtype = {"dataset": _GMT_DATASET, "grid": _GMT_GRID}[kind]
         return ctp.cast(pointer, ctp.POINTER(dtype))
 
+    def virtualfile_to_dataset(
+        self,
+        output_type: Literal["pandas", "numpy", "file"],
+        vfname: str,
+        column_names: list[str] | None = None,
+    ) -> pd.DataFrame | np.ndarray | None:
+        """
+        Output a tabular dataset stored in a virtual file to a different format.
+
+        The format of the dataset is determined by the ``output_type`` parameter.
+
+        Parameters
+        ----------
+        output_type
+            Desired output type of the result data.
+
+            - ``"pandas"`` will return a :class:`pandas.DataFrame` object.
+            - ``"numpy"`` will return a :class:`numpy.ndarray` object.
+            - ``"file"`` means the result was saved to a file and will return ``None``.
+        vfname
+            The virtual file name that stores the result data. Required for ``"pandas"``
+            and ``"numpy"`` output type.
+        column_names
+            The column names for the :class:`pandas.DataFrame` output.
+
+        Returns
+        -------
+        result
+            The result dataset. If ``output_type="file"`` returns ``None``.
+
+        Examples
+        --------
+        >>> from pathlib import Path
+        >>> import numpy as np
+        >>> import pandas as pd
+        >>>
+        >>> from pygmt.helpers import GMTTempFile
+        >>> from pygmt.clib import Session
+        >>>
+        >>> with GMTTempFile(suffix=".txt") as tmpfile:
+        ...     # prepare the sample data file
+        ...     with open(tmpfile.name, mode="w") as fp:
+        ...         print(">", file=fp)
+        ...         print("1.0 2.0 3.0 TEXT1 TEXT23", file=fp)
+        ...         print("4.0 5.0 6.0 TEXT4 TEXT567", file=fp)
+        ...         print(">", file=fp)
+        ...         print("7.0 8.0 9.0 TEXT8 TEXT90", file=fp)
+        ...         print("10.0 11.0 12.0 TEXT123 TEXT456789", file=fp)
+        ...
+        ...     # file output
+        ...     with Session() as lib:
+        ...         with GMTTempFile(suffix=".txt") as outtmp:
+        ...             with lib.virtualfile_out(
+        ...                 kind="dataset", fname=outtmp.name
+        ...             ) as vouttbl:
+        ...                 lib.call_module("read", f"{tmpfile.name} {vouttbl} -Td")
+        ...                 result = lib.virtualfile_to_dataset(
+        ...                     output_type="file", vfname=vouttbl
+        ...                 )
+        ...                 assert result is None
+        ...                 assert Path(outtmp.name).stat().st_size > 0
+        ...
+        ...     # numpy output
+        ...     with Session() as lib:
+        ...         with lib.virtualfile_out(kind="dataset") as vouttbl:
+        ...             lib.call_module("read", f"{tmpfile.name} {vouttbl} -Td")
+        ...             outnp = lib.virtualfile_to_dataset(
+        ...                 output_type="numpy", vfname=vouttbl
+        ...             )
+        ...     assert isinstance(outnp, np.ndarray)
+        ...
+        ...     # pandas output
+        ...     with Session() as lib:
+        ...         with lib.virtualfile_out(kind="dataset") as vouttbl:
+        ...             lib.call_module("read", f"{tmpfile.name} {vouttbl} -Td")
+        ...             outpd = lib.virtualfile_to_dataset(
+        ...                 output_type="pandas", vfname=vouttbl
+        ...             )
+        ...     assert isinstance(outpd, pd.DataFrame)
+        ...
+        ...     # pandas output with specified column names
+        ...     with Session() as lib:
+        ...         with lib.virtualfile_out(kind="dataset") as vouttbl:
+        ...             lib.call_module("read", f"{tmpfile.name} {vouttbl} -Td")
+        ...             outpd2 = lib.virtualfile_to_dataset(
+        ...                 output_type="pandas",
+        ...                 vfname=vouttbl,
+        ...                 column_names=["col1", "col2", "col3", "coltext"],
+        ...             )
+        ...     assert isinstance(outpd2, pd.DataFrame)
+        >>> outnp
+        array([[1.0, 2.0, 3.0, 'TEXT1 TEXT23'],
+               [4.0, 5.0, 6.0, 'TEXT4 TEXT567'],
+               [7.0, 8.0, 9.0, 'TEXT8 TEXT90'],
+               [10.0, 11.0, 12.0, 'TEXT123 TEXT456789']], dtype=object)
+        >>> outpd
+              0     1     2                   3
+        0   1.0   2.0   3.0        TEXT1 TEXT23
+        1   4.0   5.0   6.0       TEXT4 TEXT567
+        2   7.0   8.0   9.0        TEXT8 TEXT90
+        3  10.0  11.0  12.0  TEXT123 TEXT456789
+        >>> outpd2
+           col1  col2  col3             coltext
+        0   1.0   2.0   3.0        TEXT1 TEXT23
+        1   4.0   5.0   6.0       TEXT4 TEXT567
+        2   7.0   8.0   9.0        TEXT8 TEXT90
+        3  10.0  11.0  12.0  TEXT123 TEXT456789
+        """
+        if output_type == "file":  # Already written to file, so return None
+            return None
+
+        # Read the virtual file as a GMT dataset and convert to pandas.DataFrame
+        result = self.read_virtualfile(vfname, kind="dataset").contents.to_dataframe()
+        if output_type == "numpy":  # numpy.ndarray output
+            return result.to_numpy()
+
+        # Assign column names
+        if column_names is not None:
+            result.columns = column_names
+        return result  # pandas.DataFrame output
+
     def extract_region(self):
         """
         Extract the WESN bounding box of the currently active figure.