Skip to content

Commit

Permalink
Merge pull request #35 from JeffersonLab/24-common-csv-parser
Browse files Browse the repository at this point in the history
24 common csv parser
  • Loading branch information
sgoldenCS authored May 2, 2024
2 parents eea09a5 + ff117c0 commit c63e630
Show file tree
Hide file tree
Showing 4 changed files with 108 additions and 26 deletions.
5 changes: 3 additions & 2 deletions jlab_datascience_toolkit/data_parser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from jlab_datascience_toolkit.data_parser.numpy_parser import NumpyParser

register(
id="PandasParser_v0",
entry_point="jlab_datascience_toolkit.data_parser.pandas_parser_v0:PandasParser"
id='CSVParser_v0',
entry_point="jlab_datascience_toolkit.data_parser.parser_to_dataframe:Parser2DataFrame",
kwargs={'registry_config': {'file_format': 'csv'}}
)
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import inspect
import os

pandas_parser_log = logging.getLogger('PandasParser_v0 Logger')
parser_log = logging.getLogger('Parser Logger')

# Supported file formats
pandas_read_functions = dict(
Expand All @@ -16,7 +16,7 @@
pickle=pd.read_pickle
)

class PandasParser(JDSTDataParser):
class Parser2DataFrame(JDSTDataParser):
"""Reads a list of files and concatenates them in a Pandas DataFrame.
Intialization arguments:
Expand All @@ -30,8 +30,9 @@ class PandasParser(JDSTDataParser):
Format of files to parse. Currently supports csv, feather, json
and pickle. Defaults to csv
`read_kwargs: dict = {}`
Arguments to be passed
Arguments to be passed to the read function determined by `file_format`
`concat_kwargs: dict = {}`
Arguments to be passed to pd.concat()
Attributes
----------
Expand Down Expand Up @@ -59,19 +60,31 @@ class PandasParser(JDSTDataParser):
"""

def __init__(self, config: dict = None):
def __init__(self, config: dict = None, registry_config: dict = None):
# It is important not to use default mutable arguments in python
# (lists/dictionaries), so we set config to None and update later

# Priority for configurations is:
# 1) config (intended for users)
# 2) registry_config (intended only for the registry)
# 3) defaults (set below)

# Set default config
self.config = dict(
filepaths=[],
file_format='csv',
read_kwargs = {},
concat_kwargs = {},
)
# Update configuration with new configuration

# First update defaults with registry_configuration
if registry_config is not None:
parser_log.debug(f'Updating defaults with: {registry_config}')
self.config.update(registry_config)

# Now update configuration with new (user) configuration
if config is not None:
parser_log.debug(f'Updating registered config with: {config}')
self.config.update(config)

# To handle strings and lists of strings, we convert the former here
Expand All @@ -82,21 +95,21 @@ def __init__(self, config: dict = None):

@property
def name(self):
return 'PandasParser_v0'
return 'Parser2DataFrame_v0'

def setup(self):
# Set the correct reading function here
self.read_function = pandas_read_functions.get(
self.config['file_format'].lower(), None)

if self.read_function is None:
pandas_parser_log.error(
parser_log.error(
f'File format {self.config["file_format"]}'
'is not currently supported.')
raise ValueError

def get_info(self):
""" Prints the docstring for the PandasParser module"""
""" Prints the docstring for the Parser2DataFrame module"""
print(inspect.getdoc(self))

def load(self, path: str):
Expand Down Expand Up @@ -133,15 +146,15 @@ def load_data(self) -> pd.DataFrame:
"""
data_list = []
for file in self.config['filepaths']:
pandas_parser_log.debug(f'Loading {file} ...')
parser_log.debug(f'Loading {file} ...')
data = self.read_function(
file,
**self.config['read_kwargs'])
data_list.append(data)

# Check for empty data and return nothing if empty
if not data_list:
pandas_parser_log.warning(
parser_log.warning(
'load_data() returning None. This is probably not what you '
'wanted. Ensure that your configuration includes the key '
'"filepaths"')
Expand All @@ -154,12 +167,12 @@ def load_data(self) -> pd.DataFrame:
return output

def load_config(self, path: str):
pandas_parser_log.debug('Calling load()...')
parser_log.debug('Calling load()...')
return self.load(path)

def save_config(self, path: str):
pandas_parser_log.debug('Calling save()...')
parser_log.debug('Calling save()...')
return self.save(path)

def save_data(self):
return super().save_data()
return super().save_data()
41 changes: 41 additions & 0 deletions jlab_datascience_toolkit/utils/parser_utilities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import os
import pathlib
import yaml
import pandas as pd

def save_config_to_yaml(config, path):
    """Save `config` as YAML to `<path>/config.yaml`.

    Creates the directory `path` if it does not already exist.

    Args:
        config: A YAML-serializable dictionary of configuration values.
        path: Directory in which to write `config.yaml`.
    """
    save_path = pathlib.Path(path)
    # exist_ok avoids FileExistsError when saving into an existing directory
    os.makedirs(save_path, exist_ok=True)
    with open(save_path.joinpath('config.yaml'), 'w') as f:
        # Bug fix: original dumped `self.config`, but this is a module-level
        # function with no `self` -- it must dump the `config` argument.
        yaml.safe_dump(config, f)

def load_yaml_config(path):
    """Load and return the configuration stored at `<path>/config.yaml`.

    Args:
        path: Directory containing a previously saved `config.yaml`.

    Returns:
        The parsed configuration (typically a dict).
    """
    # Bug fix: original used bare `Path`, but only `pathlib` is imported,
    # so every call raised NameError.
    base_path = pathlib.Path(path)
    with open(base_path.joinpath('config.yaml'), 'r') as f:
        config = yaml.safe_load(f)
    return config

def read_data_to_pandas(filepaths: list, file_format: str, **kwargs) -> list:
    """Read every file in `filepaths` with the pandas reader for `file_format`.

    All kwargs other than `filepaths` and `file_format` are passed to the
    pandas read function associated with `file_format`.

    Args:
        filepaths: List of paths to files to read.
        file_format: One of 'csv', 'feather', 'json' or 'pickle'
            (case-insensitive).
        **kwargs: Forwarded unchanged to the pandas read function.

    Returns:
        list: One pd.DataFrame per file, in the order given by `filepaths`.
            (The original annotation claimed a single DataFrame, but the
            function has always returned the list; callers concatenate.)

    Raises:
        ValueError: If `file_format` is not a supported format.
    """

    # Supported file formats
    read_functions = dict(
        csv=pd.read_csv,
        feather=pd.read_feather,
        json=pd.read_json,
        pickle=pd.read_pickle
    )

    # .lower() matches the case-insensitive handling in Parser2DataFrame.setup()
    read_function = read_functions.get(file_format.lower())
    if read_function is None:
        raise ValueError(
            f'File format {file_format} is not currently supported. '
            f'Supported formats: {sorted(read_functions)}')

    data_list = []
    for file in filepaths:
        data_list.append(read_function(file, **kwargs))

    return data_list
49 changes: 38 additions & 11 deletions utests/utest_pandas_parser_v0.py → utests/utest_csv_parser.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,33 @@
from jlab_datascience_toolkit.data_parser import make
import unittest
import logging
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import argparse
import shutil
import sys
import os

test_log = logging.Logger('test_logger')

rng = np.random.default_rng(seed=42)
parser_id = 'CSVParser_v0'


class TestPandasParserv0(unittest.TestCase):
class TestCSVParserv0(unittest.TestCase):

# Initialize:
# *****************************************
def __init__(self, *args, **kwargs):
super(TestPandasParserv0, self).__init__(*args, **kwargs)
super(TestCSVParserv0, self).__init__(*args, **kwargs)

@classmethod
def setUpClass(self) -> None:
print('Setting up all tests...')
self.columns = ['R121GMES', 'R122GMES',
'R123GMES', 'R121GSET', 'R122GSET', 'R123GSET']
self.path = './pandas_parser_utest.csv'
self.path = './csv_parser_utest.csv'
self.samples = 100
data = rng.normal(loc=5, scale=1, size=(
self.samples, len(self.columns)))
Expand All @@ -34,7 +41,7 @@ def setUpClass(self) -> None:
test_data
test_data.to_csv(self.path)

self.path2 = './pandas_parser_utest2.csv'
self.path2 = './csv_parser_utest2.csv'
data = rng.normal(loc=9, scale=2, size=(
self.samples, len(self.columns)))
dates = []
Expand Down Expand Up @@ -64,14 +71,14 @@ def tearDown(self) -> None:

def test_no_config(self):
print('*****No Config Test*****\n')
parser = make('PandasParser_v0')
parser = make(parser_id)
output = parser.load_data()
self.assertIsNone(output)

def test_string_filepaths(self):
print('*****String Filepaths Test*****\n')

parser = make('PandasParser_v0', config=dict(filepaths=self.path))
parser = make(parser_id, config=dict(filepaths=self.path))
output = parser.load_data()
print('Output Head:\n', output.head())

Expand All @@ -80,14 +87,14 @@ def test_string_filepaths(self):
def test_one_item_list_filepaths(self):
print('*****One Item List Test*****\n')

parser = make('PandasParser_v0', config=dict(filepaths=[self.path]))
parser = make(parser_id, config=dict(filepaths=[self.path]))
output = parser.load_data()
print('Output Head:\n', output.head())
self.assertEqual(output.shape, (self.samples, len(self.columns)+1))

def test_two_filepaths(self):
print('*****Two Filepaths Test*****\n')
parser = make('PandasParser_v0', config=dict(filepaths=[self.path, self.path2]))
parser = make(parser_id, config=dict(filepaths=[self.path, self.path2]))
output = parser.load_data()
print('Output Head:\n', output.head())
print('Output shape:', output.shape)
Expand All @@ -97,7 +104,7 @@ def test_usecols_read_arg(self):
print('*****Usecols Read Arg Test*****\n')

two_columns = ['R121GMES', 'R121GSET']
parser = make('PandasParser_v0', config=dict(
parser = make(parser_id, config=dict(
filepaths=self.path, read_kwargs=dict(usecols=two_columns)))
output = parser.load_data()
print('Output Head:\n', output.head())
Expand All @@ -110,7 +117,7 @@ def test_use_datetime_index(self):
def column_lambda(x): return ('GMES' in x) or (x == 'Date')
read_kwargs = dict(usecols=column_lambda,
index_col='Date', parse_dates=True)
parser = make('PandasParser_v0',
parser = make(parser_id,
config=dict(
filepaths=self.path, read_kwargs=read_kwargs)
)
Expand All @@ -121,7 +128,27 @@ def column_lambda(x): return ('GMES' in x) or (x == 'Date')
self.assertTrue('GMES' in column)
self.assertIsInstance(output.index, pd.DatetimeIndex)

def test_save_load(self):
    """Verify a saved parser config reloads and reproduces identical data."""
    print('*****Save/Load Test*****\n')

    parser = make(parser_id, config=dict(
        filepaths=self.path, read_kwargs={'usecols': self.columns}))
    output = parser.load_data()
    save_path = './temp_parser'
    try:
        parser.save(save_path)
        new_parser = make(parser_id)
        new_parser.load(save_path)
        new_output = new_parser.load_data()
        # Column-wise comparison gives a clearer failure message per column
        for col in output.columns:
            with self.subTest(col=col):
                self.assertTrue(np.allclose(output[col], new_output[col]))
    finally:
        # Always remove the temporary save directory, even on failure
        # (removed a dead `pass` statement that followed this cleanup)
        shutil.rmtree(save_path)

# Run this file via: python utest_csv_parser_v0.py
if __name__ == "__main__":
    # Pass -v as the first argument to enable debug-level logging.
    cli_flag = sys.argv[1] if len(sys.argv) > 1 else None
    level = logging.DEBUG if cli_flag == '-v' else logging.WARNING
    logging.basicConfig(stream=sys.stdout, level=level)
    unittest.main()

0 comments on commit c63e630

Please sign in to comment.