Skip to content

Commit

Permalink
Merge pull request #35 from JeffersonLab/24-common-csv-parser
Browse files Browse the repository at this point in the history
24 common csv parser
  • Loading branch information
sgoldenCS authored May 2, 2024
2 parents eea09a5 + ff117c0 commit c63e630
Show file tree
Hide file tree
Showing 4 changed files with 108 additions and 26 deletions.
5 changes: 3 additions & 2 deletions jlab_datascience_toolkit/data_parser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from jlab_datascience_toolkit.data_parser.numpy_parser import NumpyParser

register(
id="PandasParser_v0",
entry_point="jlab_datascience_toolkit.data_parser.pandas_parser_v0:PandasParser"
id='CSVParser_v0',
entry_point="jlab_datascience_toolkit.data_parser.parser_to_dataframe:Parser2DataFrame",
kwargs={'registry_config': {'file_format': 'csv'}}
)
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import inspect
import os

pandas_parser_log = logging.getLogger('PandasParser_v0 Logger')
parser_log = logging.getLogger('Parser Logger')

# Supported file formats
pandas_read_functions = dict(
Expand All @@ -16,7 +16,7 @@
pickle=pd.read_pickle
)

class PandasParser(JDSTDataParser):
class Parser2DataFrame(JDSTDataParser):
"""Reads a list of files and concatenates them in a Pandas DataFrame.
Intialization arguments:
Expand All @@ -30,8 +30,9 @@ class PandasParser(JDSTDataParser):
Format of files to parse. Currently supports csv, feather, json
and pickle. Defaults to csv
`read_kwargs: dict = {}`
Arguments to be passed
Arguments to be passed to the read function determined by `file_format`
`concat_kwargs: dict = {}`
Arguments to be passed to pd.concat()
Attributes
----------
Expand Down Expand Up @@ -59,19 +60,31 @@ class PandasParser(JDSTDataParser):
"""

def __init__(self, config: dict = None):
def __init__(self, config: dict = None, registry_config: dict = None):
# It is important not to use default mutable arguments in python
# (lists/dictionaries), so we set config to None and update later

# Priority for configurations is:
# 1) config (intended for users)
# 2) registry_config (intended only for the registry)
# 3) defaults (set below)

# Set default config
self.config = dict(
filepaths=[],
file_format='csv',
read_kwargs = {},
concat_kwargs = {},
)
# Update configuration with new configuration

# First update defaults with registry_configuration
if registry_config is not None:
parser_log.debug(f'Updating defaults with: {registry_config}')
self.config.update(registry_config)

# Now update configuration with new (user) configuration
if config is not None:
parser_log.debug(f'Updating registered config with: {config}')
self.config.update(config)

# To handle strings and lists of strings, we convert the former here
Expand All @@ -82,21 +95,21 @@ def __init__(self, config: dict = None):

@property
def name(self):
return 'PandasParser_v0'
return 'Parser2DataFrame_v0'

def setup(self):
# Set the correct reading function here
self.read_function = pandas_read_functions.get(
self.config['file_format'].lower(), None)

if self.read_function is None:
pandas_parser_log.error(
parser_log.error(
f'File format {self.config["file_format"]}'
'is not currently supported.')
raise ValueError

def get_info(self):
""" Prints the docstring for the PandasParser module"""
""" Prints the docstring for the Parser2DataFrame module"""
print(inspect.getdoc(self))

def load(self, path: str):
Expand Down Expand Up @@ -133,15 +146,15 @@ def load_data(self) -> pd.DataFrame:
"""
data_list = []
for file in self.config['filepaths']:
pandas_parser_log.debug(f'Loading {file} ...')
parser_log.debug(f'Loading {file} ...')
data = self.read_function(
file,
**self.config['read_kwargs'])
data_list.append(data)

# Check for empty data and return nothing if empty
if not data_list:
pandas_parser_log.warning(
parser_log.warning(
'load_data() returning None. This is probably not what you '
'wanted. Ensure that your configuration includes the key '
'"filepaths"')
Expand All @@ -154,12 +167,12 @@ def load_data(self) -> pd.DataFrame:
return output

def load_config(self, path: str):
pandas_parser_log.debug('Calling load()...')
parser_log.debug('Calling load()...')
return self.load(path)

def save_config(self, path: str):
pandas_parser_log.debug('Calling save()...')
parser_log.debug('Calling save()...')
return self.save(path)

def save_data(self):
return super().save_data()
return super().save_data()
41 changes: 41 additions & 0 deletions jlab_datascience_toolkit/utils/parser_utilities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import os
import pathlib
import yaml
import pandas as pd

def save_config_to_yaml(config, path):
    """Save `config` as YAML to `<path>/config.yaml`.

    Creates the directory `path` if it does not already exist.

    Args:
        config: A YAML-serializable dictionary of configuration values.
        path: Directory in which to write `config.yaml`.
    """
    save_path = pathlib.Path(path)
    # exist_ok avoids FileExistsError when saving into an existing directory
    os.makedirs(save_path, exist_ok=True)
    with open(save_path.joinpath('config.yaml'), 'w') as f:
        # Bug fix: original dumped `self.config`, but this is a module-level
        # function with no `self` -- it must dump the `config` argument.
        yaml.safe_dump(config, f)

def load_yaml_config(path):
    """Load and return the configuration stored at `<path>/config.yaml`.

    Args:
        path: Directory containing a previously saved `config.yaml`.

    Returns:
        The parsed configuration (typically a dict).
    """
    # Bug fix: original used bare `Path`, but only `pathlib` is imported,
    # so every call raised NameError.
    base_path = pathlib.Path(path)
    with open(base_path.joinpath('config.yaml'), 'r') as f:
        config = yaml.safe_load(f)
    return config

def read_data_to_pandas(filepaths: list, file_format: str, **kwargs) -> list:
    """Read every file in `filepaths` with the pandas reader for `file_format`.

    All kwargs other than `filepaths` and `file_format` are passed to the
    pandas read function associated with `file_format`.

    Args:
        filepaths: List of paths to files to read.
        file_format: One of 'csv', 'feather', 'json' or 'pickle'
            (case-insensitive).
        **kwargs: Forwarded unchanged to the pandas read function.

    Returns:
        list: One pd.DataFrame per file, in the order given by `filepaths`.
            (The original annotation claimed a single DataFrame, but the
            function has always returned the list; callers concatenate.)

    Raises:
        ValueError: If `file_format` is not a supported format.
    """

    # Supported file formats
    read_functions = dict(
        csv=pd.read_csv,
        feather=pd.read_feather,
        json=pd.read_json,
        pickle=pd.read_pickle
    )

    # .lower() matches the case-insensitive handling in Parser2DataFrame.setup()
    read_function = read_functions.get(file_format.lower())
    if read_function is None:
        raise ValueError(
            f'File format {file_format} is not currently supported. '
            f'Supported formats: {sorted(read_functions)}')

    data_list = []
    for file in filepaths:
        data_list.append(read_function(file, **kwargs))

    return data_list
49 changes: 38 additions & 11 deletions utests/utest_pandas_parser_v0.py → utests/utest_csv_parser.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,33 @@
from jlab_datascience_toolkit.data_parser import make
import unittest
import logging
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import argparse
import shutil
import sys
import os

test_log = logging.Logger('test_logger')

rng = np.random.default_rng(seed=42)
parser_id = 'CSVParser_v0'


class TestPandasParserv0(unittest.TestCase):
class TestCSVParserv0(unittest.TestCase):

# Initialize:
# *****************************************
def __init__(self, *args, **kwargs):
super(TestPandasParserv0, self).__init__(*args, **kwargs)
super(TestCSVParserv0, self).__init__(*args, **kwargs)

@classmethod
def setUpClass(self) -> None:
print('Setting up all tests...')
self.columns = ['R121GMES', 'R122GMES',
'R123GMES', 'R121GSET', 'R122GSET', 'R123GSET']
self.path = './pandas_parser_utest.csv'
self.path = './csv_parser_utest.csv'
self.samples = 100
data = rng.normal(loc=5, scale=1, size=(
self.samples, len(self.columns)))
Expand All @@ -34,7 +41,7 @@ def setUpClass(self) -> None:
test_data
test_data.to_csv(self.path)

self.path2 = './pandas_parser_utest2.csv'
self.path2 = './csv_parser_utest2.csv'
data = rng.normal(loc=9, scale=2, size=(
self.samples, len(self.columns)))
dates = []
Expand Down Expand Up @@ -64,14 +71,14 @@ def tearDown(self) -> None:

def test_no_config(self):
print('*****No Config Test*****\n')
parser = make('PandasParser_v0')
parser = make(parser_id)
output = parser.load_data()
self.assertIsNone(output)

def test_string_filepaths(self):
print('*****String Filepaths Test*****\n')

parser = make('PandasParser_v0', config=dict(filepaths=self.path))
parser = make(parser_id, config=dict(filepaths=self.path))
output = parser.load_data()
print('Output Head:\n', output.head())

Expand All @@ -80,14 +87,14 @@ def test_string_filepaths(self):
def test_one_item_list_filepaths(self):
print('*****One Item List Test*****\n')

parser = make('PandasParser_v0', config=dict(filepaths=[self.path]))
parser = make(parser_id, config=dict(filepaths=[self.path]))
output = parser.load_data()
print('Output Head:\n', output.head())
self.assertEqual(output.shape, (self.samples, len(self.columns)+1))

def test_two_filepaths(self):
print('*****Two Filepaths Test*****\n')
parser = make('PandasParser_v0', config=dict(filepaths=[self.path, self.path2]))
parser = make(parser_id, config=dict(filepaths=[self.path, self.path2]))
output = parser.load_data()
print('Output Head:\n', output.head())
print('Output shape:', output.shape)
Expand All @@ -97,7 +104,7 @@ def test_usecols_read_arg(self):
print('*****Usecols Read Arg Test*****\n')

two_columns = ['R121GMES', 'R121GSET']
parser = make('PandasParser_v0', config=dict(
parser = make(parser_id, config=dict(
filepaths=self.path, read_kwargs=dict(usecols=two_columns)))
output = parser.load_data()
print('Output Head:\n', output.head())
Expand All @@ -110,7 +117,7 @@ def test_use_datetime_index(self):
def column_lambda(x): return ('GMES' in x) or (x == 'Date')
read_kwargs = dict(usecols=column_lambda,
index_col='Date', parse_dates=True)
parser = make('PandasParser_v0',
parser = make(parser_id,
config=dict(
filepaths=self.path, read_kwargs=read_kwargs)
)
Expand All @@ -121,7 +128,27 @@ def column_lambda(x): return ('GMES' in x) or (x == 'Date')
self.assertTrue('GMES' in column)
self.assertIsInstance(output.index, pd.DatetimeIndex)

def test_save_load(self):
    """Verify a saved parser config reloads and reproduces identical data."""
    print('*****Save/Load Test*****\n')

    parser = make(parser_id, config=dict(
        filepaths=self.path, read_kwargs={'usecols': self.columns}))
    output = parser.load_data()
    save_path = './temp_parser'
    try:
        parser.save(save_path)
        new_parser = make(parser_id)
        new_parser.load(save_path)
        new_output = new_parser.load_data()
        # Column-wise comparison gives a clearer failure message per column
        for col in output.columns:
            with self.subTest(col=col):
                self.assertTrue(np.allclose(output[col], new_output[col]))
    finally:
        # Always remove the temporary save directory, even on failure
        # (removed a dead `pass` statement that followed this cleanup)
        shutil.rmtree(save_path)

# Run this file via: python utest_csv_parser_v0.py
if __name__ == "__main__":
    # Pass -v as the first argument to enable debug-level logging.
    cli_flag = sys.argv[1] if len(sys.argv) > 1 else None
    level = logging.DEBUG if cli_flag == '-v' else logging.WARNING
    logging.basicConfig(stream=sys.stdout, level=level)
    unittest.main()

0 comments on commit c63e630

Please sign in to comment.