
Commit c63e630

Merge pull request #35 from JeffersonLab/24-common-csv-parser
24 common csv parser
2 parents eea09a5 + ff117c0 commit c63e630


4 files changed, +108 -26 lines changed


jlab_datascience_toolkit/data_parser/__init__.py

Lines changed: 3 additions & 2 deletions
@@ -8,6 +8,7 @@
 from jlab_datascience_toolkit.data_parser.numpy_parser import NumpyParser
 
 register(
-    id="PandasParser_v0",
-    entry_point="jlab_datascience_toolkit.data_parser.pandas_parser_v0:PandasParser"
+    id='CSVParser_v0',
+    entry_point="jlab_datascience_toolkit.data_parser.parser_to_dataframe:Parser2DataFrame",
+    kwargs={'registry_config': {'file_format': 'csv'}}
 )
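With this registry entry, 'CSVParser_v0' resolves to Parser2DataFrame preconfigured with file_format='csv'. A minimal usage sketch, mirroring the calls in the unit tests below (the file names here are hypothetical):

    from jlab_datascience_toolkit.data_parser import make

    # 'CSVParser_v0' is registered with registry_config={'file_format': 'csv'}
    parser = make('CSVParser_v0', config={'filepaths': ['run1.csv', 'run2.csv']})
    df = parser.load_data()  # single DataFrame concatenated from both CSV files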

jlab_datascience_toolkit/data_parser/pandas_parser_v0.py renamed to jlab_datascience_toolkit/data_parser/parser_to_dataframe.py

Lines changed: 26 additions & 13 deletions
@@ -6,7 +6,7 @@
 import inspect
 import os
 
-pandas_parser_log = logging.getLogger('PandasParser_v0 Logger')
+parser_log = logging.getLogger('Parser Logger')
 
 # Supported file formats
 pandas_read_functions = dict(
@@ -16,7 +16,7 @@
     pickle=pd.read_pickle
 )
 
-class PandasParser(JDSTDataParser):
+class Parser2DataFrame(JDSTDataParser):
     """Reads a list of files and concatenates them in a Pandas DataFrame.
 
     Intialization arguments:
@@ -30,8 +30,9 @@ class PandasParser(JDSTDataParser):
         Format of files to parse. Currently supports csv, feather, json
         and pickle. Defaults to csv
     `read_kwargs: dict = {}`
-        Arguments to be passed
+        Arguments to be passed to the read function determined by `file_format`
     `concat_kwargs: dict = {}`
+        Arguments to be passed to pd.concat()
 
     Attributes
     ----------
@@ -59,19 +60,31 @@ class PandasParser(JDSTDataParser):
 
     """
 
-    def __init__(self, config: dict = None):
+    def __init__(self, config: dict = None, registry_config: dict = None):
         # It is important not to use default mutable arguments in python
         # (lists/dictionaries), so we set config to None and update later
 
+        # Priority for configurations is:
+        # 1) config (intended for users)
+        # 2) registry_config (intended only for the registry)
+        # 3) defaults (set below)
+
         # Set default config
         self.config = dict(
             filepaths=[],
             file_format='csv',
             read_kwargs = {},
             concat_kwargs = {},
         )
-        # Update configuration with new configuration
+
+        # First update defaults with registry_configuration
+        if registry_config is not None:
+            parser_log.debug(f'Updating defaults with: {registry_config}')
+            self.config.update(registry_config)
+
+        # Now update configuration with new (user) configuration
         if config is not None:
+            parser_log.debug(f'Updating registered config with: {config}')
             self.config.update(config)
 
         # To handle strings and lists of strings, we convert the former here
@@ -82,21 +95,21 @@ def __init__(self, config: dict = None):
 
     @property
     def name(self):
-        return 'PandasParser_v0'
+        return 'Parser2DataFrame_v0'
 
     def setup(self):
         # Set the correct reading function here
         self.read_function = pandas_read_functions.get(
             self.config['file_format'].lower(), None)
 
         if self.read_function is None:
-            pandas_parser_log.error(
+            parser_log.error(
                 f'File format {self.config["file_format"]}'
                 'is not currently supported.')
             raise ValueError
 
     def get_info(self):
-        """ Prints the docstring for the PandasParser module"""
+        """ Prints the docstring for the Parser2DataFrame module"""
         print(inspect.getdoc(self))
 
     def load(self, path: str):
@@ -133,15 +146,15 @@ def load_data(self) -> pd.DataFrame:
         """
         data_list = []
         for file in self.config['filepaths']:
-            pandas_parser_log.debug(f'Loading {file} ...')
+            parser_log.debug(f'Loading {file} ...')
             data = self.read_function(
                 file,
                 **self.config['read_kwargs'])
             data_list.append(data)
 
         # Check for empty data and return nothing if empty
         if not data_list:
-            pandas_parser_log.warning(
+            parser_log.warning(
                 'load_data() returning None. This is probably not what you '
                 'wanted. Ensure that your configuration includes the key '
                 '"filepaths"')
@@ -154,12 +167,12 @@ def load_data(self) -> pd.DataFrame:
         return output
 
     def load_config(self, path: str):
-        pandas_parser_log.debug('Calling load()...')
+        parser_log.debug('Calling load()...')
         return self.load(path)
 
     def save_config(self, path: str):
-        pandas_parser_log.debug('Calling save()...')
+        parser_log.debug('Calling save()...')
        return self.save(path)
 
     def save_data(self):
-        return super().save_data()
+        return super().save_data()
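A sketch of the configuration priority documented in __init__ (defaults, then registry_config supplied by the registry, then the user config) and of how read_kwargs and concat_kwargs are forwarded to pd.read_csv and pd.concat. The file names are hypothetical; the 'Date' column matches the one used in the unit tests:

    from jlab_datascience_toolkit.data_parser import make

    # The registry supplies registry_config={'file_format': 'csv'}; the user
    # config below overrides defaults but keeps the registered file format.
    parser = make('CSVParser_v0', config={
        'filepaths': ['gmes_day1.csv', 'gmes_day2.csv'],
        'read_kwargs': {'index_col': 'Date', 'parse_dates': True},  # passed to pd.read_csv
        'concat_kwargs': {'ignore_index': False},                   # passed to pd.concat
    })
    df = parser.load_data()  # one DataFrame built from both files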
Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
+import os
+import pathlib
+import yaml
+import pandas as pd
+
+def save_config_to_yaml(config, path):
+    save_path = pathlib.Path(path)
+    os.makedirs(save_path)
+    with open(save_path.joinpath('config.yaml'), 'w') as f:
+        yaml.safe_dump(self.config, f)
+
+def load_yaml_config(path):
+    base_path = Path(path)
+    with open(base_path.joinpath('config.yaml'), 'r') as f:
+        config = yaml.safe_load(f)
+    return config
+
+def read_data_to_pandas(filepaths: list, file_format: str, **kwargs) -> pd.DataFrame:
+    """ Loads all files listed in filepaths and reads them.
+    All kwargs other than filepaths and file_format will be passed to the read_function
+    for its associated file_format
+
+    Returns:
+        pd.DataFrame: A single DataFrame containing list of dataframes
+    """
+
+    # Supported file formats
+    read_functions = dict(
+        csv=pd.read_csv,
+        feather=pd.read_feather,
+        json=pd.read_json,
+        pickle=pd.read_pickle
+    )
+
+    data_list = []
+    read_function = read_functions[file_format]
+    for file in filepaths:
+        data = read_function(file, **kwargs)
+        data_list.append(data)
+
+    return data_list
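Two of the new helpers appear to have small issues as committed: save_config_to_yaml dumps self.config inside a module-level function, and load_yaml_config calls Path even though only pathlib is imported; read_data_to_pandas is also annotated as returning a pd.DataFrame but returns the list of per-file frames. A minimal corrected sketch of the config helpers, assuming the intent was to save and reload the config dict that is passed in:

    import os
    import pathlib
    import yaml

    def save_config_to_yaml(config, path):
        save_path = pathlib.Path(path)
        os.makedirs(save_path, exist_ok=True)  # exist_ok avoids failing on an existing directory
        with open(save_path.joinpath('config.yaml'), 'w') as f:
            yaml.safe_dump(config, f)  # dump the argument, not self.config

    def load_yaml_config(path):
        base_path = pathlib.Path(path)  # pathlib.Path rather than the undefined Path
        with open(base_path.joinpath('config.yaml'), 'r') as f:
            return yaml.safe_load(f)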

utests/utest_pandas_parser_v0.py renamed to utests/utest_csv_parser.py

Lines changed: 38 additions & 11 deletions
@@ -1,26 +1,33 @@
 from jlab_datascience_toolkit.data_parser import make
 import unittest
+import logging
 import matplotlib.pyplot as plt
 import pandas as pd
 import numpy as np
+import argparse
+import shutil
+import sys
 import os
 
+test_log = logging.Logger('test_logger')
+
 rng = np.random.default_rng(seed=42)
+parser_id = 'CSVParser_v0'
 
 
-class TestPandasParserv0(unittest.TestCase):
+class TestCSVParserv0(unittest.TestCase):
 
     # Initialize:
     # *****************************************
     def __init__(self, *args, **kwargs):
-        super(TestPandasParserv0, self).__init__(*args, **kwargs)
+        super(TestCSVParserv0, self).__init__(*args, **kwargs)
 
     @classmethod
     def setUpClass(self) -> None:
         print('Setting up all tests...')
         self.columns = ['R121GMES', 'R122GMES',
                         'R123GMES', 'R121GSET', 'R122GSET', 'R123GSET']
-        self.path = './pandas_parser_utest.csv'
+        self.path = './csv_parser_utest.csv'
         self.samples = 100
         data = rng.normal(loc=5, scale=1, size=(
             self.samples, len(self.columns)))
@@ -34,7 +41,7 @@ def setUpClass(self) -> None:
         test_data
         test_data.to_csv(self.path)
 
-        self.path2 = './pandas_parser_utest2.csv'
+        self.path2 = './csv_parser_utest2.csv'
         data = rng.normal(loc=9, scale=2, size=(
             self.samples, len(self.columns)))
         dates = []
@@ -64,14 +71,14 @@ def tearDown(self) -> None:
 
     def test_no_config(self):
         print('*****No Config Test*****\n')
-        parser = make('PandasParser_v0')
+        parser = make(parser_id)
         output = parser.load_data()
         self.assertIsNone(output)
 
     def test_string_filepaths(self):
         print('*****String Filepaths Test*****\n')
 
-        parser = make('PandasParser_v0', config=dict(filepaths=self.path))
+        parser = make(parser_id, config=dict(filepaths=self.path))
         output = parser.load_data()
         print('Output Head:\n', output.head())
 
@@ -80,14 +87,14 @@ def test_string_filepaths(self):
     def test_one_item_list_filepaths(self):
         print('*****One Item List Test*****\n')
 
-        parser = make('PandasParser_v0', config=dict(filepaths=[self.path]))
+        parser = make(parser_id, config=dict(filepaths=[self.path]))
         output = parser.load_data()
         print('Output Head:\n', output.head())
         self.assertEqual(output.shape, (self.samples, len(self.columns)+1))
 
     def test_two_filepaths(self):
         print('*****Two Filepaths Test*****\n')
-        parser = make('PandasParser_v0', config=dict(filepaths=[self.path, self.path2]))
+        parser = make(parser_id, config=dict(filepaths=[self.path, self.path2]))
         output = parser.load_data()
         print('Output Head:\n', output.head())
         print('Output shape:', output.shape)
@@ -97,7 +104,7 @@ def test_usecols_read_arg(self):
         print('*****Usecols Read Arg Test*****\n')
 
         two_columns = ['R121GMES', 'R121GSET']
-        parser = make('PandasParser_v0', config=dict(
+        parser = make(parser_id, config=dict(
             filepaths=self.path, read_kwargs=dict(usecols=two_columns)))
         output = parser.load_data()
         print('Output Head:\n', output.head())
@@ -110,7 +117,7 @@ def test_use_datetime_index(self):
         def column_lambda(x): return ('GMES' in x) or (x == 'Date')
         read_kwargs = dict(usecols=column_lambda,
                            index_col='Date', parse_dates=True)
-        parser = make('PandasParser_v0',
+        parser = make(parser_id,
                       config=dict(
                           filepaths=self.path, read_kwargs=read_kwargs)
                       )
@@ -121,7 +128,27 @@ def column_lambda(x): return ('GMES' in x) or (x == 'Date')
             self.assertTrue('GMES' in column)
         self.assertIsInstance(output.index, pd.DatetimeIndex)
 
+    def test_save_load(self):
+        print('*****Save/Load Test*****\n')
 
-# Run this file via: python utest_pandas_parser_v0.py
+        parser = make(parser_id, config=dict(filepaths=self.path, read_kwargs={'usecols': self.columns}))
+        output = parser.load_data()
+        save_path = './temp_parser'
+        try:
+            parser.save(save_path)
+            new_parser = make(parser_id)
+            new_parser.load(save_path)
+            new_output = new_parser.load_data()
+            for col in output.columns:
+                with self.subTest(col=col):
+                    self.assertTrue(np.allclose(output[col], new_output[col]))
+        finally:
+            shutil.rmtree(save_path)
+            pass
+
+# Run this file via: python utest_csv_parser_v0.py
 if __name__ == "__main__":
+    argv = len(sys.argv) > 1 and sys.argv[1]
+    loglevel = logging.DEBUG if argv == '-v' else logging.WARNING
+    logging.basicConfig(stream=sys.stdout, level=loglevel)
     unittest.main()
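For reference, the save/load round trip exercised by test_save_load looks like this outside the test harness (paths and file names are hypothetical):

    from jlab_datascience_toolkit.data_parser import make

    parser = make('CSVParser_v0', config={'filepaths': ['run1.csv']})
    parser.save('./temp_parser')      # persist the parser configuration

    restored = make('CSVParser_v0')
    restored.load('./temp_parser')    # restore the saved configuration
    df = restored.load_data()         # re-reads run1.csv with the same settings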
