-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdatahelpers.py
393 lines (322 loc) · 12.3 KB
/
datahelpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
"""
Data I/O operations (reading, saving, etc.).
Script Information
------------------
- Contact: [email protected]
- Workspace: pyprojecttools
- Filename: datahelpers.py
- Date: September 29, 2023
Functions
---------
- class ParseConfig
- function read_json
- function read_txt
- function write_to_excel
- function write_to_json
- function write_to_txt
"""
import json
import os
import warnings
from configparser import ConfigParser
from typing import Any, Callable, Dict, List, Literal, Sequence, Tuple
from zipfile import ZipFile
import numpy as np
import torch
from pandas import DataFrame, ExcelWriter
# from strfmts import osdate_time
# from userwarnings import MissingArgumentsWarning
_reduced_types = DataFrame | List[DataFrame] | Tuple[DataFrame]
def to_excel(
    save_as: str = None,
    df: DataFrame | List[DataFrame] = None,
    sheetnames: str | List[str] = "Main",
    index: bool = False,
    **kwargs,
):
    """
    Builds a Microsoft Excel file (`.xlsx` or `.xls`) from a single Pandas DataFrame or
    a list/tuple of DataFrames. In the latter case, each DataFrame gets its own sheet;
    if a single sheet name is given for multiple frames, enumerated names are generated.

    Args:
        save_as (str, optional): what you want to name the saved file. Defaults to None.
        df (DataFrame | List[DataFrame], optional): single dataframe or list of dataframes to write. Defaults to None.
        sheetnames (str | List[str], optional): name or list of names for each excel sheet page. Defaults to 'Main'.
        index (bool, optional): option to write the index of each dataframe. Defaults to False.
    """
    with ExcelWriter(save_as) as writer:
        # BUG FIX: the original tested `isinstance(sheetnames, List[DataFrame])`,
        # which raises TypeError (subscripted generics cannot be used with
        # isinstance) and checked the wrong variable anyway.
        if isinstance(df, (list, tuple)):
            if isinstance(sheetnames, str):
                # single name for many frames: enumerate to keep sheets unique
                names = [f"{sheetnames}_{i}" for i in range(len(df))]
            else:
                names = sheetnames
            for frame, name in zip(df, names):
                frame.to_excel(writer, sheet_name=name, index=index, **kwargs)
        else:
            df.to_excel(writer, sheet_name=sheetnames, index=index, **kwargs)
def to_txt(
save_as: str = None,
lines: str | List[str] = None,
mode: Literal["w", "wb", "a"] = "w",
**kwargs,
):
"""
Writes string or list of strings to a text file.
Args:
save_as (str, optional): name to save file as. Defaults to None.
lines (Union[str, List[str]], optional): string or list of strings to save to file. Defaults to None.
mode (literal, optional): write mode for ExcelWriter. Defaults to 'w'.
"""
with open(save_as, mode=mode, **kwargs) as file:
if isinstance(lines, str):
file.write(lines)
else:
for line in lines:
file.write(line)
file.close()
def to_json(
    save_as: str = None,
    dict_to_save: Dict = None,
    mode: Literal["w", "wb"] = "w",
    **kwargs,
):
    """
    Serializes a dictionary to a JSON file.

    Args:
        save_as (str, optional): name to save file as. Defaults to None.
        dict_to_save (Dict, optional): dictionary to write to the JSON file. Defaults to None.
        mode (literal, optional): writing mode. Defaults to 'w'.
    """
    with open(save_as, mode) as out:
        json.dump(dict_to_save, out, **kwargs)
def df_to_json(
    data: DataFrame = None,
    save_as: str = None,
    mode: Literal["w", "wb"] = "w",
    cfg: Dict = None,
):
    """
    Saves a Pandas DataFrame into a JSON file.

    Args:
        data (DataFrame): DataFrame you are saving. Defaults to None.
        save_as (str): Name of JSON file you are saving to. Defaults to None.
        mode (Literal["w", "wb"]): Mode you are saving the DataFrame with. Defaults to "w".
        cfg (Dict): Configuration kwargs for DataFrame -> dict conversion
            (passed to `DataFrame.to_dict`). Defaults to
            {"orient": "dict", "into": dict, "index": True}.
    """
    # FIX: mutable default argument replaced with a None sentinel, and the
    # original `"into": type(dict)` (which is `type`, an invalid `into` target
    # for `to_dict`) corrected to `dict`.
    if cfg is None:
        cfg = {"orient": "dict", "into": dict, "index": True}
    with open(save_as, mode) as file:
        json.dump(data.to_dict(**cfg), file)
def read_txt(
    filename: str = None, mode: Literal["r", "rb"] = "r", **kwargs
) -> List[str]:
    """
    Reads a text file. Returns None if the path does not exist.

    Args:
        filename (str, optional): file name to read. Defaults to None.
        mode (literal, optional): reading mode. Defaults to 'r'.

    Returns:
        list: list of the lines read, or None for a missing file
    """
    # FIX: existence must be checked *before* opening; the original opened
    # first, so a missing file raised FileNotFoundError instead of returning
    # None as documented.
    if not os.path.exists(filename):
        return None
    with open(filename, mode=mode, **kwargs) as file:
        return file.readlines()
def read_json(
    filename: str = None, sort_key: Any = None, *args, **kwargs
) -> Dict[Any, Any]:
    """
    Reads a JSON file. Returns None if the path does not exist.

    Args:
        filename (str, optional): name of JSON file to read. Defaults to None.
        sort_key (Any, optional): key function; when given, the loaded data is
            passed through `sorted` (NOTE: for a dict this yields a sorted
            list of its keys, matching the original behavior). Defaults to None.

    Returns:
        dictionary: the parsed JSON data, or None for a missing file
    """
    # FIX: existence checked before opening; the original opened first and
    # raised FileNotFoundError instead of returning None as documented.
    if not os.path.exists(filename):
        return None
    with open(filename, *args, **kwargs) as f:
        data = json.load(f)
    return sorted(data, key=sort_key) if sort_key else data
def reduce_df(
    data: DataFrame = None,
    percent: float = None,
    save_type: Literal["list", "tuple", "dataframe"] = None,
) -> "_reduced_types":
    """
    Reduces the size of a `pd.DataFrame` to the given fraction of its rows. The original
    dataset is returned if parameters are not correctly set (as a safeguard).

    Notes:
        - Your data can be returned as a list or tuple of the columns.

    Args:
        data (`DataFrame`): Original dataset, by default None
        percent (float): Fraction of `data` rows that you want returned, by default None
        save_type (literal, optional): Return shortened `DataFrame` as a `tuple` or `list`
            of the columns; returns the `DataFrame` itself if left `None` or `"dataframe"`.

    Returns:
        DataFrame: Shortened `DataFrame` as a tuple or list of the columns, or a `DataFrame`.
    """
    # FIX: the original used `if not (data or percent)`, but DataFrame
    # truthiness raises ValueError ("truth value of a DataFrame is
    # ambiguous") -- explicit `is None` checks are required.
    if data is None or percent is None:
        _msg = "'data' and 'percent' must both be used. Original dataset will be returned if inputted, otherwise, None will be returned."
        # warnings.warn(_msg, MissingArgumentsWarning)
        return data if (data is not None and percent is None) else None
    new_df = data.head(int(data.shape[0] * percent))
    df_list = [new_df[col] for col in new_df.columns]
    returns = {"list": df_list, "tuple": tuple(df_list), "dataframe": new_df}
    return returns.get(save_type, new_df)
def pull_columns(data: Any = None, *cols, as_tuple: bool = False) -> List | Tuple:
"""
Method for pulling specific columns of dataframe, as list or tuple of said columns.
Args:
data (Any, optional): data to pull from (as a Pandas dataframe). Defaults to None.
tuple (bool, optional): option to return a tuple of pulled columns. Defaults to False.
Returns:
list or tuple: returns targeted columns as a list or tuple
"""
pulled = (data[col] for col in cols)
return pulled if as_tuple else list(pulled)
def zip_folder(
zipped_filename: str = None,
to_zip: Sequence[str] | str = None,
mode: Literal["w", "x", "a"] = "w",
) -> None:
"""
Zip files to folder. Enter a path to a dirctory, or a list of paths to files to zip.
Args:
zipped_filename (str, optional): Name of zipped folder. Defaults to None.
to_zip (Sequence[str] | str, optional): Path to directory or list of paths to zip. Defaults to None.
mode (literal["w", "x", "a"], optional): Zipping mode. Defaults to "w".
Notes:
- Zipping modes:
- `w` for writing to a new file
- `x` for referring to an existing file.
- `a` for appending to an existing file.
"""
with ZipFile(zipped_filename, mode) as file:
files_list = list(to_zip) if os.path.isdir(to_zip) else to_zip
for _file in files_list:
file.write(_file)
def zip_extract(
    zipped_file: str = None,
    read_mode: Literal["r"] = "r",
    extract_mode: Literal["folder", "lists"] = "folder",  # FIXME - not finished
    out_dir: str = None,
) -> None:
    """
    Extracts the contents of a ZIP archive into a directory.

    Args:
        zipped_file (str, optional): File path to zipped file. Defaults to None.
        read_mode (Literal["r"], optional): File reading mode. Defaults to "r".
        extract_mode (Literal["folder", "lists"], optional): ZIP extraction mode
            ("lists" is not yet implemented). Defaults to "folder".
        out_dir (str, optional): Directory to save the extracted contents to. Defaults to None.
    """
    # TODO - implement the "lists" extraction mode
    with ZipFile(zipped_file, read_mode) as archive:
        archive.extractall(out_dir)
def save_dict2xml(
    data: Dict[str, Any] = None, save_as: str = None, mode: Literal["w", "wb"] = "wb"
) -> None:
    """
    Saves a dictionary to an XML file. Keys become element tags; nested
    dictionaries become nested elements; other values are stringified as text.
    A pre-serialized XML string may also be passed and is written as-is.

    Args:
        data (Dict[str, Any], optional): Dictionary (or XML string) to save. Defaults to None.
        save_as (str, optional): Name you are saving the XML file as. Defaults to None.
        mode (Literal["w", "wb"], optional): File writing mode. Defaults to "wb".
    """
    from xml.etree import ElementTree as ET

    def _build(parent, mapping):
        # one element per key; recurse for nested dicts
        for key, value in mapping.items():
            child = ET.SubElement(parent, str(key))
            if isinstance(value, dict):
                _build(child, value)
            else:
                child.text = str(value)

    # FIX: the original called `data.encode()`, which does not exist on dict,
    # so the function failed for its documented input type.
    if isinstance(data, dict):
        root = ET.Element("root")
        _build(root, data)
        payload = ET.tostring(root)
    else:
        payload = data.encode()
    if "b" not in mode:
        # text mode needs str, not bytes
        payload = payload.decode()
    with open(save_as, mode) as file:
        file.write(payload)
def extract_tar(tarfile: str = None, path: str = None) -> None:
    """
    Extracts data from a `.tar` file.

    Parameters
    ----------
    tarfile : str, optional
        Path to your `.tar` file, by default None
    path : str, optional
        Path where you want to save the data to, by default None
    """
    # FIX: the parameter name shadows the stdlib `tarfile` module (which was
    # never imported), so `tarfile.open(tarfile)` resolved to the *string*'s
    # attributes and raised AttributeError. Import under an alias locally.
    import tarfile as _tarfile

    with _tarfile.open(tarfile) as file:
        file.extractall(path)
def list2tensor(
y: Sequence | np.array, sqz: int = 1, conv_type: Callable = torch.int64
) -> torch.Tensor:
"""
Converts an array (or list) into a `PyTorch` tensor (`torch.Tensor`).
Parameters
----------
y : Sequence | np.array
Sequence that you want to convert to a tensor.
sqz : int, optional
Squeeze value, by default 1
conv_type : Callable, optional
Type you want to convert the tensor values to, by default torch.int64
Returns
-------
torch.Tensor
Final tensor from sequence.
"""
y = y.to_numpy()
y = torch.Tensor(y)
y = y.squeeze(sqz)
y = y.type(conv_type)
return y
def conv2str(df: DataFrame = None, dtype: str = "str"):
    """
    Converts all columns of a given DataFrame into a desired data type, in place.
    Columns that cannot be converted are reported and left unchanged.

    Parameters
    ----------
    df : DataFrame, optional
        DataFrame that you want to convert, by default None
    dtype : str, optional
        Data type you want to convert the columns to, by default "str"
    """
    for colname in df.columns:
        try:
            df[colname] = df[colname].astype(dtype)
            print(f"'{colname}' converted to {dtype}")
        # FIX: bare `except:` also swallowed KeyboardInterrupt/SystemExit;
        # narrowed to Exception while keeping the best-effort behavior
        except Exception:
            print(f"'{colname}' conversion failed")
def combine_list(strings: list) -> str:
    """
    Combines a list of strings into a single string, stripping any leading and
    trailing whitespace from the result.

    Parameters
    ----------
    strings : list
        List of strings that you want to merge into a single string.

    Returns
    -------
    str
        Final combined string.
    """
    combined = "".join(strings)
    return combined.strip()
def normalize(matrix: np.array | Sequence = None) -> np.array:
"""
Divides matrix (or array) my its norm for normalization.
Parameters
----------
matrix : np.array | Sequence, optional
Matrix (or array) that you want to normalize, by default None
Returns
-------
np.array
Normalized array or matrix.
"""
norm = np.linalg.norm(matrix)
return matrix / norm
class ParseConfig(ConfigParser):
# FIXME - NOT WORKING
def __init__(self, filepath: str = None) -> None:
self.path = filepath
self._FILE = ConfigParser(self.path)
self._parsed_file = self._FILE.read()
def get_val(
self,
section: str = None,
key: str = None,
_type: Literal["str", "bool", "int", "float"] = None,
) -> Any:
if not _type:
return None
match _type:
case "str":
return self._parsed_file.get(section, key)
case "bool":
return self._parsed_file.getboolean(section, key)
case "int":
return self._parsed_file.getint(section, key)
case "float":
return self._parsed_file.getfloat(section, key)
@property
def cfgsections(self):
return self._parsed_file.sections()