1
- import dask .dataframe as dd
1
+ """Module providing functions for processing and wrangling data."""
2
+
2
3
from datetime import datetime
4
+ from pathlib import Path
5
+
3
6
import numpy as np
4
7
import pandas as pd
5
- from pathlib import Path
6
8
9
+ import dask .dataframe as dd
7
10
from .config import Config
8
11
9
12
def format_outname(prefix: str, se: bool, weekday: bool):
    """
    Build the name of the output signal.

    Parameters
    ----------
    prefix: string prepended to the signal name (joined with "_") when `se`
        is true; must not be None in that case
    se: boolean to write out standard errors, if true, use an obfuscated name
    weekday: boolean for weekday adjustments.
        signals will be generated with weekday adjustments (True) or without
        adjustments (False)

    Returns
    -------
    out_name: str
        "smoothed_adj_cli" when `weekday` is true, otherwise "smoothed_cli";
        prefixed with "<prefix>_" when `se` is true

    Raises
    ------
    AssertionError
        if `se` is true and `prefix` is None
    """
    out_name = "smoothed_adj_cli" if weekday else "smoothed_cli"
    if se:
        # Raise explicitly rather than via an `assert` statement so the check
        # is not stripped when Python runs with -O; the exception type is
        # kept as AssertionError so existing callers see the same error.
        if prefix is None:
            raise AssertionError("template has no obfuscated prefix")
        out_name = prefix + "_" + out_name
    return out_name
28
33
29
34
def format_df (df : pd .DataFrame , geo_id : str , se : bool , logger ):
30
- '''
31
- format dataframe and checks for anomalies to write results
35
+ """
36
+ Format dataframe and check for anomalies to write results.
37
+
32
38
Parameters
33
39
----------
34
40
df: dataframe from output from update_sensor
@@ -39,7 +45,7 @@ def format_df(df: pd.DataFrame, geo_id: str, se: bool, logger):
39
45
Returns
40
46
-------
41
47
filtered and formatted dataframe
42
- '''
48
+ """
43
49
# report in percentage
44
50
df ['val' ] = df ['val' ] * 100
45
51
df ["se" ] = df ["se" ] * 100
@@ -66,7 +72,7 @@ def format_df(df: pd.DataFrame, geo_id: str, se: bool, logger):
66
72
valid_cond = (df ['se' ] > 0 ) & (df ['val' ] > 0 )
67
73
invalid_df = df [~ valid_cond ]
68
74
if len (invalid_df ) > 0 :
69
- logger .info (f "p=0, std_err=0 invalid" )
75
+ logger .info ("p=0, std_err=0 invalid" )
70
76
df = df [valid_cond ]
71
77
else :
72
78
df ["se" ] = np .NAN
@@ -76,7 +82,8 @@ def format_df(df: pd.DataFrame, geo_id: str, se: bool, logger):
76
82
return df
77
83
78
84
def write_to_csv (output_df : pd .DataFrame , prefix : str , geo_id : str , weekday : bool , se :bool , logger , output_path = "." ):
79
- """Write sensor values to csv.
85
+ """
86
+ Write sensor values to csv.
80
87
81
88
Args:
82
89
output_df: dataframe containing sensor rates, se, unique dates, and unique geo_id
@@ -106,9 +113,9 @@ def write_to_csv(output_df: pd.DataFrame, prefix: str, geo_id: str, weekday: boo
106
113
107
114
108
115
def csv_to_df (filepath : str , startdate : datetime , enddate : datetime , dropdate : datetime , logger ) -> pd .DataFrame :
109
- '''
110
- Reads csv using Dask and filters out based on date range and currently unused column,
111
- then converts back into pandas dataframe.
116
+ """
117
+ Read csv using Dask, filter unneeded data, then convert back into a pandas dataframe.
118
+
112
119
Parameters
113
120
----------
114
121
filepath: path to the aggregated doctor-visits data
@@ -117,7 +124,7 @@ def csv_to_df(filepath: str, startdate: datetime, enddate: datetime, dropdate: d
117
124
dropdate: data drop date (YYYY-mm-dd)
118
125
119
126
-------
120
- '''
127
+ """
121
128
filepath = Path (filepath )
122
129
logger .info (f"Processing { filepath } " )
123
130
0 commit comments