6
6
7
7
from .config import Config
8
8
9
def format_outname(prefix: str, se: bool, weekday: bool):
    '''
    Build the output signal name for a sensor csv.

    Parameters
    ----------
    prefix: obfuscated prefix prepended to the name when se is True;
        must not be None in that case
    se: boolean to write out standard errors; if true, use an obfuscated name
    weekday: boolean for the weekday-adjusted signal; selects the
        "smoothed_adj_cli" name instead of "smoothed_cli"

    Returns
    -------
    output signal name string, e.g. "smoothed_adj_cli" or
    "<prefix>_smoothed_cli"
    '''
    # write out results
    out_name = "smoothed_adj_cli" if weekday else "smoothed_cli"
    if se:
        # NOTE(review): assert is stripped under `python -O`; kept as-is so
        # existing callers that expect AssertionError are unaffected.
        assert prefix is not None, "template has no obfuscated prefix"
        out_name = prefix + "_" + out_name
    return out_name
29
+ def format_df (df : pd .DataFrame , geo_id : str , se : bool , logger ):
30
+ '''
31
+ format dataframe and checks for anomalies to write results
32
+ Parameters
33
+ ----------
34
+ df: dataframe from output from update_sensor
35
+ geo_id: geographic resolution, one of ["county", "state", "msa", "hrr", "nation", "hhs"]
36
+ se: boolean to write out standard errors, if true, use an obfuscated name
37
+ logger
38
+
39
+ Returns
40
+ -------
41
+ filtered and formatted dataframe
22
42
'''
23
43
# report in percentage
24
44
df ['val' ] = df ['val' ] * 100
@@ -28,53 +48,61 @@ def format_df(df: pd.DataFrame, geo_id: str, se: bool, logger):
28
48
df_val_null = df [val_isnull ]
29
49
if not df_val_null .empty :
30
50
logger .info ("sensor value is nan, check pipeline" )
31
- filtered_df = df [~ val_isnull ]
51
+ df = df [~ val_isnull ]
32
52
33
- se_too_high = filtered_df ['se' ] >= 5
34
- df_se_too_high = filtered_df [se_too_high ]
35
- if len (df_se_too_high . empty ) > 0 :
53
+ se_too_high = df ['se' ] >= 5
54
+ df_se_too_high = df [se_too_high ]
55
+ if len (df_se_too_high ) > 0 :
36
56
logger .info (f"standard error suspiciously high! investigate { geo_id } " )
37
- filtered_df = filtered_df [~ se_too_high ]
57
+ df = df [~ se_too_high ]
38
58
39
- sensor_too_high = filtered_df ['val' ] >= 90
40
- df_sensor_too_high = filtered_df [sensor_too_high ]
59
+ sensor_too_high = df ['val' ] >= 90
60
+ df_sensor_too_high = df [sensor_too_high ]
41
61
if len (df_sensor_too_high ) > 0 :
42
62
logger .info (f"standard error suspiciously high! investigate { geo_id } " )
43
- filtered_df = filtered_df [~ sensor_too_high ]
63
+ df = df [~ sensor_too_high ]
44
64
45
65
if se :
46
- valid_cond = filtered_df ['se' ] > 0 & filtered_df ['val' ] > 0
47
- invalid_df = filtered_df [~ valid_cond ]
66
+ valid_cond = ( df ['se' ] > 0 ) & ( df ['val' ] > 0 )
67
+ invalid_df = df [~ valid_cond ]
48
68
if len (invalid_df ) > 0 :
49
69
logger .info (f"p=0, std_err=0 invalid" )
50
- filtered_df = filtered_df [valid_cond ]
70
+ df = df [valid_cond ]
51
71
else :
52
- filtered_df .drop (columns = ['se' ], inplace = True )
53
-
72
+ df ["se" ] = np .NAN
54
73
74
+ df ["direction" ] = np .NAN
75
+ df ["sample_size" ] = np .NAN
76
+ return df
55
77
56
def write_to_csv(output_df: pd.DataFrame, prefix: str, geo_id: str, weekday: bool, se: bool, logger, output_path="."):
    """Write sensor values to csv, one file per date.

    Args:
        output_df: dataframe from update_sensor with at least "date", "val",
            and "se" columns
        prefix: obfuscated prefix prepended to the output name when se is True
        geo_id: geographic resolution, one of ["county", "state", "msa", "hrr", "nation", "hhs"]
        weekday: boolean for the weekday-adjusted signal; selects the output name
        se: boolean to write out standard errors, if true, use an obfuscated name
        logger: logger used to report anomalies and per-date progress
        output_path: outfile path to write the csv (default is current directory)
    """
    out_name = format_outname(prefix, se, weekday)
    filtered_df = format_df(output_df, geo_id, se, logger)

    if se:
        logger.info(f"========= WARNING: WRITING SEs TO {out_name} =========")

    # Iterate the groups of the *filtered* frame directly: taking dates from
    # the unfiltered output_df would make grouped.get_group(d) raise KeyError
    # for any date whose rows were all dropped by format_df.
    grouped = filtered_df.groupby("date")
    for d, single_date_df in grouped:
        filename = "%s/%s_%s_%s.csv" % (output_path,
                                        (d + Config.DAY_SHIFT).strftime("%Y%m%d"),
                                        geo_id,
                                        out_name)
        single_date_df = single_date_df.drop(columns=["date"])
        single_date_df.to_csv(filename, index=False, na_rep="NA")

        logger.debug(f"wrote {len(single_date_df)} rows for {geo_id}")
80
108
def csv_to_df (filepath : str , startdate : datetime , enddate : datetime , dropdate : datetime , logger ) -> pd .DataFrame :
0 commit comments