Skip to content

Commit b52d80a

Browse files
committed
in progress cleaning up writing csv
1 parent fc2c58d commit b52d80a

File tree

1 file changed

+47
-21
lines changed

1 file changed

+47
-21
lines changed

doctor_visits/delphi_doctor_visits/process_data.py

+47-21
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,52 @@
66

77
from .config import Config
88

9+
def format_df(df: pd.DataFrame, geo_id: str, se: bool, logger):
    """Scale sensor values to percentages and filter out anomalous rows.

    Rows are dropped (and a message is logged once per failing check) when:

    * ``val`` is NaN,
    * the scaled standard error is 5 or higher,
    * the scaled sensor value is 90 or higher,
    * ``se`` is requested and either ``val`` or ``se`` is not strictly positive.

    Parameters
    ----------
    df
        Frame with at least the numeric columns ``val`` and ``se``, expressed
        as proportions. It is not modified; a copy is rescaled instead.
    geo_id
        Geographic identifier included in the log messages.
    se
        If True, keep the ``se`` column and require ``val`` and ``se`` to be
        strictly positive. If False, the ``se`` column is dropped from the
        result.
    logger
        Object exposing ``info(msg)``, used to report each failed check.

    Returns
    -------
    pd.DataFrame
        The rescaled frame restricted to the rows that passed every check.
    """
    # Work on a copy so the caller's frame is not rescaled as a side effect.
    scaled = df.copy()

    # report in percentage
    scaled["val"] = scaled["val"] * 100
    scaled["se"] = scaled["se"] * 100

    val_isnull = scaled["val"].isnull()
    if val_isnull.any():
        logger.info("sensor value is nan, check pipeline")
    filtered_df = scaled[~val_isnull]

    # A NaN standard error compares False here, so such rows are kept at this
    # stage and only rejected later if the standard error is actually needed.
    se_too_high = filtered_df["se"] >= 5
    if se_too_high.any():
        logger.info(f"standard error suspiciously high! investigate {geo_id}")
    filtered_df = filtered_df[~se_too_high]

    sensor_too_high = filtered_df["val"] >= 90
    if sensor_too_high.any():
        logger.info(f"strangely high percentage! investigate {geo_id}")
    filtered_df = filtered_df[~sensor_too_high]

    if se:
        # Parentheses are required: `&` binds tighter than `>`.
        valid_cond = (filtered_df["se"] > 0) & (filtered_df["val"] > 0)
        if not valid_cond.all():
            logger.info("p=0, std_err=0 invalid")
        filtered_df = filtered_df[valid_cond]
    else:
        # Non-inplace drop avoids mutating a slice of another frame.
        filtered_df = filtered_df.drop(columns=["se"])

    return filtered_df
53+
54+
955

1056
def write_to_csv(output_df: pd.DataFrame, geo_level: str, se:bool, out_name: str, logger, output_path="."):
1157
"""Write sensor values to csv.
@@ -27,27 +73,7 @@ def write_to_csv(output_df: pd.DataFrame, geo_level: str, se:bool, out_name: str
2773
geo_level,
2874
out_name)
2975
single_date_df = output_df[output_df["date"] == d]
30-
with open(filename, "w") as outfile:
31-
outfile.write("geo_id,val,se,direction,sample_size\n")
32-
33-
for line in single_date_df.itertuples():
34-
geo_id = line.geo_id
35-
sensor = 100 * line.val # report percentages
36-
se_val = 100 * line.se
37-
assert not np.isnan(sensor), "sensor value is nan, check pipeline"
38-
assert sensor < 90, f"strangely high percentage {geo_id, sensor}"
39-
if not np.isnan(se_val):
40-
assert se_val < 5, f"standard error suspiciously high! investigate {geo_id}"
41-
42-
if se:
43-
assert sensor > 0 and se_val > 0, "p=0, std_err=0 invalid"
44-
outfile.write(
45-
"%s,%f,%s,%s,%s\n" % (geo_id, sensor, se_val, "NA", "NA"))
46-
else:
47-
# for privacy reasons we will not report the standard error
48-
outfile.write(
49-
"%s,%f,%s,%s,%s\n" % (geo_id, sensor, "NA", "NA", "NA"))
50-
out_n += 1
76+
5177
logger.debug(f"wrote {out_n} rows for {geo_level}")
5278

5379

0 commit comments

Comments
 (0)