-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvalidation_functions.py
145 lines (108 loc) · 6.29 KB
/
validation_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import os
import pandas as pd
import numpy as np
# Column names a submission file is expected to have (used by check_header).
VALID_COLUMNS = [
    'location',
    'age_group',
    'forecast_date',
    'target_end_date',
    'target',
    'type',
    'quantile',
    'value',
    'pathogen',
]

# Accepted entries of the 'location' column.
LOCATION_CODES = [
    'DE',
    'DE-BW', 'DE-BY', 'DE-HB', 'DE-HH',
    'DE-HE', 'DE-NI', 'DE-NW', 'DE-RP',
    'DE-SL', 'DE-SH', 'DE-BB', 'DE-MV',
    'DE-SN', 'DE-ST', 'DE-TH', 'DE-BE',
]

# Accepted (non-missing) entries of the 'quantile' column.
VALID_QUANTILES = [0.025, 0.1, 0.25, 0.5, 0.75, 0.9, 0.975]

# Accepted entries of the 'type' column.
VALID_TYPES = ['mean', 'quantile']

# Accepted entries of the 'age_group' column.
VALID_AGE_GROUPS = ['00+', '00-04', '05-14', '15-34', '35-59', '60-79', '80+']

# Accepted entries of the 'target' column: one per horizon from -28 to 14 days.
VALID_TARGETS = [f'{horizon} day ahead inc hosp' for horizon in range(-28, 15)]

# Accepted entries of the 'pathogen' column.
VALID_PATHOGENS = ['COVID-19']
def check_forecast_date(filepath):
    """Validate the forecast date given by the filename and by the file content.

    The first ten characters of the file's basename are parsed as a date and
    compared with the single date in the ``forecast_date`` column.  Unless the
    path contains the substring ``'retrospective'``, that date must also be
    today's date in the Europe/Berlin time zone.

    Args:
        filepath: Path of the CSV file to check (``str`` or path-like).

    Returns:
        A message describing the first problem found, or ``None`` if every
        check passes.
    """
    date_prefix = os.path.basename(filepath)[:10]
    try:
        file_forecast_date = pd.to_datetime(date_prefix).date()
    except (ValueError, TypeError, OverflowError):
        return f"Date of filename in wrong format: {date_prefix}. Should be yyyy-mm-dd."

    df = pd.read_csv(filepath)

    # Report structural problems as messages instead of failing with an
    # AttributeError / IndexError further down.
    if 'forecast_date' not in df.columns:
        return "Column 'forecast_date' is missing."
    if df.empty:
        return "The file contains no rows, so the forecast date cannot be checked."

    if df.forecast_date.nunique() > 1:
        return f"The file contains multiple forecast dates: {df.forecast_date.unique()}. Forecast date must be unique."

    first_entry = df.forecast_date.iloc[0]
    try:
        column_forecast_date = pd.to_datetime(first_entry).date()
    except (ValueError, TypeError, OverflowError):
        return f"Date in column 'forecast_date' in wrong format: {first_entry}. Should be yyyy-mm-dd."

    if file_forecast_date != column_forecast_date:
        return f"Date of filename {os.path.basename(filepath)} does not match column 'forecast_date': {column_forecast_date}."

    today = pd.Timestamp('today', tz='Europe/Berlin').date()
    # str() keeps the substring test working for pathlib.Path arguments.
    if 'retrospective' not in str(filepath) and file_forecast_date != today:
        return f"The forecast is not made today. Date of the forecast: {file_forecast_date}, today: {today}."

    return None
def check_column_values(df):
    """Report entries that are not in the accepted list of their column.

    The columns 'location', 'quantile', 'type', 'age_group', 'target' and
    'pathogen' are compared with the corresponding module-level lists.
    Missing quantiles are ignored.

    Args:
        df: DataFrame holding the submission.

    Returns:
        A list with one message per offending column, or ``None`` if every
        entry is accepted.
    """
    # Distinct entries found in each checked column.
    observed = {
        'location': df.location.unique(),
        'quantile': df['quantile'].dropna().unique(),
        'type': df.type.unique(),
        'age_group': df.age_group.unique(),
        'target': df.target.unique(),
        'pathogen': df.pathogen.unique(),
    }
    accepted = {
        'location': LOCATION_CODES,
        'quantile': VALID_QUANTILES,
        'type': VALID_TYPES,
        'age_group': VALID_AGE_GROUPS,
        'target': VALID_TARGETS,
        'pathogen': VALID_PATHOGENS,
    }

    errors = []
    for column, entries in observed.items():
        rejected = [entry for entry in entries if entry not in accepted[column]]
        if rejected:
            errors.append(f"Invalid entries in column '{column}': {rejected}")

    return errors or None
def check_header(df):
    """Compare the DataFrame's columns with ``VALID_COLUMNS``.

    Args:
        df: DataFrame holding the submission.

    Returns:
        A list of messages naming missing and/or unaccepted columns, or
        ``None`` if the header matches.
    """
    present = df.columns
    absent = [name for name in VALID_COLUMNS if name not in present]
    unexpected = [name for name in present if name not in VALID_COLUMNS]

    messages = []
    if absent:
        messages.append(f'The following columns are missing: {absent}. Please add them.')
    if unexpected:
        messages.append(f'The following columns are not accepted: {unexpected}. Please remove them.')

    return messages if messages else None
def check_target_dates(df):
    """Check that each target_end_date equals forecast_date plus the horizon.

    The horizon (in days) is the integer before the first space of the
    'target' entry, e.g. ``'-2 day ahead inc hosp'`` means -2 days.

    The comparison is vectorised and does not modify ``df``; no helper column
    is added to the caller's DataFrame.

    Args:
        df: DataFrame with 'forecast_date' and 'target_end_date' as datetime
            columns and 'target' as a string column.

    Returns:
        A message listing the distinct offending
        (forecast_date, target_end_date, target) combinations, or ``None`` if
        all dates are consistent.
    """
    horizon_days = df['target'].str.split(' ').str[0].astype(int)
    expected_end = df['forecast_date'] + pd.to_timedelta(horizon_days, unit='D')
    is_wrong = df['target_end_date'] != expected_end

    invalid_target_dates = df.loc[
        is_wrong, ['forecast_date', 'target_end_date', 'target']
    ].drop_duplicates()

    if len(invalid_target_dates) > 0:
        return ('The following target_end_dates are wrong:\n\n'
                + invalid_target_dates.to_string(index=False))
    return None
def check_value(df):
    """Check that the 'value' column has no missing or non-numeric entries.

    Entries are parsed with ``pd.to_numeric`` rather than by inspecting their
    digits.  Floats that print in scientific notation (e.g. ``1e-05``) are
    therefore accepted, and malformed strings such as ``'1.2.3'`` are
    rejected.  Negative and infinite entries are still reported under the
    "Non-numeric" message, as the previous digit-based test did.

    Args:
        df: DataFrame holding the submission.

    Returns:
        A list of messages, or ``None`` if the column is clean.
    """
    errors = []
    values = df['value']

    is_missing = values.isnull()
    n_missing = int(is_missing.sum())
    if n_missing:
        errors.append(f"Missing values in column 'value' are not allowed. {n_missing} values are missing.")

    # Unparseable entries become NaN; missing entries are reported above and
    # must not be counted a second time here.
    parsed = pd.to_numeric(values, errors='coerce')
    is_acceptable = np.isfinite(parsed) & (parsed >= 0)
    non_numeric_values = values[~is_missing & ~is_acceptable].to_list()
    if non_numeric_values:
        errors.append(f"Non-numeric entries in column 'value' are not allowed: {non_numeric_values}.")

    return errors or None
def check_mean(df):
    """Check that rows of type 'mean' have no entry in the 'quantile' column.

    Args:
        df: DataFrame holding the submission.

    Returns:
        A message stating how often the rule was violated, or ``None`` if it
        never was.
    """
    mean_quantiles = df.loc[df['type'] == 'mean', 'quantile']
    n = mean_quantiles.notna().sum()
    if n <= 0:
        return None

    plural = 's' if n > 1 else ''
    return f"Rows with type \"mean\" should have NA in column 'quantile'. This was violated {n} time{plural}."
def check_duplicates(df):
    """Detect rows that share every identifying column (all except 'value').

    Args:
        df: DataFrame holding the submission.

    Returns:
        A message containing all rows involved in a duplication, sorted by
        the identifying columns, or ``None`` if there are none.
    """
    key_columns = ['location', 'age_group', 'forecast_date', 'target_end_date',
                   'target', 'type', 'quantile', 'pathogen']

    # keep=False marks every member of a duplicated group, not just the repeats.
    is_repeated = df.duplicated(subset=key_columns, keep=False)
    repeated_rows = df[is_repeated].sort_values(key_columns)

    n = len(repeated_rows)
    if n == 0:
        return None
    return f"Duplicated targets present. Check the following {n} rows. \n\n {repeated_rows.to_string()}"
# def check_quantiles(df):
# df.loc[df.type != 'mean', 'no_quantiles'] = df[df.type != 'mean'].groupby(['location', 'age_group', 'target',
# 'target_end_date'])['quantile'].transform('nunique')
# # note that we've already checked that no invalid quantiles are present
# incomplete_quantiles = df[(df.no_quantiles != 7) & df.no_quantiles.notnull()]
# if len(incomplete_quantiles) > 0:
# error = 'Not all quantiles were provided in the following setting(s):\n\n' + \
# incomplete_quantiles.groupby(['location', 'age_group', 'target', 'target_end_date']
# )['quantile'].unique().to_string()
# return error
def check_forecast(filepath):
    """Run all validation checks on one forecast file and collect the errors.

    The forecast-date check runs on the file path.  The file is then loaded
    with 'forecast_date' and 'target_end_date' parsed as dates, and each
    DataFrame-level check runs in turn.  A check that raises is reported as a
    "Fatal error" entry, and the remaining checks still run.

    Args:
        filepath: Path of the CSV file to validate.

    Returns:
        A list of error messages; empty if the file passed every check.
    """
    errors = []

    date_error = check_forecast_date(filepath)
    if date_error:
        errors.append(date_error)

    df = pd.read_csv(filepath, parse_dates=['forecast_date', 'target_end_date'])

    checks = (check_header, check_column_values, check_value, check_mean,
              check_duplicates, check_target_dates)
    for check in checks:
        try:
            result = check(df)
        except Exception:
            # Catch Exception only, so KeyboardInterrupt / SystemExit are not
            # swallowed as a bare ``except:`` would do.
            errors.append(f"Fatal error: {check.__name__} could not be completed.")
            continue
        if result:
            # Checks return either a single message or a list of messages.
            errors.extend(result if isinstance(result, list) else [result])

    return errors