From b446b80dd6d33c6a3fb4ddc6e1d1d44f989cfa80 Mon Sep 17 00:00:00 2001 From: Will Holmgren Date: Wed, 29 Sep 2021 10:51:49 -0700 Subject: [PATCH 01/12] add irradiance_qa.py --- irradiance_qa/irradiance_qa.py | 426 +++++++++++++++++++++++++++++++++ 1 file changed, 426 insertions(+) create mode 100644 irradiance_qa/irradiance_qa.py diff --git a/irradiance_qa/irradiance_qa.py b/irradiance_qa/irradiance_qa.py new file mode 100644 index 00000000..c979b508 --- /dev/null +++ b/irradiance_qa/irradiance_qa.py @@ -0,0 +1,426 @@ +""" +Manual QA processing for sites selected for Solar Forecasting 2 Topic +Area 2 and 3 evaluations. +""" + +import json +import logging +import os +from pathlib import Path + +import click +import pandas as pd +import numpy as np +import pvlib +import matplotlib.pyplot as plt +from solarforecastarbiter.cli import common_options, cli_access_token +from solarforecastarbiter.io.api import APISession + +CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help']) +logger = logging.getLogger(__name__) + +OUTPUT_PATH = Path('ta23_site_data') + +SITES = { + 'NOAA SURFRAD Table Mountain Boulder CO': { + 'consistency_limits': { + 'dhi_ratio': {'ratio_bounds': [0, 1.5]} + }, + 'overcast': '2018-02-03', + 'clear': '2018-04-20', + 'variable': '2018-03-03', + 'site_id': '9dfa7910-7e49-11e9-b4e8-0a580a8003e9', + 'timezone': 'Etc/GMT+7', + }, + 'DOE RTC Cocoa FL': { + 'consistency_limits': { + 'dhi_ratio': {'ratio_bounds': [0, 1.5]} + }, + 'overcast': '2018-01-01', + 'clear': '2018-03-03', + 'variable': '2018-04-22', + 'site_id': 'a9d0d140-99fc-11e9-81fa-0a580a8200c9', + 'timezone': 'Etc/GMT+5', + }, + 'NOAA SURFRAD Goodwin Creek MS': { + 'consistency_limits': { + 'dhi_ratio': {'ratio_bounds': [0, 1.5]}, + 'ghi_ratio': {'ratio_bounds': [0.84, 1.16]}, + }, + 'overcast': '2018-01-10', + 'clear': '2018-03-03', + 'variable': '2018-01-14', + 'site_id': '9e4e98ac-7e49-11e9-a7c4-0a580a8003e9', + 'timezone': 'Etc/GMT+6', + }, + 'NOAA SOLRAD Hanford California': { + 'consistency_limits': { + 'dhi_ratio': {'ratio_bounds': [0, 1.5]}, + 'ghi_ratio': {'ratio_bounds': [0.84, 1.16]}, + }, + 'overcast': '2018-01-17', + 'clear': '2018-02-08', + 'variable': '2018-03-03', + 'site_id': 'c291964c-7e49-11e9-af46-0a580a8003e9', + 'timezone': 'Etc/GMT+8', + }, + 'NREL MIDC Humboldt State University': { + 'consistency_limits': { + 'dhi_ratio': {'ratio_bounds': [0, 1.5]} + }, + 'overcast': '2018-01-14', + 'clear': '2018-07-01', + 'variable': '2018-03-03', + 'site_id': '9feac63a-7e49-11e9-9bde-0a580a8003e9', + 'timezone': 'Etc/GMT+8', + }, + 'DOE ARM Southern Great Plains SGP, Lamont, Oklahoma': { + 'overcast': '2018-02-10', + 'clear': '2018-01-23', + 'variable': '2018-05-04', + 'site_id': 'd52d47e6-88c4-11ea-b5f8-0a580a820092', + 'timezone': 'Etc/GMT+6', + }, + 'WRMC BSRN NASA Langley Research Center': { + 'consistency_limits': { + 'dhi_ratio': {'ratio_bounds': [0, 1.5]} + }, + 'overcast': '2018-02-13', + 'clear': '2018-04-20', + 'variable': '2018-03-03', + 'site_id': '371a5e3a-1888-11eb-959e-0a580a820169', + 'timezone': 'Etc/GMT+5', + }, + 'NOAA SURFRAD Penn State Univ PA': { + 'consistency_limits': { + 'dhi_ratio': {'ratio_bounds': [0, 1.5]}, + 'ghi_ratio': {'ratio_bounds': [0.85, 1.15]}, + }, + 'overcast': '2018-01-12', + 'clear': '2018-01-14', + 'variable': '2018-01-01', + 'site_id': '9e69b108-7e49-11e9-a3df-0a580a8003e9', + 'timezone': 'Etc/GMT+5', + }, + 'PNNL': { + 'consistency_limits': { + 'dhi_ratio': {'ratio_bounds': [0, 1.5]}, + 'ghi_ratio': {'ratio_bounds': [0.85, 1.15]}, + }, + 'overcast': 
'2018-01-27', + 'clear': '2018-02-11', + 'variable': '2018-02-09', + 'site_id': '4a4e1f82-a2d1-11eb-90bf-0a580a820087', + 'timezone': 'Etc/GMT+8', + }, + 'NOAA SURFRAD Sioux Falls SD': { + 'consistency_limits': { + 'dhi_ratio': {'ratio_bounds': [0, 1.5]}, + 'ghi_ratio': {'ratio_bounds': [0.85, 1.15]}, + }, + 'overcast': '2018-01-14', + 'clear': '2018-01-01', + 'variable': '2018-03-03', + 'site_id': '9e888c48-7e49-11e9-9a66-0a580a8003e9', + 'timezone': 'Etc/GMT+6', + }, +} + + +@click.group(context_settings=CONTEXT_SETTINGS) +def qa_cli(): + """CLI for Solar Forecast Arbiter irradiance QA.""" + pass + + +def set_log_level(verbose): + if verbose == 1: + loglevel = 'INFO' + elif verbose > 1: + loglevel = 'DEBUG' + else: + loglevel = 'WARNING' + logging.getLogger().setLevel(loglevel) + + +@qa_cli.command() +@common_options +def download(verbose, user, password, base_url): + """Download metadata and time series data for all TA 2/3 sites. + + Data saved in new directory named ta23_site_data.""" + set_log_level(verbose) + token = cli_access_token(user, password) + session = APISession(token, base_url=base_url) + sites = session.list_sites() + ta23_sites = tuple(filter(lambda x: x.name in SITES, sites)) + OUTPUT_PATH.mkdir(parents=True, exist_ok=True) + # observation names were not created consistently across the different networks + # so need to keep a nested directory structure to keep things clean. + # also mirrors the organization of data in the arbiter. + for site in ta23_sites: + p = OUTPUT_PATH / f'{site.name}' + p.mkdir(exist_ok=True) + p /= f'{site.name}.json' + p.write_text(json.dumps(site.to_dict(), indent=4)) + obs = session.list_observations() + ta23_obs = tuple(filter( + lambda x: x.site in ta23_sites and x.variable in ['ghi', 'dni'], + obs + )) + for o in ta23_obs: + # o.site.name will match paths created above + p = OUTPUT_PATH / f'{o.site.name}' / f'{o.name}.json' + p.write_text(json.dumps(o.to_dict(), indent=4)) + # pull data by quarters to work around API query length limitation + # 2018 for TA2 analysis. eventually extend to 2021 for TA3. TA2/3 + # reports use Etc/GMT timezone, while some SFA Site timezones are + # DST aware. + tz = SITES[o.site.name]['timezone'] + quarters = pd.date_range('2018-01-01', freq='QS', periods=5, tz=tz) + start_ends = pd.DataFrame( + {'start': quarters[:4], 'end': quarters[1:]-pd.Timedelta('1s')} + ) + values_segments = [] + for _, start_end in start_ends.iterrows(): + start = start_end['start'], + end = start_end['end'] + values_segment = session.get_observation_values( + o.observation_id, + start=start, + end=end, + ) + values_segments.append(values_segment) + values = pd.concat(values_segments) + # construct filename that follows same pattern as API but use our + # requested times so we know for certain what the file will be named + first_start = start_ends.iloc[0]['start'] + last_end = start_ends.iloc[-1]['end'] + filename = f'{o.name.replace(' ', '_')}_{first_start}-{last_end}.csv' + p_data = OUTPUT_PATH / f'{o.site.name}' / filename + values.to_csv(p_data) + + +@qa_cli.command() +@click.option('--site', type=str, help='Site to process', default='all') +def process(verbose, site): + """Process time series data for all TA 2/3 sites. 
+ + Reads data from directory ta23_site_data and writes QA results to + this directory.""" + set_log_level(verbose) + if site == 'all': + sites_to_process = SITES + else: + sites_to_process = {site: SITES[site]} + for name, parameters in sites_to_process.items(): + process_single_site(name, parameters) + + +def process_single_site(name, parameters): + loc = read_metadata() + data = read_irradiance() + # TA2/3 analysis uses fixed offsets, but reference site metadata + # typically uses DST aware timezones + data = data.tz_localize(parameters['timezone']) + +@qa_cli.command() +@common_options +def post(verbose, user, password, base_url): + """Post QA results for all TA 2/3 sites. + + Reads data from directory ta23_site_data. + Posting requires access to reference data account.""" + set_log_level(verbose) + token = cli_access_token(user, password) + session = APISession(token, base_url=base_url) + + +def read_metadata(dirn, filen): + with open (os.path.join(dirn, filen), 'r') as infile: + meta = json.load(infile) + loc = pvlib.location.Location(meta['latitude'], meta['longitude'], + meta['timezone'], meta['elevation']) + return loc + + +def read_irradiance(dirn, data_files): + all_data = pd.DataFrame(columns=data_files.keys()) + for k in data_files.keys(): + with open(os.path.join(dirn, data_files[k]), 'r') as infile: + data = pd.read_csv(infile, skiprows=2, index_col=0, + parse_dates=True) + all_data[k] = data['value'] + return all_data + + +def go(): + dirn = 'D:\\SFA\\BoulderCO' + meta_filen = 'NOAA_SURFRAD_Table_Mountain_Boulder_CO.json' + + data_files = {'ghi': 'Table_Mountain_Boulder_CO_ghi_2018-01-07T00_00_00+00_00-2019-01-01T06_59_00+00_00.csv', + 'dni': 'Table_Mountain_Boulder_CO_dni_2018-01-01T07_00_00+00_00-2019-01-01T06_59_00+00_00.csv', + 'dhi': 'Table_Mountain_Boulder_CO_dhi_2018-01-01T07_00_00+00_00-2019-01-01T06_59_00+00_00.csv'} + + loc = read_metadata(dirn, meta_filen) + data = read_irradiance(dirn, data_files) + data = data.tz_convert(loc.tz) + + # replace negative DHI with 0, so that negative DNI doesn't amplify the ratio + # of measured GHI to component sum GHI + data['dni'] = np.maximum(data['dni'], 0.) 
+ + sp = loc.get_solarposition(data.index) + cs = loc.get_clearsky(data.index, solar_position=sp) + daytime = sp['zenith'] < 87 + + # check for component consistency + + limits = pva.quality.irradiance.QCRAD_CONSISTENCY.copy() + # reset lower bound on GHI + for k in limits: + for m in ['low_zenith', 'high_zenith']: + limits[k][m]['ghi_bounds'] = [0, np.Inf] + # raise limit on diffuse ratio + for m in limits['dhi_ratio']: + limits['dhi_ratio'][m]['ratio_bounds'] = [0, 1.5] + + consistent_comp, diffuse_ratio_limit = pva.quality.irradiance.check_irradiance_consistency_qcrad( + data['ghi'], sp['zenith'], data['dhi'], data['dni'], param=limits) + + # accept GHI and DHI when nearly equal, but not at very high zenith so that + # we don't accept horizon shading + overcast_ok = (sp['zenith'] < 75) & (np.abs(data['ghi'] - data['dhi']) < 50) + + good_overall = (consistent_comp | overcast_ok) & diffuse_ratio_limit + + component_sum = data['dni'] * pvlib.tools.cosd(sp['zenith']) + data['dhi'] + ghi_ratio = data['ghi'] / component_sum + + bad_comp = ~consistent_comp & daytime + bad_comp = data['ghi'] * bad_comp + bad_comp[bad_comp == 0] = np.nan + fig_cons = plt.figure() + plt.plot(data['ghi']) + plt.plot(data['dni']) + plt.plot(data['dhi']) + plt.plot(bad_comp, 'r.') + plt.legend(['GHI', 'DNI', 'DHI', "Bad"]) + plt.title('Consistent components test') + + bad_diff = ~diffuse_ratio_limit & daytime + bad_diff = data['ghi'] * bad_diff + bad_diff[bad_diff == 0] = np.nan + fig_cons = plt.figure() + plt.plot(data['ghi']) + plt.plot(data['dni']) + plt.plot(data['dhi']) + plt.plot(bad_diff, 'r.') + plt.legend(['GHI', 'DNI', 'DHI', "Bad"]) + plt.title('Diffuse fraction test') + + # overall accept/reject plot + fig_summary = plt.figure() + plt.plot(data['ghi']) + plt.plot(data['dni']) + plt.plot(data['dhi']) + good_mask = good_overall.copy() + bad_mask = ~good_mask + good_mask[good_mask == False] = np.nan + bad_mask[bad_mask == False] = np.nan + plt.plot(good_mask * data['ghi'], 'g.') + plt.plot(bad_mask * data['ghi'], 'r.') + plt.legend(['GHI', 'DNI', 'DHI', 'Good', "Bad"]) + plt.title('Overall') + + + # report on count of data dropped by zenith bin + bins = np.arange(np.min(sp['zenith']), np.max(sp['zenith'][daytime]), 1) + count_tot = np.zeros(len(bins) - 1) + count_good = count_tot.copy() + count_cc = count_tot.copy() + count_diff = count_tot.copy() + for i in range(len(bins)-1): + u = (sp['zenith'] >= bins[i]) & (sp['zenith'] < bins[i + 1]) + count_tot[i] = len(sp.loc[u, 'zenith']) + count_cc[i] = (consistent_comp[u] | overcast_ok[u]).sum() + count_diff[i] = diffuse_ratio_limit[u].sum() + count_good[i] = good_overall[u].sum() + fig_accept = plt.figure() + plt.plot(bins[:-1], count_tot) + plt.plot(bins[:-1], count_good) + plt.plot(bins[:-1], count_cc) + plt.plot(bins[:-1], count_diff) + plt.xlabel('Zenith') + plt.ylabel('Count') + plt.legend(['Total', 'Consistent OR Overcast', 'Diffuse', 'Passed all tests']) + plt.title('Boulder, CO') + + + # bar chart of data count within each hour + hrs = range(4, 21) + boxplot_data = [] + hr_count = good_overall.resample('H').sum() + for idx, h in enumerate(hrs): + boxplot_data.append(hr_count[hr_count.index.hour == h].values) + fig_boxplot, ax_boxplot = plt.subplots() + plt.boxplot(boxplot_data) + ax_boxplot.set_xticklabels([str(h) for h in hrs]) + plt.xlabel('Hour of day') + plt.ylabel('Count of data') + + + # plot one overcast day for illustration + dr = pd.date_range(start='2018-02-04 06:00:00', end='2018-02-04 18:00:00', + freq='1T', tz=data.index.tz) + 
fig_overcast_day = plt.figure() + plt.plot(data.loc[dr, 'ghi']) + plt.plot(data.loc[dr, 'dni']) + plt.plot(data.loc[dr, 'dhi']) + good_mask = good_overall.copy() + bad_mask = ~good_mask + good_mask[good_mask == False] = np.nan + bad_mask[bad_mask == False] = np.nan + plt.plot(good_mask * data.loc[dr, 'ghi'], 'g.') + plt.plot(bad_mask * data.loc[dr, 'ghi'], 'r.') + #plt.plot(sp.loc[dr, 'zenith']) + plt.legend(['GHI', 'DNI', 'DHI', 'Good', 'Bad']) + plt.title('Representative overcast day at Boulder, CO') + + # plot one clear day day for illustration + dr = pd.date_range(start='2018-03-06 06:00:00', end='2018-03-06 18:00:00', + freq='1T', tz=data.index.tz) + fig_clear_day = plt.figure() + plt.plot(data.loc[dr, 'ghi']) + plt.plot(data.loc[dr, 'dni']) + plt.plot(data.loc[dr, 'dhi']) + good_mask = good_overall.copy() + bad_mask = ~good_mask + good_mask[good_mask == False] = np.nan + bad_mask[bad_mask == False] = np.nan + plt.plot(good_mask * data.loc[dr, 'ghi'], 'g.') + plt.plot(bad_mask * data.loc[dr, 'ghi'], 'r.') + #plt.plot(sp.loc[dr, 'zenith']) + plt.legend(['GHI', 'DNI', 'DHI', 'Good', 'Bad']) + plt.title('Representative clear day at Boulder, CO') + + # plot one clear day day for illustration + dr = pd.date_range(start='2018-03-22 06:00:00', end='2018-03-22 18:00:00', + freq='1T', tz=data.index.tz) + fig_clear_day = plt.figure() + plt.plot(data.loc[dr, 'ghi']) + plt.plot(data.loc[dr, 'dni']) + plt.plot(data.loc[dr, 'dhi']) + good_mask = good_overall.copy() + bad_mask = ~good_mask + good_mask[good_mask == False] = np.nan + bad_mask[bad_mask == False] = np.nan + plt.plot(good_mask * data.loc[dr, 'ghi'], 'g.') + plt.plot(bad_mask * data.loc[dr, 'ghi'], 'r.') + #plt.plot(sp.loc[dr, 'zenith']) + plt.legend(['GHI', 'DNI', 'DHI', 'Good', 'Bad']) + plt.title('Representative day with variable conditions at Boulder, CO') + + +if __name__ == "__main__": # pragma: no cover + qa_cli() From b772da9a7fe6e525eacc3c284b4aba82af32f2a8 Mon Sep 17 00:00:00 2001 From: Will Holmgren Date: Wed, 29 Sep 2021 11:09:58 -0700 Subject: [PATCH 02/12] some cleanup. fix download names --- irradiance_qa/irradiance_qa.py | 38 ++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/irradiance_qa/irradiance_qa.py b/irradiance_qa/irradiance_qa.py index c979b508..50134ed1 100644 --- a/irradiance_qa/irradiance_qa.py +++ b/irradiance_qa/irradiance_qa.py @@ -16,11 +16,6 @@ from solarforecastarbiter.cli import common_options, cli_access_token from solarforecastarbiter.io.api import APISession -CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help']) -logger = logging.getLogger(__name__) - -OUTPUT_PATH = Path('ta23_site_data') - SITES = { 'NOAA SURFRAD Table Mountain Boulder CO': { 'consistency_limits': { @@ -126,6 +121,13 @@ }, } +# ideally would be set through an argument, but this is faster +OUTPUT_PATH = Path('ta23_site_data') + +# config for command line interface +CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help']) +logger = logging.getLogger(__name__) + @click.group(context_settings=CONTEXT_SETTINGS) def qa_cli(): @@ -155,9 +157,9 @@ def download(verbose, user, password, base_url): sites = session.list_sites() ta23_sites = tuple(filter(lambda x: x.name in SITES, sites)) OUTPUT_PATH.mkdir(parents=True, exist_ok=True) - # observation names were not created consistently across the different networks - # so need to keep a nested directory structure to keep things clean. - # also mirrors the organization of data in the arbiter. 
+ # observation names were not created consistently across the different + # networks so need to keep a nested directory structure to keep things + # clean. also mirrors the organization of data in the arbiter. for site in ta23_sites: p = OUTPUT_PATH / f'{site.name}' p.mkdir(exist_ok=True) @@ -169,13 +171,15 @@ def download(verbose, user, password, base_url): obs )) for o in ta23_obs: + # WH: I'm terribly embarassed by the length of this for loop. + logger.info('Fetching data for %s', o.name) # o.site.name will match paths created above p = OUTPUT_PATH / f'{o.site.name}' / f'{o.name}.json' p.write_text(json.dumps(o.to_dict(), indent=4)) - # pull data by quarters to work around API query length limitation - # 2018 for TA2 analysis. eventually extend to 2021 for TA3. TA2/3 - # reports use Etc/GMT timezone, while some SFA Site timezones are - # DST aware. + # pull data by quarters to work around API query length + # limitation 2018 for TA2 analysis. eventually extend to 2021 + # for TA3. TA2/3 reports use Etc/GMT timezone, while some SFA + # Site timezones are DST aware. tz = SITES[o.site.name]['timezone'] quarters = pd.date_range('2018-01-01', freq='QS', periods=5, tz=tz) start_ends = pd.DataFrame( @@ -183,7 +187,7 @@ def download(verbose, user, password, base_url): ) values_segments = [] for _, start_end in start_ends.iterrows(): - start = start_end['start'], + start = start_end['start'] end = start_end['end'] values_segment = session.get_observation_values( o.observation_id, @@ -194,9 +198,11 @@ def download(verbose, user, password, base_url): values = pd.concat(values_segments) # construct filename that follows same pattern as API but use our # requested times so we know for certain what the file will be named - first_start = start_ends.iloc[0]['start'] - last_end = start_ends.iloc[-1]['end'] - filename = f'{o.name.replace(' ', '_')}_{first_start}-{last_end}.csv' + name = o.name.replace(' ', '_') + first_start = start_ends.iloc[0]['start'].isoformat() + last_end = start_ends.iloc[-1]['end'].isoformat() + filename = f'{name}_{first_start}-{last_end}.csv' + filename = filename.replace(':', '_') p_data = OUTPUT_PATH / f'{o.site.name}' / filename values.to_csv(p_data) From c523c200cb4c907ea2ec8ebb7fea121a511ca001 Mon Sep 17 00:00:00 2001 From: Will Holmgren Date: Thu, 7 Oct 2021 08:52:43 -0700 Subject: [PATCH 03/12] process sites into graphics --- irradiance_qa/irradiance_qa.py | 271 +++++++++++++++++++-------------- 1 file changed, 156 insertions(+), 115 deletions(-) diff --git a/irradiance_qa/irradiance_qa.py b/irradiance_qa/irradiance_qa.py index 50134ed1..a3530795 100644 --- a/irradiance_qa/irradiance_qa.py +++ b/irradiance_qa/irradiance_qa.py @@ -5,7 +5,6 @@ import json import logging -import os from pathlib import Path import click @@ -15,6 +14,10 @@ import matplotlib.pyplot as plt from solarforecastarbiter.cli import common_options, cli_access_token from solarforecastarbiter.io.api import APISession +from solarforecastarbiter.validation.validator import ( + QCRAD_CONSISTENCY, + check_irradiance_consistency_QCRad +) SITES = { 'NOAA SURFRAD Table Mountain Boulder CO': { @@ -70,6 +73,7 @@ 'timezone': 'Etc/GMT+8', }, 'DOE ARM Southern Great Plains SGP, Lamont, Oklahoma': { + 'consistency_limits': {}, 'overcast': '2018-02-10', 'clear': '2018-01-23', 'variable': '2018-05-04', @@ -167,7 +171,7 @@ def download(verbose, user, password, base_url): p.write_text(json.dumps(site.to_dict(), indent=4)) obs = session.list_observations() ta23_obs = tuple(filter( - lambda x: x.site in ta23_sites and 
x.variable in ['ghi', 'dni'], + lambda x: x.site in ta23_sites and x.variable in ['ghi', 'dni', 'dhi'], obs )) for o in ta23_obs: @@ -208,6 +212,8 @@ def download(verbose, user, password, base_url): @qa_cli.command() +@click.option('-v', '--verbose', count=True, + help='Increase logging verbosity') @click.option('--site', type=str, help='Site to process', default='all') def process(verbose, site): """Process time series data for all TA 2/3 sites. @@ -224,75 +230,43 @@ def process(verbose, site): def process_single_site(name, parameters): - loc = read_metadata() - data = read_irradiance() + logger.info('Processing %s', name) + loc = read_metadata(name) + data = read_irradiance(name) # TA2/3 analysis uses fixed offsets, but reference site metadata # typically uses DST aware timezones - data = data.tz_localize(parameters['timezone']) + data = data.tz_convert(parameters['timezone']) -@qa_cli.command() -@common_options -def post(verbose, user, password, base_url): - """Post QA results for all TA 2/3 sites. - - Reads data from directory ta23_site_data. - Posting requires access to reference data account.""" - set_log_level(verbose) - token = cli_access_token(user, password) - session = APISession(token, base_url=base_url) - - -def read_metadata(dirn, filen): - with open (os.path.join(dirn, filen), 'r') as infile: - meta = json.load(infile) - loc = pvlib.location.Location(meta['latitude'], meta['longitude'], - meta['timezone'], meta['elevation']) - return loc - - -def read_irradiance(dirn, data_files): - all_data = pd.DataFrame(columns=data_files.keys()) - for k in data_files.keys(): - with open(os.path.join(dirn, data_files[k]), 'r') as infile: - data = pd.read_csv(infile, skiprows=2, index_col=0, - parse_dates=True) - all_data[k] = data['value'] - return all_data - - -def go(): - dirn = 'D:\\SFA\\BoulderCO' - meta_filen = 'NOAA_SURFRAD_Table_Mountain_Boulder_CO.json' - - data_files = {'ghi': 'Table_Mountain_Boulder_CO_ghi_2018-01-07T00_00_00+00_00-2019-01-01T06_59_00+00_00.csv', - 'dni': 'Table_Mountain_Boulder_CO_dni_2018-01-01T07_00_00+00_00-2019-01-01T06_59_00+00_00.csv', - 'dhi': 'Table_Mountain_Boulder_CO_dhi_2018-01-01T07_00_00+00_00-2019-01-01T06_59_00+00_00.csv'} - - loc = read_metadata(dirn, meta_filen) - data = read_irradiance(dirn, data_files) - data = data.tz_convert(loc.tz) - - # replace negative DHI with 0, so that negative DNI doesn't amplify the ratio - # of measured GHI to component sum GHI + # replace negative DHI with 0, so that negative DNI doesn't amplify the + # ratio of measured GHI to component sum GHI data['dni'] = np.maximum(data['dni'], 0.) 
sp = loc.get_solarposition(data.index) cs = loc.get_clearsky(data.index, solar_position=sp) + # same as solarforecastarbiter.validation.validator.check_day_night daytime = sp['zenith'] < 87 # check for component consistency - limits = pva.quality.irradiance.QCRAD_CONSISTENCY.copy() - # reset lower bound on GHI - for k in limits: + limits = QCRAD_CONSISTENCY.copy() + # reset lower bound on GHI throughout nested dictionary + for irrad_ratio in limits: for m in ['low_zenith', 'high_zenith']: - limits[k][m]['ghi_bounds'] = [0, np.Inf] - # raise limit on diffuse ratio - for m in limits['dhi_ratio']: - limits['dhi_ratio'][m]['ratio_bounds'] = [0, 1.5] + limits[irrad_ratio][m]['ghi_bounds'] = [0, np.Inf] + # site-specific adjustments to ratio bounds + new_limits = SITES[name]['consistency_limits'] + for irrad_ratio, new_bounds in new_limits.items(): + for m in ['low_zenith', 'high_zenith']: + limits[irrad_ratio][m].update(new_bounds) - consistent_comp, diffuse_ratio_limit = pva.quality.irradiance.check_irradiance_consistency_qcrad( - data['ghi'], sp['zenith'], data['dhi'], data['dni'], param=limits) + consistent_comp, diffuse_ratio_limit = check_irradiance_consistency_QCRad( + data['ghi'], + sp['zenith'], + None, # solarforecastarbiter-core/issues/733 + data['dhi'], + data['dni'], + param=limits + ) # accept GHI and DHI when nearly equal, but not at very high zenith so that # we don't accept horizon shading @@ -303,6 +277,9 @@ def go(): component_sum = data['dni'] * pvlib.tools.cosd(sp['zenith']) + data['dhi'] ghi_ratio = data['ghi'] / component_sum + savefig_path = OUTPUT_PATH / name + savefig_kwargs = dict(dpi=300) + bad_comp = ~consistent_comp & daytime bad_comp = data['ghi'] * bad_comp bad_comp[bad_comp == 0] = np.nan @@ -312,7 +289,11 @@ def go(): plt.plot(data['dhi']) plt.plot(bad_comp, 'r.') plt.legend(['GHI', 'DNI', 'DHI', "Bad"]) - plt.title('Consistent components test') + plt.title(f'{name}\nConsistent components test') + plt.savefig( + savefig_path / f'{name} consistent components test.png', + **savefig_kwargs, + ) bad_diff = ~diffuse_ratio_limit & daytime bad_diff = data['ghi'] * bad_diff @@ -323,7 +304,11 @@ def go(): plt.plot(data['dhi']) plt.plot(bad_diff, 'r.') plt.legend(['GHI', 'DNI', 'DHI', "Bad"]) - plt.title('Diffuse fraction test') + plt.title(f'{name}\nDiffuse fraction test') + plt.savefig( + savefig_path / f'{name} diffuse fraction test.png', + **savefig_kwargs, + ) # overall accept/reject plot fig_summary = plt.figure() @@ -337,8 +322,8 @@ def go(): plt.plot(good_mask * data['ghi'], 'g.') plt.plot(bad_mask * data['ghi'], 'r.') plt.legend(['GHI', 'DNI', 'DHI', 'Good', "Bad"]) - plt.title('Overall') - + plt.title(f'{name}\nOverall') + plt.savefig(savefig_path / f'{name} overall.png', **savefig_kwargs) # report on count of data dropped by zenith bin bins = np.arange(np.min(sp['zenith']), np.max(sp['zenith'][daytime]), 1) @@ -359,9 +344,13 @@ def go(): plt.plot(bins[:-1], count_diff) plt.xlabel('Zenith') plt.ylabel('Count') - plt.legend(['Total', 'Consistent OR Overcast', 'Diffuse', 'Passed all tests']) - plt.title('Boulder, CO') - + plt.legend( + ['Total', 'Consistent OR Overcast', 'Diffuse', 'Passed all tests']) + plt.title(f'{name}\nData dropped by zenith bin') + plt.savefig( + savefig_path / f'{name} data dropped by zenith bin.png', + **savefig_kwargs, + ) # bar chart of data count within each hour hrs = range(4, 21) @@ -374,58 +363,110 @@ def go(): ax_boxplot.set_xticklabels([str(h) for h in hrs]) plt.xlabel('Hour of day') plt.ylabel('Count of data') + 
plt.title(f'{name}\nData count within each hour') + plt.savefig( + savefig_path / f'{name} data count within each hour.png', + **savefig_kwargs, + ) + + # plot one overcast, clear, and variable day for illustration + for kind in ('overcast', 'clear', 'variable'): + date = parameters[kind] + dr = pd.date_range( + start=f'{date} 06:00:00', + end=f'{date} 18:00:00', + freq='1T', + tz=data.index.tz + ) + fig_day = plt.figure() + plt.plot(data.loc[dr, 'ghi']) + plt.plot(data.loc[dr, 'dni']) + plt.plot(data.loc[dr, 'dhi']) + good_mask = good_overall.copy() + bad_mask = ~good_mask + good_mask[good_mask == False] = np.nan + bad_mask[bad_mask == False] = np.nan + plt.plot(good_mask * data.loc[dr, 'ghi'], 'g.') + plt.plot(bad_mask * data.loc[dr, 'ghi'], 'r.') + #plt.plot(sp.loc[dr, 'zenith']) + plt.legend(['GHI', 'DNI', 'DHI', 'Good', 'Bad']) + plt.title(f'{name}\nRepresentative {kind} day. {date}') + plt.savefig( + savefig_path / f'{name} representative {kind} day.png', + **savefig_kwargs, + ) + # determine deadband + clear_times = pvlib.clearsky.detect_clearsky( + data['ghi'], cs['ghi'], data.index, 10 + ) + ghi_rel_diff = (component_sum - data['ghi']) / data['ghi'] + u = daytime & clear_times & (ghi_ratio > 0) & (ghi_ratio < 2) & (data['ghi'] > 50) + + fig_deadband = plt.figure() + plt.plot(ghi_rel_diff[u], 'r') + plt.text( + ghi_rel_diff.index[50000], + -0.1, + 'Mean: ' + str(ghi_rel_diff[u].mean()) + ) + plt.text( + ghi_rel_diff.index[50000], + -0.15, + '85%: ' + str(ghi_rel_diff[u].quantile(q=0.85)) + ) + plt.text( + ghi_rel_diff.index[50000], + -0.2, + 'Median: ' + str(ghi_rel_diff[u].quantile(q=0.5)) + ) + plt.text( + ghi_rel_diff.index[50000], + -0.25, + '15%: ' + str(ghi_rel_diff[u].quantile(q=0.15)) + ) + plt.ylabel('(Comp. sum - GHI) / GHI') + plt.savefig( + savefig_path / f'{name} ghi ratio.png', + **savefig_kwargs, + ) - # plot one overcast day for illustration - dr = pd.date_range(start='2018-02-04 06:00:00', end='2018-02-04 18:00:00', - freq='1T', tz=data.index.tz) - fig_overcast_day = plt.figure() - plt.plot(data.loc[dr, 'ghi']) - plt.plot(data.loc[dr, 'dni']) - plt.plot(data.loc[dr, 'dhi']) - good_mask = good_overall.copy() - bad_mask = ~good_mask - good_mask[good_mask == False] = np.nan - bad_mask[bad_mask == False] = np.nan - plt.plot(good_mask * data.loc[dr, 'ghi'], 'g.') - plt.plot(bad_mask * data.loc[dr, 'ghi'], 'r.') - #plt.plot(sp.loc[dr, 'zenith']) - plt.legend(['GHI', 'DNI', 'DHI', 'Good', 'Bad']) - plt.title('Representative overcast day at Boulder, CO') - - # plot one clear day day for illustration - dr = pd.date_range(start='2018-03-06 06:00:00', end='2018-03-06 18:00:00', - freq='1T', tz=data.index.tz) - fig_clear_day = plt.figure() - plt.plot(data.loc[dr, 'ghi']) - plt.plot(data.loc[dr, 'dni']) - plt.plot(data.loc[dr, 'dhi']) - good_mask = good_overall.copy() - bad_mask = ~good_mask - good_mask[good_mask == False] = np.nan - bad_mask[bad_mask == False] = np.nan - plt.plot(good_mask * data.loc[dr, 'ghi'], 'g.') - plt.plot(bad_mask * data.loc[dr, 'ghi'], 'r.') - #plt.plot(sp.loc[dr, 'zenith']) - plt.legend(['GHI', 'DNI', 'DHI', 'Good', 'Bad']) - plt.title('Representative clear day at Boulder, CO') - - # plot one clear day day for illustration - dr = pd.date_range(start='2018-03-22 06:00:00', end='2018-03-22 18:00:00', - freq='1T', tz=data.index.tz) - fig_clear_day = plt.figure() - plt.plot(data.loc[dr, 'ghi']) - plt.plot(data.loc[dr, 'dni']) - plt.plot(data.loc[dr, 'dhi']) - good_mask = good_overall.copy() - bad_mask = ~good_mask - good_mask[good_mask == False] = np.nan 
- bad_mask[bad_mask == False] = np.nan - plt.plot(good_mask * data.loc[dr, 'ghi'], 'g.') - plt.plot(bad_mask * data.loc[dr, 'ghi'], 'r.') - #plt.plot(sp.loc[dr, 'zenith']) - plt.legend(['GHI', 'DNI', 'DHI', 'Good', 'Bad']) - plt.title('Representative day with variable conditions at Boulder, CO') + +@qa_cli.command() +@common_options +def post(verbose, user, password, base_url): + """Post QA results for all TA 2/3 sites. + + Reads data from directory ta23_site_data. + Posting requires access to reference data account.""" + set_log_level(verbose) + token = cli_access_token(user, password) + session = APISession(token, base_url=base_url) + + +def read_metadata(name): + metadata_file = OUTPUT_PATH / name / f'{name}.json' + with open(metadata_file, 'r') as infile: + meta = json.load(infile) + loc = pvlib.location.Location(meta['latitude'], meta['longitude'], + meta['timezone'], meta['elevation']) + return loc + + +def read_irradiance(name, column='value'): + directory = OUTPUT_PATH / name + variables = ['ghi', 'dni', 'dhi'] + data_all = {} + for v in variables: + # read in all csv files with e.g. ghi in the name + data_variable = [] + for f in directory.glob(f'*{v}*.csv'): + data_section = pd.read_csv(f, index_col=0, parse_dates=True) + data_variable.append(data_section) + data_variable = pd.concat(data_variable) + data_all[v] = data_variable[column] + data_all = pd.DataFrame(data_all) + return data_all if __name__ == "__main__": # pragma: no cover From 1486f66350142bb7c7931dcc38e2360359bacebe Mon Sep 17 00:00:00 2001 From: Will Holmgren Date: Mon, 11 Oct 2021 14:53:33 -0700 Subject: [PATCH 04/12] remove unused dni_extra arg in 1.0.6 --- irradiance_qa/irradiance_qa.py | 1 - 1 file changed, 1 deletion(-) diff --git a/irradiance_qa/irradiance_qa.py b/irradiance_qa/irradiance_qa.py index a3530795..2531dbe9 100644 --- a/irradiance_qa/irradiance_qa.py +++ b/irradiance_qa/irradiance_qa.py @@ -262,7 +262,6 @@ def process_single_site(name, parameters): consistent_comp, diffuse_ratio_limit = check_irradiance_consistency_QCRad( data['ghi'], sp['zenith'], - None, # solarforecastarbiter-core/issues/733 data['dhi'], data['dni'], param=limits From 4bdbe50a570c5b9cac8c1e9f440616ac498d51b6 Mon Sep 17 00:00:00 2001 From: Will Holmgren Date: Wed, 24 Nov 2021 14:50:25 -0700 Subject: [PATCH 05/12] write out data. post data --- irradiance_qa/irradiance_qa.py | 168 +++++++++++++++++++++++++++++---- 1 file changed, 149 insertions(+), 19 deletions(-) diff --git a/irradiance_qa/irradiance_qa.py b/irradiance_qa/irradiance_qa.py index 2531dbe9..7783f8ed 100644 --- a/irradiance_qa/irradiance_qa.py +++ b/irradiance_qa/irradiance_qa.py @@ -3,9 +3,11 @@ Area 2 and 3 evaluations. 
""" +from collections import defaultdict import json import logging from pathlib import Path +import warnings import click import pandas as pd @@ -231,17 +233,25 @@ def process(verbose, site): def process_single_site(name, parameters): logger.info('Processing %s', name) + save_path = OUTPUT_PATH / name + + save_path_validated = save_path / 'validated_data' + save_path_validated.mkdir(exist_ok=True) + loc = read_metadata(name) - data = read_irradiance(name) + data_original = read_irradiance(name) # TA2/3 analysis uses fixed offsets, but reference site metadata # typically uses DST aware timezones - data = data.tz_convert(parameters['timezone']) + data_original = data_original.tz_convert(parameters['timezone']) # replace negative DHI with 0, so that negative DNI doesn't amplify the # ratio of measured GHI to component sum GHI + data = data_original.copy(deep=True) data['dni'] = np.maximum(data['dni'], 0.) + logger.debug('Getting solar position') sp = loc.get_solarposition(data.index) + logger.debug('Getting clearksy') cs = loc.get_clearsky(data.index, solar_position=sp) # same as solarforecastarbiter.validation.validator.check_day_night daytime = sp['zenith'] < 87 @@ -269,14 +279,54 @@ def process_single_site(name, parameters): # accept GHI and DHI when nearly equal, but not at very high zenith so that # we don't accept horizon shading - overcast_ok = (sp['zenith'] < 75) & (np.abs(data['ghi'] - data['dhi']) < 50) + overcast_ok = ( + (sp['zenith'] < 75) & (np.abs(data['ghi'] - data['dhi']) < 50) + ) good_overall = (consistent_comp | overcast_ok) & diffuse_ratio_limit + # Some SFA reference data feeds already contains USER FLAGGED. We want to + # combine our own user flag with the existing. First extract the + # USER FLAGGED field (bit 0) from the quality_flag bitmask (see + # solarforecastarbiter.validation.quality_mapping). + sfa_user_flagged = data_original.filter(like='quality_flag') & (1 << 0) + # But we only consider our flag for daytime + bad_overall_daytime = (~good_overall) & daytime + # Combine operation happens in loop below to avoid issue with | operation + # between DataFrame and Series + + for component in ('ghi', 'dni', 'dhi'): + # Write out just the results of our filter. + # Use data_original so that we do not overwrite values in the Arbiter + # with the DNI filtered for negative values + validated_component = pd.DataFrame({ + 'value': data_original[component], + 'quality_flag': bad_overall_daytime.astype(int), + }) + validated_component.to_csv( + save_path_validated / f'{name}_validated_{component}.csv', + index=True, + index_label='timestamp', + ) + # Create combined user flag. This is ready to be uploaded into SFA. 
+ sfa_uf_or_bad_overall_daytime = \ + sfa_user_flagged[f'{component}_quality_flag'] | bad_overall_daytime + validated_component_sfa = pd.DataFrame({ + 'value': data_original[component], + 'quality_flag': sfa_uf_or_bad_overall_daytime.astype(int), + }) + fname = f'{name}_validated_or_sfa_user_flagged_{component}.csv' + validated_component_sfa.to_csv( + save_path_validated / fname, + index=True, + index_label='timestamp', + ) + + # plot results + component_sum = data['dni'] * pvlib.tools.cosd(sp['zenith']) + data['dhi'] ghi_ratio = data['ghi'] / component_sum - savefig_path = OUTPUT_PATH / name savefig_kwargs = dict(dpi=300) bad_comp = ~consistent_comp & daytime @@ -290,9 +340,10 @@ def process_single_site(name, parameters): plt.legend(['GHI', 'DNI', 'DHI', "Bad"]) plt.title(f'{name}\nConsistent components test') plt.savefig( - savefig_path / f'{name} consistent components test.png', + save_path / f'{name} consistent components test.png', **savefig_kwargs, ) + plt.close() bad_diff = ~diffuse_ratio_limit & daytime bad_diff = data['ghi'] * bad_diff @@ -305,9 +356,10 @@ def process_single_site(name, parameters): plt.legend(['GHI', 'DNI', 'DHI', "Bad"]) plt.title(f'{name}\nDiffuse fraction test') plt.savefig( - savefig_path / f'{name} diffuse fraction test.png', + save_path / f'{name} diffuse fraction test.png', **savefig_kwargs, ) + plt.close() # overall accept/reject plot fig_summary = plt.figure() @@ -322,7 +374,8 @@ def process_single_site(name, parameters): plt.plot(bad_mask * data['ghi'], 'r.') plt.legend(['GHI', 'DNI', 'DHI', 'Good', "Bad"]) plt.title(f'{name}\nOverall') - plt.savefig(savefig_path / f'{name} overall.png', **savefig_kwargs) + plt.savefig(save_path / f'{name} overall.png', **savefig_kwargs) + plt.close() # report on count of data dropped by zenith bin bins = np.arange(np.min(sp['zenith']), np.max(sp['zenith'][daytime]), 1) @@ -347,9 +400,10 @@ def process_single_site(name, parameters): ['Total', 'Consistent OR Overcast', 'Diffuse', 'Passed all tests']) plt.title(f'{name}\nData dropped by zenith bin') plt.savefig( - savefig_path / f'{name} data dropped by zenith bin.png', + save_path / f'{name} data dropped by zenith bin.png', **savefig_kwargs, ) + plt.close() # bar chart of data count within each hour hrs = range(4, 21) @@ -364,9 +418,10 @@ def process_single_site(name, parameters): plt.ylabel('Count of data') plt.title(f'{name}\nData count within each hour') plt.savefig( - savefig_path / f'{name} data count within each hour.png', + save_path / f'{name} data count within each hour.png', **savefig_kwargs, ) + plt.close() # plot one overcast, clear, and variable day for illustration for kind in ('overcast', 'clear', 'variable'): @@ -391,16 +446,28 @@ def process_single_site(name, parameters): plt.legend(['GHI', 'DNI', 'DHI', 'Good', 'Bad']) plt.title(f'{name}\nRepresentative {kind} day. {date}') plt.savefig( - savefig_path / f'{name} representative {kind} day.png', + save_path / f'{name} representative {kind} day.png', **savefig_kwargs, ) + plt.close() # determine deadband - clear_times = pvlib.clearsky.detect_clearsky( - data['ghi'], cs['ghi'], data.index, 10 - ) + + # NaNs cause detect_clearsky to emit invalid value in comparisons, but + # no threat to results. 
+ with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=RuntimeWarning) + clear_times = pvlib.clearsky.detect_clearsky( + data['ghi'], cs['ghi'], data.index, 10 + ) ghi_rel_diff = (component_sum - data['ghi']) / data['ghi'] - u = daytime & clear_times & (ghi_ratio > 0) & (ghi_ratio < 2) & (data['ghi'] > 50) + u = ( + daytime & + clear_times & + (ghi_ratio > 0) & + (ghi_ratio < 2) & + (data['ghi'] > 50) + ) fig_deadband = plt.figure() plt.plot(ghi_rel_diff[u], 'r') @@ -426,21 +493,81 @@ def process_single_site(name, parameters): ) plt.ylabel('(Comp. sum - GHI) / GHI') plt.savefig( - savefig_path / f'{name} ghi ratio.png', + save_path / f'{name} ghi ratio.png', **savefig_kwargs, ) + plt.close() @qa_cli.command() @common_options -def post(verbose, user, password, base_url): +@click.option( + '--official', + type=bool, + help=( + 'If True, post to official observations (requires reference account).' + 'If False, create new observations in organization of user.' + ), + default=False, +) +def post(verbose, user, password, base_url, official): """Post QA results for all TA 2/3 sites. Reads data from directory ta23_site_data. - Posting requires access to reference data account.""" + + Posting to official observations requires access to reference data + account.""" set_log_level(verbose) + + # SFA reference data account. Use your own account for fetching data + # or posting results to your own observations. + reference_account = "reference@solarforecastarbiter.org" + if user == reference_account and not official: + raise ValueError("Must pass --official when using reference account.") + elif user != reference_account and official: + raise ValueError( + f"Cannot post to official observations with user {user}" + ) + + # read the data created by process function + # do this first so that we don't attempt to modify data in Arbiter unless + # we know this is good. The cost of safety is the time and memory used to + # read approximately 3*10*20MB = 600MB of csv data. + logger.info('reading site data') + data_to_post = defaultdict(dict) + for site in SITES: + p = OUTPUT_PATH / f'{site}' / 'validated_data' + for v in ('ghi', 'dni', 'dhi'): + f = p / f'{site}_validated_or_sfa_user_flagged_{v}.csv' + logger.debug('reading %s', f) + data = pd.read_csv(f, index_col=0, parse_dates=True) + if not (data.columns == pd.Index(['value', 'quality_flag'])).all(): + raise ValueError(f'wrong columns in {f}') + if not isinstance(data.index, pd.DatetimeIndex): + raise ValueError(f'wrong index in {f}') + data_to_post[site][v] = data + token = cli_access_token(user, password) session = APISession(token, base_url=base_url) + sites = session.list_sites() + ta23_sites = tuple(filter(lambda x: x.name in SITES, sites)) + obs = session.list_observations() + ta23_obs = tuple(filter( + lambda x: x.site in ta23_sites and x.variable in ['ghi', 'dni', 'dhi'], + obs + )) + if official: + # use the real obs + obs_for_post = ta23_obs + else: + # Create new obs patterned on real obs. + # Same as ta23_obs_clean but have new uuids and provider. 
+ obs_for_post = [ + session.create_observation(o) for o in ta23_obs_clean + ] + for o in obs_for_post: + _data_to_post = data_to_post[o.site.name][o.variable.lower()] + session.post_observation_values(_data_to_post) def read_metadata(name): @@ -452,7 +579,8 @@ def read_metadata(name): return loc -def read_irradiance(name, column='value'): +def read_irradiance(name): + logger.debug('Reading irradiance %s', name) directory = OUTPUT_PATH / name variables = ['ghi', 'dni', 'dhi'] data_all = {} @@ -460,10 +588,12 @@ def read_irradiance(name, column='value'): # read in all csv files with e.g. ghi in the name data_variable = [] for f in directory.glob(f'*{v}*.csv'): + logger.debug('Reading %s', f) data_section = pd.read_csv(f, index_col=0, parse_dates=True) data_variable.append(data_section) data_variable = pd.concat(data_variable) - data_all[v] = data_variable[column] + data_all[v] = data_variable['value'] + data_all[f'{v}_quality_flag'] = data_variable['quality_flag'] data_all = pd.DataFrame(data_all) return data_all From efa5fcd7507a3ce51237c10eb07b7d309504a83c Mon Sep 17 00:00:00 2001 From: Will Holmgren Date: Mon, 29 Nov 2021 14:51:44 -0700 Subject: [PATCH 06/12] fix errors --- irradiance_qa/irradiance_qa.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/irradiance_qa/irradiance_qa.py b/irradiance_qa/irradiance_qa.py index 7783f8ed..7c9d27ef 100644 --- a/irradiance_qa/irradiance_qa.py +++ b/irradiance_qa/irradiance_qa.py @@ -506,7 +506,7 @@ def process_single_site(name, parameters): type=bool, help=( 'If True, post to official observations (requires reference account).' - 'If False, create new observations in organization of user.' + ' If False, create new observations in organization of user.' ), default=False, ) @@ -552,22 +552,26 @@ def post(verbose, user, password, base_url, official): sites = session.list_sites() ta23_sites = tuple(filter(lambda x: x.name in SITES, sites)) obs = session.list_observations() - ta23_obs = tuple(filter( - lambda x: x.site in ta23_sites and x.variable in ['ghi', 'dni', 'dhi'], - obs - )) + def _is_reference_obs(o): + return ( + o.site in ta23_sites and + o.variable in ['ghi', 'dni', 'dhi'] and + o.site.provider == 'Reference' + ) + ta23_obs = tuple(filter(_is_reference_obs, obs)) if official: # use the real obs obs_for_post = ta23_obs else: # Create new obs patterned on real obs. - # Same as ta23_obs_clean but have new uuids and provider. + # Same as ta23_obs but have new uuids and provider. + # (uuid and provider is set by SFA API) obs_for_post = [ - session.create_observation(o) for o in ta23_obs_clean + session.create_observation(o) for o in ta23_obs ] for o in obs_for_post: _data_to_post = data_to_post[o.site.name][o.variable.lower()] - session.post_observation_values(_data_to_post) + session.post_observation_values(o.observation_id, _data_to_post) def read_metadata(name): From 6d3a55067bfa04723397b2ca6d4cc54dc7ba845d Mon Sep 17 00:00:00 2001 From: Will Holmgren Date: Tue, 30 Nov 2021 10:00:46 -0700 Subject: [PATCH 07/12] split into chunks --- irradiance_qa/irradiance_qa.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/irradiance_qa/irradiance_qa.py b/irradiance_qa/irradiance_qa.py index 7c9d27ef..688af939 100644 --- a/irradiance_qa/irradiance_qa.py +++ b/irradiance_qa/irradiance_qa.py @@ -535,17 +535,17 @@ def post(verbose, user, password, base_url, official): # read approximately 3*10*20MB = 600MB of csv data. 
logger.info('reading site data') data_to_post = defaultdict(dict) - for site in SITES: - p = OUTPUT_PATH / f'{site}' / 'validated_data' - for v in ('ghi', 'dni', 'dhi'): - f = p / f'{site}_validated_or_sfa_user_flagged_{v}.csv' + for site_name in SITES: + p = OUTPUT_PATH / f'{site_name}' / 'validated_data' + for variable in ('ghi', 'dni', 'dhi'): + f = p / f'{site_name}_validated_or_sfa_user_flagged_{variable}.csv' logger.debug('reading %s', f) data = pd.read_csv(f, index_col=0, parse_dates=True) if not (data.columns == pd.Index(['value', 'quality_flag'])).all(): raise ValueError(f'wrong columns in {f}') if not isinstance(data.index, pd.DatetimeIndex): raise ValueError(f'wrong index in {f}') - data_to_post[site][v] = data + data_to_post[site_name][variable] = data token = cli_access_token(user, password) session = APISession(token, base_url=base_url) @@ -570,8 +570,16 @@ def _is_reference_obs(o): session.create_observation(o) for o in ta23_obs ] for o in obs_for_post: - _data_to_post = data_to_post[o.site.name][o.variable.lower()] - session.post_observation_values(o.observation_id, _data_to_post) + site_name = o.site.name + variable = o.variable.lower() + _data_to_post = data_to_post[site_name][variable] + # Split into chunks to stay under API upload limit. + grouped_data = _data_to_post.groupby(lambda x: (x.year, x.month)) + for yr_mo, values in grouped_data: + logger.debug( + 'posting data for %s %s %s', yr_mo, site_name, variable + ) + session.post_observation_values(o.observation_id, values) def read_metadata(name): From 414b92e06ded1140c555d59a28a23f5a4d52a240 Mon Sep 17 00:00:00 2001 From: Will Holmgren Date: Wed, 1 Dec 2021 10:48:54 -0700 Subject: [PATCH 08/12] add validation check --- irradiance_qa/irradiance_qa.py | 56 ++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/irradiance_qa/irradiance_qa.py b/irradiance_qa/irradiance_qa.py index 688af939..c8b6770d 100644 --- a/irradiance_qa/irradiance_qa.py +++ b/irradiance_qa/irradiance_qa.py @@ -7,6 +7,7 @@ import json import logging from pathlib import Path +import time import warnings import click @@ -16,6 +17,9 @@ import matplotlib.pyplot as plt from solarforecastarbiter.cli import common_options, cli_access_token from solarforecastarbiter.io.api import APISession +from solarforecastarbiter.validation.quality_mapping import ( + convert_mask_into_dataframe +) from solarforecastarbiter.validation.validator import ( QCRAD_CONSISTENCY, check_irradiance_consistency_QCRad @@ -580,6 +584,58 @@ def _is_reference_obs(o): 'posting data for %s %s %s', yr_mo, site_name, variable ) session.post_observation_values(o.observation_id, values) + # sometimes API fails to validate upload, so wait and retry if + # needed + validated = wait_for_validation(session, o, values) + if not validated: + logger.info('validation appears hung. reposting data') + session.post_observation_values(o.observation_id, values) + validated = wait_for_validation(session, o, values) + if not validated: + logger.info('validation appears hung. 
reposting data') + session.post_observation_values(o.observation_id, values) + validated = wait_for_validation(session, o, values) + if not validated: + logger.warning( + 'validation failed for %s %s %s', yr_mo, site_name, + variable + ) + + +def wait_for_validation( + session, observation, values, sleep_initial=5, sleep_loop=30, limit=5 +): + logger.debug( + f'sleeping for {sleep_initial} seconds to check validation status' + ) + time.sleep(sleep_initial) + validated = check_validated(session, observation, values) + i = 0 + while not validated and i < limit: + logger.info( + f'sleeping for {sleep_loop} seconds to check validation status' + ) + time.sleep(sleep_loop) + validated = check_validated(session, observation, values) + i += 1 + return validated + + +def check_validated(session, observation, values): + """Pull values from API and see if they have been validated. + + Returns + ------- + True if data has been validated by API. + """ + first = session.get_values(observation, values.index[0], values.index[2]) + last = session.get_values(observation, values.index[-2], values.index[0]) + first_flags = convert_mask_into_dataframe(first['quality_flag']) + last_flags = convert_mask_into_dataframe(last['quality_flag']) + return not ( + first_flags['NOT VALIDATED'].any() or + last_flags['NOT VALIDATED'].any() + ) def read_metadata(name): From 5097e6aae63b17116224c883c3b9575f3102acc8 Mon Sep 17 00:00:00 2001 From: Will Holmgren Date: Wed, 1 Dec 2021 14:33:37 -0700 Subject: [PATCH 09/12] fix bugs --- irradiance_qa/irradiance_qa.py | 69 ++++++++++++++++++++++++++-------- 1 file changed, 54 insertions(+), 15 deletions(-) diff --git a/irradiance_qa/irradiance_qa.py b/irradiance_qa/irradiance_qa.py index c8b6770d..a035331b 100644 --- a/irradiance_qa/irradiance_qa.py +++ b/irradiance_qa/irradiance_qa.py @@ -5,6 +5,7 @@ from collections import defaultdict import json +from functools import partial import logging from pathlib import Path import time @@ -553,35 +554,62 @@ def post(verbose, user, password, base_url, official): token = cli_access_token(user, password) session = APISession(token, base_url=base_url) + current_organization = get_current_organization(session) sites = session.list_sites() ta23_sites = tuple(filter(lambda x: x.name in SITES, sites)) obs = session.list_observations() - def _is_reference_obs(o): + + def _is_provider_obs(o, provider): return ( o.site in ta23_sites and o.variable in ['ghi', 'dni', 'dhi'] and - o.site.provider == 'Reference' + o.provider == provider ) - ta23_obs = tuple(filter(_is_reference_obs, obs)) + + ta23_obs = tuple(filter( + partial(_is_provider_obs, provider='Reference'), + obs + )) if official: # use the real obs obs_for_post = ta23_obs else: - # Create new obs patterned on real obs. - # Same as ta23_obs but have new uuids and provider. - # (uuid and provider is set by SFA API) - obs_for_post = [ - session.create_observation(o) for o in ta23_obs - ] - for o in obs_for_post: + ta23_obs_org = tuple(filter( + partial(_is_provider_obs, provider=current_organization), + obs + )) + if len(ta23_obs_org) == len(ta23_obs): + # we already have existing obs, likely from previous run, + # so no need to recreate. Must have exact length match! + logger.info( + f'using existing observations in {current_organization}' + ) + obs_for_post = ta23_obs_org + else: + # Create new obs patterned on real obs. + # Same as ta23_obs but have new uuids and provider. 
+ # (uuid and provider is set by SFA API) + logger.info(f'creating new observations in {current_organization}') + obs_for_post = [ + session.create_observation(o) for o in ta23_obs + ] + for o in obs_for_post: # ~ 5 minutes per observation x 30 obs = 150 min site_name = o.site.name variable = o.variable.lower() _data_to_post = data_to_post[site_name][variable] + # Split into chunks to stay under API upload limit. - grouped_data = _data_to_post.groupby(lambda x: (x.year, x.month)) - for yr_mo, values in grouped_data: + # DeprecationWarning if pandas >= 1.1.0, but we support earlier pandas + # and supporting both would be too complicated. If using this with + # new pandas you can use week = obs_df.index.isocalendar().week + # and similar for year. + week = _data_to_post.index.week + month = _data_to_post.index.month + year = _data_to_post.index.year + grouped_data = _data_to_post.groupby([year, month, week]) + for group, values in grouped_data: logger.debug( - 'posting data for %s %s %s', yr_mo, site_name, variable + 'posting data for %s %s %s', group, site_name, variable ) session.post_observation_values(o.observation_id, values) # sometimes API fails to validate upload, so wait and retry if @@ -597,11 +625,22 @@ def _is_reference_obs(o): validated = wait_for_validation(session, o, values) if not validated: logger.warning( - 'validation failed for %s %s %s', yr_mo, site_name, + 'validation failed for %s %s %s', group, site_name, variable ) +def get_current_organization(session): + r = session.get('/users/current') + r_json = r.json() + organization = r_json['organization'] + if organization == 'Unaffiliated': + raise ValueError( + 'User must be affiliated with an SFA organization to post data.' + ) + return organization + + def wait_for_validation( session, observation, values, sleep_initial=5, sleep_loop=30, limit=5 ): @@ -629,7 +668,7 @@ def check_validated(session, observation, values): True if data has been validated by API. """ first = session.get_values(observation, values.index[0], values.index[2]) - last = session.get_values(observation, values.index[-2], values.index[0]) + last = session.get_values(observation, values.index[-3], values.index[-1]) first_flags = convert_mask_into_dataframe(first['quality_flag']) last_flags = convert_mask_into_dataframe(last['quality_flag']) return not ( From 0ad491832ad66da2001e702b90413e5585012211 Mon Sep 17 00:00:00 2001 From: Will Holmgren Date: Thu, 2 Dec 2021 10:56:01 -0700 Subject: [PATCH 10/12] comments --- irradiance_qa/irradiance_qa.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/irradiance_qa/irradiance_qa.py b/irradiance_qa/irradiance_qa.py index a035331b..2b088ad3 100644 --- a/irradiance_qa/irradiance_qa.py +++ b/irradiance_qa/irradiance_qa.py @@ -593,16 +593,19 @@ def _is_provider_obs(o, provider): obs_for_post = [ session.create_observation(o) for o in ta23_obs ] - for o in obs_for_post: # ~ 5 minutes per observation x 30 obs = 150 min + # ~ 5 minutes per observation x 30 obs = 150 minutes + # token expires after 180 minutes! + for o in obs_for_post: site_name = o.site.name variable = o.variable.lower() _data_to_post = data_to_post[site_name][variable] - # Split into chunks to stay under API upload limit. - # DeprecationWarning if pandas >= 1.1.0, but we support earlier pandas - # and supporting both would be too complicated. If using this with - # new pandas you can use week = obs_df.index.isocalendar().week - # and similar for year. + # Split into chunks to stay under API upload limit. 
DeprecationWarning + # if pandas >= 1.1.0, but we support earlier pandas and supporting both + # would be too complicated. If using this with new pandas you can use + # week = obs_df.index.isocalendar().week and similar for year. + # Weekly chunks match SFA internal validation chunks, thus minimizing + # risk of hung validation jobs on the server. week = _data_to_post.index.week month = _data_to_post.index.month year = _data_to_post.index.year @@ -612,8 +615,8 @@ def _is_provider_obs(o, provider): 'posting data for %s %s %s', group, site_name, variable ) session.post_observation_values(o.observation_id, values) - # sometimes API fails to validate upload, so wait and retry if - # needed + # sometimes API fails to validate upload, so wait, check, and + # retry if needed validated = wait_for_validation(session, o, values) if not validated: logger.info('validation appears hung. reposting data') @@ -642,7 +645,7 @@ def get_current_organization(session): def wait_for_validation( - session, observation, values, sleep_initial=5, sleep_loop=30, limit=5 + session, observation, values, sleep_initial=4, sleep_loop=15, limit=5 ): logger.debug( f'sleeping for {sleep_initial} seconds to check validation status' From a29781f499cb2c24f1b052bfa5a298656d44e5c4 Mon Sep 17 00:00:00 2001 From: Will Holmgren Date: Tue, 7 Dec 2021 10:32:12 -0700 Subject: [PATCH 11/12] refactor download site filter --- irradiance_qa/irradiance_qa.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/irradiance_qa/irradiance_qa.py b/irradiance_qa/irradiance_qa.py index 2b088ad3..5d8f0ff5 100644 --- a/irradiance_qa/irradiance_qa.py +++ b/irradiance_qa/irradiance_qa.py @@ -178,7 +178,7 @@ def download(verbose, user, password, base_url): p.write_text(json.dumps(site.to_dict(), indent=4)) obs = session.list_observations() ta23_obs = tuple(filter( - lambda x: x.site in ta23_sites and x.variable in ['ghi', 'dni', 'dhi'], + partial(is_provider_obs, provider='Reference', sites=ta23_sites), obs )) for o in ta23_obs: @@ -559,15 +559,8 @@ def post(verbose, user, password, base_url, official): ta23_sites = tuple(filter(lambda x: x.name in SITES, sites)) obs = session.list_observations() - def _is_provider_obs(o, provider): - return ( - o.site in ta23_sites and - o.variable in ['ghi', 'dni', 'dhi'] and - o.provider == provider - ) - ta23_obs = tuple(filter( - partial(_is_provider_obs, provider='Reference'), + partial(is_provider_obs, provider='Reference', sites=ta23_sites), obs )) if official: @@ -575,7 +568,11 @@ def _is_provider_obs(o, provider): obs_for_post = ta23_obs else: ta23_obs_org = tuple(filter( - partial(_is_provider_obs, provider=current_organization), + partial( + is_provider_obs, + provider=current_organization, + sites=ta23_sites + ), obs )) if len(ta23_obs_org) == len(ta23_obs): @@ -633,6 +630,14 @@ def _is_provider_obs(o, provider): ) +def is_provider_obs(obs, provider, sites): + return ( + obs.site in sites and + obs.variable in ['ghi', 'dni', 'dhi'] and + obs.provider == provider + ) + + def get_current_organization(session): r = session.get('/users/current') r_json = r.json() From 549c81c6edc880b6388add7101314717c2e64f25 Mon Sep 17 00:00:00 2001 From: Will Holmgren Date: Thu, 9 Dec 2021 11:10:44 -0700 Subject: [PATCH 12/12] adapt for ta3 --- irradiance_qa/irradiance_qa.py | 233 +++++++++++++++++++++------------ 1 file changed, 152 insertions(+), 81 deletions(-) diff --git a/irradiance_qa/irradiance_qa.py b/irradiance_qa/irradiance_qa.py index 5d8f0ff5..d3c6df05 100644 --- 
a/irradiance_qa/irradiance_qa.py +++ b/irradiance_qa/irradiance_qa.py @@ -31,9 +31,12 @@ 'consistency_limits': { 'dhi_ratio': {'ratio_bounds': [0, 1.5]} }, - 'overcast': '2018-02-03', - 'clear': '2018-04-20', - 'variable': '2018-03-03', + 'overcast_ta2': '2018-02-03', + 'clear_ta2': '2018-04-20', + 'variable_ta2': '2018-03-03', + 'overcast_ta3': '2021-10-12', + 'clear_ta3': '2021-10-03', + 'variable_ta3': '2021-10-11', 'site_id': '9dfa7910-7e49-11e9-b4e8-0a580a8003e9', 'timezone': 'Etc/GMT+7', }, @@ -41,9 +44,12 @@ 'consistency_limits': { 'dhi_ratio': {'ratio_bounds': [0, 1.5]} }, - 'overcast': '2018-01-01', - 'clear': '2018-03-03', - 'variable': '2018-04-22', + 'overcast_ta2': '2018-01-01', + 'clear_ta2': '2018-03-03', + 'variable_ta2': '2018-04-22', + 'overcast_ta3': '2021-10-01', + 'clear_ta3': '2021-10-03', + 'variable_ta3': '2021-10-22', 'site_id': 'a9d0d140-99fc-11e9-81fa-0a580a8200c9', 'timezone': 'Etc/GMT+5', }, @@ -52,9 +58,12 @@ 'dhi_ratio': {'ratio_bounds': [0, 1.5]}, 'ghi_ratio': {'ratio_bounds': [0.84, 1.16]}, }, - 'overcast': '2018-01-10', - 'clear': '2018-03-03', - 'variable': '2018-01-14', + 'overcast_ta2': '2018-01-10', + 'clear_ta2': '2018-03-03', + 'variable_ta2': '2018-01-14', + 'overcast_ta3': '2021-10-02', + 'clear_ta3': '2021-09-27', + 'variable_ta3': '2021-09-29', 'site_id': '9e4e98ac-7e49-11e9-a7c4-0a580a8003e9', 'timezone': 'Etc/GMT+6', }, @@ -63,9 +72,12 @@ 'dhi_ratio': {'ratio_bounds': [0, 1.5]}, 'ghi_ratio': {'ratio_bounds': [0.84, 1.16]}, }, - 'overcast': '2018-01-17', - 'clear': '2018-02-08', - 'variable': '2018-03-03', + 'overcast_ta2': '2018-01-17', + 'clear_ta2': '2018-02-08', + 'variable_ta2': '2018-03-03', + 'overcast_ta3': '2021-11-15', + 'clear_ta3': '2021-10-10', + 'variable_ta3': '2021-10-29', 'site_id': 'c291964c-7e49-11e9-af46-0a580a8003e9', 'timezone': 'Etc/GMT+8', }, @@ -73,17 +85,23 @@ 'consistency_limits': { 'dhi_ratio': {'ratio_bounds': [0, 1.5]} }, - 'overcast': '2018-01-14', - 'clear': '2018-07-01', - 'variable': '2018-03-03', + 'overcast_ta2': '2018-01-14', + 'clear_ta2': '2018-07-01', + 'variable_ta2': '2018-03-03', + 'overcast_ta3': '2021-10-17', + 'clear_ta3': '2021-10-09', + 'variable_ta3': '2021-10-10', 'site_id': '9feac63a-7e49-11e9-9bde-0a580a8003e9', 'timezone': 'Etc/GMT+8', }, 'DOE ARM Southern Great Plains SGP, Lamont, Oklahoma': { 'consistency_limits': {}, - 'overcast': '2018-02-10', - 'clear': '2018-01-23', - 'variable': '2018-05-04', + 'overcast_ta2': '2018-02-10', + 'clear_ta2': '2018-01-23', + 'variable_ta2': '2018-05-04', + 'overcast_ta3': '2021-11-02', + 'clear_ta3': '2021-10-17', + 'variable_ta3': '2021-11-05', 'site_id': 'd52d47e6-88c4-11ea-b5f8-0a580a820092', 'timezone': 'Etc/GMT+6', }, @@ -91,9 +109,12 @@ 'consistency_limits': { 'dhi_ratio': {'ratio_bounds': [0, 1.5]} }, - 'overcast': '2018-02-13', - 'clear': '2018-04-20', - 'variable': '2018-03-03', + 'overcast_ta2': '2018-02-13', + 'clear_ta2': '2018-04-20', + 'variable_ta2': '2018-03-03', + 'overcast_ta3': '2021-10-13', + 'clear_ta3': '2021-10-20', + 'variable_ta3': '2021-10-03', 'site_id': '371a5e3a-1888-11eb-959e-0a580a820169', 'timezone': 'Etc/GMT+5', }, @@ -102,9 +123,12 @@ 'dhi_ratio': {'ratio_bounds': [0, 1.5]}, 'ghi_ratio': {'ratio_bounds': [0.85, 1.15]}, }, - 'overcast': '2018-01-12', - 'clear': '2018-01-14', - 'variable': '2018-01-01', + 'overcast_ta2': '2018-01-12', + 'clear_ta2': '2018-01-14', + 'variable_ta2': '2018-01-01', + 'overcast_ta3': '2021-10-07', + 'clear_ta3': '2021-09-02', + 'variable_ta3': '2021-09-24', 'site_id': 
'9e69b108-7e49-11e9-a3df-0a580a8003e9', 'timezone': 'Etc/GMT+5', }, @@ -113,9 +137,12 @@ 'dhi_ratio': {'ratio_bounds': [0, 1.5]}, 'ghi_ratio': {'ratio_bounds': [0.85, 1.15]}, }, - 'overcast': '2018-01-27', - 'clear': '2018-02-11', - 'variable': '2018-02-09', + 'overcast_ta2': '2018-01-27', + 'clear_ta2': '2018-02-11', + 'variable_ta2': '2018-02-09', + 'overcast_ta3': '2021-10-27', + 'clear_ta3': '2021-10-11', + 'variable_ta3': '2021-10-09', 'site_id': '4a4e1f82-a2d1-11eb-90bf-0a580a820087', 'timezone': 'Etc/GMT+8', }, @@ -124,9 +151,12 @@ 'dhi_ratio': {'ratio_bounds': [0, 1.5]}, 'ghi_ratio': {'ratio_bounds': [0.85, 1.15]}, }, - 'overcast': '2018-01-14', - 'clear': '2018-01-01', - 'variable': '2018-03-03', + 'overcast_ta2': '2018-01-14', + 'clear_ta2': '2018-01-01', + 'variable_ta2': '2018-03-03', + 'overcast_ta3': '2021-10-02', + 'clear_ta3': '2021-10-05', + 'variable_ta3': '2021-10-15', 'site_id': '9e888c48-7e49-11e9-9a66-0a580a8003e9', 'timezone': 'Etc/GMT+6', }, @@ -156,9 +186,17 @@ def set_log_level(verbose): logging.getLogger().setLevel(loglevel) +ta_option = click.option( + '--topic-area', + type=click.Choice(['2', '3']), + help='Topic Area (defines date range)' +) + + @qa_cli.command() @common_options -def download(verbose, user, password, base_url): +@ta_option +def download(verbose, user, password, base_url, topic_area): """Download metadata and time series data for all TA 2/3 sites. Data saved in new directory named ta23_site_data.""" @@ -168,61 +206,89 @@ def download(verbose, user, password, base_url): sites = session.list_sites() ta23_sites = tuple(filter(lambda x: x.name in SITES, sites)) OUTPUT_PATH.mkdir(parents=True, exist_ok=True) + # observation names were not created consistently across the different # networks so need to keep a nested directory structure to keep things # clean. also mirrors the organization of data in the arbiter. for site in ta23_sites: - p = OUTPUT_PATH / f'{site.name}' - p.mkdir(exist_ok=True) - p /= f'{site.name}.json' - p.write_text(json.dumps(site.to_dict(), indent=4)) + p_site = OUTPUT_PATH / f'{site.name}' + p_site.mkdir(exist_ok=True) + p_site_meta = p_site / f'{site.name}.json' + # write out site metadata + p_site_meta.write_text(json.dumps(site.to_dict(), indent=4)) + p_ta = p_site / f'TA{topic_area}' + p_ta.mkdir(exist_ok=True) obs = session.list_observations() ta23_obs = tuple(filter( partial(is_provider_obs, provider='Reference', sites=ta23_sites), obs )) for o in ta23_obs: - # WH: I'm terribly embarassed by the length of this for loop. logger.info('Fetching data for %s', o.name) # o.site.name will match paths created above - p = OUTPUT_PATH / f'{o.site.name}' / f'{o.name}.json' - p.write_text(json.dumps(o.to_dict(), indent=4)) + p_site = OUTPUT_PATH / f'{o.site.name}' + p_obs_meta = p_site / f'{o.name}.json' + # write out observation metadata + p_obs_meta.write_text(json.dumps(o.to_dict(), indent=4)) # pull data by quarters to work around API query length - # limitation 2018 for TA2 analysis. eventually extend to 2021 - # for TA3. TA2/3 reports use Etc/GMT timezone, while some SFA - # Site timezones are DST aware. + # limitation 2018 for TA2 analysis. TA2/3 reports use Etc/GMT + # timezone, while some SFA Site timezones are DST aware. 
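
A quick aside on the fixed-offset zones mentioned in that comment: in the IANA database
the 'Etc/GMT+N' names have inverted signs, so 'Etc/GMT+7' is UTC-07:00 year round, which
is what keeps the converted index free of DST shifts. For example (America/Denver is
shown only for contrast with the Boulder site's Etc/GMT+7 entry):

import pandas as pd

ts = pd.Timestamp('2018-07-01 12:00', tz='UTC')
print(ts.tz_convert('Etc/GMT+7'))       # 2018-07-01 05:00:00-07:00, fixed offset
print(ts.tz_convert('America/Denver'))  # 2018-07-01 06:00:00-06:00, DST in effect
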
tz = SITES[o.site.name]['timezone'] + start_ends = _ta_23_start_end(topic_area, tz) + values = _loop_over_start_ends(session, start_ends, o) + # construct filename that follows same pattern as API but use our + # requested times so we know for certain what the file will be named + p_data = _format_data_path(o, start_ends, p_site, topic_area) + values.to_csv(p_data) + + +def _format_data_path(observation, start_ends, p_site, topic_area): + name = observation.name.replace(' ', '_') + first_start = start_ends.iloc[0]['start'].isoformat() + last_end = start_ends.iloc[-1]['end'].isoformat() + filename = f'{name}_{first_start}-{last_end}.csv' + filename = filename.replace(':', '_') + p_data = p_site / f'TA{topic_area}' / filename + return p_data + + +def _ta_23_start_end(topic_area, tz): + if topic_area == '2': quarters = pd.date_range('2018-01-01', freq='QS', periods=5, tz=tz) start_ends = pd.DataFrame( {'start': quarters[:4], 'end': quarters[1:]-pd.Timedelta('1s')} ) - values_segments = [] - for _, start_end in start_ends.iterrows(): - start = start_end['start'] - end = start_end['end'] - values_segment = session.get_observation_values( - o.observation_id, - start=start, - end=end, - ) - values_segments.append(values_segment) - values = pd.concat(values_segments) - # construct filename that follows same pattern as API but use our - # requested times so we know for certain what the file will be named - name = o.name.replace(' ', '_') - first_start = start_ends.iloc[0]['start'].isoformat() - last_end = start_ends.iloc[-1]['end'].isoformat() - filename = f'{name}_{first_start}-{last_end}.csv' - filename = filename.replace(':', '_') - p_data = OUTPUT_PATH / f'{o.site.name}' / filename - values.to_csv(p_data) + elif topic_area == '3': + start_ends = pd.DataFrame({ + 'start': pd.Timestamp('2021-09-02', tz=tz), + 'end': pd.Timestamp('2021-12-01', tz=tz) + }, index=[0]) + else: + raise ValueError('topic_area must be 2 or 3') + return start_ends + + +def _loop_over_start_ends(session, start_ends, observation): + values_segments = [] + for _, start_end in start_ends.iterrows(): + start = start_end['start'] + end = start_end['end'] + values_segment = session.get_observation_values( + observation.observation_id, + start=start, + end=end, + ) + values_segments.append(values_segment) + values = pd.concat(values_segments) + return values @qa_cli.command() @click.option('-v', '--verbose', count=True, help='Increase logging verbosity') @click.option('--site', type=str, help='Site to process', default='all') -def process(verbose, site): +@ta_option +def process(verbose, site, topic_area): """Process time series data for all TA 2/3 sites. 
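
To make the date handling concrete, this is roughly what the TA2 branch of
_ta_23_start_end evaluates to for the 'Etc/GMT+7' zone (a sketch for illustration;
exact printing depends on the pandas version):

import pandas as pd

tz = 'Etc/GMT+7'
quarters = pd.date_range('2018-01-01', freq='QS', periods=5, tz=tz)
start_ends = pd.DataFrame(
    {'start': quarters[:4], 'end': quarters[1:] - pd.Timedelta('1s')}
)
print(start_ends)
#                        start                        end
# 0  2018-01-01 00:00:00-07:00  2018-03-31 23:59:59-07:00
# 1  2018-04-01 00:00:00-07:00  2018-06-30 23:59:59-07:00
# 2  2018-07-01 00:00:00-07:00  2018-09-30 23:59:59-07:00
# 3  2018-10-01 00:00:00-07:00  2018-12-31 23:59:59-07:00
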
Reads data from directory ta23_site_data and writes QA results to @@ -233,18 +299,21 @@ def process(verbose, site): else: sites_to_process = {site: SITES[site]} for name, parameters in sites_to_process.items(): - process_single_site(name, parameters) + process_single_site(name, parameters, topic_area) -def process_single_site(name, parameters): +def process_single_site(name, parameters, topic_area): + if any(frag in name for frag in ('Cocoa', 'PNNL', 'NASA')): + logger.warning('Skipping %s', name) + return logger.info('Processing %s', name) - save_path = OUTPUT_PATH / name + save_path = OUTPUT_PATH / name / f'TA{topic_area}' save_path_validated = save_path / 'validated_data' save_path_validated.mkdir(exist_ok=True) loc = read_metadata(name) - data_original = read_irradiance(name) + data_original = read_irradiance(name, topic_area) # TA2/3 analysis uses fixed offsets, but reference site metadata # typically uses DST aware timezones data_original = data_original.tz_convert(parameters['timezone']) @@ -337,7 +406,7 @@ def process_single_site(name, parameters): bad_comp = ~consistent_comp & daytime bad_comp = data['ghi'] * bad_comp bad_comp[bad_comp == 0] = np.nan - fig_cons = plt.figure() + _ = plt.figure() plt.plot(data['ghi']) plt.plot(data['dni']) plt.plot(data['dhi']) @@ -353,7 +422,7 @@ def process_single_site(name, parameters): bad_diff = ~diffuse_ratio_limit & daytime bad_diff = data['ghi'] * bad_diff bad_diff[bad_diff == 0] = np.nan - fig_cons = plt.figure() + _ = plt.figure() plt.plot(data['ghi']) plt.plot(data['dni']) plt.plot(data['dhi']) @@ -367,7 +436,7 @@ def process_single_site(name, parameters): plt.close() # overall accept/reject plot - fig_summary = plt.figure() + _ = plt.figure() plt.plot(data['ghi']) plt.plot(data['dni']) plt.plot(data['dhi']) @@ -394,7 +463,7 @@ def process_single_site(name, parameters): count_cc[i] = (consistent_comp[u] | overcast_ok[u]).sum() count_diff[i] = diffuse_ratio_limit[u].sum() count_good[i] = good_overall[u].sum() - fig_accept = plt.figure() + _ = plt.figure() plt.plot(bins[:-1], count_tot) plt.plot(bins[:-1], count_good) plt.plot(bins[:-1], count_cc) @@ -414,9 +483,9 @@ def process_single_site(name, parameters): hrs = range(4, 21) boxplot_data = [] hr_count = good_overall.resample('H').sum() - for idx, h in enumerate(hrs): + for h in hrs: boxplot_data.append(hr_count[hr_count.index.hour == h].values) - fig_boxplot, ax_boxplot = plt.subplots() + _, ax_boxplot = plt.subplots() plt.boxplot(boxplot_data) ax_boxplot.set_xticklabels([str(h) for h in hrs]) plt.xlabel('Hour of day') @@ -430,14 +499,14 @@ def process_single_site(name, parameters): # plot one overcast, clear, and variable day for illustration for kind in ('overcast', 'clear', 'variable'): - date = parameters[kind] + date = parameters[kind + f'_ta{topic_area}'] dr = pd.date_range( start=f'{date} 06:00:00', end=f'{date} 18:00:00', freq='1T', tz=data.index.tz ) - fig_day = plt.figure() + _ = plt.figure() plt.plot(data.loc[dr, 'ghi']) plt.plot(data.loc[dr, 'dni']) plt.plot(data.loc[dr, 'dhi']) @@ -447,7 +516,7 @@ def process_single_site(name, parameters): bad_mask[bad_mask == False] = np.nan plt.plot(good_mask * data.loc[dr, 'ghi'], 'g.') plt.plot(bad_mask * data.loc[dr, 'ghi'], 'r.') - #plt.plot(sp.loc[dr, 'zenith']) + # plt.plot(sp.loc[dr, 'zenith']) plt.legend(['GHI', 'DNI', 'DHI', 'Good', 'Bad']) plt.title(f'{name}\nRepresentative {kind} day. 
{date}') plt.savefig( @@ -474,7 +543,7 @@ def process_single_site(name, parameters): (data['ghi'] > 50) ) - fig_deadband = plt.figure() + _ = plt.figure() plt.plot(ghi_rel_diff[u], 'r') plt.text( ghi_rel_diff.index[50000], @@ -515,7 +584,8 @@ def process_single_site(name, parameters): ), default=False, ) -def post(verbose, user, password, base_url, official): +@ta_option +def post(verbose, user, password, base_url, official, topic_area): """Post QA results for all TA 2/3 sites. Reads data from directory ta23_site_data. @@ -537,11 +607,12 @@ def post(verbose, user, password, base_url, official): # read the data created by process function # do this first so that we don't attempt to modify data in Arbiter unless # we know this is good. The cost of safety is the time and memory used to - # read approximately 3*10*20MB = 600MB of csv data. + # read approximately 3*10*20MB = 600MB of csv data for TA2 + # (1/4 as much for TA3). logger.info('reading site data') data_to_post = defaultdict(dict) for site_name in SITES: - p = OUTPUT_PATH / f'{site_name}' / 'validated_data' + p = OUTPUT_PATH / f'{site_name}/TA{topic_area}/validated_data' for variable in ('ghi', 'dni', 'dhi'): f = p / f'{site_name}_validated_or_sfa_user_flagged_{variable}.csv' logger.debug('reading %s', f) @@ -694,9 +765,9 @@ def read_metadata(name): return loc -def read_irradiance(name): +def read_irradiance(name, topic_area): logger.debug('Reading irradiance %s', name) - directory = OUTPUT_PATH / name + directory = OUTPUT_PATH / name / f'TA{topic_area}' variables = ['ghi', 'dni', 'dhi'] data_all = {} for v in variables:
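
Mirroring the path handling above, a rough standalone sketch of reading the per-variable
validated files back under the TA{n} layout and filename pattern used by the post
command (the read_csv options are assumptions, not taken from the script):

from pathlib import Path

import pandas as pd

OUTPUT_PATH = Path('ta23_site_data')

def load_validated(site_name, topic_area='2'):
    """Return {'ghi': df, 'dni': df, 'dhi': df} for one site and topic area."""
    p = OUTPUT_PATH / f'{site_name}/TA{topic_area}/validated_data'
    data = {}
    for variable in ('ghi', 'dni', 'dhi'):
        f = p / f'{site_name}_validated_or_sfa_user_flagged_{variable}.csv'
        data[variable] = pd.read_csv(f, index_col=0, parse_dates=True)
    return data
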