From 6c4aaff268b89c5408a7117bb3b76329429ab8a4 Mon Sep 17 00:00:00 2001 From: Jan Griesfeller Date: Fri, 12 Apr 2024 16:31:09 +0200 Subject: [PATCH] save WIP --- src/pyaro_readers/nilupmfebas/_logging.py | 83 - src/pyaro_readers/nilupmfebas/_warnings.py | 62 - .../nilupmfebas/aerocom_browser.py | 215 -- .../nilupmfebas/aux_var_helpers.py | 848 ----- src/pyaro_readers/nilupmfebas/colocation.py | 1093 ------ .../nilupmfebas/combine_vardata_ungridded.py | 494 --- src/pyaro_readers/nilupmfebas/const.py | 314 +- .../nilupmfebas/country_codes.json | 1 - src/pyaro_readers/nilupmfebas/ebas_config.ini | 628 ---- .../nilupmfebas/ebas_file_index.py | 342 -- src/pyaro_readers/nilupmfebas/ebas_flags.csv | 153 - src/pyaro_readers/nilupmfebas/ebas_varinfo.py | 250 -- src/pyaro_readers/nilupmfebas/geodesy.py | 376 -- src/pyaro_readers/nilupmfebas/grid_io.py | 170 - src/pyaro_readers/nilupmfebas/helpers.py | 1824 ---------- .../nilupmfebas/helpers_landsea_masks.py | 268 -- src/pyaro_readers/nilupmfebas/io_helpers.py | 317 -- src/pyaro_readers/nilupmfebas/mathutils.py | 532 --- .../nilupmfebas/metastandards.py | 395 --- src/pyaro_readers/nilupmfebas/molmasses.py | 111 - src/pyaro_readers/nilupmfebas/obs_io.py | 141 - .../nilupmfebas/readungridded.py | 897 ----- .../nilupmfebas/readungriddedbase.py | 661 ---- src/pyaro_readers/nilupmfebas/region.py | 416 --- src/pyaro_readers/nilupmfebas/region_defs.py | 183 - src/pyaro_readers/nilupmfebas/resources.py | 42 - src/pyaro_readers/nilupmfebas/stationdata.py | 1542 -------- src/pyaro_readers/nilupmfebas/time_config.py | 90 - .../nilupmfebas/time_resampler.py | 256 -- src/pyaro_readers/nilupmfebas/tstype.py | 426 --- .../nilupmfebas/ungriddeddata.py | 3124 ----------------- .../nilupmfebas/units_helpers.py | 328 -- src/pyaro_readers/nilupmfebas/utils.py | 75 - src/pyaro_readers/nilupmfebas/var_groups.py | 25 - .../nilupmfebas/varcollection.py | 213 -- src/pyaro_readers/nilupmfebas/variable.py | 597 ---- .../nilupmfebas/variable_helpers.py | 110 - src/pyaro_readers/nilupmfebas/varnameinfo.py | 144 - 38 files changed, 156 insertions(+), 17590 deletions(-) delete mode 100644 src/pyaro_readers/nilupmfebas/_logging.py delete mode 100644 src/pyaro_readers/nilupmfebas/_warnings.py delete mode 100644 src/pyaro_readers/nilupmfebas/aerocom_browser.py delete mode 100644 src/pyaro_readers/nilupmfebas/aux_var_helpers.py delete mode 100644 src/pyaro_readers/nilupmfebas/colocation.py delete mode 100644 src/pyaro_readers/nilupmfebas/combine_vardata_ungridded.py delete mode 100644 src/pyaro_readers/nilupmfebas/country_codes.json delete mode 100644 src/pyaro_readers/nilupmfebas/ebas_config.ini delete mode 100644 src/pyaro_readers/nilupmfebas/ebas_file_index.py delete mode 100644 src/pyaro_readers/nilupmfebas/ebas_flags.csv delete mode 100644 src/pyaro_readers/nilupmfebas/ebas_varinfo.py delete mode 100644 src/pyaro_readers/nilupmfebas/geodesy.py delete mode 100644 src/pyaro_readers/nilupmfebas/grid_io.py delete mode 100644 src/pyaro_readers/nilupmfebas/helpers.py delete mode 100644 src/pyaro_readers/nilupmfebas/helpers_landsea_masks.py delete mode 100644 src/pyaro_readers/nilupmfebas/io_helpers.py delete mode 100644 src/pyaro_readers/nilupmfebas/mathutils.py delete mode 100644 src/pyaro_readers/nilupmfebas/metastandards.py delete mode 100644 src/pyaro_readers/nilupmfebas/molmasses.py delete mode 100644 src/pyaro_readers/nilupmfebas/obs_io.py delete mode 100755 src/pyaro_readers/nilupmfebas/readungridded.py delete mode 100644 src/pyaro_readers/nilupmfebas/readungriddedbase.py delete 
mode 100644 src/pyaro_readers/nilupmfebas/region.py delete mode 100644 src/pyaro_readers/nilupmfebas/region_defs.py delete mode 100644 src/pyaro_readers/nilupmfebas/resources.py delete mode 100644 src/pyaro_readers/nilupmfebas/stationdata.py delete mode 100644 src/pyaro_readers/nilupmfebas/time_config.py delete mode 100644 src/pyaro_readers/nilupmfebas/time_resampler.py delete mode 100644 src/pyaro_readers/nilupmfebas/tstype.py delete mode 100644 src/pyaro_readers/nilupmfebas/ungriddeddata.py delete mode 100644 src/pyaro_readers/nilupmfebas/units_helpers.py delete mode 100644 src/pyaro_readers/nilupmfebas/utils.py delete mode 100644 src/pyaro_readers/nilupmfebas/var_groups.py delete mode 100644 src/pyaro_readers/nilupmfebas/varcollection.py delete mode 100644 src/pyaro_readers/nilupmfebas/variable.py delete mode 100644 src/pyaro_readers/nilupmfebas/variable_helpers.py delete mode 100644 src/pyaro_readers/nilupmfebas/varnameinfo.py diff --git a/src/pyaro_readers/nilupmfebas/_logging.py b/src/pyaro_readers/nilupmfebas/_logging.py deleted file mode 100644 index b8fa64a..0000000 --- a/src/pyaro_readers/nilupmfebas/_logging.py +++ /dev/null @@ -1,83 +0,0 @@ -""" -Logging configuration and package metadata helpers - -NOTE: -All pyaerocom child modules share the logging configuration -- all logging messages are time stamped and writen out to file -- some messages are also printed to the console -- log files are kept up to 14 days -- logging configuration is read from pyaerocom/data/logging.ini - with default values from LOGGING_CONFIG -""" -from __future__ import annotations - -import logging -import os -import pathlib -import sys -import time -from logging.config import fileConfig - -import resources - - -def change_verbosity(level: str | int) -> None: - """ - Change logging verbosity (to console) - - Parameters - ---------- - level: str or int - new `logging level`_ - - Returns - ------- - None - - """ - if isinstance(level, str): - level = level.upper() - - if isinstance(level, int) and not (logging.DEBUG <= level <= logging.CRITICAL): - raise ValueError( - f"invalid logging level {level}, choose a value between {logging.DEBUG} and {logging.CRITICAL}" - ) - - logger = logging.getLogger("") - assert logger.handlers, f"{logger.name} logger has not been configured correctly" - for handler in logger.handlers: - if type(handler) == logging.StreamHandler: - handler.setLevel(level) - - -LOGGING_CONFIG = dict( - # root logger - file_name=os.getenv( - "PYAEROCOM_LOG_FILE", default=f"logs/pyaerocom.log.{os.getpid()}" - ), - pid=os.getpid(), -) -cwd_log_path = pathlib.Path.cwd() / "logging.ini" -if cwd_log_path.exists(): - fileConfig(cwd_log_path, defaults=LOGGING_CONFIG, disable_existing_loggers=True) -else: - file_name = pathlib.Path(LOGGING_CONFIG["file_name"]) - log_path = file_name.parent - log_path.mkdir(exist_ok=True, parents=True) - with resources.path("pyaerocom", "logging.ini") as path: - fileConfig(path, defaults=LOGGING_CONFIG, disable_existing_loggers=False) - if not sys.stdout.isatty(): # disable stdout when non-interactive - change_verbosity(logging.CRITICAL) - # cleanup of old default logging files - now = time.time() - logger = logging.getLogger(__name__) - for f in log_path.glob("pyaerocom.log.*"): - age = now - f.lstat().st_mtime - if age > (7 * 24 * 60 * 60): - logger.info(f"deleting log-file older than 7 days: {f}") - f.unlink() - old_logfile = pathlib.Path("pyaerocom.log") - if old_logfile.exists(): - logger.warning( - f"no longer used old default logfile '{old_logfile}' exist, 
please consider deleting" - ) diff --git a/src/pyaro_readers/nilupmfebas/_warnings.py b/src/pyaro_readers/nilupmfebas/_warnings.py deleted file mode 100644 index be99cec..0000000 --- a/src/pyaro_readers/nilupmfebas/_warnings.py +++ /dev/null @@ -1,62 +0,0 @@ -import warnings -from contextlib import contextmanager - - -@contextmanager -def ignore_warnings(category: type[Warning], *messages: str): - """ - Ignore particular warnings with a decorator or context manager - - Parameters - ---------- - category : subclass of Warning - warning category to be ignored. E.g. UserWarning, DeprecationWarning. - The default is Warning. - *messages : str, optional - warning messages to be ignored. E.g. - ignore_warnings(Warning, 'Warning that can safely be ignored', 'Other warning to ignore'). - For each - `` :func:`warnigns.filterwarnings('ignore', Warning, message=)` - is called. - - Example - ------- - @ignore_warnings(UserWarning) - @ignore_warnings(DeprecationWarning) - @ignore_warnings(Warning, 'I REALLY') - def warn_randomly_and_add_numbers(num1, num2): - warnings.warn(UserWarning('Harmless user warning')) - warnings.warn(DeprecationWarning('This function is deprecated')) - warnings.warn(Warning('I REALLY NEED TO REACH YOU')) - return num1+num2 - - """ - if not issubclass(category, Warning): - raise ValueError("category must be a Warning subclass") - - if not messages: - message = "" - elif all(isinstance(msg, str) for msg in messages): - message = "|".join(messages) - else: - raise ValueError("messages must be list of strings") - - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=category, message=message) - yield - - -def ignore_basemap_warning(): # pragma: no cover - warnings.filterwarnings( - "ignore", r".*install Basemap$", UserWarning, "geonum", append=True - ) - - -def ignore_earth_radius_warning(): # pragma: no cover - warnings.filterwarnings( - "ignore", - "Using DEFAULT_SPHERICAL_EARTH_RADIUS", - UserWarning, - "iris.*", - append=True, - ) diff --git a/src/pyaro_readers/nilupmfebas/aerocom_browser.py b/src/pyaro_readers/nilupmfebas/aerocom_browser.py deleted file mode 100644 index b45b270..0000000 --- a/src/pyaro_readers/nilupmfebas/aerocom_browser.py +++ /dev/null @@ -1,215 +0,0 @@ -import fnmatch -import logging -import os -import re - -from . import const -from ._lowlevel_helpers import BrowseDict -from .exceptions import DataSearchError - -logger = logging.getLogger(__name__) - - -class AerocomBrowser(BrowseDict): - """Interface for browsing all Aerocom data direcories - - Note - ---- - Use :func:`browse` to find directories matching a - certain search pattern. - The class methods :func:`find_matches` and :func:`find_dir` both use - :func:`browse`, the only difference is, that the :func:`find_matches` adds - the search result (a list with strings) to - - """ - - def _browse(self, name_or_pattern, ignorecase=True, return_if_match=True): - """Search all Aerocom data directories that match input name or pattern - - Note - ---- - Please do not use this function but either - Parameters - ---------- - name_or_pattern : str - name or pattern of data (can be model or obs data) - ignorecase : bool - if True, upper / lower case is ignored - return_if_match : bool - if True, then the data directory is returned as string, if it can - be found, else, only a list is returned that contains all - matches. The latter takes longer since the whole database is - searched. 
- - Returns - ------- - :obj:`str` or :obj:`list` - Data directory (str, if ``return_if_match`` is True) or list - containing valid Aerocom names (which can then be used to - retrieve the paths) - - Raises - ------ - DataSearchError - if no match or no unique match can be found - """ - pattern = fnmatch.translate(name_or_pattern) - _candidates = [] - _msgs = [] - _warnings = [] - - for obs_id, obs_path in const.OBSLOCS_UNGRIDDED.items(): - if ignorecase: - match = name_or_pattern.lower() == obs_id.lower() - else: - match = name_or_pattern == obs_id - if match: - logger.info( - f"Found match for search pattern in obs network directories {obs_id}" - ) - path = os.path.normpath(obs_path) - if os.path.exists(path): - self[obs_id] = path - _candidates.append(obs_id) - if return_if_match: - return path - else: - if ignorecase: - match = bool(re.search(pattern, obs_id, re.IGNORECASE)) - else: - match = bool(re.search(pattern, obs_id)) - if match: - path = os.path.normpath(obs_path) - if os.path.exists(path): - self[obs_id] = path - _candidates.append(obs_id) - if return_if_match: - return path - - for search_dir in const.DATA_SEARCH_DIRS: - # get the directories - if os.path.isdir(search_dir): - # subdirs = listdir(search_dir) - subdirs = [ - x - for x in os.listdir(search_dir) - if os.path.isdir(os.path.join(search_dir, x)) - ] - for subdir in subdirs: - if ignorecase: - match = bool(re.search(pattern, subdir, re.IGNORECASE)) - else: - match = bool(re.search(pattern, subdir)) - if match: - _dir = os.path.normpath(os.path.join(search_dir, subdir)) - _rnsubdir = os.path.join(_dir, "renamed") - if os.path.isdir(_rnsubdir): - logger.info(f"{_dir} has subdir renamed. Using that one") - _dir = _rnsubdir - if any([_dir in x for x in self.values()]): - # directory was already found before - continue - # append name of candidate ... - _candidates.append(subdir) - # ... and the corresponding data directory - self[subdir] = _dir - - # now check if it is actually an exact match, if - # applicable - if return_if_match: - if ignorecase: - match = name_or_pattern.lower() == subdir.lower() - else: - match = name_or_pattern == subdir - if match: - logger.info(f"Found match for ID {name_or_pattern}") - if return_if_match: - return _dir - - else: - _msgs.append("directory %s does not exist\n" % search_dir) - for msg in _msgs: - logger.info(msg) - - for warning in _warnings: - logger.warning(warning) - - if len(_candidates) == 0: - raise DataSearchError( - f"No matches could be found for search pattern {name_or_pattern}" - ) - if return_if_match: - if len(_candidates) == 1: - logger.info( - f"Found exactly one match for search pattern " - f"{name_or_pattern}: {_candidates[0]}" - ) - return self[_candidates[0]] - raise DataSearchError( - f"Found multiple matches for search pattern {name_or_pattern}. 
" - f"Please choose from {_candidates}" - ) - return _candidates - - @property - def dirs_found(self): - """All directories that were found""" - return list(self.values()) - - @property - def ids_found(self): - """All data IDs that were found""" - return list(self) - - def find_data_dir(self, name_or_pattern, ignorecase=True): - """Find match of input name or pattern in Aerocom database - - Parameters - ---------- - name_or_pattern : str - name or pattern of data (can be model or obs data) - ignorecase : bool - if True, upper / lower case is ignored - - Returns - ------- - str - data directory of match - - Raises - ------ - DataSearchError - if no matches or no unique match can be found - """ - if name_or_pattern in self: - logger.info(f"{name_or_pattern} found in instance of AerocomBrowser") - return self[name_or_pattern] - logger.info(f"Searching database for {name_or_pattern}") - return self._browse( - name_or_pattern, ignorecase=ignorecase, return_if_match=True - ) # returns list - - def find_matches(self, name_or_pattern, ignorecase=True): - """Search all Aerocom data directories that match input name or pattern - - Parameters - ---------- - name_or_pattern : str - name or pattern of data (can be model or obs data) - ignorecase : bool - if True, upper / lower case is ignored - - Returns - ------- - list - list of names that match the pattern (corresponding paths can be - accessed from this class instance) - - Raises - ------ - DataSearchError - if no matches can be found - """ - return self._browse( - name_or_pattern, ignorecase=ignorecase, return_if_match=False - ) # returns list diff --git a/src/pyaro_readers/nilupmfebas/aux_var_helpers.py b/src/pyaro_readers/nilupmfebas/aux_var_helpers.py deleted file mode 100644 index 87674ac..0000000 --- a/src/pyaro_readers/nilupmfebas/aux_var_helpers.py +++ /dev/null @@ -1,848 +0,0 @@ -import cf_units -import numpy as np - -from . import const -from .variable_helpers import get_variable - - -def calc_ang4487aer(data): - """Compute Angstrom coefficient (440-870nm) from 440 and 870 nm AODs - - Parameters - ---------- - data : dict-like - data object containing imported results - - Note - ---- - Requires the following two variables to be available in provided data - object: - - 1. od440aer - 2. 
od870aer - - Raises - ------ - AttributError - if either 'od440aer' or 'od870aer' are not available in data object - - Returns - ------- - ndarray - array containing computed angstrom coefficients - - - """ - if not all([x in data for x in ["od440aer", "od870aer"]]): - raise AttributeError( - "Either of the two (or both) required variables " - "(od440aer, od870aer) are not available in data" - ) - od440aer, od870aer = data["od440aer"], data["od870aer"] - return compute_angstrom_coeff(od440aer, od870aer, 0.44, 0.87) - - -def calc_od550aer(data): - """Compute AOD at 550 nm using Angstrom coefficient and 500 nm AOD - - Parameters - ---------- - data : dict-like - data object containing imported results - - Returns - ------- - :obj:`float` or :obj:`ndarray` - AOD(s) at shifted wavelength - """ - return _calc_od_helper( - data=data, - var_name="od550aer", - to_lambda=0.55, - od_ref="od500aer", - lambda_ref=0.50, - od_ref_alt="od440aer", - lambda_ref_alt=0.44, - use_angstrom_coeff="ang4487aer", - ) - - -def calc_abs550aer(data): - """Compute AOD at 550 nm using Angstrom coefficient and 500 nm AOD - - Parameters - ---------- - data : dict-like - data object containing imported results - - Returns - ------- - :obj:`float` or :obj:`ndarray` - AOD(s) at shifted wavelength - """ - return _calc_od_helper( - data=data, - var_name="abs550aer", - to_lambda=0.55, - od_ref="abs500aer", - lambda_ref=0.50, - od_ref_alt="abs440aer", - lambda_ref_alt=0.44, - use_angstrom_coeff="angabs4487aer", - ) - - -def calc_od550gt1aer(data): - """Compute coarse mode AOD at 550 nm using Angstrom coeff. and 500 nm AOD - - Parameters - ---------- - data : dict-like - data object containing imported results - - Returns - ------- - float or ndarray - AOD(s) at shifted wavelength - """ - return _calc_od_helper( - data=data, - var_name="od550gt1aer", - to_lambda=0.55, - od_ref="od500gt1aer", - lambda_ref=0.50, - use_angstrom_coeff="ang4487aer", - ) - - -def calc_od550lt1aer(data): - """Compute fine mode AOD at 550 nm using Angstrom coeff. and 500 nm AOD - - Parameters - ---------- - data : dict-like - data object containing imported results - - Returns - ------- - :obj:`float` or :obj:`ndarray` - AOD(s) at shifted wavelength - """ - return _calc_od_helper( - data=data, - var_name="od550lt1aer", - to_lambda=0.55, - od_ref="od500lt1aer", - lambda_ref=0.50, - use_angstrom_coeff="ang4487aer", - ) - - -def calc_od550lt1ang(data): - """Compute AOD at 550 nm using Angstrom coeff. and 500 nm AOD, - that is filtered for angstrom coeff < 1 to get AOD representative - of coarse particles. 
- - Parameters - ---------- - data : dict-like - data object containing imported results - - Returns - ------- - :obj:`float` or :obj:`ndarray` - AOD(s) at shifted wavelength - """ - - return _calc_od_helper( - data=data, - var_name="od550lt1ang", - to_lambda=0.55, - od_ref="od500aer", - lambda_ref=0.50, - od_ref_alt="od440aer", - lambda_ref_alt=0.44, - use_angstrom_coeff="ang4487aer", - treshold_angstrom=1.0, - ) - - -def compute_angstrom_coeff(od1, od2, lambda1, lambda2): - """Compute Angstrom coefficient based on 2 optical densities - - Parameters - ---------- - od1 : :obj:`float` or :obj:`ndarray` - AOD at wavelength 1 - od2 : :obj:`float` or :obj:`ndarray` - AOD at wavelength 2 - lambda1 : :obj:`float` or :obj:`ndarray` - wavelength 1 - lambda 2 : :obj:`float` or :obj:`ndarray` - wavelength 2 - - Returns - ------- - :obj:`float` or :obj:`ndarray` - Angstrom exponent(s) - """ - return -np.log(od1 / od2) / np.log(lambda1 / lambda2) - - -def compute_od_from_angstromexp(to_lambda, od_ref, lambda_ref, angstrom_coeff): - """Compute AOD at specified wavelength - - Uses Angstrom coefficient and reference AOD to compute the - corresponding wavelength shifted AOD - - Parameters - ---------- - to_lambda : :obj:`float` or :obj:`ndarray` - wavelength for which AOD is calculated - od_ref : :obj:`float` or :obj:`ndarray` - reference AOD - lambda_ref : :obj:`float` or :obj:`ndarray` - wavelength corresponding to reference AOD - angstrom_coeff : :obj:`float` or :obj:`ndarray` - Angstrom coefficient - - Returns - ------- - :obj:`float` or :obj:`ndarray` - AOD(s) at shifted wavelength - - """ - return od_ref * (lambda_ref / to_lambda) ** angstrom_coeff - - -def _calc_od_helper( - data, - var_name, - to_lambda, - od_ref, - lambda_ref, - od_ref_alt=None, - lambda_ref_alt=None, - use_angstrom_coeff="ang4487aer", - treshold_angstrom=None, -): - """Helper method for computing ODs - - Parameters - ---------- - data : dict-like - data object containing loaded results used to compute the ODs at a new - wavelength - var_name : str - name of variable that is supposed to be computed (is used in order to - see whether a global lower threshold is defined for this variable and - if this is the case, all computed values that are below this threshold - are replaced with NaNs) - to_lambda : float - wavelength of computed AOD - od_ref : str - name of reference AOD in data - lambda_ref : :obj:`float` or :obj:`ndarray` - wavelength corresponding to reference AOD - od_ref_alt : str - alternative reference AOD (is used for datapoints where former is - invalid) - lambda_ref_alt : :obj:`float` or :obj:`ndarray`, optional - wavelength corresponding to alternative reference AOD - use_angstrom_coeff : str - name of Angstrom coefficient in data, that is used for computation - threshold_angstrom : float - filter out observations that have angstrom exponent larger than a set - threshold. 
- - Returns - ------- - :obj:`float` or :obj:`ndarray` - AOD(s) at shifted wavelength - - Raises - ------ - AttributeError - if neither ``od_ref`` nor ``od_ref_alt`` are available in data, or if - ``use_angstrom_coeff`` is missing - """ - if not od_ref in data: - if od_ref_alt is None and not od_ref_alt in data: - raise AttributeError( - f"No alternative OD found for computation of {var_name}" - ) - return compute_od_from_angstromexp( - to_lambda=to_lambda, - od_ref=data[od_ref_alt], - lambda_ref=lambda_ref_alt, - angstrom_coeff=data[use_angstrom_coeff], - ) - elif not use_angstrom_coeff in data: - raise AttributeError( - "Angstrom coefficient (440-870 nm) is not available in provided data" - ) - result = compute_od_from_angstromexp( - to_lambda=to_lambda, - od_ref=data[od_ref], - lambda_ref=lambda_ref, - angstrom_coeff=data[use_angstrom_coeff], - ) - - # optional if available - if od_ref_alt in data: - # fill up time steps that are nans with values calculated from the - # alternative wavelength to minimise gaps in the time series - mask = np.argwhere(np.isnan(result)) - - if len(mask) > 0: # there are nans - ods_alt = data[od_ref_alt][mask] - ang = data[use_angstrom_coeff][mask] - replace = compute_od_from_angstromexp( - to_lambda=to_lambda, - od_ref=ods_alt, - lambda_ref=lambda_ref_alt, - angstrom_coeff=ang, - ) - result[mask] = replace - if treshold_angstrom: - result = np.where(data[use_angstrom_coeff] < treshold_angstrom, result, np.nan) - - return result - - -def compute_ang4470dryaer_from_dry_scat(data): - """Compute angstrom exponent between 440 and 700 nm - - Parameters - ---------- - StationData or dict - data containing dry scattering coefficients at 440 and 700 nm - (i.e. keys sc440dryaer and sc700dryaer) - - Returns - ------- - StationData or dict - extended data object containing angstrom exponent - """ - return compute_angstrom_coeff(data["sc440dryaer"], data["sc700dryaer"], 440, 700) - - -def compute_sc550dryaer(data): - """Compute dry scattering coefficent applying RH threshold - - Cf. :func:`_compute_dry_helper` - - Parameters - ---------- - dict - data object containing scattering and RH data - - Returns - ------- - dict - modified data object containing new column sc550dryaer - - """ - rh_max = get_variable("sc550dryaer").dry_rh_max - vals, rh_mean = _compute_dry_helper( - data, data_colname="sc550aer", rh_colname="scrh", rh_max_percent=rh_max - ) - if not "sc550dryaer" in data.var_info: - data.var_info["sc550dryaer"] = {} - data.var_info["sc550dryaer"]["rh_mean"] = rh_mean - - return vals - - -def compute_sc440dryaer(data): - """Compute dry scattering coefficent applying RH threshold - - Cf. :func:`_compute_dry_helper` - - Parameters - ---------- - dict - data object containing scattering and RH data - - Returns - ------- - dict - modified data object containing new column sc550dryaer - - """ - rh_max = get_variable("sc440dryaer").dry_rh_max - return _compute_dry_helper( - data, data_colname="sc440aer", rh_colname="scrh", rh_max_percent=rh_max - )[0] - - -def compute_sc700dryaer(data): - """Compute dry scattering coefficent applying RH threshold - - Cf. 
:func:`_compute_dry_helper` - - Parameters - ---------- - dict - data object containing scattering and RH data - - Returns - ------- - dict - modified data object containing new column sc550dryaer - - """ - rh_max = get_variable("sc700dryaer").dry_rh_max - return _compute_dry_helper( - data, data_colname="sc700aer", rh_colname="scrh", rh_max_percent=rh_max - )[0] - - -def compute_ac550dryaer(data): - """Compute aerosol dry absorption coefficent applying RH threshold - - Cf. :func:`_compute_dry_helper` - - Parameters - ---------- - dict - data object containing scattering and RH data - - Returns - ------- - dict - modified data object containing new column sc550dryaer - - """ - rh_max = get_variable("ac550dryaer").dry_rh_max - return _compute_dry_helper( - data, data_colname="ac550aer", rh_colname="acrh", rh_max_percent=rh_max - )[0] - - -def _compute_dry_helper(data, data_colname, rh_colname, rh_max_percent=None): - """Compute new column that contains data where RH is smaller than ... - - All values in original data columns are set to NaN, where RH exceeds a - certain threshold or where RH is NaN. - - Parameters - ---------- - data : dict-like - dictionary-like object that contains data - data_colname : str - column name of variable data that is supposed to be filtered - rh_colname : str - column name of RH data - rh_max_percent : int - maximum relative humidity - - Returns - ------- - dict - modified data dictionary with new dry data column - """ - if rh_max_percent is None: - rh_max_percent = const.RH_MAX_PERCENT_DRY - - vals = np.array(data[data_colname], copy=True) - - rh = data[rh_colname] - - high_rh = rh > rh_max_percent - rhnan = np.isnan(rh) - vals[high_rh] = np.nan - vals[rhnan] = np.nan - - rh = rh[~rhnan] - lowrh = rh[~high_rh[~rhnan]] - if len(lowrh) > 0: - rh_mean = np.nanmean(lowrh) - else: - rh_mean = np.nan - - return vals, rh_mean - - -def _compute_wdep_from_concprcp_helper(data, wdep_var, concprcp_var, pr_var): - """ - Helper to compute wed deposition from concentration in precipitation - - Note - ---- - In addition to the returned numpy array, the input instance of - :class:`StationData` is modified by additional metadata and flags for - the new variable. - - Parameters - ---------- - data : StationData - input data containing concentration in precipitation variable and - precipitation variable. Both are needed to be sampled at the same - time and both arrays must have the same lengths. - wdep_var : str - name of output wet deposition variable - concprcp_var : str - name of concenration in precipitation variable - pr_var : str - precipitation variable (needs to be implicit, i.e. in units of mm or - similar and not in units of mm d-1. 
- - Returns - ------- - numpy.ndarray - array with wet deposition values - """ - - vars_needed = (concprcp_var, pr_var) - - if not all(x in data.data_flagged for x in vars_needed): - raise ValueError(f"Need flags for {vars_needed} to compute wet deposition") - from pyaerocom import TsType - from pyaerocom.units_helpers import RATES_FREQ_DEFAULT, get_unit_conversion_fac - - tst = TsType(data.get_var_ts_type(concprcp_var)) - - ival = tst.to_si() - - conc_unit = data.get_unit(concprcp_var) - conc_data = data[concprcp_var] - if not conc_unit.endswith("m-3"): - raise NotImplementedError("Can only handle concprcp unit ending with m-3") - concprcp_flags = data.data_flagged[concprcp_var] - - pr_unit = data.get_unit(pr_var) - if not pr_unit == "m": - data.convert_unit(pr_var, "m") - pr_data = data[pr_var] - pr_flags = data.data_flagged[pr_var] - # check where precip data is zero (it did not rain!) and for each of - # these timestamps, set concprcp to 0 (it should be 0 if there is no - # rain...) and set flags in concprcp to False (these data are to be used - # later) - pr_zero = pr_data == 0 - if pr_zero.sum() > 0: - conc_data[pr_zero] = 0 - concprcp_flags[pr_zero] = False - pr_flags[pr_zero] = False - wdep = conc_data * pr_data - wdep_units = conc_unit.replace("m-3", "m-2") - - if not ival == RATES_FREQ_DEFAULT: - fac = get_unit_conversion_fac(ival, RATES_FREQ_DEFAULT) - wdep /= fac - # in units of ts_type, that is, e.g. kg m-2 d - freq_str = f" {RATES_FREQ_DEFAULT}-1" - wdep_units += freq_str - if not wdep_var in data.var_info: - data.var_info[wdep_var] = {} - data.var_info[wdep_var]["units"] = wdep_units - - # set flags for wetso4 - wdep_flags = np.zeros(len(wdep)).astype(bool) - wdep_flags[concprcp_flags] = True - wdep_flags[pr_flags] = True - data.data_flagged[wdep_var] = wdep_flags - - return wdep - - -def compute_wetoxs_from_concprcpoxs(data): - """Compute wdep from conc in precip and precip data - - Note - ---- - In addition to the returned numpy array, the input instance of - :class:`StationData` is modified by additional metadata and flags for - the new variable. See also :func:`_compute_wdep_from_concprcp_helper`. - - Parameters - ---------- - StationData - data object containing concprcp and precip data - - Returns - ------- - numpy.ndarray - array with wet deposition values - - """ - return _compute_wdep_from_concprcp_helper(data, "wetoxs", "concprcpoxs", "pr") - - -def compute_wetoxs_from_concprcpoxst(data): - """Compute wdep from conc in precip and precip data - - Note - ---- - In addition to the returned numpy array, the input instance of - :class:`StationData` is modified by additional metadata and flags for - the new variable. See also :func:`_compute_wdep_from_concprcp_helper`. - - Parameters - ---------- - StationData - data object containing concprcp and precip data - - Returns - ------- - numpy.ndarray - array with wet deposition values - - """ - return _compute_wdep_from_concprcp_helper(data, "wetoxs", "concprcpoxst", "pr") - - -def compute_wetoxs_from_concprcpoxsc(data): - """Compute wdep from conc in precip and precip data - - Note - ---- - In addition to the returned numpy array, the input instance of - :class:`StationData` is modified by additional metadata and flags for - the new variable. See also :func:`_compute_wdep_from_concprcp_helper`. 
- - Parameters - ---------- - StationData - data object containing concprcp and precip data - - Returns - ------- - numpy.ndarray - array with wet deposition values - - """ - return _compute_wdep_from_concprcp_helper(data, "wetoxs", "concprcpoxsc", "pr") - - -def compute_wetoxn_from_concprcpoxn(data): - """Compute wdep from conc in precip and precip data - - Note - ---- - In addition to the returned numpy array, the input instance of - :class:`StationData` is modified by additional metadata and flags for - the new variable. See also :func:`_compute_wdep_from_concprcp_helper`. - - Parameters - ---------- - StationData - data object containing concprcp and precip data - - Returns - ------- - numpy.ndarray - array with wet deposition values - - """ - return _compute_wdep_from_concprcp_helper(data, "wetoxn", "concprcpoxn", "pr") - - -def compute_wetrdn_from_concprcprdn(data): - """Compute wdep from conc in precip and precip data - - Note - ---- - In addition to the returned numpy array, the input instance of - :class:`StationData` is modified by additional metadata and flags for - the new variable. See also :func:`_compute_wdep_from_concprcp_helper`. - - Parameters - ---------- - StationData - data object containing concprcp and precip data - - Returns - ------- - numpy.ndarray - array with wet deposition values - - """ - return _compute_wdep_from_concprcp_helper(data, "wetrdn", "concprcprdn", "pr") - - -def compute_wetnh4_from_concprcpnh4(data): - return _compute_wdep_from_concprcp_helper(data, "wetnh4", "concprcpnh4", "pr") - - -def compute_wetno3_from_concprcpno3(data): - return _compute_wdep_from_concprcp_helper(data, "wetno3", "concprcpno3", "pr") - - -def compute_wetso4_from_concprcpso4(data): - return _compute_wdep_from_concprcp_helper(data, "wetso4", "concprcpso4", "pr") - - -def compute_wetna_from_concprcpna(data): - return _compute_wdep_from_concprcp_helper(data, "wetna", "concprcpna", "pr") - - -def vmrx_to_concx( - data, p_pascal, T_kelvin, vmr_unit, mmol_var, mmol_air=None, to_unit=None -): - """ - Convert volume mixing ratio (vmr) to mass concentration - - Parameters - ---------- - data : float or ndarray - array containing vmr values - p_pascal : float - pressure in Pa of input data - T_kelvin : float - temperature in K of input data - vmr_unit : str - unit of input data - mmol_var : float - molar mass of variable represented by input data - mmol_air : float, optional - Molar mass of air. Uses average density of dry air if None. - The default is None. - to_unit : str, optional - Unit to which output data is converted. If None, output unit is - kg m-3. The default is None. 
- - Returns - ------- - float or ndarray - input data converted to mass concentration - - """ - if mmol_air is None: - from pyaerocom.molmasses import get_molmass - - mmol_air = get_molmass("air_dry") - - Rspecific = 287.058 # J kg-1 K-1 - - conversion_fac = 1 / cf_units.Unit("mol mol-1").convert(1, vmr_unit) - - airdensity = p_pascal / (Rspecific * T_kelvin) # kg m-3 - mulfac = mmol_var / mmol_air * airdensity # kg m-3 - conc = data * mulfac # kg m-3 - if to_unit is not None: - conversion_fac *= cf_units.Unit("kg m-3").convert(1, to_unit) - if not np.isclose(conversion_fac, 1, rtol=1e-7): - conc *= conversion_fac - return conc - - -def concx_to_vmrx( - data, p_pascal, T_kelvin, conc_unit, mmol_var, mmol_air=None, to_unit=None -): - """ - Convert mass concentration to volume mixing ratio (vmr) - - Parameters - ---------- - data : float or ndarray - array containing vmr values - p_pascal : float - pressure in Pa of input data - T_kelvin : float - temperature in K of input data - vmr_unit : str - unit of input data - mmol_var : float - molar mass of variable represented by input data - mmol_air : float, optional - Molar mass of air. Uses average density of dry air if None. - The default is None. - to_unit : str, optional - Unit to which output data is converted. If None, output unit is - kg m-3. The default is None. - - Returns - ------- - float or ndarray - input data converted to volume mixing ratio - - """ - if mmol_air is None: - from pyaerocom.molmasses import get_molmass - - mmol_air = get_molmass("air_dry") - - Rspecific = 287.058 # J kg-1 K-1 - - conversion_fac = 1 / cf_units.Unit("kg m-3").convert(1, conc_unit) - - airdensity = p_pascal / (Rspecific * T_kelvin) # kg m-3 - mulfac = mmol_var / mmol_air * airdensity # kg m-3 - vmr = data / mulfac # unitless - if to_unit is not None: - conversion_fac *= cf_units.Unit("mole mole-1").convert(1, to_unit) - if not np.isclose(conversion_fac, 1, rtol=1e-7): - vmr *= conversion_fac - return vmr - - -def calc_vmro3max(data): - var_name = "vmro3" - new_var_name = "vmro3max" - - flags = data.data_flagged[var_name] - - o3max = data[var_name] - - units = data.var_info[var_name]["units"] - # data.var_info[new_var_name]["units"] = units - - if not new_var_name in data.var_info: - data.var_info[new_var_name] = {} - - data.var_info[new_var_name] = data.var_info[var_name] - - data.data_flagged[new_var_name] = flags - # print(data.var_info) - # exit() - return o3max - - -def identity(data): - return data - - -def make_proxy_drydep_from_O3(data): - # sort of prototype to add a compted variable - # one has to extend the data structures of the station data object - # 'right', but has to return just the data array - # That concept is a bit confusing (why not do everything in data here?) - var_name = "vmro3" - new_var_name = "proxydryo3" - - flags = data.data_flagged[var_name] - new_var_data = data[var_name] - units = data.var_info[var_name]["units"] - # data.var_info[new_var_name]["units"] = units - - if not new_var_name in data.var_info: - data.var_info[new_var_name] = {} - data.var_info[new_var_name] = data.var_info[var_name] - data.var_info[new_var_name]["units"] = "mg m-2 d-1" - - data.data_flagged[new_var_name] = flags - return new_var_data - - -def make_proxy_wetdep_from_O3(data): - # sort of prototype to add a compted variable - # one has to extend the data structures of the station data object - # 'right', but has to return just the data array - # That concept is a bit confusing (why not do everything in data here?) 
- var_name = "vmro3" - new_var_name = "proxyweto3" - - flags = data.data_flagged[var_name] - new_var_data = data[var_name] - units = data.var_info[var_name]["units"] - # data.var_info[new_var_name]["units"] = units - - if not new_var_name in data.var_info: - data.var_info[new_var_name] = {} - data.var_info[new_var_name] = data.var_info[var_name] - data.var_info[new_var_name]["units"] = "mg m-2 d-1" - - data.data_flagged[new_var_name] = flags - return new_var_data diff --git a/src/pyaro_readers/nilupmfebas/colocation.py b/src/pyaro_readers/nilupmfebas/colocation.py deleted file mode 100644 index 0f032ec..0000000 --- a/src/pyaro_readers/nilupmfebas/colocation.py +++ /dev/null @@ -1,1093 +0,0 @@ -""" -Methods and / or classes to perform colocation -""" - -import logging -import os - -import numpy as np -import pandas as pd -import xarray as xr -from geonum.atmosphere import pressure - -from pyaerocom import __version__ as pya_ver -from pyaerocom import const -from pyaerocom.colocateddata import ColocatedData -from pyaerocom.exceptions import ( - DataUnitError, - DimensionOrderError, - MetaDataError, - TemporalResolutionError, - TimeMatchError, - VariableDefinitionError, - VarNotAvailableError, -) -from pyaerocom.filter import Filter -from pyaerocom.helpers import ( - get_lowest_resolution, - isnumeric, - make_datetime_index, - to_pandas_timestamp, -) -from pyaerocom.time_resampler import TimeResampler -from pyaerocom.tstype import TsType -from pyaerocom.variable import Variable - -logger = logging.getLogger(__name__) - - -def resolve_var_name(data): - """ - Check variable name of `GriddedData` against AeroCom default - - Checks whether the variable name set in the data corresponds to the - AeroCom variable name, or whether it is an alias. Returns both the - variable name set and the AeroCom variable name. - - Parameters - ---------- - data : GriddedData - Data to be checked. - - Returns - ------- - str - variable name as set in data (may be alias, but may also be AeroCom - variable name, in which case first and second return parameter are the - same). - str - corresponding AeroCom variable name - - """ - - var = data.var_name - try: - vardef = const.VARS[var] - except VariableDefinitionError: - vardef = data.register_var_glob() - - return (var, vardef.var_name_aerocom) - - -def _regrid_gridded(gridded, regrid_scheme, regrid_res_deg): - """ - Regrid instance of `GriddedData` - - Makes sure to handle different input options for `regrid_res_deg`. - - Parameters - ---------- - gridded : GriddedData - instance of :class:`GriddedData` that is supposed to be regridded. - regrid_scheme : str - iris scheme used for regridding (defaults to area weighted regridding) - regrid_res_deg : int or dict, optional - regrid resolution in degrees. If specified, the input gridded data - objects will be regridded in lon / lat dimension to the input - resolution (if input is integer, both lat and lon are regridded to that - resolution, if input is dict, use keys `lat_res_deg` and `lon_res_deg` - to specify regrid resolutions, respectively). - - Raises - ------ - ValueError - If input for `regrid_res_deg` is invalid. - - Returns - ------- - GriddedData - regridded data object - - """ - if not isinstance(regrid_res_deg, dict): - if not isnumeric(regrid_res_deg): - raise ValueError( - "Invalid input for regrid_res_deg. 
Need integer " - "or dict specifying lat and lon res" - ) - regrid_res_deg = dict(lat_res_deg=regrid_res_deg, lon_res_deg=regrid_res_deg) - - return gridded.regrid(scheme=regrid_scheme, **regrid_res_deg) - - -def _ensure_gridded_gridded_same_freq(data, data_ref, min_num_obs, resample_how): - """ - Make sure 2 input gridded data objects are in the same frequency - - Checks if both input data objects are in the same frequency, and if not, - downsample the one with higher freqency accordingly. - - Parameters - ---------- - data : GriddedData - first data object. - data_ref : GriddedData - second data object. - min_num_obs : int or dict, optional - Minimum number of observations for resampling. - resample_how : str or dict, optional - Resampling aggregators used. - - Returns - ------- - GriddedData - first data object. - GriddedData - second data object. - str - sampling frequency of both data objects. - - """ - ts_type_data = data.ts_type - ts_type_data_ref = data_ref.ts_type - if ts_type_data != ts_type_data_ref: - # ref data is in higher resolution - if TsType(ts_type_data_ref) > TsType(ts_type_data): - data_ref = data_ref.resample_time( - ts_type_data, min_num_obs=min_num_obs, how=resample_how - ) - else: - data = data.resample_time( - ts_type_data_ref, min_num_obs=min_num_obs, how=resample_how - ) - return data, data_ref, data.ts_type - - -def colocate_gridded_gridded( - data, - data_ref, - ts_type=None, - start=None, - stop=None, - filter_name=None, - regrid_res_deg=None, - harmonise_units=True, - regrid_scheme="areaweighted", - update_baseyear_gridded=None, - min_num_obs=None, - colocate_time=False, - resample_how=None, - **kwargs, -): - """Colocate 2 gridded data objects - - Todo - ---- - - think about vertical dimension (vert_scheme input not used at the moment) - - Parameters - ---------- - data : GriddedData - gridded data (e.g. model results) - data_ref : GriddedData - reference data (e.g. gridded satellite object) that is co-located with - `data`. - observation data or other model) - ts_type : str, optional - desired temporal resolution of output colocated data (e.g. "monthly"). - Defaults to None, in which case the highest possible resolution is - used. - start : str or datetime64 or similar, optional - start time for colocation, if None, the start time of the input - :class:`GriddedData` object is used - stop : str or datetime64 or similar, optional - stop time for colocation, if None, the stop time of the input - :class:`GriddedData` object is used - filter_name : str, optional - string specifying filter used (cf. :class:`pyaerocom.filter.Filter` for - details). If None, then it is set to 'ALL-wMOUNTAINS', which - corresponds to no filtering (world with mountains). - Use ALL-noMOUNTAINS to exclude mountain sites. - regrid_res_deg : int or dict, optional - regrid resolution in degrees. If specified, the input gridded data - objects will be regridded in lon / lat dimension to the input - resolution (if input is integer, both lat and lon are regridded to that - resolution, if input is dict, use keys `lat_res_deg` and `lon_res_deg` - to specify regrid resolutions, respectively). - harmonise_units : bool - if True, units are attempted to be harmonised (note: raises Exception - if True and units cannot be harmonised). Defaults to True. 
- regrid_scheme : str - iris scheme used for regridding (defaults to area weighted regridding) - update_baseyear_gridded : int, optional - optional input that can be set in order to redefine the time dimension - in the first gridded data object `data`to be analysed. E.g., if the - data object is a climatology (one year of data) that has set the base - year of the time dimension to a value other than the specified input - start / stop time this may be used to update the time in order to make - co-location possible. - min_num_obs : int or dict, optional - minimum number of observations for resampling of time - colocate_time : bool - if True and if original time resolution of data is higher than desired - time resolution (`ts_type`), then both datasets are colocated in time - *before* resampling to lower resolution. - resample_how : str or dict - string specifying how data should be aggregated when resampling in time. - Default is "mean". Can also be a nested dictionary, e.g. - resample_how={'daily': {'hourly' : 'max'}} would use the maximum value - to aggregate from hourly to daily, rather than the mean. - **kwargs - additional keyword args (not used here, but included such that factory - class can handle different methods with different inputs) - - Returns - ------- - ColocatedData - instance of colocated data - - """ - if filter_name is None: - filter_name = const.DEFAULT_REG_FILTER - - if harmonise_units: - if not data.units == data_ref.units: - try: - data_ref.convert_unit(data.units) - except Exception: - raise DataUnitError( - f"Failed to merge data unit of reference gridded data object ({data.units}) " - f"to data unit of gridded data object ({data_ref.units})" - ) - - if update_baseyear_gridded is not None: - # update time dimension in gridded data - data.base_year = update_baseyear_gridded - - if regrid_res_deg is not None: - data_ref = _regrid_gridded(data_ref, regrid_scheme, regrid_res_deg) - # perform regridding - if data.lon_res < data_ref.lon_res: # obs has lower resolution - data = data.regrid(data_ref, scheme=regrid_scheme) - else: - data_ref = data_ref.regrid(data, scheme=regrid_scheme) - - ts_type_src = [data_ref.ts_type, data.ts_type] - # time resolution of dataset to be analysed - data, data_ref, data_ts_type = _ensure_gridded_gridded_same_freq( - data, data_ref, min_num_obs, resample_how - ) - # now both are in same temporal resolution - - # input ts_type is not specified or model is in lower resolution - # than input ts_type -> use model frequency to colocate - if ts_type is None or TsType(data_ts_type) < TsType(ts_type): - ts_type = data_ts_type - - # 1. match model data with potential input start / stop and update if - # applicable - start, stop = check_time_ival(data, start, stop) - # 2. 
narrow it down with obsdata availability, if applicable - start, stop = check_time_ival(data_ref, start, stop) - - data = data.crop(time_range=(start, stop)) - data_ref = data_ref.crop(time_range=(start, stop)) - - # perform region extraction (if applicable) - regfilter = Filter(name=filter_name) - data = regfilter(data) - data_ref = regfilter(data_ref) - - files_ref = [os.path.basename(x) for x in data_ref.from_files] - files = [os.path.basename(x) for x in data.from_files] - - var, var_aerocom = resolve_var_name(data) - var_ref, var_ref_aerocom = resolve_var_name(data_ref) - meta = { - "data_source": [data_ref.data_id, data.data_id], - "var_name": [var_ref_aerocom, var_aerocom], - "var_name_input": [var_ref, var], - "ts_type": data_ts_type, - "filter_name": filter_name, - "ts_type_src": ts_type_src, - "var_units": [str(data_ref.units), str(data.units)], - "data_level": 3, - "revision_ref": data_ref.data_revision, - "from_files": files, - "from_files_ref": files_ref, - "colocate_time": colocate_time, - "obs_is_clim": False, - "pyaerocom": pya_ver, - "min_num_obs": min_num_obs, - "resample_how": resample_how, - } - - data_np = data.grid.data - if isinstance(data_np, np.ma.core.MaskedArray): - data_np = data_np.filled(np.nan) - data_ref_np = data_ref.grid.data - if isinstance(data_ref_np, np.ma.core.MaskedArray): - data_ref_np = data_ref_np.filled(np.nan) - arr = np.asarray((data_ref_np, data_np)) - time = data.time_stamps().astype("datetime64[ns]") - lats = data.latitude.points - lons = data.longitude.points - - # create coordinates of DataArray - coords = { - "data_source": meta["data_source"], - "time": time, - "latitude": lats, - "longitude": lons, - } - - dims = ["data_source", "time", "latitude", "longitude"] - - coldata = ColocatedData( - data=arr, coords=coords, dims=dims, name=data.var_name, attrs=meta - ) - - # add correct units for lat / lon dimensions - coldata.latitude.attrs["standard_name"] = data.latitude.standard_name - coldata.latitude.attrs["units"] = str(data.latitude.units) - - coldata.longitude.attrs["standard_name"] = data.longitude.standard_name - coldata.longitude.attrs["units"] = str(data.longitude.units) - - if data_ts_type != ts_type: - coldata = coldata.resample_time( - to_ts_type=ts_type, - colocate_time=colocate_time, - min_num_obs=min_num_obs, - how=resample_how, - **kwargs, - ) - return coldata - - -def check_time_ival(data, start, stop): - # get start / stop of gridded data as pandas.Timestamp - data_start = to_pandas_timestamp(data.start) - data_stop = to_pandas_timestamp(data.stop) - - if start is None: - start = data_start - else: - start = to_pandas_timestamp(start) - if stop is None: - stop = data_stop - else: - stop = to_pandas_timestamp(stop) - - if start < data_start: - start = data_start - if stop > data_stop: - stop = data_stop - # check overlap - if stop < data_start or start > data_stop: - raise TimeMatchError( - f"Input time range {start}-{stop} does not overlap with data " - f"range: {data_start}-{data_stop}" - ) - return start, stop - - -def check_ts_type(data, ts_type): - ts_type_data = TsType(data.ts_type) - if ts_type is None: - ts_type = ts_type_data - elif isinstance(ts_type, str): - ts_type = TsType(ts_type) - if ts_type > ts_type_data: - # desired output frequency ts_type is higher resolution than frequency - # of data (e.g. 
desired output is hourly but data is daily, update - # output ts_type) - ts_type = ts_type_data - return ts_type, ts_type_data - - -def _colocate_site_data_helper( - stat_data, - stat_data_ref, - var, - var_ref, - ts_type, - resample_how, - min_num_obs, - use_climatology_ref, -): - """ - Helper method that colocates two timeseries from 2 StationData objects - - Used in main loop of :func:`colocate_gridded_ungridded` - - Parameters - ---------- - stat_data : StationData - first data object (usually the one that is to be compared with obs) - stat_data_ref : StationData - second data object (usually obs) - var : str - variable to be used from `stat_data` - var_ref : str - variable to be used from `stat_data_ref` - ts_type : str - output frequency - resample_how : str or dict - string specifying how data should be aggregated when resampling in time. - Default is "mean". Can also be a nested dictionary, e.g. - resample_how={'daily': {'hourly' : 'max'}} would use the maximum value - to aggregate from hourly to daily, rather than the mean. - min_num_obs : int or dict, optional - minimum number of observations for resampling of time - use_climatology_ref : bool - if True, climatological timeseries are used from observations - - Raises - ------ - TemporalResolutionError - if obs sampling frequency is lower than desired output frequency - - Returns - ------- - pandas.DataFrame - dataframe containing the colocated input data (column names are - data and ref) - """ - - # get grid and obs timeseries data (that may be sampled in arbitrary - # time resolution, particularly the obs data) - grid_ts = stat_data.resample_time( - var, ts_type=ts_type, how=resample_how, min_num_obs=min_num_obs, inplace=True - )[var] - - if use_climatology_ref: - obs_ts = stat_data_ref.calc_climatology(var_ref, min_num_obs=min_num_obs)[ - var_ref - ] - else: - obs_ts = stat_data_ref.resample_time( - var_ref, - ts_type=ts_type, - how=resample_how, - min_num_obs=min_num_obs, - inplace=True, - )[var_ref] - - # fill up missing time stamps - return pd.concat([obs_ts, grid_ts], axis=1, keys=["ref", "data"]) - - -def _colocate_site_data_helper_timecol( - stat_data, - stat_data_ref, - var, - var_ref, - ts_type, - resample_how, - min_num_obs, - use_climatology_ref, -): - """ - Helper method that colocates two timeseries from 2 StationData objects - - Other than :func:`_colocate_site_data_helper` this method applies time - colocation in highest possible resolution (used if option `colocate_time` - is active in colocation routine :func:`colocate_gridded_ungridded`). - - Used in main loop of :func:`colocate_gridded_ungridded` - - Parameters - ---------- - stat_data : StationData - first data object (usually the one that is to be compared with obs) - stat_data_ref : StationData - second data object (usually obs) - var : str - variable to be used from `stat_data` - var_ref : str - variable to be used from `stat_data_ref` - ts_type : str - output frequency - resample_how : str or dict - string specifying how data should be aggregated when resampling in time. - Default is "mean". Can also be a nested dictionary, e.g. - resample_how={'daily': {'hourly' : 'max'}} would use the maximum value - to aggregate from hourly to daily, rather than the mean. 
- min_num_obs : int or dict, optional - minimum number of observations for resampling of time - use_climatology_ref : bool - if True, NotImplementedError is raised - - Raises - ------ - TemporalResolutionError - if model or obs sampling frequency is lower than desired output frequency - NotImplementedError - if input arg `use_climatology_ref` is True. - - Returns - ------- - pandas.DataFrame - dataframe containing the colocated input data (column names are - data and ref) - """ - if use_climatology_ref: - raise NotImplementedError( - "Using observation climatology in colocation with option " - "colocate_time=True is not available yet ..." - ) - - grid_tst = stat_data.get_var_ts_type(var) - obs_tst = stat_data_ref.get_var_ts_type(var_ref) - coltst = TsType(get_lowest_resolution(grid_tst, obs_tst)) - # ============================================================================= - # if coltst.mulfac != 1: - # coltst = coltst.next_lower - # ============================================================================= - - stat_data.resample_time( - var_name=var, - ts_type=str(coltst), - how=resample_how, - min_num_obs=min_num_obs, - inplace=True, - ) - - stat_data_ref.resample_time( - var_name=var_ref, - ts_type=str(coltst), - how=resample_how, - min_num_obs=min_num_obs, - inplace=True, - ) - - # Save time indices of the observations and a mask of where it is NaN - obs_idx = stat_data_ref[var_ref].index - obs_isnan = stat_data_ref[var_ref].isnull() - - # now both StationData objects are in the same resolution, but they still - # might have gaps in their time axis, thus concatenate them in a DataFrame, - # which will merge the time index - merged = pd.concat( - [stat_data_ref[var_ref], stat_data[var]], axis=1, keys=["ref", "data"] - ) - # Interpolate the model to the times of the observations - # (for non-standard coltst it could be that 'resample_time' - # has placed the model and observations at different time stamps) - merged = merged.interpolate("index").reindex(obs_idx).loc[obs_idx] - # Set to NaN at times when observations were NaN originally - # (because the interpolation will interpolate the 'ref' column as well) - merged.loc[obs_isnan] = np.nan - # due to interpolation some model values may be NaN, where there is obs - merged.loc[merged.data.isnull()] = np.nan - # Ensure the whole timespan of the model is kept in "merged" - stat_data[var].name = "tmp" - merged = pd.concat([merged, stat_data[var]], axis=1) - merged = merged[["ref", "data"]] - - grid_ts = merged["data"] - obs_ts = merged["ref"] - # invalidate model where obs is NaN (NB: maybe not needed any more?) 
- obsnan = np.isnan(obs_ts.values) - grid_ts[obsnan] = np.nan - - # now resample both to output frequency - resampler = TimeResampler() - obs_ts = resampler.resample( - to_ts_type=ts_type, - input_data=obs_ts, - from_ts_type=coltst, - how=resample_how, - min_num_obs=min_num_obs, - ) - - grid_ts = resampler.resample( - to_ts_type=ts_type, - input_data=grid_ts, - from_ts_type=coltst, - how=resample_how, - min_num_obs=min_num_obs, - ) - # fill up missing time stamps - return pd.concat([obs_ts, grid_ts], axis=1, keys=["ref", "data"]) - - -def colocate_gridded_ungridded( - data, - data_ref, - ts_type=None, - start=None, - stop=None, - filter_name=None, - regrid_res_deg=None, - harmonise_units=True, - regrid_scheme="areaweighted", - var_ref=None, - update_baseyear_gridded=None, - min_num_obs=None, - colocate_time=False, - use_climatology_ref=False, - resample_how=None, - **kwargs, -): - """Colocate gridded with ungridded data (low level method) - - For high-level colocation see :class:`pyaerocom.colocation_auto.Colocator` - and :class:`pyaerocom.colocation_auto.ColocationSetup` - - Note - ---- - Uses the variable that is contained in input :class:`GriddedData` object - (since these objects only contain a single variable). If this variable - is not contained in observation data (or contained but using a different - variable name) you may specify the obs variable to be used via input arg - `var_ref` - - Parameters - ---------- - data : GriddedData - gridded data object (e.g. model results). - data_ref : UngriddedData - ungridded data object (e.g. observations). - ts_type : str - desired temporal resolution of colocated data (must be valid AeroCom - ts_type str such as daily, monthly, yearly.). - start : :obj:`str` or :obj:`datetime64` or similar, optional - start time for colocation, if None, the start time of the input - :class:`GriddedData` object is used. - stop : :obj:`str` or :obj:`datetime64` or similar, optional - stop time for colocation, if None, the stop time of the input - :class:`GriddedData` object is used - filter_name : str - string specifying filter used (cf. :class:`pyaerocom.filter.Filter` for - details). If None, then it is set to 'ALL-wMOUNTAINS', which - corresponds to no filtering (world with mountains). - Use ALL-noMOUNTAINS to exclude mountain sites. - regrid_res_deg : int or dict, optional - regrid resolution in degrees. If specified, the input gridded data - object will be regridded in lon / lat dimension to the input - resolution (if input is integer, both lat and lon are regridded to that - resolution, if input is dict, use keys `lat_res_deg` and `lon_res_deg` - to specify regrid resolutions, respectively). - harmonise_units : bool - if True, units are attempted to be harmonised (note: raises Exception - if True and units cannot be harmonised). - var_ref : :obj:`str`, optional - variable against which data in arg `data` is supposed to be compared. - If None, then the same variable is used (i.e. `data.var_name`). - update_baseyear_gridded : int, optional - optional input that can be set in order to re-define the time dimension - in the gridded data object to be analysed. E.g., if the data object - is a climatology (one year of data) that has set the base year of the - time dimension to a value other than the specified input start / stop - time this may be used to update the time in order to make colocation - possible. 
- min_num_obs : int or dict, optional - minimum number of observations for resampling of time - colocate_time : bool - if True and if original time resolution of data is higher than desired - time resolution (`ts_type`), then both datasets are colocated in time - *before* resampling to lower resolution. - use_climatology_ref : bool - if True, climatological timeseries are used from observations - resample_how : str or dict - string specifying how data should be aggregated when resampling in time. - Default is "mean". Can also be a nested dictionary, e.g. - resample_how={'daily': {'hourly' : 'max'}} would use the maximum value - to aggregate from hourly to daily, rather than the mean. - **kwargs - additional keyword args (passed to - :func:`UngriddedData.to_station_data_all`) - - Returns - ------- - ColocatedData - instance of colocated data - - Raises - ------ - VarNotAvailableError - if grid data variable is not available in ungridded data object - AttributeError - if instance of input :class:`UngriddedData` object contains more than - one dataset - TimeMatchError - if gridded data time range does not overlap with input time range - ColocationError - if none of the data points in input :class:`UngriddedData` matches - the input colocation constraints - """ - if filter_name is None: - filter_name = const.DEFAULT_REG_FILTER - try: - data.check_dimcoords_tseries() - except DimensionOrderError: - data.reorder_dimensions_tseries() - - var, var_aerocom = resolve_var_name(data) - if var_ref is None: - var_ref = var_aerocom - var_ref_aerocom = var_aerocom - else: - var_ref_aerocom = const.VARS[var_ref].var_name_aerocom - - if not var_ref in data_ref.contains_vars: - raise VarNotAvailableError( - f"Variable {var_ref} is not available in ungridded " - f"data (which contains {data_ref.contains_vars})" - ) - elif len(data_ref.contains_datasets) > 1: - raise AttributeError( - f"Colocation can only be performed with ungridded data objects " - f"that only contain a single dataset (input data contains: " - f"{data_ref.contains_datasets}. Use method `extract_dataset` of " - f"UngriddedData object to extract single datasets." 
- ) - - dataset_ref = data_ref.contains_datasets[0] - - if update_baseyear_gridded is not None: - # update time dimension in gridded data - data.base_year = update_baseyear_gridded - - # apply region filter to data - regfilter = Filter(name=filter_name) - data_ref = regfilter.apply(data_ref) - data = regfilter.apply(data) - - # check time overlap and crop model data if needed - start, stop = check_time_ival(data, start, stop) - data = data.crop(time_range=(start, stop)) - - if regrid_res_deg is not None: - data = _regrid_gridded(data, regrid_scheme, regrid_res_deg) - - # Special ts_typs for which all stations with ts_type< are removed - reduce_station_data_ts_type = ts_type - - ts_type_src_data = data.ts_type - ts_type, ts_type_data = check_ts_type(data, ts_type) - if not colocate_time and ts_type < ts_type_data: - data = data.resample_time( - str(ts_type), min_num_obs=min_num_obs, how=resample_how - ) - ts_type_data = ts_type - - if use_climatology_ref: - col_freq = "monthly" - obs_start = const.CLIM_START - obs_stop = const.CLIM_STOP - else: - col_freq = str(ts_type) - obs_start = start - obs_stop = stop - - # colocation frequency - col_tst = TsType(col_freq) - - latitude = data.latitude.points - longitude = data.longitude.points - lat_range = [np.min(latitude), np.max(latitude)] - lon_range = [np.min(longitude), np.max(longitude)] - # use only sites that are within model domain - - # filter_by_meta wipes is_vertical_profile - data_ref = data_ref.filter_by_meta(latitude=lat_range, longitude=lon_range) - - # get timeseries from all stations in provided time resolution - # (time resampling is done below in main loop) - all_stats = data_ref.to_station_data_all( - vars_to_convert=var_ref, - start=obs_start, - stop=obs_stop, - by_station_name=True, - ts_type_preferred=reduce_station_data_ts_type, - **kwargs, - ) - - obs_stat_data = all_stats["stats"] - ungridded_lons = all_stats["longitude"] - ungridded_lats = all_stats["latitude"] - - if len(obs_stat_data) == 0: - raise VarNotAvailableError( - f"Variable {var_ref} is not available in specified time interval ({start}-{stop})" - ) - - grid_stat_data = data.to_time_series( - longitude=ungridded_lons, latitude=ungridded_lats - ) - - pd_freq = col_tst.to_pandas_freq() - time_idx = make_datetime_index(start, stop, pd_freq) - - time_num = len(time_idx) - stat_num = len(obs_stat_data) - - arr = np.full((2, time_num, stat_num), np.nan) - - lons = [np.nan] * stat_num - lats = [np.nan] * stat_num - alts = [np.nan] * stat_num - station_names = [""] * stat_num - - data_ref_unit = None - ts_type_src_ref = None - if not harmonise_units: - data_unit = str(data.units) - else: - data_unit = None - - # loop over all stations and append to colocated data object - for i, obs_stat in enumerate(obs_stat_data): - # Add coordinates to arrays required for xarray.DataArray below - lons[i] = obs_stat.longitude - lats[i] = obs_stat.latitude - alts[i] = obs_stat.altitude - station_names[i] = obs_stat.station_name - - # ToDo: consider removing to keep ts_type_src_ref (this was probably - # introduced for EBAS were the original data frequency is not constant - # but can vary from site to site) - if ts_type_src_ref is None: - ts_type_src_ref = obs_stat["ts_type_src"] - elif obs_stat["ts_type_src"] != ts_type_src_ref: - spl = ts_type_src_ref.split(";") - if not obs_stat["ts_type_src"] in spl: - spl.append(obs_stat["ts_type_src"]) - ts_type_src_ref = ";".join(spl) - - if data_ref_unit is None: - try: - data_ref_unit = obs_stat["var_info"][var_ref]["units"] - except KeyError 
as e: # variable information or unit is not defined - logger.exception(repr(e)) - try: - unit = obs_stat["var_info"][var_ref]["units"] - except Exception: - unit = None - if not unit == data_ref_unit: - raise ValueError( - f"Cannot perform colocation. " - f"Ungridded data object contains different units ({var_ref})" - ) - # get observations (Note: the index of the observation time series - # is already in the specified frequency format, and thus, does not - # need to be updated, for details (or if errors occur), cf. - # UngriddedData.to_station_data, where the conversion happens) - - # get model station data - grid_stat = grid_stat_data[i] - if harmonise_units: - grid_unit = grid_stat.get_unit(var) - obs_unit = obs_stat.get_unit(var_ref) - if not grid_unit == obs_unit: - grid_stat.convert_unit(var, obs_unit) - if data_unit is None: - data_unit = obs_unit - - try: - if colocate_time: - _df = _colocate_site_data_helper_timecol( - stat_data=grid_stat, - stat_data_ref=obs_stat, - var=var, - var_ref=var_ref, - ts_type=col_freq, - resample_how=resample_how, - min_num_obs=min_num_obs, - use_climatology_ref=use_climatology_ref, - ) - else: - _df = _colocate_site_data_helper( - stat_data=grid_stat, - stat_data_ref=obs_stat, - var=var, - var_ref=var_ref, - ts_type=col_freq, - resample_how=resample_how, - min_num_obs=min_num_obs, - use_climatology_ref=use_climatology_ref, - ) - - # this try/except block was introduced on 23/2/2021 as temporary fix from - # v0.10.0 -> v0.10.1 as a result of multi-weekly obsdata (EBAS) that - # can end up resulting in incorrect number of timestamps after resampling - # (the error was discovered using EBASMC, concpm10, 2019 and colocation - # frequency monthly) - try: - # assign the unified timeseries data to the colocated data array - arr[0, :, i] = _df["ref"].values - arr[1, :, i] = _df["data"].values - except ValueError: - try: - mask = _df.index.intersection(time_idx) - _df = _df.loc[mask] - arr[0, :, i] = _df["ref"].values - arr[1, :, i] = _df["data"].values - except ValueError as e: - logger.warning( - f"Failed to colocate time for station {obs_stat.station_name}. " - f"This station will be skipped (error: {e})" - ) - except TemporalResolutionError as e: - # resolution of obsdata is too low - logger.warning( - f"{var_ref} data from site {obs_stat.station_name} will " - f"not be added to ColocatedData. 
Reason: {e}" - ) - try: - revision = data_ref.data_revision[dataset_ref] - except Exception: - try: - revision = data_ref._get_data_revision_helper(dataset_ref) - except MetaDataError: - revision = "MULTIPLE" - except Exception: - revision = "n/a" - - files = [os.path.basename(x) for x in data.from_files] - - meta = { - "data_source": [dataset_ref, data.data_id], - "var_name": [var_ref_aerocom, var_aerocom], - "var_name_input": [var_ref, var], - "ts_type": col_freq, # will be updated below if resampling - "filter_name": filter_name, - "ts_type_src": [ts_type_src_ref, ts_type_src_data], - "var_units": [data_ref_unit, data_unit], - "data_level": 3, - "revision_ref": revision, - "from_files": files, - "from_files_ref": None, - "colocate_time": colocate_time, - "obs_is_clim": use_climatology_ref, - "pyaerocom": pya_ver, - "min_num_obs": min_num_obs, - "resample_how": resample_how, - } - - # create coordinates of DataArray - coords = { - "data_source": meta["data_source"], - "time": time_idx, - "station_name": station_names, - "latitude": ("station_name", lats), - "longitude": ("station_name", lons), - "altitude": ("station_name", alts), - } - - dims = ["data_source", "time", "station_name"] - coldata = ColocatedData(data=arr, coords=coords, dims=dims, name=var, attrs=meta) - - # add correct units for lat / lon dimensions - coldata.latitude.attrs["standard_name"] = data.latitude.standard_name - coldata.latitude.attrs["units"] = str(data.latitude.units) - - coldata.longitude.attrs["standard_name"] = data.longitude.standard_name - coldata.longitude.attrs["units"] = str(data.longitude.units) - - return coldata - - -def correct_model_stp_coldata(coldata, p0=None, t0=273.15, inplace=False): - """Correct modeldata in colocated data object to STP conditions - - Note - ---- - BETA version, quite unelegant coded (at 8pm 3 weeks before IPCC deadline), - but should do the job for 2010 monthly colocated data files (AND NOTHING - ELSE)! 
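# A minimal worked example of the correction described here: each model value
# is multiplied by corrfac = (p0 / p) * (T / T0). The station pressure and
# temperature below are assumed values for illustration only.
p0 = 101325.0   # Pa, standard sea-level pressure
p = 90000.0     # Pa, assumed pressure at the station altitude (standard atmosphere)
T0 = 273.15     # K
T = 288.0       # K, assumed monthly mean 2 m temperature at the station
corrfac = (p0 / p) * (T / T0)   # ~1.19; the model column is scaled by this factor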
- - """ - if coldata.ndim != 3: - raise NotImplementedError("Can only handle 3D coldata so far...") - elif not coldata.ts_type == "monthly" or not len(coldata.time) == 12: - raise NotImplementedError( - "Can only handle monthly colocated data files " - "so far (since ERA5 temps are only available) " - "in monthly resolution" - ) - startyr = pd.Timestamp(coldata.start).year - stopyr = pd.Timestamp(coldata.stop).year - if not all([x == 2010 for x in (startyr, stopyr)]): - raise NotImplementedError("Can only handle 2010 monthly data so far") - - if not inplace: - coldata = coldata.copy() - temp = xr.open_dataset(const.ERA5_SURFTEMP_FILE)["t2m"] - - arr = coldata.data - - coords = zip( - arr.latitude.values, - arr.longitude.values, - arr.altitude.values, - arr.station_name.values, - ) - if p0 is None: - p0 = pressure() # STD conditions sea level - logger.info("Correcting model data in ColocatedData instance to STP") - cfacs = [] - meantemps = [] - mintemps = [] - maxtemps = [] - ps = [] - for i, (lat, lon, alt, name) in enumerate(coords): - logger.info(name, ", Lat", lat, ", Lon", lon) - p = pressure(alt) - logger.info("Alt", alt) - logger.info("P=", p / 100, "hPa") - - ps.append(p / 100) - - temps = temp.sel(latitude=lat, longitude=lon, method="nearest").data - - meantemps.append(temps.mean()) - mintemps.append(temps.min()) - maxtemps.append(temps.min()) - - if not len(temps) == len(arr.time): - raise NotImplementedError("Check timestamps") - logger.info("Mean Temp: ", temps.mean() - t0, " C") - - corrfacs = (p0 / p) * (temps / t0) - - logger.info("Corr fac:", corrfacs.mean(), "+/-", corrfacs.std()) - - cfacs.append(corrfacs.mean()) - - # mularr = xr.DataArray(corrfacs) - - if not arr.station_name.values[i] == name: - raise Exception - elif not arr.dims[1] == "time": - raise Exception - arr[1, :, i] *= corrfacs - - cfacs = np.asarray(cfacs) - - logger.info("Min: ", cfacs.min()) - logger.info("Mean: ", cfacs.mean()) - logger.info("Max: ", cfacs.max()) - coldata.data.attrs["Model_STP_corr"] = True - - newcoords = dict( - pres=("station_name", ps), - temp_mean=("station_name", meantemps), - temp_min=("station_name", mintemps), - temp_max=("station_name", maxtemps), - stp_corrfac_mean=("station_name", cfacs), - ) - - coldata.data = coldata.data.assign_coords(newcoords) - - info_str = ( - "Correction factors to convert model data from ambient to " - "STP were computed using corrfac=(p0/p)*(T/T0) with T0=273K " - "and p0=1013 hPa and p is the pressure at the station location " - "(which was computed assuming a standard atmosphere and using " - "the station altitude) and T is the 2m surface temperature at " - "the station, applied on a monthly basis and estimated using " - "ERA5 data" - ) - - coldata.data["pres"].attrs["units"] = "hPa" - coldata.data["temp_mean"].attrs["units"] = "K" - coldata.data["temp_min"].attrs["units"] = "K" - coldata.data["temp_max"].attrs["units"] = "K" - - coldata.data.attrs["Model_STP_corr"] = True - coldata.data.attrs["Model_STP_corr_info"] = info_str - return coldata diff --git a/src/pyaro_readers/nilupmfebas/combine_vardata_ungridded.py b/src/pyaro_readers/nilupmfebas/combine_vardata_ungridded.py deleted file mode 100644 index 734be15..0000000 --- a/src/pyaro_readers/nilupmfebas/combine_vardata_ungridded.py +++ /dev/null @@ -1,494 +0,0 @@ -import numpy as np - -from ._lowlevel_helpers import invalid_input_err_str -from .colocation import _colocate_site_data_helper -from .geodesy import find_coord_indices_within_distance -from .helpers import sort_ts_types -from .obs_io 
import ObsVarCombi -from .stationdata import StationData - - -def _check_input_data_ids_and_vars(data_ids_and_vars): - if not isinstance(data_ids_and_vars, (list, tuple)): - raise ValueError("Input data_ids_and_vars must be tuple or list") - elif len(data_ids_and_vars) != 2: - raise NotImplementedError( - "Currently, only (and exactly) 2 datasets can be combined..." - ) - for item in data_ids_and_vars: - if not isinstance(item, (list, tuple)): - raise ValueError("Each entry in data_ids_and_vars must be tuple or list") - elif len(item) != 3: - raise ValueError( - "Each entry in data_ids_and_vars needs to contain exactly 3 items." - ) - if not isinstance(item[1], str) or not isinstance(item[2], str): - raise ValueError( - "2nd and 3rd entries (data_id, var_name) in item need to be str" - ) - - -def _map_same_stations(stats_short, stats_long, match_stats_how, match_stats_tol_km): - long_coords = list(zip(stats_long["latitude"], stats_long["longitude"])) - - # index matches and corresponding station name matches - _index_short = [] - _index_long = [] - _statnames_short = [] - _statnames_long = [] - - long_sitenames = np.asarray(stats_long["station_name"]) - - for i, stat in enumerate(stats_short["stats"]): - statname = stat.station_name - lat0, lon0 = stats_short["latitude"][i], stats_short["longitude"][i] - - if match_stats_how == "station_name": - # np.where returns tuple, first index contains array with index - # matches - index_matches = np.where(long_sitenames == statname)[0] - else: - index_matches = find_coord_indices_within_distance( - latref=lat0, lonref=lon0, latlons=long_coords, radius=match_stats_tol_km - ) - - # init which default index to use - use_index = 0 - if len(index_matches) == 0: - continue - elif len(index_matches) > 1: - if match_stats_how == "station_name": - raise Exception( - "Unexpected error: each station_name should " - "only occur once... (perhaps due to unforeseen " - "API change sometime in the future)" - ) - else: - # more than one site was found in the surroundings of the - # current coordinate. Check and prefer same site name if - # possible, else, use closest - for j, idx_match in enumerate(index_matches): - if statname == stats_long["station_name"][idx_match]: - use_index = j - break - - idx_long = index_matches[use_index] - - # make sure to colocate each site only once - if idx_long in _index_long: - statname_long = stats_long["station_name"][idx_long] - if statname == statname_long: - # rare case: the index match in long has already been assigned - # to another site in short which does not occur in long - # (e.g. AAOT site and Venise site in AERONET). In this case - # we want to use the one that matches the site name, so we - # have to remove the already registered index from the record - rm_idx = _statnames_long.index(statname_long) - _index_short.pop(rm_idx) - _index_long.pop(rm_idx) - _statnames_long.pop(rm_idx) - _statnames_short.pop(rm_idx) - else: - continue - - _index_short.append(i) - _index_long.append(idx_long) - _statnames_short.append(statname) - _statnames_long.append(stats_long["station_name"][idx_long]) - - return (_index_short, _index_long, _statnames_short, _statnames_long) - - -def _combine_2_sites( - stat, - var, - stat_other, - var_other, - merge_how, - merge_eval_fun, - match_stats_tol_km, - var_name_out, - data_id_out, - var_unit_out, - resample_how, - min_num_obs, - prefer, - merge_info_vars, - add_meta_keys, -): - """Combine two StationData objects for a given merge strategy - - Private for now... details should follow. 
Until then see - :func:`combine_vardata_ungridded` for details on input args - - Returns - ------- - StationData - merged StationData instance - """ - # unit of first variable - var_unit_in = stat.get_unit(var) - - # check if output unit is defined explicitly and if not, use unit of - # variable 1 - if var_unit_out is None: - var_unit_out = var_unit_in - # make sure both input data objects are in the correct unit (which is - # var_unit_out) - elif not var_unit_in == var_unit_out: - stat.convert_unit(var, var_unit_out) - - if not stat_other.get_unit(var_other) == var_unit_out: - stat_other.convert_unit(var_other, var_unit_out) - - new = StationData() - # add default metadata to new data object - meta_first = stat.get_meta( - force_single_value=False, quality_check=False, add_meta_keys=add_meta_keys - ) - new.update(meta_first) - - new.merge_meta_same_station( - other=stat_other, - check_coords=False, # has already been done - inplace=True, - raise_on_error=True, - add_meta_keys=add_meta_keys, - ) - - tstype = stat.get_var_ts_type(var) - tstype_other = stat_other.get_var_ts_type(var_other) - - to_ts_type = sort_ts_types([tstype, tstype_other])[-1] - - df = _colocate_site_data_helper( - stat, - stat_other, - var, - var_other, - to_ts_type, - resample_how=resample_how, - min_num_obs=min_num_obs, - use_climatology_ref=False, - ) - - # remove timestamps where both observations are NaN - df.dropna(axis=0, how="all", inplace=True) - - # NOTE: the dataframe returned by _colocate_site_data_helper has ref as first - # column and the first input data as 2nd! - obsvar_id = str(ObsVarCombi(stat.data_id, var)) - obsvar_id_other = str(ObsVarCombi(stat_other.data_id, var_other)) - - stat_order = [stat_other, stat] - col_order = [obsvar_id_other, obsvar_id] - col_vars = [var_other, var] - col_names = list(df.columns.values) - - # In case input variables are different, keep both of them in the - # output colocated StationData, in addition to potentially computed - # additional variables below. This is equivalent with using - # merge_how='combine' and var1 != var2 - if var != var_other: - for j, colname in enumerate(col_names): - _var = col_vars[j] - _stat = stat_order[j] - ts = df[colname] - new[_var] = ts - vi = _stat["var_info"][_var] - vi["ts_type"] = to_ts_type - new["var_info"][_var] = vi - - add_ts = None - # Merge timeseries if variables are the same and are supposed to be - # combined - if merge_how == "combine" and var == var_other: - prefer_col = col_names[col_order.index(prefer)] - dont_prefer = col_names[int(not (col_names.index(prefer_col)))] - add_ts = df[prefer_col].combine_first(df[dont_prefer]) - - if var_name_out is None: - var_name_out = var - - elif merge_how == "mean": - if var != var_other: - raise NotImplementedError( - "Averaging of site data is only supported if input variables are the same..." 
- ) - # if it made it until here, then both sites have same variables and - # units - if var_name_out is None: - var_name_out = var - - add_ts = df.mean(axis=1) - - elif merge_how == "eval": - func = merge_eval_fun.replace(col_order[0], col_names[0]) - func = func.replace(col_order[1], col_names[1]) - if "=" in merge_eval_fun: - # make sure variable name is not in merge_eval_fun anymore, otherwise - # the eval method will return a DataFrame instead of a Series - func = func.split("=")[-1].strip() - add_ts = df.eval(func) - - if var_name_out is None: - var_name_out = merge_eval_fun - var_name_out = var_name_out.replace(f"{stat.data_id};", "") - var_name_out = var_name_out.replace(f"{stat_other.data_id};", "") - - if add_ts is not None: - var_info = {"ts_type": to_ts_type, "units": var_unit_out} - - var_info.update(merge_info_vars) - - new["var_info"][var_name_out] = var_info - new[var_name_out] = add_ts - - if isinstance(data_id_out, str): - new["data_id"] = data_id_out - - return new - - -def combine_vardata_ungridded( - data_ids_and_vars, - match_stats_how="closest", - match_stats_tol_km=1, - merge_how="combine", - merge_eval_fun=None, - var_name_out=None, - data_id_out=None, - var_unit_out=None, - resample_how=None, - min_num_obs=None, - add_meta_keys=None, -): - """ - Combine and colocate different variables from UngriddedData - - This method allows to combine different variable timeseries from different - ungridded observation records in multiple ways. The source data may be all - included in a single instance of `UngriddedData` or in multiple, for - details see first input parameter :param:`data_ids_and_vars`. Merging can - be done in flexible ways, e.g. by combining measurements of the same - variable from 2 different datasets or by computing new variables based - on 2 measured variables (e.g. concox=concno2+conco3). Doing this requires - colocation of site locations and timestamps of both input observation - records, which is done in this method. - - It comprises 2 major steps: - - 1. Compute list of :class:`StationData` objects for both input \ - data combinations (data_id1 & var1; data_id2 & var2) and based \ - on these, find the coincident locations. Finding coincident \ - sites can either be done based on site location name or based on - their lat/lon locations. The method to use can be specified via - input arg :param:`match_stats_how`. - 2. For all coincident locations, a new instance of :class:`StationData` \ - is computed that has merged the 2 timeseries in the way - that can be specified through input args :param:`merge_how` and - :param:`merge_eval_fun`. If the 2 original timeseries from both - sites come in different temporal resolutions, they will be - resampled to the lower of both resolutions. Resampling constraints - that are supposed to be applied in that case can be provided via - the respective input args for temporal resampling. Default is - pyaerocom default, which corresponds to ~25% coverage constraint - (as of 22.10.2020) for major resolution steps, such as - daily->monthly. - - Note - ---- - Currently, only 2 variables can be combined to a new one (e.g. - concox=conco3+concno2). - - Note - ---- - Be aware of unit conversion issues that may arise if your input data is - not in AeroCom default units. For details see below. - - Parameters - ---------- - data_ids_and_vars : list - list of 3 element tuples, each containing, in the following order - 1. instance of :class:`UngriddedData`; 2. 
dataset ID (remember that - UngriddedData can contain more than one dataset); and 3. variable name. - Note that currently only 2 of such tuples can be combined. - match_stats_how : str, optional - String specifying how site locations are supposed to be matched. - The default is 'closest'. Supported are 'closest' and 'station_name'. - match_stats_tol_km : float, optional - radius tolerance in km for matching site locations when using 'closest' - for site location matching. The default is 1. - merge_how : str, optional - String specifying how to merge variable data at site locations. - The default is 'combine'. If both input variables are the same and - `combine` is used, then the first input variable will be preferred over - the other. Supported are 'combine', 'mean' and 'eval', for the latter, - `merge_eval_fun` needs to be specified explicitly. - merge_eval_fun : str, optional - String specifying how `var1` and `var2` data should be evaluated (only - relevant if `merge_how='eval'` is used) . The default is None. E.g. if - one wants to retrieve the column aerosol fine mode fraction at 550nm - (fmf550aer) through AERONET, this could be done through the SDA product - by prodiding data_id1 and var1 are 'AeronetSDA' and 'od550aer' and - second input data_id2 and var2 are 'AeronetSDA' and 'od550lt1aer' and - merge_eval_fun could then be - 'fmf550aer=(AeronetSDA;od550lt1aer/AeronetSDA;od550aer)*100'. Note that - the input variables will be converted to their AeroCom default units, - so the specification of `merge_eval_fun` should take that into account - in case the originally read obsdata is not in default units. - var_name_out : str, optional - Name of output variable. Default is None, in which case it is attempted - to be inferred. - data_id_out : str, optional - `data_id` set in output `StationData` objects. Default is None, in - which case it is inferred from input data_ids (e.g. in above example - of merge_eval_fun, the output data_id would be 'AeronetSDA' since both - input IDs are the same. - var_unit_out : str - unit of output variable. - resample_how : str, optional - String specifying how temporal resampling should be done. The default - is 'mean'. - min_num_obs : int or dict, optional - Minimum number of observations for temporal resampling. - The default is None in which case pyaerocom default is used, which - is available via pyaerocom.const.OBS_MIN_NUM_RESAMPLE. - add_meta_keys : list, optional - additional metadata keys to be added to output `StationData` objects - from input data. If None, then only the pyaerocom default keys are - added (see `StationData.STANDARD_META_KEYS`). - - Raises - ------ - ValueError - If input for `merge_how` or `match_stats_how` is invalid. - NotImplementedError - If one of the input UngriddedData objects contains more than one - dataset. - - Returns - ------- - merged_stats : list - list of `StationData` objects containing the colocated and combined - variable data. 
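# A minimal usage sketch for the fine-mode-fraction example described above.
# `obs_data` stands for a hypothetical UngriddedData instance that contains
# the 'AeronetSDA' dataset; it is not defined in this module.
merged_stats = combine_vardata_ungridded(
    data_ids_and_vars=[
        (obs_data, "AeronetSDA", "od550aer"),
        (obs_data, "AeronetSDA", "od550lt1aer"),
    ],
    match_stats_how="station_name",
    merge_how="eval",
    merge_eval_fun="fmf550aer=(AeronetSDA;od550lt1aer/AeronetSDA;od550aer)*100",
)
# -> list of StationData objects, each holding both colocated input
#    timeseries plus the derived fmf550aer variable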
- - """ - if add_meta_keys is None: - add_meta_keys = [] - _check_input_data_ids_and_vars(data_ids_and_vars) - data1, data_id1, var1 = data_ids_and_vars[0] - data2, data_id2, var2 = data_ids_and_vars[1] - - if data2 is data1 and var2 == var1 and data_id1 == data_id2: - raise ValueError("nothing to combine...") - - if not data_id1 in data1.contains_datasets: - raise ValueError(f"No such data ID {data_id1} in {data1}") - elif len(data1.contains_datasets) > 1: - data1 = data1.extract_dataset(data_id1) - - if not data_id2 in data2.contains_datasets: - raise ValueError(f"No such data ID {data_id2} in {data2}") - elif len(data2.contains_datasets) > 1: - data2 = data2.extract_dataset(data_id2) - - id1 = str(ObsVarCombi(data_id1, var1)) - id2 = str(ObsVarCombi(data_id2, var2)) - - data1_stats = data1.to_station_data_all(var1, add_meta_keys=add_meta_keys) - data1_stats["var_name"] = var1 - data1_stats["id"] = id1 - - data2_stats = data2.to_station_data_all(var2, add_meta_keys=add_meta_keys) - data2_stats["var_name"] = var2 - data2_stats["id"] = id2 - - if len(data1_stats["latitude"]) <= len(data2_stats["latitude"]): # - short = data1_stats - long = data2_stats - else: - short = data2_stats - long = data1_stats - - match_stats_opts = ["station_name", "closest"] - - if not match_stats_how in match_stats_opts: - raise ValueError( - f"Invalid input for match_stats_how {match_stats_how}, choose from {match_stats_opts}" - ) - - merge_how_opts = ["combine", "mean", "eval"] - - # if e.g. merge_how is combine and var==var2, then the preferred - # dataset & variable can be provided via this instance - prefer = id1 - - if not merge_how in merge_how_opts: - raise ValueError(invalid_input_err_str("merge_how", merge_how, merge_how_opts)) - - elif merge_how == "eval": - if merge_eval_fun is None: - raise ValueError("Please specify evaluation function for mode eval") - elif not all([x in merge_eval_fun for x in [id1, id2]]): - raise ValueError( - f"merge_eval_fun needs to include both input " - f"datasets;variables (e.g. {id1} + {id2}" - ) - if "=" in merge_eval_fun: - spl = merge_eval_fun.split("=") - if len(spl) > 2: - raise ValueError( - "merge_eval_fun contains more than 1 equality symbol..." 
- ) - var_name_out = spl[0].strip() - merge_eval_fun = spl[1].strip() - - elif var_name_out is None: - var_name_out = merge_eval_fun - var_name_out = var_name_out.replace(f"{data_id1};", "") - var_name_out = var_name_out.replace(f"{data_id2};", "") - - merge_info_vars = {"merge_how": merge_how} - if merge_how == "combine" and var1 == var2: - merge_info_vars["prefer"] = prefer - elif merge_how == "eval": - merge_info_vars["merge_eval_fun"] = merge_eval_fun - - (_index_short, _index_long, _statnames_short, _statnames_long) = _map_same_stations( - short, long, match_stats_how, match_stats_tol_km - ) - - merged_stats = [] - - var_short, var_long = short["var_name"], long["var_name"] - for idx_short, idx_long in zip(_index_short, _index_long): - stat_short = short["stats"][idx_short] - stat_short.check_var_unit_aerocom(var_short) - stat_long = long["stats"][idx_long] - stat_long.check_var_unit_aerocom(var_long) - - # prepare output StationData object (will contain colocated timeseries - # of both input variables as well as, additionally retrieved variable, - # if applicable) - new = _combine_2_sites( - stat_short, - var_short, - stat_long, - var_long, - merge_how, - merge_eval_fun, - match_stats_tol_km, - var_name_out, - data_id_out, - var_unit_out, - resample_how, - min_num_obs, - prefer, - merge_info_vars, - add_meta_keys, - ) - - merged_stats.append(new) - - return merged_stats diff --git a/src/pyaro_readers/nilupmfebas/const.py b/src/pyaro_readers/nilupmfebas/const.py index 1eab5f2..e4ca57b 100644 --- a/src/pyaro_readers/nilupmfebas/const.py +++ b/src/pyaro_readers/nilupmfebas/const.py @@ -1,23 +1,23 @@ import getpass import logging import os -from configparser import ConfigParser -from pathlib import Path - -import numpy as np - -from . import obs_io -from ._lowlevel_helpers import ( - check_dir_access, - check_write_access, - chk_make_subdir, - list_to_shortstr, -) -from . import resources -from .exceptions import DataIdError, DataSourceError -from .grid_io import GridIO -from .region_defs import ALL_REGION_NAME, HTAP_REGIONS, OLD_AEROCOM_REGIONS -from .varcollection import VarCollection +# from configparser import ConfigParser +# from pathlib import Path +# +# import numpy as np +# +# from . import obs_io +# from ._lowlevel_helpers import ( +# check_dir_access, +# check_write_access, +# chk_make_subdir, +# list_to_shortstr, +# ) +# from . import resources +# from .exceptions import DataIdError, DataSourceError +# from .grid_io import GridIO +# from .region_defs import ALL_REGION_NAME, HTAP_REGIONS, OLD_AEROCOM_REGIONS +# from .varcollection import VarCollection logger = logging.getLogger(__name__) @@ -29,149 +29,147 @@ #: standard names for coordinates STANDARD_COORD_NAMES = ["latitude", "longitude", "altitude"] -GRID_IO = GridIO() - - -"""Class containing relevant paths for read and write routines - -A loaded instance of this class is created on import of pyaerocom and -can be accessed via `pyaerocom.const`. 
- -TODO: provide more information -""" - -# NAMES -# default names of the different obs networks -# might get overwritten from paths.ini see func read_config - -#: ICP Forests -ICPFORESTS_NAME = "ICPFORESTS" - -#: Aeronet Sun V2 access names -AERONET_SUN_V2L15_AOD_DAILY_NAME = "AeronetSunV2Lev1.5.daily" -AERONET_SUN_V2L15_AOD_ALL_POINTS_NAME = "AeronetSun_2.0_NRT" -AERONET_SUN_V2L2_AOD_DAILY_NAME = "AeronetSunV2Lev2.daily" -AERONET_SUN_V2L2_AOD_ALL_POINTS_NAME = "AeronetSunV2Lev2.AP" - -#: Aeronet SDA V2 access names -AERONET_SUN_V2L2_SDA_DAILY_NAME = "AeronetSDAV2Lev2.daily" -AERONET_SUN_V2L2_SDA_ALL_POINTS_NAME = "AeronetSDAV2Lev2.AP" - -# Aeronet V2 inversion products -AERONET_INV_V2L15_DAILY_NAME = "AeronetInvV2Lev1.5.daily" -AERONET_INV_V2L15_ALL_POINTS_NAME = "AeronetInvV2Lev1.5.AP" -AERONET_INV_V2L2_DAILY_NAME = "AeronetInvV2Lev2.daily" -AERONET_INV_V2L2_ALL_POINTS_NAME = "AeronetInvV2Lev2.AP" - -#: Aeronet Sun V3 access names -AERONET_SUN_V3L15_AOD_DAILY_NAME = "AeronetSunV3Lev1.5.daily" -AERONET_SUN_V3L15_AOD_ALL_POINTS_NAME = "AeronetSunV3Lev1.5.AP" -AERONET_SUN_V3L2_AOD_DAILY_NAME = "AeronetSunV3Lev2.daily" -AERONET_SUN_V3L2_AOD_ALL_POINTS_NAME = "AeronetSunV3Lev2.AP" - -#: Aeronet SDA V3 access names -AERONET_SUN_V3L15_SDA_DAILY_NAME = "AeronetSDAV3Lev1.5.daily" -AERONET_SUN_V3L15_SDA_ALL_POINTS_NAME = "AeronetSDAV3Lev1.5.AP" -AERONET_SUN_V3L2_SDA_DAILY_NAME = "AeronetSDAV3Lev2.daily" -AERONET_SUN_V3L2_SDA_ALL_POINTS_NAME = "AeronetSDAV3Lev2.AP" - -#: Aeronet V3 inversions -AERONET_INV_V3L15_DAILY_NAME = "AeronetInvV3Lev1.5.daily" -AERONET_INV_V3L2_DAILY_NAME = "AeronetInvV3Lev2.daily" - -#: EBAS name -EBAS_MULTICOLUMN_NAME = "EBASMC" - -#: EEA nmea -EEA_NAME = "EEAAQeRep" - -#: EEA.NRT name -EEA_NRT_NAME = "EEAAQeRep.NRT" - -#: EEAV2 name -EEA_V2_NAME = "EEAAQeRep.v2" - -#: Earlinet access name; -EARLINET_NAME = "EARLINET" - -#: GAW TAD subset aas et al paper -GAWTADSUBSETAASETAL_NAME = "GAWTADsubsetAasEtAl" - -#: DMS -DMS_AMS_CVO_NAME = "DMS_AMS_CVO" - -#: MEP name -MEP_NAME = "MEP" - -#: ICOS name -ICOS_NAME = "ICOS" - -#: boolean specifying wheter EBAS DB is copied to local cache for faster -#: access, defaults to True -EBAS_DB_LOCAL_CACHE = True - -#: Lowest possible year in data -MIN_YEAR = 0 -#: Highest possible year in data -MAX_YEAR = 20000 - -#: standard names for coordinates -STANDARD_COORD_NAMES = ["latitude", "longitude", "altitude"] -#: Information specifying default vertical grid for post processing of -#: profile data. The values are in units of m. -DEFAULT_VERT_GRID_DEF = dict(lower=0, upper=15000, step=250) -#: maximum allowed RH to be considered dry -RH_MAX_PERCENT_DRY = 40 - -DEFAULT_REG_FILTER = f"{ALL_REGION_NAME}-wMOUNTAINS" - -#: Time resample strategies for certain cominations, first level refers -#: to TO, second to FROM and values are minimum number of observations -OBS_MIN_NUM_RESAMPLE = dict( - yearly=dict(monthly=3), - monthly=dict(daily=7), - daily=dict(hourly=6), - hourly=dict(minutely=15), -) - -#: This boolean can be used to enable / disable the former (i.e. use -#: available wavelengths of variable in a certain range around variable -#: wavelength). -OBS_ALLOW_ALT_WAVELENGTHS = obs_io.OBS_ALLOW_ALT_WAVELENGTHS - -#: Wavelength tolerance for observations imports -OBS_WAVELENGTH_TOL_NM = obs_io.OBS_WAVELENGTH_TOL_NM +# GRID_IO = GridIO() -CLIM_START = 2005 -CLIM_STOP = 2015 -CLIM_FREQ = "daily" -CLIM_RESAMPLE_HOW = "mean" # median, ... 
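# In OBS_MIN_NUM_RESAMPLE above, the outer key is the target ("TO") frequency,
# the inner key the source ("FROM") frequency, and the value the minimum number
# of valid source samples required per target sample. A small lookup sketch
# (the dict is repeated here only to keep the example self-contained):
OBS_MIN_NUM_RESAMPLE = dict(
    yearly=dict(monthly=3),
    monthly=dict(daily=7),
    daily=dict(hourly=6),
    hourly=dict(minutely=15),
)
min_num = OBS_MIN_NUM_RESAMPLE["monthly"]["daily"]  # daily -> monthly needs >= 7 valid days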
-# as a function of climatological frequency -CLIM_MIN_COUNT = dict( - daily=30, - monthly=5, # at least 30 daily measurements in each month over whole period -) # analogue to daily ... -# names for the satellite data sets -SENTINEL5P_NAME = "Sentinel5P" -AEOLUS_NAME = "AeolusL2A" - -OLD_AEROCOM_REGIONS = OLD_AEROCOM_REGIONS - -URL_HTAP_MASKS = "https://pyaerocom.met.no/pyaerocom-suppl/htap_masks/" - -HTAP_REGIONS = HTAP_REGIONS - -RM_CACHE_OUTDATED = True - -#: Name of the file containing the revision string of an obs data network -REVISION_FILE = "Revision.txt" - -#: timeout to check if one of the supported server locations can be -#: accessed -SERVER_CHECK_TIMEOUT = 1 # s - -_outhomename = "MyPyaerocom" +# """Class containing relevant paths for read and write routines +# +# A loaded instance of this class is created on import of pyaerocom and +# can be accessed via `pyaerocom.const`. +# +# TODO: provide more information +# """ +# +# # NAMES +# # default names of the different obs networks +# # might get overwritten from paths.ini see func read_config +# +# #: ICP Forests +# ICPFORESTS_NAME = "ICPFORESTS" +# +# #: Aeronet Sun V2 access names +# AERONET_SUN_V2L15_AOD_DAILY_NAME = "AeronetSunV2Lev1.5.daily" +# AERONET_SUN_V2L15_AOD_ALL_POINTS_NAME = "AeronetSun_2.0_NRT" +# AERONET_SUN_V2L2_AOD_DAILY_NAME = "AeronetSunV2Lev2.daily" +# AERONET_SUN_V2L2_AOD_ALL_POINTS_NAME = "AeronetSunV2Lev2.AP" +# +# #: Aeronet SDA V2 access names +# AERONET_SUN_V2L2_SDA_DAILY_NAME = "AeronetSDAV2Lev2.daily" +# AERONET_SUN_V2L2_SDA_ALL_POINTS_NAME = "AeronetSDAV2Lev2.AP" +# +# # Aeronet V2 inversion products +# AERONET_INV_V2L15_DAILY_NAME = "AeronetInvV2Lev1.5.daily" +# AERONET_INV_V2L15_ALL_POINTS_NAME = "AeronetInvV2Lev1.5.AP" +# AERONET_INV_V2L2_DAILY_NAME = "AeronetInvV2Lev2.daily" +# AERONET_INV_V2L2_ALL_POINTS_NAME = "AeronetInvV2Lev2.AP" +# +# #: Aeronet Sun V3 access names +# AERONET_SUN_V3L15_AOD_DAILY_NAME = "AeronetSunV3Lev1.5.daily" +# AERONET_SUN_V3L15_AOD_ALL_POINTS_NAME = "AeronetSunV3Lev1.5.AP" +# AERONET_SUN_V3L2_AOD_DAILY_NAME = "AeronetSunV3Lev2.daily" +# AERONET_SUN_V3L2_AOD_ALL_POINTS_NAME = "AeronetSunV3Lev2.AP" +# +# #: Aeronet SDA V3 access names +# AERONET_SUN_V3L15_SDA_DAILY_NAME = "AeronetSDAV3Lev1.5.daily" +# AERONET_SUN_V3L15_SDA_ALL_POINTS_NAME = "AeronetSDAV3Lev1.5.AP" +# AERONET_SUN_V3L2_SDA_DAILY_NAME = "AeronetSDAV3Lev2.daily" +# AERONET_SUN_V3L2_SDA_ALL_POINTS_NAME = "AeronetSDAV3Lev2.AP" +# +# #: Aeronet V3 inversions +# AERONET_INV_V3L15_DAILY_NAME = "AeronetInvV3Lev1.5.daily" +# AERONET_INV_V3L2_DAILY_NAME = "AeronetInvV3Lev2.daily" +# +# #: EBAS name +# # EBAS_MULTICOLUMN_NAME = "EBASMC" +# +# #: EEA nmea +# EEA_NAME = "EEAAQeRep" +# +# #: EEA.NRT name +# EEA_NRT_NAME = "EEAAQeRep.NRT" +# +# #: EEAV2 name +# EEA_V2_NAME = "EEAAQeRep.v2" +# +# #: Earlinet access name; +# EARLINET_NAME = "EARLINET" +# +# #: GAW TAD subset aas et al paper +# GAWTADSUBSETAASETAL_NAME = "GAWTADsubsetAasEtAl" +# +# #: DMS +# DMS_AMS_CVO_NAME = "DMS_AMS_CVO" +# +# #: MEP name +# MEP_NAME = "MEP" +# +# #: ICOS name +# ICOS_NAME = "ICOS" +# +# #: boolean specifying wheter EBAS DB is copied to local cache for faster +# #: access, defaults to True +# # EBAS_DB_LOCAL_CACHE = True +# +# #: Lowest possible year in data +# MIN_YEAR = 0 +# #: Highest possible year in data +# MAX_YEAR = 20000 +# +# #: standard names for coordinates +# # STANDARD_COORD_NAMES = ["latitude", "longitude", "altitude"] +# #: Information specifying default vertical grid for post processing of +# #: profile data. 
The values are in units of m. +# DEFAULT_VERT_GRID_DEF = dict(lower=0, upper=15000, step=250) +# #: maximum allowed RH to be considered dry +# RH_MAX_PERCENT_DRY = 40 +# +# # DEFAULT_REG_FILTER = f"{ALL_REGION_NAME}-wMOUNTAINS" +# +# #: Time resample strategies for certain cominations, first level refers +# #: to TO, second to FROM and values are minimum number of observations +# OBS_MIN_NUM_RESAMPLE = dict( +# yearly=dict(monthly=3), +# monthly=dict(daily=7), +# daily=dict(hourly=6), +# hourly=dict(minutely=15), +# ) +# +# #: This boolean can be used to enable / disable the former (i.e. use +# #: available wavelengths of variable in a certain range around variable +# #: wavelength). +# # OBS_ALLOW_ALT_WAVELENGTHS = obs_io.OBS_ALLOW_ALT_WAVELENGTHS +# +# #: Wavelength tolerance for observations imports +# # OBS_WAVELENGTH_TOL_NM = obs_io.OBS_WAVELENGTH_TOL_NM +# +# CLIM_START = 2005 +# CLIM_STOP = 2015 +# CLIM_FREQ = "daily" +# CLIM_RESAMPLE_HOW = "mean" # median, ... +# # as a function of climatological frequency +# CLIM_MIN_COUNT = dict( +# daily=30, +# monthly=5, # at least 30 daily measurements in each month over whole period +# ) # analogue to daily ... +# +# # names for the satellite data sets +# SENTINEL5P_NAME = "Sentinel5P" +# AEOLUS_NAME = "AeolusL2A" +# +# URL_HTAP_MASKS = "https://pyaerocom.met.no/pyaerocom-suppl/htap_masks/" +# +# # HTAP_REGIONS = HTAP_REGIONS +# +# RM_CACHE_OUTDATED = True +# +# #: Name of the file containing the revision string of an obs data network +# REVISION_FILE = "Revision.txt" +# +# #: timeout to check if one of the supported server locations can be +# #: accessed +# SERVER_CHECK_TIMEOUT = 1 # s +# +# _outhomename = "MyPyaerocom" # with resources.path("pyaerocom.data", "paths.ini") as path: # _config_ini_lustre = str(path) diff --git a/src/pyaro_readers/nilupmfebas/country_codes.json b/src/pyaro_readers/nilupmfebas/country_codes.json deleted file mode 100644 index 064ea91..0000000 --- a/src/pyaro_readers/nilupmfebas/country_codes.json +++ /dev/null @@ -1 +0,0 @@ -[{"name":"Afghanistan","alpha-2":"AF","country-code":"004"},{"name":"Åland Islands","alpha-2":"AX","country-code":"248"},{"name":"Albania","alpha-2":"AL","country-code":"008"},{"name":"Algeria","alpha-2":"DZ","country-code":"012"},{"name":"American Samoa","alpha-2":"AS","country-code":"016"},{"name":"Andorra","alpha-2":"AD","country-code":"020"},{"name":"Angola","alpha-2":"AO","country-code":"024"},{"name":"Anguilla","alpha-2":"AI","country-code":"660"},{"name":"Antarctica","alpha-2":"AQ","country-code":"010"},{"name":"Antigua and Barbuda","alpha-2":"AG","country-code":"028"},{"name":"Argentina","alpha-2":"AR","country-code":"032"},{"name":"Armenia","alpha-2":"AM","country-code":"051"},{"name":"Aruba","alpha-2":"AW","country-code":"533"},{"name":"Australia","alpha-2":"AU","country-code":"036"},{"name":"Austria","alpha-2":"AT","country-code":"040"},{"name":"Azerbaijan","alpha-2":"AZ","country-code":"031"},{"name":"Bahamas","alpha-2":"BS","country-code":"044"},{"name":"Bahrain","alpha-2":"BH","country-code":"048"},{"name":"Bangladesh","alpha-2":"BD","country-code":"050"},{"name":"Barbados","alpha-2":"BB","country-code":"052"},{"name":"Belarus","alpha-2":"BY","country-code":"112"},{"name":"Belgium","alpha-2":"BE","country-code":"056"},{"name":"Belize","alpha-2":"BZ","country-code":"084"},{"name":"Benin","alpha-2":"BJ","country-code":"204"},{"name":"Bermuda","alpha-2":"BM","country-code":"060"},{"name":"Bhutan","alpha-2":"BT","country-code":"064"},{"name":"Bolivia (Plurinational State 
of)","alpha-2":"BO","country-code":"068"},{"name":"Bonaire, Sint Eustatius and Saba","alpha-2":"BQ","country-code":"535"},{"name":"Bosnia and Herzegovina","alpha-2":"BA","country-code":"070"},{"name":"Botswana","alpha-2":"BW","country-code":"072"},{"name":"Bouvet Island","alpha-2":"BV","country-code":"074"},{"name":"Brazil","alpha-2":"BR","country-code":"076"},{"name":"British Indian Ocean Territory","alpha-2":"IO","country-code":"086"},{"name":"Brunei Darussalam","alpha-2":"BN","country-code":"096"},{"name":"Bulgaria","alpha-2":"BG","country-code":"100"},{"name":"Burkina Faso","alpha-2":"BF","country-code":"854"},{"name":"Burundi","alpha-2":"BI","country-code":"108"},{"name":"Cabo Verde","alpha-2":"CV","country-code":"132"},{"name":"Cambodia","alpha-2":"KH","country-code":"116"},{"name":"Cameroon","alpha-2":"CM","country-code":"120"},{"name":"Canada","alpha-2":"CA","country-code":"124"},{"name":"Cayman Islands","alpha-2":"KY","country-code":"136"},{"name":"Central African Republic","alpha-2":"CF","country-code":"140"},{"name":"Chad","alpha-2":"TD","country-code":"148"},{"name":"Chile","alpha-2":"CL","country-code":"152"},{"name":"China","alpha-2":"CN","country-code":"156"},{"name":"Christmas Island","alpha-2":"CX","country-code":"162"},{"name":"Cocos (Keeling) Islands","alpha-2":"CC","country-code":"166"},{"name":"Colombia","alpha-2":"CO","country-code":"170"},{"name":"Comoros","alpha-2":"KM","country-code":"174"},{"name":"Congo","alpha-2":"CG","country-code":"178"},{"name":"Congo, Democratic Republic of the","alpha-2":"CD","country-code":"180"},{"name":"Cook Islands","alpha-2":"CK","country-code":"184"},{"name":"Costa Rica","alpha-2":"CR","country-code":"188"},{"name":"Côte d'Ivoire","alpha-2":"CI","country-code":"384"},{"name":"Croatia","alpha-2":"HR","country-code":"191"},{"name":"Cuba","alpha-2":"CU","country-code":"192"},{"name":"Curaçao","alpha-2":"CW","country-code":"531"},{"name":"Cyprus","alpha-2":"CY","country-code":"196"},{"name":"Czechia","alpha-2":"CZ","country-code":"203"},{"name":"Denmark","alpha-2":"DK","country-code":"208"},{"name":"Djibouti","alpha-2":"DJ","country-code":"262"},{"name":"Dominica","alpha-2":"DM","country-code":"212"},{"name":"Dominican Republic","alpha-2":"DO","country-code":"214"},{"name":"Ecuador","alpha-2":"EC","country-code":"218"},{"name":"Egypt","alpha-2":"EG","country-code":"818"},{"name":"El Salvador","alpha-2":"SV","country-code":"222"},{"name":"Equatorial Guinea","alpha-2":"GQ","country-code":"226"},{"name":"Eritrea","alpha-2":"ER","country-code":"232"},{"name":"Estonia","alpha-2":"EE","country-code":"233"},{"name":"Eswatini","alpha-2":"SZ","country-code":"748"},{"name":"Ethiopia","alpha-2":"ET","country-code":"231"},{"name":"Falkland Islands (Malvinas)","alpha-2":"FK","country-code":"238"},{"name":"Faroe Islands","alpha-2":"FO","country-code":"234"},{"name":"Fiji","alpha-2":"FJ","country-code":"242"},{"name":"Finland","alpha-2":"FI","country-code":"246"},{"name":"France","alpha-2":"FR","country-code":"250"},{"name":"French Guiana","alpha-2":"GF","country-code":"254"},{"name":"French Polynesia","alpha-2":"PF","country-code":"258"},{"name":"French Southern 
Territories","alpha-2":"TF","country-code":"260"},{"name":"Gabon","alpha-2":"GA","country-code":"266"},{"name":"Gambia","alpha-2":"GM","country-code":"270"},{"name":"Georgia","alpha-2":"GE","country-code":"268"},{"name":"Germany","alpha-2":"DE","country-code":"276"},{"name":"Ghana","alpha-2":"GH","country-code":"288"},{"name":"Gibraltar","alpha-2":"GI","country-code":"292"},{"name":"Greece","alpha-2":"GR","country-code":"300"},{"name":"Greenland","alpha-2":"GL","country-code":"304"},{"name":"Grenada","alpha-2":"GD","country-code":"308"},{"name":"Guadeloupe","alpha-2":"GP","country-code":"312"},{"name":"Guam","alpha-2":"GU","country-code":"316"},{"name":"Guatemala","alpha-2":"GT","country-code":"320"},{"name":"Guernsey","alpha-2":"GG","country-code":"831"},{"name":"Guinea","alpha-2":"GN","country-code":"324"},{"name":"Guinea-Bissau","alpha-2":"GW","country-code":"624"},{"name":"Guyana","alpha-2":"GY","country-code":"328"},{"name":"Haiti","alpha-2":"HT","country-code":"332"},{"name":"Heard Island and McDonald Islands","alpha-2":"HM","country-code":"334"},{"name":"Holy See","alpha-2":"VA","country-code":"336"},{"name":"Honduras","alpha-2":"HN","country-code":"340"},{"name":"Hong Kong","alpha-2":"HK","country-code":"344"},{"name":"Hungary","alpha-2":"HU","country-code":"348"},{"name":"Iceland","alpha-2":"IS","country-code":"352"},{"name":"India","alpha-2":"IN","country-code":"356"},{"name":"Indonesia","alpha-2":"ID","country-code":"360"},{"name":"Iran (Islamic Republic of)","alpha-2":"IR","country-code":"364"},{"name":"Iraq","alpha-2":"IQ","country-code":"368"},{"name":"Ireland","alpha-2":"IE","country-code":"372"},{"name":"Isle of Man","alpha-2":"IM","country-code":"833"},{"name":"Israel","alpha-2":"IL","country-code":"376"},{"name":"Italy","alpha-2":"IT","country-code":"380"},{"name":"Jamaica","alpha-2":"JM","country-code":"388"},{"name":"Japan","alpha-2":"JP","country-code":"392"},{"name":"Jersey","alpha-2":"JE","country-code":"832"},{"name":"Jordan","alpha-2":"JO","country-code":"400"},{"name":"Kazakhstan","alpha-2":"KZ","country-code":"398"},{"name":"Kenya","alpha-2":"KE","country-code":"404"},{"name":"Kiribati","alpha-2":"KI","country-code":"296"},{"name":"Korea (Democratic People's Republic of)","alpha-2":"KP","country-code":"408"},{"name":"Korea, Republic of","alpha-2":"KR","country-code":"410"},{"name":"Kuwait","alpha-2":"KW","country-code":"414"},{"name":"Kyrgyzstan","alpha-2":"KG","country-code":"417"},{"name":"Lao People's Democratic Republic","alpha-2":"LA","country-code":"418"},{"name":"Latvia","alpha-2":"LV","country-code":"428"},{"name":"Lebanon","alpha-2":"LB","country-code":"422"},{"name":"Lesotho","alpha-2":"LS","country-code":"426"},{"name":"Liberia","alpha-2":"LR","country-code":"430"},{"name":"Libya","alpha-2":"LY","country-code":"434"},{"name":"Liechtenstein","alpha-2":"LI","country-code":"438"},{"name":"Lithuania","alpha-2":"LT","country-code":"440"},{"name":"Luxembourg","alpha-2":"LU","country-code":"442"},{"name":"Macao","alpha-2":"MO","country-code":"446"},{"name":"Madagascar","alpha-2":"MG","country-code":"450"},{"name":"Malawi","alpha-2":"MW","country-code":"454"},{"name":"Malaysia","alpha-2":"MY","country-code":"458"},{"name":"Maldives","alpha-2":"MV","country-code":"462"},{"name":"Mali","alpha-2":"ML","country-code":"466"},{"name":"Malta","alpha-2":"MT","country-code":"470"},{"name":"Marshall 
Islands","alpha-2":"MH","country-code":"584"},{"name":"Martinique","alpha-2":"MQ","country-code":"474"},{"name":"Mauritania","alpha-2":"MR","country-code":"478"},{"name":"Mauritius","alpha-2":"MU","country-code":"480"},{"name":"Mayotte","alpha-2":"YT","country-code":"175"},{"name":"Mexico","alpha-2":"MX","country-code":"484"},{"name":"Micronesia (Federated States of)","alpha-2":"FM","country-code":"583"},{"name":"Moldova, Republic of","alpha-2":"MD","country-code":"498"},{"name":"Monaco","alpha-2":"MC","country-code":"492"},{"name":"Mongolia","alpha-2":"MN","country-code":"496"},{"name":"Montenegro","alpha-2":"ME","country-code":"499"},{"name":"Montserrat","alpha-2":"MS","country-code":"500"},{"name":"Morocco","alpha-2":"MA","country-code":"504"},{"name":"Mozambique","alpha-2":"MZ","country-code":"508"},{"name":"Myanmar","alpha-2":"MM","country-code":"104"},{"name":"Namibia","alpha-2":"NA","country-code":"516"},{"name":"Nauru","alpha-2":"NR","country-code":"520"},{"name":"Nepal","alpha-2":"NP","country-code":"524"},{"name":"Netherlands","alpha-2":"NL","country-code":"528"},{"name":"New Caledonia","alpha-2":"NC","country-code":"540"},{"name":"New Zealand","alpha-2":"NZ","country-code":"554"},{"name":"Nicaragua","alpha-2":"NI","country-code":"558"},{"name":"Niger","alpha-2":"NE","country-code":"562"},{"name":"Nigeria","alpha-2":"NG","country-code":"566"},{"name":"Niue","alpha-2":"NU","country-code":"570"},{"name":"Norfolk Island","alpha-2":"NF","country-code":"574"},{"name":"North Macedonia","alpha-2":"MK","country-code":"807"},{"name":"Northern Mariana Islands","alpha-2":"MP","country-code":"580"},{"name":"Norway","alpha-2":"NO","country-code":"578"},{"name":"Oman","alpha-2":"OM","country-code":"512"},{"name":"Pakistan","alpha-2":"PK","country-code":"586"},{"name":"Palau","alpha-2":"PW","country-code":"585"},{"name":"Palestine, State of","alpha-2":"PS","country-code":"275"},{"name":"Panama","alpha-2":"PA","country-code":"591"},{"name":"Papua New Guinea","alpha-2":"PG","country-code":"598"},{"name":"Paraguay","alpha-2":"PY","country-code":"600"},{"name":"Peru","alpha-2":"PE","country-code":"604"},{"name":"Philippines","alpha-2":"PH","country-code":"608"},{"name":"Pitcairn","alpha-2":"PN","country-code":"612"},{"name":"Poland","alpha-2":"PL","country-code":"616"},{"name":"Portugal","alpha-2":"PT","country-code":"620"},{"name":"Puerto Rico","alpha-2":"PR","country-code":"630"},{"name":"Qatar","alpha-2":"QA","country-code":"634"},{"name":"Réunion","alpha-2":"RE","country-code":"638"},{"name":"Romania","alpha-2":"RO","country-code":"642"},{"name":"Russian Federation","alpha-2":"RU","country-code":"643"},{"name":"Rwanda","alpha-2":"RW","country-code":"646"},{"name":"Saint Barthélemy","alpha-2":"BL","country-code":"652"},{"name":"Saint Helena, Ascension and Tristan da Cunha","alpha-2":"SH","country-code":"654"},{"name":"Saint Kitts and Nevis","alpha-2":"KN","country-code":"659"},{"name":"Saint Lucia","alpha-2":"LC","country-code":"662"},{"name":"Saint Martin (French part)","alpha-2":"MF","country-code":"663"},{"name":"Saint Pierre and Miquelon","alpha-2":"PM","country-code":"666"},{"name":"Saint Vincent and the Grenadines","alpha-2":"VC","country-code":"670"},{"name":"Samoa","alpha-2":"WS","country-code":"882"},{"name":"San Marino","alpha-2":"SM","country-code":"674"},{"name":"Sao Tome and Principe","alpha-2":"ST","country-code":"678"},{"name":"Saudi 
Arabia","alpha-2":"SA","country-code":"682"},{"name":"Senegal","alpha-2":"SN","country-code":"686"},{"name":"Serbia","alpha-2":"RS","country-code":"688"},{"name":"Seychelles","alpha-2":"SC","country-code":"690"},{"name":"Sierra Leone","alpha-2":"SL","country-code":"694"},{"name":"Singapore","alpha-2":"SG","country-code":"702"},{"name":"Sint Maarten (Dutch part)","alpha-2":"SX","country-code":"534"},{"name":"Slovakia","alpha-2":"SK","country-code":"703"},{"name":"Slovenia","alpha-2":"SI","country-code":"705"},{"name":"Solomon Islands","alpha-2":"SB","country-code":"090"},{"name":"Somalia","alpha-2":"SO","country-code":"706"},{"name":"South Africa","alpha-2":"ZA","country-code":"710"},{"name":"South Georgia and the South Sandwich Islands","alpha-2":"GS","country-code":"239"},{"name":"South Sudan","alpha-2":"SS","country-code":"728"},{"name":"Spain","alpha-2":"ES","country-code":"724"},{"name":"Sri Lanka","alpha-2":"LK","country-code":"144"},{"name":"Sudan","alpha-2":"SD","country-code":"729"},{"name":"Suriname","alpha-2":"SR","country-code":"740"},{"name":"Svalbard and Jan Mayen","alpha-2":"SJ","country-code":"744"},{"name":"Sweden","alpha-2":"SE","country-code":"752"},{"name":"Switzerland","alpha-2":"CH","country-code":"756"},{"name":"Syrian Arab Republic","alpha-2":"SY","country-code":"760"},{"name":"Taiwan, Province of China","alpha-2":"TW","country-code":"158"},{"name":"Tajikistan","alpha-2":"TJ","country-code":"762"},{"name":"Tanzania, United Republic of","alpha-2":"TZ","country-code":"834"},{"name":"Thailand","alpha-2":"TH","country-code":"764"},{"name":"Timor-Leste","alpha-2":"TL","country-code":"626"},{"name":"Togo","alpha-2":"TG","country-code":"768"},{"name":"Tokelau","alpha-2":"TK","country-code":"772"},{"name":"Tonga","alpha-2":"TO","country-code":"776"},{"name":"Trinidad and Tobago","alpha-2":"TT","country-code":"780"},{"name":"Tunisia","alpha-2":"TN","country-code":"788"},{"name":"Turkey","alpha-2":"TR","country-code":"792"},{"name":"Turkmenistan","alpha-2":"TM","country-code":"795"},{"name":"Turks and Caicos Islands","alpha-2":"TC","country-code":"796"},{"name":"Tuvalu","alpha-2":"TV","country-code":"798"},{"name":"Uganda","alpha-2":"UG","country-code":"800"},{"name":"Ukraine","alpha-2":"UA","country-code":"804"},{"name":"United Arab Emirates","alpha-2":"AE","country-code":"784"},{"name":"United Kingdom of Great Britain and Northern Ireland","alpha-2":"GB","country-code":"826"},{"name":"United States of America","alpha-2":"US","country-code":"840"},{"name":"United States Minor Outlying Islands","alpha-2":"UM","country-code":"581"},{"name":"Uruguay","alpha-2":"UY","country-code":"858"},{"name":"Uzbekistan","alpha-2":"UZ","country-code":"860"},{"name":"Vanuatu","alpha-2":"VU","country-code":"548"},{"name":"Venezuela (Bolivarian Republic of)","alpha-2":"VE","country-code":"862"},{"name":"Viet Nam","alpha-2":"VN","country-code":"704"},{"name":"Virgin Islands (British)","alpha-2":"VG","country-code":"092"},{"name":"Virgin Islands (U.S.)","alpha-2":"VI","country-code":"850"},{"name":"Wallis and Futuna","alpha-2":"WF","country-code":"876"},{"name":"Western Sahara","alpha-2":"EH","country-code":"732"},{"name":"Yemen","alpha-2":"YE","country-code":"887"},{"name":"Zambia","alpha-2":"ZM","country-code":"894"},{"name":"Zimbabwe","alpha-2":"ZW","country-code":"716"},{"name":"Kosovo","alpha-2":"XK","country-code":"383"}] \ No newline at end of file diff --git a/src/pyaro_readers/nilupmfebas/ebas_config.ini b/src/pyaro_readers/nilupmfebas/ebas_config.ini deleted file mode 100644 index 
0e323a6..0000000 --- a/src/pyaro_readers/nilupmfebas/ebas_config.ini +++ /dev/null @@ -1,628 +0,0 @@ -# EBAS I/O variable definitions for Pyaerocom -# -# - Based on https://github.com/metno/aerocom-tools/blob/master/aerocom_read_include.pro -# - Reviewed and partly changed -# - TODO: review and discuss definitions - -# REMARK ON SCALE FACTOR: - -# There are 2 options to define custom unit and / or mass conversion - -# Option 1: -# If the provided unit in the data files is CF conform (i.e. supported -# by cf_units module, e.g. ug m-3) but the measured quantity comprises only a -# mass fraction of the species of interest, then, a scale factor may be -# specified below. In this case the value of the data unit remains the same as -# in the files during reading, but the data itself is multiplied by that scale -# factor. This is, e.g. the case for variable concss (sea salt concentration) -# where Sodium measurements are used to upscale to total sea salt mass - -# Option 2: -# The original unit is provided in a non CF conform format (e.g. ug S/m3, i.e. -# mass of sulphur). In this case the unit value needs to be changed while -# converting the mass to e.g. SO4. These conversion factors are specified in -# the dataframe table pyaerocom.units_helpers.UCONV_MUL_FACS - -# NOTE: BEFORE INSERTING A SCALE FACTOR HERE (option 1) MAKE SURE THAT THIS -# CONVERSION IS NOT ALREADY HANDLED VIA option 2 - -# ---------------------------------------------------------------------------- - -# 0. UNSORTED NEW STUFF - -[concca] -component=calcium -matrix=aerosol,pm25,pm10,pm1 - -[concmg] -component=magnesium -matrix=aerosol,pm25,pm10,pm1 - -[conck] -component=potassium -matrix=aerosol,pm25,pm10,pm1 -# 1. AEROSOL OPTICAL PROPERTIES - -# 1.1 Scattering, absorption, extinction coefficients -[sc550aer] -component=aerosol_light_scattering_coefficient -matrix=aerosol,pm10 - -[sc440aer] -component=aerosol_light_scattering_coefficient -matrix=aerosol,pm10 - -[sc700aer] -component=aerosol_light_scattering_coefficient -matrix=aerosol,pm10 - -[sc550dryaer] -requires=sc550aer,scrh - -[sc440dryaer] -requires=sc440aer,scrh - -[sc700dryaer] -requires=sc700aer,scrh - -[sc550lt1aer] -component=aerosol_light_scattering_coefficient -matrix=pm25,pm1 - -[bsc550aer] -component=aerosol_light_backscattering_coefficient -matrix=aerosol,pm10,pm25 - -[ac550aer] -component=aerosol_absorption_coefficient -matrix=aerosol,pm10 -instrument=filter_absorption_photometer - -[ac550dryaer] -instrument=filter_absorption_photometer -requires=ac550aer,acrh - -[ac550lt1aer] -component=aerosol_absorption_coefficient -matrix=pm25,pm1 -instrument=filter_absorption_photometer - -[bsc550dryaer] -component=aerosol_light_backscattering_coefficient -instrument=nephelometer -matrix=pm10,pm25,pm1,aerosol - -# 1.2. Auxiliary variables -[scrh] -component=relative_humidity -matrix=instrument,aerosol,met,pm10,pm25,pm1 - -[acrh] -component=relative_humidity -matrix=instrument,aerosol,met,pm10,pm25,pm1 - -[ts] -component=temperature -matrix=air,instrument,aerosol,met,pm10,pm25,pm1 - -# 2. Concentrations - -# 2.1 Surface air concentrations (i.e. 
[sconcXX]) -# REMOVED AND GENERALISED IN AUG 2019 NOW ONLY USING [concXX] - -# 2.2 Air concentrations -[concso4] -component=sulphate_corrected,sulphate_total -# after discussion with Wenche -matrix=aerosol,pm10,pm25 - -[concso4coarse] -component=sulphate_corrected,sulphate_total -# after discussion with Wenche -matrix=aerosol,pm10 - -[concso4fine] -component=sulphate_corrected,sulphate_total -# after discussion with Wenche -matrix=pm25, pm1 - -[SO4ugSm3] -component=sulphate_corrected,sulphate_total -# after discussion with Wenche -matrix=aerosol,pm10,pm25 - -[concso4pm10] -component=sulphate_corrected,sulphate_total -matrix=pm10 - -[concso4pm25] -component=sulphate_corrected,sulphate_total -matrix=pm25 - -[concso2] -component=sulphur_dioxide -# main-air-chem, -matrix=air - -[concSso2] -component=sulphur_dioxide -# main-air-chem, -matrix=air - -[vmrso2] -component=sulphur_dioxide -# main-air-chem, -matrix=air - -[concpm10] -component=pm10_mass -# pm10-chem, pm10_mass_statistics -matrix=pm10 - -[concpm25] -component=pm25_mass -# pm25-chem, -matrix=pm25 - -[concpm1] -component=pm1_mass - -[concso4t] -component=sulphate_total -# after discussion with Wenche -matrix=aerosol,pm10,pm25 - -[concso4c] -component=sulphate_corrected -# after discussion with Wenche -matrix=aerosol,pm10,pm25 - -[concbc] -component=elemental_carbon -instrument=denuder,ecoc_monitor,filter_1pack,filter_2pack,high_vol_sampler,impactor,low_vol_sampler,lvs_denuder_single,lvs_denuder_tandem,lvs_QBQ,lvs_single,lvs_single_twin,lvs_teflon -matrix=pm25,pm10,pm1,aerosol - -[conceqbc] -component=equivalent_black_carbon -instrument=filter_absorption_photometer -matrix=aerosol,pm1,pm10,pm25 - -[concCec] -component=elemental_carbon -# after discussion with Wenche -matrix=pm25,pm10,aerosol,pm1 - -[concCecpm25] -component=elemental_carbon -matrix=pm25,pm1 - -[conctc] -component=total_carbon -# after discussion with Wenche -matrix=pm25,pm10,aerosol - -[concoa] -component=organic_carbon -# after discussion with Wenche -matrix=pm25,pm10,aerosol,pm1 -scale_factor=1.4 - -[concoc] -component=organic_carbon -# after discussion with Wenche -matrix=pm25,pm10,aerosol,pm1 - -[concCoc] -component=organic_carbon -# after discussion with Wenche -matrix=pm25,pm10,aerosol,pm1 - -[concCocpm25] -component=organic_carbon -matrix=pm25,pm1 - -[concss] -component=sodium -matrix=pm10,aerosol,pm25,pm1,air -scale_factor=3.27 - -[concsspm10] -component=sodium -matrix=pm10,aerosol -scale_factor=3.27 - -[concsspm25] -component=sodium -matrix=pm25 -scale_factor=3.27 - -[concnh3] -component=ammonia -matrix=air - -[concNnh3] -component=ammonia -matrix=air - -[concno3] -component=nitrate -matrix=pm10,aerosol,pm25 - -[concNno3pm10] -component=nitrate -matrix=pm10,aerosol - -[concNno3pm25] -component=nitrate -matrix=pm25,pm1 - -[concno3pm10] -component=nitrate -matrix=pm10,aerosol - -[concno3pm25] -component=nitrate -matrix=pm25,pm1 - -[concnh4] -component=ammonium -matrix=pm10,aerosol,pm25 - -[concnh4coarse] -component=ammonium -matrix=pm10,aerosol - -[concnh4fine] -component=ammonium -matrix=pm25,pm1 - -[concNnh4] -component=ammonium -matrix=pm10,aerosol,pm25 - -[concNhno3] -component=nitric_acid -matrix=air - -[concNtno3] -component=sum_nitric_acid_and_nitrate -matrix=air+aerosol - -[concNtnh] -component=sum_ammonia_and_ammonium -matrix=air+aerosol - -[concno] -component=nitrogen_monoxide -matrix=air - -[concno2] -component=nitrogen_dioxide -matrix=air - -[concNno] -component=nitrogen_monoxide -matrix=air - -[concNno2] -component=nitrogen_dioxide -matrix=air - 
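(Not part of the patch: the sections above use plain ConfigParser syntax, which is also how the removed EbasVarInfo.open_config further below reads this file. A minimal sketch, assuming ebas_config.ini is readable from the working directory and using the concss entry defined above, i.e. component=sodium with scale_factor=3.27:)

    from configparser import ConfigParser

    cfg = ConfigParser()
    # path is an assumption here; the removed EbasVarInfo.open_config resolves
    # the file from the package resources instead
    cfg.read("ebas_config.ini")

    sec = cfg["concss"]  # sea salt mass derived from sodium, see entry above
    components = [c.strip() for c in sec["component"].split(",")]
    matrices = [m.strip() for m in sec.get("matrix", "").split(",") if m.strip()]
    # scale_factor values may carry inline "#" comments, hence the split
    scale = float(sec.get("scale_factor", "1").split("#")[0])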
-[conchcho] -component=methanal -matrix=air - -[conco3] -component=ozone -matrix=air - -[concco] -component=carbon_monoxide -matrix=air - -[vmro3] -component=ozone -matrix=air - -[vmro3max] -component=ozone -matrix=air - -[vmrco] -component=carbon_monoxide -matrix=air - -[vmrno2] -component=nitrogen_dioxide -matrix=air - -[vmrno] -component=nitrogen_monoxide -matrix=air - -[vmrisop] -component=isoprene -matrix=air - -[vmrhcho] -component=methanal -matrix=air - -[vmrglyoxal] -component=ethanedial -matrix=air - -[vmrc2h6] -component=ethane -matrix=air - -[vmrc2h4] -component=ethene -matrix=air - -[concglyoxal] -component=ethanedial -matrix=air - - -# 2.3. Precipitation concentrations -[concprcpoxs] -component=sulphate_corrected,sulphate_total -matrix=precip - -[concprcpoxsc] -component=sulphate_corrected -matrix=precip - -[concprcpoxst] -component=sulphate_total -matrix=precip - -[concprcpoxn] -component=nitrate -matrix=precip - -[concprcprdn] -component=ammonium -matrix=precip - -[concprcpna] -component=sodium -matrix=precip - -# 3. Deposition rates - -# 3.1. Wet deposition -[wetoxs] -requires=concprcpoxs - -[wetoxst] -requires=concprcpoxst - -[wetoxsc] -requires=concprcpoxsc - -[wetrdn] -requires=concprcprdn - -[wetoxn] -requires=concprcpoxn - -[wetna] -requires=concprcpna - -# 4. Precipitation -[pr] # pyaerocom unit kg m-2 s-1 -component=precipitation_amount_off,precipitation_amount -matrix=precip - -[prmm] # pyaerocom unit mm d-1 -component=precipitation_amount_off,precipitation_amount -matrix=precip - -[concCocpm10] -component=organic_carbon -matrix=pm10 - -[concCecpm10] -component=elemental_carbon -matrix=pm10 - -# CAMS2_40 Task4041 - -# Gases - -[vmrhno3] -component=nitric_acid -matrix=air - -[vmrnh3] -component=ammonia -matrix=air - -[vmrtp] -component=monoterpenes -matrix=air - -; [vmrpan] -; component=methanal -; matrix=air - -; [vmroh] -; component=methanal -; matrix=air - -# PM - -[concCoc25] -component=organic_carbon -matrix=pm25,pm1 - -[concom25] -component=organic_mass -matrix=pm25,pm1 - - -[concsscoarse] -component=sodium -matrix=pm10 -scale_factor=3.27 - -[concss25] -component=sodium -matrix=pm25 -scale_factor=3.27 - - -# Deposition -[concprcpnh4] -component=ammonium -matrix=precip - -[wetnh4] -requires=concprcpnh4 - -[concprcpno3] -component=nitrate -matrix=precip - -[wetno3] -requires=concprcpno3 - - -[concprcpso4] -#component=sulphate_corrected#,sulphate_total -component=sulphate_corrected -# after discussion with Wenche -matrix=precip - -[wetso4] -requires=concprcpso4 - -#proxy Dry Dep - -# Sulpher Based dry dep -[proxydryoxs] -requires=concprcpoxs - -[proxydryss] -requires=concprcpna - -[proxydryna] -requires=concprcpna - -[proxydryso2] -requires=concprcpoxs - -[proxydryso4] -requires=concprcpoxs - - -# Oxidized nitrogen based dry dep -[proxydryoxn] -requires=concprcpoxn - -[proxydryno2] -requires=concprcpoxn - -[proxydryno2no2] -requires=concprcpoxn - -[proxydryhono] -requires=concprcpoxn - -[proxydryn2o5] -requires=concprcpoxn - -[proxydryhno3] -requires=concprcpoxn - -[proxydryno3c] -requires=concprcpoxn - -[proxydryno3f] -requires=concprcpoxn - - -# Reduced nitrogen based dry dep -[proxydryrdn] -requires=concprcprdn - -[proxydrynh3] -requires=concprcprdn - -[proxydrynh4] -requires=concprcprdn - - -# Other proxy dry dep - -[proxydryo3] -requires=vmro3 - -[proxydrypm10] -requires=concprcpoxs - -[proxydrypm25] -requires=concprcpoxs - -#proxy wet Dep - -# Sulpher Based wet dep -[proxywetoxs] -requires=concprcpoxs - -[proxywetso2] -requires=concprcpoxs - -[proxywetso4] 
-requires=concprcpoxs - - -# Oxidized nitrogen based wet dep -[proxywetoxn] -requires=concprcpoxn - -[proxywetno2] -requires=concprcpoxn - -[proxywetno2no2] -requires=concprcpoxn - -[proxywethono] -requires=concprcpoxn - -[proxywetn2o5] -requires=concprcpoxn - -[proxywethno3] -requires=concprcpoxn - -[proxywetno3c] -requires=concprcpoxn - -[proxywetno3f] -requires=concprcpoxn - - -# Reduced nitrogen based wet dep -[proxywetrdn] -requires=concprcprdn - -[proxywetnh3] -requires=concprcprdn - -[proxywetnh4] -requires=concprcprdn - - -# Other proxy wet dep - -[proxyweto3] -requires=vmro3 - -[proxywetpm10] -requires=concprcpoxs - -[proxywetpm25] -requires=concprcpoxs \ No newline at end of file diff --git a/src/pyaro_readers/nilupmfebas/ebas_file_index.py b/src/pyaro_readers/nilupmfebas/ebas_file_index.py deleted file mode 100644 index 4cb7341..0000000 --- a/src/pyaro_readers/nilupmfebas/ebas_file_index.py +++ /dev/null @@ -1,342 +0,0 @@ -import logging -import os -import sqlite3 - -from ._lowlevel_helpers import BrowseDict - -logger = logging.getLogger(__name__) - - -class EbasSQLRequest(BrowseDict): - """Low level dictionary like object for EBAS sqlite queries - - Attributes - ---------- - variables : :obj:`tuple`, optional - tuple containing variable names to be extracted (e.g. - ``('aerosol_light_scattering_coefficient', 'aerosol_optical_depth')``). - If None, all available is used - start_date : :obj:`str`, optional - start date of data request (format YYYY-MM-DD). If None, all available - is used - stop_date : :obj:`str`, optional - stop date of data request (format YYYY-MM-DD). If None, all available - is used - station_names : :obj:`tuple`, optional - tuple containing station_names of request (e.g. - ``('Birkenes II', 'Asa')``).If None, all available is used - matrices : :obj:`tuple`, optional - tuple containing station_names of request (e.g. - ``('pm1', 'pm10', 'pm25', 'aerosol')``) - If None, all available is used - altitude_range : :obj:`tuple`, optional - tuple specifying altitude range of station in m (e.g. - ``(0.0, 500.0)``). If None, all available is used - lon_range : :obj:`tuple`, optional - tuple specifying longitude range of station in degrees (e.g. - ``(-20, 20)``). If None, all available is used - lat_range : :obj:`tuple`, optional - tuple specifying latitude range of station in degrees (e.g. - ``(50, 80)``). If None, all available is used - instrument_type : :obj:`str`, optional - string specifying instrument types (e.g. - ``("nephelometer")``) - statistics : :obj:`tuple`, optional - string specifying statistics code (e.g. 
- ``("arithmetic mean")``) - - Parameters - ---------- - see Attributes - """ - - def __init__( - self, - variables=None, - start_date=None, - stop_date=None, - station_names=None, - matrices=None, - altitude_range=None, - lon_range=None, - lat_range=None, - instrument_types=None, - statistics=None, - datalevel=None, - ): - self.variables = variables - self.start_date = start_date - self.stop_date = stop_date - self.station_names = station_names - self.matrices = matrices - self.altitude_range = altitude_range - self.lon_range = lon_range - self.lat_range = lat_range - self.instrument_types = instrument_types - self.statistics = statistics - self.datalevel = datalevel - - def update(self, **kwargs): - for k, v in kwargs.items(): - if k in self: - self[k] = v - else: - logger.warning(f"Unknown EBAS SQL request key {k} (value {v})") - - @staticmethod - def _var2sql(var): - if isinstance(var, list): - if len(var) > 1: - var = tuple(var) - else: - var = var[0] - - if isinstance(var, tuple): - return f"{var}" - elif isinstance(var, str): - return f"('{var}')" - raise ValueError("Invalid value...") - - def make_file_query_str(self, distinct=True, **kwargs): - """Wrapper for base method :func:`make_query_str` - - Parameters - ---------- - distinct : bool - return unique files - **kwargs - update request attributes (e.g. ``lon_range=(30, 60)``) - - Returns - ------- - str - SQL file request command for current specs - """ - query = self.make_query_str(distinct=distinct, **kwargs) - # add an extsion to get only files that have no fraction variables in them - query = query.replace( - ";", - " and not exists (select * from characteristic where var_id=variable.var_id and ct_type='Fraction');", - ) - return query - - def make_query_str(self, what=None, distinct=True, **kwargs): - """Translate current class state into SQL query command string - - Parameters - ---------- - what : str or tuple, optional - what columns to retrieve (e.g. comp_name for all variables) from - table specified. Defaults to None, in which case "filename" is used - distinct : bool - return unique files - **kwargs - update request attributes (e.g. 
``lon_range=(30, 60)``) - - Returns - ------- - str - SQL file request command for current specs - """ - self.update(**kwargs) - if what is None: - what = "filename" - elif not isinstance(what, str): # tuple or list of parameters to be retrieved - what = ",".join(what) - - if distinct: - req = f"select distinct {what} from variable" - else: - req = f"select {what} from variable" - req += " join station on station.station_code=variable.station_code" - add_cond = 0 - # add constraints from station table - conv = self._var2sql - if self.station_names is not None: - req += f" where station_name in {conv(self.station_names)}" - add_cond += 1 - if self.altitude_range is not None: - low, high = self.altitude_range - req += " and " if add_cond else " where " - req += f"station_altitude>{low} and station_altitude<{high}" - add_cond += 1 - if self.lon_range is not None: - l, r = self.lon_range - req += " and " if add_cond else " where " - req += f"station_longitude>{l} and station_longitude<{r}" - add_cond += 1 - if self.lat_range is not None: - s, n = self.lat_range - req += " and " if add_cond else " where " - req += f"station_latitude>{s} and station_latitude<{n}" - add_cond += 1 - if self.instrument_types is not None: - req += " and " if add_cond else " where " - req += f"instr_type in {conv(self.instrument_types)}" - add_cond += 1 - # add constraints from variable table - if self.variables is not None: - req += " and " if add_cond else " where " - req += f"comp_name in {conv(self.variables)}" - add_cond += 1 - if self.stop_date is not None: - req += " and " if add_cond else " where " - req += f"first_end < '{self.stop_date}'" - add_cond += 1 - if self.start_date is not None: - req += " and " if add_cond else " where " - req += f"last_start > '{self.start_date}'" - add_cond += 1 - if self.matrices is not None: - req += " and " if add_cond else " where " - req += f"matrix in {conv(self.matrices)}" - add_cond += 1 - if self.statistics is not None: - req += " and " if add_cond else " where " - req += f"statistics in {conv(self.statistics)}" - add_cond += 1 - if self.datalevel is not None: - req += " and " if add_cond else " where " - req += f"datalevel={self.datalevel}" - add_cond += 1 - return req + ";" - - def __str__(self): - head = f"Pyaerocom {type(self).__name__}" - s = f"\n{head}\n{len(head) * '-'}" - for k, v in self.items(): - s += f"\n{k}: {v}" - s += f"\nFilename request string:\n{self.make_file_query_str()}" - return s - - -class EbasFileIndex: - """EBAS SQLite I/O interface - - Takes care of connection to database and execution of requests - """ - - def __init__(self, database=None): - self._database = database - - @property - def database(self): - """Path to ebas_file_index.sqlite3 file""" - db = self._database - if db is None or not os.path.exists(db): - raise AttributeError( - "EBAS SQLite database file could not be " - "located but is needed in EbasFileIndex class" - ) - return db - - @property - def ALL_STATION_NAMES(self): - """List of all available station names in database""" - names = self.execute_request("select distinct station_name from station") - return [x[0] for x in names] - - @property - def ALL_STATION_CODES(self): - """List of all available station codes in database - - Note - ---- - Not tested whether the order is the same as the order in - :attr:`STATION_NAMES`, i.e. 
the lists should not be linked to each - other - """ - names = self.execute_request("select distinct station_code from station") - return [x[0] for x in names] - - @property - def ALL_STATISTICS_PARAMS(self): - """List of all statistical parameters available - - For more info see `here `__ - """ - names = self.execute_request("select distinct statistics from variable") - return [x[0] for x in names] - - @property - def ALL_VARIABLES(self): - """List of all variables available""" - names = self.execute_request("select distinct comp_name from variable") - return [x[0] for x in names] - - @property - def ALL_MATRICES(self): - """List of all matrix values available""" - names = self.execute_request("select distinct matrix from variable") - return [x[0] for x in names] - - @property - def ALL_INSTRUMENTS(self): - """List of all variables available""" - names = self.execute_request("select distinct instr_type from variable") - return [x[0] for x in names] - - def get_table_names(self): - """Get all table names in SQLite database file""" - return [ - x[0] - for x in self.execute_request( - "SELECT name FROM sqlite_master WHERE type='table';" - ) - ] - - def get_table_columns(self, table_name): - """Get all columns of a table in SQLite database file""" - req = f"select * from {table_name} where 1=0;" - with sqlite3.connect(self.database) as con: - cur = con.cursor() - cur.execute(req) - return [f[0] for f in cur.description] - - def execute_request(self, request, file_request=False): - """Connect to database and retrieve data for input request - - Parameters - ---------- - request : :obj:`EbasSQLRequest` or :obj:`str` - request specifications - - Returns - ------- - list - list of tuples containing the retrieved results. The number of - items in each tuple corresponds to the number of requested - parameters (usually one, can be specified in - :func:`make_query_str` using argument ``what``) - - """ - if isinstance(request, str): - sql_str = request - elif isinstance(request, EbasSQLRequest): - if not file_request: - sql_str = request.make_query_str() - else: - sql_str = request.make_file_query_str() - else: - raise ValueError(f"Unsupported request type {type(request)}") - - with sqlite3.connect(self.database) as con: - cur = con.cursor() - cur.execute(sql_str) - return [f for f in cur.fetchall()] - - def get_file_names(self, request): - """Get all files that match the request specifications - - Parameters - ---------- - request : :obj:`EbasSQLRequest` or :obj:`str` - request specifications - - Returns - ------- - list - list of file paths that match the request - """ - return [f[0] for f in self.execute_request(request, file_request=True)] diff --git a/src/pyaro_readers/nilupmfebas/ebas_flags.csv b/src/pyaro_readers/nilupmfebas/ebas_flags.csv deleted file mode 100644 index f0edeb5..0000000 --- a/src/pyaro_readers/nilupmfebas/ebas_flags.csv +++ /dev/null @@ -1,153 +0,0 @@ -100,'Checked by data originator. Valid measurement, overrides any invalid flags','V' -101,'Denuder capture efficiency < 75%. Valid measurement','V' -102,'CV of replicate diffusion tubes > 30 %. Valid measurement','V' -103,'CV of replicate ALPHA samplers > 15 %. Valid measurement','V' -110,'Episode data checked and accepted by data originator. Valid measurement','V' -111,'Irregular data checked and accepted by data originator. Valid measurement','V' -120,'Sample reanalysed with similar results. 
Valid measurement','V' -147,'Below theoretical detection limit or formal Q/A limit, but a value has been measured and reported and is considered valid','V' -185,'Possible local contamination indicated by wind direction or velocity','V' -186,'Possible local contamination indicated by single scattering albedo (auto)','V' -187,'Possible local contamination indicated by occurrence of new particles (auto)','V' -188,'Possible local contamination indicated by low wind speed (auto)','V' -189,'Possible local contamination indicated by wind from contaminated sector (auto)','V' -190,'Not corrected for cross-sensitivity to particle scattering','V' -191,'Data not truncation corrected - Valid measurement','V' -210,'Episode data checked and accepted by database co-ordinator. Valid measurement','V' -211,'Irregular data checked and accepted by database co-ordinator. Valid measurement','V' -220,'Preliminary data','V' -247,'Overlapping sample interval was corrected by the database co-ordinator. Possible wrong sample time (used for historic data only).','V' -248,'Illegal flag was removed by the database co-ordinator. Lost flag information. (used for historic data only)','V' -249,'Apparent typing error corrected. Valid measurement','V' -250,'Considerable sea salt contribution, but considered valid','V' -251,'Invalid due to large sea salt contribution','I' -256,'Invalidated by database co-ordinator','I' -257,'Extremely low value, outside four times standard deviation in a log-normal distribution','V' -258,'Extremely high value, outside four times standard deviation in a log-normal distribution','V' -259,'Unspecified error expected','I' -260,'Contamination suspected','I' -275,'Inconsistency between measured and estimated conductivity, but considered valid','V' -276,'Inconsistency discovered through ion balance calculations, but considered valid','V' -277,'Invalid due to inconsistency between measured and estimated conductivity','I' -278,'Invalid due to inconsistency discovered through ion balance calculations','I' -298,'Gold trap inconsistency in mercury monitor','V' -299,'Inconsistent with another unspecified measurement','V' -370,'For monthly values using samples partly in two month, the number of days are used for weighing the sample','V' -380,'More than 50% of the measurements are below detection limit','V' -382,'More than 75% of the measurements are below detection limit','V' -388,'Data completeness less than 66%','V' -389,'Data completeness less than 66%','I' -390,'Data completeness less than 50%','V' -391,'Data completeness less than 50%, data considered invalid','I' -392,'Data completeness less than 75%','V' -393,'Data completeness less than 75%, data considered invalid','I' -394,'Data completeness less than 90%','V' -395,'Data completeness less than 90%, data considered invalid','I' -410,'Sahara dust event','V' -411,'Aeolian dust event','V' -420,'Preliminary data','V' -440,'Reconstructed or recalculated data','V' -450,'Considerable sea salt contribution, but considered valid','V' -451,'Invalid due to large sea salt contribution','I' -452,'Invalid due to large uncertainty','I' -456,'Invalidated by data originator','I' -457,'Extremely low value, outside four times standard deviation in a lognormal distribution','V' -458,'Extremely high value, outside four times standard deviation in a lognormal distribution','V' -459,'Extreme value, unspecified error','I' -460,'Contamination suspected','I' -470,'Particulate mass concentration higher than parallell mass concentration measurement with higher cut off i.e 
PM1_mass > PM25_mass and PM25_mass > PM10_mass','V' -471,'Particulate mass concentration higher than parallell mass concentration measurement with higher cut off i.e PM1_mass > PM25_mass and PM25_mass > PM10_mass. Considered invalid','I' -472,'Less accurate than normal due to high concentration(s)','V' -475,'Inconsistency between measured and estimated conductivity, but considerd valid','V' -476,'Inconsistency discovered through ion balance calculations, but considerd valid','V' -477,'Invalid due to inconsistency between measured and estimated conductivity','I' -478,'Invalid due to inconsistency discovered through ion balance calculations','I' -498,'Gold trap inconsistency in mercury monitor','V' -499,'Inconsistent with another unspecified measurement','V' -521,'Bactericide was added to sample for storage under warm climate. Considered valid','V' -530,'Invalid due to too low or too high recovery','I' -531,'Low recovery, analysis inaccurate','V' -532,'Data less accurate than normal due to high field blank value','V' -533,'Filters mixed up; incorrect analysis','I' -534,'Wrong coated denuder used','I' -540,'Spectral interference in laboratory analysis','I' -541,'Gold trap passiviated by unknown compound','I' -549,'Impure chemicals','I' -555,'Pollen and/or leaf contamination, but considered valid','V' -556,'Bird droppings, but considered valid','V' -557,'Insect contamination, but considered valid','V' -558,'Dust contamination, but considered valid','V' -559,'Unspecified contamination or local influence, but considered valid','V' -565,'Pollen and/or leaf contamination, considered invalid','I' -566,'Bird droppings, considered invalid','I' -567,'Insect contamination, considered invalid','I' -568,'Dust contamination, considered invalid','I' -578,'Large sea salt contribution (ratio between marine and excess sulphate is larger than 2.0). Used for old data only. For newer data use 451/450.','I' -591,'Agricultural contamination, considered invalid','I' -593,'Industrial contamination, considered invalid','I' -599,'Unspecified contamination or local influence','I' -620,'Too high filter breakthrough, considered invalid','I' -630,'POP concentration from the polyurethane foam (PUF) only','V' -632,'Lid of polyurethane foam (PUF) sampler not closed','V' -635,' Internal temperatures too far off target value, considered invalid','I' -640,'Instrument internal relative humidity above 40%','V' -641,'Aerosol filters installed incorrectly','I' -644,'Low instrument precision and/or calibration issues','V' -645,'Exceptional traffic nearby','V' -646,'Exceptional traffic nearby','I' -647,'Fire/wood burning nearby','V' -648,'Snow sampler','V' -649,'Temporary power fail has affected sampler operation','V' -650,'Precipitation collector failure','V' -651,'Agricultural activity nearby','V' -652,'Construction/acitivity nearby','V' -653,'Sampling period shorter than normal, considered representative. Observed values reported','V' -654,'Sampling period longer than normal, considered representative. Observed values reported','V' -655,'Estimated value created by averaging or spliting samples','V' -656,'Wet-only collector failure, operated as bulk collector','V' -657,'Precipitation collector overflow. 
Heavy snowfall/rain shower (squall)','V' -658,'Too small air volume','I' -659,'Unspecified sampling anomaly','I' -660,'Unspecified sampling anomaly, considered valid','V' -662,'Too high sampling flow, data considered valid','V' -663,'Too high sampling flow, data considered invalid','I' -664,'Instrument flow(s) too far off target value, considered invalid','I' -665,'Filter damaged, valid','V' -666,'Filter damaged, invalid','I' -668,'Moist or wet filter, valid','V' -669,'Moist or wet filter, invalid','I' -670,'Incomplete data acquisition for multi-component data sets','I' -674,'Icing or hoar frost in the intake, considered valid','V' -675,'no visibility data available','V' -676,'station inside cloud (visibility < 1000 m)','V' -677,'Icing or hoar frost in the intake','I' -678,'Hurricane','V' -679,'Unspecified meteorological condition','V' -680,'Undefined wind direction','V' -681,'Low data capture','I' -682,'Invalid due to calibration or zero/span check. Used for Level 0.','I' -683,'Invalid due to calibration. Used for Level 0.','I' -684,'Invalid due to zero/span check. Used for Level 0.','I' -685,'Invalid due to secondary standard gas measurement. Used for Level 0.','I' -699,'Mechanical problem, unspecified reason','I' -701,'Less accurate than usual, unspecified reason. (Used only with old data, for new data see groups 6 and 5)','I' -740,'Probably biased gas/particle ratio','V' -741,'Non refractory AMS concentrations. Dont include compounds that volatalises above 600 deg C','V' -750,'H+ not measured in alkaline sample','M' -760,'Value estimated by summing up the constituents measured','V' -770,'Value above range, data element contains estimated value','V' -771,'Value above range, data element contains upper range limit','V' -780,'Value below detection or quantification limit, data element contains estimated or measured value. Use of flag 147 is encouraged.','V' -781,'Value below detection limit, data element contains detection limit','V' -782,'Low precipitation, concentration estimated','V' -783,'Low precipitation, concentration unknown','M' -784,'Low precipitation, concentration estimated','I' -797,'Data element taken from co-located instrument','V' -798,'Measurement missing (unspecified reason), data element contains estimated value. Considered valid.','V' -799,'Measurement missing (unspecified reason), data element contains estimated value','I' -890,'Concentration in precipitation undefined, no precipitation','M' -899,'Measurement undefined, unspecified reason','M' -900,'Hidden and invalidated by data originator','H' -980,'Missing due to calibration or zero/span check','M' -990,'Precipitation not measured due to snow-fall. Needed for historic data, should not be needed for new data','M' -999,'Missing measurement, unspecified reason','M' diff --git a/src/pyaro_readers/nilupmfebas/ebas_varinfo.py b/src/pyaro_readers/nilupmfebas/ebas_varinfo.py deleted file mode 100644 index 523c30a..0000000 --- a/src/pyaro_readers/nilupmfebas/ebas_varinfo.py +++ /dev/null @@ -1,250 +0,0 @@ -from __future__ import annotations - -from configparser import ConfigParser - -from . import const -from ._lowlevel_helpers import BrowseDict -from . import resources -from .exceptions import VarNotAvailableError -from .ebas_file_index import EbasSQLRequest - - -class EbasVarInfo(BrowseDict): - """Interface for mapping between EBAS variable information and AeroCom - - For more information about EBAS variable and data information see - `EBAS website `__. 
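(Not part of the patch: the flag table above maps each numeric EBAS flag to a description and a one-letter validity class; judging from the descriptions, V marks valid, I invalid, M missing and H hidden values. A minimal sketch of loading such a table, assuming the CSV sits in the working directory:)

    import csv

    def read_ebas_flags(path="ebas_flags.csv"):
        """Return {flag_code: (description, validity)} from the flag table."""
        flags = {}
        with open(path, newline="") as fh:
            # descriptions contain commas but are wrapped in single quotes
            for row in csv.reader(fh, quotechar="'"):
                if row:
                    flags[int(row[0])] = (row[1].strip(), row[2].strip())
        return flags

    # e.g. read_ebas_flags()[999] -> ('Missing measurement, unspecified reason', 'M')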
- - Attributes - ---------- - var_name : str - AeroCom variable name - component : list - list of EBAS variable / component names that are mapped to - :attr:`var_name` - matrix : list, optional - list of EBAS matrix values that are accepted, default is None, i.e. - all available matrices are used - instrument : list, optional - list of all instruments that are accepted for this variable - requires : list, optional - for variables that are computed and not directly available in EBAS. - Provided as list of (AeroCom) variables that are required to - compute :attr:`var_name` (e.g. for `sc550dryaer` this would be - `[sc550aer,scrh]`). - scale_factor : float, optional - multiplicative scale factor that is applied in order to convert - EBAS variable into AeroCom variable (e.g. 1.4 for conversion of - EBAS OC measurement to AeroCom concoa variable) - - Parameters - ---------- - var_name : str - AeroCom variable name - init : bool - if True, EBAS configuration for input variable is retrieved from - data file ebas_config.ini (if possible) - **kwargs - additional keyword arguments (currently not used) - - """ - - def __init__(self, var_name: str, init: bool = True, **kwargs): - self.var_name = var_name - - self.component = None - - #: list of matrix names (EBAS side, optional) - self.matrix = None - - #: list of instrument names (EBAS side, optional) - self.instrument = None - - #: list containing variable statistics info (EBAS side, optional) - self.statistics = None - - #: list of additional variable required for retrieval of this variable - self.requires = None - - #: scale factor for conversion to Aerocom units - self.scale_factor = 1 - - # imports default information and, on top, variable information (if - # applicable) - if init: - self.parse_from_ini(var_name) - - @staticmethod - def PROVIDES_VARIABLES() -> list[str]: - """List specifying provided variables""" - info = EbasVarInfo.open_config() - return list(info) - - @staticmethod - def open_config(): - """Open ebas_config.ini file with `ConfigParser` - - Returns - ------- - ConfigParser - """ - - conf_reader = ConfigParser() - with resources.path("pyaro_readers.nilupmfebas", "ebas_config.ini") as path: - conf_reader.read(path) - return conf_reader - - @property - def var_name_aerocom(self) -> str: - """Variable name in AeroCom convention""" - return const.VARS[self.var_name].var_name_aerocom - - def parse_from_ini(self, var_name: str, conf_reader: ConfigParser | None = None): - """ - Parse EBAS info for input AeroCom variable (works also for aliases) - - Parameters - ---------- - var_name : str - AeroCom variable name - conf_reader : ConfigParser - open config parser object - - Raises - ------ - VarNotAvailableError - if variable is not supported - - Returns - ------- - bool - True, if default could be loaded, False if not - """ - if conf_reader is None: - conf_reader = self.open_config() - - if not var_name in conf_reader: - # this will raise Variable - var_name = const.VARS[var_name].var_name_aerocom - if not var_name in conf_reader: - raise VarNotAvailableError( - f"Variable {var_name} is not available in EBAS interface" - ) - - var_info = conf_reader[var_name] - for key in self.keys(): - if key in var_info: - val = var_info[key] - if key == "scale_factor": - self[key] = float(val.split("#")[0].strip()) - else: - self[key] = list(dict.fromkeys([x for x in val.split(",")])) - self.var_name = var_name - - def to_dict(self) -> dict: - """Convert into dictionary""" - d = {} - for k, v in self.items(): - if k == "unit": - k = "units" - if v is 
not None: - d[k] = v - return d - - def make_sql_request(self, **constraints) -> EbasSQLRequest: - """Create an SQL request for the specifications in this object - - Parameters - ---------- - constraints - request constraints deviating from default. For details on - parameters see :class:`EbasSQLRequest` - - Returns - ------- - EbasSQLRequest - the SQL request object that can be used to retrieve corresponding - file names using instance of :func:`EbasFileIndex.get_file_names`. - """ - if self.requires is not None: - raise ValueError( - f"This variable {self.var_name} requires other variables " - f"for reading, thus more than one SQL request is needed. " - f"Please use :func:`make_sql_requests` instead" - ) - - variables = self.component - - if variables is None: - raise AttributeError( - f"At least one component (Ebas variable name) " - f"must be specified for retrieval of variable {self.var_name}" - ) - - # default request - req = EbasSQLRequest( - variables=variables, - matrices=self.matrix, - instrument_types=self.instrument, - statistics=self.statistics, - ) - - req.update(**constraints) - return req - - def make_sql_requests(self, **constraints) -> list[EbasSQLRequest]: - """Create a list of SQL requests for the specifications in this object - - Parameters - ---------- - requests : dict, optional - other SQL requests linked to this one (e.g. if this variable - requires) - constraints - request constraints deviating from default. For details on - parameters see :class:`EbasSQLRequest` - - Returns - ------- - list - list of :class:`EbasSQLRequest` instances for this component and - potential required components. - """ - requests = {} - if self.component is not None: - req = EbasSQLRequest( - variables=self.component, - matrices=self.matrix, - instrument_types=self.instrument, - statistics=self.statistics, - ) - req.update(**constraints) - requests[self.var_name] = req - - if self.requires is not None: - for var in self.requires: - if var in requests: - # ToDo: check if this can be generalised better - raise ValueError( - f"Variable conflict in EBAS SQL request: " - f"{var} cannot depent on itself..." - ) - info = EbasVarInfo(var) - _reqs = info.make_sql_requests(**constraints) - for _var, _req in _reqs.items(): - if _var in requests: - # ToDo: check if this can be generalised better - raise ValueError( - f"Variable conflict in EBAS SQL request: " - f"{_var} cannot depent on itself..." - ) - requests[_var] = _req - - return requests - - def __str__(self) -> str: - head = f"Pyaerocom {type(self).__name__}" - s = f"\n{head}\n{len(head)*'-'}" - for k, v in self.items(): - s += f"\n{k}: {v}" - return s diff --git a/src/pyaro_readers/nilupmfebas/geodesy.py b/src/pyaro_readers/nilupmfebas/geodesy.py deleted file mode 100644 index 5503e75..0000000 --- a/src/pyaro_readers/nilupmfebas/geodesy.py +++ /dev/null @@ -1,376 +0,0 @@ -""" -Module for geographical calculations - -This module contains low-level methods to perform geographical calculations, -(e.g. 
distance between two coordinates) -""" -import logging -import os -from copy import deepcopy - -import geonum -import numpy as np -from geocoder_reverse_natural_earth import ( - Geocoder_Reverse_Exception, - Geocoder_Reverse_NE, -) - -from pyaerocom import const -from pyaerocom.helpers import isnumeric - -logger = logging.getLogger(__name__) - - -def calc_latlon_dists(latref, lonref, latlons): - """ - Calculate distances of (lat, lon) coords to input lat, lon coordinate - - Parameters - ---------- - latref : float - latitude of reference coordinate - lonref : float - longitude of reference coordinate - latlons : list - list of (lat, lon) tuples for which distances to (latref, lonref) are - computed - - Returns - ------- - list - list of computed geographic distances to input reference coordinate - for all (lat, lon) coords in `latlons` - - """ - return [haversine(latref, lonref, c[0], c[1]) for c in latlons] - - -def find_coord_indices_within_distance(latref, lonref, latlons, radius=1): - """ - Find indices of coordinates that match input coordinate - - Parameters - ---------- - latref : float - latitude of reference coordinate - lonref : float - longitude of reference coordinate - latlons : list - list of (lat, lon) tuples for which distances to (latref, lonref) are - computed - radius : float or int, optional - Maximum allowed distance to input coordinate. The default is 1. - - Returns - ------- - ndarray - Indices of latlon coordinates in :param:`latlons` that are within - the specified radius around (`latref`, `lonref`). The indices are - sorted by distance to the input coordinate, starting with the - closest - - """ - dists = np.asarray(calc_latlon_dists(latref, lonref, latlons)) - within_tol = np.where(dists < radius)[0] - # the following statement sorts all indices in dists that are within - # the tolerance radius, so the first entry in the returned aaray is the - # index of the closest coordinate within the radius and the last is the - # furthest - return within_tol[np.argsort(dists[within_tol])] - - -def get_country_info_coords(coords): - """ - Get country information for input lat/lon coordinates - - Parameters - ---------- - coords : list or tuple - list of coord tuples (lat, lon) or single coord tuple - - Raises - ------ - ValueError - if input format is incorrect - - Returns - ------- - list - list of dictionaries containing country information for each input - coordinate - """ - if isinstance(coords, np.ndarray): - coords = list(coords) - if not isinstance(coords, (list, tuple)): - raise ValueError("Invalid input for coords, need list or tuple or array") - - geo = Geocoder_Reverse_NE() - ret_list = [] - # that's what reverse_geocoder used to return - # (more a list of this) - ret_proto = {"city": "", "country_code": "", "code": ""} - - if isnumeric(coords[0]) and len(coords) == 2: # only one coordinate - lat, lon = coords - try: - dummy = geo.lookup(lat, lon) - except Geocoder_Reverse_Exception: - dummy = geo.lookup_nearest(lat, lon) - if dummy is None: - return [ret_proto] - # return [rg.get(coords)] - ret_dummy = deepcopy(ret_proto) - ret_dummy["country"] = dummy["NAME"] - ret_dummy["country_code"] = dummy["ISO_A2_EH"] - return [ret_dummy] - else: - for coord in coords: - ret_dummy = deepcopy(ret_proto) - lat, lon = coord - try: - dummy = geo.lookup(lat, lon) - except Geocoder_Reverse_Exception: - dummy = geo.lookup_nearest(lat, lon) - if dummy is not None: - ret_dummy["country"] = dummy["NAME"] - ret_dummy["country_code"] = dummy["ISO_A2_EH"] - - ret_list.append(ret_dummy) 
- return ret_list - # return rg.search(coords) - - -def get_topo_data( - lat0, - lon0, - lat1=None, - lon1=None, - topo_dataset="srtm", - topodata_loc=None, - try_etopo1=False, -): - """Retrieve topographic altitude for a certain location - - Supports topography datasets supported by geonum. - These are currently (20 Feb. 19) srtm (SRTM dataset, default, automatic access if online) and - etopo1 (ETOPO1 dataset, lower resolution, must be available on local machine or server). - - Parameters - ---------- - lat0 : float - start longitude for data extraction - lon0 : float - start latitude for data extraction - lat1 : float - stop longitude for data extraction (default: None). If None only - data around lon0, lat0 will be extracted. - lon1 : float - stop latitude for data extraction (default: None). - If None only data around lon0, lat0 will be extracted - topo_dataset : str - name of topography dataset - topodata_loc : str - filepath or directory containing supported topographic datasets - try_etopo1 : bool - if True and if access fails via input arg `topo_dataset`, then try - to access altitude using ETOPO1 dataset. - - Returns - ------- - geonum.TopoData - data object containing topography data in specified range - - Raises - ------ - ValueError - if altitude data cannot be accessed - """ - if topodata_loc is None: - if topo_dataset in const.SUPPLDIRS and os.path.exists( - const.SUPPLDIRS[topo_dataset] - ): - topodata_loc = const.SUPPLDIRS[topo_dataset] - logger.info( - f"Found default location for {topo_dataset} topodata at\n{topodata_loc}" - ) - - try: - access = geonum.TopoDataAccess(topo_dataset, local_path=topodata_loc) - topodata = access.get_data(lat0, lon0, lat1, lon1) - - return topodata - except Exception as e: - if try_etopo1 and not topo_dataset == "etopo1": - logger.warning( - f"Failed to access topography data for {topo_dataset}. " - f"Trying ETOPO1.\nError: {repr(e)}" - ) - return get_topo_data( - lat0, - lon0, - lat1, - lon1, - topo_dataset="etopo1", - topodata_loc=topodata_loc, - try_etopo1=False, - ) - raise - - -def get_topo_altitude( - lat, lon, topo_dataset="srtm", topodata_loc=None, try_etopo1=True -): - """Retrieve topographic altitude for a certain location - - Supports topography datasets supported by geonum. - These are currently (20 Feb. 19) srtm (SRTM dataset, default, automatic access if online) and - etopo1 (ETOPO1 dataset, lower resolution, must be available on local machine or server). - - Parameters - ---------- - lat : float - latitude of coordinate - lon : float - longitude of coordinate - topo_dataset : str - name of topography dataset - topodata_loc : str - filepath or directory containing supported topographic datasets - try_etopo1 : bool - if True and if access fails via input arg `topo_dataset`, then try - to access altitude using ETOPO1 dataset. - - Returns - ------- - dict - dictionary containing input latitude, longitude, altitude and - topographic dataset name used to retrieve the altitude. 
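(Not part of the patch: the two distance helpers above, calc_latlon_dists and find_coord_indices_within_distance, boil down to a haversine distance plus a radius filter sorted by distance. A self-contained sketch of that logic; the station coordinates and the 50 km radius are made up for illustration:)

    import numpy as np

    def _haversine_km(lat0, lon0, lat1, lon1, earth_radius=6371.0):
        # standard haversine formula (same approach as the module's haversine below)
        hav = lambda d: np.sin(d / 2.0) ** 2
        d_lat = np.radians(lat1 - lat0)
        d_lon = np.radians(lon1 - lon0)
        a = hav(d_lat) + np.cos(np.radians(lat0)) * np.cos(np.radians(lat1)) * hav(d_lon)
        return earth_radius * 2 * np.arcsin(np.sqrt(a))

    # hypothetical station list as (lat, lon) tuples and a reference coordinate
    stations = [(60.0, 10.0), (59.9, 10.8), (78.9, 11.9)]
    ref_lat, ref_lon = 59.95, 10.75

    dists = np.array([_haversine_km(ref_lat, ref_lon, lat, lon) for lat, lon in stations])
    within = np.where(dists < 50)[0]                    # indices within a 50 km radius
    nearest_first = within[np.argsort(dists[within])]   # sorted by distance, closest first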
- - Raises - ------ - ValueError - if altitude data cannot be accessed - """ - return get_topo_data( - lat, - lon, - topo_dataset=topo_dataset, - topodata_loc=topodata_loc, - try_etopo1=try_etopo1, - )(lat, lon) - - -def calc_distance( - lat0, lon0, lat1, lon1, alt0=None, alt1=None, auto_altitude_srtm=False -): - """Calculate distance between two coordinates - - Parameters - ---------- - lat0 : float - latitude of first point in decimal degrees - lon0 : float - longitude of first point in decimal degrees - lat1 : float - latitude of secondpoint in decimal degrees - lon1 : float - longitude of second point in decimal degrees - alt0 : :obj:`float`, optional - altitude of first point in m - alt1 : :obj:`float`, optional - altitude of second point in m - auto_altitude_srtm : bool - if True, then all altitudes that are unspecified are set to the - corresponding topographic altitude of that coordinate, using SRTM - (only works for coordinates where SRTM topographic data is accessible). - - Returns - ------- - float - distance between points in km - """ - p0 = geonum.GeoPoint(lat0, lon0, alt0, auto_topo_access=auto_altitude_srtm) - p1 = geonum.GeoPoint(lat1, lon1, alt1, auto_topo_access=auto_altitude_srtm) - if auto_altitude_srtm and p0.altitude_err == p0._ALTERR_DEFAULT: - raise ValueError( - f"Failed to access topographic height for coord {p0} using SRTM topographic database" - ) - if auto_altitude_srtm and p1.altitude_err == p1._ALTERR_DEFAULT: - raise ValueError( - f"Failed to access topographic height for coord {p1} using SRTM topographic database" - ) - return (p0 - p1).magnitude - - -def is_within_radius_km(lat0, lon0, lat1, lon1, maxdist_km, alt0=0, alt1=0, **kwargs): - """Checks if two lon/lat coordinates are within a certain distance to each other - - Parameters - ---------- - lat0 : float - latitude of first point in decimal degrees - lon0 : float - longitude of first point in decimal degrees - lat1 : float - latitude of second point in decimal degrees - lon1 : float - longitude of second point in decimal degrees - maxdist_km : float - maximum distance between two points in km - alt0 : float - altitude of first point in m - alt1 : float - altitude of second point in m - - Returns - ------- - bool - True, if coordinates are within specified distance to each other, else - False - - """ - dist = calc_distance(lat0, lon0, lat1, lon1, alt0=alt0, alt1=alt1) - if dist <= maxdist_km: - return True - return False - - -def haversine(lat0, lon0, lat1, lon1, earth_radius=6371.0): - """Haversine formula - - Approximate horizontal distance between 2 points assuming a spherical - earth using haversine formula. - - Note - ---- - This code was copied from geonum library (date 12/11/2018, J. 
Gliss) - - Parameters - ---------- - lat0 : float - latitude of first point in decimal degrees - lon0 : float - longitude of first point in decimal degrees - lat1 : float - latitude of second point in decimal degrees - lon1 : float - longitude of second point in decimal degrees - earth_radius : float - average earth radius in km, defaults to 6371.0 - - Returns - -------- - float - horizontal distance between input coordinates in km - """ - hav = lambda d_theta: np.sin(d_theta / 2.0) ** 2 - - d_lon = np.radians(lon1 - lon0) - d_lat = np.radians(lat1 - lat0) - lat0 = np.radians(lat0) - lat1 = np.radians(lat1) - - a = hav(d_lat) + np.cos(lat0) * np.cos(lat1) * hav(d_lon) - c = 2 * np.arcsin(np.sqrt(a)) - - return earth_radius * c diff --git a/src/pyaro_readers/nilupmfebas/grid_io.py b/src/pyaro_readers/nilupmfebas/grid_io.py deleted file mode 100644 index 884876b..0000000 --- a/src/pyaro_readers/nilupmfebas/grid_io.py +++ /dev/null @@ -1,170 +0,0 @@ -from ._lowlevel_helpers import dict_to_str -from .time_config import TS_TYPES - - -class GridIO: - """Global I/O settings for gridded data - - This class includes options related to the import of gridded data. This - includes both options related to file search as well as preprocessing - options. - - Attributes - ---------- - FILE_TYPE : str - file type of data files. Defaults to .nc - TS_TYPES : list - list of strings specifying temporal resolution options encrypted in - file names. - PERFORM_FMT_CHECKS : bool - perform formatting checks when reading netcdf data, using metadata - encoded in filenames (requires that NetCDF file follows a registered - naming convention) - DEL_TIME_BOUNDS : bool - if True, preexisting bounds on time are deleted when grid data is - loaded. Else, nothing is done. Aerocom default is True - SHIFT_LONS : bool - if True, longitudes are shifted to - -180 <= lon <= 180 when data is loaded (in case they are defined - 0 <= lon <= 360. Aerocom default is True. - CHECK_TIME_FILENAME : bool - the times stored in NetCDF files may be wrong or not stored according - to the CF conventions. If True, the times are checked and if - :attr:`CORRECT_TIME_FILENAME`, corrected for on data import based what - is encrypted in the - file name. In case of Aerocom models, it is ensured that the filename - contains both the year and the temporal resolution in the filenames - (for details see :class:`pyaerocom.io.FileConventionRead`). - Aerocom default is True - CORRECT_TIME_FILENAME : bool - if True and time dimension in data is found to be different from - filename, it is attempted to be corrected - EQUALISE_METADATA : bool - if True (and if metadata varies between different NetCDF files that are - supposed to be merged in time), the metadata in all loaded objects is - unified based on the metadata of the first grid (otherwise, - concatenating them in time might not work using the Iris interface). - This might need to be reviewed and should be used with care if - specific metadata aspects of individual files need to be accessed. - Aerocom default is True - USE_FILECONVENTION : bool - if True, file names are strictly required to follow one of the file - naming conventions that can be specified in the file - `file_conventions.ini `__. Aerocom default is True. - INCLUDE_SUBDIRS : bool - if True, search for files is expanded to all subdirecories included in - data directory. Aerocom default is False. - INFER_SURFACE_LEVEL : bool - if True then surface level for 4D gridded data is inferred automatically - when necessary (e.g. 
when extracting surface time series from 4D - gridded data object that does not contain sufficient information about - vertical dimension) - - """ - - UNITS_ALIASES = {"/m": "m-1"} - _AEROCOM = { - "FILE_TYPE": ".nc", - "PERFORM_FMT_CHECKS": True, - "DEL_TIME_BOUNDS": True, - "SHIFT_LONS": True, - "CHECK_TIME_FILENAME": True, - "CORRECT_TIME_FILENAME": True, - "CHECK_DIM_COORDS": True, - "EQUALISE_METADATA": True, - "INCLUDE_SUBDIRS": False, - } - - _DEFAULT = { - "FILE_TYPE": ".nc", - "PERFORM_FMT_CHECKS": True, - "DEL_TIME_BOUNDS": True, - "SHIFT_LONS": True, - "CHECK_TIME_FILENAME": True, - "CORRECT_TIME_FILENAME": True, - "CHECK_DIM_COORDS": True, - "EQUALISE_METADATA": True, - "INCLUDE_SUBDIRS": False, - } - - def __init__(self, **kwargs): - self.FILE_TYPE = ".nc" - # it is important to keep them in the order from highest to lowest - # resolution - self.TS_TYPES = TS_TYPES - - self.PERFORM_FMT_CHECKS = True - - # delete time bounds if they exist in netCDF files - self.DEL_TIME_BOUNDS = True - # shift longitudes to -180 -> 180 repr (if applicable) - self.SHIFT_LONS = True - - self.CHECK_TIME_FILENAME = True - self.CORRECT_TIME_FILENAME = True - - self.CHECK_DIM_COORDS = False - # check and update metadata dictionary on Cube load since - # iris concatenate of Cubes only works if metadata is equal - - self.EQUALISE_METADATA = True - - self.INCLUDE_SUBDIRS = False - - self.INFER_SURFACE_LEVEL = True - - self.load_default() - - def load_aerocom_default(self): - self.from_dict(self._AEROCOM) - - def load_default(self): - self.from_dict(self._DEFAULT) - - def to_dict(self): - """Convert object to dictionary - - Returns - ------- - dict - settings dictionary - """ - return self.__dict__ - - def from_dict(self, dictionary=None, **settings): - """Import settings from dictionary""" - if not dictionary: - dictionary = {} - dictionary.update(settings) - for key, val in dictionary.items(): - self[key] = val - - def __setitem__(self, key, value): - """Set item - - GridIO[""] = value <=> GridIO. = value - <=> GridIO.__setitem__(, value) - - Raises - ------ - IOError - if key is not a valid setting - """ - if not key in self.__dict__: - raise OSError("Could not update IO setting: Invalid key") - self.__dict__[key] = value - - def __getitem__(self, key): - """Get item using curly brackets - - GridIO[""] => value - - """ - if not key in self.__dict__: - raise OSError("Invalid attribute") - return self.__dict__[key] - - def __str__(self): - head = f"Pyaerocom {type(self).__name__}" - return "\n{}\n{}\n{}".format(head, len(head) * "-", dict_to_str(self.to_dict())) diff --git a/src/pyaro_readers/nilupmfebas/helpers.py b/src/pyaro_readers/nilupmfebas/helpers.py deleted file mode 100644 index 87efc35..0000000 --- a/src/pyaro_readers/nilupmfebas/helpers.py +++ /dev/null @@ -1,1824 +0,0 @@ -""" -General helper methods for the pyaerocom library. -""" -from __future__ import annotations - -import logging -import math as ma -from collections import Counter -from datetime import MINYEAR, date, datetime - -import iris -import iris.analysis -import iris.coords -import iris.cube -import numpy as np -import pandas as pd -import xarray as xr -from cf_units import Unit - -from . 
import const -from ._warnings import ignore_warnings -from .exceptions import ( - DataCoverageError, - DataDimensionError, - LongitudeConstraintError, - MetaDataError, - ResamplingError, - TemporalResolutionError, - VariableDefinitionError, -) -from .time_config import ( - GREGORIAN_BASE, - PANDAS_RESAMPLE_OFFSETS, - TS_TYPE_DATETIME_CONV, - TS_TYPE_SECS, - TS_TYPE_TO_PANDAS_FREQ, - day_units, - hr_units, - microsec_units, - millisec_units, - min_units, - sec_units, -) -from .tstype import TsType -from .variable_helpers import get_variable - -logger = logging.getLogger(__name__) - -NUM_KEYS_META = ["longitude", "latitude", "altitude"] - -STR_TO_IRIS = dict( - count=iris.analysis.COUNT, - gmean=iris.analysis.GMEAN, - hmean=iris.analysis.HMEAN, - max=iris.analysis.MAX, - mean=iris.analysis.MEAN, - median=iris.analysis.MEDIAN, - sum=iris.analysis.SUM, - nearest=iris.analysis.Nearest, - linear=iris.analysis.Linear, - areaweighted=iris.analysis.AreaWeighted, -) - - -def varlist_aerocom(varlist): - if isinstance(varlist, str): - varlist = [varlist] - elif not isinstance(varlist, list): - raise ValueError("Need string or list") - output = [] - for var in varlist: - try: - _var = const.VARS[var].var_name_aerocom - if not _var in output: - output.append(_var) - except VariableDefinitionError as e: - logger.warning(repr(e)) - if len(output) == 0: - raise ValueError("None of the input variables appears to be valid") - return output - - -def delete_all_coords_cube(cube, inplace=True): - """Delete all coordinates of an iris cube - - Parameters - ---------- - cube : iris.cube.Cube - input cube that is supposed to be cleared of coordinates - inplace : bool - if True, then the coordinates are deleted in the input object, else in - a copy of it - - Returns - ------- - iris.cube.Cube - input cube without coordinates - """ - if not inplace: - cube = cube.copy() - - for aux_fac in cube.aux_factories: - cube.remove_aux_factory(aux_fac) - - for coord in cube.coords(): - cube.remove_coord(coord) - return cube - - -def extract_latlon_dataarray( - arr, - lat, - lon, - lat_dimname=None, - lon_dimname=None, - method="nearest", - new_index_name=None, - check_domain=True, -): - """Extract individual lat / lon coordinates from `DataArray` - - Parameters - ---------- - arr : DataArray - data (must contain lat and lon dimensions) - lat : array or similar - 1D array containing latitude coordinates - lon : array or similar - 1D array containing longitude coordinates - lat_dimname : str, optional - name of latitude dimension in input data (if None, it assumes standard - name) - lon_dimname : str, optional - name of longitude dimension in input data (if None, it assumes standard - name) - method : str - how to interpolate to input coordinates (defaults to nearest neighbour) - new_index_name : str, optional - name of flattend latlon dimension (defaults to latlon) - check_domain : bool - if True, lat/lon domain of datarray is checked and all input coordinates - that are outside of the domain are ignored. 
- - Returns - ------- - DataArray - data at input coordinates - """ - if lat_dimname is None: - lat_dimname = "lat" - if lon_dimname is None: - lon_dimname = "lon" - if not lat_dimname in arr.dims and lat_dimname == "lat": - for alias in const.COORDINFO["lat"].aliases: - if alias in arr.dims: - lat_dimname = alias - break - if not lon_dimname in arr.dims and lon_dimname == "lon": - for alias in const.COORDINFO["lon"].aliases: - if alias in arr.dims: - lon_dimname = alias - break - if isinstance(lat, str): - lat = [lat] - if isinstance(lon, str): - lon = [lon] - if check_domain: - arr_lat = arr[lat_dimname].data - arr_lon = arr[lon_dimname].data - lat0, lat1 = arr_lat.min(), arr_lat.max() - lon0, lon1 = arr_lon.min(), arr_lon.max() - new_lat = [] - new_lon = [] - for x, y in zip(lat, lon): - if (lat0 <= x <= lat1) and (lon0 <= y <= lon1): - new_lat.append(x) - new_lon.append(y) - if len(new_lat) == 0 and len(new_lon) == 0: - raise DataCoverageError("Coordinates not found in dataarray") - lat, lon = new_lat, new_lon - if new_index_name is None: - new_index_name = "latlon" - where = { - lat_dimname: xr.DataArray(lat, dims=new_index_name), - lon_dimname: xr.DataArray(lon, dims=new_index_name), - } - subset = arr.sel(where, method=method) - subset.attrs["lat_dimname"] = lat_dimname - subset.attrs["lon_dimname"] = lon_dimname - return subset - - -def lists_to_tuple_list(*lists): - """Convert input lists (of same length) into list of tuples - - e.g. input 2 lists of latitude and longitude coords, output one list - with tuple coordinates at each index - """ - return list(zip(*lists)) - - -def tuple_list_to_lists(tuple_list): - """Convert list with tuples (e.g. (lat, lon)) into multiple lists""" - return list(map(list, zip(tuple_list))) - - -def make_dummy_cube_latlon( - lat_res_deg: float = 2, - lon_res_deg: float = 3, - lat_range: list[float] | tuple[float, float] = (-90, 90), - lon_range: list[float] | tuple[float, float] = (-180, 180), -): - """Make an empty Cube with given latitude and longitude resolution - - Dimensions will be lat, lon - - Parameters - ---------- - lat_res_deg : float or int - latitude resolution of grid - lon_res_deg : float or int - longitude resolution of grid - lat_range : tuple or list - 2-element list containing latitude range. If `None`, then `(-90, 90)` - is used. - lon_range : tuple or list - 2-element list containing longitude range. If `None`, then `(-180, 180)` - is used. 
- - Returns - ------- - Cube - dummy cube in input resolution - """ - - # Accept lists for lat_range and lon_range, but make sure correct length - assert len(lat_range) == len(lon_range) == 2 - - lons = np.arange( - lon_range[0] + (lon_res_deg / 2), lon_range[1] + (lon_res_deg / 2), lon_res_deg - ) - lats = np.arange( - lat_range[0] + (lat_res_deg / 2), lat_range[1] + (lat_res_deg / 2), lat_res_deg - ) - - lon_circ = check_coord_circular(lons, modulus=360) - latdim = iris.coords.DimCoord( - lats, - var_name="lat", - standard_name="latitude", - circular=False, - units=Unit("degrees"), - ) - - londim = iris.coords.DimCoord( - lons, - var_name="lon", - standard_name="longitude", - circular=lon_circ, - units=Unit("degrees"), - ) - - latdim.guess_bounds() - londim.guess_bounds() - dummy = iris.cube.Cube(np.ones((len(lats), len(lons)))) - - dummy.add_dim_coord(latdim, 0) - dummy.add_dim_coord(londim, 1) - dummy.var_name = "dummy_grid" - - return dummy - - -def check_coord_circular(coord_vals, modulus, rtol=1e-5): - """Check circularity of coordinate - - Parameters - ---------- - coord_vals : list or ndarray - values of coordinate to be tested - modulus : float or int - modulus of coordinate (e.g. 360 for longitude) - rtol : float - relative tolerance - - Returns - ------- - bool - True if circularity is given, else False - - Raises - ------ - ValueError - if circularity is given and results in overlap (right end of input - array is mapped to a value larger than the first one at the left end - of the array) - - """ - from pyaerocom import const - - if len(coord_vals) < 2: - logger.warning( - "Checking coordinate values for circularity " - "failed since coord array has less than 2 values" - ) - return False - step = coord_vals[-1] - coord_vals[-2] - tol = step * rtol - diff = coord_vals[-1] - coord_vals[0] + step - if diff - tol > modulus: - raise ValueError( - "Circularity is given but results in overlap (right " - "end of input array is mapped to a value larger than " - "the first one at the left end of the array)." 
- ) - if abs(modulus - diff) > tol: - return False - return True - - -def numpy_to_cube(data, dims=None, var_name=None, units=None, **attrs): - """Make a cube from a numpy array - - Parameters - ---------- - data : ndarray - input data - dims : list, optional - list of :class:`iris.coord.DimCoord` instances in order of dimensions - of input data array (length of list and shapes of each of the - coordinates must match dimensions of input data) - var_name : str, optional - name of variable - units : str - unit of variable - **attrs - additional attributes to be added to metadata - - Returns - ------- - iris.cube.Cube - - Raises - ------ - DataDimensionError - if input `dims` is specified and results in conflict - """ - if not isinstance(data, np.ndarray): - raise ValueError("Invalid input, need numpy array") - cube = iris.cube.Cube(data) - - cube.var_name = var_name - cube.units = units - - sh = data.shape - if dims is not None: - if not len(dims) == data.ndim: - raise DataDimensionError( - "Input number of dimensios must match array dimension number" - ) - for i, dim in enumerate(dims): - if not isinstance(dim, iris.coords.DimCoord): - raise ValueError("Need iris.DimCoord...") - elif not len(dim.points) == sh[i]: - raise DataDimensionError( - f"Length mismatch between {dim.var_name} dim ({len(dim.points)}) " - f"and array dimension {i} ({sh[i]})" - ) - cube.add_dim_coord(dim, i) - - cube.attributes.update(attrs) - return cube - - -def copy_coords_cube(to_cube, from_cube, inplace=True): - """Copy all coordinates from one cube to another - - Requires the underlying data to be the same shape. - - Warning - -------- - This operation will delete all existing coordinates and auxiliary - coordinates and will then copy the ones from the input data object. - No checks of any kind will be performed - - Parameters - ---------- - to_cube - other : GriddedData or Cube - other data object (needs to be same shape as this object) - - Returns - ------- - GriddedData - data object containing coordinates from other object - """ - if not all([isinstance(x, iris.cube.Cube) for x in [to_cube, from_cube]]): - raise ValueError("Invalid input. Need instances of iris.cube.Cube class...") - - if not from_cube.shape == to_cube.shape: - raise DataDimensionError("Cannot copy coordinates: shape mismatch") - - to_cube = delete_all_coords_cube(to_cube, inplace) - - for i, dim_coord in enumerate(from_cube.dim_coords): - to_cube.add_dim_coord(dim_coord, i) - - for aux_coord, dim in from_cube._aux_coords_and_dims: - to_cube.add_aux_coord(aux_coord, dim) - - for aux_fac in from_cube.aux_factories: - to_cube.add_aux_factory(aux_fac) - return to_cube - - -def infer_time_resolution(time_stamps, dt_tol_percent=5, minfrac_most_common=0.8): - """Infer time resolution based on input time-stamps - - Calculates time difference *dt* between consecutive timestamps provided via - input array or list. Then it counts the most common *dt* (e.g. 86400 s for - daily). Before inferring the frequency it then checks all other *dts* - occurring in the input array to see if they are within a certain interval - around the most common one (e.g. +/- 5% as default, via arg - `dt_tol_percent`), that is, 86390 would be included if most common dt is - 86400 s but not 80000s. Then it checks if the number of *dts* that - are within that tolerance level around the most common *dt* exceed a - certain fraction (arg `minfrac_most_common`) of the total number of *dts* - that occur in the input array (default is 80%). 
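The `numpy_to_cube` helper above attaches iris `DimCoord` objects to a raw numpy array, one per axis, after checking that the lengths match. A small sketch of the same construction using only the iris calls that appear in the deleted code; grid size, coordinate values and variable name are invented for the example:

```python
import numpy as np
import iris.coords
import iris.cube
from cf_units import Unit

data = np.zeros((4, 8))  # e.g. 4 latitudes x 8 longitudes

latdim = iris.coords.DimCoord(
    np.linspace(-67.5, 67.5, 4), var_name="lat",
    standard_name="latitude", units=Unit("degrees"),
)
londim = iris.coords.DimCoord(
    np.linspace(-157.5, 157.5, 8), var_name="lon",
    standard_name="longitude", units=Unit("degrees"),
)

cube = iris.cube.Cube(data)
cube.var_name = "dummy"
for i, dim in enumerate((latdim, londim)):
    # coordinate length must match the corresponding array axis
    assert len(dim.points) == data.shape[i]
    cube.add_dim_coord(dim, i)
```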
If that is the case, the - most common frequency is attempted to be derived using - :func:`TsType.from_total_seconds` based on the most common *dt* (in this - example that would be *daily*). - - - Parameters - ---------- - time_stamps : pandas.DatetimeIndex, or similar - list of time stamps - dt_tol_percent : int - tolerance in percent of accepted range of time diffs with respect to - most common time difference. - minfrac_most_common : float - minimum required fraction of time diffs that have to be equal to, or - within tolerance range, the most common time difference. - - - Raises - ------ - TemporalResolutionError - if frequency cannot be derived. - - Returns - ------- - str - inferred frequency - """ - from pyaerocom import TsType - - if not isinstance(time_stamps, pd.DatetimeIndex): - time_stamps = pd.DatetimeIndex(time_stamps) - vals = time_stamps.values - - dts = (vals[1:] - vals[:-1]).astype("timedelta64[s]").astype(int) - - if np.min(dts) < 0: - raise TemporalResolutionError("Nasa Ames file contains neg. meas periods...") - - counts = Counter(dts).most_common() - most_common_dt, most_common_num = counts[0] - num_within_tol = most_common_num - lower = most_common_dt * (100 - dt_tol_percent) / 100 - upper = most_common_dt * (100 + dt_tol_percent) / 100 - for dt, num in counts[1:]: - if lower <= dt <= upper: - num_within_tol += num - frac_ok = num_within_tol / len(dts) - if not frac_ok > minfrac_most_common: - raise TemporalResolutionError("Failed to infer ts_type") - tst = TsType.from_total_seconds(most_common_dt) - return str(tst) - - -def seconds_in_periods(timestamps, ts_type): - """ - Calculates the number of seconds for each period in timestamps. - - Parameters - ---------- - timestamps : numpy.datetime64 or numpy.ndarray - Either a single datetime or an array of datetimes. - ts_type : str - Frequency of timestamps. - - Returns - ------- - np.array : - Array with same length as timestamps containing number of seconds for - each period. - """ - - ts_type = TsType(ts_type) - if isinstance(timestamps, np.datetime64): - timestamps = np.array([timestamps]) - if isinstance(timestamps, np.ndarray): - timestamps = [to_pandas_timestamp(timestamp) for timestamp in timestamps] - # From here on timestamps should be a numpy array containing pandas Timestamps - - seconds_in_day = 86400 - if ts_type >= TsType("monthly"): - if ts_type == TsType("monthly"): - days_in_months = np.array( - [timestamp.days_in_month for timestamp in timestamps] - ) - seconds = days_in_months * seconds_in_day - return seconds - if ts_type == TsType("daily"): - return seconds_in_day * np.ones_like(timestamps) - raise NotImplementedError( - "Only yearly, monthly and daily frequencies implemented." - ) - elif ts_type == TsType("yearly"): - days_in_year = [] - for ts in timestamps: - if ts.year % 4 == 0: - days_in_year.append(366) # Leap year - else: - days_in_year.append(365) - seconds = np.array(days_in_year) * seconds_in_day - return seconds - raise TemporalResolutionError(f"Unknown TsType: {ts_type}") - - -def get_tot_number_of_seconds(ts_type, dtime=None): - """Get total no. of seconds for a given frequency - - ToDo - ---- - This method needs revision and can be solved simpler probably - - Parameters - ---------- - ts_type : str or TsType - frequency for which number of seconds is supposed to be retrieved - dtime : TYPE, optional - DESCRIPTION. The default is None. - - Raises - ------ - AttributeError - DESCRIPTION. - - Returns - ------- - TYPE - DESCRIPTION. 
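The core of `infer_time_resolution` is counting the gaps between consecutive timestamps and checking that the most common gap dominates within a tolerance. A stripped-down sketch without the pyaerocom `TsType` machinery; the final seconds-to-name mapping is a hand-rolled assumption standing in for `TsType.from_total_seconds`:

```python
from collections import Counter

import numpy as np
import pandas as pd

def guess_freq(time_stamps, dt_tol_percent=5, minfrac_most_common=0.8):
    idx = pd.DatetimeIndex(time_stamps)
    dts = np.diff(idx.values).astype("timedelta64[s]").astype(int)
    counts = Counter(dts)
    most_common_dt, _ = counts.most_common()[0]
    lo = most_common_dt * (100 - dt_tol_percent) / 100
    hi = most_common_dt * (100 + dt_tol_percent) / 100
    num_ok = sum(n for dt, n in counts.items() if lo <= dt <= hi)
    if num_ok / len(dts) <= minfrac_most_common:
        raise ValueError("could not infer frequency")
    # crude seconds -> name mapping (assumption, not TsType)
    return {86400: "daily", 3600: "hourly", 60: "minutely"}.get(
        most_common_dt, f"{most_common_dt}s"
    )

times = pd.date_range("2020-01-01", periods=30, freq="D").delete([5, 6])
print(guess_freq(times))  # 'daily' despite the two missing days
```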
- - """ - - ts_tpe = TsType(ts_type) - - if ts_tpe >= TsType("monthly"): - if dtime is None: - raise AttributeError( - "For frequncies larger than or eq. monthly you" - + " need to provide dtime in order to compute the number of second." - ) - if not ts_type == "monthly": - raise NotImplementedError("Can only handle monthly so far...") - - # find seconds from dtime - # TODO generalize this - days_in_month = dtime.dt.daysinmonth - - return days_in_month * 24 * 60 * 60 - else: - return TS_TYPE_SECS[ts_type] - - -def get_standard_name(var_name): - """Converts AeroCom variable name to CF standard name - - Also handles alias names for variables, etc. or strings corresponding to - older conventions (e.g. names containing 3D). - - Parameters - ---------- - var_name : str - AeroCom variable name - - Returns - ------- - str - corresponding standard name - """ - from pyaerocom import const - - return const.VARS[var_name].standard_name - - -def get_standard_unit(var_name): - """Gets standard unit of AeroCom variable - - Also handles alias names for variables, etc. or strings corresponding to - older conventions (e.g. names containing 3D). - - Parameters - ---------- - var_name : str - AeroCom variable name - - Returns - ------- - str - corresponding standard unit - """ - from pyaerocom import const - - return const.VARS[var_name].units - - -def get_lowest_resolution(ts_type, *ts_types): - """Get the lowest resolution from several ts_type codes - - Parameters - ---------- - ts_type : str - first ts_type - *ts_types - one or more additional ts_type codes - - Returns - ------- - str - the ts_type that corresponds to the lowest resolution - - Raises - ------ - ValueError - if one of the input ts_type codes is not supported - """ - # all_ts_types = const.GRID_IO.TS_TYPES - from pyaerocom.tstype import TsType - - lowest = TsType(ts_type) - for freq in ts_types: - _temp = TsType(freq) - if _temp < lowest: - lowest = _temp - return lowest.val - - -def sort_ts_types(ts_types): - """Sort a list of ts_types - - Parameters - ---------- - ts_types : list - list of strings (or instance of :class:`TsType`) to be sorted - - Returns - ------- - list - list of strings with sorted frequencies - - Raises - ------ - TemporalResolutionError - if one of the input ts_types is not supported - """ - freqs_sorted = [] - for ts_type in ts_types: - if isinstance(ts_type, str): - ts_type = TsType(ts_type) - if len(freqs_sorted) == 0: - freqs_sorted.append(ts_type) - else: - insert = False - for i, tt in enumerate(freqs_sorted): - if tt < ts_type: - insert = True - break - if insert: - freqs_sorted.insert(i, ts_type) - else: - freqs_sorted.append(ts_type) - return [str(tt) for tt in freqs_sorted] - - -def get_highest_resolution(ts_type, *ts_types): - """Get the highest resolution from several ts_type codes - - Parameters - ---------- - ts_type : str - first ts_type - *ts_types - one or more additional ts_type codes - - Returns - ------- - str - the ts_type that corresponds to the highest resolution - - Raises - ------ - ValueError - if one of the input ts_type codes is not supported - """ - lst = [ts_type] - lst.extend(ts_types) - return sort_ts_types(lst)[0] - - -def isnumeric(val): - """Check if input value is numeric - - Parameters - ---------- - val - input value to be checked - - Returns - ------- - bool - True, if input value corresponds to a range, else False. 
- """ - from numbers import Number - - if isinstance(val, Number): - return True - return False - - -def isrange(val): - """Check if input value corresponds to a range - - Checks if input is list, or array or tuple with 2 entries, or alternatively - a slice that has defined start and stop and has set step to None. - - Note - ---- - No check is performed, whether first entry is smaller than second entry if - all requirements for a range are fulfilled. - - Parameters - ---------- - val - input value to be checked - - Returns - ------- - bool - True, if input value corresponds to a range, else False. - """ - if isinstance(val, (list, np.ndarray, tuple)): - if len(val) == 2: - return True - return False - elif isinstance(val, slice): - if val.step is not None or val.start is None or val.stop is None: - return False - return True - return False - - -def _check_stats_merge(statlist, var_name, pref_attr, fill_missing_nan): - has_errs = False - is_3d = [] - stats = [] - for stat in statlist: - if not var_name in stat: - raise DataCoverageError(f"All input stations must contain {var_name} data") - elif pref_attr is not None and not pref_attr in stat: - raise MetaDataError( - f"Cannot sort station relevance by attribute {pref_attr}. " - f"At least one of the input stations does not contain this attribute" - ) - elif not isinstance(stat[var_name], pd.Series): - stat._to_ts_helper(var_name) - # this will raise MetaDataError or TemporalResolutionError if there is - # an unresolvable issue with sampling frequency - stat.get_var_ts_type(var_name) - - is_3d.append(stat.check_if_3d(var_name)) - - if var_name in stat.data_err: - has_errs = True - - stats.append(stat) - if np.any(is_3d): - if not np.all(is_3d): - raise ValueError( - "Merge error: some of the input stations contain " - "altitude info (suggesting profile data), others " - "not." 
- ) - is_3d = True - else: - is_3d = False - return (stats, is_3d, has_errs) - - -def _merge_stats_2d( - stats, - var_name, - sort_by_largest, - pref_attr, - add_meta_keys, - resample_how, - min_num_obs, -): - if pref_attr is not None: - stats.sort(key=lambda s: s[pref_attr]) - else: - stats.sort(key=lambda s: len(s[var_name].dropna())) - - if sort_by_largest: - stats = stats[::-1] - - # remove first station from the list - merged = stats.pop(0) - for i, stat in enumerate(stats): - merged.merge_other( - stat, - var_name, - add_meta_keys=add_meta_keys, - resample_how=resample_how, - min_num_obs=min_num_obs, - ) - return merged - - -def _merge_stats_3d(stats, var_name, add_meta_keys, has_errs): - dtime = [] - for stat in stats: - _t = stat[var_name].index.unique() - if not len(_t) == 1: - raise NotImplementedError( - "So far, merging of profile data " - "requires that profile values are " - "sampled at the same time" - ) - dtime.append(_t[0]) - tidx = pd.DatetimeIndex(dtime) - - # AeroCom default vertical grid - vert_grid = const.make_default_vert_grid() - _data = np.ones((len(vert_grid), len(tidx))) * np.nan - if has_errs: - _data_err = np.ones((len(vert_grid), len(tidx))) * np.nan - - for i, stat in enumerate(stats): - if i == 0: - merged = stat - else: - merged.merge_meta_same_station(stat, add_meta_keys=add_meta_keys) - - _data[:, i] = np.interp(vert_grid, stat["altitude"], stat[var_name].values) - - if has_errs: - try: - _data_err[:, i] = np.interp( - vert_grid, stat["altitude"], stat.data_err[var_name] - ) - except Exception: - pass - _coords = {"time": tidx, "altitude": vert_grid} - - d = xr.DataArray( - data=_data, coords=_coords, dims=["altitude", "time"], name=var_name - ) - d = d.sortby("time") - merged[var_name] = d - merged.dtime = d.time - merged.altitude = d.altitude - return merged - - -def merge_station_data( - stats, - var_name, - pref_attr=None, - sort_by_largest=True, - fill_missing_nan=True, - add_meta_keys=None, - resample_how=None, - min_num_obs=None, -): - """Merge multiple StationData objects (from one station) into one instance - - Note - ---- - all input :class:`StationData` objects need to have same attributes - ``station_name``, ``latitude``, ``longitude`` and ``altitude`` - - Parameters - ---------- - stats : list - list containing :class:`StationData` objects (note: all of these - objects must contain variable data for the specified input variable) - var_name : str - data variable name that is to be merged - pref_attr - optional argument that may be used to specify a metadata attribute - that is available in all input :class:`StationData` objects and that - is used to order the input stations by relevance. The associated values - of this attribute need to be sortable (e.g. revision_date). This is - only relevant in case overlaps occur. If unspecified the relevance of - the stations is sorted based on the length of the associated data - arrays. - sort_by_largest : bool - if True, the result from the sorting is inverted. E.g. if - ``pref_attr`` is unspecified, then the stations will be sorted based on - the length of the data vectors, starting with the shortest, ending with - the longest. This sorting result will then be inverted, if - ``sort_by_largest=True``, so that the longest time series get's highest - importance. If, e.g. 
``pref_attr='revision_date'``, then the stations - are sorted by the associated revision date value, starting with the - earliest, ending with the latest (which will also be inverted if - this argument is set to True) - fill_missing_nan : bool - if True, the resulting time series is filled with NaNs. NOTE: this - requires that information about the temporal resolution (ts_type) of - the data is available in each of the StationData objects. - add_meta_keys : str or list, optional - additional non-standard metadata keys that are supposed to be - considered for merging. - resample_how : str or dict, optional - in case input stations come in different frequencies they are merged - to the lowest common freq. This parameter can be used to control, which - aggregator(s) are to be used (e.g. mean, median). - min_num_obs : str or dict, optional - in case input stations come in different frequencies they are merged - to the lowest common freq. This parameter can be used to control minimum - number of observation constraints for the downsampling. - - Returns - ------- - StationData - merged data - - """ - if isinstance(var_name, list): - if len(var_name) > 1: - raise NotImplementedError("Merging of multivar data not yet possible") - var_name = var_name[0] - - stats, is_3d, has_errs = _check_stats_merge( - stats, var_name, pref_attr, fill_missing_nan - ) - # ToDo: data_err is not handled at the moment for 2D data, needs r - # revision and should be done in StationData.merge, also 3D vs 2D - # should be handled by StationData directly... - if is_3d: - merged = _merge_stats_3d(stats, var_name, add_meta_keys, has_errs) - else: - merged = _merge_stats_2d( - stats, - var_name, - sort_by_largest, - pref_attr, - add_meta_keys, - resample_how, - min_num_obs, - ) - - if fill_missing_nan: - try: - merged.insert_nans_timeseries(var_name) - except Exception as e: - logger.warning( - f"Could not insert NaNs into timeseries of variable {var_name} " - f"after merging stations. Reason: {repr(e)}" - ) - - merged["stat_merge_pref_attr"] = pref_attr - return merged - - -def _get_pandas_freq_and_loffset(freq): - """Helper to convert resampling info""" - if freq in TS_TYPE_TO_PANDAS_FREQ: - freq = TS_TYPE_TO_PANDAS_FREQ[freq] - loffset = None - if freq in PANDAS_RESAMPLE_OFFSETS: - loffset = PANDAS_RESAMPLE_OFFSETS[freq] - return (freq, loffset) - - -def make_datetime_index(start, stop, freq): - """Make pandas.DatetimeIndex for input specs - - Note - ---- - If input frequency is specified in `PANDAS_RESAMPLE_OFFSETS`, an offset - will be added (e.g. 15 days for monthly data). - - Parameters - ---------- - start - start time. Preferably as :class:`pandas.Timestamp`, else it will be - attempted to be converted. - stop - stop time. Preferably as :class:`pandas.Timestamp`, else it will be - attempted to be converted. - freq - frequency of datetime index. 
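For surface (2D) data, `_merge_stats_2d` above orders the stations, by `pref_attr` or by the number of valid points, and then merges them one after another so that the most relevant series wins where periods overlap. Roughly the same precedence rule can be sketched with plain pandas; `combine_first` is used here only as a stand-in for `StationData.merge_other`, which additionally handles resampling and metadata:

```python
import numpy as np
import pandas as pd

idx = pd.date_range("2021-01-01", periods=10, freq="D")
s_short = pd.Series(np.arange(10.0), index=idx).iloc[:4]   # 4 valid points
s_long = pd.Series(np.arange(10.0) + 100, index=idx)       # 10 valid points
s_long.iloc[2] = np.nan                                    # one gap to be filled

series = [s_short, s_long]
series.sort(key=lambda s: len(s.dropna()))   # shortest first ...
series = series[::-1]                        # ... inverted: longest gets precedence

merged = series[0]
for other in series[1:]:
    merged = merged.combine_first(other)     # fill gaps from less preferred series
print(merged.head())                          # gap on day 3 filled from s_short
```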
- - Returns - ------- - DatetimeIndex - """ - if not isinstance(start, pd.Timestamp): - start = to_pandas_timestamp(start) - if not isinstance(stop, pd.Timestamp): - stop = to_pandas_timestamp(stop) - - freq, loffset = _get_pandas_freq_and_loffset(freq) - idx = pd.date_range(start=start, end=stop, freq=freq) - if loffset is not None: - idx = idx + pd.Timedelta(loffset) - return idx - - -def make_datetimeindex_from_year(freq, year): - """Create pandas datetime index - - Parameters - ---------- - freq : str - pandas frequency str - year : int - year - - Returns - ------- - pandas.DatetimeIndex - index object - """ - start, stop = start_stop_from_year(year) - return make_datetime_index(start, stop, freq) - - -def calc_climatology( - s, start, stop, min_count=None, set_year=None, resample_how="mean" -): - """Compute climatological timeseries from pandas.Series - - Parameters - ---------- - s : pandas.Series - time series data - start : numpy.datetime64 or similar - start time of data used to compute climatology - stop : numpy.datetime64 or similar - start time of data used to compute climatology - mincount_month : int, optional - minimum number of observations required per aggregated month in - climatological interval. Months not meeting this requirement will be - set to NaN. - set_year : int, optional - if specified, the output data will be assigned the input year. Else - the middle year of the climatological interval is used. - resample_how : str - string specifying how the climatological timeseries is to be - aggregated - - Returns - ------- - DataFrame - dataframe containing climatological timeseries as - well as columns std and count - """ - if not isinstance(start, pd.Timestamp): - start, stop = start_stop(start, stop) - sc = s[start:stop] - sc.dropna(inplace=True) - - if len(sc) == 0: - raise ValueError( - "Cropping input time series in climatological interval resulted in empty series" - ) - if set_year is None: - set_year = int(start.year + (stop.year - start.year) / 2) + 1 - - df = pd.DataFrame(sc) - df["month"] = df.index.month - - clim = df.groupby("month").agg([resample_how, "std", "count"]) - - # clim.columns = clim.columns.droplevel(0) - clim.columns = ["data", "std", "numobs"] - idx = [np.datetime64(f"{set_year}-{x:02d}-15") for x in clim.index.values] - clim.set_index(pd.DatetimeIndex(idx), inplace=True) - if min_count is not None: - mask = clim["numobs"] < min_count - clim.loc[mask, "data"] = np.nan - return clim - - -def resample_timeseries(ts, freq, how=None, min_num_obs=None): - """Resample a timeseries (pandas.Series) - - Parameters - ---------- - ts : Series - time series instance - freq : str - new temporal resolution (can be pandas freq. string, or pyaerocom - ts_type) - how - aggregator to be used, accepts everything that is accepted by - :func:`pandas.core.resample.Resampler.agg` and in addition, - percentiles may be provided as str using e.g. 75percentile as input for - the 75% percentile. - min_num_obs : int, optional - minimum number of observations required per period (when downsampling). - E.g. if input is in daily resolution and freq is monthly and - min_num_obs is 10, then all months that have less than 10 days of data - are set to nan. 
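The climatology computation in `calc_climatology` boils down to a group-by over calendar month with an aggregator plus standard deviation and count, followed by a minimum-count mask. A self-contained pandas sketch of that core on a synthetic daily series; the column names mirror the deleted function, the threshold is arbitrary:

```python
import numpy as np
import pandas as pd

s = pd.Series(
    np.random.rand(3 * 365),
    index=pd.date_range("2010-01-01", periods=3 * 365, freq="D"),
)

df = pd.DataFrame({"data": s})
df["month"] = df.index.month
clim = df.groupby("month")["data"].agg(["mean", "std", "count"])
clim.columns = ["data", "std", "numobs"]

min_count = 20
clim.loc[clim["numobs"] < min_count, "data"] = np.nan  # mask sparse months
print(clim)
```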
- - Returns - ------- - Series - resampled time series object - """ - if how is None: - how = "mean" - elif "percentile" in how: - p = int(how.split("percentile")[0]) - how = lambda x: np.nanpercentile(x, p) - - freq, loffset = _get_pandas_freq_and_loffset(freq) - resampler = ts.resample(freq) - - data = resampler.agg(how) - if min_num_obs is not None: - numobs = resampler.count() - # df = resampler.agg([how, 'count']) - invalid = numobs < min_num_obs - if np.any(invalid): - data.values[invalid] = np.nan - if loffset is not None: - data.index = data.index + pd.Timedelta(loffset) - return data - - -def resample_time_dataarray(arr, freq, how=None, min_num_obs=None): - """Resample the time dimension of a :class:`xarray.DataArray` - - Note - ---- - The dataarray must have a dimension coordinate named "time" - - Parameters - ---------- - arr : DataArray - data array to be resampled - freq : str - new temporal resolution (can be pandas freq. string, or pyaerocom - ts_type) - how : str - how to aggregate (e.g. mean, median) - min_num_obs : int, optional - minimum number of observations required per period (when downsampling). - E.g. if input is in daily resolution and freq is monthly and - min_num_obs is 10, then all months that have less than 10 days of data - are set to nan. - - Returns - ------- - DataArray - resampled data array object - - Raises - ------ - IOError - if data input `arr` is not an instance of :class:`DataArray` - DataDimensionError - if time dimension is not available in dataset - """ - if how is None: - how = "mean" - elif "percentile" in how: - raise NotImplementedError( - "percentile based resampling is not yet available for xarray based data" - ) - - if not isinstance(arr, xr.DataArray): - raise OSError(f"Invalid input for arr: need DataArray, got {type(arr)}") - elif not "time" in arr.dims: - raise DataDimensionError( - "Cannot resample time: input DataArray has no time dimension" - ) - - from pyaerocom.tstype import TsType - - to = TsType(freq) - pd_freq = to.to_pandas_freq() - invalid = None - if min_num_obs is not None: - invalid = arr.resample(time=pd_freq).count(dim="time") < min_num_obs - - freq, loffset = _get_pandas_freq_and_loffset(freq) - resampler = arr.resample(time=pd_freq, loffset=loffset) - try: - aggfun = getattr(resampler, how) - except AttributeError: - raise ResamplingError( - f"Invalid aggregator {how} for temporal resampling of DataArray..." 
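`resample_timeseries` pairs a pandas resampler with a count-based validity mask: periods containing fewer than `min_num_obs` samples are set to NaN after aggregation. A minimal sketch, hourly to daily, with an arbitrarily chosen threshold:

```python
import numpy as np
import pandas as pd

ts = pd.Series(
    np.random.rand(100),
    index=pd.date_range("2022-06-01", periods=100, freq="60min"),
)
ts.iloc[30:55] = np.nan  # knock out roughly one day of data

resampler = ts.resample("D")
daily = resampler.mean()
numobs = resampler.count()    # NaNs are not counted
daily[numobs < 12] = np.nan   # require at least 12 valid hours per day
print(daily)
```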
- ) - arr = aggfun(dim="time") - - if invalid is not None: - arr.data[invalid.data] = np.nan - return arr - - -def same_meta_dict( - meta1, meta2, ignore_keys=["PI"], num_keys=NUM_KEYS_META, num_rtol=1e-2 -): - """Compare meta dictionaries - - Parameters - ---------- - meta1 : dict - meta dictionary that is to be compared with ``meta2`` - meta2 : dict - meta dictionary that is to be compared with ``meta1`` - ignore_keys : list - list containing meta keys that are supposed to be ignored - num_keys : keys that contain numerical values - num_rtol : float - relative tolerance level for comparison of numerical values - - Returns - ------- - bool - True, if dictionaries are the same, else False - """ - if not meta1.keys() == meta2.keys(): - return False - for k, v in meta1.items(): - if k in ignore_keys: - continue - elif k in num_keys: - if not ma.isclose(v, meta2[k], rel_tol=num_rtol): - return False - elif isinstance(v, dict): - if not same_meta_dict(v, meta2[k]): - return False - else: - if not v == meta2[k]: - return False - return True - - -def str_to_iris(key, **kwargs): - """Mapping function that converts strings into iris analysis objects - - Please see dictionary ``STR_TO_IRIS`` in this module for valid definitions - - Parameters - ---------- - key : str - key of :attr:`STR_TO_IRIS` dictionary - - Returns - ------- - obj - corresponding iris analysis object (e.g. Aggregator, method) - """ - key = key.lower() - if not key in STR_TO_IRIS: - raise KeyError( - "No iris.analysis object available for key %s, please " - "choose from %s" % (key, STR_TO_IRIS.keys()) - ) - val = STR_TO_IRIS[key] - if callable(val): - return val(**kwargs) - return val - - -@ignore_warnings(UserWarning, r"Parsing .* in DD/MM/YYYY format") -def to_pandas_timestamp(value): - """Convert input to instance of :class:`pandas.Timestamp` - - Parameters - ---------- - value - input value that is supposed to be converted to time stamp - - Returns - -------- - pandas.Timestamp - """ - if isinstance(value, np.str_): - value = str(value) - if isinstance(value, pd.Timestamp): - return value - elif isinstance(value, (str, np.datetime64, datetime, date)): - return pd.Timestamp(value) - else: - try: - numval = int(value) - if not 0 <= numval <= 10000: - raise ValueError("Could not infer valid year from numerical time input") - return pd.Timestamp(str(numval)) - except Exception as e: - raise ValueError(f"Failed to convert {value} to Timestamp: {repr(e)}") - - -def to_datetime64(value): - """Convert input value to numpy.datetime64 - - Parameters - ---------- - value - input value that is supposed to be converted, needs to be either str, - datetime.datetime, pandas.Timestamp or an integer specifying the - desired year. 
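`same_meta_dict` below compares two metadata dictionaries key by key, using a relative tolerance for the numerical keys (longitude, latitude, altitude) and exact comparison otherwise. A tiny usage-style sketch of that comparison rule; the station values are illustrative only:

```python
import math

NUM_KEYS = ("longitude", "latitude", "altitude")

def meta_equal(m1, m2, num_rtol=1e-2):
    if m1.keys() != m2.keys():
        return False
    for key, val in m1.items():
        if key in NUM_KEYS:
            if not math.isclose(val, m2[key], rel_tol=num_rtol):
                return False
        elif val != m2[key]:
            return False
    return True

a = {"station_name": "Zeppelin", "longitude": 11.88, "latitude": 78.91, "altitude": 474}
b = {"station_name": "Zeppelin", "longitude": 11.885, "latitude": 78.91, "altitude": 474}
print(meta_equal(a, b))  # True: the longitude difference is well within 1 %
```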
- - Returns - ------- - datetime64 - input timestamp converted to datetime64 - """ - if isinstance(value, np.datetime64): - return value - else: - try: - return to_pandas_timestamp(value).to_datetime64() - except Exception as e: - raise ValueError( - f"Failed to convert {value} to datetime64 objectError: {repr(e)}" - ) - - -def is_year(val): - """Check if input is / may be year - - Parameters - ---------- - val - input that is supposed to be checked - - Returns - ------- - bool - True if input is a number between -2000 and 10000, else False - """ - try: - if -2000 < int(val) < 10000: - return True - raise Exception - except Exception: - return False - - -def _check_climatology_timestamp(t): - if isnumeric(t) and t == 9999: - return pd.Timestamp("1-1-2222") - elif isinstance(t, np.datetime64): - tstr = str(t) - if tstr.startswith("9999"): - return pd.Timestamp(tstr.replace("9999", "2222")) - elif isinstance(t, str) and "9999" in t: - return pd.Timestamp(t.replace("9999", "2222")) - elif isinstance(t, datetime) and t.year == 9999: - return pd.Timestamp(t.replace(year=2222)) - raise ValueError(f"Failed to identify {t} as climatological timestamp...") - - -def start_stop(start, stop=None, stop_sub_sec=True): - """Create pandas timestamps from input start / stop values - - Note - ---- - If input suggests climatological data in AeroCom format (i.e. year=9999) - then the year is converted to 2222 instead since pandas cannot handle - year 9999. - - Parameters - ----------- - start - start time (any format that can be converted to pandas.Timestamp) - stop - stop time (any format that can be converted to pandas.Timestamp) - stop_sub_sec : bool - if True and if input for stop is a year (e.g. 2015) then one second - is subtracted from stop timestamp (e.g. if input stop is - 2015 and denotes "until 2015", then for the returned stop timestamp - one second will be subtracted, so it would be 31.12.2014 23:59:59). 
- - Returns - ------- - pandas.Timestamp - start timestamp - pandas.Timestamp - stop timestamp - - Raises - ------ - ValueError - if input cannot be converted to pandas timestamps - """ - isclim = False - try: - start = to_pandas_timestamp(start) - except pd.errors.OutOfBoundsDatetime: # probably climatology - start = _check_climatology_timestamp(start) - isclim = True - if stop is None: - if isclim: - yr = 2222 - else: - yr = start.year - stop = to_pandas_timestamp(f"{yr}-12-31 23:59:59") - else: - try: - subt_sec = False - if isnumeric(stop): - subt_sec = True - stop = to_pandas_timestamp(stop) - if subt_sec and stop_sub_sec: - stop = stop - pd.Timedelta(1, "s") - except pd.errors.OutOfBoundsDatetime: - stop = _check_climatology_timestamp(stop) - return (start, stop) - - -def datetime2str(time, ts_type=None): - from pyaerocom import const - - conv = TS_TYPE_DATETIME_CONV[ts_type] - if is_year(time): - return str(time) - try: - time = to_pandas_timestamp(time).strftime(conv) - except pd.errors.OutOfBoundsDatetime: - logger.warning(f"Failed to convert time {time} to string") - return time - - -def start_stop_str(start, stop=None, ts_type=None): - conv = TS_TYPE_DATETIME_CONV[ts_type] - if is_year(start) and stop is None: - return str(start) - start, stop = start_stop(start, stop) - start_str = start.strftime(conv) - stop_str = stop.strftime(conv) - if stop_str != start_str: - return f"{start_str}-{stop_str}" - return start_str - - -def start_stop_from_year(year): - """Create start / stop timestamp from year - - Parameters - ---------- - year : int - the year for which start / stop is to be instantiated - - Returns - ------- - numpy.datetime64 - start datetime - numpy.datetime64 - stop datetime - """ - start = np.datetime64(f"{year}-01-01T00:00:00") - stop = np.datetime64(f"{year}-12-31T23:59:59") - return (start, stop) - - -def to_datestring_YYYYMMDD(value): - """Convert input time to string with format YYYYMMDD - - Parameters - ---------- - value - input time, may be string, datetime, numpy.datetime64 or - pandas.Timestamp - - Returns - ------- - str - input formatted to string YYYYMMDD - - Raises - ------ - ValueError - if input is not supported - """ - if isinstance(value, str) and len(value) == 8: - logger.info( - "Input is already string containing 8 chars. Assuming it " - "is in the right format and returning unchanged" - ) - return value - try: - return to_pandas_timestamp(value).strftime("%Y%m%d") - except Exception as e: - raise ValueError( - f"Invalid input, need str, datetime, numpy.datetime64 or pandas.Timestamp. " - f"Error: {repr(e)}" - ) - - -def cftime_to_datetime64(times, cfunit=None, calendar=None): - """Convert numerical timestamps with epoch to numpy datetime64 - - This method was designed to enhance the performance of datetime conversions - and is based on the corresponding information provided in the cftime - package (`see here `__). Particularly, this object does, what the :func:`num2date` - therein does, but faster, in case the time stamps are not defined on a non - standard calendar. - - Parameters - ---------- - times : :obj:`list` or :obj:`ndarray` or :obj:`iris.coords.DimCoord` - array containing numerical time stamps (relative to basedate of - ``cfunit``). Can also be a single number. - cfunit : :obj:`str` or :obj:`Unit`, optional - CF unit string (e.g. day since 2018-01-01 00:00:00.00000000 UTC) or - unit. 
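The small time helpers above (`start_stop_from_year`, `to_datestring_YYYYMMDD`) are thin wrappers around numpy/pandas conversions. Roughly equivalent stand-alone one-liners, for orientation:

```python
import numpy as np
import pandas as pd

year = 2015
start = np.datetime64(f"{year}-01-01T00:00:00")
stop = np.datetime64(f"{year}-12-31T23:59:59")

# YYYYMMDD string from anything pandas can parse into a Timestamp
print(pd.Timestamp("2015-03-07 12:00").strftime("%Y%m%d"))        # '20150307'
print(pd.Timestamp(np.datetime64("2015-03-07")).strftime("%Y%m%d"))
```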
Required if `times` is not an instance of - :class:`iris.coords.DimCoord` - calendar : :obj:`str`, optional - string specifying calendar (only required if ``cfunit`` is of type - ``str``). - - Returns - ------- - ndarray - numpy array containing timestamps as datetime64 objects - - Raises - ------ - ValueError - if cfunit is ``str`` and calendar is not provided or invalid, or if - the cfunit string is invalid - - Example - ------- - - >>> cfunit_str = 'day since 2018-01-01 00:00:00.00000000 UTC' - >>> cftime_to_datetime64(10, cfunit_str, "gregorian") - array(['2018-01-11T00:00:00.000000'], dtype='datetime64[us]') - """ - if isinstance(times, iris.coords.DimCoord): # special case - times, cfunit = times.points, times.units - try: - len(times) - except Exception: - times = [times] - if isinstance(cfunit, str): - if calendar is None: - raise ValueError( - "Require specification of calendar for conversion into datetime64 objects" - ) - cfunit = Unit(cfunit, calendar) # raises Error if calendar is invalid - if not isinstance(cfunit, Unit): - raise ValueError( - "Please provide cfunit either as instance of class cf_units.Unit or as a string" - ) - calendar = cfunit.calendar - basedate = cfunit.num2date(0) - if (calendar == "proleptic_gregorian" and basedate.year >= MINYEAR) or ( - calendar in ["gregorian", "standard"] and basedate > GREGORIAN_BASE - ): - # NOTE: changed on 9 July 2018 by jgliss due to error (kernel died) - # after update of dependencies (cf_units). Attribute name does not - # work anymore... - cfu_str = cfunit.origin # cfunit.name - - res = cfu_str.split()[0].lower() - if res in microsec_units: - tstr = "us" - elif res in millisec_units: - tstr = "ms" - elif res in sec_units: - tstr = "s" - elif res in min_units: - tstr = "m" - elif res in hr_units: - tstr = "h" - elif res in day_units: - tstr = "D" - else: - raise ValueError("unsupported time units") - - basedate = np.datetime64(basedate) - dt = np.asarray(np.asarray(times), dtype=f"timedelta64[{tstr}]") - return basedate + dt - else: - return np.asarray([np.datetime64(t) for t in cfunit.num2date(times)]) - - -def get_constraint( - lon_range=None, lat_range=None, time_range=None, meridian_centre=True -): - """Function that creates an :class:`iris.Constraint` based on input - - Note - ---- - Please be aware of the definition of the longitudes in your data when - cropping within the longitude dimension. The longitudes in your data may be - defined either from **-180 <= lon <= 180** (pyaerocom standard) or from - **0 <= lon <= 360**. In the former case (-180 -> 180) you can leave the - additional input parameter ``meridian_centre=True`` (default). - - Parameters - ---------- - lon_range : :obj:`tuple`, optional - 2-element tuple containing longitude range for cropping - Example input to crop around meridian: `lon_range=(-30, 30)` - lat_range : :obj:`tuple`, optional - 2-element tuple containing latitude range for cropping. - time_range : :obj:`tuple`, optional - 2-element tuple containing time range for cropping. Allowed data - types for specifying the times are - - 1. a combination of 2 :class:`pandas.Timestamp` instances or - 2. a combination of two strings that can be directly converted\ - into :class:`pandas.Timestamp` instances (e.g.\ - `time_range=("2010-1-1", "2012-1-1")`) or - 3. directly a combination of indices (:obj:`int`). - meridian_centre : bool - specifies the coordinate definition range of longitude array. 
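`cftime_to_datetime64` takes a fast path when the calendar is (proleptic) Gregorian: it reads the base date and time resolution from the CF unit string and adds the numeric offsets as numpy `timedelta64` values, avoiding a per-timestamp `num2date` call. A minimal sketch of that fast path for day-resolution units, assuming `cf_units` is installed; the generic fallback is simply `Unit.num2date`:

```python
import numpy as np
from cf_units import Unit

cfunit = Unit("days since 2018-01-01 00:00:00", calendar="gregorian")
times = [0, 10, 365]

basedate = np.datetime64(cfunit.num2date(0))          # epoch as datetime64
offsets = np.asarray(times, dtype="timedelta64[D]")   # 'D' because units are days
print(basedate + offsets)   # 2018-01-01, 2018-01-11, 2019-01-01

# generic (slower) route, works for any calendar
print(cfunit.num2date(times))
```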
If True, - then -180 -> 180 is assumed, else 0 -> 360 - - Returns - ------- - iris.Constraint - the combined constraint from all valid input parameters - """ - constraints = [] - if lon_range is not None: - constraints.append(get_lon_rng_constraint(*lon_range, meridian_centre)) - if lat_range is not None: - constraints.append(get_lat_rng_constraint(*lat_range)) - if time_range is not None: - constraints.append(get_time_rng_constraint(*time_range)) - if len(constraints) > 0: - c = constraints[0] - for cadd in constraints[1:]: - c = c & cadd - return c - - -def get_lat_rng_constraint(low, high): - """Create latitude constraint based on input range - - Parameters - ---------- - low : float or int - lower latitude coordinate - high : float or int - upper latitude coordinate - - Returns - ------- - iris.Constraint - the corresponding iris.Constraint instance - - """ - return iris.Constraint(latitude=lambda v: low <= v <= high) - - -def get_lon_rng_constraint(low, high, meridian_centre=True): - """Create longitude constraint based on input range - - Parameters - ---------- - low : float or int - left longitude coordinate - high : float or int - right longitude coordinate - meridian_centre : bool - specifies the coordinate definition range of longitude array of the - data to be cropped. If True, then -180 -> 180 is assumed, else 0 -> 360 - - Returns - ------- - iris.Constraint - the corresponding iris.Constraint instance - - Raises - ------ - ValueError - if first coordinate in lon_range equals or exceeds second - LongitudeConstraintError - if the input implies cropping over border of longitude array - (e.g. 160 -> - 160 if -180 <= lon <= 180). - """ - if low == high: - raise ValueError("the specified values are equal") - elif low > high: - raise ValueError("Left coordinate must exceed right coordinate") - if meridian_centre: - low, high = (low + 180) % 360 - 180, (high + 180) % 360 - 180 - else: - low, high = low % 360, high % 360 - if low > high: - msg = "Cannot crop over right border of longitude range" - raise LongitudeConstraintError(msg) - return iris.Constraint(longitude=lambda v: low <= v <= high) - - -def get_time_rng_constraint(start, stop): - """Create iris.Constraint for data extraction along time axis - - Parameters - ---------- - start : :obj:`Timestamp` or :obj:` str` - start time of desired subset. If string, it must be convertible - into :class:`pandas.Timestamp` (e.g. "2012-1-1") - stop : :obj:`Timestamp` or :obj:` str` - start time of desired subset. If string, it must be convertible - into :class:`pandas.Timestamp` (e.g. 
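`get_lon_rng_constraint` below maps the requested longitude range into the grid's own convention before building the iris constraint: for a -180..180 grid both ends are shifted with `(x + 180) % 360 - 180`, for a 0..360 grid with `x % 360`, and a wrap over the grid border is rejected. The normalisation in isolation, as a sketch of the deleted logic:

```python
def normalise_lon_range(low, high, meridian_centre=True):
    """Map a longitude range into the grid convention (sketch of the deleted helper)."""
    if low >= high:
        raise ValueError("left coordinate must be smaller than right coordinate")
    if meridian_centre:                      # grid defined from -180 to 180
        low, high = (low + 180) % 360 - 180, (high + 180) % 360 - 180
    else:                                    # grid defined from 0 to 360
        low, high = low % 360, high % 360
    if low > high:
        raise ValueError("cannot crop over the right border of the longitude range")
    return low, high

print(normalise_lon_range(-30, 30))                           # (-30, 30)
print(normalise_lon_range(330, 390, meridian_centre=True))    # (-30, 30)
```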
"2012-1-1") - - Returns - ------- - iris.Constraint - iris Constraint instance that can, e.g., be used as input for - :func:`pyaerocom.griddeddata.GriddedData.extract` - """ - if not isinstance(start, pd.Timestamp): - start = pd.Timestamp(start) - if not isinstance(stop, pd.Timestamp): - stop = pd.Timestamp(stop) - - t_lower = iris.time.PartialDateTime( - year=start.year, month=start.month, day=start.day - ) - t_upper = iris.time.PartialDateTime(year=stop.year, month=stop.month, day=stop.day) - - return iris.Constraint(time=lambda cell: t_lower <= cell <= t_upper) - - -def get_max_period_range(periods): - start = min([int(per.split("-")[0]) for per in periods]) - stop = max( - int(per.split("-")[1]) if len(per.split("-")) > 1 else int(per) - for per in periods - ) - - return start, stop - - -def make_dummy_cube( - var_name: str, - start_yr: int = 2000, - stop_yr: int = 2020, - freq: str = "daily", - dtype=float, -) -> iris.cube.Cube: - startstr = f"days since {start_yr}-01-01 00:00" - - if freq not in TS_TYPE_TO_PANDAS_FREQ.keys(): - raise ValueError(f"{freq} not a recognized frequency") - - start_str = f"{start_yr}-01-01 00:00" - stop_str = f"{stop_yr}-12-31 00:00" - times = pd.date_range(start_str, stop_str, freq=TS_TYPE_TO_PANDAS_FREQ[freq]) - - days_since_start = (times - times[0]).days - unit = get_variable(var_name).units - - lat_range = (-90, 90) - lon_range = (-180, 180) - lat_res_deg = 45 - lon_res_deg = 90 - time_unit = Unit(startstr, calendar="gregorian") - - lons = np.arange( - lon_range[0] + (lon_res_deg / 2), lon_range[1] + (lon_res_deg / 2), lon_res_deg - ) - lats = np.arange( - lat_range[0] + (lat_res_deg / 2), lat_range[1] + (lat_res_deg / 2), lat_res_deg - ) - - latdim = iris.coords.DimCoord( - lats, - var_name="lat", - standard_name="latitude", - long_name="Center coordinates for latitudes", - circular=False, - units=Unit("degrees"), - ) - - londim = iris.coords.DimCoord( - lons, - var_name="lon", - standard_name="longitude", - long_name="Center coordinates for longitudes", - circular=False, - units=Unit("degrees"), - ) - - timedim = iris.coords.DimCoord( - days_since_start, - var_name="time", - standard_name="time", - long_name="Time", - units=time_unit, - ) - - latdim.guess_bounds() - londim.guess_bounds() - dummy = iris.cube.Cube(np.ones((len(times), len(lats), len(lons))), units=unit) - - dummy.add_dim_coord(latdim, 1) - dummy.add_dim_coord(londim, 2) - dummy.add_dim_coord(timedim, 0) - dummy.var_name = var_name - dummy.ts_type = freq - - dummy.data = dummy.data.astype(dtype) - for coord in dummy.coords(): - coord.points = coord.points.astype(dtype) - return dummy diff --git a/src/pyaro_readers/nilupmfebas/helpers_landsea_masks.py b/src/pyaro_readers/nilupmfebas/helpers_landsea_masks.py deleted file mode 100644 index b9a11cd..0000000 --- a/src/pyaro_readers/nilupmfebas/helpers_landsea_masks.py +++ /dev/null @@ -1,268 +0,0 @@ -""" -Helper methods for access of and working with land/sea masks. pyaerocom -provides automatic access to HTAP land sea masks from this URL: - -https://pyaerocom.met.no/pyaerocom-suppl - -Filtering by these masks is implemented in :class:`Filter` and all relevant -data classes (i.e. :class:`GriddedData`, :class:`UngriddedData`, -:class:`ColocatedData`). 
-""" - -import glob -import logging -import os - -import numpy as np -import requests -import xarray as xr -from iris import load_cube - -import const -from .exceptions import DataRetrievalError -from .helpers import numpy_to_cube - -logger = logging.getLogger(__name__) - - -def available_htap_masks(): - """ - List of HTAP mask names - - Returns - ---------- - list - Returns a list of available htap region masks. - """ - return const.HTAP_REGIONS - - -def download_htap_masks(regions_to_download=None): - """Download HTAP mask - - URL: https://pyaerocom.met.no/pyaerocom-suppl. - - Parameters - ----------- - regions_to_download : list - List containing the regions to download. - - Returns - ------- - list - List of file paths that point to the mask files that were successfully - downloaded - - Raises - ------ - ValueError - if one of the input regions does not exist - DataRetrievalError - if download fails for one of the input regions - """ - - if regions_to_download is None: - regions_to_download = const.HTAP_REGIONS - elif isinstance(regions_to_download, str): - regions_to_download = [regions_to_download] - elif not isinstance(regions_to_download, list): - raise ValueError("Invalid input for regions_to_download, need list or str") - - path_out = const.FILTERMASKKDIR - base_url = const.URL_HTAP_MASKS - - paths = [] - for region in regions_to_download: - if not region in const.HTAP_REGIONS: - raise ValueError(f"No such HTAP region {region}") - elif region == "EAS": - filename = f"{region}htap.nc" - file_out = os.path.join(path_out, f"{region}htap.0.1x0.1deg.nc") - else: - filename = f"{region}htap.0.1x0.1deg.nc" - file_out = os.path.join(path_out, filename) - - url = os.path.join(base_url, filename) - - try: - r = requests.get(url) - open(file_out, "wb").write(r.content) - paths.append(file_out) - except Exception as e: - raise DataRetrievalError( - f"Failed to download HTAP mask {region}. 
Reason {repr(e)}" - ) - return paths - - -def get_htap_mask_files(*region_ids): - """Get file paths to input HTAP regions - - Parameters - ---------- - *region_ids - ID's of regions for which mask files are supposed to be retrieved - - Returns - ------- - list - list of file paths for each input region - - Raises - ------ - FileNotFoundError - if default local directory for storage of HTAP masks does not exist - NameError - if multiple mask files are found for the same region - """ - mask_dir = const.FILTERMASKKDIR - if not os.path.exists(mask_dir): - raise FileNotFoundError("HTAP mask directory does not exist") - out = [] - for region in region_ids: - if not region in const.HTAP_REGIONS: - raise ValueError(f"No such HTAP region {region}") - files = glob.glob(os.path.join(mask_dir, f"{region}*.nc")) - if len(files) != 1: - if len(files) == 0: - logger.info(f"Downloading HTAP mask {region}") - files = download_htap_masks(region) - elif len(files) > 1: - raise NameError(f"Found multiple masks for region {region}") - out.append(files[0]) - return out - - -def load_region_mask_xr(*regions): - """Load boolean mask for input regions (as xarray.DataArray) - - Parameters - ----------- - *regions - regions that are supposed to be loaded and merged (just use string, - no list or similar) - - Returns - --------- - xarray.DataArray - boolean mask for input region(s) - """ - masks = None - for i, fil in enumerate(get_htap_mask_files(*regions)): - r = regions[i] - if i == 0: - masks = xr.open_dataset(fil)[r + "htap"] - name = r - else: - masks += xr.open_dataset(fil)[r + "htap"] - name += f"-{r}" - if masks is not None: - mask = masks.where(masks < 1, 1) - mask["name"] = name - mask.attrs["long_name"] = name - mask = mask.rename({"lat": "latitude", "long": "longitude"}) - return mask - - -def load_region_mask_iris(*regions): - """Loads regional mask to iris. - - Parameters - ----------- - region_id : str - Chosen region. - - Returns - --------- - iris.cube.Cube - cube representing merged mask from input regions - """ - cubes = [] - names = [] - for i, fil in enumerate(get_htap_mask_files(*regions)): - names.append(regions[i]) - cubes.append(load_cube(fil)) - if len(cubes) == 1: - out = cubes[0] - else: - merged_np = np.max([x.data for x in cubes], axis=0) - out = numpy_to_cube( - merged_np, dims=(cubes[0].coords()[0], cubes[0].coords()[1]) - ) - out.units = cubes[0].units - name = "-".join(names) - out.var_name = name - # out.attributes['long_name'] = name - return out - - -def get_mask_value(lat, lon, mask): - """Get value of mask at input lat / lon position - - Parameters - ---------- - lat : float - latitute - lon : float - longitude - mask : xarray.DataArray - data array - - Returns - ------- - float - neirest neigbhour mask value to input lat lon - """ - if not isinstance(mask, xr.DataArray): - raise ValueError(f"Invalid input for mask: need DataArray, got {type(mask)}") - return float(mask.sel(latitude=lat, longitude=lon, method="nearest")) - - -def check_all_htap_available(): - """ - Check for missing HTAP masks on local computer and download - """ - return get_htap_mask_files(*available_htap_masks()) - - -def get_lat_lon_range_mask_region(mask, latdim_name=None, londim_name=None): - """ - Get outer lat/lon rectangle of a binary mask - - Parameters - ---------- - mask : xr.DataArray - binary mask - latdim_name : str, optional - Name of latitude dimension. The default is None, in which case lat is - assumed. - londim_name : str, optional - Name of longitude dimension. 
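`get_mask_value` below is essentially a one-line nearest-neighbour lookup on the mask DataArray. A self-contained illustration with a synthetic binary mask; the real HTAP masks are 0.1x0.1 degree fields loaded by the helpers above, and the box used here is invented:

```python
import numpy as np
import xarray as xr

lats = np.arange(-89.5, 90, 1.0)
lons = np.arange(-179.5, 180, 1.0)
mask = xr.DataArray(
    np.zeros((lats.size, lons.size)),
    coords={"latitude": lats, "longitude": lons},
    dims=("latitude", "longitude"),
)
# crude rectangular "region" for the example
mask.loc[dict(latitude=slice(35, 70), longitude=slice(-10, 40))] = 1

value = float(mask.sel(latitude=60.2, longitude=10.7, method="nearest"))
print(value)  # 1.0 -> the point falls inside the mask
```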
The default is None, in which case long is - assumed. - - Returns - ------- - dict - dictionary containing lat and lon ranges of the mask. - - """ - if latdim_name is None: # htap - latdim_name = "lat" - if londim_name is None: - londim_name = "long" # htap - assert isinstance(mask, xr.DataArray) - assert mask.dims == (latdim_name, londim_name) - - data = mask.data - lats = mask.latitude.data - lons = mask.longitude.data - - lonmask = np.where(data.any(axis=0))[0] # flatten latitude dimenstion - firstidx, lastidx = lonmask.min(), lonmask.max() - lonr = sorted([lons[firstidx], lons[lastidx]]) - - latmask = np.where(data.any(axis=1))[0] # flatten latitude dimenstion - firstidx, lastidx = latmask.min(), latmask.max() - latr = sorted([lats[firstidx], lats[lastidx]]) - - return dict(lat_range=latr, lon_range=lonr) diff --git a/src/pyaro_readers/nilupmfebas/io_helpers.py b/src/pyaro_readers/nilupmfebas/io_helpers.py deleted file mode 100644 index 2ea1d84..0000000 --- a/src/pyaro_readers/nilupmfebas/io_helpers.py +++ /dev/null @@ -1,317 +0,0 @@ -""" -I/O helper methods of the pyaerocom package -""" -from __future__ import annotations - -import logging -import os -import shutil -from datetime import datetime -from pathlib import Path -from time import time - -import simplejson as json - -from . import const -from . import resources -from .exceptions import VariableDefinitionError, VarNotAvailableError -from .aerocom_browser import AerocomBrowser - -logger = logging.getLogger(__name__) - - -#: country code file name -#: will be prepended with the path later on -COUNTRY_CODE_FILE = "country_codes.json" - - -def _check_ebas_db_local_vs_remote(loc_remote, loc_local): - """ - Check and if applicable, copy ebas_file_index.sqlite3 into cache dir - - Note - ---- - This may speedup things if remote location is on a mounted server location. - Nothing the user should worry about in any case. - - Parameters - ---------- - loc_remote : str - remote location of ebas_file_index.sqlite3 - loc_local : str - local (cached) location of ebas_file_index.sqlite3 - - Returns - ------- - str - valid location of ebas_file_index.sqlite3 that is supposed to be used - - """ - if os.path.exists(loc_remote): # remote exists - if os.path.exists(loc_local): - chtremote = os.path.getmtime(loc_remote) - chtlocal = os.path.getmtime(loc_local) - if chtlocal == chtremote: - return loc_local - - # changing time differs -> try to copy to local and if that - # fails, use remote location - try: - t0 = time() - shutil.copy2(loc_remote, loc_local) - logger.info( - f"Copied EBAS SQL database to {loc_local}\nElapsed time: {time()-t0:.3f} s" - ) - - return loc_local - except Exception as e: - logger.warning(f"Failed to copy EBAS SQL database. 
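`get_lat_lon_range_mask_region` reduces the binary mask along each axis with `any()` and takes the first and last hit to get the outer lat/lon rectangle. The numpy core of that, stripped of the xarray wrapping and run on an invented mask:

```python
import numpy as np

lats = np.arange(-85, 90, 10.0)
lons = np.arange(-175, 180, 10.0)
data = np.zeros((lats.size, lons.size), dtype=bool)
data[10:14, 17:22] = True  # some masked region

lon_hits = np.where(data.any(axis=0))[0]   # collapse the latitude axis
lat_hits = np.where(data.any(axis=1))[0]   # collapse the longitude axis

lon_range = sorted([lons[lon_hits.min()], lons[lon_hits.max()]])
lat_range = sorted([lats[lat_hits.min()], lats[lat_hits.max()]])
print(dict(lat_range=lat_range, lon_range=lon_range))
```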
Reason: {repr(e)}") - return loc_remote - return loc_remote - - -def aerocom_savename(data_id, var_name, vert_code, year, ts_type): - """Generate filename in AeroCom conventions - - ToDo: complete docstring - """ - return f"aerocom3_{data_id}_{var_name}_{vert_code}_{year}_{ts_type}.nc" - - -def _print_read_info(i, mod, tot_num, last_t, name, logger): # pragma: no cover - """Helper for displaying standardised output in reading classes - - Not to be used directly - """ - t = datetime.now() - logger.info( - f"Reading files {i+1}-{i+1+mod} of {tot_num} " - f"({name}) | {t:%T} (delta = {(t-last_t).seconds} s')" - ) - return t - - -def get_metadata_from_filename(filename): - """Try access metadata information from filename""" - from pyaerocom.io.fileconventions import FileConventionRead - - fc = FileConventionRead().from_file(filename) - return fc.get_info_from_file(filename) - - -def read_ebas_flags_file(ebas_flags_csv): - """Reads file ebas_flags.csv - - Parameters - ---------- - ebas_flags_csv : str - file containing flag info - - Returns - ------- - dict - dict with loaded flag info - """ - valid = {} - values = {} - info = {} - with open(ebas_flags_csv) as fio: - for line in fio: - spl = line.strip().split(",") - num = int(spl[0].strip()) - try: - val_str = spl[-1][1:-1] - except Exception: - raise OSError( - f"Failed to read flag information in row {line} " - f"(Check if entries in ebas_flags.csv are quoted)" - ) - info_str = ",".join(spl[1:-1]) - try: - info_str = info_str[1:-1] - except Exception: - raise OSError( - f"Failed to read flag information in row {line} " - f"(Check if entries in ebas_flags.csv are quoted)" - ) - isvalid = True if val_str == "V" else False - valid[num] = isvalid - values[num] = val_str - info[num] = info_str - result = {} - result["valid"] = valid - result["info"] = info - result["vals"] = values - return result - - -def add_file_to_log(filepath, err_msg): - """ - Add input file path to error logdir - - The logdir location can be accessed via :attr:`pyaerocom.const.LOGFILESDIR` - - Parameters - ---------- - filepath : str or Path - path of file that has an error - err_msg : str - Problem associated with input file - - """ - if isinstance(filepath, Path): - filepath = str(filepath) - try: - dirname = os.path.dirname(filepath) - spl = dirname.split(os.sep) - if spl[-1].lower() == "renamed": - model_or_obs_id = spl[-2] - else: - model_or_obs_id = spl[-1] - except Exception: - model_or_obs_id = "others" - - logdir = const.LOGFILESDIR - - logfile = os.path.join(logdir, f"{model_or_obs_id}.log") - - if os.path.exists(logfile): # check if this file is already flagged - with open(logfile) as f: - for line in f: - if filepath == line.strip(): - return # file is already flagged -> ignore - - logfile_err = os.path.join(logdir, f"{model_or_obs_id}_ERR.log") - with open(logfile, "a+") as f: - f.write(f"{filepath}\n") - with open(logfile_err, "a+") as ferr: - ferr.write(f"{filepath}\n{err_msg}\n\n") - - -def get_standard_name(var_name): - """Get standard name of aerocom variable - - Parameters - ---------- - var_name : str - HTAP2 variable name - - Returns - -------- - str - corresponding standard name - - Raises - ------ - VarNotAvailableError - if input variable is not defined in *variables.ini* file - VariableDefinitionError - if standarad name is not set for variable in *variables.ini* file - """ - if not var_name in const.VARS: - raise VarNotAvailableError(f"No such variable {var_name}. 
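`read_ebas_flags_file` below expects each row of ebas_flags.csv to hold a numeric flag, a quoted description and a quoted validity marker ('V' for valid, 'I' for invalid). A tiny sketch of that parsing on two made-up rows; the real file ships with the package:

```python
rows = [
    '000,"Valid measurement","V"',
    '456,"Invalidated by data originator","I"',
]

valid, values, info = {}, {}, {}
for line in rows:
    parts = line.strip().split(",")
    num = int(parts[0])
    val_str = parts[-1][1:-1]                 # strip surrounding quotes -> 'V' or 'I'
    info_str = ",".join(parts[1:-1])[1:-1]    # description may itself contain commas
    valid[num] = val_str == "V"
    values[num] = val_str
    info[num] = info_str

print(valid)  # {0: True, 456: False}
```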
Check variables.ini") - name = const.VARS[var_name].standard_name - if name is None: - raise VariableDefinitionError("standard_name not defined for variable") - return name - - -def search_data_dir_aerocom(name_or_pattern, ignorecase=True): - """Search Aerocom data directory based on model / data ID""" - browser = AerocomBrowser() - return browser.find_data_dir(name_or_pattern, ignorecase) - - -def get_all_supported_ids_ungridded(): - """Get list of datasets that are supported by :class:`ReadUngridded` - - Returns - ------- - list - list with supported network names - """ - from pyaerocom.io import ReadUngridded - - return ReadUngridded().SUPPORTED_DATASETS - - -def get_obsnetwork_dir(obs_id): - """Returns data path for obsnetwork ID - - Parameters - ---------- - obs_id : str - ID of obsnetwork (e.g. AeronetSunV2Lev2.daily) - - Returns - ------- - str - corresponding directory from ``pyaerocom.const`` - - Raises - ------ - ValueError - if obs_id is invalid - IOError - if directory does not exist - """ - if not obs_id in const.OBSLOCS_UNGRIDDED: - raise ValueError(f"Observation network ID {obs_id} does not exist") - - data_dir = const.OBSLOCS_UNGRIDDED[obs_id] - if not os.path.exists(data_dir): - raise FileNotFoundError( - f"Data directory {data_dir} for observation network {obs_id} does not exist" - ) - return data_dir - - -def get_country_name_from_iso( - iso_code: str | None = None, - filename: str | Path | None = None, - return_as_dict: bool = False, -): - """get the country name from the 2 digit iso country code - - the underlaying json file was taken from this github repository - https://github.com/lukes/ISO-3166-Countries-with-Regional-Codes - - Parameters - ---------- - iso_code : :obj:`str` - string containing the 2 character iso code of the country (e.g. 
no for Norway) - filename : :obj:`str` , optional - optional string with the json file to read - return_as_dict : :obj:`bool`, optional - flag to get the entire list of countries as a dictionary with the country codes - as keys and the country names as value - Useful if you have to get the names for a lot of country codes - - Returns - ------- - string with country name or dictionary with iso codes as keys and the country names as values - empty string if the country code was not found - - - Raises - ------ - ValueError - if the country code ins invalid - """ - if filename is None: - # set default file name - with resources.path("pyaerocom.data", COUNTRY_CODE_FILE) as path: - filename = path - - if isinstance(filename, str): - filename = Path(filename) - json_data = json.loads(filename.read_text()) - - iso_dict = {} - for indict in json_data: - iso_dict[indict["alpha-2"]] = indict["name"] - - if iso_code is None or return_as_dict: - return iso_dict - - return iso_dict[iso_code.upper()] diff --git a/src/pyaro_readers/nilupmfebas/mathutils.py b/src/pyaro_readers/nilupmfebas/mathutils.py deleted file mode 100644 index b07f270..0000000 --- a/src/pyaro_readers/nilupmfebas/mathutils.py +++ /dev/null @@ -1,532 +0,0 @@ -""" -Mathematical low level utility methods of pyaerocom -""" - -import numpy as np -from scipy.stats import kendalltau, pearsonr, spearmanr - -from ._warnings import ignore_warnings - -### LAMBDA FUNCTIONS -in_range = lambda x, low, high: low <= x <= high - -### OTHER FUNCTIONS - - -def is_strictly_monotonic(iter1d) -> bool: - """ - Check if 1D iterble is strictly monotonic - - Parameters - ---------- - iter1d - 1D iterable object to be tested - - Returns - ------- - bool - - """ - return True if np.all(np.diff(iter1d) > 0) else False - - -def make_binlist(vmin: float, vmax: float, num: int = None) -> list: - """""" - if num is None: - num = 8 - return list(np.linspace(vmin, vmax, num + 1)) - - -def weighted_sum(data, weights): - """Compute weighted sum using numpy dot product - - Parameters - ---------- - data : ndarray - data array that is supposed to be summed up - weights : ndarray - array containing weights for each point in `data` - - Returns - ------- - float - weighted sum of values in input array - """ - return np.dot(data, weights) - - -def sum(data, weights=None): - """Summing operation with option to perform weighted sum - - Parameters - ---------- - data : ndarray - data array that is supposed to be summed up - weights : ndarray, optional - array containing weights for each point in `data` - - Returns - ------- - float or int - sum of values in input array - """ - if weights is None: - return np.sum(data) - return weighted_sum(data, weights) - - -def weighted_mean(data, weights): - """Compute weighted mean - - Parameters - ---------- - data : ndarray - data array that is supposed to be averaged - weights : ndarray - array containing weights for each point in `data` - - Returns - ------- - float or int - weighted mean of data array - """ - return np.sum(data * weights) / np.sum(weights) - - -def weighted_cov(ref_data, data, weights): - """Compute weighted covariance - - Parameters - ---------- - data_ref : ndarray - x data - data : ndarray - y data - weights : ndarray - array containing weights for each point in `data` - - Returns - ------- - float - covariance - """ - avgx = weighted_mean(ref_data, weights) - avgy = weighted_mean(data, weights) - return np.sum(weights * (ref_data - avgx) * (data - avgy)) / np.sum(weights) - - -def weighted_corr(ref_data, data, 
weights): - """Compute weighted correlation - - Parameters - ---------- - data_ref : ndarray - x data - data : ndarray - y data - weights : ndarray - array containing weights for each point in `data` - - Returns - ------- - float - weighted correlation coefficient - """ - wcovxy = weighted_cov(ref_data, data, weights) - - wcovxx = weighted_cov(ref_data, ref_data, weights) - wcovyy = weighted_cov(data, data, weights) - wsigmaxy = np.sqrt(wcovxx * wcovyy) - return wcovxy / wsigmaxy - - -@ignore_warnings( - RuntimeWarning, - "invalid value encountered in double_scalars", - "An input array is constant", -) -def corr(ref_data, data, weights=None): - """Compute correlation coefficient - - Parameters - ---------- - data_ref : ndarray - x data - data : ndarray - y data - weights : ndarray, optional - array containing weights for each point in `data` - - Returns - ------- - float - correlation coefficient - """ - if weights is None: - return pearsonr(ref_data, data)[0] - return weighted_corr(ref_data, data, weights) - - -def _nanmean_and_std(data): - """ - Calculate mean and std for input data (may contain NaN's') - - Parameters - ---------- - data : list or numpy.ndarray - input data - - Returns - ------- - float - mean value of input data. - float - standard deviation of input data. - - """ - if np.all(np.isnan(data)): - return (np.nan, np.nan) - return (np.nanmean(data), np.nanstd(data)) - - -@ignore_warnings( - RuntimeWarning, - "An input array is constant", - "invalid value encountered in .*divide", -) -def calc_statistics( - data, - ref_data, - lowlim=None, - highlim=None, - min_num_valid=1, - weights=None, - drop_stats=None, -): - """Calc statistical properties from two data arrays - - Calculates the following statistical properties based on the two provided - 1-dimensional data arrays and returns them in a dictionary (keys are - provided after the arrows): - - - Mean value of both arrays -> refdata_mean, data_mean - - Standard deviation of both arrays -> refdata_std, data_std - - RMS (Root mean square) -> rms - - NMB (Normalised mean bias) -> nmb - - MNMB (Modified normalised mean bias) -> mnmb - - MB (Mean Bias) -> mb - - MAB (Mean Absolute Bias) -> mab - - FGE (Fractional gross error) -> fge - - R (Pearson correlation coefficient) -> R - - R_spearman (Spearman corr. coeff) -> R_spearman - - Note - ---- - Nans are removed from the input arrays, information about no. of removed - points can be inferred from keys `totnum` and `num_valid` in return dict. - - Parameters - ---------- - data : ndarray - array containing data, that is supposed to be compared with reference - data - ref_data : ndarray - array containing data, that is used to compare `data` array with - lowlim : float - lower end of considered value range (e.g. if set 0, then all datapoints - where either ``data`` or ``ref_data`` is smaller than 0 are removed) - highlim : float - upper end of considered value range - min_num_valid : int - minimum number of valid measurements required to compute statistical - parameters. - weights: ndarray - array containing weights if computing weighted statistics - drop_stats: tuple - tuple which drops the provided statistics from computed json files. - For example, setting drop_stats = ("mb", "mab"), results in json files - in hm/ts with entries which do not contain the mean bias and mean - absolute bias, but the other statistics are preserved. 
- Returns - ------- - dict - dictionary containing computed statistics - - Raises - ------ - ValueError - if either of the input arrays has dimension other than 1 - - """ - data = np.asarray(data) - ref_data = np.asarray(ref_data) - - if not data.ndim == 1 or not ref_data.ndim == 1: - raise ValueError("Invalid input. Data arrays must be one dimensional") - - result = {} - - mask = ~np.isnan(ref_data) * ~np.isnan(data) - num_points = mask.sum() - - data, ref_data = data[mask], ref_data[mask] - - weighted = False if weights is None else True - - result["totnum"] = float(len(mask)) - result["num_valid"] = float(num_points) - ref_mean, ref_std = _nanmean_and_std(ref_data) - data_mean, data_std = _nanmean_and_std(data) - result["refdata_mean"] = ref_mean - result["refdata_std"] = ref_std - result["data_mean"] = data_mean - result["data_std"] = data_std - result["weighted"] = weighted - - if not num_points >= min_num_valid: - if lowlim is not None: - valid = np.logical_and(data > lowlim, ref_data > lowlim) - data = data[valid] - ref_data = ref_data[valid] - if highlim is not None: - valid = np.logical_and(data < highlim, ref_data < highlim) - data = data[valid] - ref_data = ref_data[valid] - - result["rms"] = np.nan - result["nmb"] = np.nan - result["mnmb"] = np.nan - result["fge"] = np.nan - result["R"] = np.nan - result["R_spearman"] = np.nan - - return result - - if lowlim is not None: - valid = np.logical_and(data > lowlim, ref_data > lowlim) - data = data[valid] - ref_data = ref_data[valid] - if highlim is not None: - valid = np.logical_and(data < highlim, ref_data < highlim) - data = data[valid] - ref_data = ref_data[valid] - - difference = data - ref_data - - diffsquare = difference**2 - - if weights is not None: - weights = weights[mask] - weights = weights / weights.max() - result[ - "NOTE" - ] = "Weights were not applied to FGE and kendall and spearman corr (not implemented)" - - result["rms"] = np.sqrt(np.average(diffsquare, weights=weights)) - - # NO implementation to apply weights yet ... - if num_points > 1: - result["R"] = corr(data, ref_data, weights) - result["R_spearman"] = spearmanr(data, ref_data)[0] - result["R_kendall"] = kendalltau(data, ref_data)[0] - else: - result["R"] = np.nan - result["R_spearman"] = np.nan - result["R_kendall"] = np.nan - - sum_diff = sum(difference, weights=weights) - sum_refdata = sum(ref_data, weights=weights) - - if sum_refdata == 0: - if sum_diff == 0: - nmb = 0 - mb = 0 - else: - nmb = np.nan - mb = np.nan - else: - nmb = sum_diff / sum_refdata - mb = sum_diff - - sum_data_refdata = data + ref_data - # for MNMB, and FGE: don't divide by 0 ... 
- mask = ~np.isnan(sum_data_refdata) - num_points = mask.sum() - if num_points == 0: - mnmb = np.nan - fge = np.nan - mb = np.nan - mab = np.nan - else: - tmp = difference[mask] / sum_data_refdata[mask] - if weights is not None: - weights = weights[mask] - mnmb = 2.0 / num_points * sum(tmp, weights=weights) - fge = 2.0 / num_points * sum(np.abs(tmp), weights=weights) - mb = sum(difference[mask]) / num_points - mab = sum(np.abs(difference[mask])) / num_points - - result["nmb"] = nmb - result["mnmb"] = mnmb - result["fge"] = fge - result["mb"] = mb - result["mab"] = mab - - if drop_stats: - for istat in drop_stats: - result.pop(istat, None) - - return result - - -def closest_index(num_array, value): - """Returns index in number array that is closest to input value""" - return np.argmin(np.abs(np.asarray(num_array) - value)) - - -def numbers_in_str(input_string): - """This method finds all numbers in a string - - Note - ---- - - Beta version, please use with care - - Detects only integer numbers, dots are ignored - - Parameters - ---------- - input_string : str - string containing numbers - - Returns - ------- - list - list of strings specifying all numbers detected in string - - Example - ------- - >>> numbers_in_str('Bla42Blub100') - [42, 100] - """ - numbers = [] - IN_NUM = False - c_num = None - for char in input_string: - try: - int(char) - if not IN_NUM: - IN_NUM = True - c_num = char - elif IN_NUM: - c_num += char - except Exception: - if IN_NUM: - numbers.append(c_num) - IN_NUM = False - if IN_NUM: - numbers.append(c_num) - return numbers - - -def exponent(num): - """Get exponent of input number - - Parameters - ---------- - num : :obj:`float` or iterable - input number - - Returns - ------- - :obj:`int` or :obj:`ndarray` containing ints - exponent of input number(s) - - Example - ------- - >>> from pyaerocom.mathutils import exponent - >>> exponent(2340) - 3 - """ - return np.floor(np.log10(abs(np.asarray(num)))).astype(int) - - -def range_magnitude(low, high): - """Returns magnitude of value range - - Parameters - ---------- - low : float - lower end of range - high : float - upper end of range - - Returns - ------- - int - magnitudes spanned by input numbers - - Example - ------- - - >>> range_magnitude(0.1, 100) - 3 - >>> range_magnitude(100, 0.1) - -3 - >>> range_magnitude(1e-3, 1e6) - 9 - - """ - return exponent(high) - exponent(low) - - -def estimate_value_range(vmin, vmax, extend_percent=0): - """ - Round and extend input range to estimate lower and upper bounds of range - - Parameters - ---------- - vmin : float - lower value of range - vmax : float - upper value of range - extend_percent : int - percentage specifying to which extent the input range is supposed to be - extended. 
- - Returns - ------- - float - estimated lower end of range - float - estimated upper end of range - - - """ - if not vmax > vmin: - raise ValueError("vmax needs to exceed vmin") - # extent value range by +/- 5% - offs = (vmax - vmin) * extend_percent * 0.01 - vmin, vmax = vmin - offs, vmax + offs - - if vmin != 0: - exp = float(exponent(vmin)) - else: - exp = float(exponent(vmax)) - # round values - vmin = np.floor(vmin * 10 ** (-exp)) * 10.0 ** (exp) - vmax = np.ceil(vmax * 10 ** (-exp)) * 10.0 ** (exp) - return vmin, vmax - - -def _init_stats_dummy(drop_stats=None): - # dummy for statistics dictionary for locations without data - stats_dummy = {} - for k in calc_statistics([1], [1], drop_stats=drop_stats): - stats_dummy[k] = np.nan - - # Test to make sure these variables are defined even when yearly and season != all - stats_dummy["R_spatial_mean"] = np.nan - stats_dummy["R_spatial_median"] = np.nan - stats_dummy["R_temporal_mean"] = np.nan - stats_dummy["R_temporal_median"] = np.nan - - return stats_dummy diff --git a/src/pyaro_readers/nilupmfebas/metastandards.py b/src/pyaro_readers/nilupmfebas/metastandards.py deleted file mode 100644 index ce43a79..0000000 --- a/src/pyaro_readers/nilupmfebas/metastandards.py +++ /dev/null @@ -1,395 +0,0 @@ -import logging -from configparser import ConfigParser - -import numpy as np - -from ._lowlevel_helpers import BrowseDict -from . import resources - -logger = logging.getLogger(__name__) - - -class DataSource(BrowseDict): - """Dict-like object defining a data source - - Attributes - ---------- - data_id - name (or ID) of dataset (e.g. AeronetSunV3Lev2.daily) - dataset_name - name of dataset (e.g. AERONET) - data_product - data product (e.g. SDA, Inv, Sun for Aeronet) - data_version - version of data (e.g. 3) - data_level - level of data (e.g. 2) - framework : str - ID of framework to which data is associated (e.g. ACTRIS, GAW) - instr_vert_loc : str - Vertical location of measuring instrument(s). - revision_date - last revision date of dataset - ts_type_src - sampling frequency as defined in data files (use None if undefined) - stat_merge_pref_attr : str - optional, a metadata attribute that is available in data and that - is used to order the individual stations by relevance in case overlaps - occur. The associated values of this attribute need to be sortable - (e.g. revision_date). This is only relevant in case overlaps occur. 
- """ - - SUPPORTED_VERT_LOCS = ["ground", "space", "airborne"] - - _types = dict( - dataset_name=str, - data_product=str, - data_version=float, - data_level=float, - framework=str, - instr_vert_loc=str, - ts_type_src=str, - stat_merge_pref_attr=str, - revision_date=np.datetime64, - website=str, - ) - - _ini_file_name = "data_sources.ini" - - def __init__(self, **info): - self.data_id = None - self.dataset_name = None - self.data_product = None - self.data_version = None - self.data_level = None - - self.framework = None - self.instr_vert_loc = None - self.revision_date = None - self.website = None - - self.ts_type_src = None - - self.stat_merge_pref_attr = None - - self.update(**info) - if self.data_id is not None: - self._parse_source_info_from_ini() - - @property - def data_dir(self): - """Directory containing data files""" - from pyaerocom.io.helpers import get_obsnetwork_dir - - return get_obsnetwork_dir(self.data_id) - - def dataset_str(self): - s = "" - if self.dataset_name is not None: - s += self.dataset_name - hasv = False - if self.data_version is not None: - s += f"(v{self.data_version}" - hasv = True - if self.data_level is not None: - if hasv: - s += f", Lev {self.data_level})" - else: - s += f"(Lev {self.data_level})" - else: - s += ")" - else: - s += self.data_id - return s - - def load_dataset_info(self): - """Wrapper for :func:`_parse_source_info_from_ini`""" - try: - self._parse_source_info_from_ini() - except Exception: - pass - - def _parse_source_info_from_ini(self): - """Parse source info from ini file""" - - if not resources.is_resource("pyaerocom.data", self._ini_file_name): - raise OSError(f"File {self._ini_file_name} does not exist") - - parser = ConfigParser() - with resources.path("pyaerocom.data", self._ini_file_name) as path: - parser.read(path) - if self.data_id in parser: - for k, v in parser[self.data_id].items(): - if k in self._types: - self[k] = self._types[k](v) - else: - self[k] = str(v) - - -class StationMetaData(DataSource): - """This object defines a standard for station metadata in pyaerocom - - Variable names associated with meta data can vary significantly between - different conventions (e.g. conventions in modellers community vs. - observations community). - - Note - ---- - - This object is a dictionary and can be easily expanded - - In many cases, only some of the attributes are relevant - - Attributes - ---------- - filename : str - name of file (may be full path or only filename) - station_id : str - Code or unique ID of station - station_name :str - name or ID of a station. Note, that the concept of a station in - pyaerocom is not necessarily related to a fixed coordinate. A station - can also be a satellite, ship, or a human walking around and measuring - something - instrument_name : str - name (or ID) of instrument - PI : str - principal investigator - country : str - string specifying country (or country ID) - ts_type : str - frequency of data (e.g. monthly). Note the difference between - :attr:`ts_type_src` of :class:`DataSource`, which specifies the freq. - of the original files. 
- latitude : float - latitude coordinate - longitude : float - longitude coordinate - altitude : float - altitude coordinate - - """ - - def __init__(self, **info): - self.filename = None - - self.station_id = None - self.station_name = None - self.instrument_name = None - self.PI = None - - self.country = None - self.country_code = None - - self.ts_type = None - - self.latitude = np.nan - self.longitude = np.nan - self.altitude = np.nan - - super().__init__(**info) - - -class AerocomDataID: - """ - Class representing a model data ID following AeroCom PhaseIII conventions - - The ID must contain 4 substrings with meta parameters: - - -_- - - E.g. - - NorESM2-met2010_CTRL-AP3 - - For more information see `AeroCom diagnostics spreadsheet `__ - - This interface can be used to make sure a provided data ID is following - this convention and to extract the corresponding meta parameters as - dictionary (:func:`to_dict`) or to create an data_id from the corresponding - meta parameters :func:`from_dict`. - """ - - DELIM = "_" - SUBDELIM = "-" - KEYS = ["model_name", "meteo", "experiment", "perturbation"] - - def __init__(self, data_id=None, **meta_info): - self._data_id = None - self._values = None - - if data_id is not None: - self.data_id = data_id - elif meta_info: - self._values_from_dict(meta_info) - - @property - def data_id(self): - """ - str - AeroCom data ID - """ - return self._data_id - - @data_id.setter - def data_id(self, val): - self._values = self._eval_data_id(val) - self._data_id = val - - @property - def values(self): - if self._values is not None: - return self._values - raise AttributeError("Meta value list is not set.") - - @values.setter - def values(self, val): - if not isinstance(val, list) or not len(val) == len(self.KEYS): - raise ValueError(f"Invalid input: need list of length {len(self.KEYS)}") - # this will first create a data_id string from input values and - # then call setter method to make sure the input is correct. - self.data_id = self.from_values(val) - - def to_dict(self): - """Convert data_id to dictionary - - Returns - ------- - dict - dictionary with metadata information - """ - if not len(self._values) == len(self.KEYS): - self._eval_data_id(self.data_id) - return dict(zip(self.KEYS, self._values)) - - def _values_from_dict(self, meta): - vals = [] - for key in self.KEYS: - if not key in meta: - raise KeyError(f"Missing specification of {key} in input meta dict") - vals.append(meta[key]) - self._data_id = self.from_values(vals) - self._values = vals - - @staticmethod - def from_dict(meta): - """ - Create instance of AerocomDataID from input meta dictionary - - Parameters - ---------- - meta : dict - dictionary containing required keys (cf. :attr:`KEYS`) and - corresponding values to create an data_id - - Raises - ------ - KeyError - if not all information required is provided - - Returns - ------- - AerocomDataID - - """ - return AerocomDataID(**meta) - - @staticmethod - def from_values(values): - """ - Create data_id from list of values - - Note - ---- - The values have to be in the right order, cf. 
:attr:`KEYS` - - Parameters - ---------- - values : list - list containing values for each key in :attr:`KEYS` - - Raises - ------ - ValueError - if length of input list mismatches length of :attr:`KEYS` - - Returns - ------- - str - generated data_id - - """ - if not len(values) == 4: - raise ValueError( - "Need 4 entries model_name, meteo_config, experiment, perturbation" - ) - return "{}-{}_{}-{}".format(*values) - - def _eval_data_id(self, val): - """ - Check and extract meta information from input data_id - - Parameters - ---------- - val : str - data_id - - Raises - ------ - ValueError - if input is not string or is not in format - -_- - - Returns - ------- - values - DESCRIPTION. - - """ - if not isinstance(val, str): - raise ValueError(f"Invalid input for data_id. Need str. Got {val}") - - values = [""] * len(self.KEYS) - spl = val.split(self.DELIM) - if not len(spl) == 2: - logger.debug( - "Invalid or old data ID %s. Consider format -_-", - val, - ) - - values[0] = val - return values - - sub = spl[0].split(self.SUBDELIM, 1) - if len(sub) == 2: - values[0] = sub[0] # model_name - - meteo = sub[1] - if meteo.startswith("met"): - values[1] = meteo # meteo_config - else: - logger.debug( - "Meteorology config substring in data_id %s needs to start with met.", - meteo, - ) - values[0] = spl[0] - else: - values[0] = spl[0] - - sub = spl[1].split(self.SUBDELIM, 1) - if len(sub) == 2: - values[2] = sub[0] - values[3] = sub[1] - else: - values[2] = spl[1] - return values - - def __eq__(self, other): - return True if self._data_id == str(other) else False - - def __repr__(self): - return self._data_id - - def __str__(self): - return self._data_id - - -STANDARD_META_KEYS = list(StationMetaData()) diff --git a/src/pyaro_readers/nilupmfebas/molmasses.py b/src/pyaro_readers/nilupmfebas/molmasses.py deleted file mode 100644 index e18dec1..0000000 --- a/src/pyaro_readers/nilupmfebas/molmasses.py +++ /dev/null @@ -1,111 +0,0 @@ -VAR_PREFIXES = [ - "vmr", - "mmr", - "conc", - "sconc", - "wet", - "dry", - "concNt", - "concN", - "concC", - "proxydry", - "proxywet", - "dep", -] - -# in g/mol -MOLMASSES = { - "air_dry": 28.9647, - "o3": 48, - "so2": 64.066, - "so4": 96.06, - "no": 30.01, - "no2": 46.0055, - "no3": 62.0045, - "hno3": 63.01, - "nh3": 17.031, - "nh4": 18.039, - "co": 28.010, - "isop": 68.12, - "glyoxal": 58.036, - "glyox": 58.036, - "hcho": 30.026, - "co2": 44.0095, - "ch4": 16.04, -} - - -class UnkownSpeciesError(ValueError): - pass - - -def get_species(var_name): - """ - Get species name from variable name - - Parameters - ---------- - var_name : str - pyaerocom variable name (cf. variables.ini) - - Raises - ------ - UnkownSpeciesError - if species cannot be inferred - - Returns - ------- - str - name of species - - """ - if var_name in MOLMASSES: - return var_name - for prefix in VAR_PREFIXES: - if var_name.startswith(prefix): - species = var_name.split(prefix)[-1] - if species in MOLMASSES: - return species - raise UnkownSpeciesError( - f"Could not infer atom / molecule/ species from var_name {var_name}" - ) - - -def get_molmass(var_name): - """ - Get molar mass for input variable - - Parameters - ---------- - var_name : str - pyaerocom variable name (cf. 
variables.ini) or name of species - - Returns - ------- - float - molar mass of species in units of g/mol - - """ - return MOLMASSES[get_species(var_name)] - - -def get_mmr_to_vmr_fac(var_name): - """ - Get conversion factor for MMR -> VMR conversion for input variable - - Note - ---- - Assumes dry air molar mass - - Parameters - ---------- - var_name : str - Name of variable to be converted - - Returns - ------- - float - multiplication factor to convert MMR -> VMR - - """ - return get_molmass("air_dry") / get_molmass(var_name) diff --git a/src/pyaro_readers/nilupmfebas/obs_io.py b/src/pyaro_readers/nilupmfebas/obs_io.py deleted file mode 100644 index 4698849..0000000 --- a/src/pyaro_readers/nilupmfebas/obs_io.py +++ /dev/null @@ -1,141 +0,0 @@ -""" -Settings and helper methods / classes for I/O of obervation data - -Note ----- -Some settings like paths etc can be found in :mod:`pyaerocom.config.py` -""" -from ._lowlevel_helpers import dict_to_str, str_underline - -#: Wavelength tolerance for observations if data for required wavelength -#: is not available -OBS_WAVELENGTH_TOL_NM = 10.0 - -#: This boolean can be used to enable / disable the former (i.e. use -#: available wavelengths of variable in a certain range around variable -#: wavelength). -OBS_ALLOW_ALT_WAVELENGTHS = True - - -class ObsVarCombi: - def __init__(self, obs_id, var_name): - self.obs_id = obs_id - self.var_name = var_name - - def __repr__(self): - return f"{self.obs_id};{self.var_name}" - - def __str__(self): - return repr(self) - - -class AuxInfoUngridded: - MAX_VARS_PER_METHOD = 2 - - def __init__( - self, - data_id, - vars_supported, - aux_requires, - aux_merge_how, - aux_funs=None, - aux_units=None, - ): - self.data_id = data_id - self.vars_supported = vars_supported - - self.aux_requires = aux_requires - self.aux_merge_how = aux_merge_how - self.aux_funs = aux_funs - self.aux_units = aux_units - self.check_status() - - def to_dict(self): - """Dictionary representation of this object - - Ignores any potential private attributes. - """ - dd = {} - for key, val in self.__dict__.items(): - if any([key.startswith(x) for x in ("_", "__")]): - continue - dd[key] = val - return dd - - def check_status(self): - """ - Check if specifications are correct and consistent - - Raises - ------ - ValueError - If one of the class attributes is invalid - NotImplementedError - If computation method contains more than 2 variables / datasets - - """ - if isinstance(self.vars_supported, str): - self.vars_supported = [self.vars_supported] - - if isinstance(self.aux_merge_how, str): - nv = len(self.vars_supported) - self.aux_merge_how = dict( - zip(self.vars_supported, [self.aux_merge_how] * nv) - ) - if self.aux_funs is None: - self.aux_funs = {} - if self.aux_units is None: - self.aux_units = {} - - for var in self.vars_supported: - if not var in self.aux_requires: - raise ValueError( - f"Variable {var} is not defined in attr aux_requires..." 
- ) - - elif not var in self.aux_merge_how: - raise ValueError( - f"Missing information about how {var} should be merged (aux_merge_how)" - ) - merge_how = self.aux_merge_how[var] - if merge_how == "eval": - if not var in self.aux_funs: - raise ValueError( - f"Specification of computation function is missing for var {var}" - ) - fun = self.aux_funs[var] - - if not isinstance(fun, str): - raise ValueError("eval functions need to be strings") - - aux_info = self.aux_requires[var] - - fc = 0 - for aux_id, var_info in aux_info.items(): - if isinstance(var_info, str): - # make sure variables are represented as list, even if - # it is only one - aux_info[aux_id] = var_info = [var_info] - for _var in var_info: - obsvar = ObsVarCombi(aux_id, _var) - obsvarstr = str(obsvar) - if merge_how == "eval" and not obsvarstr in fun: - raise ValueError( - f"Mismatch between aux_requires and aux_funs for variable {var}. " - f"No such obs;var string {obsvarstr} in computation method {fun}" - ) - - fc += 1 - if fc > self.MAX_VARS_PER_METHOD: - raise NotImplementedError( - "So far only 2 variables can be combined..." - ) - - def __repr__(self): - return f"{type(self).__name__}; data_id: {self.data_id}; vars_supported: {self.vars_supported}" - - def __str__(self): - name = str(type(self).__name__) - s = str_underline(name) - s += dict_to_str(self.to_dict()) - return s diff --git a/src/pyaro_readers/nilupmfebas/readungridded.py b/src/pyaro_readers/nilupmfebas/readungridded.py deleted file mode 100755 index b703e4c..0000000 --- a/src/pyaro_readers/nilupmfebas/readungridded.py +++ /dev/null @@ -1,897 +0,0 @@ -import logging -import os -import warnings -from copy import deepcopy -from pathlib import Path -from typing import Optional, Union - -from pyaerocom import const -from pyaerocom.combine_vardata_ungridded import combine_vardata_ungridded -from pyaerocom.exceptions import ( - DataRetrievalError, - NetworkNotImplemented, - NetworkNotSupported, -) -from pyaerocom.helpers import varlist_aerocom -from pyaerocom.io import ReadUngriddedBase -from pyaerocom.io.cachehandler_ungridded import CacheHandlerUngridded -from pyaerocom.io.gaw.reader import ReadGAW -from pyaerocom.io.ghost.reader import ReadGhost -from pyaerocom.io.icos.reader import ReadICOS -from pyaerocom.io.icpforests.reader import ReadICPForest -from pyaerocom.io.mep.reader import ReadMEP -from pyaerocom.io.pyaro.pyaro_config import PyaroConfig -from pyaerocom.io.pyaro.read_pyaro import ReadPyaro -from pyaerocom.io.read_aasetal import ReadAasEtal -from pyaerocom.io.read_aeronet_invv2 import ReadAeronetInvV2 -from pyaerocom.io.read_aeronet_invv3 import ReadAeronetInvV3 -from pyaerocom.io.read_aeronet_sdav2 import ReadAeronetSdaV2 -from pyaerocom.io.read_aeronet_sdav3 import ReadAeronetSdaV3 -from pyaerocom.io.read_aeronet_sunv2 import ReadAeronetSunV2 -from pyaerocom.io.read_aeronet_sunv3 import ReadAeronetSunV3 -from pyaerocom.io.read_airnow import ReadAirNow -from pyaerocom.io.read_earlinet import ReadEarlinet -from pyaerocom.io.read_ebas import ReadEbas -from pyaerocom.io.read_eea_aqerep import ReadEEAAQEREP -from pyaerocom.io.read_eea_aqerep_v2 import ReadEEAAQEREP_V2 -from pyaerocom.ungriddeddata import UngriddedData -from pyaerocom.variable import get_aliases - -logger = logging.getLogger(__name__) - - -# TODO Add check if data id of config is same as one already given by pyaerocom -class ReadUngridded: - """Factory class for reading of ungridded data based on obsnetwork ID - - This class also features reading functionality that goes beyond reading - 
of inidividual observation datasets; including, reading of multiple - datasets and post computation of new variables based on datasets that can - be read. - - Parameters - ---------- - COMING SOON - - """ - - SUPPORTED_READERS = [ - ReadAeronetInvV3, - ReadAeronetInvV2, - ReadAeronetSdaV2, - ReadAeronetSdaV3, - ReadAeronetSunV2, - ReadAeronetSunV3, - ReadEarlinet, - ReadEbas, - ReadAasEtal, - ReadAirNow, - ReadEEAAQEREP, - ReadEEAAQEREP_V2, - ReadGAW, - ReadGhost, - ReadMEP, - ReadICOS, - ReadICPForest, - ] - - # Creates list of all readers excluding ReadPyaro - INCLUDED_READERS = deepcopy(SUPPORTED_READERS) - - # Adds ReadPyaro to said list - SUPPORTED_READERS.append(ReadPyaro) - - DONOTCACHE_NAME = "DONOTCACHE" - - def __init__( - self, - data_ids=None, - ignore_cache=False, - data_dirs=None, - configs: Optional[Union[PyaroConfig, list[PyaroConfig]]] = None, - ): - # will be assigned in setter method of data_ids - self._data_ids = [] - self._data_dirs = {} - - #: dictionary containing reading classes for each dataset to read (will - #: be accessed via get_reader) - self._readers = {} - - if data_ids is not None: - self.data_ids = data_ids - - if data_dirs is not None: - self.data_dirs = data_dirs - - if ignore_cache: - logger.info("Deactivating caching") - const.CACHING = False - - self.config_ids = {} - self.config_map = {} - - if isinstance(configs, PyaroConfig): - configs = [configs] - - self._configs = configs - - if isinstance(configs, list): - for config in configs: - if config is not None: - self._init_pyaro_reader(config=config) - - @property - def data_dirs(self): - """ - dict: Data directory(ies) for dataset(s) to read (keys are data IDs) - """ - return self._data_dirs - - @data_dirs.setter - def data_dirs(self, val): - if isinstance(val, Path): - val = str(val) - dsr = self.data_ids - if len(dsr) < 2 and isinstance(val, str): - val = {dsr[0]: val} - elif not isinstance(val, dict): - raise ValueError( - f"Invalid input for data_dirs ({val}); needs to be a dictionary." 
- ) - for data_dir in val.values(): - assert os.path.exists(data_dir), f"{data_dir} does not exist" - self._data_dirs = val - - @property - def post_compute(self): - """Information about datasets that can be computed in post""" - return const.OBS_UNGRIDDED_POST - - @property - def INCLUDED_DATASETS(self): - lst = [] - for reader in self.INCLUDED_READERS: - lst.extend(reader.SUPPORTED_DATASETS) - lst.extend(self.post_compute) - return lst - - @property - def SUPPORTED_DATASETS(self): - """ - Returns list of strings containing all supported dataset names - """ - lst = [] - for reader in self.SUPPORTED_READERS: - lst.extend(reader.SUPPORTED_DATASETS) - lst.extend(self.post_compute) - lst.extend(list(self.config_ids.keys())) - return lst - - @property - def supported_datasets(self): - """ - Wrapper for :attr:`SUPPORTED_DATASETS` - """ - return self.SUPPORTED_DATASETS - - def _check_donotcachefile(self): - """Check if donotcache file exists - - Returns - ------- - bool - True if file exists, else False - """ - try: - if os.path.exists(os.path.join(const.cache_basedir, self.DONOTCACHE_NAME)): - return True - except: - pass - return False - - @property - def ignore_cache(self): - """Boolean specifying whether caching is active or not""" - if self._check_donotcachefile() or not const.CACHING: - return True - return False - - @property - def data_ids(self): - """List of datasets supposed to be read""" - return self._data_ids - - @data_ids.setter - def data_ids(self, val): - if isinstance(val, str): - val = [val] - elif not isinstance(val, (tuple, list)): - raise OSError("Invalid input for parameter data_ids") - self._data_ids = val - - @property - def configs(self): - """List configs""" - return self._configs - - @configs.setter - def configs(self, val: Union[PyaroConfig, list[PyaroConfig]]): - if isinstance(val, PyaroConfig): - val = [val] - elif not isinstance(val, (tuple, list)): - raise OSError("Invalid input for parameter data_ids") - logger.warning( - f"You are now overwriting the list of configs. This will delete the previous configs, but will leave readeres associated with those configs intact. Use 'add_config' for safer usage!" - ) - for config in val: - self._init_pyaro_reader(config=config) - self._configs = deepcopy(val) - - @property - def data_id(self): - """ - ID of dataset - - Note - ----- - Only works if exactly one dataset is assigned to the reader, that is, - length of :attr:`data_ids` is 1. - - Raises - ------ - AttributeError - if number of items in :attr:`data_ids` is unequal one. 
- - Returns - ------- - str - data ID - - """ - nids = len(self.data_ids) - if nids == 0: - raise AttributeError("No data_id assigned") - elif nids > 1: - raise AttributeError("Multiple data_ids assigned") - return self.data_ids[0] - - def dataset_provides_variables(self, data_id=None): - """List of variables provided by a certain dataset""" - if data_id is None: - data_id = self.data_id - if config is None: - config = self.config - if not data_id in self._readers: - reader = self.get_lowlevel_reader(data_id) - else: - reader = self._readers[data_id] - return reader.PROVIDES_VARIABLES - - def get_reader(self, data_id): - warnings.warn( - "this method was renamed to get_lowlevel_reader, please use the new name", - DeprecationWarning, - stacklevel=2, - ) - return self.get_lowlevel_reader(data_id) - - def get_lowlevel_reader(self, data_id: str | None = None) -> ReadUngriddedBase: - """Helper method that returns initiated reader class for input ID - - Parameters - ----------- - data_id : str - Name of dataset - - Returns - ------- - ReadUngriddedBase - instance of reading class (needs to be implementation of base - class :class:`ReadUngriddedBase`). - """ - - if data_id is None: - if len(self.data_ids) != 1: - raise ValueError("Please specify dataset") - if data_id not in self.supported_datasets: - if data_id not in self.config_map: - raise NetworkNotSupported( - f"Could not fetch reader class: Input " - f"network {data_id} is not supported by " - f"ReadUngridded" - ) - elif data_id not in self.data_ids: - self.data_ids.append(data_id) - if data_id not in self._readers: - _cls = self._find_read_class(data_id) - reader = self._init_lowlevel_reader(_cls, data_id) - self._readers[data_id] = reader - return self._readers[data_id] - - def add_pyaro_reader(self, config: PyaroConfig) -> ReadUngriddedBase: - return self._init_pyaro_reader(config=config) - - def _init_pyaro_reader(self, config: PyaroConfig) -> ReadUngriddedBase: - """ - Initializes PyAro reader from config, and adds reader to list of readers. If no config is given, the config given when ReaderUngridded was initiated is used - - Parameters - ----------- - config : PyaroConfig - Config for reader - - Returns - ------- - ReadUngriddedBase - instance of reading class (needs to be implementation of base - class :class:`ReadUngriddedBase`) - - - Raises - ------ - ValueError - If both the config argument and self.config are None - """ - name = config.name - - if name in self.INCLUDED_DATASETS: - raise NameError( - f"{name} from config {config} cannot have the same name as an included dataset" - ) - - if name in self._readers: - return self._readers[name] - - else: - reader = ReadPyaro(config=config) - self._readers[name] = reader - self._data_ids.append(name) - self.config_ids[name] = config.data_id - self.config_map[name] = config - return reader - - def add_config(self, config: PyaroConfig) -> None: - """ - Adds single PyaroConfig to self.configs - - Parameters - ---------- - config: PyaroConfig - - Raises - ------ - ValueError - If config is not PyaroConfig - - """ - if not isinstance(config, PyaroConfig): - raise ValueError(f"Given config is not a PyaroConfig") - - self._init_pyaro_reader(config=config) - self._configs.append(config) - - def _find_read_class(self, data_id): - """Find reading class for dataset name - - Loops over all reading classes available in :attr:`SUPPORTED_READERS` - and finds the first one that matches the input dataset name, by - checking the attribute :attr:`SUPPORTED_DATASETS` in each respective - reading class. 
- - Parameters - ----------- - data_id : str - Name of dataset - - Returns - ------- - ReadUngriddedBase - instance of reading class (needs to be implementation of base - class :class:`ReadUngriddedBase`) - - Raises - ------ - NetworkNotImplemented - if network is supported but no reading routine is implemented yet - - """ - for _cls in self.SUPPORTED_READERS: - if data_id in _cls.SUPPORTED_DATASETS: - return _cls - raise NetworkNotImplemented( - f"Could not find reading class for dataset {data_id}" - ) - - def _init_lowlevel_reader(self, reader, data_id): - """ - Initiate lowlevel reader for input data ID - - Parameters - ---------- - reader - reader class (not instantiated) - data_id : str - ID of dataset to be isntantiated with input reader - - Returns - ------- - ReadUngriddedBase - instantiated reader class for input ID. - - """ - # if data_id is not None and config is not None: - # if data_id != config.name: - # raise ValueError( - # f"DATA ID and config are both given, but they are not equal, {data_id} != {config.data_id}" - # ) - # if config is None: - # config = self.config - - # if data_id is None: - # data_id = config.name - - if data_id is None: - raise ValueError(f"Data_id can not be none") - - if data_id in self.config_map: - return reader(config=self.config_map[data_id]) - - if data_id in self.data_dirs: - ddir = self.data_dirs[data_id] - logger.info(f"Reading {data_id} from specified data loaction: {ddir}") - else: - ddir = None - - return reader(data_id=data_id, data_dir=ddir) - - def read_dataset( - self, - data_id, - vars_to_retrieve=None, - only_cached=False, - filter_post=None, - **kwargs, - ): - """Read dataset into an instance of :class:`ReadUngridded` - - Parameters - ---------- - data_id : str - name of dataset - vars_to_retrieve : list - variable or list of variables to be imported - only_cached : bool - if True, then nothing is reloaded but only data is loaded that is - available as cached objects (not recommended to use but may be - used if working offline without connection to database) - filter_post : dict, optional - filters applied to `UngriddedData` object AFTER it is read into - memory, via :func:`UngriddedData.apply_filters`. This option was - introduced in pyaerocom version 0.10.0 and should be used - preferably over **kwargs. There is a certain flexibility with - respect to how these filters can be defined, for instance, sub - dicts for each `data_id`. The most common way would be - to provide directly the input needed for - `UngriddedData.apply_filters`. If you want to read multiple variables - from one or more datasets, and if you want to apply variable - specific filters, it is recommended to read the data individually - for each variable and corresponding set of filters and then - merge the individual filtered `UngriddedData` objects afterwards, - e.g. using `data_var1 & data_var2`. - **kwargs - Additional input options for reading of data, which are applied - WHILE the data is read. If any such additional options are - provided that are applied during the reading, then automatic - caching of the output `UngriddedData` object will be deactivated. - Thus, it is recommended to handle data filtering via `filter_post` - argument whenever possible, which will result in better performance - as the unconstrained original data is read in and cached, and then - the filtering is applied. 
- - Returns - -------- - UngriddedData - data object - """ - _caching = None - if len(kwargs) > 0: - _caching = const.CACHING - const.CACHING = False - - logger.info("Received additional reading constraints, ignoring caching") - - reader = self.get_lowlevel_reader(data_id) - - if vars_to_retrieve is None: - vars_to_retrieve = reader.DEFAULT_VARS - - vars_to_retrieve = varlist_aerocom(vars_to_retrieve) - - # Since this interface enables to load multiple datasets, each of - # which support a number of variables, here, only the variables are - # considered that are supported by the dataset - vars_available = [var for var in vars_to_retrieve if reader.var_supported(var)] - - if len(vars_available) == 0: - raise DataRetrievalError( - f"None of the input variables ({vars_to_retrieve}) is " - f"supported by {data_id} interface" - ) - cache = CacheHandlerUngridded(reader) - if not self.ignore_cache: - # initate cache handler - for var in vars_available: - try: - cache.check_and_load(var, force_use_outdated=only_cached) - except Exception: - logger.exception( - "Fatal: compatibility error between old cache file " - "and current version of code." - ) - - if not only_cached: - vars_to_read = [v for v in vars_available if not v in cache.loaded_data] - else: - vars_to_read = [] - - data_read = None - if len(vars_to_read) > 0: - _loglevel = logger.level - logger.setLevel(logging.INFO) - data_read = reader.read(vars_to_read, **kwargs) - logger.setLevel(_loglevel) - - for var in vars_to_read: - # write the cache file - if not self.ignore_cache: - try: - cache.write(data_read, var) - except Exception as e: - _caching = False - logger.warning( - f"Failed to write to cache directory. " - f"Error: {repr(e)}. Deactivating caching " - f"in pyaerocom" - ) - - if len(vars_to_read) == len(vars_available): - data_out = data_read - else: - data_out = UngriddedData() - for var in vars_available: - if var in cache.loaded_data: - data_out.append(cache.loaded_data[var]) - if data_read is not None: - data_out.append(data_read) - - if _caching is not None: - const.CACHING = _caching - - if filter_post: - filters = self._eval_filter_post(filter_post, data_id, vars_available) - data_out = data_out.apply_filters(**filters) - - # Check to see if this reader is for a VerticalProfile - # It is currently only allowed that a reader can be for a VerticalProfile, not a species - if getattr(reader, "is_vertical_profile", None): - data_out.is_vertical_profile = reader.is_vertical_profile - - return data_out - - def _eval_filter_post(self, filter_post, data_id, vars_available): - filters = {} - if not isinstance(filter_post, dict): - raise ValueError(f"input filter_post must be dict, got {type(filter_post)}") - elif len(filter_post) == 0: - return filters - - if data_id in filter_post: - # filters are specified specifically for that dataset - subset = filter_post[data_id] - return self._eval_filter_post(subset, data_id, vars_available) - - for key, val in filter_post.items(): - if key == "ignore_station_names": # for backwards compatibility - if isinstance(val, (str, list)): - filters["station_name"] = val - if not "negate" in filters: - filters["negate"] = [] - filters["negate"].append("station_name") - - elif isinstance(val, dict): # variable specific station filtering - if len(vars_available) > 1: - raise NotImplementedError( - f"Cannot filter different sites for multivariable " - f"UngriddedData objects (i.e. 
apply filter " - f"ignore_station_names={val} for UngriddedData " - f"object containing {vars_available}" - ) - else: - # the variable that is available in the UngriddedData - # object - var = vars_available[0] - try: - filters["station_name"] = val[var] - if not "negate" in filters: - filters["negate"] = [] - filters["negate"].append("station_name") - except KeyError: - continue - else: - raise ValueError(f"Invalid input for ignore_station_names: {val}") - else: - filters[key] = val - return filters - - def read_dataset_post( - self, - data_id, - vars_to_retrieve, - only_cached=False, - filter_post=None, - **kwargs, - ): - """Read dataset into an instance of :class:`ReadUngridded` - - Parameters - ---------- - data_id : str - name of dataset - vars_to_retrieve : list - variable or list of variables to be imported - only_cached : bool - if True, then nothing is reloaded but only data is loaded that is - available as cached objects (not recommended to use but may be - used if working offline without connection to database) - filter_post : dict, optional - filters applied to `UngriddedData` object AFTER it is read into - memory, via :func:`UngriddedData.apply_filters`. This option was - introduced in pyaerocom version 0.10.0 and should be used - preferably over **kwargs. There is a certain flexibility with - respect to how these filters can be defined, for instance, sub - dicts for each `data_id`. The most common way would be - to provide directly the input needed for - `UngriddedData.apply_filters`. If you want to read multiple variables - from one or more datasets, and if you want to apply variable - specific filters, it is recommended to read the data individually - for each variable and corresponding set of filters and then - merge the individual filtered `UngriddedData` objects afterwards, - e.g. using `data_var1 & data_var2`. - **kwargs - Additional input options for reading of data, which are applied - WHILE the data is read. If any such additional options are - provided that are applied during the reading, then automatic - caching of the output `UngriddedData` object will be deactivated. - Thus, it is recommended to handle data filtering via `filter_post` - argument whenever possible, which will result in better performance - as the unconstrained original data is read in and cached, and then - the filtering is applied. - - Returns - -------- - UngriddedData - data object - """ - aux_info = self.post_compute[data_id] - loaded = [] - for var in vars_to_retrieve: - input_data_ids_vars = [] - aux_info_var = aux_info["aux_requires"][var] - for aux_id, aux_vars in aux_info_var.items(): - if aux_id in self.post_compute: - aux_data = self.read_dataset_post( - data_id=aux_id, - vars_to_retrieve=aux_vars, - only_cached=only_cached, - **kwargs, - ) - for aux_var in aux_vars: - input_data_ids_vars.append((aux_data, aux_id, aux_var)) - else: - # read variables individually, so filter_post is more - # flexible if some post filters are specified for - # individual variables... 
- for aux_var in aux_vars: - _data = self.read_dataset( - data_id=aux_id, - vars_to_retrieve=aux_var, - only_cached=only_cached, - filter_post=filter_post, - **kwargs, - ) - input_data_ids_vars.append((_data, aux_id, aux_var)) - - aux_merge_how = aux_info["aux_merge_how"][var] - - if var in aux_info["aux_units"]: - var_unit_out = aux_info["aux_units"][var] - else: - var_unit_out = None - - if aux_merge_how == "eval": - # function MUST be defined - aux_fun = aux_info["aux_funs"][var] - else: - aux_fun = None - - merged_stats = combine_vardata_ungridded( - data_ids_and_vars=input_data_ids_vars, - merge_eval_fun=aux_fun, - merge_how=aux_merge_how, - var_name_out=var, - var_unit_out=var_unit_out, - data_id_out=aux_info["data_id"], - ) - loaded.append(UngriddedData.from_station_data(merged_stats)) - first = loaded[0] - if len(loaded) == 1: - return first - for data in loaded[1:]: - first.append(data) - return first - - def read( - self, - data_ids=None, - vars_to_retrieve=None, - only_cached=False, - filter_post=None, - configs: Optional[Union[PyaroConfig, list[PyaroConfig]]] = None, - **kwargs, - ): - """Read observations - - Iter over all datasets in :attr:`data_ids`, call - :func:`read_dataset` and append to data object - - Parameters - ---------- - data_ids : str or list - data ID or list of all datasets to be imported - vars_to_retrieve : str or list - variable or list of variables to be imported - only_cached : bool - if True, then nothing is reloaded but only data is loaded that is - available as cached objects (not recommended to use but may be - used if working offline without connection to database) - filter_post : dict, optional - filters applied to `UngriddedData` object AFTER it is read into - memory, via :func:`UngriddedData.apply_filters`. This option was - introduced in pyaerocom version 0.10.0 and should be used - preferably over **kwargs. There is a certain flexibility with - respect to how these filters can be defined, for instance, sub - dicts for each `data_id`. The most common way would be - to provide directly the input needed for - `UngriddedData.apply_filters`. If you want to read multiple variables - from one or more datasets, and if you want to apply variable - specific filters, it is recommended to read the data individually - for each variable and corresponding set of filters and then - merge the individual filtered `UngriddedData` objects afterwards, - e.g. using `data_var1 & data_var2`. - **kwargs - Additional input options for reading of data, which are applied - WHILE the data is read. If any such additional options are - provided that are applied during the reading, then automatic - caching of the output `UngriddedData` object will be deactivated. - Thus, it is recommended to handle data filtering via `filter_post` - argument whenever possible, which will result in better performance - as the unconstrained original data is read in and cached, and then - the filtering is applied. 
- - Example - ------- - >>> import pyaerocom.io.readungridded as pio - >>> from pyaerocom import const - >>> obj = pio.ReadUngridded(data_id=const.AERONET_SUN_V3L15_AOD_ALL_POINTS_NAME) - >>> obj.read() - >>> print(obj) - >>> print(obj.metadata[0.]['latitude']) - - """ - if data_ids is None: - data_ids = self.data_ids - elif isinstance(data_ids, str): - data_ids = [data_ids] - - if configs is not None: - if not isinstance(configs, list): - configs = [configs] - for config in configs: - self._init_pyaro_reader(config=config) - data_ids.append(config.name) - - if isinstance(vars_to_retrieve, str): - vars_to_retrieve = [vars_to_retrieve] - - data = UngriddedData() - for ds in data_ids: - if ds in self.post_compute: - data.append( - self.read_dataset_post( - data_id=ds, - vars_to_retrieve=vars_to_retrieve, - only_cached=only_cached, - filter_post=filter_post, - **kwargs, - ) - ) - else: - data_to_append = self.read_dataset( - data_id=ds, - vars_to_retrieve=vars_to_retrieve, - only_cached=only_cached, - filter_post=filter_post, - **kwargs, - ) - data.append(data_to_append) - # TODO: Test this. UngriddedData can contain more than 1 variable - if getattr(data_to_append, "is_vertical_profile", None): - data.is_vertical_profile = data_to_append.is_vertical_profile - - logger.info(f"Successfully imported {ds} data") - return data - - def _check_var_alias(self, var, supported): - # could be an alias - aliases = get_aliases(var) - for svar in supported: - if svar in aliases: - return svar - raise ValueError() - - def get_vars_supported( - self, obs_id, vars_desired - ): # , config: Optional[PyaroConfig] = None): - """ - Filter input list of variables by supported ones for a certain data ID - - Parameters - ---------- - obs_id : str - ID of observation network - vars_desired : list - List of variables that are desired - - Returns - ------- - list - list of variables that can be read through the input network - - """ - obs_vars = [] - if isinstance(vars_desired, str): - vars_desired = [vars_desired] - if obs_id in self.post_compute: - # check if all required are accessible - postinfo = self.post_compute[obs_id] - supported = postinfo["vars_supported"] - for var in varlist_aerocom(vars_desired): - if not var in supported: - try: - var = self._check_var_alias(var, supported) - except ValueError: - # no alias match, skip... - continue - requires = postinfo["aux_requires"][var] - all_good = True - for ds, vars_required in requires.items(): - if isinstance(vars_required, str): - vars_required = [vars_required] - vars_avail = self.get_vars_supported(ds, vars_required) - if not len(vars_required) == len(vars_avail): - all_good = False - break - if all_good: - obs_vars.append(var) - - else: - # check if variable can be read from a dataset on disk - _oreader = self.get_lowlevel_reader(obs_id) - for var in varlist_aerocom(vars_desired): - if _oreader.var_supported(var): - obs_vars.append(var) - return obs_vars - - def __str__(self): - return "\n".join(str(self.get_lowlevel_reader(ds)) for ds in self.data_ids) diff --git a/src/pyaro_readers/nilupmfebas/readungriddedbase.py b/src/pyaro_readers/nilupmfebas/readungriddedbase.py deleted file mode 100644 index dd063c9..0000000 --- a/src/pyaro_readers/nilupmfebas/readungriddedbase.py +++ /dev/null @@ -1,661 +0,0 @@ -from __future__ import annotations - -import abc -import glob -import logging -import os -import warnings -from fnmatch import fnmatch - -import numpy as np - -from . 
import const -from ._lowlevel_helpers import list_to_shortstr -from .exceptions import DataSourceError -from .helpers import varlist_aerocom -from .io_helpers import get_obsnetwork_dir - -logger = logging.getLogger(__name__) - - -# TODO: Proposal: include attribute ts_type that is by default undefined but -# may be set to either of the defined -class ReadUngriddedBase(abc.ABC): - """TEMPLATE: Abstract base class template for reading of ungridded data - - .. note:: - - The two dictionaries ``AUX_REQUIRES`` and ``AUX_FUNS`` can be filled - with variables that are not contained in the original data files but - are computed during the reading. The former specifies what additional - variables are required to perform the computation and the latter - specifies functions used to perform the computations of the auxiliary - variables. - See, for instance, the class :class:`ReadAeronetSunV3`, which includes - the computation of the AOD at 550nm and the Angstrom coefficient - (in 440-870 nm range) from AODs measured at other wavelengths. - """ - - #: version of this base class. Please update if you apply changes to this - #: code. This version is required for caching and needs to be considered - #: in the definition of __version__ in all derived classes, so that - #: caching can be done reliably - __baseversion__ = "0.09" - - #: dictionary containing information about additionally required variables - #: for each auxiliary variable (i.e. each variable that is not provided - #: by the original data but computed on import) - AUX_REQUIRES = {} - - #: Functions that are used to compute additional variables (i.e. one - #: for each variable defined in AUX_REQUIRES) - AUX_FUNS = {} - - IGNORE_META_KEYS = [] - - _FILEMASK = "*.*" - - def __str__(self): - return ( - f"Dataset name: {self.data_id}\n" - f"Data directory: {self.data_dir}\n" - f"Supported variables: {self.PROVIDES_VARIABLES}\n" - f"Last revision: {self.data_revision}" - ) - - def __repr__(self): - return str(type(self).__name__) - - @abc.abstractproperty - def TS_TYPE(self): - """Temporal resolution of dataset - - This should be defined in the header of an implementation class if - it can be globally defined for the corresponding obs-network or in - other cases it should be initated as string ``undefined`` and then, - if applicable, updated in the reading routine of a file. - - The TS_TYPE information should ultimately be written into the meta-data - of objects returned by the implementation of :func:`read_file` (e.g. - instance of :class:`StationData` or a normal dictionary) and the method - :func:`read` (which should ALWAYS return an instance of the - :class:`UngriddedData` class). - - Note - ---- - - Please use ``"undefined"`` if the derived class is not sampled on \ - a regular basis. - - If applicable please use Aerocom ts_type (i.e. hourly, 3hourly, \ - daily, monthly, yearly) - - Note also, that the ts_type in a derived class may or may not be \ - defined in a general case. For instance, in the EBAS database the \ - resolution code can be found in the file header and may thus be \ - intiated as ``"undefined"`` in the initiation of the reading class \ - and then updated when the class is being read - - For derived implementation classes that support reading of multiple \ - network versions, you may also assign - """ - pass - - @abc.abstractproperty - def _FILEMASK(self): - """Mask for identifying datafiles (e.g. 
'*.txt') - - Note - ---- - May be implemented as global constant in header - """ - pass - - @abc.abstractproperty - def __version__(self): - """Version of reading class - - Keeps track of changes in derived reading class (e.g. to assess whether - potential cache-files are outdated). - - Note - ---- - May be implemented as global constant in header - """ - pass - - @abc.abstractproperty - def DATA_ID(self): - """Name of dataset (OBS_ID) - - Note - ---- - - - May be implemented as global constant in header of derieved class - - May be multiple that can be specified on init (see example below) - - """ - pass - - @abc.abstractproperty - def SUPPORTED_DATASETS(self): - """List of all datasets supported by this interface - - Note - ---- - - - best practice to specify in header of class definition - - needless to mention that :attr:`DATA_ID` needs to be in this list - """ - pass - - @abc.abstractproperty - def PROVIDES_VARIABLES(self): - """List of variables that are provided by this dataset - - Note - ---- - May be implemented as global constant in header - """ - pass - - @abc.abstractproperty - def DEFAULT_VARS(self): - """List containing default variables to read""" - pass - - @abc.abstractmethod - def read_file(self, filename, vars_to_retrieve=None): - """Read single file - - Parameters - ---------- - filename : str - string specifying filename - vars_to_retrieve : :obj:`list` or similar, optional, - list containing variable IDs that are supposed to be read. If None, - all variables in :attr:`PROVIDES_VARIABLES` are loaded - - Returns - ------- - :obj:`dict` or :obj:`StationData`, or other... - imported data in a suitable format that can be handled by - :func:`read` which is supposed to append the loaded results from - this method (which reads one datafile) to an instance of - :class:`UngriddedData` for all files. - """ - pass - - @abc.abstractmethod - def read(self, vars_to_retrieve=None, files=[], first_file=None, last_file=None): - """Method that reads list of files as instance of :class:`UngriddedData` - - Parameters - ---------- - vars_to_retrieve : :obj:`list` or similar, optional, - list containing variable IDs that are supposed to be read. If None, - all variables in :attr:`PROVIDES_VARIABLES` are loaded - files : :obj:`list`, optional - list of files to be read. If None, then the file list is used that - is returned on :func:`get_file_list`. - first_file : :obj:`int`, optional - index of first file in file list to read. If None, the very first - file in the list is used - last_file : :obj:`int`, optional - index of last file in list to read. If None, the very last file - in the list is used - - Returns - ------- - UngriddedData - instance of ungridded data object containing data from all files. - """ - pass - - ### Concrete implementations of methods that are the same for all (or most) - # of the derived reading classes - def __init__(self, data_id: str | None = None, data_dir: str | None = None): - self.data = None # object that holds the loaded data - self._data_id = None - self.files = [] - # list that will be updated in read method to store all files that - # could not be read. It is the responsibility of developers of derived - # classes to include a try / except block in method read, where the - # method read_file is called, and in case of an Exception, append the - # corresponding file path to this list. 
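-        # Illustrative sketch (an assumption, not part of the original file)
-        # of the try / except pattern a derived ``read`` implementation is
-        # expected to follow:
-        #
-        #     for path in file_list:
-        #         try:
-        #             station = self.read_file(path, vars_to_retrieve)
-        #         except Exception:
-        #             self.read_failed.append(path)
-        #             continue
-        #         # ... append ``station`` to the UngriddedData being built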
- self.read_failed = [] - - self._data_dir = data_dir - - #: Class own instance of logger class - self.logger = logging.getLogger(__name__) - self._add_aux_variables() - - if data_id is not None: - if not data_id in self.SUPPORTED_DATASETS: - raise AttributeError( - f"Dataset {data_id} not supported by this interface" - ) - self._data_id = data_id - - @property - def data_id(self): - """ID of dataset""" - return self.DATA_ID if self._data_id is None else self._data_id - - @property - def DATASET_PATH(self): - """Wrapper for :attr:`data_dir`.""" - warnings.warn( - "Attr. DATASET_PATH is deprecated in ungridded readers " - "as of pyaerocom v0.11.0. Please use data_dir instead.", - DeprecationWarning, - stacklevel=2, - ) - return self.data_dir - - @property - def data_dir(self) -> str: - """ - str: Location of the dataset - - Note - ---- - This can be set explicitly when instantiating the class (e.g. if data - is available on local machine). If unspecified, the data location is - attempted to be inferred via :func:`get_obsnetwork_dir` - - Raises - ------ - FileNotFoundError - if data directory does not exist or cannot be retrieved - automatically - """ - if self._data_dir is None: - self._data_dir = get_obsnetwork_dir(self.data_id) - if not os.path.exists(self._data_dir): - raise FileNotFoundError(f"{self._data_dir} does not exist.") - return self._data_dir - - @property - def REVISION_FILE(self): - """Name of revision file located in data directory""" - return const.REVISION_FILE - - @property - def AUX_VARS(self): - """List of auxiliary variables (keys of attr. :attr:`AUX_REQUIRES`) - - Auxiliary variables are those that are not included in original files - but are computed from other variables during import - """ - return list(self.AUX_REQUIRES) - - @property - def data_revision(self): - """Revision string from file Revision.txt in the main data directory""" - if "_data_revision" in self.__dict__: - return self.__dict__["_data_revision"] - rev = "n/d" - try: - revision_file = os.path.join(self.data_dir, self.REVISION_FILE) - if os.path.isfile(revision_file): - with open(revision_file) as in_file: - rev = in_file.readline().strip() - except Exception: - pass - self._data_revision = rev - return rev - - @property - def verbosity_level(self): - """Current level of verbosity of logger""" - return self.logger.level - - @verbosity_level.setter - def verbosity_level(self, val): - self.logger.setLevel(val) - - def _add_aux_variables(self): - """Helper that makes sure all auxiliary variables can be computed""" - for var in self.AUX_REQUIRES: - if not var in self.AUX_FUNS: - raise AttributeError( - f"Fatal: no computation method defined for auxiliary variable {var}. " - f"Please specify method in class header dictionary AUX_FUNS" - ) - if not var in self.PROVIDES_VARIABLES: - self.PROVIDES_VARIABLES.append(var) - - def _add_additional_vars(self, vars_to_retrieve): - """Add required additional variables for computation to input list - - Helper method that is called in :func:`check_vars_to_retrieve` - in order to find all variables that are required for a specified - retrieval. This is relevant for additionally computed variables - (attribute ``AUX_VARS``) that are not available in the original data - files, but are computed from available parameters. - - Parameters - ---------- - vars_to_retrieve : list - list of variables supported by this interface (i.e. 
must be - contained in ``PROVIDES_VARIABLES``) - - Returns - ------- - tuple - 2-element tuple, containing - - - bool : boolean, specifying whether variables list of required \ - variables needs to be extended or the order was changed - - list : additionally required variables - """ - changed = False - added_vars = [] - - for var in vars_to_retrieve: - if var in self.AUX_VARS: - vars_req = self.AUX_REQUIRES[var] - for var_req in vars_req: - if var_req in vars_to_retrieve: - idx_var = vars_to_retrieve.index(var) - idx_var_req = vars_to_retrieve.index(var_req) - if idx_var < idx_var_req: # wrong order for computation - vars_to_retrieve[idx_var] = var_req - vars_to_retrieve[idx_var_req] = var - # break and return that it was changed (i.e repeat - # calling this method until nothing is changed or - # added) - return (True, added_vars + vars_to_retrieve) - else: - added_vars.append(var_req) - changed = True - # it is important to insert the additionally required variables in - # the beginning, as these need to be computed first later on - # Example: if vars_to_retrieve=['od550aer'] then this loop will - # find out that this requires 'ang4487aer' to be computed as - # well. So at the end of this function, ang4487aer needs to be - # before od550aer in the list vars_to_compute, since the method - # "compute_additional_vars" loops over that list in the specified - # order - vars_to_retrieve = added_vars + vars_to_retrieve - return (changed, vars_to_retrieve) - - def var_supported(self, var_name): - """ - Check if input variable is supported - - Parameters - ---------- - var_name : str - AeroCom variable name or alias - - Raises - ------ - VariableDefinitionError - if input variable is not supported by pyaerocom - - Returns - ------- - bool - True, if variable is supported by this interface, else False - - """ - if ( - var_name in self.PROVIDES_VARIABLES - or const.VARS[var_name].var_name_aerocom in self.PROVIDES_VARIABLES - ): - return True - return False - - def check_vars_to_retrieve(self, vars_to_retrieve): - """Separate variables that are in file from those that are computed - - Some of the provided variables by this interface are not included in - the data files but are computed within this class during data import - (e.g. od550aer, ang4487aer). - - The latter may require additional parameters to be retrieved from the - file, which is specified in the class header (cf. attribute - ``AUX_REQUIRES``). - - This function checks the input list that specifies all required - variables and separates them into two lists, one that includes all - variables that can be read from the files and a second list that - specifies all variables that are computed in this class. 
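-
-        A minimal illustration (hypothetical reader, not taken from an actual
-        implementation): with ``AUX_REQUIRES = {'od550aer': ['ang4487aer']}``
-        and both variables listed in ``PROVIDES_VARIABLES``, calling
-        ``check_vars_to_retrieve(['od550aer'])`` returns
-        ``(['ang4487aer'], ['od550aer'])``, i.e. the measured variable goes
-        into the "read" list and the computed AOD into the "compute" list.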
- - Parameters - ---------- - vars_to_retrieve : list - all parameter names that are supposed to be loaded - - Returns - ------- - tuple - 2-element tuple, containing - - - list: list containing all variables to be read - - list: list containing all variables to be computed - """ - if vars_to_retrieve is None: - vars_to_retrieve = self.DEFAULT_VARS - elif isinstance(vars_to_retrieve, str): - vars_to_retrieve = [vars_to_retrieve] - # first, check if input variables are alias names, and replace - vars_to_retrieve = varlist_aerocom(vars_to_retrieve) - - repeat = True - while repeat: - repeat, vars_to_retrieve = self._add_additional_vars(vars_to_retrieve) - - # unique list containing all variables that are supposed to be read, - # either because they are required to be retrieved, or because they - # are supposed to be read because they are required to compute one - # of the output variables - vars_to_retrieve = list(dict.fromkeys(vars_to_retrieve)) - - # in the following, vars_to_retrieve is separated into two arrays, one - # containing all variables that can be read from the files, and the - # second containing all variables that are computed - vars_to_read = [] - vars_to_compute = [] - - for var in vars_to_retrieve: - if not var in self.PROVIDES_VARIABLES: - raise ValueError(f"Invalid variable {var}") - elif var in self.AUX_REQUIRES: - vars_to_compute.append(var) - else: - vars_to_read.append(var) - return (vars_to_read, vars_to_compute) - - def compute_additional_vars(self, data, vars_to_compute): - """Compute all additional variables - - The computations for each additional parameter are done using the - specified methods in ``AUX_FUNS``. - - Parameters - ---------- - data : dict-like - data object containing data vectors for variables that are required - for computation (cf. input param ``vars_to_compute``) - vars_to_compute : list - list of variable names that are supposed to be computed. - Variables that are required for the computation of the variables - need to be specified in :attr:`AUX_VARS` and need to be - available as data vectors in the provided data dictionary (key is - the corresponding variable name of the required variable). - - Returns - ------- - dict - updated data object now containing also computed variables - """ - if not "var_info" in data: - data["var_info"] = {} - for var in vars_to_compute: - required = self.AUX_REQUIRES[var] - missing = [] - for req in required: - if not req in data: - missing.append(req) - - if len(missing) == 0: - data[var] = self.AUX_FUNS[var](data) - try: - data["var_info"][var]["computed"] = True - except KeyError: - data["var_info"][var] = {"computed": True} - - return data - - def remove_outliers(self, data, vars_to_retrieve, **valid_rng_vars): - """Remove outliers from data - - Parameters - ---------- - data : dict-like - data object containing data vectors for variables that are required - for computation (cf. input param ``vars_to_compute``) - vars_to_retrieve : list - list of variable names for which outliers will be removed from - data - **valid_rng_vars - additional keyword args specifying variable name and corresponding - min / max interval (list or tuple) that specifies valid range - for the variable. 
For each variable that is not explicitely defined - here, the default minimum / maximum value is used (accessed via - ``pyaerocom.const.VARS[var_name]``) - """ - for var in vars_to_retrieve: - if var in data: - if var in valid_rng_vars: - rng = valid_rng_vars[var] - low, high = rng[0], rng[1] - else: - var_info = const.VARS[var] - low, high = var_info["minimum"], var_info["maximum"] - vals = data[var] - mask = np.logical_or(vals < low, vals > high) - vals[mask] = np.nan - data[var] = vals - return data - - def find_in_file_list(self, pattern=None): - """Find all files that match a certain wildcard pattern - - Parameters - ---------- - pattern : :obj:`str`, optional - wildcard pattern that may be used to narrow down the search (e.g. - use ``pattern=*Berlin*`` to find only files that contain Berlin - in their filename) - - Returns - ------- - list - list containing all files in :attr:`files` that match pattern - - Raises - ------ - IOError - if no matches can be found - """ - if len(self.files) == 0: - self.get_file_list() - files = [f for f in self.files if fnmatch(f, pattern)] - if not len(files) > 0: - raise OSError( - f"No files could be detected that match the pattern {pattern}" - ) - return files - - def get_file_list(self, pattern=None): - """Search all files to be read - - Uses :attr:`_FILEMASK` (+ optional input search pattern, e.g. - station_name) to find valid files for query. - - Parameters - ---------- - pattern : str, optional - file name pattern applied to search - - Returns - ------- - list - list containing retrieved file locations - - Raises - ------ - IOError - if no files can be found - """ - if isinstance(pattern, str): - pattern = (pattern + self._FILEMASK).replace("**", "*") - else: - pattern = self._FILEMASK - if pattern is None: - logger.warning( - "_FILEMASK attr. must not be None...using default pattern *.* for file search" - ) - pattern = "*.*" - self.logger.info("Fetching data files. This might take a while...") - files = sorted(glob.glob(os.path.join(self.data_dir, pattern))) - if not len(files) > 0: - all_str = list_to_shortstr(os.listdir(self.data_dir)) - raise DataSourceError( - f"No files could be detected matching file " - f"mask {pattern} in dataset {self.data_id}, files in folder {self.data_dir}:\n" - f"Files in folder: {all_str}" - ) - self.files = files - return files - - def read_station(self, station_id_filename, **kwargs): - """Read data from a single station into :class:`UngriddedData` - - Find all files that contain the station ID in their filename and then - call :func:`read`, providing the reduced filelist as input, in order - to read all files from this station into data object. - - Parameters - ---------- - station_id_filename : str - name of station (MUST be encrypted in filename) - **kwargs - additional keyword args passed to :func:`read` - (e.g. ``vars_to_retrieve``) - - Returns - ------- - UngriddedData - loaded data - - Raises - ------ - IOError - if no files can be found for this station ID - """ - files = self.find_in_file_list(f"*{station_id_filename}*") - return self.read(files=files, **kwargs) - - def read_first_file(self, **kwargs): - """Read first file returned from :func:`get_file_list` - - Note - ---- - This method may be used for test purposes. - - Parameters - ---------- - **kwargs - keyword args passed to :func:`read_file` (e.g. 
vars_to_retrieve) - - Returns - ------- - dict-like - dictionary or similar containing loaded results from first file - """ - files = self.files - if len(files) == 0: - files = self.get_file_list() - return self.read_file(files[0], **kwargs) diff --git a/src/pyaro_readers/nilupmfebas/region.py b/src/pyaro_readers/nilupmfebas/region.py deleted file mode 100644 index 3338fb0..0000000 --- a/src/pyaro_readers/nilupmfebas/region.py +++ /dev/null @@ -1,416 +0,0 @@ -""" -This module contains functionality related to regions in pyaerocom -""" -from __future__ import annotations - -import numpy as np - -from pyaerocom._lowlevel_helpers import BrowseDict -from pyaerocom.config import ALL_REGION_NAME -from pyaerocom.helpers_landsea_masks import get_mask_value, load_region_mask_xr -from pyaerocom.region_defs import HTAP_REGIONS # list of HTAP regions -from pyaerocom.region_defs import REGION_DEFS # all region definitions -from pyaerocom.region_defs import ( - OLD_AEROCOM_REGIONS, - REGION_NAMES, -) # custom names (dict) - -POSSIBLE_REGION_OCEAN_NAMES = ["OCN", "Oceans"] - - -class Region(BrowseDict): - """Class specifying a region - - Attributes - ---------- - region_id : str - ID of region (e.g. EUROPE) - name : str - name of region (e.g. Europe) used e.g. in plotting. - lon_range : list - longitude range (min, max) covered by region - lat_range : list - latitude range (min, max) covered by region - lon_range_plot : list - longitude range (min, max) used for plotting region. - lat_range_plot : list - latitude range (min, max) used for plotting region. - lon_ticks : list - list of longitude ticks used for plotting - lat_ticks : list - list of latitude ticks used for plotting - - Parameters - ---------- - region_id : str - ID of region (e.g. "EUROPE"). If the input region ID is registered as - a default region in :mod:`pyaerocom.region_defs`, then the default - information is automatically imported on class instantiation. - **kwargs - additional class attributes (see above for available default attributes). - Note, any attr. values provided by kwargs are preferred over - potentially defined default attrs. that are imported automatically. 
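-
-    A short illustration (values taken from :mod:`region_defs`, not a
-    guaranteed API example): ``Region('EUROPE')`` loads the default
-    definition (``lat_range=[40, 72]``, ``lon_range=[-10, 40]``), whereas
-    ``Region('EUROPE', lon_range=[-20, 50])`` keeps the default latitude
-    range but overrides the longitude range via ``**kwargs``.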
- """ - - def __init__(self, region_id=None, **kwargs): - if region_id is None: - region_id = ALL_REGION_NAME - - if region_id in REGION_NAMES: - name = REGION_NAMES[region_id] - else: - name = region_id - - self.region_id = region_id - self.name = name - - self.lon_range = None - self.lat_range = None - - # longitude / latitude range of data in plots - self.lon_range_plot = None - self.lat_range_plot = None - - self.lon_ticks = None - self.lat_ticks = None - - self._mask_data = None - if region_id in REGION_DEFS: - self.import_default(region_id) - - self.update(**kwargs) - - def is_htap(self): - """Boolean specifying whether region is an HTAP binary region""" - return True if self.region_id in HTAP_REGIONS else False - - def import_default(self, region_id): - """Import region definition - - Parameters - ---------- - region_id : str - ID of region - - Raises - ------ - KeyError - if no region is registered for the input ID - """ - self.update(REGION_DEFS[region_id]) - - if self.lon_range_plot is None: - self.lon_range_plot = self.lon_range - if self.lat_range_plot is None: - self.lat_range_plot = self.lat_range - - @property - def center_coordinate(self): - """Center coordinate of this region""" - latc = self.lat_range[0] + (self.lat_range[1] - self.lat_range[0]) / 2 - lonc = self.lon_range[0] + (self.lon_range[1] - self.lon_range[0]) / 2 - return (latc, lonc) - - def distance_to_center(self, lat, lon): - """Compute distance of input coordinate to center of this region - - Parameters - ---------- - lat : float - latitude of coordinate - lon : float - longitude of coordinate - - Returns - ------- - float - distance in km - """ - from pyaerocom.geodesy import calc_distance - - cc = self.center_coordinate - return calc_distance(lat0=cc[0], lon0=cc[1], lat1=lat, lon1=lon) - - def contains_coordinate(self, lat, lon): - """Check if input lat/lon coordinate is contained in region - - Parameters - ---------- - lat : float - latitude of coordinate - lon : float - longitude of coordinate - - Returns - ------- - bool - True if coordinate is contained in this region, False if not - """ - - lat_lb = self.lat_range[0] - lat_ub = self.lat_range[1] - lon_lb = self.lon_range[0] - lon_ub = self.lon_range[1] - # latitude bounding boxes should always be defined with the southern most boundary less than the northernmost - lat_ok = lat_lb <= lat <= lat_ub - # if the longitude bounding box has a lowerbound less than the upperbound - if lon_lb < lon_ub: - # it suffices to check that lon is between these values - lon_ok = lon_lb <= lon <= lon_ub - # if the longitude lowerbound has a value lessthan the upperbound - elif lon_ub < lon_lb: - # lon is contained in the bounding box in two cases - lon_ok = lon < lon_ub or lon > lon_lb - else: - lon_ok = False # safeguard - return lat_ok * lon_ok - - def mask_available(self): - if not self.is_htap(): - return False - return True - - def get_mask_data(self): - if not self.mask_available(): - raise AttributeError( - f"No binary mask data available for region {self.region_id}." 
- ) - if self._mask_data is None: - self._mask_data = load_region_mask_xr(self.region_id) - return self._mask_data - - def plot_mask(self, ax, color, alpha=0.2): - mask = self.get_mask_data() - # import numpy as np - data = mask.data - data[data == 0] = np.nan - mask.data = data - - mask.plot(ax=ax) - return ax - - def plot_borders(self, ax, color, lw=2): - raise NotImplementedError("Coming soon...") - - def plot(self, ax=None): - """ - Plot this region - - Draws a rectangle of the outer bounds of the region and if a binary - mask is available for this region, it will be plotted as well. - - Parameters - ---------- - ax : GeoAxes, optional - axes instance to be used for plotting. Defaults to None in which - case a new instance is created. - - Returns - ------- - GeoAxes - axes instance used for plotting - - """ - from cartopy.mpl.geoaxes import GeoAxes - - from pyaerocom.plot.mapping import init_map - - if ax is None: - ax = init_map() - elif not isinstance(ax, GeoAxes): - raise ValueError("Invalid input for ax: need cartopy GeoAxes..") - - if self.mask_available(): - self.plot_mask(ax, color="r") - - ax.set_xlabel("Longitude") - ax.set_ylabel("Latitude") - name = self.name - if not name == self.region_id: - name += f" (ID={self.region_id})" - - ax.set_title(name) - - return ax - - def __contains__(self, val): - if not isinstance(val, tuple): - raise TypeError("Invalid input, need tuple") - if not len(val) == 2: - raise ValueError( - "Invalid input: coordinate must contain 2 elements (lat, lon)" - ) - return self.contains_coordinate(lat=val[0], lon=val[1]) - - def __repr__(self): - return f"Region {self.name} {super().__repr__()}" - - def __str__(self): - s = ( - "pyaeorocom Region\nName: %s\n" - "Longitude range: %s\n" - "Latitude range: %s\n" - "Longitude range (plots): %s\n" - "Latitude range (plots): %s" - % ( - self.name, - self.lon_range, - self.lat_range, - self.lon_range_plot, - self.lat_range_plot, - ) - ) - return s - - -def all(): - """Wrapper for :func:`get_all_default_region_ids`""" - return list(REGION_DEFS) - - -def get_all_default_region_ids(): - """Get list containing IDs of all default regions - - Returns - ------- - list - IDs of all predefined default regions - """ - return OLD_AEROCOM_REGIONS - - -def _get_regions_helper(reg_ids): - """ - Get dictionary of :class:`Region` instances for input IDs - - Parameters - ---------- - reg_ids : list - list of region IDs - - Returns - ------- - dict - keys are input region IDs, values are loaded :class:`Region` instances - """ - regs = {} - for reg in reg_ids: - regs[reg] = Region(reg) - return regs - - -def get_old_aerocom_default_regions(): - """ - Load dictionary with default AeroCom regions - - Returns - ------- - dict - keys are region ID's, values are instances of :class:`Region` - """ - return _get_regions_helper(OLD_AEROCOM_REGIONS) - - -def get_htap_regions(): - """ - Load dictionary with HTAP regions - - Returns - ------- - dict - keys are region ID's, values are instances of :class:`Region` - """ - return _get_regions_helper(HTAP_REGIONS) - - -def get_all_default_regions(): - """Get dictionary containing all default regions from region.ini file - - Returns - ------- - dict - dictionary containing all default regions; keys are region ID's, values - are instances of :class:`Region`. - - """ - return get_old_aerocom_default_regions() - - -#: ToDO: check how to handle methods properly with HTAP regions... 
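-# Illustrative sketch (not part of the original module): the longitude check
-# in :func:`Region.contains_coordinate` also covers HTAP bounding boxes that
-# cross the date line (lower bound larger than upper bound), e.g.
-#
-#   reg = Region("PAN")                # HTAP region with lon_range [112, -134]
-#   reg.contains_coordinate(-40, 175)  # truthy: inside, across the date line
-#   reg.contains_coordinate(-40, 0)    # falsy: same latitude, outside in lon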
-def get_regions_coord(lat, lon, regions=None): - """Get the region that contains an input coordinate - - Note - ---- - This does not yet include HTAP, since this causes troules in automated - AeroCom processing - - Parameters - ---------- - lat : float - latitude of coordinate - lon : float - longitude of coordinate - regions : dict, optional - dictionary containing instances of :class:`Region` as values, which - are considered. If None, then all default regions are used. - - Returns - ------- - list - list of regions that contain this coordinate - """ - matches = [] - if regions is None: - regions = get_all_default_regions() - ocean_mask = load_region_mask_xr("OCN") - on_ocean = bool(get_mask_value(lat, lon, ocean_mask)) - for rname, reg in regions.items(): - if rname == ALL_REGION_NAME: # always True for ALL_REGION_NAME - matches.append(rname) - continue - # OCN needs special handling determined by the rname, not hardcoded to return OCN b/c of HTAP issues - if rname in POSSIBLE_REGION_OCEAN_NAMES: - if on_ocean: - matches.append(rname) - continue - if reg.contains_coordinate(lat, lon) and not on_ocean: - matches.append(rname) - if len(matches) == 0: - matches.append(ALL_REGION_NAME) - return matches - - -def find_closest_region_coord( - lat: float, lon: float, regions: dict | None = None, **kwargs -) -> list[str]: - """Finds list of regions sorted by their center closest to input coordinate - - Parameters - ---------- - lat : float - latitude of coordinate - lon : float - longitude of coordinate - regions : dict, optional - dictionary containing instances of :class:`Region` as values, which - are considered. If None, then all default regions are used. - - Returns - ------- - list[str] - sorted list of region IDs of identified regions - """ - if regions is None: - regions = get_all_default_regions() - matches = get_regions_coord(lat, lon, regions) - matches.sort(key=lambda id: regions[id].distance_to_center(lat, lon)) - if kwargs.get("regions_how") == "htap": - # keep only first entry and Oceans if it exists - keep = matches[:1] - if "Oceans" in matches[1:]: - keep += ["Oceans"] - if ALL_REGION_NAME in matches[1:]: - keep += [ALL_REGION_NAME] - return list(set(keep)) - return matches diff --git a/src/pyaro_readers/nilupmfebas/region_defs.py b/src/pyaro_readers/nilupmfebas/region_defs.py deleted file mode 100644 index 0d11253..0000000 --- a/src/pyaro_readers/nilupmfebas/region_defs.py +++ /dev/null @@ -1,183 +0,0 @@ -""" -Definitions of rectangular regions used in pyaerocom - -NOTE: replaces former regions.ini in pyaerocom/data dir -""" -from typing import Final - -#: Name of region containing absolute all valid data points (WORLD in old aerocom notation) -ALL_REGION_NAME: Final = "ALL" - -_AEROCOM_DEFS = { - ALL_REGION_NAME: { - "lat_range": [-90, 90], - "lon_range": [-180, 180], - "lon_range_plot": [-180, 180], - "lon_ticks": [-180.0, -135.0, -90.0, -45.0, 0.0, 45, 90, 135, 180], - "lat_ticks": [-90.0, -60, -30, 0.0, 30, 60, 90], - }, - "ASIA": {"lat_range": [0, 72], "lon_range": [50, 150], "lon_range_plot": [50, 150]}, - "AUSTRALIA": { - "lat_range": [-50, -10], - "lon_range": [110, 155], - "lon_range_plot": [90, 180], - }, - "CHINA": { - "lat_range": [20, 50], - "lon_range": [90, 130], - "lon_range_plot": [90, 140], - }, - "EUROPE": { - "lat_range": [40, 72], - "lon_range": [-10, 40], - "lon_range_plot": [-10, 40], - "lon_ticks": [-20, -10, 0, 10, 20, 30, 40, 50, 60, 70], - "lat_ticks": [30, 40, 50, 60, 70, 80], - }, - "INDIA": {"lat_range": [5, 35], "lon_range": [65, 90], 
"lon_range_plot": [50, 100]}, - "NAFRICA": { - "lat_range": [0, 40], - "lon_range": [-17, 50], - "lon_range_plot": [-17, 50], - }, - "SAFRICA": { - "lat_range": [-35, 0], - "lon_range": [10, 40], - "lon_range_plot": [10, 40], - }, - "SAMERICA": { - "lat_range": [-60, 20], - "lon_range": [-105, -30], - "lon_range_plot": [-105, -30], - }, - "NAMERICA": { - "lat_range": [20, 80], - "lon_range": [-150, -45], - "lon_range_plot": [-150, -45], - }, -} - -_HTAP_DEFS = { - "PAN": { - "lat_range": [-54.74999999999966, 11.550000000000823], - "lon_range": [112, -134], - }, - "EAS": { - "lat_range": [18.25000000000084, 53.55000000000134], - "lon_range": [73.64999999999412, 145.74999999999002], - }, - "NAF": { - "lat_range": [19.05000000000085, 37.25000000000111], - "lon_range": [-17.050000000005937, 35.74999999999427], - }, - "MDE": { - "lat_range": [12.150000000000821, 39.65000000000114], - "lon_range": [34.24999999999425, 63.249999999994664], - }, - "LAND": { - "lat_range": [-59.449999999999726, 71.15000000000109], - "lon_range": [-180, 180], - }, - "SAS": { - "lat_range": [-9.749999999999135, 37.050000000001106], - "lon_range": [46.349999999994424, 97.34999999999278], - }, - "SPO": { - "lat_range": [-89.94999999999831, -60.049999999999734], - "lon_range": [-179.95000000000002, 179.94999999998808], - }, - "OCN": { - "lat_range": [-59.94999999999973, 66.25000000000136], - "lon_range": [-180, 180], - }, - "SEA": { - "lat_range": [-11.649999999999128, 28.450000000000983], - "lon_range": [92.24999999999307, 155.94999999998944], - }, - "RBU": { - "lat_range": [29.450000000000998, 66.25000000000136], - "lon_range": [22, -170], - }, - "EEUROPE": { - "lat_range": [34.65000000000107, 59.550000000001425], - "lon_range": [12.14999999999401, 44.7499999999944], - }, - "NAM": { - "lat_range": [18.95000000000085, 71.15000000000109], - "lon_range": [172, -52], - }, - "WEUROPE": { - "lat_range": [27.750000000000973, 66.45000000000135], - "lon_range": [-31.25000000000614, 31.449999999994215], - }, - "SAF": { - "lat_range": [-54.449999999999655, 27.250000000000966], - "lon_range": [-25.350000000006055, 77.5499999999939], - }, - "USA": { - "lat_range": [18.95000000000085, 49.35000000000128], - "lon_range": [-159.75000000000117, -56.250000000006494], - }, - "SAM": { - "lat_range": [-59.449999999999726, 5.150000000000846], - "lon_range": [-109.35000000000403, -26.250000000006068], - }, - "EUR": { - "lat_range": [27.750000000000973, 66.45000000000135], - "lon_range": [-31.25000000000614, 44.7499999999944], - }, - "NPO": { - "lat_range": [59.85000000000143, 89.95000000000002], - "lon_range": [-179.95000000000002, 179.94999999998808], - }, - "MCA": { - "lat_range": [-4.149999999999155, 32.65000000000104], - "lon_range": [-118.35000000000352, -51.65000000000643], - }, -} - - -_OTHER_REG_DEFS = { - "NHEMISPHERE": { - "lat_range": [0, 90], - "lon_range": [-180, 180], - "lon_range_plot": [-180, 180], - }, - "SHEMISPHERE": { - "lat_range": [-90, 0], - "lon_range": [-180, 180], - "lon_range_plot": [-180, 180], - }, -} - - -REGION_DEFS = {**_AEROCOM_DEFS, **_HTAP_DEFS, **_OTHER_REG_DEFS} - -# optional: alternative names for regions (e.g. used for plotting) -# if undefined the corresponding ID is used as name. Names are adapetd from -# https://publications.jrc.ec.europa.eu/repository/bitstream/JRC102552/lbna28255enn.pdf -# (Fig. 
3, P11) -_HTAP_NAMES = { - "NAM": "N America", - "EUR": "Europe", - "EEUROPE": "E Europe", - "RBU": "Rus,Bel,Ukr", - "MDE": "Middle East", - "EAS": "E Asia", - "SAS": "S Asia", - "SEA": "SE Asia", - "NAF": "N Africa", - "MCA": "C America", - "SAF": "S Africa", - "SAM": "S America", - "PAN": "Pacific,Aust,NZ", - "OCN": "Oceans", -} - - -REGION_NAMES = {**_HTAP_NAMES} - -OLD_AEROCOM_REGIONS = list(_AEROCOM_DEFS) -HTAP_REGIONS_DEFAULT = list(_HTAP_NAMES) -HTAP_REGIONS = list(_HTAP_DEFS) -OTHER_REGIONS = list(_OTHER_REG_DEFS) diff --git a/src/pyaro_readers/nilupmfebas/resources.py b/src/pyaro_readers/nilupmfebas/resources.py deleted file mode 100644 index 8cc0363..0000000 --- a/src/pyaro_readers/nilupmfebas/resources.py +++ /dev/null @@ -1,42 +0,0 @@ -""" -Compatibility layer between importlib.resources for Python 3.11 and older versions -""" -import sys -from pathlib import Path -from typing import ContextManager - -if sys.version_info >= (3, 11): - from importlib import resources -else: - import importlib_resources as resources - - -def path(package: str, resource: str) -> ContextManager[Path]: - """A context manager providing a file path object to the resource. - If the resource does not already exist on its own on the file system, - a temporary file will be created. If the file was created, the file - will be deleted upon exiting the context manager (no exception is - raised if the file was deleted prior to the context manager - exiting). - """ - return resources.as_file(resources.files(package) / resource) - - -def is_resource(package: str, name: str) -> bool: - """True if `name` is a resource inside `package`. - - Directories are *not* resources. - """ - with path(package, name) as p: - return p.exists() - - -def read_text( - package: str, resource: str, encoding: str = "utf-8", errors: str = "strict" -) -> str: - """Return the decoded string of the resource. - - The decoding-related arguments have the same semantics as those of `pathlib.Path.read_text`. - """ - with path(package, resource) as p: - return p.read_text(encoding, errors) diff --git a/src/pyaro_readers/nilupmfebas/stationdata.py b/src/pyaro_readers/nilupmfebas/stationdata.py deleted file mode 100644 index 72fdf09..0000000 --- a/src/pyaro_readers/nilupmfebas/stationdata.py +++ /dev/null @@ -1,1542 +0,0 @@ -import logging -import warnings -from copy import deepcopy - -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -import xarray as xr - -from . import const -from ._lowlevel_helpers import BrowseDict, dict_to_str, list_to_shortstr, merge_dicts -from .exceptions import ( - CoordinateError, - DataDimensionError, - DataExtractionError, - DataUnitError, - MetaDataError, - StationCoordinateError, - TemporalResolutionError, - UnitConversionError, - VarNotAvailableError, -) -from .helpers import calc_climatology, isnumeric, isrange, to_datetime64 -from .metastandards import STANDARD_META_KEYS, StationMetaData -from .time_resampler import TimeResampler -from .tstype import TsType -from .units_helpers import convert_unit, get_unit_conversion_fac - -logger = logging.getLogger(__name__) - - -class StationData(StationMetaData): - """Dict-like base class for single station data - - ToDo: write more detailed introduction - - Note - ---- - Variable data (e.g. numpy array or pandas Series) can be directly - assigned to the object. When assigning variable data it is - recommended to add variable metadata (e.g. unit, ts_type) - in :attr:`var_info`, where key is variable name and value is dict with - metadata entries. 
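-
-    A minimal (hypothetical) assignment following this recommendation::
-
-        stat = StationData(station_name='Example', latitude=60.0,
-                           longitude=10.0, altitude=0.0)
-        stat['od550aer'] = data  # e.g. pandas Series or 1d ndarray
-        stat.var_info['od550aer'] = {'units': '1', 'ts_type': 'daily'}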
- - Attributes - ---------- - dtime : list - list / array containing time index values - var_info : dict - dictionary containing information about each variable - data_err : dict - dictionary that may be used to store uncertainty timeseries or data - arrays associated with the different variable data. - overlap : dict - dictionary that may be filled to store overlapping timeseries data - associated with one variable. This is, for instance, used in - :func:`merge_vardata` to store overlapping data from another station. - - """ - - #: List of keys that specify standard metadata attribute names. This - #: is used e.g. in :func:`get_meta` - STANDARD_COORD_KEYS = const.STANDARD_COORD_NAMES - - #: maximum numerical distance between coordinates associated with this - #: station - # _COORD_MAX_VAR = 0.1 #km - _COORD_MAX_VAR = 5.0 # km - STANDARD_META_KEYS = STANDARD_META_KEYS - - VALID_TS_TYPES = const.GRID_IO.TS_TYPES - - #: Keys that are ignored when accessing metadata - PROTECTED_KEYS = [ - "dtime", - "var_info", - "station_coords", - "data_err", - "overlap", - "numobs", - "data_flagged", - ] - - def __init__(self, **meta_info): - self.dtime = [] - - self.var_info = BrowseDict() - - self.station_coords = dict.fromkeys(self.STANDARD_COORD_KEYS) - - self.data_err = BrowseDict() - self.overlap = BrowseDict() - self.numobs = BrowseDict() - self.data_flagged = BrowseDict() - - super().__init__(**meta_info) - - @property - def default_vert_grid(self): - """AeroCom default grid for vertical regridding - - For details, see :attr:`DEFAULT_VERT_GRID_DEF` in :class:`Config` - - Returns - ------- - ndarray - numpy array specifying default coordinates - """ - return const.make_default_vert_grid() - - @property - def vars_available(self): - """Number of variables available in this data object""" - return list(self.var_info) - - def has_var(self, var_name): - """Checks if input variable is available in data object - - Parameters - ---------- - var_name : str - name of variable - - Returns - ------- - bool - True, if variable data is available, else False - """ - if not var_name in self: - return False - if not var_name in self.var_info: - logger.warning( - f"Variable {var_name} exists in data but has no " - f"metadata assigned in :attr:`var_info`" - ) - return True - - def get_unit(self, var_name): - """Get unit of variable data - - Parameters - ---------- - var_name : str - name of variable - - Returns - ------- - str - unit of variable - - Raises - ------ - MetaDataError - if unit cannot be accessed for variable - """ - if not var_name in self.var_info: - raise MetaDataError( - f"Could not access variable metadata dict for {var_name}." - ) - try: - return str(self.var_info[var_name]["units"]) - except KeyError: - add_str = "" - if "unit" in self.var_info[var_name]: - add_str = ( - "Corresponding var_info dict contains " - 'attr. "unit", which is deprecated, please ' - "check corresponding reading routine. " - ) - raise MetaDataError( - f"Failed to access units attribute for variable {var_name}. 
{add_str}" - ) - - @property - def units(self): - """Dictionary containing units of all variables in this object""" - ud = {} - for var in self.var_info: - ud[var] = self.get_unit(var) - return ud - - def check_var_unit_aerocom(self, var_name): - """Check if unit of input variable is AeroCom default, if not, convert - - Parameters - ---------- - var_name : str - name of variable - - Raises - ------ - MetaDataError - if unit information is not accessible for input variable name - UnitConversionError - if current unit cannot be converted into specified unit - (e.g. 1 vs m-1) - DataUnitError - if current unit is not equal to AeroCom default and cannot - be converted. - """ - to_unit = const.VARS[var_name].units - try: - self.check_unit(var_name, to_unit) - except Exception: - self.convert_unit(var_name, to_unit) - - def check_unit(self, var_name, unit=None): - """Check if variable unit corresponds to a certain unit - - Parameters - ---------- - var_name : str - variable name for which unit is to be checked - unit : :obj:`str`, optional - unit to be checked, if None, AeroCom default unit is used - - Raises - ------ - MetaDataError - if unit information is not accessible for input variable name - UnitConversionError - if current unit cannot be converted into specified unit - (e.g. 1 vs m-1) - DataUnitError - if current unit is not equal to input unit but can be converted - (e.g. 1/Mm vs 1/m) - """ - if unit is None: - unit = const.VARS[var_name].units - u = self.get_unit(var_name) - if not get_unit_conversion_fac(u, unit, var_name) == 1: - raise DataUnitError(f"Invalid unit {u} (expected {unit})") - - def convert_unit(self, var_name, to_unit): - """Try to convert unit of data - - Requires that unit of input variable is available in :attr:`var_info` - - Parameters - ---------- - var_name : str - name of variable - to_unit : str - new unit - - Raises - ------ - MetaDataError - if variable unit cannot be accessed - UnitConversionError - if conversion failed - """ - unit = self.get_unit(var_name) - - data = self[var_name] - try: - tst = self.get_var_ts_type(var_name) - except MetaDataError: - tst = None - data = convert_unit( - data, from_unit=unit, to_unit=to_unit, var_name=var_name, ts_type=tst - ) - - self[var_name] = data - self.var_info[var_name]["units"] = to_unit - logger.info( - f"Successfully converted unit of variable {var_name} in {self.station_name} " - f"from {unit} to {to_unit}" - ) - - def dist_other(self, other): - """Distance to other station in km - - Parameters - ---------- - other : StationData - other data object - - Returns - ------- - float - distance between this and other station in km - """ - from pyaerocom.geodesy import calc_distance - - cthis = self.get_station_coords() - cother = other.get_station_coords() - - return calc_distance( - cthis["latitude"], - cthis["longitude"], - cother["latitude"], - cother["longitude"], - cthis["altitude"], - cother["altitude"], - ) - - def same_coords(self, other, tol_km=None): - """Compare station coordinates of other station with this station - - Parameters - ---------- - other : StationData - other data object - tol_km : float - distance tolerance in km - - Returns - ------- - bool - if True, then the two object are located within the specified - tolerance range - """ - if tol_km is None: - tol_km = self._COORD_MAX_VAR - return True if self.dist_other(other) < tol_km else False - - def get_station_coords(self, force_single_value=True): - """Return coordinates as dictionary - - This method uses the standard coordinate names defined in 
- :attr:`STANDARD_COORD_KEYS` (latitude, longitude and altitude) to get - the station coordinates. For each of these parameters tt first looks - in :attr:`station_coords` if the parameter is defined (i.e. it is not - None) and if not it checks if this object has an attribute that has - this name and uses that one. - - Parameters - ---------- - force_single_value : bool - if True and coordinate values are lists or arrays, then they are - collapsed to single value using mean - - Returns - ------- - dict - dictionary containing the retrieved coordinates - - Raises - ------ - AttributeError - if one of the coordinate values is invalid - CoordinateError - if local variation in either of the three spatial coordinates is - found too large - """ - output = {} - for key in self.STANDARD_COORD_KEYS: - # prefer explicit if defined in station_coord dictionary (e.g. altitude - # attribute in lidar data will be an array corresponding to profile - # altitudes) - val = self.station_coords[key] - if val is not None: - if not isnumeric(val): - raise MetaDataError( - f"Station coordinate {key} must be numeric. Got: {val}" - ) - output[key] = val - else: - val = self[key] - if force_single_value and not isinstance(val, (float, np.floating)): - if isinstance(val, (int, np.integer)): - val = np.float64(val) - elif isinstance(val, (list, np.ndarray)): - # ToDo: consider tolerance to be specified in input - # args. - maxdiff = np.max(val) - np.min(val) - if key in ("latitude", "longitude"): - tol = 0.05 # ca 5km at equator - else: - tol = 100 # m altitude tolerance - if maxdiff > tol: - raise StationCoordinateError( - f"meas point coordinate arrays of {key} vary " - f"too much to reduce them to a single " - f"coordinate. Order of difference in {key} is " - f"{maxdiff} and maximum allowed is {tol}." - ) - val = np.mean(val) - else: - raise AttributeError( - f"Invalid value encountered for coord {key}, " - f"need float, int, list or ndarray, got {type(val)}" - ) - output[key] = val - return output - - def get_meta( - self, - force_single_value=True, - quality_check=True, - add_none_vals=False, - add_meta_keys=None, - ): - """Return meta-data as dictionary - - By default, only default metadata keys are considered, use parameter - `add_meta_keys` to add additional metadata. - - Parameters - ---------- - force_single_value : bool - if True, then each meta value that is list or array,is converted - to a single value. - quality_check : bool - if True, and coordinate values are lists or arrays, then the - standarad deviation in the values is compared to the upper limits - allowed in the local variation. The upper limits are specified - in attr. ``COORD_MAX_VAR``. - add_none_vals : bool - Add metadata keys which have value set to None. - add_meta_keys : str or list, optional - Add none-standard metadata. - - Returns - ------- - dict - dictionary containing the retrieved meta-data - - Raises - ------ - AttributeError - if one of the meta entries is invalid - MetaDataError - in case of consistencies in meta data between individual time-stamps - """ - if isinstance(add_meta_keys, str): - add_meta_keys = [add_meta_keys] - elif not isinstance(add_meta_keys, list): - add_meta_keys = [] - meta = {} - meta.update(self.get_station_coords(force_single_value)) - keys = [k for k in self.STANDARD_META_KEYS] - keys.extend(add_meta_keys) - for key in keys: - if not key in self: - logger.warning(f"No such key in StationData: {key}") - continue - elif key in self.PROTECTED_KEYS: - # this is not metadata... 
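-                # (PROTECTED_KEYS holds container attributes such as dtime,
-                # var_info, overlap or data_flagged, which should not be
-                # returned as scalar metadata entries here)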
- continue - elif key in self.STANDARD_COORD_KEYS: - # this has been handled above - continue - if self[key] is None and not add_none_vals: - logger.debug(f"No metadata available for key {key}") - continue - - val = self[key] - if force_single_value and isinstance(val, (list, tuple, np.ndarray)): - if quality_check and not all([x == val[0] for x in val]): - raise MetaDataError(f"Inconsistencies in meta parameter {key}") - val = val[0] - meta[key] = val - - return meta - - def _check_meta_item(self, key): - """Check if metadata item is valid - - Valid value types are dictionaries, lists, strings, numerical values - and datetetime objects. - """ - val = self[key] - if val is None: - return - elif isinstance(val, np.ndarray): - if val.ndim != 1: - raise MetaDataError( - f"Invalid metadata entry {val} for key {key}. " - f"Only 1d numpy arrays are supported..." - ) - self[key] = list(val) - elif not isinstance(val, (dict, list, str)) and not isnumeric(val): - try: - self[key] = to_datetime64(val) - except Exception: - raise MetaDataError( - f"Invalid metadata entry {val} for key {key}. " - f"Only dicts, lists, strings, numerical " - f"values or datetime objects are supported." - ) - - def _merge_meta_item(self, key, val): - """Merge meta item into this object - - Parameters - ---------- - key - key of metadata value - val - value to be added - """ - current_val = self[key] - same_type = isinstance(current_val, type(val)) - try: - if isinstance(current_val, dict): - if not same_type: - raise ValueError( - f"Cannot merge meta item {key} due to type mismatch" - ) - elif not current_val == val: - self[key] = merge_dicts(current_val, val) - - elif isinstance(current_val, str): - if not same_type: - if isinstance(val, list): - if not current_val in val: - newval = val.insert(0, current_val) - self[key] = newval - else: - raise ValueError( - f"Cannot merge meta item {key} due to type mismatch" - ) - elif not current_val == val: - # both are str that may be already merged with ";" -> only - # add new entries - vals_in = [x.strip() for x in val.split(";")] - - for item in vals_in: - if not item in current_val: - current_val += f";{item}" - self[key] = current_val - - elif isinstance(current_val, list): - if not same_type: - val = [val] - for item in val: - if not item in current_val: - current_val.append(item) - self[key] = current_val - - elif isnumeric(current_val) and isnumeric(val): - if np.isnan(current_val) and np.isnan(val): - self[key] = val - elif val != current_val: - self[key] = [current_val, val] - - elif isinstance(val, list): - if not current_val in val: - self[key] = val.insert(0, current_val) - - elif current_val != val: - self[key] = [current_val, val] - - else: # they shoul be the same - assert current_val == val, (current_val, val) - except Exception as e: - raise MetaDataError( - f"Failed to merge metadata entries for key {key}.\n" - f"Value in current StationData: {current_val}\n" - f"Value to be merged: {val}\n" - f"Error: {repr(e)}" - ) - - def _append_meta_item(self, key, val): - """Add a metadata item""" - if not key in self or self[key] is None: - self[key] = val - else: - self._merge_meta_item(key, val) - - def merge_meta_same_station( - self, - other, - coord_tol_km=None, - check_coords=True, - inplace=True, - add_meta_keys=None, - raise_on_error=False, - ): - """Merge meta information from other object - - Note - ---- - Coordinate attributes (latitude, longitude and altitude) are not - copied as they are required to be the same in both stations. 
The - latter can be checked and ensured using input argument ``check_coords`` - - Parameters - ---------- - other : StationData - other data object - coord_tol_km : float - maximum distance in km between coordinates of input StationData - object and self. Only relevant if :attr:`check_coords` is True. If - None, then :attr:`_COORD_MAX_VAR` is used which is defined in the - class header. - check_coords : bool - if True, the coordinates are compared and checked if they are lying - within a certain distance to each other (cf. :attr:`coord_tol_km`). - inplace : bool - if True, the metadata from the other station is added to the - metadata of this station, else, a new station is returned with the - merged attributes. - add_meta_keys : str or list, optional - additional non-standard metadata keys that are supposed to be - considered for merging. - raise_on_error : bool - if True, then an Exception will be raised in case one of the - metadata items cannot be merged, which is most often due to - unresolvable type differences of metadata values between the two - objects - - """ - if add_meta_keys is None: - add_meta_keys = [] - - elif isinstance(add_meta_keys, str): - add_meta_keys = [add_meta_keys] - - if not inplace: - obj = self.copy() - else: - obj = self - - if check_coords: - if coord_tol_km is None: - coord_tol_km = self._COORD_MAX_VAR - try: - if not self.same_coords(other, coord_tol_km): - raise CoordinateError( - f"Station coordinates differ by more than {coord_tol_km} km." - ) - except MetaDataError: # - pass - - keys = [k for k in self.STANDARD_META_KEYS] - keys.extend(add_meta_keys) - for key in keys: - if key in self.STANDARD_COORD_KEYS: - if self[key] is None and other[key] is not None: - self[key] = other[key] - elif key in self.PROTECTED_KEYS: - continue - elif key in other and other[key] is not None: - try: - self._check_meta_item(key) - other._check_meta_item(key) - - obj._append_meta_item(key, other[key]) - except MetaDataError as e: - obj[key] = "N/A_FAILED_TO_MERGE" - msg = f"Failed to merge meta item {key}. 
Reason:{repr(e)}" - if raise_on_error: - raise MetaDataError(msg) - else: - logger.warning(msg) - - return obj - - def merge_varinfo(self, other, var_name): - """Merge variable specific meta information from other object - - Parameters - ---------- - other : StationData - other data object - var_name : str - variable name for which info is to be merged (needs to be both - available in this object and the provided other object) - """ - if not var_name in self.var_info or not var_name in other.var_info: - raise MetaDataError( - f"No variable meta information available for {var_name}" - ) - - info_this = self.var_info[var_name] - info_other = other.var_info[var_name] - for key, val in info_other.items(): - if not key in info_this or info_this[key] == None: - info_this[key] = val - else: - if isinstance(info_this[key], str): - if not isinstance(val, str): - raise ValueError( - f"Cannot merge meta item {key} due to type mismatch" - ) - vals = [x.strip() for x in info_this[key].split(";")] - vals_in = [x.strip() for x in val.split(";")] - - for _val in vals_in: - if not _val in vals: - info_this[key] = info_this[key] + f";{_val}" - else: - if isinstance(val, (list, np.ndarray)): - if len(val) == 0: - continue - elif type(info_this[key]) == type(val): - if info_this[key] == val: - continue - info_this[key] = [info_this[key], val] - raise ValueError( - "Cannot append metadata value that is " - "already a list or numpy array due to " - "potential ambiguities" - ) - if isinstance(info_this[key], list): - if not val in info_this[key]: - info_this[key].append(val) - else: - if not info_this[key] == val: - info_this[key] = [info_this[key], val] - return self - - def check_if_3d(self, var_name): - """Checks if altitude data is available in this object""" - if "altitude" in self: - val = self["altitude"] - if isnumeric(val): # is numerical value - return False - # unique altitude values - uvals = np.unique(val) - if len(uvals) == 1: # only one value in altitude array (NOT 3D) - return False - elif ( - len(uvals[~np.isnan(uvals)]) == 1 - ): # only 2 unique values in altitude array but one is NaN - return False - return True - return False - - def _check_ts_types_for_merge(self, other, var_name): - ts_type = self.get_var_ts_type(var_name) - ts_type1 = other.get_var_ts_type(var_name) - if ts_type != ts_type1: - # make sure each variable in the object has explicitely ts_type - # assigned (rather than global specification) - - self._update_var_timeinfo() - other._update_var_timeinfo() - - from pyaerocom.helpers import get_lowest_resolution - - ts_type = get_lowest_resolution(ts_type, ts_type1) - return ts_type - - def _update_var_timeinfo(self): - for var, info in self.var_info.items(): - data = self[var] - if not isinstance(data, pd.Series): - try: - self[var] = pd.Series(data, self.dtime) - except Exception as e: - raise Exception(f"Unexpected error: {repr(e)}.\nPlease debug...") - if not "ts_type" in info or info["ts_type"] is None: - if not self.ts_type in const.GRID_IO.TS_TYPES: - raise ValueError(f"Cannot identify ts_type for var {var} in {self}") - info["ts_type"] = self.ts_type - self.ts_type = None - - def _merge_vardata_2d(self, other, var_name, resample_how=None, min_num_obs=None): - """Merge 2D variable data (for details see :func:`merge_vardata`)""" - ts_type = self._check_ts_types_for_merge(other, var_name) - - s0 = self.resample_time( - var_name, - ts_type=ts_type, - how=resample_how, - min_num_obs=min_num_obs, - inplace=True, - )[var_name].dropna() - s1 = other.resample_time( - var_name, - 
ts_type=ts_type, - how=resample_how, - min_num_obs=min_num_obs, - inplace=True, - )[var_name].dropna() - - info = other.var_info[var_name] - removed = None - if "overlap" in info and info["overlap"]: - raise NotImplementedError("Coming soon...") - - if len(s1) > 0: # there is data - overlap = s0.index.intersection(s1.index) - try: - if len(overlap) > 0: - removed = s1[overlap] - # NOTE JGLISS: updated on 8.5.2020, cf. issue #106 - # s1 = s1.drop(index=overlap, inplace=True) - s1.drop(index=overlap, inplace=True) - # compute merged time series - if len(s1) > 0: - s0 = pd.concat([s0, s1], verify_integrity=True) - - # sort the concatenated series based on timestamps - s0.sort_index(inplace=True) - self.merge_varinfo(other, var_name) - except KeyError: - logger.warning( - f"failed to merge {var_name} data from 2 StationData " - f"objects for station {self.station_name}. Ignoring 2nd " - f"data object." - ) - - # assign merged time series (overwrites previous one) - self[var_name] = s0 - self.dtime = s0.index.values - - if removed is not None: - if var_name in self.overlap: - self.overlap[var_name] = pd.concat([self.overlap[var_name], removed]) - self.overlap[var_name].sort_index(inplace=True) - else: - self.overlap[var_name] = removed - - return self - - def merge_vardata(self, other, var_name, **kwargs): - """Merge variable data from other object into this object - - Note - ---- - This merges also the information about this variable in the dict - :attr:`var_info`. It is required, that variable meta-info is - specified in both StationData objects. - - Note - ---- - This method removes NaN's from the existing time series in the data - objects. In order to fill up the time-series with NaNs again after - merging, call :func:`insert_nans_timeseries` - - Parameters - ---------- - other : StationData - other data object - var_name : str - variable name for which info is to be merged (needs to be both - available in this object and the provided other object) - kwargs - keyword args passed on to :func:`_merge_vardata_2d` - - Returns - ------- - StationData - this object merged with other object - """ - if not var_name in self: - raise VarNotAvailableError( - f"StationData object does not contain data for variable {var_name}" - ) - elif not var_name in other: - raise VarNotAvailableError( - "Input StationData object does not contain data for variable {var_name}" - ) - elif not var_name in self.var_info: - raise MetaDataError( - f"For merging of {var_name} data, variable specific meta " - f"data needs to be available in var_info dict" - ) - elif not var_name in other.var_info: - raise MetaDataError( - f"For merging of {var_name} data, variable specific meta " - f"data needs to be available in var_info dict" - ) - - if self.get_unit(var_name) != other.get_unit(var_name): - self.check_var_unit_aerocom(var_name) - other.check_var_unit_aerocom(var_name) - - if self.check_if_3d(var_name): - raise NotImplementedError("Coming soon...") - # return self._merge_vardata_3d(other, var_name) - else: - return self._merge_vardata_2d(other, var_name, **kwargs) - - def merge_other(self, other, var_name, add_meta_keys=None, **kwargs): - """Merge other station data object - - Todo - ---- - Should be independent of variable, i.e. it should be able to merge all - data that is in the other object into this, even if this object does - not contain that variable yet. 
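-
-        A hedged usage sketch (variable and object names are illustrative)::
-
-            merged = stat1.merge_other(stat2, 'od550aer', min_num_obs=3)
-
-        This resamples both ``od550aer`` series to their lowest common
-        ``ts_type``, appends the non-overlapping part of ``stat2`` and merges
-        the station metadata of the two objects.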
- - Parameters - ---------- - other : StationData - other data object - var_name : str - variable name for which info is to be merged (needs to be both - available in this object and the provided other object) - add_meta_keys : str or list, optional - additional non-standard metadata keys that are supposed to be - considered for merging. - kwargs - keyword args passed on to :func:`merge_vardata` (e.g time - resampling settings) - - Returns - ------- - StationData - this object that has merged the other station - """ - self.merge_vardata(other, var_name, **kwargs) - self.merge_meta_same_station(other, add_meta_keys=add_meta_keys) - - return self - - def check_dtime(self): - """Checks if dtime attribute is array or list""" - if not any([isinstance(self.dtime, x) for x in [list, np.ndarray]]): - raise TypeError(f"dtime attribute is not iterable: {self.dtime}") - elif not len(self.dtime) > 0: - raise AttributeError("No timestamps available") - - def get_var_ts_type(self, var_name, try_infer=True): - """Get ts_type for a certain variable - - Note - ---- - Converts to ts_type string if assigned ts_type is in pandas format - - Parameters - ---------- - var_name : str - data variable name for which the ts_type is supposed to be - retrieved - try_infer : bool - if ts_type is not available, try inferring it from data - - Returns - ------- - str - the corresponding data time resolution - - Raises - ------ - MetaDataError - if no metadata is available for this variable (e.g. if ``var_name`` - cannot be found in :attr:`var_info`) - """ - # make sure there exists a var_info dict for this variable - if not var_name in self.var_info: - self.var_info[var_name] = {} - - # use variable specific entry if available - if "ts_type" in self.var_info[var_name]: - return TsType(self.var_info[var_name]["ts_type"]).val - elif isinstance(self.ts_type, str): - # ensures validity and corrects for pandas strings - ts_type = TsType(self.ts_type).val - self.var_info[var_name]["ts_type"] = ts_type - return ts_type - - if try_infer: - logger.warning( - f"Trying to infer ts_type in StationData {self.station_name} " - f"for variable {var_name}" - ) - from pyaerocom.helpers import infer_time_resolution - - try: - s = self._to_ts_helper(var_name) - ts_type = infer_time_resolution(s.index) - self.var_info[var_name]["ts_type"] = ts_type - return ts_type - except Exception: - pass # Raise standard error - raise MetaDataError(f"Could not access ts_type for {var_name}") - - def remove_outliers(self, var_name, low=None, high=None, check_unit=True): - """Remove outliers from one of the variable timeseries - - Parameters - ---------- - var_name : str - variable name - low : float - lower end of valid range for input variable. If None, then the - corresponding value from the default settings for this variable - are used (cf. minimum attribute of `available variables - `__) - high : float - upper end of valid range for input variable. If None, then the - corresponding value from the default settings for this variable - are used (cf. 
maximum attribute of `available variables - `__) - check_unit : bool - if True, the unit of the data is checked against AeroCom default - """ - if any([x is None for x in (low, high)]): - info = const.VARS[var_name] - if check_unit: - try: - self.check_unit(var_name) - except DataUnitError: - self.convert_unit(var_name, to_unit=info.units) - if low is None: - low = info.minimum - logger.info(f"Setting {var_name} outlier lower lim: {low:.2f}") - if high is None: - high = info.maximum - logger.info(f"Setting {var_name} outlier upper lim: {high:.2f}") - - d = self[var_name] - invalid_mask = np.logical_or(d < low, d > high) - d[invalid_mask] = np.nan - self[var_name] = d - - def calc_climatology( - self, - var_name, - start=None, - stop=None, - min_num_obs=None, - clim_mincount=None, - clim_freq=None, - set_year=None, - resample_how=None, - ): - """Calculate climatological timeseries for input variable - - - Parameters - ---------- - var_name : str - name of data variable - start - start time of data used to compute climatology - stop - start time of data used to compute climatology - min_num_obs : dict or int, optional - minimum number of observations required per period (when - downsampling). For details see - :func:`pyaerocom.time_resampler.TimeResampler.resample`) - clim_micount : int, optional - minimum number of of monthly values required per month of - climatology - set_year : int, optional - if specified, the output data will be assigned the input year. Else - the middle year of the climatological interval is used. - resample_how : str - how should the resampled data be averaged (e.g. mean, median) - **kwargs - Additional keyword args passed to - :func:`pyaerocom.time_resampler.TimeResampler.resample` - - Returns - ------- - StationData - new instance of StationData containing climatological data - """ - if clim_freq is None: - clim_freq = const.CLIM_FREQ - - if resample_how is None: - resample_how = const.CLIM_RESAMPLE_HOW - - ts_type = TsType(self.get_var_ts_type(var_name)) - - monthly = TsType("monthly") - if ts_type < monthly: - raise TemporalResolutionError( - f"Cannot compute climatology, {var_name} data " - f"needs to be in monthly resolution or higher (is: {ts_type})" - ) - if ts_type < TsType( - clim_freq - ): # current resolution is lower than input climatological freq - supported = list(const.CLIM_MIN_COUNT) - if str(ts_type) in supported: - clim_freq = str(ts_type) - else: # use monthly - clim_freq = "monthly" - - data = self.resample_time( - var_name, - ts_type=clim_freq, - how=resample_how, - min_num_obs=min_num_obs, - inplace=False, - ) - ts = data.to_timeseries(var_name) - - if start is None: - start = const.CLIM_START - if stop is None: - stop = const.CLIM_STOP - - if clim_mincount is None: - clim_mincount = const.CLIM_MIN_COUNT[clim_freq] - - clim = calc_climatology( - ts, - start, - stop, - min_count=clim_mincount, - set_year=set_year, - resample_how=resample_how, - ) - - new = StationData() - try: - new.update(self.get_meta()) - except MetaDataError: - new.update(self.get_meta(force_single_value=False)) - - new[var_name] = clim["data"] - vi = {} - if var_name in self.var_info: - vi.update(self.var_info[var_name]) - - new.var_info[var_name] = vi - new.var_info[var_name]["ts_type"] = "monthly" - new.var_info[var_name]["ts_type_src"] = ts_type.base - new.var_info[var_name]["is_climatology"] = True - new.var_info[var_name]["clim_start"] = start - new.var_info[var_name]["clim_stop"] = stop - new.var_info[var_name]["clim_freq"] = clim_freq - 
new.var_info[var_name]["clim_how"] = resample_how - new.var_info[var_name]["clim_mincount"] = clim_mincount - new.data_err[var_name] = clim["std"] - new.numobs[var_name] = clim["numobs"] - return new - - def resample_time( - self, var_name, ts_type, how=None, min_num_obs=None, inplace=False, **kwargs - ): - """Resample one of the time-series in this object - - Parameters - ---------- - var_name : str - name of data variable - ts_type : str - new frequency string (can be pyaerocom ts_type or valid pandas - frequency string) - how : str - how should the resampled data be averaged (e.g. mean, median) - min_num_obs : dict or int, optional - minimum number of observations required per period (when - downsampling). For details see - :func:`pyaerocom.time_resampler.TimeResampler.resample`) - inplace : bool - if True, then the current data object stored in self, will be - overwritten with the resampled time-series - **kwargs - Additional keyword args passed to - :func:`pyaerocom.time_resampler.TimeResampler.resample` - - Returns - ------- - StationData - with resampled variable timeseries - """ - if inplace: - outdata = self - else: - outdata = self.copy() - if not var_name in outdata: - raise KeyError(f"Variable {var_name} does not exist") - - to_ts_type = TsType(ts_type) # make sure to use AeroCom ts_type - - try: - from_ts_type = TsType(outdata.get_var_ts_type(var_name)) - except (MetaDataError, TemporalResolutionError): - from_ts_type = None - logger.warning( - f"Failed to access current temporal resolution of {var_name} data " - f"in StationData {outdata.station_name}. " - f"No resampling constraints will be applied" - ) - - data = outdata[var_name] - - if not isinstance(data, (pd.Series, xr.DataArray)): - data = outdata.to_timeseries(var_name) - resampler = TimeResampler(data) - new = resampler.resample( - to_ts_type=to_ts_type, - from_ts_type=from_ts_type, - how=how, - min_num_obs=min_num_obs, - **kwargs, - ) - - outdata[var_name] = new - outdata.var_info[var_name]["ts_type"] = to_ts_type.val - outdata.var_info[var_name].update(resampler.last_setup) - # there is other variables that are not resampled - if len(outdata.var_info) > 1 and outdata.ts_type is not None: - _tt = outdata.ts_type - outdata.ts_type = None - outdata.dtime = None - for var, info in outdata.var_info.items(): - if not var == var_name: - info["ts_type"] = _tt - else: # no other variables, update global class attributes - outdata.ts_type = to_ts_type.val - outdata.dtime = new.index.values - - return outdata - - def resample_timeseries(self, var_name, **kwargs): - """Wrapper for :func:`resample_time` (for backwards compatibility) - - Note - ---- - For backwards compatibility, this method will return a pandas Series - instead of the actual StationData object - """ - warnings.warn( - "This method was renamed to resample_time as a means " - "of harmonisation with GriddedData and ColocatedData", - DeprecationWarning, - stacklevel=2, - ) - return self.resample_time(var_name, **kwargs)[var_name] - - def remove_variable(self, var_name): - """Remove variable data - - Parameters - ---------- - var_name : str - name of variable that is to be removed - - Returns - ------- - StationData - current instance of this object, with data removed - - Raises - ------ - VarNotAvailableError - if the input variable is not available in this object - """ - if not self.has_var(var_name): - raise VarNotAvailableError(f"No such variable in StationData: {var_name}") - self.pop(var_name) - if var_name in self.var_info: - self.var_info.pop(var_name) - 
return self - - def insert_nans_timeseries(self, var_name): - """Fill up missing values with NaNs in an existing time series - - Note - ---- - This method does a resample of the data onto a regular grid. Thus, if - the input ``ts_type`` is different from the actual current ``ts_type`` - of the data, this method will not only insert NaNs but at the same. - - Parameters - --------- - var_name : str - variable name - inplace : bool - if True, the actual data in this object will be overwritten with - the new data that contains NaNs - - Returns - ------- - StationData - the modified station data object - - """ - ts_type = self.get_var_ts_type(var_name) - - self.resample_time(var_name, ts_type, inplace=True) - - return self - - def _to_ts_helper(self, var_name): - """Convert data internally to pandas.Series if it is not stored as such - - Parameters - ---------- - var_name : str - variable name of data - - Returns - ------- - pandas.Series - data as timeseries - """ - data = self[var_name] - if isinstance(data, pd.Series): - return data - - elif not data.ndim == 1: - raise NotImplementedError( - "Multi-dimensional data columns cannot be converted to time-series" - ) - self.check_dtime() - if not len(data) == len(self.dtime): - raise ValueError( - f"Mismatch between length of data array " - f"for variable {var_name} (length: {len(data)}) " - f"and time array (length: {len(self.dtime)})." - ) - self[var_name] = s = pd.Series(data, index=self.dtime) - return s - - def select_altitude(self, var_name, altitudes): - """Extract variable data within certain altitude range - - Note - ---- - Beta version - - Parameters - ---------- - var_name : str - name of variable for which metadata is supposed to be extracted - altitudes : list - altitude range in m, e.g. [0, 1000] - - Returns - ------- - pandas. Series or xarray.DataArray - data object within input altitude range - """ - data = self[var_name] - - if not isrange(altitudes): - raise NotImplementedError( - "So far only a range (low, high) is supported for altitude extraction." - ) - - if isinstance(data, xr.DataArray): - if not sorted(data.dims) == ["altitude", "time"]: - raise NotImplementedError( - "Can only handle dataarrays that contain 2 dimensions altitude and time" - ) - if isrange(altitudes): - if not isinstance(altitudes, slice): - altitudes = slice(altitudes[0], altitudes[1]) - result = data.sel(altitude=altitudes) - if len(result.altitude) == 0: - raise ValueError(f"no data in specified altitude range") - return result - - raise DataExtractionError("Cannot intepret input for altitude...") - - elif isinstance(data, pd.Series) or len(self.dtime) == len(data): - if not "altitude" in self: - raise ValueError("Missing altitude information") - if not isinstance(data, pd.Series): - data = pd.Series(data, self.dtime) - alt = self.altitude - if not isinstance(alt, (list, np.ndarray)): - raise AttributeError("need 1D altitude array") - elif not len(alt) == len(data): - raise DataDimensionError( - f"Altitude data and {var_name} data have different lengths" - ) - mask = np.logical_and(alt >= altitudes[0], alt <= altitudes[1]) - if mask.sum() == 0: - raise ValueError(f"no data in specified altitude range") - return data[mask] - - raise DataExtractionError( - f"Cannot extract altitudes: type of {var_name} ({type(data)}) is not supported" - ) - - def to_timeseries(self, var_name, **kwargs): - """Get pandas.Series object for one of the data columns - - Parameters - ---------- - var_name : str - name of variable (e.g. 
"od550aer") - - Returns - ------- - Series - time series object - - Raises - ------ - KeyError - if variable key does not exist in this dictionary - ValueError - if length of data array does not equal the length of the time array - """ - if not var_name in self: - raise KeyError(f"Variable {var_name} does not exist") - - data = self[var_name] - - if isinstance(data, xr.DataArray): - if not all([x in data.dims for x in ("time", "altitude")]): - raise NotImplementedError( - "Can only handle dataarrays that " - "contain 2 dimensions of time " - "and altitude" - ) - if not "altitude" in kwargs: - raise ValueError( - "please specify altitude range via input " - "arg: altitude, e.g. altitude=(100,110)" - ) - alt_info = kwargs.pop("altitude") - data = self.select_altitude(var_name, alt_info) - data = data.mean("altitude") - data = data.to_series() - - if not isinstance(data, pd.Series): - data = self._to_ts_helper(var_name) - - return data - - def plot_timeseries( - self, var_name, add_overlaps=False, legend=True, tit=None, **kwargs - ): - """ - Plot timeseries for variable - - Note - ---- - If you set input arg ``add_overlaps = True`` the overlapping timeseries - data - if it exists - will be plotted on top of the actual timeseries - using red colour and dashed line. As the overlapping data may be - identical with the actual data, you might want to increase the line - width of the actual timeseries using an additional input argument - ``lw=4``, or similar. - - Parameters - ---------- - var_name : str - name of variable (e.g. "od550aer") - add_overlaps : bool - if True and if overlapping data exists for this variable, it will - be added to the plot. - tit : :obj:`str`, optional - title of plot, if None, default title is used - **kwargs - additional keyword args passed to matplotlib ``plot`` method - - Returns - ------- - axes - matplotlib.axes instance of plot - - Raises - ------ - KeyError - if variable key does not exist in this dictionary - ValueError - if length of data array does not equal the length of the time array - """ - if "label" in kwargs: - lbl = kwargs.pop("label") - else: - lbl = var_name - try: - ts_type = self.get_var_ts_type(var_name) - lbl += f" ({ts_type})" - except Exception: - pass - if not "ax" in kwargs: - if "figsize" in kwargs: - fs = kwargs.pop("figsize") - else: - fs = (16, 8) - _, ax = plt.subplots(1, 1, figsize=fs) - else: - ax = kwargs.pop("ax") - # keep existing title if it exists - _tit = ax.get_title() - if not _tit == "": - tit = _tit - - if tit is None: - try: - tit = self.get_meta(force_single_value=True, quality_check=False)[ - "station_name" - ] - except Exception: - tit = "Failed to retrieve station_name" - s = self.to_timeseries(var_name) - ax.plot(s, label=lbl, **kwargs) - if add_overlaps and var_name in self.overlap: - so = self.overlap[var_name] - ax.plot(so, "--", lw=1, c="r", label=f"{var_name} (overlap)") - - ylabel = var_name - try: - if "units" in self.var_info[var_name]: - u = self.var_info[var_name]["units"] - if u is not None and not u in [1, "1"]: - ylabel += f" [{u}]" - except Exception: - logger.warning(f"Failed to access unit information for variable {var_name}") - ax.set_ylabel(ylabel) - ax.set_title(tit) - if legend: - ax.legend() - return ax - - def copy(self): - new = StationData() - for key, val in self.items(): - cpv = deepcopy(val) - new[key] = cpv - - return new - - def __str__(self): - """String representation""" - head = f"Pyaerocom {type(self).__name__}" - s = f"\n{head}\n{len(head) * '-'}" - arrays = "" - series = "" - - for k, 
v in self.items(): - if k[0] == "_": - continue - if isinstance(v, dict): - s += f"\n{k} ({type(v).__name__}):" - if v: - s += dict_to_str(v, indent=2) - else: - s += " " - elif isinstance(v, list): - s += f"{k} : {list_to_shortstr(v)}" - elif isinstance(v, np.ndarray): - if v.ndim == 1: - arrays += f"{k} : {list_to_shortstr(v)}" - else: - arrays += f"\n{k} (ndarray, shape {v.shape})" - arrays += f"\n{v}" - elif isinstance(v, pd.Series): - series += f"\n{k} (Series, {len(v)} items)" - else: - if isinstance(v, str) and v == "": - v = "" - s += f"\n{k}: {v}" - if arrays: - s += "\n\nData arrays\n................." - s += arrays - if series: - s += "\nPandas Series\n................." - s += series - - return s diff --git a/src/pyaro_readers/nilupmfebas/time_config.py b/src/pyaro_readers/nilupmfebas/time_config.py deleted file mode 100644 index 7ead7c5..0000000 --- a/src/pyaro_readers/nilupmfebas/time_config.py +++ /dev/null @@ -1,90 +0,0 @@ -""" -Definitions and helpers related to time conversion -""" -from datetime import datetime - -from iris import coord_categorisation - -TS_TYPES = ["minutely", "hourly", "daily", "weekly", "monthly", "yearly", "native"] - -# The following import was removed and the information about available unit -# strings was copied from the netCDF4 module directly here -microsec_units = ["microseconds", "microsecond", "microsec", "microsecs"] -millisec_units = ["milliseconds", "millisecond", "millisec", "millisecs"] -sec_units = ["second", "seconds", "sec", "secs", "s"] -min_units = ["minute", "minutes", "min", "mins"] -hr_units = ["hour", "hours", "hr", "hrs", "h"] -day_units = ["day", "days", "d"] - -# -# Start of the gregorian calendar -# adapted from here: https://github.com/Unidata/cftime/blob/master/cftime/_cftime.pyx -GREGORIAN_BASE = datetime(1582, 10, 15) - -IRIS_AGGREGATORS = { - "hourly": coord_categorisation.add_hour, - "daily": coord_categorisation.add_day_of_year, - "monthly": coord_categorisation.add_month_number, - "yearly": coord_categorisation.add_year, -} - -# some helper dictionaries for conversion of temporal resolution -# https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases -TS_TYPE_TO_PANDAS_FREQ = { - "minutely": "T", - "hourly": "H", - "daily": "D", - "weekly": "W-MON", - "monthly": "MS", # Month start ! - "season": "Q", - "yearly": "AS", -} - -PANDAS_RESAMPLE_OFFSETS = {"AS": "181D", "MS": "14D", "D": "12H", "H": "30T"} - -PANDAS_FREQ_TO_TS_TYPE = {v: k for k, v in TS_TYPE_TO_PANDAS_FREQ.items()} - -# frequency strings -# https://numpy.org/devdocs/reference/arrays.datetime.html#datetime-units -TS_TYPE_TO_NUMPY_FREQ = { - "minutely": "m", - "hourly": "h", - "daily": "D", - "weekly": "W", - "monthly": "M", # Month start ! - "yearly": "Y", -} - -# conversion of ts_types to strings that cf_units understands -TS_TYPE_TO_SI = { - "minutely": "min", - "hourly": "h", - "daily": "d", - "weekly": "week", - "monthly": "month", - "yearly": "yr", -} - -SI_TO_TS_TYPE = {v: k for k, v in TS_TYPE_TO_SI.items()} - -# conversion of datetime-like objects for given temporal resolutions (can, e.g. 
-# be used in plotting methods) -TS_TYPE_DATETIME_CONV = { - None: "%d.%m.%Y", # Default - "hourly": "%d.%m.%Y", - "3hourly": "%d.%m.%Y", - "daily": "%d.%m.%Y", - "weekly": "%d.%m.%Y", - "monthly": "%b %Y", - "yearly": "%Y", -} - -TS_TYPE_SECS = { - "minutely": 60, - "hourly": 3600, - "3hourly": 10800, - "daily": 86400, - "weekly": 604800, - "monthly": 2592000, # counting 3 days per month (APPROX) - "yearly": 31536000, # counting 365 days (APPROX) -} diff --git a/src/pyaro_readers/nilupmfebas/time_resampler.py b/src/pyaro_readers/nilupmfebas/time_resampler.py deleted file mode 100644 index c838e6f..0000000 --- a/src/pyaro_readers/nilupmfebas/time_resampler.py +++ /dev/null @@ -1,256 +0,0 @@ -""" -Module containing time resampling functionality -""" -import logging - -import pandas as pd -import xarray as xarr - -from .exceptions import TemporalResolutionError -from .helpers import isnumeric, resample_time_dataarray, resample_timeseries -from .tstype import TsType - -logger = logging.getLogger(__name__) - - -class TimeResampler: - """Object that can be use to resample timeseries data - - It supports hierarchical resampling of :class:`xarray.DataArray` objects - and :class:`pandas.Series` objects. - - Hierarchical means, that resampling constraints can be applied for each - level, that is, if hourly data is to be resampled to monthly, it may be - specified to first required minimum number of hours per day, and minimum - days per month, to create the output data. - """ - - AGGRS_UNIT_PRESERVE = ("mean", "median", "std", "max", "min") - DEFAULT_HOW = "mean" - - def __init__(self, input_data=None): - self.last_setup = None - # the following attribute is updated whenever a resampling operation is - # performed and it will check if any of the specified resampling - # aggregators invalidates unit preservation (e.g. using how=add for - # for accumulating precipitation...). See also attr. 
AGGRS_UNIT_PRESERVE - self._last_units_preserved = None - self._input_data = None - - if input_data is not None: - self.input_data = input_data - - @property - def last_units_preserved(self): - """Boolean indicating if last resampling operation preserves units""" - if self._last_units_preserved is None: - raise AttributeError("Please call resample first...") - return self._last_units_preserved - - @property - def input_data(self): - """Input data object that is to be resampled""" - return self._input_data - - @input_data.setter - def input_data(self, val): - if not isinstance(val, (pd.Series, xarr.DataArray)): - raise ValueError("Invalid input: need Series or DataArray") - self._input_data = val - - @property - def fun(self): - """Resamplig method (depends on input data type)""" - if isinstance(self.input_data, pd.Series): - return resample_timeseries - return resample_time_dataarray - - def _get_resample_how(self, fr, to, how): - if not isinstance(how, (str, dict)): - val = self.DEFAULT_HOW - elif isinstance(how, dict): - if to.val in how and fr.val in how[to.val]: - val = how[to.val][fr.val] - else: - val = self.DEFAULT_HOW - else: - val = how - return val - - def _get_idx_entry(self, fr, to, min_num_obs, how): - min_num = fr.get_min_num_obs(to, min_num_obs) - - _how = self._get_resample_how(fr, to, how) - - return (to.val, min_num, _how) - - def _gen_idx(self, from_ts_type, to_ts_type, min_num_obs, how): - """Generate hierarchical resampling index - - Return - ------ - list - list (can be considered the iterator) of 3-element tuples for each\ - resampling step, containing - - - frequency to which the current is converted - - minimum number of not-NaN values required for that step - - aggregator to be used (e.g. mean, median, ...) - - """ - if isnumeric(min_num_obs): - if not isinstance(how, str): - raise ValueError( - f"Error initialising resampling constraints. " - f"min_num_obs is numeric ({min_num_obs}) and input how is {how} " - f"(would need to be string, e.g. 
mean)" - ) - return [(to_ts_type.val, int(min_num_obs), how)] - if not isinstance(min_num_obs, dict): - raise ValueError( - f"Invalid input for min_num_obs, need dictionary or integer, got {min_num_obs}" - ) - - base_freqs = TsType.VALID - - start_base = base_freqs.index(from_ts_type.base) - stop_base = base_freqs.index(to_ts_type.base) - - last_from = from_ts_type - idx = [] - # loop from next base freq to end base freq, note that min_num_obs as - # well as input freqs may have multiplication factors, which may - # require min_num_obs values to be updated accordingly - for i in range(start_base + 1, stop_base + 1): - to_base = TsType(base_freqs[i]) - try: - entry = self._get_idx_entry(last_from, to_base, min_num_obs, how) - idx.append(entry) - last_from = to_base - except (TemporalResolutionError, ValueError): - continue - if len(idx) == 0 or not idx[-1][0] == to_ts_type.val: - try: - last_entry = self._get_idx_entry( - last_from, to_ts_type, min_num_obs, how - ) - except (TemporalResolutionError, ValueError): - _how = self._get_resample_how(last_from, to_ts_type, how) - last_entry = (to_ts_type.val, 0, _how) - idx.append(last_entry) - return idx - - def resample( - self, - to_ts_type, - input_data=None, - from_ts_type=None, - how=None, - min_num_obs=None, - **kwargs, - ): - """Resample input data - - Parameters - ---------- - to_ts_type : str or TsType - output resolution - input_data : pandas.Series or xarray.DataArray - data to be resampled - from_ts_type : str or TsType, optional - current temporal resolution of data - how : str - string specifying how the data is to be aggregated, default is mean - min_num_obs : dict or int, optinal - integer or nested dictionary specifying minimum number of - observations required to resample from higher to lower frequency. - For instance, if `input_data` is hourly and `to_ts_type` is - monthly, you may specify something like:: - - min_num_obs = - {'monthly' : {'daily' : 7}, - 'daily' : {'hourly' : 6}} - - to require at least 6 hours per day and 7 days per month. - - **kwargs - additional input arguments passed to resampling method - - Returns - ------- - pandas.Series or xarray.DataArray - resampled data object - """ - if how is None: - how = "mean" - - if how in self.AGGRS_UNIT_PRESERVE: - self._last_units_preserved = True - else: - self._last_units_preserved = False - - if not isinstance(to_ts_type, TsType): - to_ts_type = TsType(to_ts_type) - - if str(from_ts_type) == "native": - from_ts_type = None - - if from_ts_type is None: - if min_num_obs is not None: - logger.warning( - "setting min_num_obs to None since from_ts_type is not specified" - ) - min_num_obs = None - elif isinstance(from_ts_type, str): - from_ts_type = TsType(from_ts_type) - - if input_data is not None: - self.input_data = input_data - if self.input_data is None: - raise ValueError("Please provide data (Series or DataArray)") - - self.last_setup = dict(min_num_obs=min_num_obs, how=how) - - if from_ts_type is None: # native == unknown - freq = to_ts_type.to_pandas_freq() - data_out = self.fun(self.input_data, freq=freq, how=how, **kwargs) - elif to_ts_type > from_ts_type: - raise TemporalResolutionError( - f"Cannot resample time-series from {from_ts_type} to {to_ts_type}" - ) - elif to_ts_type == from_ts_type: - logger.info( - f"Input time frequency {to_ts_type.val} equals current frequency of data. 
" - f"Resampling will be applied anyways which will introduce NaN values " - f"at missing time stamps" - ) - - freq = to_ts_type.to_pandas_freq() - self._last_units_preserved = True - data_out = self.fun(self.input_data, freq=freq, how="mean", **kwargs) - - elif min_num_obs is None: - freq = to_ts_type.to_pandas_freq() - if not isinstance(how, str): - raise ValueError( - f"Temporal resampling without constraints can only use string type " - f"argument how (e.g. how=mean). Got {how}" - ) - - data_out = self.fun(self.input_data, freq=freq, how=how, **kwargs) - else: - _idx = self._gen_idx(from_ts_type, to_ts_type, min_num_obs, how) - data_out = self.input_data - aggrs = [] - for to_ts_type, mno, rshow in _idx: - freq = TsType(to_ts_type).to_pandas_freq() - data_out = self.fun( - data_out, freq=freq, how=rshow, min_num_obs=mno, **kwargs - ) - aggrs.append(rshow) - - if all([x in self.AGGRS_UNIT_PRESERVE for x in aggrs]): - self._last_units_preserved = True - else: - self._last_units_preserved = False - return data_out diff --git a/src/pyaro_readers/nilupmfebas/tstype.py b/src/pyaro_readers/nilupmfebas/tstype.py deleted file mode 100644 index a97af50..0000000 --- a/src/pyaro_readers/nilupmfebas/tstype.py +++ /dev/null @@ -1,426 +0,0 @@ -""" -General helper methods for the pyaerocom library. -""" -import logging -import re - -import numpy as np - -from .exceptions import TemporalResolutionError -from .time_config import ( - PANDAS_FREQ_TO_TS_TYPE, - TS_TYPE_TO_NUMPY_FREQ, - TS_TYPE_TO_PANDAS_FREQ, - TS_TYPE_TO_SI, - TS_TYPES, -) - -logger = logging.getLogger(__name__) - - -class TsType: - VALID = TS_TYPES - VALID_ITER = VALID[:-1] - FROM_PANDAS = PANDAS_FREQ_TO_TS_TYPE - TO_PANDAS = TS_TYPE_TO_PANDAS_FREQ - TO_NUMPY = TS_TYPE_TO_NUMPY_FREQ - TO_SI = TS_TYPE_TO_SI - - TS_MAX_VALS = { - "minutely": 360, # up to 6hourly - "hourly": 168, # up to weekly - "daily": 180, # up to 6 monthly - "weekly": 104, # up to ~2yearly - "monthly": 120, - } # up to 10yearly - - # "monthly": "days" below is because each month does not have the same number of days - # netcdf does time calculation for you given starting day and days past (CF convention) - TSTR_TO_CF = {"hourly": "hours", "daily": "days", "monthly": "days"} - - TOL_SECS_PERCENT = 5 - - def __init__(self, val): - self._mulfac = 1 - self._val = None - - self.val = val - - @property - def mulfac(self): - """Multiplication factor of frequency""" - return self._mulfac - - @mulfac.setter - def mulfac(self, value): - try: - value = int(value) - except Exception: - raise ValueError("mulfac needs to be int or convertible to int") - if self.base in self.TS_MAX_VALS and value > self.TS_MAX_VALS[self.base]: - raise ValueError( - f"Multiplication factor exceeds maximum allowed, which is " - f"{self.TS_MAX_VALS[self.base]}" - ) - self._mulfac = value - - @property - def base(self): - """Base string (without multiplication factor, cf :attr:`mulfac`)""" - return self._val - - @property - def val(self): - """Value of frequency (string type), e.g. 3daily""" - if self._mulfac != 1: - return f"{self._mulfac}{self._val}" - return self._val - - @val.setter - def val(self, val): - if val is None: - raise TemporalResolutionError( - "Invalid input, please provide valid frequency string..." 
- ) - mulfac = 1 - if val[0].isdigit(): - ivalstr = re.findall(r"\d+", val)[0] - val = val.split(ivalstr)[-1] - mulfac = int(ivalstr) - if not val in self.VALID: - try: - val = self._from_pandas(val) - except TemporalResolutionError: - raise TemporalResolutionError( - f"Invalid input for ts_type {val}. Choose from {self.VALID}" - ) - if val in self.TS_MAX_VALS and mulfac != 1: - if mulfac > self.TS_MAX_VALS[val]: - raise TemporalResolutionError( - f"Invalid input for ts_type {val}. Multiplication factor " - f"{mulfac} exceeds maximum allowed for {val}, which is " - f"{self.TS_MAX_VALS[val]}" - ) - self._val = val - self._mulfac = mulfac - - @property - def datetime64_str(self): - """Convert ts_type str to datetime64 unit string""" - return f"datetime64[{self.to_numpy_freq()}]" - - @property - def timedelta64_str(self): - """Convert ts_type str to datetime64 unit string""" - return f"timedelta64[{self.to_numpy_freq()}]" - - @property - def cf_base_unit(self): - """Convert ts_type str to CF convention time unit""" - if not self.base in self.TSTR_TO_CF: - raise NotImplementedError(f"Cannot convert {self.base} to CF str") - return self.TSTR_TO_CF[self.base] - - @property - def num_secs(self): - """Number of seconds in one period - - Note - ---- - Be aware that for monthly frequency the number of seconds is not well - defined! - """ - from cf_units import Unit - - cf = self.to_si() - total_secs = 1 / Unit("s").convert(1, cf) - return total_secs - - @property - def tol_secs(self): - """Tolerance in seconds for current TsType""" - total_secs = self.num_secs - frac = self.TOL_SECS_PERCENT / 100 - return int(np.ceil(frac * total_secs)) - - def to_timedelta64(self): - """ - Convert frequency to timedelta64 object - - Can be used, e.g. as tolerance when reindexing pandas Series - - Returns - ------- - timedelta64 - - """ - return np.timedelta64(1, self.to_numpy_freq()) - - @property - def next_higher(self): - """Next lower resolution code""" - if self.mulfac > 1: - return TsType(self._val) - - idx = self.VALID_ITER.index(self._val) - if idx == 0: - raise IndexError(f"No higher resolution available than {self}") - return TsType(self.VALID_ITER[idx - 1]) - - @property - def next_lower(self): - """Next lower resolution code - - This will go to the next lower base resolution, that is if current is - 3daily, it will return weekly, however, if current exceeds next lower - base, it will iterate that base, that is, if current is 8daily, next - lower will be 2weekly (and not 9daily). 
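
A short sketch of the ``next_lower`` behaviour described above, assuming ``pyaerocom.tstype.TsType`` (the upstream class this vendored module was copied from) is installed and behaves as documented here::

    from pyaerocom.tstype import TsType

    print(TsType("3daily").next_lower)    # weekly  (3 days still fit into one week)
    print(TsType("8daily").next_lower)    # 2weekly (8 days exceed one week)
    print(TsType("monthly").next_higher)  # weekly
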
- """ - idx = self.VALID_ITER.index(self._val) - if idx == len(self.VALID_ITER) - 1: - tst = TsType(self.base) - tst.mulfac = self.mulfac + 1 - return tst - tst = TsType(self.VALID_ITER[idx + 1]) - if self.mulfac == 1 or self.num_secs < tst.num_secs: - return tst - try: - maxmul = self.TS_MAX_VALS[tst.base] - except: - maxmul = 10 - numsecs = self.num_secs - for mulfac in range(1, maxmul + 1): - tst.mulfac = mulfac - if numsecs < tst.num_secs: - return tst - raise TemporalResolutionError( - f"Failed to determine next lower resolution for {self}" - ) - - @staticmethod - def valid(val): - try: - TsType(val) - return True - except TemporalResolutionError: - return False - - def to_numpy_freq(self): - if not self._val in self.TO_NUMPY: - raise TemporalResolutionError( - f"numpy frequency not available for {self._val}" - ) - freq = self.TO_NUMPY[self._val] - return f"{self.mulfac}{freq}" - - def to_pandas_freq(self): - """Convert ts_type to pandas frequency string""" - if not self._val in self.TO_PANDAS: - raise TemporalResolutionError( - f"pandas frequency not available for {self._val}" - ) - freq = self.TO_PANDAS[self._val] - if self._mulfac == 1: - return freq - return f"{self._mulfac}{freq}" - - def to_si(self): - """Convert to SI conform string (e.g. used for unit conversion)""" - base = self.base - if not base in self.TO_SI: - raise ValueError(f"Cannot convert ts_type={self} to SI unit string...") - si = self.TO_SI[base] - return si if self.mulfac == 1 else f"({self.mulfac}{si})" - - def get_min_num_obs(self, to_ts_type: "TsType", min_num_obs: dict) -> int: - selfstr = self.val - if to_ts_type >= self: # should occur rarely - if to_ts_type == self: - return 0 - raise TemporalResolutionError( - f"input ts_type {to_ts_type} is lower resolution than current {self}" - ) - - elif str(to_ts_type) in min_num_obs: - # output frequency is specified in min_num_obs (note: this may - # also be 3daily, etc, i.e., not restricted to base frequencies) - mno = min_num_obs[str(to_ts_type)] - if selfstr in mno: - return int(mno[selfstr]) - elif self.mulfac != 1 and self.base in mno: - min_num_base = mno[self.base] - return int(np.round(min_num_base / self.mulfac)) - - elif to_ts_type.base in min_num_obs: - mno = min_num_obs[to_ts_type.base] - if selfstr in mno: - val = mno[selfstr] - return int(np.round(to_ts_type.mulfac * val)) - - elif self.mulfac != 1 and self.base in mno: - min_num_base = mno[self.base] - val = min_num_base / self.mulfac * to_ts_type.mulfac - val = int(np.round(val)) - return val - raise ValueError( - f"could not infer min_num_obs value from input dict {min_num_obs} " - f"for conversion from {self} to {to_ts_type}" - ) - - def check_match_total_seconds(self, total_seconds): - """ - Check if this object matches with input interval length in seconds - - Parameters - ---------- - total_seconds : int or float - interval length in units of seconds (e.g. 86400 for daily) - - Returns - ------- - bool - - """ - try: - numsecs = self.num_secs - tolsecs = self.tol_secs - except ValueError: # native / undefined - return False - low, high = numsecs - tolsecs, numsecs + tolsecs - if np.logical_and(total_seconds >= low, total_seconds <= high): - return True - return False - - @staticmethod - def _try_infer_from_total_seconds(base, total_seconds): - """ - Infer multiplication factor required to match input interval length - - Not to be used directly, is used in :func:`from_total_seconds`. 
- - Parameters - ---------- - base : str - base frequency - total_seconds : int or float - interval length - - Raises - ------ - TemporalResolutionError - if TsType cannot be inferred - - Returns - ------- - TsType - inferred frequency - - """ - - if base in TsType.TS_MAX_VALS: - maxnum = TsType.TS_MAX_VALS[base] - else: - maxnum = 2 - candidates = [] - dts = [] - tstype = TsType(base) - for mulfac in range(1, maxnum): - tstype.mulfac = mulfac - if tstype.check_match_total_seconds(total_seconds): - dt = total_seconds - tstype.num_secs - dts.append(dt) - candidates.append(TsType(f"{mulfac}{base}")) - if ( - dt == 0 or dt < 0 - ): # current candidate has larger number of seconds than input - break - - if len(candidates) > 0: - return candidates[np.argmin(np.abs(dts))] - - raise TemporalResolutionError( - f"Period {total_seconds}s could not be associated with any " - f"allowed multiplication factor of base frequency {base}" - ) - - @staticmethod - def from_total_seconds(total_seconds): - """ - Try to infer TsType based on interval length - - Parameters - ---------- - total_seconds : int or float - total number of seconds - - Raises - ------ - TemporalResolutionError - If no TsType can be inferred for input number of seconds - - Returns - ------- - TsType - - """ - candidates = [] - candidates_diff = [] - for tst in TsType.VALID_ITER: - tstype = TsType(tst) - if tstype.check_match_total_seconds(total_seconds): - return tstype - diff = total_seconds - tstype.num_secs - if diff > 0: - candidates.append(tst) - candidates_diff.append(diff) - if len(candidates) > 0: - # sort by the candidate that has the lowest dt - candidates_sorted = [c for _, c in sorted(zip(candidates_diff, candidates))] - for base_tst in candidates_sorted: - try: - return TsType._try_infer_from_total_seconds(base_tst, total_seconds) - except TemporalResolutionError as e: - logger.info(e) - continue - - raise TemporalResolutionError( - f"failed to infer ts_type based on input dt={total_seconds} s" - ) - - def _from_pandas(self, val): - if not val in self.FROM_PANDAS: - raise TemporalResolutionError( - f"Invalid input: {val}, need pandas frequency string" - ) - return self.FROM_PANDAS[val] - - def __eq__(self, other): - if isinstance(other, str): - other = TsType(other) - return other.val == self.val - - def __lt__(self, other): - if isinstance(other, str): - other = TsType(other) - nss, nso = self.num_secs, other.num_secs - # inverted comparison, i.e. if other has less seconds if has higher - # resolution - return nss > nso - - def __le__(self, other): - return True if (self.__eq__(other) or self.__lt__(other)) else False - - def __gt__(self, other): - return not self.__le__(other) - - def __ge__(self, other): - return not self.__lt__(other) - - def __call__(self): - return self.val - - def __str__(self): - return self.val - - def __repr__(self): - return self.val diff --git a/src/pyaro_readers/nilupmfebas/ungriddeddata.py b/src/pyaro_readers/nilupmfebas/ungriddeddata.py deleted file mode 100644 index e99c65a..0000000 --- a/src/pyaro_readers/nilupmfebas/ungriddeddata.py +++ /dev/null @@ -1,3124 +0,0 @@ -from __future__ import annotations - -import fnmatch -import logging -import os -from datetime import datetime - -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd - -from . 
import const -from ._lowlevel_helpers import merge_dicts -from .combine_vardata_ungridded import combine_vardata_ungridded -from .exceptions import ( - DataCoverageError, - DataExtractionError, - MetaDataError, - StationCoordinateError, - StationNotFoundError, - TimeMatchError, - VarNotAvailableError, -) -from .geodesy import get_country_info_coords -from .helpers import ( - isnumeric, - merge_station_data, - same_meta_dict, - start_stop, - start_stop_str, -) -from .helpers_landsea_masks import get_mask_value, load_region_mask_xr -from .mathutils import in_range -from .metastandards import STANDARD_META_KEYS -from .region import Region -from .stationdata import StationData -from .units_helpers import get_unit_conversion_fac - -from .tstype import TsType - -logger = logging.getLogger(__name__) - - -class UngriddedData: - """Class representing point-cloud data (ungridded) - - The data is organised in a 2-dimensional numpy array where the first index - (rows) axis corresponds to individual measurements (i.e. one timestamp of - one variable) and along the second dimension (containing 11 columns) the - actual values are stored (in column 6) along with additional information, - such as metadata index (can be used as key in :attr:`metadata` to access - additional information related to this measurement), timestamp, latitude, - longitude, altitude of instrument, variable index and, in case of 3D - data (e.g. LIDAR profiles), also the altitude corresponding to the data - value. - - Note - ---- - - That said, let's look at two examples. - - **Example 1**: Suppose you load 3 variables from 5 files, each of which - contains 30 timestamps. This corresponds to a total of 3*5*30=450 data - points and hence, the shape of the underlying numpy array will be 450x11. - - **Example 2**: 3 variables, 5 files, 30 timestamps, but each variable - is height resolved, containing 100 altitudes => 3*5*30*100=4500 data points, - thus, the final shape will be 4500x11. - - TODO - ---- - Include unit attribute for each variable (in pyaerocom.io package: make - sure to include units during ungridded read, if available) - - Attributes - ---------- - metadata : dict - dictionary containing meta information about the data. Keys are - floating point numbers corresponding to each station, values are - corresponding dictionaries containing station information. - meta_idx : dict - dictionary containing index mapping for each station and variable. Keys - correspond to metadata key (float -> station, see :attr:`metadata`) and - values are dictionaries containing keys specifying variable name and - corresponding values are arrays or lists, specifying indices (rows) of - these station / variable information in :attr:`_data`. Note: this - information is redunant and is there to accelarate station data - extraction since the data index matches for a given metadata block - do not need to be searched in the underlying numpy array. - var_idx : dict - mapping of variable name (keys, e.g. od550aer) to numerical variable - index of this variable in data numpy array (in column specified by - :attr:`_VARINDEX`) - - Parameters - ---------- - num_points : :obj:`int`, optional - inital number of total datapoints (number of rows in 2D dataarray) - add_cols : :obj:`list`, optional - list of additional index column names of 2D datarray. - - """ - - #: version of class (for caching) - __version__ = "0.22" - - #: default number of rows that are dynamically added if total number of - #: data rows is reached. 
- _CHUNKSIZE = 10000000 - - #: The following indices specify what the individual rows of the datarray - #: are reserved for. These may be expanded when creating an instance of - #: this class by providing a list of additional index names. - _METADATAKEYINDEX = 0 - _TIMEINDEX = 1 - _LATINDEX = 2 - _LONINDEX = 3 - _ALTITUDEINDEX = 4 # altitude of measurement device - _VARINDEX = 5 - _DATAINDEX = 6 - _DATAHEIGHTINDEX = 7 - _DATAERRINDEX = 8 # col where errors can be stored - _DATAFLAGINDEX = 9 # can be used to store flags - _STOPTIMEINDEX = 10 # can be used to store stop time of acq. - _TRASHINDEX = ( - 11 # index where invalid data can be moved to (e.g. when outliers are removed) - ) - - # The following number denotes the kept precision after the decimal dot of - # the location (e.g denotes lat = 300.12345) - # used to code lat and long in a single number for a uniqueness test - _LOCATION_PRECISION = 5 - _LAT_OFFSET = 90.0 - - STANDARD_META_KEYS = STANDARD_META_KEYS - - ALLOWED_VERT_COORD_TYPES = ["altitude"] - - @property - def _ROWNO(self): - return self._data.shape[0] - - def __init__(self, num_points=None, add_cols=None): - if num_points is None: - num_points = self._CHUNKSIZE - - self._chunksize = num_points - self._index = self._init_index(add_cols) - - # keep private, this is not supposed to be used by the user - self._data = np.full([num_points, self._COLNO], np.nan) - - self.metadata = {} - # single value data revision is deprecated - self.data_revision = {} - self.meta_idx = {} - self.var_idx = {} - - self._idx = -1 - - self.filter_hist = {} - self._is_vertical_profile = False - - def _get_data_revision_helper(self, data_id): - """ - Helper method to get last data revision - - Parameters - ---------- - data_id : str - ID of dataset for which revision is to be retrieved - - Raises - ------ - MetaDataError - If multiple revisions are found for this dataset. - - Returns - ------- - latest revision (None if no revision is available). - - """ - rev = None - for meta in self.metadata.values(): - if meta["data_id"] == data_id: - if rev is None: - rev = meta["data_revision"] - elif not meta["data_revision"] == rev: - raise MetaDataError( - f"Found different data revisions for dataset {data_id}" - ) - if data_id in self.data_revision: - if not rev == self.data_revision[data_id]: - raise MetaDataError( - f"Found different data revisions for dataset {data_id}" - ) - self.data_revision[data_id] = rev - return rev - - def _check_index(self): - """Checks if all indices are assigned correctly""" - assert len(self.meta_idx) == len( - self.metadata - ), "Mismatch len(meta_idx) and len(metadata)" - - assert sum(self.meta_idx) == sum( - self.metadata - ), "Mismatch between keys of metadata dict and meta_idx dict" - - _varnums = self._data[:, self._VARINDEX] - var_indices = np.unique(_varnums[~np.isnan(_varnums)]) - - assert len(var_indices) == len( - self.var_idx - ), "Mismatch between number of variables in data array and var_idx attr." - - assert sum(var_indices) == sum( - self.var_idx.values() - ), "Mismatch between variable indices in data array and var_idx attr." 
- - vars_avail = self.var_idx - - for idx, meta in self.metadata.items(): - if not "var_info" in meta: - if not "variables" in meta: - raise AttributeError( - f"Need either variables (list) or var_info (dict) " - f"in meta block {idx}: {meta}" - ) - meta["var_info"] = {} - for v in meta["variables"]: - meta["var_info"][v] = {} - - var_idx = self.meta_idx[idx] - for var, indices in var_idx.items(): - if len(indices) == 0: - continue # no data assigned for this metadata index - - assert ( - var in meta["var_info"] - ), f"Var {var} is indexed in meta_idx[{idx}] but not in metadata[{idx}]" - var_idx_data = np.unique(self._data[indices, self._VARINDEX]) - assert ( - len(var_idx_data) == 1 - ), f"Found multiple variable indices for var {var}: {var_idx_data}" - assert var_idx_data[0] == vars_avail[var], ( - f"Mismatch between {var} index assigned in data and " - f"var_idx for {idx} in meta-block" - ) - - @staticmethod - def from_station_data(stats, add_meta_keys=None): - """ - Create UngriddedData from input station data object(s) - - Parameters - ---------- - stats : list or StationData - input data object(s) - add_meta_keys : list, optional - list of metadata keys that are supposed to be imported from the - input `StationData` objects, in addition to the default metadata - retrieved via :func:`StationData.get_meta`. - - Raises - ------ - ValueError - if any of the input data objects is not an instance of - :class:`StationData`. - - Returns - ------- - UngriddedData - ungridded data object created from input station data objects - - """ - if add_meta_keys is None: - add_meta_keys = [] - elif isinstance(add_meta_keys, str): - add_meta_keys = [add_meta_keys] - elif not isinstance(add_meta_keys, list): - raise ValueError( - f"Invalid input for add_meta_keys {add_meta_keys}... 
need list" - ) - if isinstance(stats, StationData): - stats = [stats] - data_obj = UngriddedData() - - meta_key = 0.0 - idx = 0 - - metadata = data_obj.metadata - meta_idx = data_obj.meta_idx - - var_count_glob = -1 - for stat in stats: - if isinstance(stat, dict): - stat = StationData(**stat) - elif not isinstance(stat, StationData): - raise ValueError("Need instances of StationData or dicts") - metadata[meta_key] = {} - metadata[meta_key].update( - stat.get_meta( - force_single_value=False, quality_check=False, add_none_vals=True - ) - ) - for key in add_meta_keys: - try: - val = stat[key] - except KeyError: - val = "undefined" - - metadata[meta_key][key] = val - - metadata[meta_key]["var_info"] = {} - - meta_idx[meta_key] = {} - - append_vars = list(stat.var_info) - - for var in append_vars: - if not var in data_obj.var_idx: - var_count_glob += 1 - var_idx = var_count_glob - data_obj.var_idx[var] = var_idx - else: - var_idx = data_obj.var_idx[var] - - vardata = stat[var] - - if isinstance(vardata, pd.Series): - times = vardata.index - values = vardata.values - else: - times = stat["dtime"] - values = vardata - if not len(times) == len(values): - raise ValueError - - times = np.asarray([np.datetime64(x, "s") for x in times]) - times = np.float64(times) - - num_times = len(times) - # check if size of data object needs to be extended - if (idx + num_times) >= data_obj._ROWNO: - # if totnum < data_obj._CHUNKSIZE, then the latter is used - data_obj.add_chunk(num_times) - - start = idx - stop = start + num_times - - # write common meta info for this station (data lon, lat and - # altitude are set to station locations) - data_obj._data[start:stop, data_obj._LATINDEX] = stat["latitude"] - data_obj._data[start:stop, data_obj._LONINDEX] = stat["longitude"] - data_obj._data[start:stop, data_obj._ALTITUDEINDEX] = stat["altitude"] - data_obj._data[start:stop, data_obj._METADATAKEYINDEX] = meta_key - - # write data to data object - data_obj._data[start:stop, data_obj._TIMEINDEX] = times - - data_obj._data[start:stop, data_obj._DATAINDEX] = values - - data_obj._data[start:stop, data_obj._VARINDEX] = var_idx - - if var in stat.data_flagged: - invalid = stat.data_flagged[var] - data_obj._data[start:stop, data_obj._DATAFLAGINDEX] = invalid - - if var in stat.data_err: - errs = stat.data_err[var] - data_obj._data[start:stop, data_obj._DATAERRINDEX] = errs - - var_info = stat["var_info"][var] - metadata[meta_key]["var_info"][var] = {} - metadata[meta_key]["var_info"][var].update(var_info) - meta_idx[meta_key][var] = np.arange(start, stop) - - idx += num_times - - meta_key += 1 - - # shorten data_obj._data to the right number of points - data_obj._data = data_obj._data[:idx] - - data_obj._check_index() - - return data_obj - - def add_station_data( - self, stat, meta_idx=None, data_idx=None, check_index=False - ): # pragma: no cover - raise NotImplementedError("Coming at some point") - if meta_idx is None: - meta_idx = self.last_meta_idx + 1 - elif meta_idx in self.meta_idx: - raise ValueError( - f"Cannot add data at meta block index {meta_idx}, index already exists" - ) - - if data_idx is None: - data_idx = self._data.shape[0] - elif not np.all(np.isnan(self._data[data_idx, :])): - raise ValueError( - f"Cannot add data at data index {data_idx}, index already exists" - ) - - @property - def last_meta_idx(self): - """ - Index of last metadata block - """ - return np.max(list(self.meta_idx)) - - @property - def index(self): - return self._index - - @property - def first_meta_idx(self): - # First available 
metadata index - return list(self.metadata)[0] - - def _init_index(self, add_cols=None): - """Init index mapping for columns in dataarray""" - idx = dict( - meta=self._METADATAKEYINDEX, - time=self._TIMEINDEX, - stoptime=self._STOPTIMEINDEX, - latitude=self._LATINDEX, - longitude=self._LONINDEX, - altitude=self._ALTITUDEINDEX, - varidx=self._VARINDEX, - data=self._DATAINDEX, - dataerr=self._DATAERRINDEX, - dataaltitude=self._DATAHEIGHTINDEX, - dataflag=self._DATAFLAGINDEX, - trash=self._TRASHINDEX, - ) - - next_idx = max(idx.values()) + 1 - if add_cols is not None: - if not isinstance(add_cols, (list, tuple)): - raise ValueError("Invalid input for add_cols. Need list or tuple") - for name in add_cols: - if name in idx: - raise ValueError( - f"Cannot add new index with name {name} since " - f"this index already exists at column position {idx[name]}" - ) - idx[name] = next_idx - next_idx += 1 - return idx - - @property - def _COLNO(self): - return len(self._index) - - @property - def has_flag_data(self): - """Boolean specifying whether this object contains flag data""" - return (~np.isnan(self._data[:, self._DATAFLAGINDEX])).any() - - @property - def is_vertical_profile(self): - """Boolean specifying whether is vertical profile""" - return self._is_vertical_profile - - @is_vertical_profile.setter - def is_vertical_profile(self, value): - """ - Boolean specifying whether is vertical profile. - Note must be set in ReadUngridded based on the reader - because the instance of class used during reading is - not the same as the instance used later in the workflow - """ - self._is_vertical_profile = value - - def copy(self): - """Make a copy of this object - - Returns - ------- - UngriddedData - copy of this object - - Raises - ------ - MemoryError - if copy is too big to fit into memory together with existing - instance - """ - from copy import deepcopy - - new = UngriddedData() - new._data = np.copy(self._data) - new.metadata = deepcopy(self.metadata) - new.data_revision = self.data_revision - new.meta_idx = deepcopy(self.meta_idx) - new.var_idx = deepcopy(self.var_idx) - new.filter_hist = deepcopy(self.filter_hist) - return new - - @property - def contains_vars(self) -> list[str]: - """List of all variables in this dataset""" - return list(self.var_idx) - - @property - def contains_datasets(self): - """List of all datasets in this object""" - datasets = [] - for info in self.metadata.values(): - ds = info["data_id"] - if not ds in datasets: - datasets.append(ds) - return datasets - - @property - def contains_instruments(self): - """List of all instruments in this object""" - instruments = [] - for info in self.metadata.values(): - try: - instr = info["instrument_name"] - if instr is not None and not instr in instruments: - instruments.append(instr) - except Exception: - pass - return instruments - - @property - def shape(self): - """Shape of data array""" - return self._data.shape - - @property - def is_empty(self): - """Boolean specifying whether this object contains data or not""" - return True if len(self.metadata) == 0 else False - - @property - def is_filtered(self): - """Boolean specifying whether this data object has been filtered - - Note - ---- - Details about applied filtering can be found in :attr:`filter_hist` - """ - if len(self.filter_hist) > 0: - return True - return False - - @property - def longitude(self): - """Longitudes of stations""" - vals = [] - for v in self.metadata.values(): - try: - vals.append(v["longitude"]) - except Exception: - vals.append(np.nan) - return vals - - 
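
To make the flat 2D layout from the class docstring concrete, a small self-contained sketch; the column shorthands and station values below are invented and simply mirror the ``_METADATAKEYINDEX`` through ``_DATAINDEX`` positions defined above::

    import numpy as np

    # column convention as above: metadata key, time, lat, lon, altitude,
    # variable index, value (the remaining columns are omitted here)
    META, TIME, LAT, LON, ALT, VAR, DATA = range(7)

    rows = np.full((3, 7), np.nan)
    # three hourly od550aer values (variable index 0) for one station (meta key 0.0)
    times = np.array(["2010-01-01T00", "2010-01-01T01", "2010-01-01T02"],
                     dtype="datetime64[s]").astype("float64")
    rows[:, META] = 0.0
    rows[:, TIME] = times
    rows[:, LAT], rows[:, LON], rows[:, ALT] = 52.1, 5.2, 10.0
    rows[:, VAR] = 0
    rows[:, DATA] = [0.11, 0.13, 0.12]

    # meta_idx maps metadata key -> variable -> row indices, as in the class above
    meta_idx = {0.0: {"od550aer": np.arange(3)}}
    print(rows[meta_idx[0.0]["od550aer"], DATA])   # [0.11 0.13 0.12]
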
@longitude.setter - def longitude(self, value): - raise AttributeError("Station longitudes cannot be changed") - - @property - def latitude(self): - """Latitudes of stations""" - vals = [] - for v in self.metadata.values(): - try: - vals.append(v["latitude"]) - except Exception: - vals.append(np.nan) - return vals - - @latitude.setter - def latitude(self, value): - raise AttributeError("Station latitudes cannot be changed") - - @property - def altitude(self): - """Altitudes of stations""" - vals = [] - for v in self.metadata.values(): - try: - vals.append(v["altitude"]) - except Exception: - vals.append(np.nan) - return vals - - @altitude.setter - def altitude(self, value): - raise AttributeError("Station altitudes cannot be changed") - - @property - def station_name(self): - """Latitudes of data""" - vals = [] - for v in self.metadata.values(): - try: - vals.append(v["station_name"]) - except Exception: - vals.append(np.nan) - return vals - - @station_name.setter - def station_name(self, value): - raise AttributeError("Station names cannot be changed") - - @property - def unique_station_names(self): - """List of unique station names""" - return sorted(list(dict.fromkeys(self.station_name))) - - @property - def available_meta_keys(self): - """List of all available metadata keys - - Note - ---- - This is a list of all metadata keys that exist in this dataset, but - it does not mean that all of the keys are registered in all metadata - blocks, especially if the data is merged from different sources with - different metadata availability - """ - metakeys = [] - for meta in self.metadata.values(): - for key in meta: - if not key in metakeys: - metakeys.append(key) - return metakeys - - @property - def nonunique_station_names(self): - """List of station names that occur more than once in metadata""" - import collections - - lst = self.station_name - return [item for item, count in collections.Counter(lst).items() if count > 1] - - @property - def time(self): - """Time dimension of data""" - raise NotImplementedError - - @time.setter - def time(self, value): - raise AttributeError("Time array cannot be changed") - - def last_filter_applied(self): - """Returns the last filter that was applied to this dataset - - To see all filters, check out :attr:`filter_hist` - """ - if not self.is_filtered: - raise AttributeError("No filters were applied so far") - return self.filter_hist[max(self.filter_hist)] - - def add_chunk(self, size=None): - """Extend the size of the data array - - Parameters - ---------- - size : :obj:`int`, optional - number of additional rows. If None (default) or smaller than - minimum chunksize specified in attribute ``_CHUNKSIZE``, then the - latter is used. 
- """ - if size is None or size < self._chunksize: - size = self._chunksize - chunk = np.full([size, self._COLNO], np.nan) - self._data = np.append(self._data, chunk, axis=0) - logger.info(f"adding chunk, new array size ({self._data.shape})") - - def _find_station_indices_wildcards(self, station_str): - """Find indices of all metadata blocks matching input station name - - Parameters - ---------- - station_str : str - station name or wildcard pattern - - Returns - ------- - list - list containing all metadata indices that match the input station - name or pattern - - Raises - ------ - StationNotFoundError - if no such station exists in this data object - """ - idx = [] - for i, meta in self.metadata.items(): - if fnmatch.fnmatch(meta["station_name"], station_str): - idx.append(i) - if len(idx) == 0: - raise StationNotFoundError( - f"No station available in UngriddedData that matches pattern {station_str}" - ) - return idx - - def _find_station_indices(self, station_str): - """Find indices of all metadata blocks matching input station name - - Parameters - ---------- - station_str : str - station name - - Returns - ------- - list - list containing all metadata indices that match the input station - name or pattern - - Raises - ------ - StationNotFoundError - if no such station exists in this data object - """ - idx = [] - for i, meta in self.metadata.items(): - if meta["station_name"] == station_str: - idx.append(i) - if len(idx) == 0: - raise StationNotFoundError( - f"No station available in UngriddedData that matches name {station_str}" - ) - return idx - - def _get_stat_coords(self): - meta_idx = [] - coords = [] - for idx, meta in self.metadata.items(): - try: - lat, lon = meta["latitude"], meta["longitude"] - except: - logger.warning(f"Could not retrieve lat lon coord at meta index {idx}") - continue - meta_idx.append(idx) - coords.append((lat, lon)) - return (meta_idx, coords) - - def check_set_country(self): - """CHecks all metadata entries for availability of country information - - Metadata blocks that are missing country entry will be updated based - on country inferred from corresponding lat / lon coordinate. Uses - :func:`pyaerocom.geodesy.get_country_info_coords` (library - reverse-geocode) to retrieve countries. This may be errouneous - close to country borders as it uses eucledian distance based on a list - of known locations. - - Note - ---- - Metadata blocks that do not contain latitude and longitude entries are - skipped. - - Returns - ------- - list - metadata entries where country was added - list - corresponding countries that were inferred from lat / lon - """ - meta_idx, coords = self._get_stat_coords() - info = get_country_info_coords(coords) - meta_idx_updated = [] - countries = [] - - for i, idx in enumerate(meta_idx): - meta = self.metadata[idx] - if not "country" in meta or meta["country"] is None: - country = info[i]["country"] - meta["country"] = country - meta["country_code"] = info[i]["country_code"] - meta_idx_updated.append(idx) - countries.append(country) - return (meta_idx_updated, countries) - - @property - def countries_available(self): - """ - Alphabetically sorted list of country names available - """ - # self.check_set_country() - countries = [] - for idx, meta in self.metadata.items(): - try: - countries.append(meta["country"]) - except: - logger.warning("No country information in meta block", idx) - if len(countries) == 0: - logger.warning( - "None of the metadata blocks contains " - "country information. 
You may want to " - "run class method check_set_country first " - "to automatically assign countries." - ) - return sorted(dict.fromkeys(countries)) - - def find_station_meta_indices(self, station_name_or_pattern, allow_wildcards=True): - """Find indices of all metadata blocks matching input station name - - You may also use wildcard pattern as input (e.g. *Potenza*) - - Parameters - ---------- - station_pattern : str - station name or wildcard pattern - allow_wildcards : bool - if True, input station_pattern will be used as wildcard pattern and - all matches are returned. - - Returns - ------- - list - list containing all metadata indices that match the input station - name or pattern - - Raises - ------ - StationNotFoundError - if no such station exists in this data object - """ - if not allow_wildcards: - return self._find_station_indices(station_name_or_pattern) - return self._find_station_indices_wildcards(station_name_or_pattern) - - # TODO: see docstring - def to_station_data( - self, - meta_idx, - vars_to_convert=None, - start=None, - stop=None, - freq=None, - ts_type_preferred=None, - merge_if_multi=True, - merge_pref_attr=None, - merge_sort_by_largest=True, - insert_nans=False, - allow_wildcards_station_name=True, - add_meta_keys=None, - resample_how=None, - min_num_obs=None, - ): - """Convert data from one station to :class:`StationData` - - Todo - ---- - - Review for retrieval of profile data (e.g. Lidar data) - - Parameters - ---------- - meta_idx : float - index of station or name of station. - vars_to_convert : :obj:`list` or :obj:`str`, optional - variables that are supposed to be converted. If None, use all - variables that are available for this station - start - start time, optional (if not None, input must be convertible into - pandas.Timestamp) - stop - stop time, optional (if not None, input must be convertible into - pandas.Timestamp) - freq : str - pandas frequency string (e.g. 'D' for daily, 'M' for month end) or - valid pyaerocom ts_type - merge_if_multi : bool - if True and if data request results in multiple instances of - StationData objects, then these are attempted to be merged into one - :class:`StationData` object using :func:`merge_station_data` - merge_pref_attr - only relevant for merging of multiple matches: preferred attribute - that is used to sort the individual StationData objects by relevance. - Needs to be available in each of the individual StationData objects. - For details cf. :attr:`pref_attr` in docstring of - :func:`merge_station_data`. Example could be `revision_date`. If - None, then the stations will be sorted based on the number of - available data points (if :attr:`merge_sort_by_largest` is True, - which is default). - merge_sort_by_largest : bool - only relevant for merging of multiple matches: cf. prev. attr. and - docstring of :func:`merge_station_data` method. - insert_nans : bool - if True, then the retrieved :class:`StationData` objects are filled - with NaNs - allow_wildcards_station_name : bool - if True and if input `meta_idx` is a string (i.e. a station name or - pattern), metadata matches will be identified applying wildcard - matches between input `meta_idx` and all station names in this - object. - - Returns - ------- - StationData or list - StationData object(s) containing results. list is only returned if - input for meta_idx is station name and multiple matches are - detected for that station (e.g. data from different instruments), - else single instance of StationData. 
All variable time series are - inserted as pandas Series - """ - if isinstance(vars_to_convert, str): - vars_to_convert = [vars_to_convert] - elif vars_to_convert is None: - vars_to_convert = self.contains_vars - if len(vars_to_convert) == 0: - raise DataCoverageError( - "UngriddedData object does not contain any variables" - ) - if start is None and stop is None: - start = pd.Timestamp("1970") - stop = pd.Timestamp("2200") - else: - start, stop = start_stop(start, stop) - - if isinstance(meta_idx, str): - # user asks explicitely for station name, find all meta indices - # that match this station - meta_idx = self.find_station_meta_indices( - meta_idx, allow_wildcards_station_name - ) - if not isinstance(meta_idx, list): - meta_idx = [meta_idx] - - stats = [] - # ToDo: check consistency, consider using methods in helpers.py - # check also Hans' issue on the topic - start, stop = np.datetime64(start), np.datetime64(stop) - - for idx in meta_idx: - try: - stat = self._metablock_to_stationdata( - idx, vars_to_convert, start, stop, add_meta_keys - ) - if ts_type_preferred is not None: - if "ts_type" in stat["var_info"][vars_to_convert[0]].keys(): - if TsType( - stat["var_info"][vars_to_convert[0]]["ts_type"] - ) < TsType(ts_type_preferred): - continue - elif "ts_type" in stat.keys(): - if TsType(stat["ts_type"]) < TsType(ts_type_preferred): - continue - else: - raise KeyError("Could not find ts_type in stat") - stats.append(stat) - except (VarNotAvailableError, DataCoverageError) as e: - logger.info(f"Skipping meta index {idx}. Reason: {repr(e)}") - if merge_if_multi and len(stats) > 1: - if len(vars_to_convert) > 1: - raise NotImplementedError( - "Cannot yet merge multiple stations with multiple variables." - ) - if merge_pref_attr is None: - merge_pref_attr = self._try_infer_stat_merge_pref_attr(stats) - merged = merge_station_data( - stats, - vars_to_convert, - pref_attr=merge_pref_attr, - sort_by_largest=merge_sort_by_largest, - fill_missing_nan=False, - resample_how=resample_how, - min_num_obs=min_num_obs, - ) - stats = [merged] - - stats_ok = [] - for stat in stats: - for var in vars_to_convert: - if not var in stat: - continue - if freq is not None: - stat.resample_time( - var, - freq, - how=resample_how, - min_num_obs=min_num_obs, - inplace=True, - ) - elif insert_nans: - stat.insert_nans_timeseries(var) - if np.all(np.isnan(stat[var].values)): - stat = stat.remove_variable(var) - if any([x in stat for x in vars_to_convert]): - stats_ok.append(stat) - - if len(stats_ok) == 0: - raise DataCoverageError( - f"{vars_to_convert} data could not be retrieved " - f"for meta index (or station name) {meta_idx}" - ) - elif len(stats_ok) == 1: - # return StationData object and not list - return stats_ok[0] - return stats_ok - - def _try_infer_stat_merge_pref_attr(self, stats): - """Checks if a preferred attribute for handling of overlaps can be inferred - - Parameters - ---------- - stats : list - list of :class:`StationData` objects - - Returns - ------- - str - preferred merge attribute parameter, if applicable, else None - """ - data_id = None - pref_attr = None - for stat in stats: - if not "data_id" in stat: - return None - elif data_id is None: - data_id = stat["data_id"] - from pyaerocom.metastandards import DataSource - - s = DataSource( - data_id=data_id - ) # reads default data source info that may contain preferred meta attribute - pref_attr = s.stat_merge_pref_attr - if pref_attr is None: - return None - elif ( - not stat["data_id"] == data_id - ): # station data objects contain 
different data sources - return None - return pref_attr - - ### TODO: check if both `variables` and `var_info` attrs are required in - ### metdatda blocks - def _metablock_to_stationdata( - self, meta_idx, vars_to_convert, start=None, stop=None, add_meta_keys=None - ): - """Convert one metadata index to StationData (helper method) - - See :func:`to_station_data` for input parameters - """ - if add_meta_keys is None: - add_meta_keys = [] - elif isinstance(add_meta_keys, str): - add_meta_keys = [add_meta_keys] - - sd = StationData() - meta = self.metadata[meta_idx] - - # TODO: make sure in reading classes that data_revision is assigned - # to each metadata block and not only in self.data_revision - rev = None - if "data_revision" in meta: - rev = meta["data_revision"] - else: - try: - rev = self.data_revision[meta["data_id"]] - except Exception: - logger.warning("Data revision could not be accessed") - sd.data_revision = rev - try: - vars_avail = list(meta["var_info"]) - except KeyError: - if not "variables" in meta or meta["variables"] in (None, []): - raise VarNotAvailableError( - "Metablock does not contain variable information" - ) - vars_avail = meta["variables"] - - for key in self.STANDARD_META_KEYS + add_meta_keys: - if key in sd.PROTECTED_KEYS: - logger.warning(f"skipping protected key: {key}") - continue - try: - sd[key] = meta[key] - except KeyError: - pass - - try: - sd["ts_type_src"] = meta["ts_type"] - except KeyError: - pass - - # assign station coordinates explicitely - for ck in sd.STANDARD_COORD_KEYS: - try: - sd.station_coords[ck] = meta[ck] - except KeyError: - pass - # if no input variables are provided, use the ones that are available - # for this metadata block - if vars_to_convert is None: - vars_to_convert = vars_avail - - # find overlapping variables (ignore all other ones) - vars_avail = np.intersect1d(vars_to_convert, vars_avail) - if not len(vars_avail) >= 1: - raise VarNotAvailableError( - "None of the input variables matches, or station does not contain data." 
- ) - # init helper boolean that is set to True if valid data can be found - # for at least one of the input variables - FOUND_ONE = False - for var in vars_avail: - # get indices of this variable - var_idx = self.meta_idx[meta_idx][var] - - # vector of timestamps corresponding to this variable - dtime = self._data[var_idx, self._TIMEINDEX].astype("datetime64[s]") - - # get subset - subset = self._data[var_idx] - - # make sure to extract only valid timestamps - if start is None: - start = dtime.min() - if stop is None: - stop = dtime.max() - - # create access mask for valid time stamps - tmask = np.logical_and(dtime >= start, dtime <= stop) - - # make sure there is some valid data - if tmask.sum() == 0: - logger.info( - f"Ignoring station {sd['station_name']}, var {var} ({sd['data_id']}): " - f"no data available in specified time interval {start} - {stop}" - ) - continue - - dtime = dtime[tmask] - subset = subset[tmask] - - vals = subset[:, self._DATAINDEX] - if np.all(np.isnan(vals)): - logger.warning( - f"Ignoring station {sd['station_name']}, var {var} ({sd['data_id']}): " - f"All values are NaN" - ) - continue - vals_err = subset[:, self._DATAERRINDEX] - flagged = subset[:, self._DATAFLAGINDEX] - altitude = subset[:, self._DATAHEIGHTINDEX] - - data = pd.Series(vals, dtime) - if not data.index.is_monotonic_increasing: - data = data.sort_index() - if any(~np.isnan(vals_err)): - sd.data_err[var] = vals_err - if any(~np.isnan(flagged)): - sd.data_flagged[var] = flagged - - sd["dtime"] = data.index.values - sd[var] = data - sd["var_info"][var] = {} - FOUND_ONE = True - # check if there is information about altitude (then relevant 3D - # variables and parameters are included too) - if "var_info" in meta: - vi = meta["var_info"] - else: - vi = {} - if not np.isnan(altitude).all(): - if "altitude" in vi: - sd.var_info["altitude"] = vi["altitude"] - sd.altitude = altitude - if var in vi: - sd.var_info[var].update(vi[var]) - - if len(data.index) == len(data.index.unique()): - sd.var_info[var]["overlap"] = False - else: - sd.var_info[var]["overlap"] = True - if not FOUND_ONE: - raise DataCoverageError( - f"Could not retrieve any valid data for station {sd['station_name']} " - f"and input variables {vars_to_convert}" - ) - return sd - - def _generate_station_index(self, by_station_name=True, ignore_index=None): - """Generates index to loop over station names or metadata block indices""" - if ignore_index is None: - if by_station_name: - return self.unique_station_names # all station names - return list(range(len(self.metadata))) # all meta indices - - if not by_station_name: - from pyaerocom.helpers import isnumeric - - if isnumeric(ignore_index): - ignore_index = [ignore_index] - if not isinstance(ignore_index, list): - raise ValueError("Invalid input for ignore_index, need number or list") - return [i for i in range(len(self.metadata)) if not i in ignore_index] - - # by station name and ignore certation stations - _iter = [] - if isinstance(ignore_index, str): - ignore_index = [ignore_index] - if not isinstance(ignore_index, list): - raise ValueError("Invalid input for ignore_index, need str or list") - for stat_name in self.unique_station_names: - ok = True - for name_or_pattern in ignore_index: - if fnmatch.fnmatch(stat_name, name_or_pattern): - ok = False - if ok: - _iter.append(stat_name) - return _iter - - def to_station_data_all( - self, - vars_to_convert=None, - start=None, - stop=None, - freq=None, - ts_type_preferred=None, - by_station_name=True, - ignore_index=None, - **kwargs, - ): - 
"""Convert all data to :class:`StationData` objects - - Creates one instance of :class:`StationData` for each metadata block in - this object. - - Parameters - ---------- - vars_to_convert : :obj:`list` or :obj:`str`, optional - variables that are supposed to be converted. If None, use all - variables that are available for this station - start - start time, optional (if not None, input must be convertible into - pandas.Timestamp) - stop - stop time, optional (if not None, input must be convertible into - pandas.Timestamp) - freq : str - pandas frequency string (e.g. 'D' for daily, 'M' for month end) - or valid pyaerocom ts_type (e.g. 'hourly', 'monthly'). - by_station_name : bool - if True, then iter over unique_station_name (and merge multiple - matches if applicable), else, iter over metadata index - **kwargs - additional keyword args passed to :func:`to_station_data` (e.g. - `merge_if_multi, merge_pref_attr, merge_sort_by_largest, - insert_nans`) - - Returns - ------- - dict - 4-element dictionary containing following key / value pairs: - - - stats: list of :class:`StationData` objects - - station_name: list of corresponding station names - - latitude: list of latitude coordinates - - longitude: list of longitude coordinates - - """ - out_data = { - "stats": [], - "station_name": [], - "latitude": [], - "failed": [], - "longitude": [], - } - - _iter = self._generate_station_index(by_station_name, ignore_index) - for idx in _iter: - try: - data = self.to_station_data( - idx, - vars_to_convert, - start, - stop, - freq, - merge_if_multi=True, - allow_wildcards_station_name=False, - ts_type_preferred=ts_type_preferred, - **kwargs, - ) - - out_data["latitude"].append(data["latitude"]) - out_data["longitude"].append(data["longitude"]) - out_data["station_name"].append(data["station_name"]) - out_data["stats"].append(data) - - # catch the exceptions that are acceptable - except ( - VarNotAvailableError, - TimeMatchError, - DataCoverageError, - NotImplementedError, - StationCoordinateError, - ) as e: - logger.warning(f"Failed to convert to StationData Error: {repr(e)}") - out_data["failed"].append([idx, repr(e)]) - return out_data - - # TODO: check more general cases (i.e. no need to convert to StationData - # if no time conversion is required) - def get_variable_data( - self, variables, start=None, stop=None, ts_type=None, **kwargs - ): # pragma: no cover - """Extract all data points of a certain variable - - Parameters - ---------- - vars_to_extract : :obj:`str` or :obj:`list` - all variables that are supposed to be accessed - """ - if isinstance(variables, str): - variables = [variables] - all_stations = self.to_station_data_all( - variables, start, stop, freq=ts_type, **kwargs - ) - result = {} - num_stats = {} - for var in variables: - result[var] = [] - num_stats[var] = 0 - for stat_data in all_stations: - if stat_data is not None: - num_points = len(stat_data.dtime) - for var in variables: - if var in stat_data: - num_stats[var] += 1 - result[var].extend(stat_data[var]) - else: - result[var].extend([np.nan] * num_points) - result["num_stats"] = num_stats - return result - - def _check_str_filter_match(self, meta, negate, str_f): - # Check string equality for input meta data and filters. 
Supports - # wildcard matching - for metakey, filterval in str_f.items(): - # key does not exist in this specific meta_block - if not metakey in meta: - return False - # check if this key is in negate list (then result will be True - # for all that do not match the specified filter input value(s)) - neg = metakey in negate - - # actual value of this key in input metadata - metaval = meta[metakey] - - # check equality of values - match = metaval == filterval - if match: # direct match found - if neg: # key is flagged in negate -> no match - return False - else: # no direct match found - # check wildcard match - if "*" in filterval: # no wildcard in - match = fnmatch.fnmatch(metaval, filterval) - if neg: - if match: - return False - else: - if not match: - return False - elif not neg: # no match, no wildcard match and not inverted - return False - return True - - def _check_filter_match(self, meta, negate, str_f, list_f, range_f, val_f): - """Helper method that checks if station meta item matches filters - - Note - ---- - This method is used in :func:`apply_filter` - """ - if not self._check_str_filter_match(meta, negate, str_f): - return False - - for metakey, filterval in list_f.items(): - if not metakey in meta: - return False - neg = metakey in negate - metaval = meta[metakey] - match = metaval == filterval - if match: # lists are identical - if neg: - return False - else: - # value in metadata block is different from filter value - match = metaval in filterval - if match: - if neg: - return False - else: - # current metavalue is not equal the filterlist and is also - # not contained in the filterlist. However, one or more - # entries in the filterlist may be wildcard - if isinstance(metaval, str): - found = False - for entry in filterval: - if "*" in entry: - match = fnmatch.fnmatch(metaval, entry) - if match: - found = True - if neg: - return False - if not found and not neg: - return False - # range filter - for metakey, filterval in range_f.items(): - if not metakey in meta: - return False - neg = metakey in negate - match = in_range(meta[metakey], filterval[0], filterval[1]) - if (neg and match) or (not neg and not match): - return False - - for metakey, filterval in val_f.items(): - if not metakey in meta: - return False - neg = metakey in negate - match = meta[metakey] == filterval - if (neg and match) or (not neg and not match): - return False - return True - - def _init_meta_filters(self, **filter_attributes): - """Init filter dictionary for :func:`apply_filter_meta` - - Parameters - ---------- - **filter_attributes - valid meta keywords that are supposed to be filtered and the - corresponding filter values (or value ranges) - Only valid meta keywords are considered (e.g. data_id, - longitude, latitude, altitude, ts_type) - - Returns - ------- - tuple - 3-element tuple containing - - - dict: string match filters for metakeys \ - (e.g. dict['data_id'] = 'AeronetSunV2Lev2.daily') - - dict: in-list match filters for metakeys \ - (e.g. dict['station_name'] = ['stat1', 'stat2', 'stat3']) - - dict: in-range dictionary for metakeys \ - (e.g. dict['longitude'] = [-30, 30]) - - """ - # initiate filters that are checked - valid_keys = list(self.metadata[self.first_meta_idx]) - str_f = {} - list_f = {} - range_f = {} - val_f = {} - for key, val in filter_attributes.items(): - if not key in valid_keys: - raise OSError( - f"Invalid input parameter for filtering: {key}. 
" - f"Please choose from {valid_keys}" - ) - - if isinstance(val, str): - str_f[key] = val - elif isnumeric(val): - val_f[key] = val - elif isinstance(val, (list, np.ndarray, tuple)): - if all([isinstance(x, str) for x in val]): - list_f[key] = val - elif len(val) == 2 and all([isnumeric(x) for x in val]): - try: - low, high = float(val[0]), float(val[1]) - if not low < high: - raise ValueError("First entry needs to be smaller than 2nd") - range_f[key] = [low, high] - except Exception as e: - list_f[key] = val - else: - list_f[key] = val - return (str_f, list_f, range_f, val_f) - - def check_convert_var_units(self, var_name, to_unit=None, inplace=True): - obj = self if inplace else self.copy() - - # get the unit - if to_unit is None: - to_unit = const.VARS[var_name]["units"] - - for i, meta in obj.metadata.items(): - if var_name in meta["var_info"]: - try: - unit = meta["var_info"][var_name]["units"] - except KeyError: - add_str = "" - if "unit" in meta["var_info"][var_name]: - add_str = ( - "Corresponding var_info dict contains " - 'attr. "unit", which is deprecated, please ' - "check corresponding reading routine. " - ) - raise MetaDataError( - f"Failed to access unit information for variable {var_name} " - f"in metadata block {i}. {add_str}" - ) - fac = get_unit_conversion_fac(unit, to_unit, var_name) - if fac != 1: - meta_idx = obj.meta_idx[i][var_name] - current = obj._data[meta_idx, obj._DATAINDEX] - new = current * fac - obj._data[meta_idx, obj._DATAINDEX] = new - obj.metadata[i]["var_info"][var_name]["units"] = to_unit - - return obj - - def check_unit(self, var_name, unit=None): - """Check if variable unit corresponds to AeroCom unit - - Parameters - ---------- - var_name : str - variable name for which unit is to be checked - unit : :obj:`str`, optional - unit to be checked, if None, AeroCom default unit is used - - Raises - ------ - MetaDataError - if unit information is not accessible for input variable name - """ - if unit is None: - unit = const.VARS[var_name]["units"] - - units = [] - for i, meta in self.metadata.items(): - if var_name in meta["var_info"]: - try: - u = meta["var_info"][var_name]["units"] - if not u in units: - units.append(u) - except KeyError: - add_str = "" - if "unit" in meta["var_info"][var_name]: - add_str = ( - "Corresponding var_info dict contains " - 'attr. "unit", which is deprecated, please ' - "check corresponding reading routine. " - ) - raise MetaDataError( - f"Failed to access unit information for variable {var_name} " - f"in metadata block {i}. {add_str}" - ) - if len(units) == 0 and str(unit) != "1": - raise MetaDataError( - f"Failed to access unit information for variable {var_name}. 
" - f"Expected unit {unit}" - ) - for u in units: - if not get_unit_conversion_fac(u, unit, var_name) == 1: - raise MetaDataError(f"Invalid unit {u} detected (expected {unit})") - - def set_flags_nan(self, inplace=False): - """Set all flagged datapoints to NaN - - Parameters - ---------- - inplace : bool - if True, the flagged datapoints will be set to NaN in this object, - otherwise a new oject will be created and returned - - Returns - ------- - UngriddedData - data object that has all flagged data values set to NaN - - Raises - ------ - AttributeError - if no flags are assigned - """ - - if not self.has_flag_data: - raise AttributeError( - "Ungridded data object does not contain flagged data points" - ) - if inplace: - obj = self - else: - obj = self.copy() - mask = obj._data[:, obj._DATAFLAGINDEX] == 1 - - obj._data[mask, obj._DATAINDEX] = np.nan - obj._add_to_filter_history("set_flags_nan") - return obj - - # TODO: check, confirm and remove Beta version note in docstring - def remove_outliers( - self, - var_name, - inplace=False, - low=None, - high=None, - unit_ref=None, - move_to_trash=True, - ): - """Method that can be used to remove outliers from data - - Parameters - ---------- - var_name : str - variable name - inplace : bool - if True, the outliers will be removed in this object, otherwise - a new oject will be created and returned - low : float - lower end of valid range for input variable. If None, then the - corresponding value from the default settings for this variable - are used (cf. minimum attribute of `available variables - `__) - high : float - upper end of valid range for input variable. If None, then the - corresponding value from the default settings for this variable - are used (cf. maximum attribute of `available variables - `__) - unit_ref : str - reference unit for assessment of input outlier ranges: all data - needs to be in that unit, else an Exception will be raised - move_to_trash : bool - if True, then all detected outliers will be moved to the trash - column of this data object (i.e. column no. specified at - :attr:`UngriddedData._TRASHINDEX`). - - Returns - ------- - UngriddedData - ungridded data object that has all outliers for this variable - removed. - - Raises - ------ - ValueError - if input :attr:`move_to_trash` is True and in case for some of the - measurements there is already data in the trash. - """ - if inplace: - new = self - else: - new = self.copy() - - new.check_convert_var_units(var_name, to_unit=unit_ref) - - if low is None: - low = const.VARS[var_name].minimum - logger.info(f"Setting {var_name} outlier lower lim: {low:.2f}") - if high is None: - high = const.VARS[var_name].maximum - logger.info(f"Setting {var_name} outlier upper lim: {high:.2f}") - var_idx = new.var_idx[var_name] - var_mask = new._data[:, new._VARINDEX] == var_idx - - all_data = new._data[:, new._DATAINDEX] - invalid_mask = np.logical_or(all_data < low, all_data > high) - - mask = invalid_mask * var_mask - invalid_vals = new._data[mask, new._DATAINDEX] - new._data[mask, new._DATAINDEX] = np.nan - - if move_to_trash: - # check if trash is empty and put outliers into trash - trash = new._data[mask, new._TRASHINDEX] - if np.isnan(trash).sum() == len(trash): # trash is empty - new._data[mask, new._TRASHINDEX] = invalid_vals - else: - raise ValueError( - "Trash is not empty for some of the datapoints. 
" - "Please empty trash first using method " - ":func:`empty_trash` or deactivate input arg " - ":attr:`move_to_trash`" - ) - - new._add_to_filter_history( - f"Removed {len(invalid_vals)} outliers from {var_name} data " - f"(range: {low}-{high}, in trash: {move_to_trash})" - ) - return new - - def _add_to_filter_history(self, info): - """Add info to :attr:`filter_hist` - - Key is current system time string - - Parameter - --------- - info - information to be appended to filter history - """ - time_str = datetime.now().strftime("%Y%m%d%H%M%S") - self.filter_hist[int(time_str)] = info - - def empty_trash(self): - """Set all values in trash column to NaN""" - self._data[:, self._TRASHINDEX] = np.nan - - @property - def station_coordinates(self): - """dictionary with station coordinates - - Returns - ------- - dict - dictionary containing station coordinates (latitude, longitude, - altitude -> values) for all stations (keys) where these parameters - are accessible. - """ - d = {"station_name": [], "latitude": [], "longitude": [], "altitude": []} - - for i, meta in self.metadata.items(): - if not "station_name" in meta: - logger.warning(f"Skipping meta-block {i}: station_name is not defined") - continue - elif not all(name in meta for name in const.STANDARD_COORD_NAMES): - logger.warning( - f"Skipping meta-block {i} (station {meta['station_name']}): " - f"one or more of the coordinates is not defined" - ) - continue - - stat = meta["station_name"] - - if stat in d["station_name"]: - continue - d["station_name"].append(stat) - for k in const.STANDARD_COORD_NAMES: - d[k].append(meta[k]) - return d - - def _find_meta_matches(self, negate=None, *filters): - """Find meta matches for input attributes - - Parameters - ---------- - negate : list or str, optional - specified meta key(s) provided in `*filters` that are - supposed to be treated as 'not valid'. E.g. if - `station_name="bad_site"` is input in `filter_attributes` and if - `station_name` is listed in `negate`, then all metadata blocks - containing "bad_site" as station_name will be excluded in output - data object. 
- *filters - list of filters to be applied - - Returns - ------- - tuple - list of metadata indices that match input filter - """ - if negate is None: - negate = [] - elif isinstance(negate, str): - negate = [negate] - elif not isinstance(negate, list): - raise ValueError( - f"Invalid input for negate {negate}, need list or str or None" - ) - meta_matches = [] - totnum = 0 - for meta_idx, meta in self.metadata.items(): - if self._check_filter_match(meta, negate, *filters): - meta_matches.append(meta_idx) - for var in meta["var_info"]: - if var in self.ALLOWED_VERT_COORD_TYPES: - continue # altitude is not actually a variable but is stored in var_info like one - try: - totnum += len(self.meta_idx[meta_idx][var]) - except KeyError: - logger.warning( - f"Ignoring variable {var} in meta block {meta_idx} " - f"since no data could be found" - ) - - return (meta_matches, totnum) - - def filter_altitude(self, alt_range): - """Filter altitude range - - Parameters - ---------- - alt_range : list or tuple - 2-element list specifying altitude range to be filtered in m - - Returns - ------- - UngriddedData - filtered data object - """ - return self.filter_by_meta(altitude=alt_range) - - def filter_region( - self, region_id, check_mask=True, check_country_meta=False, **kwargs - ): - """Filter object by a certain region - - Parameters - ---------- - region_id : str - name of region (must be valid AeroCom region name or HTAP region) - check_mask : bool - if True and region_id a valid name for a binary mask, then the - filtering is done based on that binary mask. - check_country_meta : bool - if True, then the input region_id is first checked against - available country names in metadata. If that fails, it is assumed - that this regions is either a valid name for registered rectangular - regions or for available binary masks. - **kwargs - currently not used in method (makes usage in higher level classes - such as :class:`Filter` easier as other data objects have the - same method with possibly other input possibilities) - - Returns - ------- - UngriddedData - filtered data object (containing only stations that fall into - input region) - """ - if check_country_meta: - if region_id in self.countries_available: - return self.filter_by_meta(country=region_id) - - if region_id in const.HTAP_REGIONS and check_mask: - return self.apply_region_mask(region_id) - - region = Region(region_id) - return self.filter_by_meta( - longitude=region.lon_range, latitude=region.lat_range - ) - - def apply_region_mask(self, region_id=None): - """ - TODO : Write documentations - - Parameters - ---------- - region_id : str or list (of strings) - ID of region or IDs of multiple regions to be combined - """ - if not region_id in const.HTAP_REGIONS: - raise ValueError( - f"Invalid input for region_id: {region_id}, choose from: {const.HTAP_REGIONS}" - ) - - # 1. find matches -> list of meta indices that are in region - # 2. Get total number of datapoints -> defines shape of output UngriddedData - # 3. 
Create - - mask = load_region_mask_xr(region_id) - - meta_matches = [] - totnum = 0 - for meta_idx, meta in self.metadata.items(): - lon, lat = meta["longitude"], meta["latitude"] - - mask_val = get_mask_value(lat, lon, mask) - if mask_val >= 1: # coordinate is in mask - meta_matches.append(meta_idx) - for var in meta["var_info"]: - totnum += len(self.meta_idx[meta_idx][var]) - - new = self._new_from_meta_blocks(meta_matches, totnum) - time_str = datetime.now().strftime("%Y%m%d%H%M%S") - new.filter_hist[int(time_str)] = f"Applied mask {region_id}" - new._check_index() - return new - - def apply_filters(self, var_outlier_ranges=None, **filter_attributes): - """Extended filtering method - - Combines :func:`filter_by_meta` and adds option to also remove outliers - (keyword `remove_outliers`), set flagged data points to NaN (keyword - `set_flags_nan`) and to extract individual variables (keyword - `var_name`). - - Parameters - ---------- - var_outlier_ranges : dict, optional - dictionary specifying custom outlier ranges for individual - variables. - **filter_attributes : dict - filters that are supposed to be applied to the data. - To remove outliers, use keyword `remove_outliers`, to set flagged - values to NaN, use keyword `set_flags_nan`, to extract single or - multiple variables, use keyword `var_name`. Further filter keys - are assumed to be metadata specific and are passed to - :func:`filter_by_meta`. - - Returns - ------- - UngriddedData - filtered data object - """ - data = self - - remove_outliers = False - set_flags_nan = False - extract_vars = None - region_id = None - if "remove_outliers" in filter_attributes: - remove_outliers = filter_attributes.pop("remove_outliers") - if "set_flags_nan" in filter_attributes: - set_flags_nan = filter_attributes.pop("set_flags_nan") - if "var_name" in filter_attributes: - extract_vars = filter_attributes.pop("var_name") - if isinstance(extract_vars, str): - extract_vars = [extract_vars] - for var in extract_vars: - if not var in data.contains_vars: - raise VarNotAvailableError( - f"No such variable {var} in UngriddedData object. 
" - f"Available vars: {self.contains_vars}" - ) - if "region_id" in filter_attributes: - region_id = filter_attributes.pop("region_id") - - if len(filter_attributes) > 0: - data = data.filter_by_meta(**filter_attributes) - - if extract_vars is not None: - data = data.extract_vars(extract_vars) - - if remove_outliers: - if var_outlier_ranges is None: - var_outlier_ranges = {} - - for var in data.contains_vars: - lower, upper = ( - None, - None, - ) # uses pyaerocom default specified in variables.ini - if var in var_outlier_ranges: - lower, upper = var_outlier_ranges[var] - data = data.remove_outliers( - var, inplace=True, low=lower, high=upper, move_to_trash=False - ) - if set_flags_nan: - if not data.has_flag_data: - # jgriesfeller 20230210 - # not sure if raising this exception is the right thing to do - # the fake variables (vars computed from other variables) might not have - # and do not need flags (because that has been done during the read of the - # variable they are computed from) - # disabling and logging it for now - # raise MetaDataError( - logger.info( - 'Cannot apply filter "set_flags_nan" to ' - "UngriddedData object, since it does not " - "contain flag information" - ) - else: - data = data.set_flags_nan(inplace=True) - if region_id: - data = data.filter_region(region_id) - return data - - def filter_by_meta(self, negate=None, **filter_attributes): - """Flexible method to filter these data based on input meta specs - - Parameters - ---------- - negate : list or str, optional - specified meta key(s) provided via `filter_attributes` that are - supposed to be treated as 'not valid'. E.g. if - `station_name="bad_site"` is input in `filter_attributes` and if - `station_name` is listed in `negate`, then all metadata blocks - containing "bad_site" as station_name will be excluded in output - data object. - **filter_attributes - valid meta keywords that are supposed to be filtered and the - corresponding filter values (or value ranges) - Only valid meta keywords are considered (e.g. data_id, - longitude, latitude, altitude, ts_type) - - Returns - ------- - UngriddedData - filtered ungridded data object - - Raises - ------ - NotImplementedError - if attempt variables are supposed to be filtered (not yet possible) - IOError - if any of the input keys are not valid meta key - - Example - ------- - >>> import pyaerocom as pya - >>> r = pya.io.ReadUngridded(['AeronetSunV2Lev2.daily', - 'AeronetSunV3Lev2.daily'], 'od550aer') - >>> data = r.read() - >>> data_filtered = data.filter_by_meta(data_id='AeronetSunV2Lev2.daily', - ... longitude=[-30, 30], - ... latitude=[20, 70], - ... altitude=[0, 1000]) - """ - - if "variables" in filter_attributes: - raise NotImplementedError("Cannot yet filter by variables") - - # separate filters by strin, list, etc. 
- filters = self._init_meta_filters(**filter_attributes) - - # find all metadata blocks that match the filters - meta_matches, totnum_new = self._find_meta_matches( - negate, - *filters, - ) - if len(meta_matches) == len(self.metadata): - logger.info( - f"Input filters {filter_attributes} result in unchanged data object" - ) - return self - new = self._new_from_meta_blocks(meta_matches, totnum_new) - time_str = datetime.now().strftime("%Y%m%d%H%M%S") - new.filter_hist[int(time_str)] = filter_attributes - return new - - def _new_from_meta_blocks(self, meta_indices, totnum_new): - # make a new empty object with the right size (totnum_new) - - new = UngriddedData(num_points=totnum_new) - - meta_idx_new = 0.0 - data_idx_new = 0 - - # loop over old meta_idx and extract data and create new meta_idx in - # output data object - for meta_idx in meta_indices: - meta = self.metadata[meta_idx] - new.metadata[meta_idx_new] = meta - new.meta_idx[meta_idx_new] = {} - for var in meta["var_info"]: - if var in self.ALLOWED_VERT_COORD_TYPES: - continue - indices = self.meta_idx[meta_idx][var] - totnum = len(indices) - - stop = data_idx_new + totnum - - new._data[data_idx_new:stop, :] = self._data[indices, :] - new.meta_idx[meta_idx_new][var] = np.arange(data_idx_new, stop) - new.var_idx[var] = self.var_idx[var] - data_idx_new += totnum - - meta_idx_new += 1 - - if meta_idx_new == 0 or data_idx_new == 0: - raise DataExtractionError("Filtering results in empty data object") - new._data = new._data[:data_idx_new] - - # write history of filtering applied - new.filter_hist.update(self.filter_hist) - new.data_revision.update(self.data_revision) - - return new - - def clear_meta_no_data(self, inplace=True): - """Remove all metadata blocks that do not have data associated with it - - Parameters - ---------- - inplace : bool - if True, the changes are applied to this instance directly, else - to a copy - - Returns - ------- - UngriddedData - cleaned up data object - - Raises - ------ - DataCoverageError - if filtering results in empty data object - """ - if inplace: - obj = self - else: - obj = self.copy() - meta_new = {} - meta_idx_new = {} - for idx, val in obj.meta_idx.items(): - meta = obj.metadata[idx] - if not bool(val): # no data assigned with this metadata block - # sanity check - if bool(meta["var_info"]): - raise AttributeError( - "meta_idx {} suggests empty data block " - "but metadata[{}] contains variable " - "information" - ) - else: - meta_new[idx] = meta - meta_idx_new[idx] = val - num_removed = len(obj.metadata) - len(meta_new) - if not bool(meta_new): - raise DataCoverageError("UngriddedData object appears to be empty") - elif num_removed > 0: # some meta blocks are empty - obj.metadata = meta_new - obj.meta_idx = meta_idx_new - - obj._add_to_filter_history( - f"Removed {num_removed} metadata blocks that have no data assigned" - ) - obj._check_index() - return obj - - def extract_dataset(self, data_id): - """Extract single dataset into new instance of :class:`UngriddedData` - - Calls :func:`filter_by_meta`. 
- - Parameters - ----------- - data_id : str - ID of dataset - - Returns - ------- - UngriddedData - new instance of ungridded data containing only data from specified - input network - """ - logger.info(f"Extracting dataset {data_id} from data object") - return self.filter_by_meta(data_id=data_id) - - def extract_var(self, var_name, check_index=True): - """Split this object into single-var UngriddedData objects - - Parameters - ---------- - var_name : str - name of variable that is supposed to be extracted - check_index : Bool - Call :func:`_check_index` in the new data object. - - Returns - ------- - UngriddedData - new data object containing only input variable data - """ - if not var_name in self.contains_vars: - # try alias - _var = const.VARS[var_name].var_name_aerocom - if _var in self.contains_vars: - var_name = _var - else: - raise VarNotAvailableError(f"No such variable {var_name} in data") - elif len(self.contains_vars) == 1: - logger.info("Data object is already single variable. Returning copy") - return self.copy() - - var_idx = self.var_idx[var_name] - - totnum = np.sum(self._data[:, self._VARINDEX] == var_idx) - - colnum, rownum = self.shape - - if rownum != len(self._init_index()): - raise NotImplementedError( - "Cannot split UngriddedData objects that have " - "additional columns other than default columns" - ) - - subset = UngriddedData(totnum) - - subset.var_idx[var_name] = 0 - subset._index = self.index - - meta_idx = -1 - arr_idx = 0 - - for midx, didx in self.meta_idx.items(): - if var_name in didx and len(didx[var_name]) > 0: - meta_idx += 1 - meta = {} - _meta = self.metadata[midx] - meta.update(_meta) - meta["var_info"] = {} - meta["var_info"][var_name] = _meta["var_info"][var_name] - meta["variables"] = [var_name] - subset.metadata[meta_idx] = meta - - idx = didx[var_name] - - subset.meta_idx[meta_idx] = {} - - num_add = len(idx) - start = arr_idx - stop = arr_idx + num_add - subset.meta_idx[meta_idx][var_name] = np.arange(start, stop) - - subset._data[start:stop] = self._data[idx] - subset._data[start:stop, subset._METADATAKEYINDEX] = meta_idx - subset._data[start:stop, subset._VARINDEX] = 0 - - arr_idx += num_add - - if check_index: - subset._check_index() - subset.filter_hist.update(self.filter_hist) - subset._add_to_filter_history( - f"Created {var_name} single var object from multivar UngriddedData instance" - ) - return subset - - def extract_vars(self, var_names, check_index=True): - """Extract multiple variables from dataset - - Loops over input variable names and calls :func:`extract_var` to - retrieve single variable UngriddedData objects for each variable and - then merges all of these into one object - - Parameters - ---------- - var_names : list or str - list of variables to be extracted - check_index : Bool - Call :func:`_check_index` in the new data object. 
- - Returns - ------- - UngriddedData - new data object containing input variables - - Raises - ------- - VarNotAvailableError - if one of the input variables is not available in this data - object - """ - if isinstance(var_names, str): - return self.extract_var(var_names) - data = UngriddedData() - - for var in var_names: - data.append(self.extract_var(var, check_index=False)) - if check_index: - data._check_index() - return data - - def code_lat_lon_in_float(self): - """method to code lat and lon in a single number so that we can use np.unique to - determine single locations""" - - # multiply lons with 10 ** (three times the needed) precision and add the lats muliplied with 1E(precision) to it - self.coded_loc = self._data[:, self._LONINDEX] * 10 ** ( - 3 * self._LOCATION_PRECISION - ) + (self._data[:, self._LATINDEX] + self._LAT_OFFSET) * ( - 10**self._LOCATION_PRECISION - ) - return self.coded_loc - - def decode_lat_lon_from_float(self): - """method to decode lat and lon from a single number calculated by code_lat_lon_in_float""" - - lons = ( - np.trunc(self.coded_loc / 10 ** (2 * self._LOCATION_PRECISION)) - / 10**self._LOCATION_PRECISION - ) - lats = ( - self.coded_loc - - np.trunc(self.coded_loc / 10 ** (2 * self._LOCATION_PRECISION)) - * 10 ** (2 * self._LOCATION_PRECISION) - ) / (10**self._LOCATION_PRECISION) - self._LAT_OFFSET - - return lats, lons - - def _find_common_meta(self, ignore_keys=None): - """Searches all metadata dictionaries that are the same - - Parameters - ---------- - ignore_keys : list - list containing meta keys that are supposed to be ignored - - Returns - ------- - tuple - 2-element tuple containing - - - list containing lists with common meta indices - - list containing corresponding meta dictionaries - """ - if ignore_keys is None: - ignore_keys = [] - meta_registered = [] - same_indices = [] - for meta_key, meta in self.metadata.items(): - found = False - for idx, meta_reg in enumerate(meta_registered): - if same_meta_dict(meta_reg, meta, ignore_keys=ignore_keys): - same_indices[idx].append(meta_key) - found = True - - if not found: - meta_registered.append(meta) - same_indices.append([meta_key]) - - return same_indices - - def merge_common_meta(self, ignore_keys=None): - """Merge all meta entries that are the same - - Note - ---- - If there is an overlap in time between the data, the blocks are not - merged - - Todo - ---- - Keep mapping of ``var_info`` (if defined in ``metadata``) to data - points (e.g. EBAS), since the data sources may be at different - wavelengths. 
- - Parameters - ---------- - ignore_keys : list - list containing meta keys that are supposed to be ignored - - Returns - ------- - UngriddedData - merged data object - """ - if ignore_keys is None: - ignore_keys = [] - sh = self.shape - lst_meta_idx = self._find_common_meta(ignore_keys) - new = UngriddedData(num_points=self.shape[0]) - didx = 0 - for i, idx_lst in enumerate(lst_meta_idx): - _meta_check = {} - # write metadata of first index that matches - _meta_check.update(self.metadata[idx_lst[0]]) - _meta_idx_new = {} - for j, meta_idx in enumerate(idx_lst): - if j > 0: # don't check first against first - meta = self.metadata[meta_idx] - merged = merge_dicts(meta, _meta_check) - for key in ignore_keys: - _meta_check[key] = merged[key] - - data_var_idx = self.meta_idx[meta_idx] - for var, data_idx in data_var_idx.items(): - num = len(data_idx) - stop = didx + num - new._data[didx:stop, :] = self._data[data_idx] - new._data[didx:stop, 0] = i - if not var in _meta_idx_new: - _meta_idx_new[var] = np.arange(didx, stop) - else: - _idx = np.append(_meta_idx_new[var], np.arange(didx, stop)) - _meta_idx_new[var] = _idx - didx += num - - new.meta_idx[i] = _meta_idx_new - new.metadata[i] = _meta_check - new.var_idx.update(self.var_idx) - new.filter_hist.update(self.filter_hist) - if not new.shape == sh: - raise Exception( - "FATAL: Mismatch in shape between initial and " - "and final object. Developers: please check" - ) - return new - - def merge(self, other, new_obj=True): - """Merge another data object with this one - - Parameters - ----------- - other : UngriddedData - other data object - new_obj : bool - if True, this object remains unchanged and the merged data objects - are returned in a new instance of :class:`UngriddedData`. If False, - then this object is modified - - Returns - ------- - UngriddedData - merged data object - - Raises - ------- - ValueError - if input object is not an instance of :class:`UngriddedData` - """ - if not isinstance(other, UngriddedData): - raise ValueError( - f"Invalid input, need instance of UngriddedData, got: {type(other)}" - ) - if new_obj: - obj = self.copy() - else: - obj = self - - if obj.is_empty: - obj._data = other._data - obj.metadata = other.metadata - # obj.unit = other.unit - obj.data_revision = other.data_revision - obj.meta_idx = other.meta_idx - obj.var_idx = other.var_idx - else: - # get offset in metadata index - meta_offset = max(obj.metadata) + 1 - data_offset = obj.shape[0] - - # add this offset to indices of meta dictionary in input data object - for meta_idx_other, meta_other in other.metadata.items(): - meta_idx = meta_offset + meta_idx_other - obj.metadata[meta_idx] = meta_other - _idx_map = {} - for var_name, indices in other.meta_idx[meta_idx_other].items(): - _idx_map[var_name] = np.asarray(indices) + data_offset - obj.meta_idx[meta_idx] = _idx_map - - for var, idx in other.var_idx.items(): - if var in obj.var_idx: # variable already exists in this object - if not idx == obj.var_idx[var]: - other.change_var_idx(var, obj.var_idx[var]) - else: # variable does not yet exist - idx_exists = [v for v in obj.var_idx.values()] - if idx in idx_exists: - # variable index is already assigned to another - # variable and needs to be changed - new_idx = max(idx_exists) + 1 - other.change_var_idx(var, new_idx) - obj.var_idx[var] = new_idx - else: - obj.var_idx[var] = idx - obj._data = np.vstack([obj._data, other._data]) - obj.data_revision.update(other.data_revision) - obj.filter_hist.update(other.filter_hist) - obj._check_index() - return obj 
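
A brief, hypothetical sketch of combining two objects with the merge method above; data1 and data2 are assumed to be UngriddedData instances obtained from readers as in the filter_by_meta docstring example:

# data1, data2: UngriddedData instances, e.g. from two different networks
merged = data1.merge(data2, new_obj=True)   # data1 itself stays unchanged

# variable indices are reconciled during the merge, so variables from both
# inputs remain addressable in the merged object
print(merged.contains_vars)

# when both inputs already hold data, their data rows are stacked
print(merged.shape[0] == data1.shape[0] + data2.shape[0])
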
-
-    def colocate_vardata(
-        self, var1, data_id1=None, var2=None, data_id2=None, other=None, **kwargs
-    ):
-        if other is None:
-            other = self
-        if var2 is None:
-            var2 = var1
-        if data_id1 is None:
-            contains = self.contains_datasets
-            if len(contains) > 1:
-                raise ValueError(
-                    "Please provide data_id1 since data object contains more than 1 dataset..."
-                )
-            data_id1 = contains[0]
-
-        if data_id2 is None:
-            contains = other.contains_datasets
-            if len(contains) > 1:
-                raise ValueError(
-                    "Please provide data_id2 since data object contains more than 1 dataset..."
-                )
-            data_id2 = contains[0]
-        if self is other and data_id1 == data_id2 and var1 == var2:
-            raise ValueError(
-                "Input combination too unspecific, please provide "
-                "either another data object, 2 different data IDs "
-                "or 2 different variable names"
-            )
-        input_data = [(self, data_id1, var1), (other, data_id2, var2)]
-        statlist = combine_vardata_ungridded(input_data, **kwargs)
-
-        new = UngriddedData.from_station_data(statlist)
-        return new
-
-    def change_var_idx(self, var_name, new_idx):
-        """Change index that is assigned to variable
-
-        Each variable in this object has assigned a unique index that is
-        stored in the dictionary :attr:`var_idx` and which is used internally
-        to access data from a certain variable from the data array
-        :attr:`_data` (the indices are stored in the data column specified by
-        :attr:`_VARINDEX`, cf. class header).
-
-        This index thus needs to be unique for each variable and hence, may
-        need to be updated, when two instances of :class:`UngriddedData` are
-        merged (cf. :func:`merge`).
-
-        And the latter is exactrly what this function does.
-
-        Parameters
-        ----------
-        var_name : str
-            name of variable
-        new_idx : int
-            new index of variable
-
-        Raises
-        ------
-        ValueError
-            if input ``new_idx`` already exist in this object as a variable
-            index
-        """
-        if new_idx in self.var_idx.values():
-            raise ValueError(
-                "Fatal: variable index cannot be assigned a new "
-                "index that is already assigned to one of the "
-                "variables in this object"
-            )
-        cidx = self.var_idx[var_name]
-        self.var_idx[var_name] = new_idx
-        var_indices = np.where(self._data[:, self._VARINDEX] == cidx)
-        self._data[var_indices, self._VARINDEX] = new_idx
-
-    def append(self, other):
-        """Append other instance of :class:`UngriddedData` to this object
-
-        Note
-        ----
-        Calls :func:`merge(other, new_obj=False)`
-
-        Parameters
-        -----------
-        other : UngriddedData
-            other data object
-
-        Returns
-        -------
-        UngriddedData
-            merged data object
-
-        Raises
-        -------
-        ValueError
-            if input object is not an instance of :class:`UngriddedData`
-
-        """
-        return self.merge(other, new_obj=False)
-
-    def all_datapoints_var(self, var_name):
-        """Get array of all data values of input variable
-
-        Parameters
-        ----------
-        var_name : str
-            variable name
-
-        Returns
-        -------
-        ndarray
-            1-d numpy array containing all values of this variable
-
-        Raises
-        ------
-        AttributeError
-            if variable name is not available
-        """
-        if not var_name in self.var_idx:
-            raise AttributeError(f"Variable {var_name} not available in data")
-        idx = self.var_idx[var_name]
-        mask = np.where(self._data[:, self._VARINDEX] == idx)[0]
-        return self._data[mask, self._DATAINDEX]
-
-    def num_obs_var_valid(self, var_name):
-        """Number of valid observations of variable in this dataset
-
-        Parameters
-        ----------
-        var_name : str
-            name of variable
-
-        Returns
-        -------
-        int
-            number of valid observations (all values that are not NaN)
-        """
-        raise NotImplementedError("Coming soon")
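
Since num_obs_var_valid is left unimplemented above, a small hedged sketch of how the same count can be obtained through all_datapoints_var; `data` is assumed to be an UngriddedData instance containing the (illustrative) variable od550aer:

import numpy as np

vals = data.all_datapoints_var("od550aer")  # 1-d array of all od550aer values
valid = vals[~np.isnan(vals)]               # NaN marks missing or cleared points
print(f"{valid.size} valid observations, mean = {valid.mean():.3f}")
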
- - def find_common_stations( - self, - other: UngriddedData, - check_vars_available=None, - check_coordinates: bool = True, - max_diff_coords_km: float = 0.1, - ) -> dict: - """Search common stations between two UngriddedData objects - - This method loops over all stations that are stored within this - object (using :attr:`metadata`) and checks if the corresponding - station exists in a second instance of :class:`UngriddedData` that - is provided. The check is performed on basis of the station name, and - optionally, if desired, for each station name match, the lon lat - coordinates can be compared within a certain radius (defaul 0.1 km). - - Note - ---- - This is a beta version and thus, to be treated with care. - - Parameters - ---------- - other : UngriddedData - other object of ungridded data - check_vars_available : :obj:`list` (or similar), optional - list of variables that need to be available in stations of both - datasets - check_coordinates : bool - if True, check that lon and lat coordinates of station candidates - match within a certain range, specified by input parameter - ``max_diff_coords_km`` - - Returns - ------- - dict - dictionary where keys are meta_indices of the common station in - this object and corresponding values are meta indices of the - station in the other object - - """ - if len(self.contains_datasets) > 1: - raise NotImplementedError( - "This data object contains data from " - "more than one dataset and thus may " - "include multiple station matches for " - "each station ID. This method, however " - "is implemented such, that it checks " - "only the first match for each station" - ) - elif len(other.contains_datasets) > 1: - raise NotImplementedError( - "Other data object contains data from " - "more than one dataset and thus may " - "include multiple station matches for " - "each station ID. This method, however " - "is implemented such, that it checks " - "only the first match for each station" - ) - _check_vars = False - if check_vars_available is not None: - _check_vars = True - if isinstance(check_vars_available, str): - check_vars_available = [check_vars_available] - elif isinstance(check_vars_available, (tuple, np.ndarray)): - check_vars_available = list(check_vars_available) - if not isinstance(check_vars_available, list): - raise ValueError( - f"Invalid input for check_vars_available. 
" - f"Need str or list-like, got: {check_vars_available}" - ) - lat_len = 111.0 # approximate length of latitude degree in km - station_map = {} - stations_other = other.station_name - for meta_idx, meta in self.metadata.items(): - name = meta["station_name"] - # bool that is used to accelerate things - ok = True - if _check_vars: - for var in check_vars_available: - try: - if not var in meta["variables"]: - logger.debug( - f"No {var} in data of station {name} ({meta['data_id']})" - ) - ok = False - except Exception: # attribute does not exist or is not iterable - ok = False - if ok and name in stations_other: - for meta_idx_other, meta_other in other.metadata.items(): - if meta_other["station_name"] == name: - if _check_vars: - for var in check_vars_available: - try: - if not var in meta_other["variables"]: - logger.debug( - f"No {var} in data of station {name} ({meta_other['data_id']})" - ) - ok = False - except ( - Exception - ): # attribute does not exist or is not iterable - ok = False - if ok and check_coordinates: - dlat = abs(meta["latitude"] - meta_other["latitude"]) - dlon = abs(meta["longitude"] - meta_other["longitude"]) - lon_fac = np.cos(np.deg2rad(meta["latitude"])) - # compute distance between both station coords - dist = np.linalg.norm( - (dlat * lat_len, dlon * lat_len * lon_fac) - ) - if dist > max_diff_coords_km: - logger.warning( - f"Coordinate of station {name} " - f"varies more than {max_diff_coords_km} km " - f"between {meta['data_id']} and {meta_other['data_id']} data. " - f"Retrieved distance: {dist:.2f} km " - ) - ok = False - if ok: # match found - station_map[meta_idx] = meta_idx_other - logger.debug(f"Found station match {name}") - # no need to further iterate over the rest - continue - - return station_map - - # TODO: brute force at the moment, we need to rethink and define how to - # work with time intervals and perform temporal merging. - def find_common_data_points(self, other, var_name, sampling_freq="daily"): - if not sampling_freq == "daily": - raise NotImplementedError("Currently only works with daily data") - if not isinstance(other, UngriddedData): - raise NotImplementedError( - "So far, common data points can only be " - "retrieved between two instances of " - "UngriddedData" - ) - # find all stations that are common - common = self.find_common_stations( - other, check_vars_available=var_name, check_coordinates=True - ) - if len(common) == 0: - raise DataExtractionError("None of the stations in the two match") - dates = [] - data_this_match = [] - data_other_match = [] - - for idx_this, idx_other in common.items(): - data_idx_this = self.meta_idx[idx_this][var_name] - data_idx_other = other.meta_idx[idx_other][var_name] - - # timestamps of variable match for station... - dtimes_this = self._data[data_idx_this, self._TIMEINDEX] - dtimes_other = other._data[data_idx_other, other._TIMEINDEX] - # ... and corresponding data values of variable - data_this = self._data[data_idx_this, self._DATAINDEX] - data_other = other._data[data_idx_other, other._DATAINDEX] - # round to daily resolution. 
looks too complicated, but is much - # faster than pandas combined with datetime - date_nums_this = ( - dtimes_this.astype("datetime64[s]").astype("M8[D]").astype(int) - ) - date_nums_other = ( - dtimes_other.astype("datetime64[s]").astype("M8[D]").astype(int) - ) - - # TODO: loop over shorter array - for idx, datenum in enumerate(date_nums_this): - matches = np.where(date_nums_other == datenum)[0] - if len(matches) == 1: - dates.append(datenum) - data_this_match.append(data_this[idx]) - data_other_match.append(data_other[matches[0]]) - - return (dates, data_this_match, data_other_match) - - def _meta_to_lists(self): - meta = {k: [] for k in self.metadata[self.first_meta_idx]} - for meta_item in self.metadata.values(): - for k, v in meta.items(): - v.append(meta_item[k]) - return meta - - def plot_station_timeseries( - self, - station_name, - var_name, - start=None, - stop=None, - ts_type=None, - insert_nans=True, - ax=None, - **kwargs, - ): # pragma: no cover - """Plot time series of station and variable - - Parameters - ---------- - station_name : :obj:`str` or :obj:`int` - station name or index of station in metadata dict - var_name : str - name of variable to be retrieved - start - start time (optional) - stop - stop time (optional). If start time is provided and stop time not, - then only the corresponding year inferred from start time will be - considered - ts_type : :obj:`str`, optional - temporal resolution - - **kwargs - Addifional keyword args passed to method :func:`pandas.Series.plot` - - Returns - ------- - axes - matplotlib axes instance - - """ - if ax is None: - from pyaerocom.plot.config import FIGSIZE_DEFAULT - - fig, ax = plt.subplots(figsize=FIGSIZE_DEFAULT) - - stat = self.to_station_data( - station_name, - var_name, - start, - stop, - freq=ts_type, - merge_if_multi=True, - insert_nans=insert_nans, - ) - ax = stat.plot_timeseries(var_name, ax=ax, **kwargs) - return ax - - def plot_station_coordinates( - self, - var_name=None, - start=None, - stop=None, - ts_type=None, - color="r", - marker="o", - markersize=8, - fontsize_base=10, - legend=True, - add_title=True, - **kwargs, - ): # pragma: no cover - """Plot station coordinates on a map - - All input parameters are optional and may be used to add constraints - related to which stations are plotted. Default is all stations of all - times. - - Parameters - ---------- - - var_name : :obj:`str`, optional - name of variable to be retrieved - start - start time (optional) - stop - stop time (optional). If start time is provided and stop time not, - then only the corresponding year inferred from start time will be - considered - ts_type : :obj:`str`, optional - temporal resolution - color : str - color of stations on map - marker : str - marker type of stations - markersize : int - size of station markers - fontsize_base : int - basic fontsize - legend : bool - if True, legend is added - add_title : bool - if True, title will be added - **kwargs - Addifional keyword args passed to - :func:`pyaerocom.plot.plot_coordinates` - - Returns - ------- - axes - matplotlib axes instance - - """ - from pyaerocom.plot.plotcoordinates import plot_coordinates - - if len(self.contains_datasets) > 1: - logger.warning( - "UngriddedData object contains more than one " - "dataset ({}). Station coordinates will not be " - "distinguishable. 
You may want to apply a filter " - "first and plot them separately" - ) - - subset = self - if var_name is None: - info_str = "AllVars" - else: - if not isinstance(var_name, str): - raise ValueError( - "Can only handle single variable (or all -> input var_name=None)" - ) - elif not var_name in subset.contains_vars: - raise ValueError( - f"Input variable {var_name} is not available in dataset " - ) - info_str = var_name - - try: - info_str += f"_{start_stop_str(start, stop, ts_type)}" - except Exception: - info_str += "_AllTimes" - if ts_type is not None: - info_str += f"_{ts_type}" - - if all([x is None for x in (var_name, start, stop)]): # use all stations - all_meta = subset._meta_to_lists() - lons, lats = all_meta["longitude"], all_meta["latitude"] - - else: - stat_data = subset.to_station_data_all(var_name, start, stop, ts_type) - - if len(stat_data["stats"]) == 0: - raise DataCoverageError( - "No stations could be found for input specs (var, start, stop, freq)" - ) - lons = stat_data["longitude"] - lats = stat_data["latitude"] - if not "label" in kwargs: - kwargs["label"] = info_str - - ax = plot_coordinates( - lons, - lats, - color=color, - marker=marker, - markersize=markersize, - legend=legend, - **kwargs, - ) - - if "title" in kwargs: - title = kwargs["title"] - else: - title = info_str - if add_title: - ax.set_title(title, fontsize=fontsize_base + 4) - return ax - - def save_as(self, file_name, save_dir): - """ - Save this object to disk - - Note - ---- - So far, only storage as pickled object via - `CacheHandlerUngridded` is supported, so input file_name must end - with .pkl - - Parameters - ---------- - file_name : str - name of output file - save_dir : str - name of output directory - - Returns - ------- - str - file path - - """ - from pyaerocom.io.cachehandler_ungridded import CacheHandlerUngridded - - if not os.path.exists(save_dir): - raise FileNotFoundError(f"Directory does not exist: {save_dir}") - elif not file_name.endswith(".pkl"): - raise ValueError( - "Can only store files as pickle, file_name needs to have format .pkl" - ) - ch = CacheHandlerUngridded() - return ch.write(self, var_or_file_name=file_name, cache_dir=save_dir) - - @staticmethod - def from_cache(data_dir, file_name): - """ - Load pickled instance of `UngriddedData` - - Parameters - ---------- - data_dir : str - directory where pickled object is stored - file_name : str - file name of pickled object (needs to end with pkl) - - Raises - ------ - ValueError - if loading failed - - Returns - ------- - UngriddedData - loaded UngriddedData object. If this method is called from an - instance of `UngriddedData`, this instance remains unchanged. - You may merge the returned reloaded instance using - :func:`merge`. - - """ - from pyaerocom.io.cachehandler_ungridded import CacheHandlerUngridded - - ch = CacheHandlerUngridded() - if ch.check_and_load(file_name, cache_dir=data_dir): - return ch.loaded_data[file_name] - raise ValueError("Failed to load UngriddedData object") - - def __contains__(self, key): - """Check if input key (str) is valid dataset, variable, instrument or - station name - - Parameters - ---------- - key : str - search key - - Returns - ------- - bool - True, if key can be found, False if not - """ - - if not isinstance(key, str): - raise ValueError( - "Need string (e.g. 
variable name, station name, instrument name" - ) - if key in self.contains_datasets: - return True - elif key in self.contains_vars: - return True - elif key in self.station_name: - return True - elif key in self.contains_instruments: - return True - return False - - def __iter__(self): - return self - - #: ToDo revise cases of DataCoverageError - def __next__(self): - self._idx += 1 - if self._idx == len(self.metadata): - self._idx = -1 - raise StopIteration - try: - return self[self._idx] - except DataCoverageError: - logger.warning( - f"No variable data in metadata block {self._idx}. " - f"Returning empty StationData" - ) - return StationData() - - def __repr__(self): - return "{} >> from pyaerocom.io import ReadAeronetSdaV2 - >>> read = ReadAeronetSdaV2() - - >>> d0 = read.read(last_file=10) - >>> d1 = read.read(first_file=10, last_file=20) - - >>> merged = d0 & d1 - - >>> print(d0.shape, d1.shape, merged.shape) - (7326, 11) (9894, 11) (17220, 11) - """ - return self.merge(other, new_obj=True) - - def __str__(self): - head = f"Pyaerocom {type(self).__name__}" - s = ( - f"\n{head}\n{len(head) * '-'}" - f"\nContains networks: {self.contains_datasets}" - f"\nContains variables: {self.contains_vars}" - f"\nContains instruments: {self.contains_instruments}" - f"\nTotal no. of meta-blocks: {len(self.metadata)}" - ) - if self.is_filtered: - s += "\nFilters that were applied:" - for tstamp, f in self.filter_hist.items(): - if f: - s += f"\n Filter time log: {tstamp}" - if isinstance(f, dict): - for key, val in f.items(): - s += f"\n\t{key}: {val}" - else: - s += f"\n\t{f}" - - return s - - -def reduce_array_closest(arr_nominal, arr_to_be_reduced): - test = sorted(arr_to_be_reduced) - closest_idx = [] - for num in sorted(arr_nominal): - idx = np.argmin(abs(test - num)) - closest_idx.append(idx) - test = test[(idx + 1) :] - return closest_idx diff --git a/src/pyaro_readers/nilupmfebas/units_helpers.py b/src/pyaro_readers/nilupmfebas/units_helpers.py deleted file mode 100644 index 525ea77..0000000 --- a/src/pyaro_readers/nilupmfebas/units_helpers.py +++ /dev/null @@ -1,328 +0,0 @@ -import pandas as pd -from cf_units import Unit - -from .exceptions import UnitConversionError -from .time_config import SI_TO_TS_TYPE -from .tstype import TsType -from .variable_helpers import get_variable - -#: default frequency for rates variables (e.g. deposition, precip) -RATES_FREQ_DEFAULT = "d" - -# 1. DEFINITION OF ATOM and MOLECULAR MASSES - -# Atoms -M_O = 15.999 # u -M_S = 32.065 # u -M_N = 14.0067 # u -M_H = 1.00784 # u - -# Molecules -M_SO2 = M_S + 2 * M_O -M_SO4 = M_S + 4 * M_O - -M_NO2 = M_N + 2 * M_O -M_NO3 = M_N + 3 * M_O - -M_NH3 = M_N + 3 * M_H -M_NH4 = M_N + 4 * M_H - -# Unit conversion and custom units definitions - -# 2.1 Other conversion factors -HA_TO_SQM = 10000 # hectar to square metre. - -# 3. 
LOOKUP TABLE FOR CONVERSION FACTORS - -#: Custom unit conversion factors for certain variables -#: columns: variable -> from unit -> to_unit -> conversion -#: factor -UCONV_MUL_FACS = pd.DataFrame( - [ - # ["dryso4", "mg/m2/d", "mgS m-2 d-1", M_S / M_SO4], - # ["drynh4", "mg/m2/d", "mgN m-2 d-1", M_N/ M_NH4], - # ["concso4", "ug S/m3", "ug m-3", M_SO4 / M_S], - # ["SO4ugSm3", "ug/m3", "ug S m-3", M_S / M_SO4], - # ["concso4pm25", "ug S/m3", "ug m-3", M_SO4 / M_S], - # ["concso4pm10", "ug S/m3", "ug m-3", M_SO4 / M_S], - ["concso2", "ug S/m3", "ug m-3", M_SO2 / M_S], - ["concbc", "ug C/m3", "ug m-3", 1.0], - ["concoa", "ug C/m3", "ug m-3", 1.0], - ["concoc", "ug C/m3", "ug m-3", 1.0], - ["conctc", "ug C/m3", "ug m-3", 1.0], - # a little hacky for ratpm10pm25... - # ["ratpm10pm25", "ug m-3", "1", 1.0], - ["concpm25", "ug m-3", "1", 1.0], - ["concpm10", "ug m-3", "1", 1.0], - ["concno2", "ug N/m3", "ug m-3", M_NO2 / M_N], - # ["concno3", "ug N/m3", "ug m-3", M_NO3 / M_N], - ["concnh3", "ug N/m3", "ug m-3", M_NH3 / M_N], - # ["concnh4", "ug N/m3", "ug m-3", M_NH4 / M_N], - ["wetso4", "kg S/ha", "kg m-2", M_SO4 / M_S / HA_TO_SQM], - ["concso4pr", "mg S/L", "g m-3", M_SO4 / M_S], - ], - columns=["var_name", "from", "to", "fac"], -).set_index(["var_name", "from"]) - -# may be used to specify alternative names for custom units defined -# in UCONV_MUL_FACS - -UALIASES = { - # mass concentrations - "ug S m-3": "ug S/m3", - "ug C m-3": "ug C/m3", - "ug N m-3": "ug N/m3", - "ugC/m3": "ug C m-3", - # deposition rates (implicit) - ## sulphur species - "mgS/m2": "mg S m-2", - "mgSm-2": "mg S m-2", - ## nitrogen species - "mgN/m2": "mg N m-2", - "mgNm-2": "mg N m-2", - # deposition rates (explicit) - ## sulphur species - "mgS/m2/h": "mg S m-2 h-1", - "mg/m2/h": "mg m-2 h-1", - "mgS/m**2/h": "mg S m-2 h-1", - "mgSm-2h-1": "mg S m-2 h-1", - "mgSm**-2h-1": "mg S m-2 h-1", - "mgS/m2/d": "mg S m-2 d-1", - ## nitrogen species - "mgN/m2/h": "mg N m-2 h-1", - "mgN/m**2/h": "mg N m-2 h-1", - "mgNm-2h-1": "mg N m-2 h-1", - "mgNm**-2h-1": "mg N m-2 h-1", - "mgN/m2/d": "mg N m-2 d-1", - ## others - "MM/H": "mm h-1", - # others - "/m": "m-1", -} - - -def _check_unit_endswith_freq(unit): - """ - Check if input unit ends with an SI frequency string - - Considered SI base periods are defined in :attr:`SI_TO_TS_TYPE` (keys) - and accepted specifications as frequency in unit string are either via - "/" or "-1" (e.g. "/d" "d-1"). - - Parameters - ---------- - unit : str - unit to be checked - - Returns - ------- - bool - True if input unit ends with valid frequency string, else False - """ - if isinstance(unit, Unit): - unit = str(unit) - for si_unit in SI_TO_TS_TYPE: - if unit.endswith(f"/{si_unit}") or unit.endswith(f"{si_unit}-1"): - return True - return False - - -def rate_unit_implicit(unit): - """ - Check whether input rate unit is implicit - - Implicit rate units do not contain frequency string, e.g. "mg m-2" - instead of "mg m-2 d-1". Such units are, e.g. used in EMEP output where - the frequency corresponds to the output frequency, e.g. "mg m-2" per day if - output is daily. - - Note - ---- - For now, this is just a wrapper for :func:`_check_unit_endswith_freq`, - but there may be more sophisticated options in the future, which may be - added to this function. - - Parameters - ---------- - unit : str - unit to be tested - - Returns - ------- - bool - True if input unit appears to be implicit, else False. 
- - """ - return not _check_unit_endswith_freq(unit) - - -def _unit_conversion_fac_custom(var_name, from_unit): - """Get custom conversion factor for a certain unit - - Tries to determine custom conversion factor for a variable, relative to - that variables pyaerocom default unit. These are typically conversions - that cannot be handled by :mod:`cf_units` (e.g. if variable is `concno3` - which should be in units of "ug m-3" but is given in units of "ug N - m-3", that is, nitrogen mass and not molecular NO3 mass. Since such atomar - units are not supported by `cf_units` which is based on SI (it would think - the "N" is Newton), pyaerocom provides a simple interface to circumvent - these issues for such variables by providing explicit conversion factors to - convert from e.g. "ug N m-3" to "ug m-3", for affected variables, such as - concno3. - - Parameters - ---------- - var_name : str - name of variable for which factor is to be determined (needs to be - registered in global attr. :attr:`UCONV_MUL_FACS` - from_unit : str - input unit (e.g. "ug N m-3") - - Raises - ------ - UnitConversionError - if no or no unique unit conversion factor could be retrieved for - input from global attr. :attr:`UCONV_MUL_FACS` - - Returns - ------- - str - output unit - float - corresponding converison factor - """ - if from_unit in UALIASES: - from_unit = UALIASES[from_unit] - try: - info = UCONV_MUL_FACS.loc[(var_name, str(from_unit)), :] - if not isinstance(info, pd.Series): - raise UnitConversionError( - "FATAL: Could not find unique conversion factor in table " - "UCONV_MUL_FACS in units_helpers.py. Please check for " - "dulplicate entries" - ) - except KeyError: - raise UnitConversionError( - f"Failed to convert unit {from_unit} (variable {var_name}). " - f"Reason: no custom conversion factor could be inferred from table " - f"pyaerocom.units_helpers.UCONV_MUL_FACS" - ) - return (info.to, info.fac) - - -def _unit_conversion_fac_si(from_unit, to_unit): - """Retrieve multiplication factor for unit conversion - - Works only for standard units that are supported by :mod:`cf_units` - library. See also :func:`get_unit_conversion_factor` for more general - cases. - - Parameters - ---------- - from_unit : Unit or str - input unit - to_unit : Unit or str - output unit - - Raises - ------ - UnitConversionError - if units cannot be converted into each other using cf_units package - - Returns - -------- - float - multiplicative conversion factor - - - """ - if isinstance(from_unit, str): - from_unit = Unit(from_unit) - try: - return from_unit.convert(1, to_unit) - except ValueError: - raise UnitConversionError( - f"Failed to convert unit from {from_unit} to {to_unit}" - ) - - -def _get_unit_conversion_fac_helper(from_unit, to_unit, var_name=None): - """ - Helper for unit conversion - - Parameters - ---------- - from_unit : str - input unit - to_unit : str - output unit - var_name : str, optional - associated variable - - Returns - ------- - float - multiplication factor to convert data with input unit to output unit - (e.g. 1000 if input unit is kg and output unit g). - - """ - pre_conv_fac = 1.0 - if from_unit == to_unit: - # nothing to do - return 1.0 - elif var_name is not None and var_name in UCONV_MUL_FACS.index: - try: - from_unit, pre_conv_fac = _unit_conversion_fac_custom(var_name, from_unit) - except UnitConversionError: - # from_unit is likely not custom but standard... 
and if not - # call of unit_conversion_fac_si below will crash - pass - - return _unit_conversion_fac_si(from_unit, to_unit) * pre_conv_fac - - -def get_unit_conversion_fac(from_unit, to_unit, var_name=None, ts_type=None): - try: - return _get_unit_conversion_fac_helper(from_unit, to_unit, var_name) - except UnitConversionError: - if ( - ts_type is not None - and var_name is not None - and get_variable(var_name).is_rate - and rate_unit_implicit(from_unit) - ): - freq_si = TsType(ts_type).to_si() - from_unit = f"{from_unit} {freq_si}-1" - return _get_unit_conversion_fac_helper(from_unit, to_unit, var_name) - - raise UnitConversionError(f"failed to convert unit from {from_unit} to {to_unit}") - - -def convert_unit(data, from_unit, to_unit, var_name=None, ts_type=None): - """Convert unit of data - - Parameters - ---------- - data : np.ndarray or similar - input data - from_unit : cf_units.Unit or str - current unit of input data - to_unit : cf_units.Unit or str - new unit of input data - var_name : str, optional - name of variable. If provided, and standard conversion with - :mod:`cf_units` fails, then custom unit conversion is attempted. - ts_type : str, optional - frequency of data. May be needed for conversion of rate variables - such as precip, deposition, etc, that may be defined implictly - without proper frequency specification in the unit string. - - Returns - ------- - data - data in new unit - """ - conv_fac = get_unit_conversion_fac(from_unit, to_unit, var_name, ts_type) - if conv_fac != 1: - data *= conv_fac - return data diff --git a/src/pyaro_readers/nilupmfebas/utils.py b/src/pyaro_readers/nilupmfebas/utils.py deleted file mode 100644 index 38e6e42..0000000 --- a/src/pyaro_readers/nilupmfebas/utils.py +++ /dev/null @@ -1,75 +0,0 @@ -""" -High level I/O utility methods for pyaerocom -""" - -import const -from _logging import change_verbosity -from .aerocom_browser import AerocomBrowser -from .readgridded import ReadGridded -from .readungridded import ReadUngridded - - -def get_ungridded_reader(obs_id): - for reader in ReadUngridded.SUPPORTED_READERS: - if obs_id in reader.SUPPORTED_DATASETS: - return reader - raise ValueError(f"No ungridded reader found that supports {obs_id}") - - -def browse_database(model_or_obs, verbose=False): - """Browse Aerocom database using model or obs ID (or wildcard) - - Searches database for matches and prints information about all matches - found (e.g. available variables, years, etc.) 
- - Parameters - ---------- - model_or_obs : str - model or obs ID or search pattern - verbose : bool - if True, verbosity level will be set to debug, else to critical - - Returns - ------- - list - list with data_ids of all matches - - Example - ------- - >>> import pyaerocom as pya - >>> pya.io.browse_database('AATSR*ORAC*v4*') - Pyaerocom ReadGridded - --------------------- - Model ID: AATSR_ORAC_v4.01 - Data directory: /lustre/storeA/project/aerocom/aerocom-users-database/CCI-Aerosol/CCI_AEROSOL_Phase2/AATSR_ORAC_v4.01/renamed - Available variables: ['abs550aer', 'ang4487aer', 'clt', 'landseamask', 'od550aer', 'od550dust', 'od550gt1aer', 'od550lt1aer', 'pixelcount'] - Available years: [2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012] - Available time resolutions ['daily'] - - """ - if not verbose: - change_verbosity("critical") - else: - change_verbosity("debug") - browser = AerocomBrowser() - matches = browser.find_matches(model_or_obs) - if len(matches) == 0: - print(f"No match could be found for {model_or_obs}") - return - elif len(matches) > 20: - print( - f"Found more than 20 matches for input pattern {model_or_obs}:\n\n" - f"Matches: {matches}\n\n" - f"To receive more detailed information, please specify search ID more accurately" - ) - return - for match in matches: - try: - if match in const.OBS_IDS_UNGRIDDED: - reader = ReadUngridded(match) - else: - reader = ReadGridded(match) - print(reader) - except Exception as e: - print(f"Reading failed for {match}. Error: {repr(e)}") - return matches diff --git a/src/pyaro_readers/nilupmfebas/var_groups.py b/src/pyaro_readers/nilupmfebas/var_groups.py deleted file mode 100644 index c2ae3cf..0000000 --- a/src/pyaro_readers/nilupmfebas/var_groups.py +++ /dev/null @@ -1,25 +0,0 @@ -""" -Variable categorisation groups - -These are needed in some cases to infer, e.g. units associated with variable -names. Used in :class:`pyaerocom.variable.Variable` to identify certain groups. 
- -Note ------ -The below definitions are far from complete -""" - -#: start string of emission variables -emi_startswith = "emi" -#: start string of wet deposition variables -wetdep_startswith = "wet" -#: start string of dry deposition variables -drydep_startswith = "dry" -#: start string of total deposition variables -totdep_startswith = "dep" - -#: additional emission rate variables (that do not start with emi*) -emi_add_vars = [] - -#: additional deposition rate variables (that do not start with wet* or dry*) -dep_add_vars = [] diff --git a/src/pyaro_readers/nilupmfebas/varcollection.py b/src/pyaro_readers/nilupmfebas/varcollection.py deleted file mode 100644 index 96dfa78..0000000 --- a/src/pyaro_readers/nilupmfebas/varcollection.py +++ /dev/null @@ -1,213 +0,0 @@ -import fnmatch -import logging -import os - -from cf_units import Unit - -from .exceptions import VariableDefinitionError -from .variable import Variable -from .variable_helpers import parse_aliases_ini, parse_variables_ini - -logger = logging.getLogger(__name__) - - -class VarCollection: - """Variable access class based on variables.ini file""" - - def __init__(self, var_ini): - self._all_vars = None - self._var_ini = None - - self.var_ini = var_ini - - self._vars_added = {} - - self._cfg_parser = parse_variables_ini(var_ini) - self._alias_parser = parse_aliases_ini() - self._idx = -1 - - @property - def all_vars(self): - """List of all variables - - Note - ---- - Does not include variable names that may be inferred via - alias families as defined in section [alias_families] in - aliases.ini. - """ - if self._all_vars is None: - self._all_vars = list(self._cfg_parser) + list(self._vars_added) - return self._all_vars - - @property - def var_ini(self): - """Config file specifying variable information""" - return self._var_ini - - @var_ini.setter - def var_ini(self, var_ini): - if not isinstance(var_ini, str): - raise ValueError(f"Invalid input for var_ini, need str") - elif not os.path.exists(var_ini): - raise FileNotFoundError(f"File {var_ini} does not exist") - self._var_ini = var_ini - - def add_var(self, var): - """Add a new variable to this collection - - Minimum requirement for new variables are attributes var_name and - units. - - Parameters - ---------- - var : Variable - new variable definition - - Raises - ------ - VariableDefinitionError - if a variable is already defined under that name - - Returns - ------- - None - """ - if not isinstance(var.var_name, str): - raise ValueError("Attr. var_name needs to be assigned to input variable") - if var.var_name in self.all_vars: - raise VariableDefinitionError( - f"variable with name {var.var_name} is already defined" - ) - if not isinstance(var, Variable): - raise ValueError("Can only add instances of Variable class...") - if not isinstance(var.units, str): - if not isinstance(var.units, Unit): - raise ValueError("Please assign a unit to the new input variable") - var.units = str(var.units) - self._all_vars.append(var.var_name) - self._vars_added[var.var_name] = var - - def delete_variable(self, var_name: str) -> None: - """ - Remove input variable from this collection - - Parameters - ---------- - var_name : str - name of variable - - Raises - ------ - VariableDefinitionError - if variable does not exist or if it exists more than once. 
- - Returns - ------- - None - - """ - all_vars = self.all_vars - matches = [i for i, x in enumerate(all_vars) if x == var_name] - if len(matches) == 0: - raise VariableDefinitionError( - f"No such variable {var_name} in VarCollection" - ) - elif len(matches) > 1: - raise VariableDefinitionError( - f"FATAL: found multiple matches for variable {var_name} in " - f"VarCollection. Please check variables.ini" - ) - all_vars.pop(matches[0]) - self._all_vars == all_vars - if var_name in self._vars_added: - del self._vars_added[var_name] - - def get_var(self, var_name): - """ - Get variable based on variable name - - Parameters - ---------- - var_name : str - name of variable - - Raises - ------ - VariableDefinitionError - if no variable under input var_name is registered. - - Returns - ------- - Variable - Variable instance - - """ - if var_name in self._vars_added: - return self._vars_added[var_name] - var = Variable(var_name, cfg=self._cfg_parser) - if not var.var_name_aerocom in self: - raise VariableDefinitionError( - f"Error (VarCollection): input variable {var_name} is not supported" - ) - return var - - def find(self, search_pattern): - """Find all variables that match input search pattern - - Note - ---- - Searches for matches in variable names (:attr:`Variable.var_name`) and - standard name (:attr:`Variable.standard_name`). - - Parameters - ---------- - search_pattern : str - variable search pattern - - Returns - ------- - list - AeroCom variable names that match the search pattern - """ - matches = [] - for var in self: - if fnmatch.fnmatch(var.var_name, search_pattern): - matches.append(var.var_name) - elif isinstance(var.standard_name, str) and fnmatch.fnmatch( - var.standard_name, search_pattern - ): - matches.append(var.var_name) - return matches - - def __dir__(self): - """Activates auto tab-completion for all variables""" - return self.all_vars - - def __iter__(self): - return self - - def __next__(self): - self._idx += 1 - if self._idx == len(self.all_vars): - self._idx = -1 - raise StopIteration - var_name = self.all_vars[self._idx] - return self[var_name] - - def __contains__(self, var_name): - if var_name in self.all_vars: - return True - return False - - def __len__(self): - return len(self.all_vars) - - def __getitem__(self, var_name): - return self.get_var(var_name) - - def __repr__(self): - return f"VarCollection ({len(self)} entries)" - - def __str__(self): - return repr(self) diff --git a/src/pyaro_readers/nilupmfebas/variable.py b/src/pyaro_readers/nilupmfebas/variable.py deleted file mode 100644 index d226c78..0000000 --- a/src/pyaro_readers/nilupmfebas/variable.py +++ /dev/null @@ -1,597 +0,0 @@ -import logging -import warnings -from ast import literal_eval -from configparser import ConfigParser - -import numpy as np - -from . import var_groups -from ._lowlevel_helpers import dict_to_str, list_to_shortstr -from .exceptions import VariableDefinitionError -from .mathutils import make_binlist -from .obs_io import OBS_WAVELENGTH_TOL_NM - -#: helper vor checking if variable name contains str 3d or 3D -from .variable_helpers import ( - _check_alias_family, - _read_alias_ini, - get_aliases, - parse_aliases_ini, - parse_variables_ini, -) -from .varnameinfo import VarNameInfo - -logger = logging.getLogger(__name__) - - -class Variable: - """Interface that specifies default settings for a variable - - See `variables.ini `__ file for an overview of currently available - default variables. 
- - Parameters - ---------- - var_name : str - string ID of variable (see file variables.ini for valid IDs) - init : bool - if True, input variable name is attempted to be read from config file - cfg : ConfigParser - open config parser that holds the information in config file available - (i.e. :func:`ConfigParser.read` has been called with config file as - input) - **kwargs - any valid class attribute (e.g. map_vmin, map_vmax, ...) - - Attributes - ---------- - var_name : str - input variable name - var_name_aerocom : str - AEROCOM variable name (see e.g. `AEROCOM protocol - `__ for a list of - available variables) - is_3d : bool - flag that indicates if variable is 3D - is_dry : bool - flag that is set based on filename that indicates if variable data - corresponds to dry conditions. - units : str - unit of variable (None if no unit) - default_vert_code : str, optional - default vertical code to be loaded (i.e. Column, ModelLevel, Surface). - Only relevant during reading and in case conflicts occur (e.g. - abs550aer, 2010, Column and Surface files) - aliases : list - list of alternative names for this variable - minimum : float - lower limit of allowed value range - upper_limit : float - upper limit of allowed value range - obs_wavelength_tol_nm : float - wavelength tolerance (+/-) for reading of obsdata. Default is 10, i.e. - if this variable is defined at 550 nm and obsdata contains measured - values of this quantity within interval of 540 - 560, then these data - is used - scat_xlim : float - x-range for scatter plot - scat_ylim : float - y-range for scatter plot - scat_loglog : bool - scatter plot on loglog scale - scat_scale_factor : float - scale factor for scatter plot - map_cmap : str - name of default colormap (matplotlib) of this variable. - map_vmin : float - data value corresponding to lower end of colormap in map plots of this - quantity - map_vmax : float - data value corresponding to upper end of colormap in map plots of this - quantity - map_c_under : str - color used for values below :attr:`map_vmin` in map plots of this - quantity - map_c_over : str - color used for values exceeding :attr:`map_vmax` in map plots of this - quantity - map_cbar_levels : :obj:`list`, optional - levels of colorbar - map_cbar_ticks : :obj:`list`, optional - colorbar ticks - """ - - literal_eval_list = lambda val: list(literal_eval(val)) - str2list = lambda val: [x.strip() for x in val.split(",")] - str2bool = lambda val: val.lower() in ("true", "1", "t", "yes") - - _TYPE_CONV = { - "wavelength_nm": float, - "minimum": float, - "maximum": float, - "dimensions": str2list, - "obs_wavelength_tol_nm": float, - "scat_xlim": literal_eval_list, - "scat_ylim": literal_eval_list, - "scat_loglog": str2bool, - "scat_scale_factor": float, - "dry_rh_max": float, - "map_cmap": str, - "map_vmin": float, - "map_vmax": float, - "map_cbar_levels": literal_eval_list, - "map_cbar_ticks": literal_eval_list, - "_is_rate": bool, - } - - # maybe used in config - ALT_NAMES = {"unit": "units"} - - plot_info_keys = [ - "scat_xlim", - "scat_ylim", - "scat_loglog", - "scat_scale_factor", - "map_vmin", - "map_vmax", - "map_cmap", - "map_c_under", - "map_c_over", - "map_cbar_levels", - "map_cbar_ticks", - ] - VMIN_DEFAULT = -np.inf - VMAX_DEFAULT = np.inf - - @staticmethod - def _check_input_var_name(var_name): - if "3d" in var_name: - var_name = var_name.replace("3d", "") - elif "3D" in var_name: - var_name = var_name.replace("3D", "") - elif "_" in var_name: - raise ValueError( - f"invalid variable name {var_name}. 
Must not contain underscore" - ) - return var_name - - def __init__(self, var_name=None, init=True, cfg=None, **kwargs): - if var_name is None: - var_name = "od550aer" - elif not isinstance(var_name, str): - raise ValueError( - f"Invalid input for variable name, need str type, got {type(var_name)}" - ) - # save orig. input for whatever reason - self._var_name_input = var_name - - self.var_name = self._check_input_var_name(var_name) - self._var_name_aerocom = None - - self.standard_name = None - # Assume variables that have no unit specified in variables.ini are - # unitless. - self.units = "1" - self.default_vert_code = None - - self.wavelength_nm = None - self.dry_rh_max = 40 - self.dimensions = None - self.minimum = self.VMIN_DEFAULT - self.maximum = self.VMAX_DEFAULT - - self.description = None - self.comments_and_purpose = None - - # wavelength tolerance in nm - self.obs_wavelength_tol_nm = None - - self.scat_xlim = None - self.scat_ylim = None - self.scat_loglog = None - self.scat_scale_factor = 1.0 - - # settings for map plotting - self.map_cmap = "coolwarm" - self.map_vmin = None - self.map_vmax = None - self.map_c_under = None - self.map_c_over = "r" - self.map_cbar_levels = None - self.map_cbar_ticks = None - - self._is_rate = False - - # imports default information and, on top, variable information (if - # applicable) - if init: - self.parse_from_ini(self.var_name, cfg=cfg) - - self.update(**kwargs) - if self.obs_wavelength_tol_nm is None: - self.obs_wavelength_tol_nm = OBS_WAVELENGTH_TOL_NM - - @property - def var_name_aerocom(self): - """AeroCom variable name of the input variable""" - vna = self._var_name_aerocom - return self.var_name if vna is None else vna - - @property - def var_name_input(self): - """Input variable""" - return self._var_name_input - - @property - def is_3d(self): - """True if str '3d' is contained in :attr:`var_name_input`""" - return True if "3d" in self.var_name_input.lower() else False - - @property - def is_wavelength_dependent(self): - """Indicates whether this variable is wavelength dependent""" - return True if self.wavelength_nm is not None else False - - @property - def is_at_dry_conditions(self): - """Indicate whether variable denotes dry conditions""" - var_name = self.var_name_aerocom - if var_name.startswith("dry"): # dry deposition - return False - return True if "dry" in var_name else False - - @property - def is_deposition(self): - """ - Indicates whether input variables is a deposition rate - - Note - ---- - This funtion only identifies wet and dry deposition based on the variable - names, there might be other variables that are deposition variables but - cannot be identified by this function. - - Parameters - ---------- - var_name : str - Name of variable to be checked - - Returns - ------- - bool - If True, then variable name denotes a deposition variables - - """ - var_name = self.var_name_aerocom - if var_name.startswith(var_groups.drydep_startswith): - return True - elif var_name.startswith(var_groups.wetdep_startswith): - return True - elif var_name.startswith(var_groups.totdep_startswith): - return True - elif var_name in var_groups.dep_add_vars: - return True - return False - - @property - def is_emission(self): - """ - Indicates whether input variables is an emission rate - - Note - ---- - This funtion only identifies wet and dry deposition based on the variable - names, there might be other variables that are deposition variables but - cannot be identified by this function. 
- - Parameters - ---------- - var_name : str - Name of variable to be checked - - Returns - ------- - bool - If True, then variable name denotes a deposition variables - - """ - var_name = self.var_name_aerocom - if var_name.startswith(var_groups.emi_startswith): - return True - elif var_name in var_groups.emi_add_vars: - return True - return False - - @property - def is_rate(self): - """Indicates whether variable name is a rate - - Rates include e.g. deposition or emission rate variables but also - precipitation - - Returns - ------- - bool - True if variable is rate, else False - """ - if self.is_emission: - return True - elif self.is_deposition: - return True - elif self._is_rate: - return True - return False - - @property - def is_alias(self): - return True if self.var_name != self.var_name_aerocom else False - - @property - def unit(self): - """Unit of variable (old name, deprecated)""" - warnings.warn( - "Attr. name unit in Variable class is deprecated. Please use units instead", - DeprecationWarning, - stacklevel=2, - ) - return self.units - - @property - def plot_info(self): - """Dictionary containing plot information""" - d = {} - for k in self.plot_info_keys: - d[k] = self[k] - return d - - def update(self, **kwargs): - for key, val in kwargs.items(): - self[key] = val - - @property - def has_unit(self): - """Boolean specifying whether variable has unit""" - return True if not self.units in (1, None) else False - - @property - def lower_limit(self): - """Old attribute name for :attr:`minimum` (following HTAP2 defs)""" - warnings.warn( - "Old name for attribute minimum", DeprecationWarning, stacklevel=2 - ) - return self.minimum - - @property - def upper_limit(self): - """Old attribute name for :attr:`maximum` (following HTAP2 defs)""" - warnings.warn( - "Old name for attribute maximum", DeprecationWarning, stacklevel=2 - ) - return self.maximum - - @property - def unit_str(self): - """string representation of unit""" - if self.units is None: - return "" - else: - return f"[{self.units}]" - - @staticmethod - def read_config(): - return parse_variables_ini() - - @property - def var_name_info(self): - return VarNameInfo(self.var_name) - - @property - def aliases(self): - """Alias variable names that are frequently found or used - - Returns - ------- - list - list containing valid aliases - """ - return get_aliases(self.var_name) - - @property - def long_name(self): - """Wrapper for :attr:`description`""" - return self.description - - def keys(self): - return list(self.__dict__) - - @staticmethod - def _check_aliases(var_name): - ap = parse_aliases_ini() - aliases = _read_alias_ini(ap) - if var_name in aliases: - return aliases[var_name] - return _check_alias_family(var_name, ap) - - def get_default_vert_code(self): - """Get default vertical code for variable name""" - if self.default_vert_code is not None: - return self.default_vert_code - try: - return VarNameInfo(self.var_name_aerocom).get_default_vert_code() - except ValueError: - logger.warning( - f"default_vert_code not set for {self.var_name_aerocom} and " - f"could also not be inferred" - ) - return None - - def get_cmap(self): - """ - Get cmap str for var - - Returns - ------- - str - - """ - return self.map_cmap - - def _cmap_bins_from_vmin_vmax(self): - """ - Calculate cmap discretisation bins from :attr:`vmin` and :attr:`vmax` - - Sets value of :attr:`map_cbar_levels` - - Raises - ------ - AttributeError - if :attr:`vmin` and :attr:`vmax` are not defined - - """ - if self.minimum == self.VMIN_DEFAULT or self.maximum == 
self.VMAX_DEFAULT: - raise AttributeError( - f"need minimum and maximum to be specified " - f"for variable {self.var_name} in " - f"order to retrieve cmap_bins" - ) - self.map_cbar_levels = make_binlist(self.minimum, self.maximum) - - def get_cmap_bins(self, infer_if_missing=True): - """ - Get cmap discretisation bins - - Parameters - ---------- - infer_if_missing : bool - if True and :attr:`map_cbar_levels` is not defined, try to infer - using :func:`_cmap_bins_from_vmin_vmax`. - - Raises - ------ - AttributeError - if unavailable - - Returns - ------- - list - levels - - """ - if self.map_cbar_levels is None: - if infer_if_missing: - self._cmap_bins_from_vmin_vmax() - else: - raise AttributeError( - f"map_cbar_levels is not defined for variable {self.var_name}" - ) - return self.map_cbar_levels - - def parse_from_ini(self, var_name=None, cfg=None): - """Import information about default region - - Parameters - ---------- - var_name : str - variable name - var_name_alt : str - alternative variable name that is used if variable name is not - available - cfg : ConfigParser - open config parser object - - Returns - ------- - bool - True, if default could be loaded, False if not - """ - if cfg is None: - cfg = self.read_config() - elif not isinstance(cfg, ConfigParser): - raise ValueError( - f"invalid input for cfg, need config parser got {type(cfg)}" - ) - if not var_name in cfg: - try: - var_name = self._check_aliases(var_name) - except VariableDefinitionError: - logger.info(f"Unknown input variable {var_name}") - return - self._var_name_aerocom = var_name - - var_info = cfg[var_name] - # this variable should import settings from another variable - if "use" in var_info: - use = var_info["use"] - if not use in cfg: - raise VariableDefinitionError( - f"Input variable {var_name} depends on {use} " - f"which is not available in variables.ini." - ) - self.parse_from_ini(use, cfg) - - for key, val in var_info.items(): - if key in self.ALT_NAMES: - key = self.ALT_NAMES[key] - self._add(key, val) - - def _add(self, key, val): - if key in self._TYPE_CONV: - try: - val = self._TYPE_CONV[key](val) - except: - pass - elif key == "units" and val == "None": - val = "1" - if val == "None": - val = None - self[key] = val - - def __setitem__(self, key, val): - self.__dict__[key] = val - - def __getitem__(self, key): - return self.__dict__[key] - - def __repr__(self): - return ( - "{self.var_name}\nstandard_name: {self.standard_name}; Unit: {self.units}" - ) - - def __eq__(self, other): - if isinstance(other, str): - other = Variable(other) - elif not isinstance(other, Variable): - raise TypeError("Can only compare with str or other Variable instance") - return True if other.var_name_aerocom == self.var_name_aerocom else False - - def __str__(self): - head = f"Pyaerocom {type(self).__name__}" - s = f"\n{head}\n{len(head)*'-'}" - - plot_s = "\nPlotting settings\n......................" 
- - for k, v in self.__dict__.items(): - if k in self.plot_info_keys: - if v is None: - continue - if isinstance(v, dict): - plot_s += f"\n{k} (dict)" - plot_s += dict_to_str(v, indent=3, ignore_null=True) - elif isinstance(v, list): - plot_s += f"\n{k} (list, {len(v)} items)" - plot_s += list_to_shortstr(v) - else: - plot_s += f"\n{k}: {v}" - else: - if isinstance(v, dict): - s += f"\n{k} (dict)" - s += dict_to_str(v, indent=3, ignore_null=True) - elif isinstance(v, list): - s += f"\n{k} (list, {len(v)} items)" - s += list_to_shortstr(v) - else: - s += f"\n{k}: {v}" - - s += plot_s - return s diff --git a/src/pyaro_readers/nilupmfebas/variable_helpers.py b/src/pyaro_readers/nilupmfebas/variable_helpers.py deleted file mode 100644 index b7da13e..0000000 --- a/src/pyaro_readers/nilupmfebas/variable_helpers.py +++ /dev/null @@ -1,110 +0,0 @@ -from __future__ import annotations - -from configparser import ConfigParser -from pathlib import Path - -from . import resources -from .exceptions import VariableDefinitionError - - -def parse_variables_ini(fpath: str | Path | None = None): - """Returns instance of ConfigParser to access information""" - - if fpath is None: - with resources.path("pyaerocom.data", "variables.ini") as path: - fpath = path - - if isinstance(fpath, str): - fpath = Path(fpath) - if not fpath.exists(): - raise FileNotFoundError( - f"FATAL: variables.ini file could not be found at {fpath}" - ) - - parser = ConfigParser() - parser.read(fpath) - return parser - - -def parse_aliases_ini(): - """Returns instance of ConfigParser to access information""" - with resources.path("pyaerocom.data", "aliases.ini") as path: - fpath = path - - parser = ConfigParser() - parser.read(fpath) - return parser - - -def _read_alias_ini(parser: ConfigParser | None = None): - """Read all alias definitions from aliases.ini file and return as dict - - Returns - ------- - dict - keys are AEROCOM standard names of variable, values are corresponding - aliases - """ - if parser is None: - parser = parse_aliases_ini() - aliases = {} - items = parser["aliases"] - for var_name in items: - _aliases = [x.strip() for x in items[var_name].strip().split(",")] - for alias in _aliases: - aliases[alias] = var_name - for var_fam, alias_fam in parser["alias_families"].items(): - if "," in alias_fam: - raise Exception( - f"Found invalid definition of alias family {var_fam}: {alias_fam}. 
" - f"Only one family can be mapped to a variable name" - ) - return aliases - - -def get_aliases(var_name: str, parser: ConfigParser | None = None): - """Get aliases for a certain variable""" - if parser is None: - parser = ConfigParser() - with resources.path("pyaerocom.data", "aliases.ini") as path: - parser.read(path) - - info = parser["aliases"] - aliases = [] - if var_name in info: - aliases.extend([a.strip() for a in info[var_name].split(",")]) - for var_fam, alias_fam in parser["alias_families"].items(): - if var_name.startswith(var_fam): - alias = var_name.replace(var_fam, alias_fam) - aliases.append(alias) - return aliases - - -def _check_alias_family(var_name: str, parser: ConfigParser): - for var_fam, alias_fam in parser["alias_families"].items(): - if var_name.startswith(alias_fam): - var_name_aerocom = var_name.replace(alias_fam, var_fam) - return var_name_aerocom - raise VariableDefinitionError( - "Input variable could not be identified as " - "belonging to either of the available alias " - "variable families" - ) - - -def get_variable(var_name: str): - """ - Get a certain variable - - Parameters - ---------- - var_name : str - variable name - - Returns - ------- - Variable - """ - from pyaerocom import const - - return const.VARS[var_name] diff --git a/src/pyaro_readers/nilupmfebas/varnameinfo.py b/src/pyaro_readers/nilupmfebas/varnameinfo.py deleted file mode 100644 index 654bc6a..0000000 --- a/src/pyaro_readers/nilupmfebas/varnameinfo.py +++ /dev/null @@ -1,144 +0,0 @@ -import fnmatch -import re - -from .exceptions import VariableDefinitionError - - -class VarNameInfo: - """This class can be used to retrieve information from variable names""" - - #: valid number range for retrieval of wavelengths from variable name - _VALID_WVL_RANGE = [0.1, 10000] # nm - - #: valid variable families for wavelength retrievals - _VALID_WVL_IDS = ["od", "abs", "ec", "sc", "ac", "bsc", "ssa"] - - PATTERNS = {"od": r"od\d+aer"} - DEFAULT_VERT_CODE_PATTERNS = { - "abs*": "Column", - "od*": "Column", - "ang*": "Column", - "load*": "Column", - "wet*": "Surface", - "dry*": "Surface", - "emi*": "Surface", - } - - def __init__(self, var_name): - self.var_name = var_name - self._nums = [] - try: - self._nums = self._numbers_in_string(var_name) - except Exception: - pass - - def get_default_vert_code(self): - """Get default vertical code for variable name""" - for pattern, code in self.DEFAULT_VERT_CODE_PATTERNS.items(): - if fnmatch.fnmatch(self.var_name, pattern): - return code - raise ValueError(f"No default vertical code could be found for {self.var_name}") - - @staticmethod - def _numbers_in_string(input_str): - """Get list of all numbers in input str - - Parameters - ---------- - input_str : str - string to be checked - - Returns - ------- - list - list of numbers that were found in input string - """ - return [int(x) for x in re.findall(r"\d+", input_str)] - - @property - def contains_numbers(self): - """Boolean specifying whether this variable name contains numbers""" - if len(self._nums) > 0: - return True - return False - - @property - def is_wavelength_dependent(self): - """Boolean specifying whether this variable name is wavelength dependent""" - for item in self._VALID_WVL_IDS: - if self.var_name.startswith(item): - return True - return False - - @property - def contains_wavelength_nm(self): - """Boolean specifying whether this variable contains a certain wavelength""" - if not self.contains_numbers: - return False - low, high = self._VALID_WVL_RANGE - if self._nums and low <= 
self._nums[0] <= high: - return True - return False - - @property - def wavelength_nm(self): - """Wavelength in nm (if appliable)""" - if not self.is_wavelength_dependent: - raise VariableDefinitionError( - f"Variable {self.var_name} is not wavelength " - f"dependent (does not start with either of {self._VALID_WVL_IDS})" - ) - - elif not self.contains_wavelength_nm: - raise VariableDefinitionError( - "Wavelength could not be extracted from variable name" - ) - return self._nums[0] - - def in_wavelength_range(self, low, high): - """Boolean specifying whether variable is within wavelength range - - Parameters - ---------- - low : float - lower end of wavelength range to be tested - high : float - upper end of wavelength range to be tested - - Returns - ------- - bool - True, if this variable is wavelength dependent and if the - wavelength that is inferred from the filename is within the - specified input range - """ - return low <= self.wavelength <= high - - def translate_to_wavelength(self, to_wavelength): - """Create new variable name at a different wavelength - - Parameters - ---------- - to_wavelength : float - new wavelength in nm - - Returns - ------- - VarNameInfo - new variable name - """ - if not self.contains_wavelength_nm: - raise ValueError(f"Variable {self.var_name} is not wavelength dependent") - name = self.var_name.replace(str(self.wavelength_nm), str(to_wavelength)) - return VarNameInfo(name) - - def __str__(self): - s = ( - f"\nVariable {self.var_name}\n" - f"is_wavelength_dependent: {self.is_wavelength_dependent}\n" - ) - if hasattr(self, "is_optical_density"): # pragma: no cover - s += f"is_optical_density: {self.is_optical_density}\n" # can't find situation where this happens however not sure if depricated - if self.is_wavelength_dependent: - s += f"\nwavelength_nm: {self.wavelength_nm}" - return s
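
The VarNameInfo class deleted above encodes how wavelengths and default vertical codes are inferred from AeroCom-style variable names. A minimal usage sketch follows, assuming the module were still importable (this patch removes it); the import path and the example values are illustrative only, not part of the patch.

    # Hypothetical import path; varnameinfo.py is deleted by this patch.
    from pyaro_readers.nilupmfebas.varnameinfo import VarNameInfo

    info = VarNameInfo("od550aer")
    info.is_wavelength_dependent        # True: name starts with "od"
    info.wavelength_nm                  # 550, parsed from the digits in the name
    info.get_default_vert_code()        # "Column": matches the "od*" pattern
    info.translate_to_wavelength(440)   # new VarNameInfo for "od440aer"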