from typing import TYPE_CHECKING, Optional

if TYPE_CHECKING:
    import fsspec
    import pandas as pd


def parse_grib_idx(
    fs: "fsspec.AbstractFileSystem",
    *,
    basename: str,
    suffix: str = "idx",
    tstamp: Optional["pd.Timestamp"] = None,
    validate: bool = False,
) -> "pd.DataFrame":
    """
    Standalone method used to extract metadata from a grib2 idx file(text) from NODD.

    The function reads the idx file that sits next to a grib file, extracts the
    metadata known as attrs (variable with level and forecast time) from each idx
    entry and converts it into a pandas DataFrame. The dataframe is later used to
    build the one-to-one mapping to the grib file metadata.

    Both the grib file size and the idx file contents are obtained through ``fs``,
    so the function works on any filesystem that ``fs`` can reach.

    Parameters
    ----------
    fs : fsspec.AbstractFileSystem
        The file system to read from.
    basename : str
        The base name is the full path to the grib file.
    suffix : str
        The suffix is the ending for the idx file.
    tstamp : Optional[pd.Timestamp]
        The timestamp to use for when the data was indexed. Defaults to
        ``pd.Timestamp.now()`` when not given.
    validate : bool
        Whether to raise if the metadata table has duplicate attrs.

    Returns
    -------
    pandas.DataFrame : The data frame containing the results, indexed by ``idx``
        with the columns ``offset``, ``date``, ``attrs``, ``length``, ``idx_uri``,
        ``grib_uri`` and ``indexed_at``.

    Raises
    ------
    ValueError
        If the idx file cannot be read or parsed, or if ``validate`` is set and
        the ``attrs`` column contains duplicates.
    """
    import pandas as pd

    fname = f"{basename}.{suffix}"

    # The size of the grib file closes the byte range of the last message.
    baseinfo = fs.info(basename)

    try:
        # Read through ``fs`` so the idx file comes from the same filesystem
        # (credentials, protocol, caching) as the grib file itself.
        with fs.open(fname) as f:
            # Keep only the first six ":"-separated fields of every entry.
            result = pd.read_csv(f, sep=":", header=None).loc[:, :5]
        result.columns = ["idx", "offset", "date", "attrs", "level", "forecast"]
        result["attrs"] = (
            result["attrs"] + ":" + result["level"] + ":" + result["forecast"]
        )
        result = result.drop(columns=["level", "forecast"])
    except Exception as e:
        raise ValueError(f"Could not parse {fname}") from e

    result = result.assign(
        # Each message runs up to the next offset; the last one up to end of file.
        length=(
            result.offset.shift(periods=-1, fill_value=baseinfo["size"]) - result.offset
        ),
        idx_uri=fname,
        grib_uri=basename,
        indexed_at=tstamp if tstamp is not None else pd.Timestamp.now(),
    )

    if validate and not result["attrs"].is_unique:
        raise ValueError(f"Attribute mapping for grib file {basename} is not unique")

    return result.set_index("idx")