-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Use xarray-beam to append derived Replay variables (#24)
* this works for really small stuff * some debugging stuff and storage options kwargs * this worked for 1 degree dataset * oops * num_threads on chunkstozarr capable, but has no effect * set input chunks to 127, full column... finally quarter degree is possible * submitted subsampled quarter degree append job * note about missing fields * geopotential verified * used this to append 1degree and 1/4 degree static variables * cleanup * typo
- Loading branch information
Showing
8 changed files
with
689 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
"""Compute geopotential from Replay dataset, | ||
append it back to the original or store it locally depending on inputs | ||
Note: this was heavily borrowed from this xarray-beam example: | ||
https://github.com/google/xarray-beam/blob/main/examples/era5_climatology.py | ||
""" | ||
|
||
from typing import Tuple | ||
|
||
from absl import app | ||
from absl import flags | ||
import logging | ||
import apache_beam as beam | ||
from apache_beam.options.pipeline_options import PipelineOptions | ||
from apache_beam.runners.dask.dask_runner import DaskRunner | ||
import numpy as np | ||
import xarray as xr | ||
import xarray_beam as xbeam | ||
|
||
from ufs2arco import Layers2Pressure | ||
from verify_geopotential import setup_log | ||
from localzarr import ChunksToZarr | ||
|
||
|
||
# Command-line flags (absl). Each DEFINE_* call returns a holder whose
# ``.value`` is read inside ``main`` after absl has parsed argv.
INPUT_PATH = flags.DEFINE_string('input_path', None, help='Input Zarr path')
OUTPUT_PATH = flags.DEFINE_string('output_path', None, help='Output Zarr path')
# Forwarded to ``beam.Pipeline(runner=...)``.
RUNNER = flags.DEFINE_string('runner', "DirectRunner", 'beam.runners.Runner')
# When set, only the first ``time_length`` time slices of the input are processed.
TIME_LENGTH = flags.DEFINE_integer('time_length', None, help="Number of time slices to use for debugging")
# When set, passed as ``direct_num_workers`` in the pipeline options.
NUM_WORKERS = flags.DEFINE_integer('num_workers', None, help="Number of workers for the runner")
# Passed as ``num_threads`` to both ``xbeam.DatasetToChunks`` and ``ChunksToZarr``.
NUM_THREADS = flags.DEFINE_integer('num_threads', None, help="Passed to ChunksToZarr")
|
||
def calc_geopotential(
    key: xbeam.Key,
    xds: xr.Dataset,
) -> Tuple[xbeam.Key, xr.Dataset]:
    """Compute geopotential for one chunk of the Replay dataset.

    Args:
        key: xarray-beam key identifying the chunk; returned unchanged.
        xds: Chunk dataset. Must contain ``hgtsfc`` and ``delz`` and use
            ``pfull`` as the name of the vertical coordinate/dimension.

    Returns:
        The unchanged ``key`` and a new dataset whose only data variable is
        ``geopotential``, with the vertical name restored to ``pfull``.
    """
    lp = Layers2Pressure()

    # Layers2Pressure is called with the vertical axis named "level"
    # (presumably the name it expects -- confirm against ufs2arco); the
    # original name is restored on the result below.
    xds = xds.rename({"pfull": "level"})

    # NOTE: the layer-mean pressure that used to be computed here was never
    # used, so it has been dropped; only hgtsfc and delz are needed.
    newds = xr.Dataset()
    newds["geopotential"] = lp.calc_geopotential(xds["hgtsfc"], xds["delz"])
    newds = newds.rename({"level": "pfull"})
    return key, newds
|
||
def main(argv):
    """Run the Beam pipeline that computes geopotential and writes it to Zarr.

    Reads the Zarr store at ``--input_path``, computes ``geopotential`` chunk
    by chunk with ``calc_geopotential``, and hands the chunks to
    ``ChunksToZarr`` targeting ``--output_path``.

    Args:
        argv: Remaining command-line arguments from ``absl.app.run``; forwarded
            to ``beam.Pipeline``.

    Raises:
        app.UsageError: If ``--input_path`` or ``--output_path`` is not given.
    """
    setup_log()

    path = INPUT_PATH.value
    output_path = OUTPUT_PATH.value
    # Fail early with a readable message instead of a TypeError from
    # ``"gs://" in None`` further down.
    if path is None:
        raise app.UsageError("--input_path is required")
    if output_path is None:
        raise app.UsageError("--output_path is required")

    # Bucket inputs are read anonymously.
    kwargs = {}
    if "gs://" in path or "gcs://" in path:
        kwargs["storage_options"] = {"token": "anon"}

    source_dataset, source_chunks = xbeam.open_zarr(path, **kwargs)
    source_dataset = source_dataset.drop_vars(["cftime", "ftime"])
    if TIME_LENGTH.value is not None:
        # Debugging aid: restrict to the first N time slices.
        source_dataset = source_dataset.isel(time=slice(TIME_LENGTH.value))

    # create template: geopotential has the same dims/shape as "tmp"
    tds = source_dataset[["tmp"]].rename({"tmp": "geopotential"})
    tds["geopotential"].attrs = {
        "units": "m**2 / s**2",
        "description": "Diagnosed using ufs2arco.Layers2Pressure.calc_geopotential",
        # Units are m**2 / s**2, i.e. geopotential, not geopotential height.
        "long_name": "geopotential",
    }

    dims = tds["geopotential"].dims
    # Input chunks use a fixed size of 127 along pfull (presumably the whole
    # vertical column -- confirm against the dataset); output chunks follow
    # the source store's chunking.
    input_chunks = {k: source_chunks[k] if k != "pfull" else 127 for k in dims}
    output_chunks = {k: source_chunks[k] for k in dims}

    template = xbeam.make_template(tds)

    # Writing to a bucket needs credentials; accept both URL spellings, as
    # is done for the input path above.
    # NOTE(review): the service-account path is hard-coded to one user's
    # environment -- consider exposing it as a flag.
    storage_options = None
    if "gs://" in output_path or "gcs://" in output_path:
        storage_options = {"token": "/contrib/Tim.Smith/.gcs/replay-service-account.json"}

    pipeline_kwargs = {}
    if NUM_WORKERS.value is not None:
        pipeline_kwargs["options"] = PipelineOptions(
            direct_num_workers=NUM_WORKERS.value,
        )

    with beam.Pipeline(runner=RUNNER.value, argv=argv, **pipeline_kwargs) as root:
        (
            root
            | xbeam.DatasetToChunks(source_dataset, input_chunks, num_threads=NUM_THREADS.value)
            | beam.MapTuple(calc_geopotential)
            | ChunksToZarr(
                output_path,
                template,
                output_chunks,
                num_threads=NUM_THREADS.value,
                storage_options=storage_options,
            )
        )

    logging.info("Done")
|
||
# Script entry point: absl parses the command-line flags, then calls ``main``.
if __name__ == "__main__":
    app.run(main)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
"""Compute static variables surface orography and land/sea mask, | ||
append it back to the original or store it locally depending on inputs | ||
Note: this was heavily borrowed from this xarray-beam example: | ||
https://github.com/google/xarray-beam/blob/main/examples/era5_climatology.py | ||
""" | ||
|
||
from typing import Tuple | ||
|
||
from absl import app | ||
from absl import flags | ||
import logging | ||
import apache_beam as beam | ||
from apache_beam.options.pipeline_options import PipelineOptions | ||
from apache_beam.runners.dask.dask_runner import DaskRunner | ||
import numpy as np | ||
import xarray as xr | ||
import xarray_beam as xbeam | ||
|
||
from ufs2arco import Layers2Pressure | ||
from verify_geopotential import setup_log | ||
from localzarr import ChunksToZarr | ||
|
||
|
||
# Command-line flags (absl). Each DEFINE_* call returns a holder whose
# ``.value`` is read inside ``main`` after absl has parsed argv.
INPUT_PATH = flags.DEFINE_string('input_path', None, help='Input Zarr path')
OUTPUT_PATH = flags.DEFINE_string('output_path', None, help='Output Zarr path')
# Forwarded to ``beam.Pipeline(runner=...)``.
RUNNER = flags.DEFINE_string('runner', "DirectRunner", 'beam.runners.Runner')
# When set, passed as ``direct_num_workers`` in the pipeline options.
NUM_WORKERS = flags.DEFINE_integer('num_workers', None, help="Number of workers for the runner")
# Passed as ``num_threads`` to both ``xbeam.DatasetToChunks`` and ``ChunksToZarr``.
NUM_THREADS = flags.DEFINE_integer('num_threads', None, help="Passed to ChunksToZarr")
|
||
def calc_static_vars(
    key: xbeam.Key,
    xds: xr.Dataset,
) -> Tuple[xbeam.Key, xr.Dataset]:
    """Build the static surface orography and land mask for one chunk.

    Args:
        key: xarray-beam key identifying the chunk; returned unchanged.
        xds: Chunk dataset containing ``hgtsfc`` and ``land``. If either
            variable has a ``time`` dimension, its first time slice is used.

    Returns:
        The unchanged ``key`` and a new dataset with two variables:
        ``hgtsfc_static`` (``hgtsfc`` with its attributes copied) and
        ``land_static`` (int32; 1 where ``land == 1``, otherwise 0). Any
        ``time``, ``cftime``, ``ftime`` or ``pfull`` variables/coordinates
        are dropped from the result.
    """
    # Test membership against ``.dims``: ``"time" in data_array`` looks at the
    # array's *values*, so it would never detect a time dimension.
    hgtsfc = xds["hgtsfc"]
    if "time" in hgtsfc.dims:
        hgtsfc = hgtsfc.isel(time=0)

    land = xds["land"]
    if "time" in land.dims:
        land = land.isel(time=0)

    newds = xr.Dataset()
    newds["hgtsfc_static"] = hgtsfc
    newds["land_static"] = xr.where(
        land == 1,
        1,
        0,
    ).astype(np.int32)

    newds["hgtsfc_static"].attrs = xds["hgtsfc"].attrs.copy()
    newds["land_static"].attrs = {
        "long_name": "static land-sea/ice mask",
        "description": "1 = land, 0 = not land",
    }

    # Strip leftover time/vertical coordinates so the result is purely static.
    for k in ["time", "cftime", "ftime", "pfull"]:
        if k in newds:
            newds = newds.drop_vars(k)
    return key, newds
|
||
def main(argv):
    """Run the Beam pipeline that writes the static orography and land mask.

    Reads ``hgtsfc`` and ``land`` at the first time index of the Zarr store
    at ``--input_path``, converts them with ``calc_static_vars``, and hands
    the chunks to ``ChunksToZarr`` targeting ``--output_path``.

    Args:
        argv: Remaining command-line arguments from ``absl.app.run``; forwarded
            to ``beam.Pipeline``.

    Raises:
        app.UsageError: If ``--input_path`` or ``--output_path`` is not given.
    """
    setup_log()

    path = INPUT_PATH.value
    output_path = OUTPUT_PATH.value
    # Fail early with a readable message instead of a TypeError from
    # ``"gs://" in None`` further down.
    if path is None:
        raise app.UsageError("--input_path is required")
    if output_path is None:
        raise app.UsageError("--output_path is required")

    # Bucket inputs are read anonymously.
    kwargs = {}
    if "gs://" in path or "gcs://" in path:
        kwargs["storage_options"] = {"token": "anon"}

    source_dataset, source_chunks = xbeam.open_zarr(path, **kwargs)
    source_dataset = source_dataset[["hgtsfc", "land"]].isel(time=0)

    # The static fields have no time or vertical axis: drop those
    # coordinates from the dataset and the matching chunk-size entries.
    for key in ["time", "cftime", "ftime", "pfull"]:
        if key in source_dataset:
            source_dataset = source_dataset.drop_vars(key)
        if key in source_chunks:
            source_chunks.pop(key)

    # create template by running the transform once on the (lazy) source
    _, tds = calc_static_vars(None, source_dataset)
    output_chunks = {k: v for k, v in source_chunks.items() if k not in ("pfull", "time")}
    input_chunks = output_chunks.copy()

    template = xbeam.make_template(tds)

    # Writing to a bucket needs credentials; accept both URL spellings, as
    # is done for the input path above.
    # NOTE(review): the service-account path is hard-coded to one user's
    # environment -- consider exposing it as a flag.
    storage_options = None
    if "gs://" in output_path or "gcs://" in output_path:
        storage_options = {"token": "/contrib/Tim.Smith/.gcs/replay-service-account.json"}

    pipeline_kwargs = {}
    if NUM_WORKERS.value is not None:
        pipeline_kwargs["options"] = PipelineOptions(
            direct_num_workers=NUM_WORKERS.value,
        )

    with beam.Pipeline(runner=RUNNER.value, argv=argv, **pipeline_kwargs) as root:
        (
            root
            | xbeam.DatasetToChunks(source_dataset, input_chunks, num_threads=NUM_THREADS.value)
            | beam.MapTuple(calc_static_vars)
            | ChunksToZarr(
                output_path,
                template,
                output_chunks,
                num_threads=NUM_THREADS.value,
                storage_options=storage_options,
            )
        )

    logging.info("Done")
|
||
# Script entry point: absl parses the command-line flags, then calls ``main``.
if __name__ == "__main__":
    app.run(main)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.