Skip to content

Commit 6bfbaed

Browse files
authored
more tutorial refactoring (#5074)
* split out open_rasterio again * remove engine from the explicit signature * fix the lists of available datasets * explicitly pass the temporary cache_dir * use open_rasterio instead of open_dataset * update the description of "tiny" * generate a API page for tutorial.open_rasterio [skip-ci] * add a typespec alias for path-like * use tutorial.open_rasterio instead of downloading manually * back to downloading manually * update whats-new.rst * add the shade dataset * add a description for RGB.byte * reference a tag to make having dead links less likely * [test-upstream] * rename to arr because open_rasterio returns DataArray objects * try to fix the docs [skip-ci] * fix the links [skip-ci] * add descriptions for all except the era5 grib file [skip-ci] * also add a description of the ERA5 data * move the credits to the bottom of the list of available datasets [skip-ci] * adjust the log level of pooch's logger
1 parent 24c357f commit 6bfbaed

File tree

7 files changed

+119
-57
lines changed

7 files changed

+119
-57
lines changed

doc/api.rst

+1
Original file line numberDiff line numberDiff line change
@@ -861,6 +861,7 @@ Tutorial
861861
:toctree: generated/
862862

863863
tutorial.open_dataset
864+
tutorial.open_rasterio
864865
tutorial.load_dataset
865866

866867
Testing

doc/conf.py

+1
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,7 @@
125125
"callable": ":py:func:`callable`",
126126
"dict_like": ":term:`dict-like <mapping>`",
127127
"dict-like": ":term:`dict-like <mapping>`",
128+
"path-like": ":term:`path-like <path-like object>`",
128129
"mapping": ":term:`mapping`",
129130
"file-like": ":term:`file-like <file-like object>`",
130131
# special terms

doc/examples/visualization_gallery.ipynb

+2-2
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@
209209
"metadata": {},
210210
"outputs": [],
211211
"source": [
212-
"da = xr.tutorial.open_dataset(\"RGB.byte\").data\n",
212+
"da = xr.tutorial.open_rasterio(\"RGB.byte\")\n",
213213
"\n",
214214
"# The data is in UTM projection. We have to set it manually until\n",
215215
"# https://github.com/SciTools/cartopy/issues/813 is implemented\n",
@@ -245,7 +245,7 @@
245245
"from rasterio.warp import transform\n",
246246
"import numpy as np\n",
247247
"\n",
248-
"da = xr.tutorial.open_dataset(\"RGB.byte\").data\n",
248+
"da = xr.tutorial.open_rasterio(\"RGB.byte\")\n",
249249
"\n",
250250
"# Compute the lon/lat coordinates with rasterio.warp.transform\n",
251251
"ny, nx = len(da['y']), len(da['x'])\n",

doc/whats-new.rst

+5-1
Original file line numberDiff line numberDiff line change
@@ -64,11 +64,15 @@ New Features
6464
:py:class:`~core.groupby.DataArrayGroupBy`, inspired by pandas'
6565
:py:meth:`~pandas.core.groupby.GroupBy.get_group`.
6666
By `Deepak Cherian <https://github.com/dcherian>`_.
67+
- Switch the tutorial functions to use `pooch <https://github.com/fatiando/pooch>`_
68+
(which is now an optional dependency) and add :py:func:`tutorial.open_rasterio` as a
69+
way to open example rasterio files (:issue:`3986`, :pull:`4102`, :pull:`5074`).
70+
By `Justus Magin <https://github.com/keewis>`_.
6771
- Add typing information to unary and binary arithmetic operators operating on
6872
:py:class:`~core.dataset.Dataset`, :py:class:`~core.dataarray.DataArray`,
6973
:py:class:`~core.variable.Variable`, :py:class:`~core.groupby.DatasetGroupBy` or
7074
:py:class:`~core.groupby.DataArrayGroupBy` (:pull:`4904`).
71-
By `Richard Kleijn <https://github.com/rhkleijn>`_ .
75+
By `Richard Kleijn <https://github.com/rhkleijn>`_.
7276
- Add a ``combine_attrs`` parameter to :py:func:`open_mfdataset` (:pull:`4971`).
7377
By `Justus Magin <https://github.com/keewis>`_.
7478
- Enable passing arrays with a subset of dimensions to

xarray/backends/rasterio_.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ def open_rasterio(filename, parse_coordinates=None, chunks=None, cache=None, loc
176176
177177
>>> from affine import Affine
178178
>>> da = xr.open_rasterio(
179-
... "https://github.com/mapbox/rasterio/raw/master/tests/data/RGB.byte.tif"
179+
... "https://github.com/mapbox/rasterio/raw/1.2.1/tests/data/RGB.byte.tif"
180180
... )
181181
>>> da
182182
<xarray.DataArray (band: 3, y: 718, x: 791)>

xarray/tests/test_tutorial.py

+17-14
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
import os
2-
31
import pytest
42

53
from xarray import DataArray, tutorial
@@ -13,25 +11,30 @@ class TestLoadDataset:
1311
def setUp(self):
1412
self.testfile = "tiny"
1513

16-
def test_download_from_github(self, tmp_path, monkeypatch):
17-
monkeypatch.setenv("XDG_CACHE_DIR", os.fspath(tmp_path))
18-
19-
ds = tutorial.open_dataset(self.testfile).load()
14+
def test_download_from_github(self, tmp_path):
15+
cache_dir = tmp_path / tutorial._default_cache_dir_name
16+
ds = tutorial.open_dataset(self.testfile, cache_dir=cache_dir).load()
2017
tiny = DataArray(range(5), name="tiny").to_dataset()
2118
assert_identical(ds, tiny)
2219

2320
def test_download_from_github_load_without_cache(self, tmp_path, monkeypatch):
24-
monkeypatch.setenv("XDG_CACHE_DIR", os.fspath(tmp_path))
21+
cache_dir = tmp_path / tutorial._default_cache_dir_name
2522

26-
ds_nocache = tutorial.open_dataset(self.testfile, cache=False).load()
27-
ds_cache = tutorial.open_dataset(self.testfile).load()
23+
ds_nocache = tutorial.open_dataset(
24+
self.testfile, cache=False, cache_dir=cache_dir
25+
).load()
26+
ds_cache = tutorial.open_dataset(self.testfile, cache_dir=cache_dir).load()
2827
assert_identical(ds_cache, ds_nocache)
2928

3029
def test_download_rasterio_from_github_load_without_cache(
3130
self, tmp_path, monkeypatch
3231
):
33-
monkeypatch.setenv("XDG_CACHE_DIR", os.fspath(tmp_path))
34-
35-
ds_nocache = tutorial.open_dataset("RGB.byte", cache=False).load()
36-
ds_cache = tutorial.open_dataset("RGB.byte", cache=True).load()
37-
assert_identical(ds_cache, ds_nocache)
32+
cache_dir = tmp_path / tutorial._default_cache_dir_name
33+
34+
arr_nocache = tutorial.open_rasterio(
35+
"RGB.byte", cache=False, cache_dir=cache_dir
36+
).load()
37+
arr_cache = tutorial.open_rasterio(
38+
"RGB.byte", cache=True, cache_dir=cache_dir
39+
).load()
40+
assert_identical(arr_cache, arr_nocache)

xarray/tutorial.py

+92-39
Original file line numberDiff line numberDiff line change
@@ -11,37 +11,36 @@
1111
import numpy as np
1212

1313
from .backends.api import open_dataset as _open_dataset
14-
from .backends.rasterio_ import open_rasterio
14+
from .backends.rasterio_ import open_rasterio as _open_rasterio
1515
from .core.dataarray import DataArray
1616
from .core.dataset import Dataset
1717

18-
19-
def _open_rasterio(path, engine=None, **kwargs):
20-
data = open_rasterio(path, **kwargs)
21-
name = data.name if data.name is not None else "data"
22-
return data.to_dataset(name=name)
23-
24-
2518
_default_cache_dir_name = "xarray_tutorial_data"
2619
base_url = "https://github.com/pydata/xarray-data"
2720
version = "master"
2821

2922

30-
external_urls = {
31-
"RGB.byte": (
32-
"rasterio",
33-
"https://github.com/mapbox/rasterio/raw/master/tests/data/RGB.byte.tif",
34-
),
35-
}
36-
overrides = {
37-
"rasterio": _open_rasterio,
23+
def _construct_cache_dir(path):
24+
import pooch
25+
26+
if isinstance(path, pathlib.Path):
27+
path = os.fspath(path)
28+
elif path is None:
29+
path = pooch.os_cache(_default_cache_dir_name)
30+
31+
return path
32+
33+
34+
external_urls = {} # type: dict
35+
external_rasterio_urls = {
36+
"RGB.byte": "https://github.com/mapbox/rasterio/raw/1.2.1/tests/data/RGB.byte.tif",
37+
"shade": "https://github.com/mapbox/rasterio/raw/1.2.1/tests/data/shade.tif",
3838
}
3939

4040

4141
# idea borrowed from Seaborn
4242
def open_dataset(
4343
name,
44-
engine=None,
4544
cache=True,
4645
cache_dir=None,
4746
**kws,
@@ -51,31 +50,27 @@ def open_dataset(
5150
5251
If a local copy is found then always use that to avoid network traffic.
5352
53+
Available datasets:
54+
55+
* ``"air_temperature"``: NCEP reanalysis subset
56+
* ``"rasm"``: Output of the Regional Arctic System Model (RASM)
57+
* ``"ROMS_example"``: Regional Ocean Model System (ROMS) output
58+
* ``"tiny"``: small synthetic dataset with a 1D data variable
59+
* ``"era5-2mt-2019-03-uk.grib"``: ERA5 temperature data over the UK
60+
* ``"eraint_uvz"``: data from ERA-Interim reanalysis, monthly averages of upper level data
61+
5462
Parameters
5563
----------
5664
name : str
5765
Name of the file containing the dataset.
5866
e.g. 'air_temperature'
59-
engine : str, optional
60-
The engine to use.
6167
cache_dir : path-like, optional
6268
The directory in which to search for and write cached data.
6369
cache : bool, optional
6470
If True, then cache data locally for use on subsequent calls
6571
**kws : dict, optional
6672
Passed to xarray.open_dataset
6773
68-
Notes
69-
-----
70-
Available datasets:
71-
72-
* ``"air_temperature"``
73-
* ``"rasm"``
74-
* ``"ROMS_example"``
75-
* ``"tiny"``
76-
* ``"era5-2mt-2019-03-uk.grib"``
77-
* ``"RGB.byte"``: example rasterio file from https://github.com/mapbox/rasterio
78-
7974
See Also
8075
--------
8176
xarray.open_dataset
@@ -85,15 +80,12 @@ def open_dataset(
8580
except ImportError:
8681
raise ImportError("using the tutorial data requires pooch")
8782

88-
if isinstance(cache_dir, pathlib.Path):
89-
cache_dir = os.fspath(cache_dir)
90-
elif cache_dir is None:
91-
cache_dir = pooch.os_cache(_default_cache_dir_name)
83+
logger = pooch.get_logger()
84+
logger.setLevel("WARNING")
9285

86+
cache_dir = _construct_cache_dir(cache_dir)
9387
if name in external_urls:
94-
engine_, url = external_urls[name]
95-
if engine is None:
96-
engine = engine_
88+
url = external_urls[name]
9789
else:
9890
# process the name
9991
default_extension = ".nc"
@@ -103,17 +95,78 @@ def open_dataset(
10395

10496
url = f"{base_url}/raw/{version}/{path.name}"
10597

106-
_open = overrides.get(engine, _open_dataset)
10798
# retrieve the file
10899
filepath = pooch.retrieve(url=url, known_hash=None, path=cache_dir)
109-
ds = _open(filepath, engine=engine, **kws)
100+
ds = _open_dataset(filepath, **kws)
110101
if not cache:
111102
ds = ds.load()
112103
pathlib.Path(filepath).unlink()
113104

114105
return ds
115106

116107

108+
def open_rasterio(
109+
name,
110+
engine=None,
111+
cache=True,
112+
cache_dir=None,
113+
**kws,
114+
):
115+
"""
116+
Open a rasterio dataset from the online repository (requires internet).
117+
118+
If a local copy is found then always use that to avoid network traffic.
119+
120+
Available datasets:
121+
122+
* ``"RGB.byte"``: TIFF file derived from USGS Landsat 7 ETM imagery.
123+
* ``"shade"``: TIFF file derived from USGS SRTM 90 data
124+
125+
``RGB.byte`` and ``shade`` are downloaded from the ``rasterio`` repository [1]_.
126+
127+
Parameters
128+
----------
129+
name : str
130+
Name of the file containing the dataset.
131+
e.g. 'RGB.byte'
132+
cache_dir : path-like, optional
133+
The directory in which to search for and write cached data.
134+
cache : bool, optional
135+
If True, then cache data locally for use on subsequent calls
136+
**kws : dict, optional
137+
Passed to xarray.open_rasterio
138+
139+
See Also
140+
--------
141+
xarray.open_rasterio
142+
143+
References
144+
----------
145+
.. [1] https://github.com/mapbox/rasterio
146+
"""
147+
try:
148+
import pooch
149+
except ImportError:
150+
raise ImportError("using the tutorial data requires pooch")
151+
152+
logger = pooch.get_logger()
153+
logger.setLevel("WARNING")
154+
155+
cache_dir = _construct_cache_dir(cache_dir)
156+
url = external_rasterio_urls.get(name)
157+
if url is None:
158+
raise ValueError(f"unknown rasterio dataset: {name}")
159+
160+
# retrieve the file
161+
filepath = pooch.retrieve(url=url, known_hash=None, path=cache_dir)
162+
arr = _open_rasterio(filepath, **kws)
163+
if not cache:
164+
arr = arr.load()
165+
pathlib.Path(filepath).unlink()
166+
167+
return arr
168+
169+
117170
def load_dataset(*args, **kwargs):
118171
"""
119172
Open, load into memory, and close a dataset from the online repository

0 commit comments

Comments
 (0)