diff --git a/.github/ISSUE_TEMPLATE/bugreport.yml b/.github/ISSUE_TEMPLATE/bugreport.yml index cc1a2e12be3..5bd7efd12f1 100644 --- a/.github/ISSUE_TEMPLATE/bugreport.yml +++ b/.github/ISSUE_TEMPLATE/bugreport.yml @@ -37,7 +37,7 @@ body: Please confirm that the bug report is in an excellent state, so we can understand & fix it quickly & efficiently. For more details, check out: - [Minimal Complete Verifiable Examples](https://stackoverflow.com/help/mcve) - - [Craft Minimal Bug Reports](http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) + - [Craft Minimal Bug Reports](https://matthewrocklin.com/minimal-bug-reports) options: - label: Minimal example — the example is as focused as reasonably possible to demonstrate the underlying issue in xarray. diff --git a/.github/config.yml b/.github/config.yml index c64c2e28e59..d11c3b7bea7 100644 --- a/.github/config.yml +++ b/.github/config.yml @@ -14,7 +14,7 @@ newIssueWelcomeComment: > newPRWelcomeComment: > Thank you for opening this pull request! It may take us a few days to respond here, so thank you for being patient. - If you have questions, some answers may be found in our [contributing guidelines](http://docs.xarray.dev/en/stable/contributing.html). + If you have questions, some answers may be found in our [contributing guidelines](https://docs.xarray.dev/en/stable/contributing.html). # Comment to be posted to on pull requests merged by a first time user firstPRMergeComment: > diff --git a/.github/workflows/benchmarks-last-release.yml b/.github/workflows/benchmarks-last-release.yml index 1df14dcc50c..5e36613368d 100644 --- a/.github/workflows/benchmarks-last-release.yml +++ b/.github/workflows/benchmarks-last-release.yml @@ -24,7 +24,7 @@ jobs: - name: Set up conda environment uses: mamba-org/setup-micromamba@v2 with: - micromamba-version: '2.0.2-2' + micromamba-version: '1.5.10-0' environment-file: ${{env.CONDA_ENV_FILE}} environment-name: xarray-tests cache-environment: true diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 9daf042bd5e..e2cde27532d 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -27,7 +27,7 @@ jobs: - name: Set up conda environment uses: mamba-org/setup-micromamba@v2 with: - micromamba-version: '2.0.2-2' + micromamba-version: '1.5.10-0' environment-file: ${{env.CONDA_ENV_FILE}} environment-name: xarray-tests cache-environment: true @@ -36,7 +36,7 @@ jobs: create-args: >- asv python-build - mamba + mamba<=1.5.10 - name: Run benchmarks diff --git a/.github/workflows/ci-additional.yaml b/.github/workflows/ci-additional.yaml index d4a0141a673..9ef315f505c 100644 --- a/.github/workflows/ci-additional.yaml +++ b/.github/workflows/ci-additional.yaml @@ -14,7 +14,6 @@ concurrency: env: FORCE_COLOR: 3 - MICROMAMBA_VERSION: "2.0.2-2" jobs: detect-ci-trigger: @@ -58,7 +57,6 @@ jobs: - name: Setup micromamba uses: mamba-org/setup-micromamba@v2 with: - micromamba-version: ${{env.MICROMAMBA_VERSION}} environment-file: ${{env.CONDA_ENV_FILE}} environment-name: xarray-tests create-args: >- @@ -104,7 +102,6 @@ jobs: - name: Setup micromamba uses: mamba-org/setup-micromamba@v2 with: - micromamba-version: ${{env.MICROMAMBA_VERSION}} environment-file: ${{env.CONDA_ENV_FILE}} environment-name: xarray-tests create-args: >- @@ -156,7 +153,6 @@ jobs: - name: Setup micromamba uses: mamba-org/setup-micromamba@v2 with: - micromamba-version: ${{env.MICROMAMBA_VERSION}} environment-file: ${{env.CONDA_ENV_FILE}} environment-name: xarray-tests create-args: >- @@ 
-213,7 +209,6 @@ jobs: - name: Setup micromamba uses: mamba-org/setup-micromamba@v2 with: - micromamba-version: ${{env.MICROMAMBA_VERSION}} environment-file: ${{env.CONDA_ENV_FILE}} environment-name: xarray-tests create-args: >- @@ -270,7 +265,6 @@ jobs: - name: Setup micromamba uses: mamba-org/setup-micromamba@v2 with: - micromamba-version: ${{env.MICROMAMBA_VERSION}} environment-file: ${{env.CONDA_ENV_FILE}} environment-name: xarray-tests create-args: >- @@ -304,12 +298,14 @@ jobs: name: Minimum Version Policy runs-on: "ubuntu-latest" needs: detect-ci-trigger - # disabled until `conda` is compatible with the new `libmambapy` - if: false && needs.detect-ci-trigger.outputs.triggered == 'false' + if: needs.detect-ci-trigger.outputs.triggered == 'false' defaults: run: shell: bash -l {0} + env: + COLUMNS: 120 + steps: - uses: actions/checkout@v4 with: @@ -318,18 +314,20 @@ jobs: - name: Setup micromamba uses: mamba-org/setup-micromamba@v2 with: - micromamba-version: ${{env.MICROMAMBA_VERSION}} environment-name: xarray-tests create-args: >- python=3.12 pyyaml python-dateutil - libmambapy + cytoolz + rich + rich-click + py-rattler - name: All-deps minimum versions policy run: | - python ci/min_deps_check.py ci/requirements/min-all-deps.yml + python ci/minimum_versions.py ci/requirements/min-all-deps.yml - name: Bare minimum versions policy run: | - python ci/min_deps_check.py ci/requirements/bare-minimum.yml + python ci/minimum_versions.py ci/requirements/bare-minimum.yml diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 1a08b4c5903..fc6d9d0e0b8 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -14,7 +14,6 @@ concurrency: env: FORCE_COLOR: 3 - micromamba_version: 2 jobs: detect-ci-trigger: @@ -111,7 +110,6 @@ jobs: - name: Setup micromamba uses: mamba-org/setup-micromamba@v2 with: - micromamba-version: '2.0.2-2' environment-file: ${{ env.CONDA_ENV_FILE }} environment-name: xarray-tests cache-environment: true diff --git a/.github/workflows/hypothesis.yaml b/.github/workflows/hypothesis.yaml index b90c2f26634..2a904c06824 100644 --- a/.github/workflows/hypothesis.yaml +++ b/.github/workflows/hypothesis.yaml @@ -63,7 +63,6 @@ jobs: - name: Setup micromamba uses: mamba-org/setup-micromamba@v2 with: - micromamba-version: "2.0.2-2" environment-file: ci/requirements/environment.yml environment-name: xarray-tests create-args: >- diff --git a/.github/workflows/pypi-release.yaml b/.github/workflows/pypi-release.yaml index bb75f68aacd..decb8ff3ba3 100644 --- a/.github/workflows/pypi-release.yaml +++ b/.github/workflows/pypi-release.yaml @@ -88,7 +88,7 @@ jobs: path: dist - name: Publish package to TestPyPI if: github.event_name == 'push' - uses: pypa/gh-action-pypi-publish@v1.11.0 + uses: pypa/gh-action-pypi-publish@v1.12.2 with: repository_url: https://test.pypi.org/legacy/ verbose: true @@ -111,6 +111,6 @@ jobs: name: releases path: dist - name: Publish package to PyPI - uses: pypa/gh-action-pypi-publish@v1.11.0 + uses: pypa/gh-action-pypi-publish@v1.12.2 with: verbose: true diff --git a/.github/workflows/upstream-dev-ci.yaml b/.github/workflows/upstream-dev-ci.yaml index 1d2aebab930..db8bd981e51 100644 --- a/.github/workflows/upstream-dev-ci.yaml +++ b/.github/workflows/upstream-dev-ci.yaml @@ -17,7 +17,6 @@ concurrency: env: FORCE_COLOR: 3 - MICROMAMBA_VERSION: "2.0.2-2" jobs: detect-ci-trigger: @@ -64,7 +63,6 @@ jobs: - name: Set up conda environment uses: mamba-org/setup-micromamba@v2 with: - micromamba-version: ${{env.MICROMAMBA_VERSION}} 
environment-file: ci/requirements/environment.yml environment-name: xarray-tests create-args: >- @@ -121,7 +119,6 @@ jobs: - name: Set up conda environment uses: mamba-org/setup-micromamba@v2 with: - micromamba-version: ${{env.MICROMAMBA_VERSION}} environment-file: ci/requirements/environment.yml environment-name: xarray-tests create-args: >- diff --git a/.pep8speaks.yml b/.pep8speaks.yml deleted file mode 100644 index 8d87864e426..00000000000 --- a/.pep8speaks.yml +++ /dev/null @@ -1,6 +0,0 @@ -# https://github.com/OrkoHunter/pep8speaks for more info -# pep8speaks will use the flake8 configs in `setup.cfg` - -scanner: - diff_only: False - linter: flake8 diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index d457a9e9a4d..541fd2fa659 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -40,7 +40,7 @@ Project maintainers who do not follow or enforce the Code of Conduct in good fai ## Attribution -This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [https://contributor-covenant.org/version/1/4][version] -[homepage]: http://contributor-covenant.org -[version]: http://contributor-covenant.org/version/1/4/ +[homepage]: https://contributor-covenant.org +[version]: https://contributor-covenant.org/version/1/4/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index dd9931f907b..9fef07e9a5e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1 +1 @@ -Xarray's contributor guidelines [can be found in our online documentation](http://docs.xarray.dev/en/stable/contributing.html) +Xarray's contributor guidelines [can be found in our online documentation](https://docs.xarray.dev/en/stable/contributing.html) diff --git a/LICENSE b/LICENSE index 37ec93a14fd..82fac1f1a63 100644 --- a/LICENSE +++ b/LICENSE @@ -176,7 +176,7 @@ recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright [yyyy] [name of copyright owner] + Copyright 2014-2024 xarray Developers Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/README.md b/README.md index 8fc8ff335d4..bd62e5f2fd4 100644 --- a/README.md +++ b/README.md @@ -94,7 +94,7 @@ our efforts. ## History Xarray is an evolution of an internal tool developed at [The Climate -Corporation](http://climate.com/). It was originally written by Climate +Corporation](https://climate.com/). It was originally written by Climate Corp researchers Stephan Hoyer, Alex Kleeman and Eugene Brevdo and was released as open source in May 2014. The project was renamed from "xray" in January 2016. Xarray became a fiscally sponsored project of @@ -108,7 +108,7 @@ Thanks to our many contributors! ## License -Copyright 2014-2023, xarray Developers +Copyright 2014-2024, xarray Developers Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index 9dc86df712d..ab256079c90 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -7,7 +7,7 @@ "project": "xarray", // The project's homepage - "project_url": "http://docs.xarray.dev/", + "project_url": "https://docs.xarray.dev/", // The URL or local path of the source code repository for the // project being benchmarked diff --git a/ci/min_deps_check.py b/ci/min_deps_check.py deleted file mode 100755 index a01d82ebfbb..00000000000 --- a/ci/min_deps_check.py +++ /dev/null @@ -1,218 +0,0 @@ -#!/usr/bin/env python -"""Fetch from conda database all available versions of the xarray dependencies and their -publication date. Compare it against requirements/min-all-deps.yml to verify the -policy on obsolete dependencies is being followed. Print a pretty report :) -""" - -from __future__ import annotations - -import itertools -import sys -from collections.abc import Iterator -from datetime import datetime - -import libmambapy # type: ignore[import] -import yaml -from dateutil.relativedelta import relativedelta - -CHANNELS = ["conda-forge", "defaults"] -IGNORE_DEPS = { - "coveralls", - "flake8", - "hypothesis", - "isort", - "mypy", - "pip", - "pytest", - "pytest-cov", - "pytest-env", - "pytest-timeout", - "pytest-xdist", - "setuptools", -} - -POLICY_MONTHS = {"python": 30, "numpy": 18} -POLICY_MONTHS_DEFAULT = 12 -POLICY_OVERRIDE: dict[str, tuple[int, int]] = {} -errors = [] - - -def error(msg: str) -> None: - global errors - errors.append(msg) - print("ERROR:", msg) - - -def warning(msg: str) -> None: - print("WARNING:", msg) - - -def parse_requirements(fname) -> Iterator[tuple[str, int, int, int | None]]: - """Load requirements/min-all-deps.yml - - Yield (package name, major version, minor version, [patch version]) - """ - global errors - - with open(fname) as fh: - contents = yaml.safe_load(fh) - for row in contents["dependencies"]: - if isinstance(row, dict) and list(row) == ["pip"]: - continue - pkg, eq, version = row.partition("=") - if pkg.rstrip("<>") in IGNORE_DEPS: - continue - if pkg.endswith(("<", ">")) or eq != "=": - error("package should be pinned with exact version: " + row) - continue - - try: - version_tup = tuple(int(x) for x in version.split(".")) - except ValueError as err: - raise ValueError("non-numerical version: " + row) from err - - if len(version_tup) == 2: - yield (pkg, *version_tup, None) # type: ignore[misc] - elif len(version_tup) == 3: - yield (pkg, *version_tup) # type: ignore[misc] - else: - raise ValueError("expected major.minor or major.minor.patch: " + row) - - -def query_conda(pkg: str) -> dict[tuple[int, int], datetime]: - """Query the conda repository for a specific package - - Return map of {(major version, minor version): publication date} - """ - - def metadata(entry): - version = entry.version - - time = datetime.fromtimestamp(entry.timestamp) - major, minor = map(int, version.split(".")[:2]) - - return (major, minor), time - - raw_data = libmambapy.SubdirData.query_all(pkg, channels=CHANNELS) - data = sorted(metadata(entry) for entry in raw_data if entry.timestamp != 0) - - release_dates = { - version: [time for _, time in group if time is not None] - for version, group in itertools.groupby(data, key=lambda x: x[0]) - } - out = {version: min(dates) for version, dates in release_dates.items() if dates} - - # Hardcoded fix to work around incorrect dates in conda - if pkg == "python": - out.update( - { - (2, 7): datetime(2010, 6, 3), - (3, 5): datetime(2015, 9, 13), - 
(3, 6): datetime(2016, 12, 23), - (3, 7): datetime(2018, 6, 27), - (3, 8): datetime(2019, 10, 14), - (3, 9): datetime(2020, 10, 5), - (3, 10): datetime(2021, 10, 4), - (3, 11): datetime(2022, 10, 24), - } - ) - - return out - - -def process_pkg( - pkg: str, req_major: int, req_minor: int, req_patch: int | None -) -> tuple[str, str, str, str, str, str]: - """Compare package version from requirements file to available versions in conda. - Return row to build pandas dataframe: - - - package name - - major.minor.[patch] version in requirements file - - publication date of version in requirements file (YYYY-MM-DD) - - major.minor version suggested by policy - - publication date of version suggested by policy (YYYY-MM-DD) - - status ("<", "=", "> (!)") - """ - print(f"Analyzing {pkg}...") - versions = query_conda(pkg) - - try: - req_published = versions[req_major, req_minor] - except KeyError: - error("not found in conda: " + pkg) - return pkg, fmt_version(req_major, req_minor, req_patch), "-", "-", "-", "(!)" - - policy_months = POLICY_MONTHS.get(pkg, POLICY_MONTHS_DEFAULT) - policy_published = datetime.now() - relativedelta(months=policy_months) - - filtered_versions = [ - version - for version, published in versions.items() - if published < policy_published - ] - policy_major, policy_minor = max(filtered_versions, default=(req_major, req_minor)) - - try: - policy_major, policy_minor = POLICY_OVERRIDE[pkg] - except KeyError: - pass - policy_published_actual = versions[policy_major, policy_minor] - - if (req_major, req_minor) < (policy_major, policy_minor): - status = "<" - elif (req_major, req_minor) > (policy_major, policy_minor): - status = "> (!)" - delta = relativedelta(datetime.now(), req_published).normalized() - n_months = delta.years * 12 + delta.months - warning( - f"Package is too new: {pkg}={req_major}.{req_minor} was " - f"published on {req_published:%Y-%m-%d} " - f"which was {n_months} months ago (policy is {policy_months} months)" - ) - else: - status = "=" - - if req_patch is not None: - warning("patch version should not appear in requirements file: " + pkg) - status += " (w)" - - return ( - pkg, - fmt_version(req_major, req_minor, req_patch), - req_published.strftime("%Y-%m-%d"), - fmt_version(policy_major, policy_minor), - policy_published_actual.strftime("%Y-%m-%d"), - status, - ) - - -def fmt_version(major: int, minor: int, patch: int | None = None) -> str: - if patch is None: - return f"{major}.{minor}" - else: - return f"{major}.{minor}.{patch}" - - -def main() -> None: - fname = sys.argv[1] - rows = [ - process_pkg(pkg, major, minor, patch) - for pkg, major, minor, patch in parse_requirements(fname) - ] - - print("\nPackage Required Policy Status") - print("----------------- -------------------- -------------------- ------") - fmt = "{:17} {:7} ({:10}) {:7} ({:10}) {}" - for row in rows: - print(fmt.format(*row)) - - if errors: - print("\nErrors:") - print("-------") - for i, e in enumerate(errors): - print(f"{i+1}. 
{e}") - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/ci/minimum_versions.py b/ci/minimum_versions.py new file mode 100644 index 00000000000..c226e304769 --- /dev/null +++ b/ci/minimum_versions.py @@ -0,0 +1,323 @@ +import asyncio +import bisect +import datetime +import pathlib +import sys +from dataclasses import dataclass, field + +import rich_click as click +import yaml +from dateutil.relativedelta import relativedelta +from rattler import Gateway, Version +from rich.console import Console +from rich.panel import Panel +from rich.style import Style +from rich.table import Column, Table +from tlz.functoolz import curry, pipe +from tlz.itertoolz import concat, groupby + +click.rich_click.SHOW_ARGUMENTS = True + +channels = ["conda-forge"] +platforms = ["noarch", "linux-64"] +ignored_packages = [ + "coveralls", + "pip", + "pytest", + "pytest-cov", + "pytest-env", + "pytest-xdist", + "pytest-timeout", + "hypothesis", +] + + +@dataclass +class Policy: + package_months: dict + default_months: int + overrides: dict[str, Version] = field(default_factory=dict) + + def minimum_version(self, package_name, releases): + if (override := self.overrides.get(package_name)) is not None: + return override + + policy_months = self.package_months.get(package_name, self.default_months) + today = datetime.date.today() + + cutoff_date = today - relativedelta(months=policy_months) + + index = bisect.bisect_left( + releases, cutoff_date, key=lambda x: x.timestamp.date() + ) + return releases[index - 1 if index > 0 else 0] + + +@dataclass +class Spec: + name: str + version: Version | None + + @classmethod + def parse(cls, spec_text): + warnings = [] + if ">" in spec_text or "<" in spec_text: + warnings.append( + f"package should be pinned with an exact version: {spec_text!r}" + ) + + spec_text = spec_text.replace(">", "").replace("<", "") + + if "=" in spec_text: + name, version_text = spec_text.split("=", maxsplit=1) + version = Version(version_text) + segments = version.segments() + + if len(segments) != 2 or (len(segments) == 3 and segments[2] != 0): + warnings.append( + f"package should be pinned to a minor version (got {version})" + ) + else: + name = spec_text + version = None + + return cls(name, version), (name, warnings) + + +@dataclass(order=True) +class Release: + version: Version + build_number: int + timestamp: datetime.datetime = field(compare=False) + + @classmethod + def from_repodata_record(cls, repo_data): + return cls( + version=repo_data.version, + build_number=repo_data.build_number, + timestamp=repo_data.timestamp, + ) + + +def parse_environment(text): + env = yaml.safe_load(text) + + specs = [] + warnings = [] + for dep in env["dependencies"]: + spec, warnings_ = Spec.parse(dep) + + warnings.append(warnings_) + specs.append(spec) + + return specs, warnings + + +def is_preview(version): + candidates = ["rc", "beta", "alpha"] + + *_, last_segment = version.segments() + return any(candidate in last_segment for candidate in candidates) + + +def group_packages(records): + groups = groupby(lambda r: r.name.normalized, records) + return { + name: sorted(map(Release.from_repodata_record, group)) + for name, group in groups.items() + } + + +def filter_releases(predicate, releases): + return { + name: [r for r in records if predicate(r)] for name, records in releases.items() + } + + +def deduplicate_releases(package_info): + def deduplicate(releases): + return min(releases, key=lambda p: p.timestamp) + + return { + name: list(map(deduplicate, groupby(lambda p: p.version, 
group).values())) + for name, group in package_info.items() + } + + +def find_policy_versions(policy, releases): + return { + name: policy.minimum_version(name, package_releases) + for name, package_releases in releases.items() + } + + +def is_suitable_release(release): + if release.timestamp is None: + return False + + segments = release.version.extend_to_length(3).segments() + + return segments[2] == [0] + + +def lookup_spec_release(spec, releases): + version = spec.version.extend_to_length(3) + + return releases[spec.name][version] + + +def compare_versions(environments, policy_versions): + status = {} + for env, specs in environments.items(): + env_status = any( + spec.version > policy_versions[spec.name].version for spec in specs + ) + status[env] = env_status + return status + + +def version_comparison_symbol(required, policy): + if required < policy: + return "<" + elif required > policy: + return ">" + else: + return "=" + + +def format_bump_table(specs, policy_versions, releases, warnings): + table = Table( + Column("Package", width=20), + Column("Required", width=8), + "Required (date)", + Column("Policy", width=8), + "Policy (date)", + "Status", + ) + + heading_style = Style(color="#ff0000", bold=True) + warning_style = Style(color="#ffff00", bold=True) + styles = { + ">": Style(color="#ff0000", bold=True), + "=": Style(color="#008700", bold=True), + "<": Style(color="#d78700", bold=True), + } + + for spec in specs: + policy_release = policy_versions[spec.name] + policy_version = policy_release.version.with_segments(0, 2) + policy_date = policy_release.timestamp + + required_version = spec.version + required_date = lookup_spec_release(spec, releases).timestamp + + status = version_comparison_symbol(required_version, policy_version) + style = styles[status] + + table.add_row( + spec.name, + str(required_version), + f"{required_date:%Y-%m-%d}", + str(policy_version), + f"{policy_date:%Y-%m-%d}", + status, + style=style, + ) + + grid = Table.grid(expand=True, padding=(0, 2)) + grid.add_column(style=heading_style, vertical="middle") + grid.add_column() + grid.add_row("Version summary", table) + + if any(warnings.values()): + warning_table = Table(width=table.width, expand=True) + warning_table.add_column("Package") + warning_table.add_column("Warning") + + for package, messages in warnings.items(): + if not messages: + continue + warning_table.add_row(package, messages[0], style=warning_style) + for message in messages[1:]: + warning_table.add_row("", message, style=warning_style) + + grid.add_row("Warnings", warning_table) + + return grid + + +@click.command() +@click.argument( + "environment_paths", + type=click.Path(exists=True, readable=True, path_type=pathlib.Path), + nargs=-1, +) +def main(environment_paths): + console = Console() + + parsed_environments = { + path.stem: parse_environment(path.read_text()) for path in environment_paths + } + + warnings = { + env: dict(warnings_) for env, (_, warnings_) in parsed_environments.items() + } + environments = { + env: [spec for spec in specs if spec.name not in ignored_packages] + for env, (specs, _) in parsed_environments.items() + } + + all_packages = list( + dict.fromkeys(spec.name for spec in concat(environments.values())) + ) + + policy_months = { + "python": 30, + "numpy": 18, + } + policy_months_default = 12 + overrides = {} + + policy = Policy( + policy_months, default_months=policy_months_default, overrides=overrides + ) + + gateway = Gateway() + query = gateway.query(channels, platforms, all_packages, recursive=False) + 
records = asyncio.run(query) + + package_releases = pipe( + records, + concat, + group_packages, + curry(filter_releases, lambda r: r.timestamp is not None), + deduplicate_releases, + ) + policy_versions = pipe( + package_releases, + curry(filter_releases, is_suitable_release), + curry(find_policy_versions, policy), + ) + status = compare_versions(environments, policy_versions) + + release_lookup = { + n: {r.version: r for r in releases} for n, releases in package_releases.items() + } + grids = { + env: format_bump_table(specs, policy_versions, release_lookup, warnings[env]) + for env, specs in environments.items() + } + root_grid = Table.grid() + root_grid.add_column() + + for env, grid in grids.items(): + root_grid.add_row(Panel(grid, title=env, expand=True)) + + console.print(root_grid) + + status_code = 1 if any(status.values()) else 0 + sys.exit(status_code) + + +if __name__ == "__main__": + main() diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index 02c69a41924..43938880592 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -36,14 +36,7 @@ dependencies: - pre-commit - pyarrow # pandas raises a deprecation warning without this, breaking doctests - pydap - # start pydap server dependencies, can be removed if pydap-server is available - - gunicorn - - PasteDeploy - - docopt-ng - - Webob - - Jinja2 - - beautifulsoup4 - # end pydap server dependencies + - pydap-server - pytest - pytest-cov - pytest-env diff --git a/design_notes/flexible_indexes_notes.md b/design_notes/flexible_indexes_notes.md index f4a2c1c2125..c53acfa62b7 100644 --- a/design_notes/flexible_indexes_notes.md +++ b/design_notes/flexible_indexes_notes.md @@ -133,7 +133,7 @@ A possible, more explicit solution to reuse a `pandas.MultiIndex` in a DataArray New indexes may also be built from existing sets of coordinates or variables in a Dataset/DataArray using the `.set_index()` method. -The [current signature](http://docs.xarray.dev/en/stable/generated/xarray.DataArray.set_index.html#xarray.DataArray.set_index) of `.set_index()` is tailored to `pandas.MultiIndex` and tied to the concept of a dimension-index. It is therefore hardly reusable as-is in the context of flexible indexes proposed here. +The [current signature](https://docs.xarray.dev/en/stable/generated/xarray.DataArray.set_index.html#xarray.DataArray.set_index) of `.set_index()` is tailored to `pandas.MultiIndex` and tied to the concept of a dimension-index. It is therefore hardly reusable as-is in the context of flexible indexes proposed here. 
The new signature may look like one of these: diff --git a/doc/_static/ci.png b/doc/_static/ci.png index aec900b1fc5..090f466383e 100644 Binary files a/doc/_static/ci.png and b/doc/_static/ci.png differ diff --git a/doc/_static/dask-array.svg b/doc/_static/dask-array.svg new file mode 100644 index 00000000000..bdf33c0ac70 --- /dev/null +++ b/doc/_static/dask-array.svg @@ -0,0 +1,349 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/doc/_static/dask_array.png b/doc/_static/dask_array.png deleted file mode 100644 index 7ddb6e400ef..00000000000 Binary files a/doc/_static/dask_array.png and /dev/null differ diff --git a/doc/_static/dataset-diagram.png b/doc/_static/dataset-diagram.png index be9aa8d653c..9ec18f65028 100644 Binary files a/doc/_static/dataset-diagram.png and b/doc/_static/dataset-diagram.png differ diff --git a/doc/_static/logos/Xarray_Icon_Final.png b/doc/_static/logos/Xarray_Icon_Final.png index 6c0bae41829..d5ae1d79e8e 100644 Binary files a/doc/_static/logos/Xarray_Icon_Final.png and b/doc/_static/logos/Xarray_Icon_Final.png differ diff --git a/doc/_static/logos/Xarray_Logo_FullColor_InverseRGB_Final.png b/doc/_static/logos/Xarray_Logo_FullColor_InverseRGB_Final.png index 68701eea116..8942dce382c 100644 Binary files a/doc/_static/logos/Xarray_Logo_FullColor_InverseRGB_Final.png and b/doc/_static/logos/Xarray_Logo_FullColor_InverseRGB_Final.png differ diff --git a/doc/_static/logos/Xarray_Logo_RGB_Final.png b/doc/_static/logos/Xarray_Logo_RGB_Final.png index 823ff8db961..0a07cbb85ca 100644 Binary files a/doc/_static/logos/Xarray_Logo_RGB_Final.png and b/doc/_static/logos/Xarray_Logo_RGB_Final.png differ diff --git a/doc/_static/opendap-prism-tmax.png b/doc/_static/opendap-prism-tmax.png index 7ff778a3d1e..e3f6d70fad0 100644 Binary files a/doc/_static/opendap-prism-tmax.png and b/doc/_static/opendap-prism-tmax.png differ diff --git a/doc/_static/thumbnails/ERA5-GRIB-example.png b/doc/_static/thumbnails/ERA5-GRIB-example.png index 412dd28a6d9..a4c4647b2dd 100644 Binary files a/doc/_static/thumbnails/ERA5-GRIB-example.png and b/doc/_static/thumbnails/ERA5-GRIB-example.png differ diff --git a/doc/_static/thumbnails/ROMS_ocean_model.png b/doc/_static/thumbnails/ROMS_ocean_model.png index 9333335d1ef..953af78c59d 100644 Binary files a/doc/_static/thumbnails/ROMS_ocean_model.png and b/doc/_static/thumbnails/ROMS_ocean_model.png differ diff --git a/doc/_static/thumbnails/area_weighted_temperature.png b/doc/_static/thumbnails/area_weighted_temperature.png index 7d3604d7c2b..f74621b5547 100644 Binary files a/doc/_static/thumbnails/area_weighted_temperature.png and b/doc/_static/thumbnails/area_weighted_temperature.png differ diff --git a/doc/_static/thumbnails/monthly-means.png b/doc/_static/thumbnails/monthly-means.png index da5691848b0..d05e1517833 100644 Binary files a/doc/_static/thumbnails/monthly-means.png and b/doc/_static/thumbnails/monthly-means.png differ diff --git a/doc/_static/thumbnails/multidimensional-coords.png b/doc/_static/thumbnails/multidimensional-coords.png index b0d893d6894..25409b07f7c 100644 Binary files a/doc/_static/thumbnails/multidimensional-coords.png and b/doc/_static/thumbnails/multidimensional-coords.png differ diff --git a/doc/_static/thumbnails/toy-weather-data.png b/doc/_static/thumbnails/toy-weather-data.png index 64ac0a4b021..59f53f037ad 100644 Binary files a/doc/_static/thumbnails/toy-weather-data.png and b/doc/_static/thumbnails/toy-weather-data.png differ diff --git 
a/doc/_static/thumbnails/visualization_gallery.png b/doc/_static/thumbnails/visualization_gallery.png index 9e6c2436be5..2b6c1248b4d 100644 Binary files a/doc/_static/thumbnails/visualization_gallery.png and b/doc/_static/thumbnails/visualization_gallery.png differ diff --git a/doc/_static/view-docs.png b/doc/_static/view-docs.png index 2e79ff6c291..ec03dddd68d 100644 Binary files a/doc/_static/view-docs.png and b/doc/_static/view-docs.png differ diff --git a/doc/contributing.rst b/doc/contributing.rst index 5f943e82558..6d269a9f0f7 100644 --- a/doc/contributing.rst +++ b/doc/contributing.rst @@ -7,7 +7,7 @@ Contributing to xarray .. note:: Large parts of this document came from the `Pandas Contributing - Guide `_. + Guide `_. Overview ======== @@ -68,7 +68,7 @@ If you are reporting a bug, please use the provided template which includes the #. Include a short, self-contained Python snippet reproducing the problem. You can format the code nicely by using `GitHub Flavored Markdown - `_:: + `_:: ```python import xarray as xr @@ -106,7 +106,7 @@ Version control, Git, and GitHub The code is hosted on `GitHub `_. To contribute you will need to sign up for a `free GitHub account -`_. We use `Git `_ for +`_. We use `Git `_ for version control to allow many people to work together on the project. Some great resources for learning Git: @@ -327,7 +327,7 @@ To return to your root environment:: conda deactivate -See the full `conda docs here `__. +See the full `conda docs here `__. Install pre-commit hooks ------------------------ @@ -365,9 +365,9 @@ About the *xarray* documentation -------------------------------- The documentation is written in **reStructuredText**, which is almost like writing -in plain English, and built using `Sphinx `__. The +in plain English, and built using `Sphinx `__. The Sphinx Documentation has an excellent `introduction to reST -`__. Review the Sphinx docs to perform more +`__. Review the Sphinx docs to perform more complex changes to the documentation as well. Some other important things to know about the docs: @@ -388,7 +388,7 @@ Some other important things to know about the docs: extend it in a similar manner. - The tutorials make heavy use of the `ipython directive - `_ sphinx extension. + `_ sphinx extension. This directive lets you put code in the documentation which will be run during the doc build. For example: @@ -551,7 +551,7 @@ xarray uses several tools to ensure a consistent code format throughout the proj - `ruff `_ for formatting, code quality checks and standardized order in imports - `absolufy-imports `_ for absolute instead of relative imports from different files, -- `mypy `_ for static type checking on `type hints +- `mypy `_ for static type checking on `type hints `_. We highly recommend that you setup `pre-commit hooks `_ @@ -624,7 +624,7 @@ Test-driven development/code writing ------------------------------------ *xarray* is serious about testing and strongly encourages contributors to embrace -`test-driven development (TDD) `_. +`test-driven development (TDD) `_. This development process "relies on the repetition of a very short development cycle: first the developer writes an (initially failing) automated test case that defines a desired improvement or new function, then produces the minimum amount of code to pass that test." @@ -636,7 +636,7 @@ Adding tests is one of the most common requests after code is pushed to *xarray* it is worth getting in the habit of writing tests ahead of time so that this is never an issue. 
Like many packages, *xarray* uses `pytest -`_ and the convenient +`_ and the convenient extensions in `numpy.testing `_. @@ -669,7 +669,7 @@ typically find tests wrapped in a class. class TestReallyCoolFeature: ... Going forward, we are moving to a more *functional* style using the -`pytest `__ framework, which offers a richer +`pytest `__ framework, which offers a richer testing framework that will facilitate testing and developing. Thus, instead of writing test classes, we will write test functions like this: @@ -816,7 +816,7 @@ speed up local testing on multicore machines, by running pytest with the optiona This can significantly reduce the time it takes to locally run tests before submitting a pull request. -For more, see the `pytest `_ documentation. +For more, see the `pytest `_ documentation. Running the performance test suite ---------------------------------- @@ -891,7 +891,7 @@ Learn `how to write a benchmark and how to use asv from the documentation `_. + available `here `_. Documenting your code --------------------- @@ -1062,7 +1062,7 @@ PR checklist - **Test your code**. - Write new tests if needed. See `"Test-driven development/code writing" `_. - - Test the code using `Pytest `_. Running all tests (type ``pytest`` in the root directory) takes a while, so feel free to only run the tests you think are needed based on your PR (example: ``pytest xarray/tests/test_dataarray.py``). CI will catch any failing tests. + - Test the code using `Pytest `_. Running all tests (type ``pytest`` in the root directory) takes a while, so feel free to only run the tests you think are needed based on your PR (example: ``pytest xarray/tests/test_dataarray.py``). CI will catch any failing tests. - By default, the upstream dev CI is disabled on pull request and push events. You can override this behavior per commit by adding a ``[test-upstream]`` tag to the first line of the commit message. For documentation-only commits, you can skip the CI per commit by adding a ``[skip-ci]`` tag to the first line of the commit message. - **Properly format your code** and verify that it passes the formatting guidelines set by `ruff `_. See `"Code formatting" `_. You can use `pre-commit `_ to run these automatically on each commit. diff --git a/doc/ecosystem.rst b/doc/ecosystem.rst index 1fa1ed42509..d5123669209 100644 --- a/doc/ecosystem.rst +++ b/doc/ecosystem.rst @@ -52,7 +52,7 @@ Geosciences - `xclim `_: A library for calculating climate science indices with unit handling built from xarray and dask. - `xESMF `_: Universal regridder for geospatial data. - `xgcm `_: Extends the xarray data model to understand finite volume grid cells (common in General Circulation Models) and provides interpolation and difference operations for such grids. -- `xmitgcm `_: a python package for reading `MITgcm `_ binary MDS files into xarray data structures. +- `xmitgcm `_: a python package for reading `MITgcm `_ binary MDS files into xarray data structures. - `xnemogcm `_: a package to read `NEMO `_ output files and add attributes to interface with xgcm. Machine Learning @@ -87,11 +87,11 @@ Extend xarray capabilities - `xr-scipy `_: A lightweight scipy wrapper for xarray. - `X-regression `_: Multiple linear regression from Statsmodels library coupled with Xarray library. - `xskillscore `_: Metrics for verifying forecasts. -- `xyzpy `_: Easily generate high dimensional data, including parallelization. +- `xyzpy `_: Easily generate high dimensional data, including parallelization. 
Visualization ~~~~~~~~~~~~~ -- `datashader `_, `geoviews `_, `holoviews `_, : visualization packages for large data. +- `datashader `_, `geoviews `_, `holoviews `_, : visualization packages for large data. - `hvplot `_ : A high-level plotting API for the PyData ecosystem built on HoloViews. - `psyplot `_: Interactive data visualization with python. - `xarray-leaflet `_: An xarray extension for tiled map plotting based on ipyleaflet. diff --git a/doc/examples/ROMS_ocean_model.ipynb b/doc/examples/ROMS_ocean_model.ipynb index d5c76380525..cca72d982ba 100644 --- a/doc/examples/ROMS_ocean_model.ipynb +++ b/doc/examples/ROMS_ocean_model.ipynb @@ -11,7 +11,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The Regional Ocean Modeling System ([ROMS](http://myroms.org)) is an open source hydrodynamic model that is used for simulating currents and water properties in coastal and estuarine regions. ROMS is one of a few standard ocean models, and it has an active user community.\n", + "The Regional Ocean Modeling System ([ROMS](https://www.myroms.org/)) is an open source hydrodynamic model that is used for simulating currents and water properties in coastal and estuarine regions. ROMS is one of a few standard ocean models, and it has an active user community.\n", "\n", "ROMS uses a regular C-Grid in the horizontal, similar to other structured grid ocean and atmospheric models, and a stretched vertical coordinate (see [the ROMS documentation](https://www.myroms.org/wiki/Vertical_S-coordinate) for more details). Both of these require special treatment when using `xarray` to analyze ROMS ocean model output. This example notebook shows how to create a lazily evaluated vertical coordinate, and make some basic plots. The `xgcm` package is required to do analysis that is aware of the horizontal C-Grid." ] diff --git a/doc/examples/monthly_means_output.png b/doc/examples/monthly_means_output.png index 0f391a502b2..a2b3afb916e 100644 Binary files a/doc/examples/monthly_means_output.png and b/doc/examples/monthly_means_output.png differ diff --git a/doc/examples/multidimensional-coords.ipynb b/doc/examples/multidimensional-coords.ipynb index a138dff15aa..8ace13f7e69 100644 --- a/doc/examples/multidimensional-coords.ipynb +++ b/doc/examples/multidimensional-coords.ipynb @@ -126,7 +126,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In order to visualize the data on a conventional latitude-longitude grid, we can take advantage of xarray's ability to apply [cartopy](http://scitools.org.uk/cartopy/index.html) map projections." + "In order to visualize the data on a conventional latitude-longitude grid, we can take advantage of xarray's ability to apply [cartopy](https://scitools.org.uk/cartopy/docs/latest/) map projections." ] }, { diff --git a/doc/getting-started-guide/why-xarray.rst b/doc/getting-started-guide/why-xarray.rst index d7956817c03..0dc3586fc34 100644 --- a/doc/getting-started-guide/why-xarray.rst +++ b/doc/getting-started-guide/why-xarray.rst @@ -106,10 +106,10 @@ under active development. See our technical :ref:`roadmap` for more details, and feel free to reach out with questions about whether xarray is the right tool for your needs. -.. _datarray: https://github.com/fperez/datarray -.. _Dask: http://dask.org -.. _matplotlib: http://matplotlib.org -.. _netCDF: http://www.unidata.ucar.edu/software/netcdf -.. _NumPy: http://www.numpy.org -.. _pandas: http://pandas.pydata.org -.. _SciPy: http://www.scipy.org +.. _datarray: https://github.com/BIDS/datarray +.. 
_Dask: https://www.dask.org +.. _matplotlib: https://matplotlib.org +.. _netCDF: https://www.unidata.ucar.edu/software/netcdf +.. _NumPy: https://numpy.org +.. _pandas: https://pandas.pydata.org +.. _SciPy: https://www.scipy.org diff --git a/doc/roadmap.rst b/doc/roadmap.rst index c065a76a925..41f2e972f4d 100644 --- a/doc/roadmap.rst +++ b/doc/roadmap.rst @@ -148,7 +148,7 @@ implementations, e.g.: - Other ndarray objects, e.g., sparse, xnd, xtensor. Our strategy has been to pursue upstream improvements in NumPy (see -`NEP-22 `__) +`NEP-22 `__) for supporting a complete duck-typing interface using with NumPy's higher level array API. Improvements in NumPy's support for custom data types would also be highly useful for xarray users. diff --git a/doc/user-guide/dask.rst b/doc/user-guide/dask.rst index 5c421aa51d8..d7fb7cbd41e 100644 --- a/doc/user-guide/dask.rst +++ b/doc/user-guide/dask.rst @@ -2,325 +2,243 @@ .. _dask: -Parallel computing with Dask +Parallel Computing with Dask ============================ -Xarray integrates with `Dask `__ to support parallel -computations and streaming computation on datasets that don't fit into memory. -Currently, Dask is an entirely optional feature for xarray. However, the -benefits of using Dask are sufficiently strong that Dask may become a required -dependency in a future version of xarray. +Xarray integrates with `Dask `__, a general purpose library for parallel computing, to handle larger-than-memory computations. -For a full example of how to use xarray's Dask integration, read the -`blog post introducing xarray and Dask`_. More up-to-date examples -may be found at the `Pangeo project's gallery `_ -and at the `Dask examples website `_. +If you’ve been using Xarray to read in large datasets or split up data across a number of files, you may already be using Dask: -.. _blog post introducing xarray and Dask: https://stephanhoyer.com/2015/06/11/xray-dask-out-of-core-labeled-arrays/ +.. code-block:: python + + import xarray as xr + + ds = xr.open_zarr("/path/to/data.zarr") + timeseries = ds["temp"].mean(dim=["x", "y"]).compute() # Compute result + +Using Dask with Xarray feels similar to working with NumPy arrays, but on much larger datasets. The Dask integration is transparent, so you usually don’t need to manage the parallelism directly; Xarray and Dask handle these aspects behind the scenes. This makes it easy to write code that scales from small, in-memory datasets on a single machine to large datasets that are distributed across a cluster, with minimal code changes. + +Examples +-------- + +If you're new to using Xarray with Dask, we recommend the `Xarray + Dask Tutorial `_. + +Here are some examples for using Xarray with Dask at scale: -What is a Dask array? ---------------------- +- `Zonal averaging with the NOAA National Water Model `_ +- `CMIP6 Precipitation Frequency Analysis `_ +- `Using Dask + Cloud Optimized GeoTIFFs `_ -.. image:: ../_static/dask_array.png - :width: 40 % +Find more examples at the `Project Pythia cookbook gallery `_. + + +Using Dask with Xarray +---------------------- + +.. image:: ../_static/dask-array.svg + :width: 50 % :align: right :alt: A Dask array -Dask divides arrays into many small pieces, called *chunks*, each of which is -presumed to be small enough to fit into memory. +Dask divides arrays into smaller parts called chunks. These chunks are small, manageable pieces of the larger dataset, that Dask is able to process in parallel (see the `Dask Array docs on chunks `_). 
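A minimal sketch of this chunking model (an editorial illustration, not part of the diff above): it builds a small Dask-backed ``DataArray`` in memory and inspects its chunks. The array shape, dimension names, and chunk sizes are arbitrary choices for illustration.

```python
import dask.array as da
import xarray as xr

# A lazily evaluated DataArray backed by a chunked Dask array; each
# (365, 100, 100) block is a separate chunk that Dask can process in parallel.
data = da.random.random((730, 300, 300), chunks=(365, 100, 100))
temperature = xr.DataArray(data, dims=("time", "y", "x"), name="temperature")

print(temperature.chunks)             # chunk sizes along each dimension
lazy_mean = temperature.mean("time")  # builds a task graph, computes nothing yet
result = lazy_mean.compute()          # executes the graph, chunk by chunk, in parallel
```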
Commonly chunks are set when reading data, but you can also set the chunksize manually at any point in your workflow using :py:meth:`Dataset.chunk` and :py:meth:`DataArray.chunk`. See :ref:`dask.chunks` for more. -Unlike NumPy, which has eager evaluation, operations on Dask arrays are lazy. -Operations queue up a series of tasks mapped over blocks, and no computation is -performed until you actually ask values to be computed (e.g., to print results -to your screen or write to disk). At that point, data is loaded into memory -and computation proceeds in a streaming fashion, block-by-block. +Xarray operations on Dask-backed arrays are lazy. This means computations are not executed immediately, but are instead queued up as tasks in a Dask graph. -The actual computation is controlled by a multi-processing or thread pool, -which allows Dask to take full advantage of multiple processors available on -most modern computers. +When a result is requested (e.g., for plotting, writing to disk, or explicitly computing), Dask executes the task graph. The computations are carried out in parallel, with each chunk being processed independently. This parallel execution is key to handling large datasets efficiently. -For more details, read the `Dask documentation `__. -Note that xarray only makes use of ``dask.array`` and ``dask.delayed``. +Nearly all Xarray methods have been extended to work automatically with Dask Arrays. This includes things like indexing, concatenating, rechunking, grouped operations, etc. Common operations are covered in more detail in each of the sections below. .. _dask.io: Reading and writing data ------------------------- +~~~~~~~~~~~~~~~~~~~~~~~~ -The usual way to create a ``Dataset`` filled with Dask arrays is to load the -data from a netCDF file or files. You can do this by supplying a ``chunks`` -argument to :py:func:`~xarray.open_dataset` or using the -:py:func:`~xarray.open_mfdataset` function. +When reading data, Dask divides your dataset into smaller chunks. You can specify the size of chunks with the ``chunks`` argument. Specifying ``chunks="auto"`` will set the dask chunk sizes to be a multiple of the on-disk chunk sizes. This can be a good idea, but usually the appropriate dask chunk size will depend on your workflow. -.. ipython:: python - :suppress: +.. tab:: Zarr - import os + The `Zarr `_ format is ideal for working with large datasets. Each chunk is stored in a separate file, allowing parallel reading and writing with Dask. You can also use Zarr to read/write directly from cloud storage buckets (see the `Dask documentation on connecting to remote data `__) - import numpy as np - import pandas as pd - import xarray as xr + When you open a Zarr dataset with :py:func:`~xarray.open_zarr`, it is loaded as a Dask array by default (if Dask is installed):: - np.random.seed(123456) - np.set_printoptions(precision=3, linewidth=100, threshold=100, edgeitems=3) + ds = xr.open_zarr("path/to/directory.zarr") - ds = xr.Dataset( - { - "temperature": ( - ("time", "latitude", "longitude"), - np.random.randn(30, 180, 180), - ), - "time": pd.date_range("2015-01-01", periods=30), - "longitude": np.arange(180), - "latitude": np.arange(89.5, -90.5, -1), - } - ) - ds.to_netcdf("example-data.nc") + See :ref:`io.zarr` for more details. -.. ipython:: python +.. 
tab:: NetCDF - ds = xr.open_dataset("example-data.nc", chunks={"time": 10}) - ds + Open a single netCDF file with :py:func:`~xarray.open_dataset` and supplying a ``chunks`` argument:: -In this example ``latitude`` and ``longitude`` do not appear in the ``chunks`` -dict, so only one chunk will be used along those dimensions. It is also -entirely equivalent to opening a dataset using :py:func:`~xarray.open_dataset` -and then chunking the data using the ``chunk`` method, e.g., -``xr.open_dataset('example-data.nc').chunk({'time': 10})``. + ds = xr.open_dataset("example-data.nc", chunks={"time": 10}) -To open multiple files simultaneously in parallel using Dask delayed, -use :py:func:`~xarray.open_mfdataset`:: + Or open multiple files in parallel with py:func:`~xarray.open_mfdataset`:: - xr.open_mfdataset('my/files/*.nc', parallel=True) + xr.open_mfdataset('my/files/*.nc', parallel=True) -This function will automatically concatenate and merge datasets into one in -the simple cases that it understands (see :py:func:`~xarray.combine_by_coords` -for the full disclaimer). By default, :py:func:`~xarray.open_mfdataset` will chunk each -netCDF file into a single Dask array; again, supply the ``chunks`` argument to -control the size of the resulting Dask arrays. In more complex cases, you can -open each file individually using :py:func:`~xarray.open_dataset` and merge the result, as -described in :ref:`combining data`. Passing the keyword argument ``parallel=True`` to -:py:func:`~xarray.open_mfdataset` will speed up the reading of large multi-file datasets by -executing those read tasks in parallel using ``dask.delayed``. + .. tip:: -.. warning:: + When reading in many netCDF files with py:func:`~xarray.open_mfdataset`, using ``engine="h5netcdf"`` can + be faster than the default which uses the netCDF4 package. - :py:func:`~xarray.open_mfdataset` called without ``chunks`` argument will return - dask arrays with chunk sizes equal to the individual files. Re-chunking - the dataset after creation with ``ds.chunk()`` will lead to an ineffective use of - memory and is not recommended. + Save larger-than-memory netCDF files:: -You'll notice that printing a dataset still shows a preview of array values, -even if they are actually Dask arrays. We can do this quickly with Dask because -we only need to compute the first few values (typically from the first block). -To reveal the true nature of an array, print a DataArray: + ds.to_netcdf("my-big-file.nc") -.. ipython:: python + Or set ``compute=False`` to return a dask.delayed object that can be computed later:: - ds.temperature + delayed_write = ds.to_netcdf("my-big-file.nc", compute=False) + delayed_write.compute() -Once you've manipulated a Dask array, you can still write a dataset too big to -fit into memory back to disk by using :py:meth:`~xarray.Dataset.to_netcdf` in the -usual way. + .. note:: -.. ipython:: python + When using Dask’s distributed scheduler to write NETCDF4 files, it may be necessary to set the environment variable ``HDF5_USE_FILE_LOCKING=FALSE`` to avoid competing locks within the HDF5 SWMR file locking scheme. Note that writing netCDF files with Dask’s distributed scheduler is only supported for the netcdf4 backend. - ds.to_netcdf("manipulated-example-data.nc") + See :ref:`io.netcdf` for more details. -By setting the ``compute`` argument to ``False``, :py:meth:`~xarray.Dataset.to_netcdf` -will return a ``dask.delayed`` object that can be computed later. +.. tab:: HDF5 -.. 
ipython:: python + Open HDF5 files with :py:func:`~xarray.open_dataset`:: - from dask.diagnostics import ProgressBar + xr.open_dataset("/path/to/my/file.h5", chunks='auto') - # or distributed.progress when using the distributed scheduler - delayed_obj = ds.to_netcdf("manipulated-example-data.nc", compute=False) - with ProgressBar(): - results = delayed_obj.compute() + See :ref:`io.hdf5` for more details. -.. ipython:: python - :suppress: +.. tab:: GeoTIFF - os.remove("manipulated-example-data.nc") # Was not opened. + Open large geoTIFF files with rioxarray:: -.. note:: + xds = rioxarray.open_rasterio("my-satellite-image.tif", chunks='auto') - When using Dask's distributed scheduler to write NETCDF4 files, - it may be necessary to set the environment variable `HDF5_USE_FILE_LOCKING=FALSE` - to avoid competing locks within the HDF5 SWMR file locking scheme. Note that - writing netCDF files with Dask's distributed scheduler is only supported for - the `netcdf4` backend. + See :ref:`io.rasterio` for more details. -A dataset can also be converted to a Dask DataFrame using :py:meth:`~xarray.Dataset.to_dask_dataframe`. -.. ipython:: python - :okwarning: +Loading Dask Arrays +~~~~~~~~~~~~~~~~~~~ - df = ds.to_dask_dataframe() - df +.. ipython:: python + :suppress: -Dask DataFrames do not support multi-indexes so the coordinate variables from the dataset are included as columns in the Dask DataFrame. + import os + import numpy as np + import pandas as pd + import xarray as xr -Using Dask with xarray ----------------------- + np.random.seed(123456) + np.set_printoptions(precision=3, linewidth=100, threshold=100, edgeitems=3) -Nearly all existing xarray methods (including those for indexing, computation, -concatenating and grouped operations) have been extended to work automatically -with Dask arrays. When you load data as a Dask array in an xarray data -structure, almost all xarray operations will keep it as a Dask array; when this -is not possible, they will raise an exception rather than unexpectedly loading -data into memory. Converting a Dask array into memory generally requires an -explicit conversion step. One notable exception is indexing operations: to -enable label based indexing, xarray will automatically load coordinate labels -into memory. + ds = xr.Dataset( + { + "temperature": ( + ("time", "latitude", "longitude"), + np.random.randn(30, 180, 180), + ), + "time": pd.date_range("2015-01-01", periods=30), + "longitude": np.arange(180), + "latitude": np.arange(89.5, -90.5, -1), + } + ) + ds.to_netcdf("example-data.nc") -.. tip:: +There are a few common cases where you may want to convert lazy Dask arrays into eager, in-memory Xarray data structures: - By default, dask uses its multi-threaded scheduler, which distributes work across - multiple cores and allows for processing some datasets that do not fit into memory. - For running across a cluster, `setup the distributed scheduler `_. +- You want to inspect smaller intermediate results when working interactively or debugging +- You've reduced the dataset (by filtering or with a groupby, for example) and now have something much smaller that fits in memory +- You need to compute intermediate results since Dask is unable (or struggles) to perform a certain computation. The canonical example of this is normalizing a dataset, e.g., ``ds - ds.mean()``, when ``ds`` is larger than memory. Typically, you should either save ``ds`` to disk or compute ``ds.mean()`` eagerly. 
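A minimal sketch of the last point above, using a synthetic Dask-backed dataset (the variable name and chunk sizes are made up for illustration): compute the small reduction eagerly once, then reuse the in-memory result.

```python
import dask.array as da
import xarray as xr

ds = xr.Dataset(
    {"temperature": (("time", "x"), da.random.random((1000, 500), chunks=(100, 500)))}
)

# Evaluate the reduction once; the result is small and fits comfortably in memory.
climatology = ds["temperature"].mean("time").compute()

# The anomaly is still lazy, but its task graph no longer re-derives the mean.
anomaly = ds["temperature"] - climatology
```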
-The easiest way to convert an xarray data structure from lazy Dask arrays into -*eager*, in-memory NumPy arrays is to use the :py:meth:`~xarray.Dataset.load` method: +To do this, you can use :py:meth:`Dataset.compute` or :py:meth:`DataArray.compute`: .. ipython:: python - ds.load() + ds.compute() -You can also access :py:attr:`~xarray.DataArray.values`, which will always be a -NumPy array: - -.. ipython:: - :verbatim: +.. note:: - In [5]: ds.temperature.values - Out[5]: - array([[[ 4.691e-01, -2.829e-01, ..., -5.577e-01, 3.814e-01], - [ 1.337e+00, -1.531e+00, ..., 8.726e-01, -1.538e+00], - ... - # truncated for brevity + Using :py:meth:`Dataset.compute` is preferred to :py:meth:`Dataset.load`, which changes the results in-place. -Explicit conversion by wrapping a DataArray with ``np.asarray`` also works: +You can also access :py:attr:`DataArray.values`, which will always be a NumPy array: .. ipython:: :verbatim: - In [5]: np.asarray(ds.temperature) + In [5]: ds.temperature.values Out[5]: array([[[ 4.691e-01, -2.829e-01, ..., -5.577e-01, 3.814e-01], [ 1.337e+00, -1.531e+00, ..., 8.726e-01, -1.538e+00], ... + # truncated for brevity -Alternatively you can load the data into memory but keep the arrays as -Dask arrays using the :py:meth:`~xarray.Dataset.persist` method: - -.. ipython:: python - - persisted = ds.persist() - -:py:meth:`~xarray.Dataset.persist` is particularly useful when using a -distributed cluster because the data will be loaded into distributed memory -across your machines and be much faster to use than reading repeatedly from -disk. - -.. warning:: - - On a single machine :py:meth:`~xarray.Dataset.persist` will try to load all of - your data into memory. You should make sure that your dataset is not larger than - available memory. - -.. note:: - - For more on the differences between :py:meth:`~xarray.Dataset.persist` and - :py:meth:`~xarray.Dataset.compute` see this `Stack Overflow answer on the differences between client persist and client compute `_ and the `Dask documentation `_. - -For performance you may wish to consider chunk sizes. The correct choice of -chunk size depends both on your data and on the operations you want to perform. -With xarray, both converting data to a Dask arrays and converting the chunk -sizes of Dask arrays is done with the :py:meth:`~xarray.Dataset.chunk` method: +NumPy ufuncs like :py:func:`numpy.sin` transparently work on all xarray objects, including those +that store lazy Dask arrays: .. ipython:: python - rechunked = ds.chunk({"latitude": 100, "longitude": 100}) + import numpy as np -.. warning:: + np.sin(ds) - Rechunking an existing dask array created with :py:func:`~xarray.open_mfdataset` - is not recommended (see above). +To access Dask arrays directly, use the :py:attr:`DataArray.data` attribute which exposes the DataArray's underlying array type. -You can view the size of existing chunks on an array by viewing the -:py:attr:`~xarray.Dataset.chunks` attribute: +If you're using a Dask cluster, you can also use :py:meth:`Dataset.persist` for quickly accessing intermediate outputs. This is most helpful after expensive operations like rechunking or setting an index. It's a way of telling the cluster that it should start executing the computations that you have defined so far, and that it should try to keep those results in memory. You will get back a new Dask array that is semantically equivalent to your old array, but now points to running data. -.. ipython:: python +.. 
code-block:: python - rechunked.chunks + ds = ds.persist() -If there are not consistent chunksizes between all the arrays in a dataset -along a particular dimension, an exception is raised when you try to access -``.chunks``. +.. tip:: -.. note:: + Remember to save the dataset returned by persist! Forgetting to assign the result is a common mistake. - In the future, we would like to enable automatic alignment of Dask - chunksizes (but not the other way around). We might also require that all - arrays in a dataset share the same chunking alignment. Neither of these - are currently done. +.. _dask.chunks: -NumPy ufuncs like ``np.sin`` transparently work on all xarray objects, including those -that store lazy Dask arrays: +Chunking and performance +~~~~~~~~~~~~~~~~~~~~~~~~ -.. ipython:: python +The way a dataset is chunked can be critical to performance when working with large datasets. You'll want chunk sizes large enough to reduce the number of chunks that Dask has to think about (to reduce overhead from the task graph) but also small enough so that many of them can fit in memory at once. - import numpy as np +.. tip:: - np.sin(rechunked) + A good rule of thumb is to create arrays with a minimum chunk size of at least one million elements (e.g., a 1000x1000 matrix). With large arrays (10+ GB), you may need larger chunks. See `Choosing good chunk sizes in Dask `_. -To access Dask arrays directly, use the -:py:attr:`DataArray.data ` attribute. This attribute exposes -array data either as a Dask array or as a NumPy array, depending on whether it has been -loaded into Dask or not: +It can be helpful to choose chunk sizes based on your downstream analyses and to chunk as early as possible. Datasets with smaller chunks along the time axis, for example, can make time domain problems easier to parallelize since Dask can perform the same operation on each time chunk. If you're working with a large dataset with chunks that make downstream analyses challenging, you may need to rechunk your data. This is an expensive operation, though, so it is only recommended when needed. -.. ipython:: python +You can chunk or rechunk a dataset by: - ds.temperature.data +- Specifying the ``chunks`` kwarg when reading in your dataset. If you know you'll want to do some spatial subsetting, for example, you could use ``chunks={'latitude': 10, 'longitude': 10}`` to specify small chunks across space. This can avoid loading subsets of data that span multiple chunks, thus reducing the number of file reads. Note, though, that this only works well when the requested chunks are similar to how the data is chunked on disk; otherwise it will be very slow and require a lot of network bandwidth. +- Many array file formats are chunked on disk. You can specify ``chunks={}`` to have a single dask chunk map to a single on-disk chunk, and ``chunks="auto"`` to have a single dask chunk be an automatically chosen multiple of the on-disk chunks. +- Using :py:meth:`Dataset.chunk` after you've already read in your dataset. For time domain problems, for example, you can use ``ds.chunk(time=TimeResampler())`` to rechunk according to a specified unit of time. ``ds.chunk(time=TimeResampler("MS"))``, for example, will set the chunks so that a month of data is contained in one chunk (see the sketch below). -.. note:: - ``.data`` is also used to expose other "computable" array backends beyond Dask and - NumPy (e.g. sparse and pint arrays). 
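+A minimal sketch of these options, reusing the ``example-data.nc`` file created earlier on this page (the chunk sizes are purely illustrative):
+
+.. code-block:: python
+
+    from xarray.groupers import TimeResampler
+
+    # small chunks across space, chosen when reading the data
+    ds_space = xr.open_dataset("example-data.nc", chunks={"latitude": 10, "longitude": 10})
+
+    # one dask chunk per on-disk chunk
+    ds_disk = xr.open_dataset("example-data.nc", chunks={})
+
+    # rechunk after reading so that each chunk holds one month of data
+    ds_monthly = ds_space.chunk(time=TimeResampler("MS"))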
+For large-scale rechunking tasks (e.g., converting a simulation dataset stored with chunking only along time to a dataset with chunking only across space), consider writing another copy of your data on disk and/or using dedicated tools such as `Rechunker `_. .. _dask.automatic-parallelization: -Automatic parallelization with ``apply_ufunc`` and ``map_blocks`` ------------------------------------------------------------------ - -.. tip:: +Parallelize custom functions with ``apply_ufunc`` and ``map_blocks`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Some problems can become embarrassingly parallel and thus easy to parallelize - automatically by rechunking to a frequency, e.g. ``ds.chunk(time=TimeResampler("YE"))``. - See :py:meth:`Dataset.chunk` for more. - -Almost all of xarray's built-in operations work on Dask arrays. If you want to -use a function that isn't wrapped by xarray, and have it applied in parallel on +Almost all of Xarray's built-in operations work on Dask arrays. If you want to +use a function that isn't wrapped by Xarray, and have it applied in parallel on each block of your xarray object, you have three options: -1. Extract Dask arrays from xarray objects (``.data``) and use Dask directly. -2. Use :py:func:`~xarray.apply_ufunc` to apply functions that consume and return NumPy arrays. -3. Use :py:func:`~xarray.map_blocks`, :py:meth:`Dataset.map_blocks` or :py:meth:`DataArray.map_blocks` +1. Use :py:func:`~xarray.apply_ufunc` to apply functions that consume and return NumPy arrays. +2. Use :py:func:`~xarray.map_blocks`, :py:meth:`Dataset.map_blocks` or :py:meth:`DataArray.map_blocks` to apply functions that consume and return xarray objects. +3. Extract Dask Arrays from xarray objects with :py:attr:`DataArray.data` and use Dask directly. + +.. tip:: + + See the extensive Xarray tutorial on `apply_ufunc `_. ``apply_ufunc`` -~~~~~~~~~~~~~~~ +############### :py:func:`~xarray.apply_ufunc` automates `embarrassingly parallel `__ "map" type operations where a function written for processing NumPy arrays should be repeatedly -applied to xarray objects containing Dask arrays. It works similarly to +applied to Xarray objects containing Dask Arrays. It works similarly to :py:func:`dask.array.map_blocks` and :py:func:`dask.array.blockwise`, but without -requiring an intermediate layer of abstraction. +requiring an intermediate layer of abstraction. See the `Dask documentation `__ for more details. For the best performance when using Dask's multi-threaded scheduler, wrap a function that already releases the global interpreter lock, which fortunately @@ -415,9 +333,7 @@ application. .. tip:: - For the majority of NumPy functions that are already wrapped by Dask, it's - usually a better idea to use the pre-existing ``dask.array`` function, by - using either a pre-existing xarray methods or + When possible, it's recommended to use pre-existing ``dask.array`` functions, either with existing xarray methods or :py:func:`~xarray.apply_ufunc()` with ``dask='allowed'``. Dask can often have a more efficient implementation that makes use of the specialized structure of a problem, unlike the generic speedups offered by @@ -425,10 +341,10 @@ application. ``map_blocks`` -~~~~~~~~~~~~~~ +############## -Functions that consume and return xarray objects can be easily applied in parallel using :py:func:`map_blocks`. 
-Your function will receive an xarray Dataset or DataArray subset to one chunk +Functions that consume and return Xarray objects can be easily applied in parallel using :py:func:`map_blocks`. +Your function will receive an Xarray Dataset or DataArray subset to one chunk along each chunked dimension. .. ipython:: python @@ -455,7 +371,7 @@ Notice that the :py:meth:`map_blocks` call printed ``func`` is received 0-sized blocks! :py:meth:`map_blocks` needs to know what the final result looks like in terms of dimensions, shapes etc. It does so by running the provided function on 0-shaped inputs (*automated inference*). This works in many cases, but not all. If automatic inference does not -work for your function, provide the ``template`` kwarg (see below). +work for your function, provide the ``template`` kwarg (see :ref:`below `). In this case, automatic inference has worked so let's check that the result is as expected. @@ -469,7 +385,6 @@ This executes the Dask graph in `serial` using a for loop, but allows for printi debugging techniques. We can easily see that our function is receiving blocks of shape 10x180x180 and the returned result is identical to ``ds.time`` as expected. - Here is a common example where automated inference will not work. .. ipython:: python @@ -489,6 +404,8 @@ what the function returns) with dimensions, shapes, chunk sizes, attributes, coo variables that look exactly like the expected result. The variables should be dask-backed and hence not incur much memory cost. +.. _template-note: + .. note:: Note that when ``template`` is provided, ``attrs`` from ``template`` are copied over to the result. Any @@ -533,61 +450,45 @@ Notice that the 0-shaped sizes were not printed to screen. Since ``template`` ha As :py:func:`map_blocks` loads each block into memory, reduce as much as possible objects consumed by user functions. For example, drop useless variables before calling ``func`` with :py:func:`map_blocks`. +Deploying Dask +-------------- +By default, Dask uses the multi-threaded scheduler, which distributes work across multiple cores on a single machine and allows for processing some datasets that do not fit into memory. However, this has two limitations: -Chunking and performance ------------------------- - -The ``chunks`` parameter has critical performance implications when using Dask -arrays. If your chunks are too small, queueing up operations will be extremely -slow, because Dask will translate each operation into a huge number of -operations mapped across chunks. Computation on Dask arrays with small chunks -can also be slow, because each operation on a chunk has some fixed overhead from -the Python interpreter and the Dask task executor. - -Conversely, if your chunks are too big, some of your computation may be wasted, -because Dask only computes results one chunk at a time. - -A good rule of thumb is to create arrays with a minimum chunksize of at least -one million elements (e.g., a 1000x1000 matrix). With large arrays (10+ GB), the -cost of queueing up Dask operations can be noticeable, and you may need even -larger chunksizes. - -.. tip:: - - Check out the `dask documentation on chunks `_. - -.. tip:: +- You are limited by the size of your hard drive +- Downloading data can be slow and expensive - Many time domain problems become amenable to an embarrassingly parallel or blockwise solution - (e.g. using :py:func:`xarray.map_blocks`, :py:func:`dask.array.map_blocks`, or - :py:func:`dask.array.blockwise`) by rechunking to a frequency along the time dimension. 
- Provide :py:class:`xarray.groupers.TimeResampler` objects to :py:meth:`Dataset.chunk` to do so. - For example ``ds.chunk(time=TimeResampler("MS"))`` will set the chunks so that a month of - data is contained in one chunk. The resulting chunk sizes need not be uniform, depending on - the frequency of the data, and the calendar. +Instead, it can be faster and cheaper to run your computations close to where your data is stored, distributed across many machines on a Dask cluster. Often, this means deploying Dask on HPC clusters or on the cloud. See the `Dask deployment documentation `__ for more details. +Best Practices +-------------- -Optimization Tips ------------------ +Dask is pretty easy to use but there are some gotchas, many of which are actively being worked on. Here are some tips we have found through experience. We also recommend checking out the `Dask best practices `_. -With analysis pipelines involving both spatial subsetting and temporal resampling, Dask performance -can become very slow or memory hungry in certain cases. Here are some optimization tips we have found -through experience: - -1. Do your spatial and temporal indexing (e.g. ``.sel()`` or ``.isel()``) early in the pipeline, especially before calling ``resample()`` or ``groupby()``. Grouping and resampling triggers some computation on all the blocks, which in theory should commute with indexing, but this optimization hasn't been implemented in Dask yet. (See `Dask issue #746 `_). +1. Do your spatial and temporal indexing (e.g. ``.sel()`` or ``.isel()``) early, especially before calling ``resample()`` or ``groupby()``. Grouping and resampling triggers some computation on all the blocks, which in theory should commute with indexing, but this optimization hasn't been implemented in Dask yet. (See `Dask issue #746 `_). 2. More generally, ``groupby()`` is a costly operation and will perform a lot better if the ``flox`` package is installed. See the `flox documentation `_ for more. By default Xarray will use ``flox`` if installed. 3. Save intermediate results to disk as a netCDF files (using ``to_netcdf()``) and then load them again with ``open_dataset()`` for further computations. For example, if subtracting temporal mean from a dataset, save the temporal mean to disk before subtracting. Again, in theory, Dask should be able to do the computation in a streaming fashion, but in practice this is a fail case for the Dask scheduler, because it tries to keep every chunk of an array that it computes in memory. (See `Dask issue #874 `_) -4. Specify smaller chunks across space when using :py:meth:`~xarray.open_mfdataset` (e.g., ``chunks={'latitude': 10, 'longitude': 10}``). This makes spatial subsetting easier, because there's no risk you will load subsets of data which span multiple chunks. On individual files, prefer to subset before chunking (suggestion 1). +4. Use the `Dask dashboard `_ to identify performance bottlenecks. + +Here's an example of a simplified workflow putting some of these tips together: + +.. code-block:: python -5. Chunk as early as possible, and avoid rechunking as much as possible. Always pass the ``chunks={}`` argument to :py:func:`~xarray.open_mfdataset` to avoid redundant file reads. + import xarray as xr -6. Using the h5netcdf package by passing ``engine='h5netcdf'`` to :py:meth:`~xarray.open_mfdataset` can be quicker than the default ``engine='netcdf4'`` that uses the netCDF4 package. 
+ ds = xr.open_zarr( # Since we're doing a spatial reduction, increase chunk size in x, y + "my-data.zarr", chunks={"x": 100, "y": 100} + ) + + time_subset = ds.sea_temperature.sel( + time=slice("2020-01-01", "2020-12-31") # Filter early + ) -7. Find `best practices specific to Dask arrays in the documentation `_. + # faster resampling when flox is installed + daily = time_subset.resample(time="D").mean() -8. The `dask diagnostics `_ can be useful in identifying performance bottlenecks. + daily.load() # Pull smaller results into memory after reducing the dataset diff --git a/doc/user-guide/io.rst b/doc/user-guide/io.rst index f4b3e5ab9f6..7175933dcbc 100644 --- a/doc/user-guide/io.rst +++ b/doc/user-guide/io.rst @@ -275,7 +275,7 @@ to automatically decode the values in the netCDF objects according to has an invalid "units" or "calendar" attribute. For these cases, you can turn this decoding off manually. -.. _CF conventions: http://cfconventions.org/ +.. _CF conventions: https://cfconventions.org/ You can view this encoding information (among others) in the :py:attr:`DataArray.encoding` and @@ -343,8 +343,8 @@ See its docstring for more details. (``compat='override'``). -.. _dask: http://dask.org -.. _blog post: http://stephanhoyer.com/2015/06/11/xray-dask-out-of-core-labeled-arrays/ +.. _dask: https://www.dask.org +.. _blog post: https://stephanhoyer.com/2015/06/11/xray-dask-out-of-core-labeled-arrays/ Sometimes multi-file datasets are not conveniently organized for easy use of :py:func:`open_mfdataset`. One can use the ``preprocess`` argument to provide a function that takes a dataset @@ -496,7 +496,7 @@ If character arrays are used: Technically, you can use `any string encoding recognized by Python `_ if you feel the need to deviate from UTF-8, by setting the ``_Encoding`` field in ``encoding``. But - `we don't recommend it `_. + `we don't recommend it `_. - The character dimension name can be specified by the ``char_dim_name`` field of a variable's ``encoding``. If the name of the character dimension is not specified, the default is ``f'string{data.shape[-1]}'``. When decoding character arrays from existing files, the @@ -1395,7 +1395,7 @@ For CSV files, one might also consider `xarray_extras`_. .. _xarray_extras: https://xarray-extras.readthedocs.io/en/latest/api/csv.html -.. _IO tools: http://pandas.pydata.org/pandas-docs/stable/io.html +.. 
_IO tools: https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html Third party libraries diff --git a/pyproject.toml b/pyproject.toml index b886ee78b6f..55c8d92bfdb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -234,41 +234,40 @@ extend-exclude = [ ] [tool.ruff.lint] -# E402: module level import not at top of file -# E501: line too long - let the formatter worry about that -# E731: do not assign a lambda expression, use a def +extend-select = [ + "F", # Pyflakes + "E", # pycodestyle errors + "W", # pycodestyle warnings + "I", # isort + "UP", # pyupgrade + "B", # flake8-bugbear + "C4", # flake8-comprehensions + "PIE", # flake8-pie + "TID", # flake8-tidy-imports (absolute imports) + "PGH", # pygrep-hooks + "PERF", # Perflint + "RUF", +] extend-safe-fixes = [ - "TID252", # absolute imports + "TID252", # absolute imports ] ignore = [ - "C40", - "E402", - "E501", - "E731", - "UP007", - "PERF20", + "E402", # module level import not at top of file + "E501", # line too long - let the formatter worry about that + "E731", # do not assign a lambda expression, use a def + "UP007", # use X | Y for type annotations + "UP027", # deprecated + "C40", # unnecessary generator, comprehension, or literal "PIE790", # unnecessary pass statement - "RUF001", - "RUF002", - "RUF003", - "RUF005", - "RUF012", -] extend-select = [ - "B", # flake8-bugbear - "C4", # flake8-comprehensions - "F", # Pyflakes - "E", # Pycodestyle - "W", - "TID", # flake8-tidy-imports (absolute imports) - "I", # isort - "PERF", # Perflint - "PIE", # flake8-pie - "PGH", # pygrep-hooks - "RUF", - "UP", # Pyupgrade + "PERF203", # try-except within a loop incurs performance overhead + "RUF001", # string contains ambiguous unicode character + "RUF002", # docstring contains ambiguous acute accent unicode character + "RUF003", # comment contains ambiguous no-break space unicode character + "RUF005", # consider unpacking operator instead of concatenation + "RUF012", # mutable class attributes ] + [tool.ruff.lint.per-file-ignores] # don't enforce absolute imports "asv_bench/**" = ["TID252"] diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 030395b26fb..fcbf1f8c4a0 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -250,9 +250,9 @@ def _determine_zarr_chunks( # if there are no chunks in encoding but there are dask chunks, we try to # use the same chunks in zarr # However, zarr chunks needs to be uniform for each array - # http://zarr.readthedocs.io/en/latest/spec/v1.html#chunks + # https://zarr-specs.readthedocs.io/en/latest/v2/v2.0.html#chunks # while dask chunks can be variable sized - # http://dask.pydata.org/en/latest/array-design.html#chunks + # https://dask.pydata.org/en/latest/array-design.html#chunks if var_chunks and not enc_chunks: if any(len(set(chunks[:-1])) > 1 for chunks in var_chunks): raise ValueError( @@ -1377,7 +1377,7 @@ def open_zarr( References ---------- - http://zarr.readthedocs.io/ + https://zarr.readthedocs.io/ """ from xarray.backends.api import open_dataset diff --git a/xarray/coding/strings.py b/xarray/coding/strings.py index d16ec52d645..4ca6a3f0a46 100644 --- a/xarray/coding/strings.py +++ b/xarray/coding/strings.py @@ -210,7 +210,7 @@ def _numpy_char_to_bytes(arr): # see https://github.com/numpy/numpy/issues/25916 # and https://github.com/numpy/numpy/pull/25922 copy = None if HAS_NUMPY_2_0 else False - # based on: http://stackoverflow.com/a/10984878/809705 + # based on: https://stackoverflow.com/a/10984878/809705 arr = np.array(arr, copy=copy, order="C") dtype = "S" + 
str(arr.shape[-1]) return arr.view(dtype).reshape(arr.shape[:-1]) diff --git a/xarray/conventions.py b/xarray/conventions.py index 133dbf00063..f315d9d3e2d 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -805,7 +805,7 @@ def cf_encoder(variables: T_Variables, attributes: T_Attrs): # Remove attrs from bounds variables (issue #2921) for var in new_vars.values(): - bounds = var.attrs["bounds"] if "bounds" in var.attrs else None + bounds = var.attrs.get("bounds") if bounds and bounds in new_vars: # see http://cfconventions.org/cf-conventions/cf-conventions.html#cell-boundaries for attr in [ diff --git a/xarray/core/common.py b/xarray/core/common.py index 17f83b7f310..28d8ffb1bcd 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -352,7 +352,7 @@ def __dir__(self) -> list[str]: def _ipython_key_completions_(self) -> list[str]: """Provide method for the key-autocompletions in IPython. - See http://ipython.readthedocs.io/en/stable/config/integrating.html#tab-completion + See https://ipython.readthedocs.io/en/stable/config/integrating.html#tab-completion For the details. """ items = { diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index d434c114653..99e68d691c4 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -6999,7 +6999,7 @@ def groupby_bins( References ---------- - .. [1] http://pandas.pydata.org/pandas-docs/stable/generated/pandas.cut.html + .. [1] https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.cut.html """ from xarray.core.groupby import ( DataArrayGroupBy, @@ -7492,7 +7492,7 @@ def resample( References ---------- - .. [1] http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases + .. [1] https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases """ from xarray.core.resample import DataArrayResample diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 64400b5d5d7..5decc5f5d6e 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -10618,7 +10618,7 @@ def groupby_bins( References ---------- - .. [1] http://pandas.pydata.org/pandas-docs/stable/generated/pandas.cut.html + .. [1] https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.cut.html """ from xarray.core.groupby import ( DatasetGroupBy, @@ -10884,7 +10884,7 @@ def resample( References ---------- - .. [1] http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases + .. [1] https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases """ from xarray.core.resample import DatasetResample diff --git a/xarray/core/datatree.py b/xarray/core/datatree.py index 6d673389e05..ee90cf7477c 100644 --- a/xarray/core/datatree.py +++ b/xarray/core/datatree.py @@ -749,7 +749,7 @@ def _item_sources(self) -> Iterable[Mapping[Any, Any]]: def _ipython_key_completions_(self) -> list[str]: """Provide method for the key-autocompletions in IPython. - See http://ipython.readthedocs.io/en/stable/config/integrating.html#tab-completion + See https://ipython.readthedocs.io/en/stable/config/integrating.html#tab-completion For the details. """ diff --git a/xarray/core/formatting_html.py b/xarray/core/formatting_html.py index d0cb7c30e91..e6ae7d77dc6 100644 --- a/xarray/core/formatting_html.py +++ b/xarray/core/formatting_html.py @@ -155,7 +155,9 @@ def summarize_index(coord_names, index) -> str: return ( f"
{name}
" f"
{preview}
" - f"
" + # need empty input + label here to conform to the fixed CSS grid layout + f"" + f"" f"" f"" f"
{details}
" diff --git a/xarray/core/parallel.py b/xarray/core/parallel.py index a0dfe56807b..6d6a6672470 100644 --- a/xarray/core/parallel.py +++ b/xarray/core/parallel.py @@ -600,7 +600,7 @@ def _wrapper( # unchunked dimensions in the input have one chunk in the result # output can have new dimensions with exactly one chunk key: tuple[Any, ...] = (gname_l,) + tuple( - chunk_index[dim] if dim in chunk_index else 0 for dim in variable.dims + chunk_index.get(dim, 0) for dim in variable.dims ) # We're adding multiple new layers to the graph: diff --git a/xarray/core/utils.py b/xarray/core/utils.py index 3be10d016e9..3ea8d388198 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -1008,7 +1008,7 @@ def parse_ordered_dims( def _check_dims(dim: Set[Hashable], all_dims: Set[Hashable]) -> None: wrong_dims = (dim - all_dims) - {...} if wrong_dims: - wrong_dims_str = ", ".join(f"'{d!s}'" for d in wrong_dims) + wrong_dims_str = ", ".join(f"'{d}'" for d in wrong_dims) raise ValueError( f"Dimension(s) {wrong_dims_str} do not exist. Expected one or more of {all_dims}" ) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 9bb68e96ca8..dbc7ffb753c 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -52,6 +52,7 @@ is_chunked_array, to_duck_array, ) +from xarray.namedarray.utils import module_available from xarray.util.deprecation_helpers import deprecate_dims NON_NUMPY_SUPPORTED_ARRAY_TYPES = ( @@ -1171,10 +1172,10 @@ def _pad_options_dim_to_index( if fill_with_shape: return [ - (n, n) if d not in pad_option else pad_option[d] + pad_option.get(d, (n, n)) for d, n in zip(self.dims, self.data.shape, strict=True) ] - return [(0, 0) if d not in pad_option else pad_option[d] for d in self.dims] + return [pad_option.get(d, (0, 0)) for d in self.dims] def pad( self, @@ -1972,7 +1973,7 @@ def _wrapper(npa, **kwargs): output_core_dims=[["quantile"]], output_dtypes=[np.float64], dask_gufunc_kwargs=dict(output_sizes={"quantile": len(q)}), - dask="parallelized", + dask="allowed" if module_available("dask", "2024.11.0") else "parallelized", kwargs=kwargs, ) diff --git a/xarray/static/css/style.css b/xarray/static/css/style.css index d4f5c104850..b1cefeb2af9 100644 --- a/xarray/static/css/style.css +++ b/xarray/static/css/style.css @@ -75,6 +75,7 @@ body.vscode-dark { .xr-section-item input { display: inline-block; opacity: 0; + height: 0; } .xr-section-item input + label { diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 3999aec9cbb..0da5fd13649 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -110,6 +110,7 @@ def _importorskip( has_dask_ge_2024_08_1, requires_dask_ge_2024_08_1 = _importorskip( "dask", minversion="2024.08.1" ) +has_dask_ge_2024_11_0, requires_dask_ge_2024_11_0 = _importorskip("dask", "2024.11.0") with warnings.catch_warnings(): warnings.filterwarnings( "ignore", diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index 89ddcc21783..94247da1e6b 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -193,7 +193,7 @@ def test_groupby_da_datetime() -> None: def test_groupby_duplicate_coordinate_labels() -> None: - # fix for http://stackoverflow.com/questions/38065129 + # fix for https://stackoverflow.com/questions/38065129 array = xr.DataArray([1, 2, 3], [("x", [1, 1, 2])]) expected = xr.DataArray([3, 3], [("x", [1, 2])]) actual = array.groupby("x").sum() @@ -1670,7 +1670,7 @@ def test_groupby_bins( # the first value should not be part of any group ("right" binning) array[0] = 99 
# bins follow conventions for pandas.cut - # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.cut.html + # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.cut.html bins = [0, 1.5, 5] df = array.to_dataframe() @@ -1968,7 +1968,7 @@ def test_resample_first(self) -> None: expected = DataArray([np.nan, 4, 8], [("time", times[::4])]) assert_identical(expected, actual) - # regression test for http://stackoverflow.com/questions/33158558/ + # regression test for https://stackoverflow.com/questions/33158558/ array = Dataset({"time": times})["time"] actual = array.resample(time="1D").last() expected_times = pd.to_datetime( diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 9ebd4e4a4d3..0ed47c2b5fe 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -36,6 +36,7 @@ assert_equal, assert_identical, assert_no_warnings, + has_dask_ge_2024_11_0, has_pandas_3, raise_if_dask_computes, requires_bottleneck, @@ -1871,9 +1872,16 @@ def test_quantile_interpolation_deprecation(self, method) -> None: def test_quantile_chunked_dim_error(self): v = Variable(["x", "y"], self.d).chunk({"x": 2}) - # this checks for ValueError in dask.array.apply_gufunc - with pytest.raises(ValueError, match=r"consists of multiple chunks"): - v.quantile(0.5, dim="x") + if has_dask_ge_2024_11_0: + # Dask rechunks + np.testing.assert_allclose( + v.compute().quantile(0.5, dim="x"), v.quantile(0.5, dim="x") + ) + + else: + # this checks for ValueError in dask.array.apply_gufunc + with pytest.raises(ValueError, match=r"consists of multiple chunks"): + v.quantile(0.5, dim="x") @pytest.mark.parametrize("compute_backend", ["numbagg", None], indirect=True) @pytest.mark.parametrize("q", [-0.1, 1.1, [2], [0.25, 2]])