From c5d3bdfdd6200002f81f8aea20fb762bc76b6b22 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Sun, 3 Nov 2024 20:21:42 +0000 Subject: [PATCH] ci: use duckdb instead of ibis to test interchange-only support (#3672) Co-authored-by: dangotbanned <125183946+dangotbanned@users.noreply.github.com> --- altair/utils/data.py | 5 +- pyproject.toml | 2 +- tests/__init__.py | 60 ++++++++++++++++----- tests/utils/test_to_values_narwhals.py | 20 +------ tests/utils/test_utils.py | 4 -- tests/vegalite/v5/test_api.py | 75 ++++++++++++++++---------- 6 files changed, 101 insertions(+), 65 deletions(-) diff --git a/altair/utils/data.py b/altair/utils/data.py index 91ed54a510..89ec31f1f3 100644 --- a/altair/utils/data.py +++ b/altair/utils/data.py @@ -65,8 +65,9 @@ class SupportsGeoInterface(Protocol): def is_data_type(obj: Any) -> TypeIs[DataType]: - return _is_pandas_dataframe(obj) or isinstance( - obj, (dict, DataFrameLike, SupportsGeoInterface, nw.DataFrame) + return isinstance(obj, (dict, SupportsGeoInterface)) or isinstance( + nw.from_native(obj, eager_or_interchange_only=True, strict=False), + nw.DataFrame, ) diff --git a/pyproject.toml b/pyproject.toml index 21c1a13423..5b5da88f93 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,7 +69,7 @@ all = [ dev = [ "hatch", "ruff>=0.6.0", - "ibis-framework[polars]", + "duckdb>=1.0", "ipython[kernel]", "pandas>=1.1.3", "pytest", diff --git a/tests/__init__.py b/tests/__init__.py index de79de20ba..1cda564382 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -2,16 +2,17 @@ import pkgutil import re +import sys from importlib.util import find_spec -from typing import TYPE_CHECKING +from pathlib import Path +from typing import TYPE_CHECKING, Any import pytest from tests import examples_arguments_syntax, examples_methods_syntax if TYPE_CHECKING: - import sys - from collections.abc import Collection, Iterator, Mapping + from collections.abc import Callable, Collection, Iterator, Mapping from re import Pattern if 
sys.version_info >= (3, 11): @@ -24,6 +25,21 @@ "pytest.MarkDecorator | Collection[pytest.MarkDecorator | pytest.Mark]" ) + +def windows_has_tzdata() -> bool: + """ + From PyArrow: python/pyarrow/tests/util.py. + + This is the default location where tz.cpp will look for (until we make + this configurable at run-time) + + Skip test on Windows when the tz database is not configured. + + See https://github.com/vega/altair/issues/3050. + """ + return (Path.home() / "Downloads" / "tzdata").exists() + + slow: pytest.MarkDecorator = pytest.mark.slow() """ Custom ``pytest.mark`` decorator. @@ -69,17 +85,37 @@ """ -skip_requires_pyarrow: pytest.MarkDecorator = pytest.mark.skipif( - find_spec("pyarrow") is None, reason="`pyarrow` not installed." -) -""" -``pytest.mark.skipif`` decorator. +def skip_requires_pyarrow( + fn: Callable[..., Any] | None = None, /, *, requires_tzdata: bool = False +) -> Callable[..., Any]: + """ + ``pytest.mark.skipif`` decorator. -Applies when `pyarrow`_ import would fail. + Applies when `pyarrow`_ import would fail. -.. _pyarrow: - https://pypi.org/project/pyarrow/ -""" + Additionally, we mark as expected to fail on `Windows`. + + https://github.com/vega/altair/issues/3050 + + .. _pyarrow: + https://pypi.org/project/pyarrow/ + """ + composed = pytest.mark.skipif( + find_spec("pyarrow") is None, reason="`pyarrow` not installed." 
+ ) + if requires_tzdata: + composed = pytest.mark.xfail( + sys.platform == "win32" and not windows_has_tzdata(), + reason="Timezone database is not installed on Windows", + )(composed) + + def wrap(test_fn: Callable[..., Any], /) -> Callable[..., Any]: + return composed(test_fn) + + if fn is None: + return wrap + else: + return wrap(fn) def id_func_str_only(val) -> str: diff --git a/tests/utils/test_to_values_narwhals.py b/tests/utils/test_to_values_narwhals.py index 81cc5d348f..2c7b3c7dea 100644 --- a/tests/utils/test_to_values_narwhals.py +++ b/tests/utils/test_to_values_narwhals.py @@ -1,7 +1,5 @@ import re -import sys from datetime import datetime -from pathlib import Path import narwhals.stable.v1 as nw import pandas as pd @@ -11,23 +9,7 @@ from tests import skip_requires_pyarrow -def windows_has_tzdata(): - """ - From PyArrow: python/pyarrow/tests/util.py. - - This is the default location where tz.cpp will look for (until we make - this configurable at run-time) - """ - return Path.home().joinpath("Downloads", "tzdata").exists() - - -# Skip test on Windows when the tz database is not configured. -# See https://github.com/vega/altair/issues/3050. 
-@pytest.mark.skipif( - sys.platform == "win32" and not windows_has_tzdata(), - reason="Timezone database is not installed on Windows", -) -@skip_requires_pyarrow +@skip_requires_pyarrow(requires_tzdata=True) def test_arrow_timestamp_conversion(): """Test that arrow timestamp values are converted to ISO-8601 strings.""" import pyarrow as pa diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py index f83f76528f..4b542f0c37 100644 --- a/tests/utils/test_utils.py +++ b/tests/utils/test_utils.py @@ -1,6 +1,5 @@ import io import json -import sys import narwhals.stable.v1 as nw import numpy as np @@ -121,9 +120,6 @@ def test_sanitize_dataframe_arrow_columns(): @skip_requires_pyarrow -@pytest.mark.xfail( - sys.platform == "win32", reason="Timezone database is not installed on Windows" -) def test_sanitize_pyarrow_table_columns() -> None: import pyarrow as pa diff --git a/tests/vegalite/v5/test_api.py b/tests/vegalite/v5/test_api.py index f6cd4ee6d6..7c4b6a151f 100644 --- a/tests/vegalite/v5/test_api.py +++ b/tests/vegalite/v5/test_api.py @@ -15,7 +15,7 @@ from importlib.util import find_spec from typing import TYPE_CHECKING -import ibis +import duckdb import jsonschema import narwhals.stable.v1 as nw import pandas as pd @@ -26,7 +26,7 @@ import altair as alt from altair.utils.core import use_signature from altair.utils.schemapi import Optional, SchemaValidationError, Undefined -from tests import skip_requires_vl_convert, slow +from tests import skip_requires_pyarrow, skip_requires_vl_convert, slow if TYPE_CHECKING: from typing import Any @@ -1607,20 +1607,15 @@ def test_polars_with_pandas_nor_pyarrow(monkeypatch: pytest.MonkeyPatch): assert "numpy" not in sys.modules -@pytest.mark.skipif( - Version("1.5") > PANDAS_VERSION, - reason="A warning is thrown on old pandas versions", -) -@pytest.mark.xfail( - sys.platform == "win32", reason="Timezone database is not installed on Windows" -) -def test_ibis_with_date_32(): - ibis.set_backend("polars") - df = 
pl.DataFrame( +@skip_requires_pyarrow(requires_tzdata=True) +def test_interchange_with_date_32(): + # Test that objects which Narwhals only supports at the interchange + # level can be plotted when they contain date32 columns. + df = pl.DataFrame( # noqa: F841 {"a": [1, 2, 3], "b": [date(2020, 1, 1), date(2020, 1, 2), date(2020, 1, 3)]} ) - tbl = ibis.memtable(df) - result = alt.Chart(tbl).mark_line().encode(x="a", y="b").to_dict() + rel = duckdb.sql("select * from df") + result = alt.Chart(rel).mark_line().encode(x="a", y="b").to_dict() assert next(iter(result["datasets"].values())) == [ {"a": 1, "b": "2020-01-01T00:00:00"}, {"a": 2, "b": "2020-01-02T00:00:00"}, @@ -1628,30 +1623,56 @@ def test_ibis_with_date_32(): ] -@pytest.mark.skipif( - Version("1.5") > PANDAS_VERSION, - reason="A warning is thrown on old pandas versions", -) -@pytest.mark.xfail( - sys.platform == "win32", reason="Timezone database is not installed on Windows" -) -def test_ibis_with_vegafusion(monkeypatch: pytest.MonkeyPatch): - ibis.set_backend("polars") - df = pl.DataFrame( +@skip_requires_pyarrow(requires_tzdata=True) +def test_interchange_with_vegafusion(monkeypatch: pytest.MonkeyPatch): + # Test that objects which Narwhals only supports at the interchange + # level don't get converted to PyArrow unnecessarily when plotted + # with the vegafusion transformer. + # TODO: this test can be drastically simplified when there is some level of + # DuckDB support in VegaFusion, as it can then just be `alt.Chart(rel_df)` + # without DuckDBWithInterchangeSupport. + df = pl.DataFrame( # noqa: F841 { "a": [1, 2, 3], "b": [datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 3)], } ) - tbl = ibis.memtable(df) + rel = duckdb.sql("select * from df") + + class DuckDBWithInterchangeSupport: + """ + DuckDB doesn't (yet?) support the interchange protocol. + + So, we create a duckdb wrapper which defers to PyArrow's + implementation of the protocol. 
+ """ + + def __init__(self, rel: duckdb.DuckDBPyRelation) -> None: + self._rel = rel + + def __dataframe__(self, allow_copy: bool = True) -> object: + return self._rel.to_arrow_table().__dataframe__() + + rel_df = DuckDBWithInterchangeSupport(rel) # "poison" `arrow_table_from_dfi_dataframe` to check that it does not get called # if we use the vegafusion transformer monkeypatch.setattr( "altair.utils.data.arrow_table_from_dfi_dataframe", lambda x: 1 / 0 ) - tbl = ibis.memtable(df) + + # Narwhals doesn't fully support our custom DuckDBWithInterchangeSupport, + # so we need to overwrite `to_native` + def to_native(df, strict): + if isinstance(df, nw.DataFrame): + return rel_df + return df + + monkeypatch.setattr("narwhals.stable.v1.to_native", to_native) + with alt.data_transformers.enable("vegafusion"): - result = alt.Chart(tbl).mark_line().encode(x="a", y="b").to_dict(format="vega") + result = ( + alt.Chart(rel_df).mark_line().encode(x="a", y="b").to_dict(format="vega") + ) assert next(iter(result["data"]))["values"] == [ {"a": 1, "b": "2020-01-01T00:00:00.000"}, {"a": 2, "b": "2020-01-02T00:00:00.000"},