From 243b61ee2a06a97016d4bc1df54410ff21084fa8 Mon Sep 17 00:00:00 2001
From: Alexander Beedie
Date: Wed, 29 May 2024 12:33:17 +0400
Subject: [PATCH] feat(python): Add top-level `pl.sql` function (#16528)

---
 py-polars/polars/__init__.py                  |   3 +-
 py-polars/polars/sql/__init__.py              |   2 +
 py-polars/polars/sql/context.py               |  10 ++
 py-polars/polars/sql/functions.py             | 115 ++++++++++++++++++
 py-polars/tests/unit/sql/test_joins.py        |   7 +-
 .../tests/unit/sql/test_miscellaneous.py      |  20 +--
 py-polars/tests/unit/sql/test_numeric.py      |   2 +-
 py-polars/tests/unit/sql/test_temporal.py     |   6 +-
 .../tests/unit/sql/test_trigonometric.py      |  43 +++----
 9 files changed, 167 insertions(+), 41 deletions(-)
 create mode 100644 py-polars/polars/sql/functions.py

diff --git a/py-polars/polars/__init__.py b/py-polars/polars/__init__.py
index e24215f85f36..007da7153c3b 100644
--- a/py-polars/polars/__init__.py
+++ b/py-polars/polars/__init__.py
@@ -215,7 +215,7 @@
     threadpool_size,
 )
 from polars.series import Series
-from polars.sql import SQLContext
+from polars.sql import SQLContext, sql
 from polars.string_cache import (
     StringCache,
     disable_string_cache,
@@ -430,6 +430,7 @@
     "from_repr",
     # polars.sql
     "SQLContext",
+    "sql",
     # polars.utils
     "build_info",
     "get_index_type",
diff --git a/py-polars/polars/sql/__init__.py b/py-polars/polars/sql/__init__.py
index 35ce9fba0b72..e92fd23272e8 100644
--- a/py-polars/polars/sql/__init__.py
+++ b/py-polars/polars/sql/__init__.py
@@ -1,5 +1,7 @@
 from polars.sql.context import SQLContext
+from polars.sql.functions import sql
 
 __all__ = [
     "SQLContext",
+    "sql",
 ]
diff --git a/py-polars/polars/sql/context.py b/py-polars/polars/sql/context.py
index 70a8a78e7e1a..cf0215ef20df 100644
--- a/py-polars/polars/sql/context.py
+++ b/py-polars/polars/sql/context.py
@@ -64,6 +64,16 @@ def __init__(
         **named_frames: DataFrame | LazyFrame | None,
     ) -> None: ...
 
+    @overload
+    def __init__(
+        self: SQLContext[DataFrame],
+        frames: Mapping[str, DataFrame | LazyFrame | None] | None = ...,
+        *,
+        register_globals: bool | int = ...,
+        eager_execution: bool,
+        **named_frames: DataFrame | LazyFrame | None,
+    ) -> None: ...
+
     def __init__(
         self,
         frames: Mapping[str, DataFrame | LazyFrame | None] | None = None,
diff --git a/py-polars/polars/sql/functions.py b/py-polars/polars/sql/functions.py
new file mode 100644
index 000000000000..798d3228d065
--- /dev/null
+++ b/py-polars/polars/sql/functions.py
@@ -0,0 +1,115 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Literal, overload
+
+if TYPE_CHECKING:
+    from polars.dataframe import DataFrame
+    from polars.lazyframe import LazyFrame
+
+
+@overload
+def sql(query: str, *, eager: Literal[False] = False) -> LazyFrame: ...
+
+
+@overload
+def sql(query: str, *, eager: Literal[True]) -> DataFrame: ...
+
+
+def sql(query: str, *, eager: bool = False) -> DataFrame | LazyFrame:
+    """
+    Execute a SQL query against frames in the global namespace.
+
+    .. versionadded:: 0.20.31
+
+    .. warning::
+        This functionality is considered **unstable**, although it is close to
+        being considered stable. It may be changed at any point without it being
+        considered a breaking change.
+
+    Parameters
+    ----------
+    query
+        SQL query to execute.
+    eager
+        Automatically collect the result and return a DataFrame instead of a LazyFrame.
+
+    Notes
+    -----
+    * More control over registration and execution behaviour is available by
+      using the :class:`SQLContext` object.
+
+    See Also
+    --------
+    SQLContext
+
+    Examples
+    --------
+    >>> lf1 = pl.LazyFrame({"a": [1, 2, 3], "b": [6, 7, 8], "c": ["z", "y", "x"]})
+    >>> lf2 = pl.LazyFrame({"a": [3, 2, 1], "d": [125, -654, 888]})
+
+    Query the LazyFrame using SQL:
+
+    >>> lf1.sql("SELECT c, b FROM self WHERE a > 1").collect()
+    shape: (2, 2)
+    ┌─────┬─────┐
+    │ c   ┆ b   │
+    │ --- ┆ --- │
+    │ str ┆ i64 │
+    ╞═════╪═════╡
+    │ y   ┆ 7   │
+    │ x   ┆ 8   │
+    └─────┴─────┘
+
+    Join two LazyFrames:
+
+    >>> pl.sql(
+    ...     '''
+    ...     SELECT lf1.*, d
+    ...     FROM lf1
+    ...     INNER JOIN lf2 USING (a)
+    ...     WHERE a > 1 AND b < 8
+    ...     '''
+    ... ).collect()
+    shape: (1, 4)
+    ┌─────┬─────┬─────┬──────┐
+    │ a   ┆ b   ┆ c   ┆ d    │
+    │ --- ┆ --- ┆ --- ┆ ---  │
+    │ i64 ┆ i64 ┆ str ┆ i64  │
+    ╞═════╪═════╪═════╪══════╡
+    │ 2   ┆ 7   ┆ y   ┆ -654 │
+    └─────┴─────┴─────┴──────┘
+
+    Apply SQL transforms to the globally registered frames and subsequently
+    filter natively (you can freely mix SQL and native operations):
+
+    >>> pl.sql(
+    ...     query='''
+    ...     SELECT
+    ...         a,
+    ...         (a % 2 == 0) AS a_is_even,
+    ...         (b::float4 / 2) AS "b/2",
+    ...         CONCAT_WS(':', c, c, c) AS c_c_c
+    ...     FROM lf1
+    ...     ORDER BY a
+    ...     ''',
+    ... ).filter(~pl.col("c_c_c").str.starts_with("x")).collect()
+    shape: (2, 4)
+    ┌─────┬───────────┬─────┬───────┐
+    │ a   ┆ a_is_even ┆ b/2 ┆ c_c_c │
+    │ --- ┆ ---       ┆ --- ┆ ---   │
+    │ i64 ┆ bool      ┆ f32 ┆ str   │
+    ╞═════╪═══════════╪═════╪═══════╡
+    │ 1   ┆ false     ┆ 3.0 ┆ z:z:z │
+    │ 2   ┆ true      ┆ 3.5 ┆ y:y:y │
+    └─────┴───────────┴─────┴───────┘
+    """
+    from polars.sql import SQLContext
+
+    with SQLContext(
+        eager_execution=eager,
+        register_globals=True,
+    ) as ctx:
+        return ctx.execute(query)
+
+
+__all__ = ["sql"]
diff --git a/py-polars/tests/unit/sql/test_joins.py b/py-polars/tests/unit/sql/test_joins.py
index a498cbb6629e..8c8d3069ddc3 100644
--- a/py-polars/tests/unit/sql/test_joins.py
+++ b/py-polars/tests/unit/sql/test_joins.py
@@ -73,14 +73,15 @@ def test_join_inner(foods_ipc_path: Path, join_clause: str) -> None:
     foods1 = pl.scan_ipc(foods_ipc_path)
     foods2 = foods1  # noqa: F841
 
-    out = foods1.sql(
+    out = pl.sql(
         f"""
         SELECT *
         FROM foods1
         INNER JOIN foods2 {join_clause}
         LIMIT 2
-        """
-    ).collect()
+        """,
+        eager=True,
+    )
 
     assert out.to_dict(as_series=False) == {
         "category": ["vegetables", "vegetables"],
diff --git a/py-polars/tests/unit/sql/test_miscellaneous.py b/py-polars/tests/unit/sql/test_miscellaneous.py
index d32674db78ff..aaa51904bb4f 100644
--- a/py-polars/tests/unit/sql/test_miscellaneous.py
+++ b/py-polars/tests/unit/sql/test_miscellaneous.py
@@ -24,16 +24,16 @@ def test_any_all() -> None:
     res = df.sql(
         """
         SELECT
-            x >= ALL(df.y) as 'All Geq',
-            x > ALL(df.y) as 'All G',
-            x < ALL(df.y) as 'All L',
-            x <= ALL(df.y) as 'All Leq',
-            x >= ANY(df.y) as 'Any Geq',
-            x > ANY(df.y) as 'Any G',
-            x < ANY(df.y) as 'Any L',
-            x <= ANY(df.y) as 'Any Leq',
-            x == ANY(df.y) as 'Any eq',
-            x != ANY(df.y) as 'Any Neq',
+            x >= ALL(df.y) AS "All Geq",
+            x > ALL(df.y) AS "All G",
+            x < ALL(df.y) AS "All L",
+            x <= ALL(df.y) AS "All Leq",
+            x >= ANY(df.y) AS "Any Geq",
+            x > ANY(df.y) AS "Any G",
+            x < ANY(df.y) AS "Any L",
+            x <= ANY(df.y) AS "Any Leq",
+            x == ANY(df.y) AS "Any eq",
+            x != ANY(df.y) AS "Any Neq",
         FROM df
         """,
     )
diff --git a/py-polars/tests/unit/sql/test_numeric.py b/py-polars/tests/unit/sql/test_numeric.py
index 5e23189c79f7..f8b37b1dd642 100644
--- a/py-polars/tests/unit/sql/test_numeric.py
+++ b/py-polars/tests/unit/sql/test_numeric.py
@@ -29,7 +29,7 @@ def test_modulo() -> None:
             b % 3 AS b3,
             MOD(c, 4) AS c4,
             MOD(d, 5.5) AS d55
-        FROM df
+        FROM self
         """
     )
 
diff --git a/py-polars/tests/unit/sql/test_temporal.py b/py-polars/tests/unit/sql/test_temporal.py
index 51715c4e0b5e..cf9080d69119 100644
--- a/py-polars/tests/unit/sql/test_temporal.py
+++ b/py-polars/tests/unit/sql/test_temporal.py
@@ -34,7 +34,7 @@ def test_date() -> None:
 
 @pytest.mark.parametrize("time_unit", ["ms", "us", "ns"])
 def test_datetime_to_time(time_unit: Literal["ns", "us", "ms"]) -> None:
-    df = pl.DataFrame(
+    df = pl.DataFrame(  # noqa: F841
         {
             "dtm": [
                 datetime(2099, 12, 31, 23, 59, 59),
@@ -46,8 +46,8 @@ def test_datetime_to_time(time_unit: Literal["ns", "us", "ms"]) -> None:
         schema={"dtm": pl.Datetime(time_unit)},
     )
 
-    res = df.sql("SELECT dtm::time as tm from df")["tm"].to_list()
-    assert res == [
+    res = pl.sql("SELECT dtm::time AS tm from df").collect()
+    assert res["tm"].to_list() == [
         time(23, 59, 59),
         time(12, 30, 30),
         time(1, 1, 1),
diff --git a/py-polars/tests/unit/sql/test_trigonometric.py b/py-polars/tests/unit/sql/test_trigonometric.py
index 5cae8bbe7f76..7e52174d6805 100644
--- a/py-polars/tests/unit/sql/test_trigonometric.py
+++ b/py-polars/tests/unit/sql/test_trigonometric.py
@@ -8,24 +8,21 @@
 
 def test_arctan2() -> None:
     twoRootTwo = math.sqrt(2) / 2.0
-    df = pl.DataFrame(
+    df = pl.DataFrame(  # noqa: F841
         {
             "y": [twoRootTwo, -twoRootTwo, twoRootTwo, -twoRootTwo],
             "x": [twoRootTwo, twoRootTwo, -twoRootTwo, -twoRootTwo],
         }
     )
-
-    sql = pl.SQLContext(df=df)
-    res = sql.execute(
+    res = pl.sql(
         """
         SELECT
-        ATAN2D(y,x) as "atan2d",
-        ATAN2(y,x) as "atan2"
+            ATAN2D(y,x) as "atan2d",
+            ATAN2(y,x) as "atan2"
         FROM df
         """,
         eager=True,
     )
-
     df_result = pl.DataFrame({"atan2d": [45.0, -45.0, 135.0, -135.0]})
     df_result = df_result.with_columns(pl.col("atan2d").cast(pl.Float64))
     df_result = df_result.with_columns(pl.col("atan2d").radians().alias("atan2"))
@@ -44,25 +41,25 @@ def test_trig() -> None:
     res = ctx.execute(
         """
         SELECT
-        asin(1.0)/a as "pi values",
-        cos(asin(1.0)/a) AS "cos",
-        cot(asin(1.0)/a) AS "cot",
-        sin(asin(1.0)/a) AS "sin",
-        tan(asin(1.0)/a) AS "tan",
+            asin(1.0)/a as "pi values",
+            cos(asin(1.0)/a) AS "cos",
+            cot(asin(1.0)/a) AS "cot",
+            sin(asin(1.0)/a) AS "sin",
+            tan(asin(1.0)/a) AS "tan",
 
-        cosd(asind(1.0)/a) AS "cosd",
-        cotd(asind(1.0)/a) AS "cotd",
-        sind(asind(1.0)/a) AS "sind",
-        tand(asind(1.0)/a) AS "tand",
+            cosd(asind(1.0)/a) AS "cosd",
+            cotd(asind(1.0)/a) AS "cotd",
+            sind(asind(1.0)/a) AS "sind",
+            tand(asind(1.0)/a) AS "tand",
 
-        1.0/a as "inverse pi values",
-        acos(1.0/a) AS "acos",
-        asin(1.0/a) AS "asin",
-        atan(1.0/a) AS "atan",
+            1.0/a as "inverse pi values",
+            acos(1.0/a) AS "acos",
+            asin(1.0/a) AS "asin",
+            atan(1.0/a) AS "atan",
 
-        acosd(1.0/a) AS "acosd",
-        asind(1.0/a) AS "asind",
-        atand(1.0/a) AS "atand"
+            acosd(1.0/a) AS "acosd",
+            asind(1.0/a) AS "asind",
+            atand(1.0/a) AS "atand"
         FROM df
         """,
         eager=True,
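
A minimal usage sketch of the new top-level `pl.sql` entry point introduced above (the `orders` frame is hypothetical; behaviour follows the docstring: frames visible in the global namespace are registered by name, the default return type is a LazyFrame, and `eager=True` collects immediately):

import polars as pl

# Frames defined in the global namespace are picked up by pl.sql under their names.
orders = pl.LazyFrame({"id": [1, 2, 3], "amount": [10.0, 20.0, 30.0]})

# Default (eager=False): returns a LazyFrame; call .collect() to materialise it.
lazy_out = pl.sql("SELECT id, amount * 2 AS doubled FROM orders WHERE id > 1")
print(lazy_out.collect())

# eager=True: executes immediately and returns a DataFrame.
print(pl.sql("SELECT SUM(amount) AS total FROM orders", eager=True))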