diff --git a/crates/polars/Cargo.toml b/crates/polars/Cargo.toml index 31bae99c6146..a710011aa045 100644 --- a/crates/polars/Cargo.toml +++ b/crates/polars/Cargo.toml @@ -376,6 +376,8 @@ docs-selection = [ "is_last_distinct", "asof_join", "cross_join", + "semi_anti_join", + "iejoin", "concat_str", "string_reverse", "string_to_integer", diff --git a/docs/source/_build/API_REFERENCE_LINKS.yml b/docs/source/_build/API_REFERENCE_LINKS.yml index 1fea34db3465..1e301f592cb1 100644 --- a/docs/source/_build/API_REFERENCE_LINKS.yml +++ b/docs/source/_build/API_REFERENCE_LINKS.yml @@ -102,6 +102,7 @@ python: name: execute link: https://docs.pola.rs/api/python/stable/reference/sql/api/polars.SQLContext.execute.html join_asof: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join_asof.html + join_where: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join_where.html concat: https://docs.pola.rs/api/python/stable/reference/api/polars.concat.html pivot: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.pivot.html unpivot: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.unpivot.html @@ -180,6 +181,11 @@ rust: link: https://docs.pola.rs/api/rust/dev/polars_lazy/frame/struct.LazyFrame.html#method.group_by_dynamic feature_flags: [dynamic_group_by] join: https://docs.pola.rs/api/rust/dev/polars/prelude/trait.DataFrameJoinOps.html#method.join + join-semi_anti_join_flag: + name: join + link: https://docs.pola.rs/api/rust/dev/polars/prelude/trait.DataFrameJoinOps.html#method.join + feature_flags: ["semi_anti_join"] + vstack: https://docs.pola.rs/api/rust/dev/polars_core/frame/struct.DataFrame.html#method.vstack concat: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/functions/fn.concat.html @@ -193,7 +199,18 @@ rust: pivot: https://docs.pola.rs/api/rust/dev/polars_lazy/frame/pivot/fn.pivot.html unpivot: 
https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html#method.unpivot upsample: https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html#method.upsample - join_asof: https://docs.pola.rs/api/rust/dev/polars/prelude/trait.AsofJoin.html#method.join_asof + join_asof_by: + name: join_asof_by + link: https://docs.pola.rs/api/rust/dev/polars/prelude/trait.AsofJoinBy.html#method.join_asof_by + feature_flags: ['asof_join'] + join_where: + name: join_where + link: https://docs.pola.rs/api/rust/dev/polars/prelude/struct.JoinBuilder.html#method.join_where + feature_flags: ["iejoin"] + cross_join: + name: cross_join + link: https://docs.pola.rs/api/rust/dev/polars/prelude/struct.LazyFrame.html#method.cross_join + feature_flags: [cross_join] unnest: https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html#method.unnest read_csv: diff --git a/docs/source/_build/scripts/macro.py b/docs/source/_build/scripts/macro.py index 3b8055074d44..651786b0044b 100644 --- a/docs/source/_build/scripts/macro.py +++ b/docs/source/_build/scripts/macro.py @@ -1,10 +1,12 @@ from collections import OrderedDict import os -from typing import List, Optional, Set +from typing import Any, List, Optional, Set import yaml import logging +from mkdocs_macros.plugin import MacrosPlugin + # Supported Languages and their metadata LANGUAGES = OrderedDict( python={ @@ -130,7 +132,7 @@ def code_tab( """ -def define_env(env): +def define_env(env: MacrosPlugin) -> None: @env.macro def code_header( language: str, section: str = [], api_functions: List[str] = [] @@ -154,7 +156,11 @@ def code_header( @env.macro def code_block( - path: str, section: str = None, api_functions: List[str] = None + path: str, + section: str = None, + api_functions: List[str] = None, + python_api_functions: List[str] = None, + rust_api_functions: List[str] = None, ) -> str: """Dynamically generate a code block for the code located under {language}/path @@ -170,8 +176,14 @@ def code_block( for language, 
info in LANGUAGES.items(): base_path = f"{language}/{path}{info['extension']}" full_path = "docs/source/src/" + base_path + if language == "python": + extras = python_api_functions or [] + else: + extras = rust_api_functions or [] # Check if file exists for the language if os.path.exists(full_path): - result.append(code_tab(base_path, section, info, api_functions)) + result.append( + code_tab(base_path, section, info, (api_functions or []) + extras) + ) return "\n".join(result) diff --git a/docs/source/development/contributing/index.md b/docs/source/development/contributing/index.md index 30fb6ddc0ac9..c3175df9f5b2 100644 --- a/docs/source/development/contributing/index.md +++ b/docs/source/development/contributing/index.md @@ -268,6 +268,13 @@ df = pl.read_parquet("file.parquet") The snippet is delimited by `--8<-- [start:]` and `--8<-- [end:]`. The snippet name must match the name given in the second argument to `code_block` above. +In some cases, you may need to add links to different functions for the Python and Rust APIs. +When that is the case, you can use the two extra optional arguments that `code_block` accepts, that can be used to pass Python-only and Rust-only links: + +``` +{{code_block('path', 'snippet_name', ['common_api_links'], ['python_only_links'], ['rust_only_links'])}} +``` + #### Linting Before committing, install `dprint` (see above) and run `dprint fmt` from the `docs` directory to lint the markdown files.
diff --git a/docs/source/src/python/user-guide/transformations/joins.py b/docs/source/src/python/user-guide/transformations/joins.py index a34ea310e614..e44cbdc560c1 100644 --- a/docs/source/src/python/user-guide/transformations/joins.py +++ b/docs/source/src/python/user-guide/transformations/joins.py @@ -1,117 +1,138 @@ -# --8<-- [start:setup] +# --8<-- [start:prep-data] +import pathlib +import requests + + +DATA = [ + ( + "https://raw.githubusercontent.com/pola-rs/polars-static/refs/heads/master/data/monopoly_props_groups.csv", + "docs/assets/data/monopoly_props_groups.csv", + ), + ( + "https://raw.githubusercontent.com/pola-rs/polars-static/refs/heads/master/data/monopoly_props_prices.csv", + "docs/assets/data/monopoly_props_prices.csv", + ), +] + + +for url, dest in DATA: + if pathlib.Path(dest).exists(): + continue + with open(dest, "wb") as f: + f.write(requests.get(url, timeout=10).content) +# --8<-- [end:prep-data] + +# --8<-- [start:props_groups] import polars as pl -from datetime import datetime - -# --8<-- [end:setup] - -# --8<-- [start:innerdf] -df_customers = pl.DataFrame( - { - "customer_id": [1, 2, 3], - "name": ["Alice", "Bob", "Charlie"], - } -) -print(df_customers) -# --8<-- [end:innerdf] - -# --8<-- [start:innerdf2] -df_orders = pl.DataFrame( - { - "order_id": ["a", "b", "c"], - "customer_id": [1, 2, 2], - "amount": [100, 200, 300], - } -) -print(df_orders) -# --8<-- [end:innerdf2] - -# --8<-- [start:inner] -df_inner_customer_join = df_customers.join(df_orders, on="customer_id", how="inner") -print(df_inner_customer_join) -# --8<-- [end:inner] +props_groups = pl.read_csv("docs/assets/data/monopoly_props_groups.csv").head(5) +print(props_groups) +# --8<-- [end:props_groups] -# --8<-- [start:left] -df_left_join = df_customers.join(df_orders, on="customer_id", how="left") -print(df_left_join) -# --8<-- [end:left] +# --8<-- [start:props_prices] +props_prices = pl.read_csv("docs/assets/data/monopoly_props_prices.csv").head(5) +print(props_prices) +# 
--8<-- [end:props_prices] -# --8<-- [start:right] -df_right_join = df_orders.join(df_customers, on="customer_id", how="right") -print(df_right_join) -# --8<-- [end:right] +# --8<-- [start:equi-join] +result = props_groups.join(props_prices, on="property_name") +print(result) +# --8<-- [end:equi-join] -# --8<-- [start:full] -df_outer_join = df_customers.join(df_orders, on="customer_id", how="full") -print(df_outer_join) -# --8<-- [end:full] - -# --8<-- [start:full_coalesce] -df_outer_coalesce_join = df_customers.join( - df_orders, on="customer_id", how="full", coalesce=True +# --8<-- [start:props_groups2] +props_groups2 = props_groups.with_columns( + pl.col("property_name").str.to_lowercase(), ) -print(df_outer_coalesce_join) -# --8<-- [end:full_coalesce] +print(props_groups2) +# --8<-- [end:props_groups2] -# --8<-- [start:df3] -df_colors = pl.DataFrame( - { - "color": ["red", "blue", "green"], - } +# --8<-- [start:props_prices2] +props_prices2 = props_prices.select( + pl.col("property_name").alias("name"), pl.col("cost") ) -print(df_colors) -# --8<-- [end:df3] - -# --8<-- [start:df4] -df_sizes = pl.DataFrame( - { - "size": ["S", "M", "L"], - } +print(props_prices2) +# --8<-- [end:props_prices2] + +# --8<-- [start:join-key-expression] +result = props_groups2.join( + props_prices2, + left_on="property_name", + right_on=pl.col("name").str.to_lowercase(), +) +print(result) +# --8<-- [end:join-key-expression] + +# --8<-- [start:inner-join] +result = props_groups.join(props_prices, on="property_name", how="inner") +print(result) +# --8<-- [end:inner-join] + +# --8<-- [start:left-join] +result = props_groups.join(props_prices, on="property_name", how="left") +print(result) +# --8<-- [end:left-join] + +# --8<-- [start:right-join] +result = props_groups.join(props_prices, on="property_name", how="right") +print(result) +# --8<-- [end:right-join] + +# --8<-- [start:left-right-join-equals] +print( + result.equals( + props_prices.join( + props_groups, + on="property_name", + 
how="left", + # Reorder the columns to match the order from above. + ).select(pl.col("group"), pl.col("property_name"), pl.col("cost")) + ) ) -print(df_sizes) -# --8<-- [end:df4] +# --8<-- [end:left-right-join-equals] + +# --8<-- [start:full-join] +result = props_groups.join(props_prices, on="property_name", how="full") +print(result) +# --8<-- [end:full-join] + +# --8<-- [start:full-join-coalesce] +result = props_groups.join( + props_prices, + on="property_name", + how="full", + coalesce=True, +) +print(result) +# --8<-- [end:full-join-coalesce] -# --8<-- [start:cross] -df_cross_join = df_colors.join(df_sizes, how="cross") -print(df_cross_join) -# --8<-- [end:cross] +# --8<-- [start:semi-join] +result = props_groups.join(props_prices, on="property_name", how="semi") +print(result) +# --8<-- [end:semi-join] -# --8<-- [start:df5] -df_cars = pl.DataFrame( - { - "id": ["a", "b", "c"], - "make": ["ford", "toyota", "bmw"], - } -) -print(df_cars) -# --8<-- [end:df5] +# --8<-- [start:anti-join] +result = props_groups.join(props_prices, on="property_name", how="anti") +print(result) +# --8<-- [end:anti-join] -# --8<-- [start:df6] -df_repairs = pl.DataFrame( +# --8<-- [start:players] +players = pl.DataFrame( { - "id": ["c", "c"], - "cost": [100, 200], + "name": ["Alice", "Bob"], + "cash": [78, 135], } ) -print(df_repairs) -# --8<-- [end:df6] - -# --8<-- [start:inner2] -df_inner_join = df_cars.join(df_repairs, on="id", how="inner") -print(df_inner_join) -# --8<-- [end:inner2] +print(players) +# --8<-- [end:players] -# --8<-- [start:semi] -df_semi_join = df_cars.join(df_repairs, on="id", how="semi") -print(df_semi_join) -# --8<-- [end:semi] +# --8<-- [start:non-equi] +result = players.join_where(props_prices, pl.col("cash") > pl.col("cost")) +print(result) +# --8<-- [end:non-equi] -# --8<-- [start:anti] -df_anti_join = df_cars.join(df_repairs, on="id", how="anti") -print(df_anti_join) -# --8<-- [end:anti] +# --8<-- [start:df_trades] +from datetime import datetime -# --8<-- 
[start:df7] df_trades = pl.DataFrame( { "time": [ @@ -125,9 +146,9 @@ } ) print(df_trades) -# --8<-- [end:df7] +# --8<-- [end:df_trades] -# --8<-- [start:df8] +# --8<-- [start:df_quotes] df_quotes = pl.DataFrame( { "time": [ @@ -142,21 +163,23 @@ ) print(df_quotes) -# --8<-- [end:df8] - -# --8<-- [start:asofpre] -df_trades = df_trades.sort("time") -df_quotes = df_quotes.sort("time") # Set column as sorted -# --8<-- [end:asofpre] +# --8<-- [end:df_quotes] # --8<-- [start:asof] df_asof_join = df_trades.join_asof(df_quotes, on="time", by="stock") print(df_asof_join) # --8<-- [end:asof] -# --8<-- [start:asof2] +# --8<-- [start:asof-tolerance] df_asof_tolerance_join = df_trades.join_asof( df_quotes, on="time", by="stock", tolerance="1m" ) print(df_asof_tolerance_join) -# --8<-- [end:asof2] +# --8<-- [end:asof-tolerance] + +# --8<-- [start:cartesian-product] +tokens = pl.DataFrame({"monopoly_token": ["hat", "shoe", "boat"]}) + +result = players.select(pl.col("name")).join(tokens, how="cross") +print(result) +# --8<-- [end:cartesian-product] diff --git a/docs/source/src/rust/Cargo.toml b/docs/source/src/rust/Cargo.toml index 1bc09b3a8744..8a6607d4aa84 100644 --- a/docs/source/src/rust/Cargo.toml +++ b/docs/source/src/rust/Cargo.toml @@ -124,7 +124,7 @@ required-features = ["polars/lazy"] [[bin]] name = "user-guide-transformations-joins" path = "user-guide/transformations/joins.rs" -required-features = ["polars/lazy", "polars/asof_join"] +required-features = ["polars/lazy", "polars/strings", "polars/semi_anti_join", "polars/iejoin", "polars/cross_join"] [[bin]] name = "user-guide-transformations-unpivot" path = "user-guide/transformations/unpivot.rs" diff --git a/docs/source/src/rust/user-guide/transformations/joins.rs b/docs/source/src/rust/user-guide/transformations/joins.rs index 5caa0cc4ac18..5d1c50f733b1 100644 --- a/docs/source/src/rust/user-guide/transformations/joins.rs +++ b/docs/source/src/rust/user-guide/transformations/joins.rs @@ -3,218 +3,252 @@ use 
polars::prelude::*; // --8<-- [end:setup] fn main() -> Result<(), Box<dyn std::error::Error>> { - // --8<-- [start:innerdf] - let df_customers = df! ( + // NOTE: This assumes the data has been downloaded and is available. + // See the corresponding Python script for the remote location of the data. - "customer_id" => &[1, 2, 3], - "name" => &["Alice", "Bob", "Charlie"], - )?; - - println!("{}", &df_customers); - // --8<-- [end:innerdf] + // --8<-- [start:props_groups] + let props_groups = CsvReadOptions::default() + .with_has_header(true) + .try_into_reader_with_file_path(Some( + "../../../assets/data/monopoly_props_groups.csv".into(), + ))? + .finish()? + .head(Some(5)); + println!("{}", props_groups); + // --8<-- [end:props_groups] - // --8<-- [start:innerdf2] - let df_orders = df!( - "order_id"=> &["a", "b", "c"], - "customer_id"=> &[1, 2, 2], - "amount"=> &[100, 200, 300], - )?; - println!("{}", &df_orders); - // --8<-- [end:innerdf2] + // --8<-- [start:props_prices] + let props_prices = CsvReadOptions::default() + .with_has_header(true) + .try_into_reader_with_file_path(Some( + "../../../assets/data/monopoly_props_prices.csv".into(), + ))? + .finish()? + .head(Some(5)); + println!("{}", props_prices); + // --8<-- [end:props_prices] - // --8<-- [start:inner] - let df_inner_customer_join = df_customers + // --8<-- [start:equi-join] + // In Rust, we cannot use the shorthand of specifying a common + // column name just once.
+ let result = props_groups .clone() .lazy() .join( - df_orders.clone().lazy(), - [col("customer_id")], - [col("customer_id")], - JoinArgs::new(JoinType::Inner), + props_prices.clone().lazy(), + [col("property_name")], + [col("property_name")], + JoinArgs::default(), ) .collect()?; - println!("{}", &df_inner_customer_join); - // --8<-- [end:inner] + println!("{}", result); + // --8<-- [end:equi-join] - // --8<-- [start:left] - let df_left_join = df_customers + // --8<-- [start:props_groups2] + let props_groups2 = props_groups + .clone() + .lazy() + .with_column(col("property_name").str().to_lowercase()) + .collect()?; + println!("{}", props_groups2); + // --8<-- [end:props_groups2] + + // --8<-- [start:props_prices2] + let props_prices2 = props_prices + .clone() + .lazy() + .select([col("property_name").alias("name"), col("cost")]) + .collect()?; + println!("{}", props_prices2); + // --8<-- [end:props_prices2] + + // --8<-- [start:join-key-expression] + let result = props_groups2 .clone() .lazy() .join( - df_orders.clone().lazy(), - [col("customer_id")], - [col("customer_id")], - JoinArgs::new(JoinType::Left), + props_prices2.clone().lazy(), + [col("property_name")], + [col("name").str().to_lowercase()], + JoinArgs::default(), ) .collect()?; - println!("{}", &df_left_join); - // --8<-- [end:left] + println!("{}", result); + // --8<-- [end:join-key-expression] - // --8<-- [start:right] - let df_right_join = df_orders + // --8<-- [start:inner-join] + let result = props_groups .clone() .lazy() .join( - df_customers.clone().lazy(), - [col("customer_id")], - [col("customer_id")], - JoinArgs::new(JoinType::Right), + props_prices.clone().lazy(), + [col("property_name")], + [col("property_name")], + JoinArgs::new(JoinType::Inner), ) .collect()?; - println!("{}", &df_right_join); - // --8<-- [end:right] + println!("{}", result); + // --8<-- [end:inner-join] - // --8<-- [start:full] - let df_full_join = df_customers + // --8<-- [start:left-join] + let result = props_groups 
.clone() .lazy() .join( - df_orders.clone().lazy(), - [col("customer_id")], - [col("customer_id")], - JoinArgs::new(JoinType::Full), + props_prices.clone().lazy(), + [col("property_name")], + [col("property_name")], + JoinArgs::new(JoinType::Left), ) .collect()?; - println!("{}", &df_full_join); - // --8<-- [end:full] + println!("{}", result); + // --8<-- [end:left-join] - // --8<-- [start:full_coalesce] - let df_full_join = df_customers + // --8<-- [start:right-join] + let result = props_groups .clone() .lazy() .join( - df_orders.clone().lazy(), - [col("customer_id")], - [col("customer_id")], - JoinArgs::new(JoinType::Full).with_coalesce(JoinCoalesce::CoalesceColumns), + props_prices.clone().lazy(), + [col("property_name")], + [col("property_name")], + JoinArgs::new(JoinType::Right), ) .collect()?; - println!("{}", &df_full_join); - // --8<-- [end:full_coalesce] + println!("{}", result); + // --8<-- [end:right-join] - // --8<-- [start:df3] - let df_colors = df!( - "color"=> &["red", "blue", "green"], - )?; - println!("{}", &df_colors); - // --8<-- [end:df3] - - // --8<-- [start:df4] - let df_sizes = df!( - "size"=> &["S", "M", "L"], - )?; - println!("{}", &df_sizes); - // --8<-- [end:df4] + // --8<-- [start:left-right-join-equals] + // `equals_missing` is needed instead of `equals` + // so that missing values compare as equal. + let dfs_match = result.equals_missing( + &props_prices + .clone() + .lazy() + .join( + props_groups.clone().lazy(), + [col("property_name")], + [col("property_name")], + JoinArgs::new(JoinType::Left), + ) + .select([ + // Reorder the columns to match the order of `result`. 
+ col("group"), + col("property_name"), + col("cost"), + ]) + .collect()?, + ); + println!("{}", dfs_match); + // --8<-- [end:left-right-join-equals] - // --8<-- [start:cross] - let df_cross_join = df_colors + // --8<-- [start:full-join] + let result = props_groups .clone() .lazy() - .cross_join(df_sizes.clone().lazy(), None) + .join( + props_prices.clone().lazy(), + [col("property_name")], + [col("property_name")], + JoinArgs::new(JoinType::Full), + ) .collect()?; - println!("{}", &df_cross_join); - // --8<-- [end:cross] + println!("{}", result); + // --8<-- [end:full-join] - // --8<-- [start:df5] - let df_cars = df!( - "id"=> &["a", "b", "c"], - "make"=> &["ford", "toyota", "bmw"], - )?; - println!("{}", &df_cars); - // --8<-- [end:df5] - - // --8<-- [start:df6] - let df_repairs = df!( - "id"=> &["c", "c"], - "cost"=> &[100, 200], - )?; - println!("{}", &df_repairs); - // --8<-- [end:df6] - - // --8<-- [start:inner2] - let df_inner_join = df_cars + // --8<-- [start:full-join-coalesce] + let result = props_groups .clone() .lazy() - .inner_join(df_repairs.clone().lazy(), col("id"), col("id")) + .join( + props_prices.clone().lazy(), + [col("property_name")], + [col("property_name")], + JoinArgs::new(JoinType::Full).with_coalesce(JoinCoalesce::CoalesceColumns), + ) .collect()?; - println!("{}", &df_inner_join); - // --8<-- [end:inner2] + println!("{}", result); + // --8<-- [end:full-join-coalesce] - // --8<-- [start:semi] - let df_semi_join = df_cars + // --8<-- [start:semi-join] + let result = props_groups .clone() .lazy() .join( - df_repairs.clone().lazy(), - [col("id")], - [col("id")], + props_prices.clone().lazy(), + [col("property_name")], + [col("property_name")], JoinArgs::new(JoinType::Semi), ) .collect()?; - println!("{}", &df_semi_join); - // --8<-- [end:semi] + println!("{}", result); + // --8<-- [end:semi-join] - // --8<-- [start:anti] - let df_anti_join = df_cars + // --8<-- [start:anti-join] + let result = props_groups .clone() .lazy() .join( - 
df_repairs.clone().lazy(), - [col("id")], - [col("id")], + props_prices.clone().lazy(), + [col("property_name")], + [col("property_name")], JoinArgs::new(JoinType::Anti), ) .collect()?; - println!("{}", &df_anti_join); - // --8<-- [end:anti] + println!("{}", result); + // --8<-- [end:anti-join] + + // --8<-- [start:players] + let players = df!( + "name" => ["Alice", "Bob"], + "cash" => [78, 135], + )?; + println!("{}", players); + // --8<-- [end:players] + + // --8<-- [start:non-equi] + let result = players + .clone() + .lazy() + .join_builder() + .with(props_prices.clone().lazy()) + .join_where(vec![col("cash").cast(DataType::Int64).gt(col("cost"))]) + .collect()?; + println!("{}", result); + // --8<-- [end:non-equi] - // --8<-- [start:df7] + // --8<-- [start:df_trades] use chrono::prelude::*; + let df_trades = df!( - "time"=> &[ - NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 1, 0).unwrap(), - NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 1, 0).unwrap(), - NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 3, 0).unwrap(), - NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 6, 0).unwrap(), - ], - "stock"=> &["A", "B", "B", "C"], - "trade"=> &[101, 299, 301, 500], + "time" => [ + NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 1, 0).unwrap(), + NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 1, 0).unwrap(), + NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 3, 0).unwrap(), + NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 6, 0).unwrap(), + ], + "stock" => ["A", "B", "B", "C"], + "trade" => [101, 299, 301, 500], )?; - println!("{}", &df_trades); - // --8<-- [end:df7] + println!("{}", df_trades); + // --8<-- [end:df_trades] - // --8<-- [start:df8] + // --8<-- [start:df_quotes] let df_quotes = df!( - "time"=> &[ - NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 0, 0).unwrap(), - NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 2, 0).unwrap(), - 
NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 4, 0).unwrap(), - NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 6, 0).unwrap(), - ], - "stock"=> &["A", "B", "C", "A"], - "quote"=> &[100, 300, 501, 102], + "time" => [ + NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 1, 0).unwrap(), + NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 2, 0).unwrap(), + NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 4, 0).unwrap(), + NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 6, 0).unwrap(), + ], + "stock" => ["A", "B", "C", "A"], + "quote" => [100, 300, 501, 102], )?; - - println!("{}", &df_quotes); - // --8<-- [end:df8] - - // --8<-- [start:asofpre] - let df_trades = df_trades - .sort( - ["time"], - SortMultipleOptions::default().with_maintain_order(true), - ) - .unwrap(); - let df_quotes = df_quotes - .sort( - ["time"], - SortMultipleOptions::default().with_maintain_order(true), - ) - .unwrap(); - // --8<-- [end:asofpre] + println!("{}", df_quotes); + // --8<-- [end:df_quotes] // --8<-- [start:asof] - let df_asof_join = df_trades.join_asof_by( + let result = df_trades.join_asof_by( &df_quotes, "time", "time", @@ -223,11 +257,11 @@ fn main() -> Result<(), Box> { AsofStrategy::Backward, None, )?; - println!("{}", &df_asof_join); + println!("{}", result); // --8<-- [end:asof] - // --8<-- [start:asof2] - let df_asof_tolerance_join = df_trades.join_asof_by( + // --8<-- [start:asof-tolerance] + let result = df_trades.join_asof_by( &df_quotes, "time", "time", @@ -236,8 +270,22 @@ fn main() -> Result<(), Box> { AsofStrategy::Backward, Some(AnyValue::Duration(60000, TimeUnit::Milliseconds)), )?; - println!("{}", &df_asof_tolerance_join); - // --8<-- [end:asof2] + println!("{}", result); + // --8<-- [end:asof-tolerance] + + // --8<-- [start:cartesian-product] + let tokens = df!( + "monopoly_token" => ["hat", "shoe", "boat"], + )?; + + let result = players + .clone() + .lazy() + .select([col("name")]) + 
.cross_join(tokens.clone().lazy(), None) + .collect()?; + println!("{}", result); + // --8<-- [end:cartesian-product] Ok(()) } diff --git a/docs/source/user-guide/transformations/joins.md b/docs/source/user-guide/transformations/joins.md index 7cf07e680503..b135a45f53d3 100644 --- a/docs/source/user-guide/transformations/joins.md +++ b/docs/source/user-guide/transformations/joins.md @@ -1,229 +1,273 @@ # Joins -## Join strategies +A join operation combines columns from one or more dataframes into a new dataframe. +The different “joining strategies” and matching criteria used by the different types of joins influence how columns are combined and also what rows are included in the result of the join operation. -Polars supports the following join strategies by specifying the `how` argument: +The most common type of join is an “equi join”, in which rows are matched by a key expression. +Polars supports several joining strategies for equi joins, which determine exactly how we handle the matching of rows. +Polars also supports “non-equi joins”, a type of join where the matching criterion is not an equality, and a type of join where rows are matched by key proximity, called “asof join”. -| Strategy | Description | -| -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `inner` | Returns row with matching keys in _both_ frames. Non-matching rows in either the left or right frame are discarded. | -| `left` | Returns all rows in the left dataframe, whether or not a match in the right-frame is found. Non-matching rows have their right columns null-filled. | -| `right` | Returns all rows in the right dataframe, whether or not a match in the left-frame is found. Non-matching rows have their left columns null-filled. | -| `full` | Returns all rows from both the left and right dataframe. 
If no match is found in one frame, columns from the other frame are null-filled. | -| `cross` | Returns the Cartesian product of all rows from the left frame with all rows from the right frame. Duplicates rows are retained; the table length of `A` cross-joined with `B` is always `len(A) × len(B)`. | -| `semi` | Returns all rows from the left frame in which the join key is also present in the right frame. | -| `anti` | Returns all rows from the left frame in which the join key is _not_ present in the right frame. | +## Quick reference table -A separate `coalesce` parameter determines whether to merge key columns with the same name from the left and right -frames. +The table below acts as a quick reference for people who know what they are looking for. +If you want to learn about joins in general and how to work with them in Polars, feel free to skip the table and keep reading below. -### Inner join +=== ":fontawesome-brands-python: Python" + + [:material-api: `join`](https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join.html) + [:material-api: `join_where`](https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join_where.html) + [:material-api: `join_asof`](https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join_asof.html) + +=== ":fontawesome-brands-rust: Rust" -An `inner` join produces a `DataFrame` that contains only the rows where the join key exists in both `DataFrames`. Let's -take for example the following two `DataFrames`: + [:material-api: `join`](https://docs.pola.rs/api/rust/dev/polars/prelude/trait.DataFrameJoinOps.html#method.join) + ([:material-flag-plus: semi_anti_join](/user-guide/installation/#feature-flags "Enable the feature flag semi_anti_join for semi and for anti joins"){.feature-flag} needed for some options.)
+ [:material-api: `join_asof_by`](https://docs.pola.rs/api/rust/dev/polars/prelude/trait.AsofJoinBy.html#method.join_asof_by) + [:material-flag-plus: Available on feature asof_join](/user-guide/installation/#feature-flags "To use this functionality enable the feature flag asof_join"){.feature-flag} + [:material-api: `join_where`](https://docs.rs/polars/latest/polars/prelude/struct.JoinBuilder.html#method.join_where) + [:material-flag-plus: Available on feature iejoin](/user-guide/installation/#feature-flags "To use this functionality enable the feature flag iejoin"){.feature-flag} -{{code_block('user-guide/transformations/joins','innerdf',['DataFrame'])}} +| Type | Function | Brief description | +| --------------------- | -------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Equi inner join | `join(..., how="inner")` | Keeps rows that matched both on the left and right. | +| Equi left outer join | `join(..., how="left")` | Keeps all rows from the left plus matching rows from the right. Non-matching rows from the left have their right columns filled with `null`. | +| Equi right outer join | `join(..., how="right")` | Keeps all rows from the right plus matching rows from the left. Non-matching rows from the right have their left columns filled with `null`. | +| Equi full join | `join(..., how="full")` | Keeps all rows from either dataframe, regardless of whether they match or not. Non-matching rows from one side have the columns from the other side filled with `null`. | +| Equi semi join | `join(..., how="semi")` | Keeps rows from the left that have a match on the right. | +| Equi anti join | `join(..., how="anti")` | Keeps rows from the left that do not have a match on the right. | +| Non-equi inner join | `join_where` | Finds all possible pairings of rows from the left and right that satisfy the given predicate(s).
| +| Asof join | `join_asof`/`join_asof_by` | Like a left outer join, but matches on the nearest key instead of on exact key matches. | +| Cartesian product | `join(..., how="cross")` | Computes the [Cartesian product](https://en.wikipedia.org/wiki/Cartesian_product) of the two dataframes. | -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:setup" ---8<-- "python/user-guide/transformations/joins.py:innerdf" +## Equi joins + +In an equi join, rows are matched by checking equality of a key expression. +You can do an equi join with the function `join` by specifying the name of the column to be used as key. +For the examples, we will be loading some (modified) Monopoly property data. + +First, we load a dataframe that contains property names and their colour group in the game: + +{{code_block('user-guide/transformations/joins','props_groups',[])}} + +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:prep-data" +--8<-- "python/user-guide/transformations/joins.py:props_groups" ``` -

+Next, we load a dataframe that contains property names and their price in the game: -{{code_block('user-guide/transformations/joins','innerdf2',['DataFrame'])}} +{{code_block('user-guide/transformations/joins','props_prices',[])}} -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:innerdf2" +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:props_prices" ``` -To get a `DataFrame` with the orders and their associated customer we can do an `inner` join on the `customer_id` -column: +Now, we join both dataframes to create a dataframe that contains property names, colour groups, and prices: -{{code_block('user-guide/transformations/joins','inner',['join'])}} +{{code_block('user-guide/transformations/joins','equi-join',['join'])}} -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:inner" +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:equi-join" ``` -### Left join +The result has four rows but both dataframes used in the operation had five rows. +Polars uses a joining strategy to determine what happens with rows that have multiple matches or with rows that have no match at all. +By default, Polars computes an “inner join” but there are [other join strategies that we show next](#join-strategies). -The `left` outer join produces a `DataFrame` that contains all the rows from the left `DataFrame` and only the rows from -the right `DataFrame` where the join key exists in the left `DataFrame`. 
If we now take the example from above and want -to have a `DataFrame` with all the customers and their associated orders (regardless of whether they have placed an -order or not) we can do a `left` join: +In the example above, the two dataframes conveniently had the column we wish to use as key with the same name and with the values in the exact same format. +Suppose, for the sake of argument, that one of the dataframes had a differently named column and the other had the property names in lower case: -{{code_block('user-guide/transformations/joins','left',['join'])}} +{{code_block('user-guide/transformations/joins','props_groups2',['Expr.str'])}} -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:left" +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:props_groups2" ``` -Notice, that the fields for the customer with the `customer_id` of `3` are null, as there are no orders for this -customer. +{{code_block('user-guide/transformations/joins','props_prices2',[])}} -### Right join +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:props_prices2" +``` -The `right` outer join produces a `DataFrame` that contains all the rows from the right `DataFrame` and only the rows from -the left `DataFrame` where the join key exists in the right `DataFrame`. 
If we now take the example from above and want -to have a `DataFrame` with all the customers and their associated orders (regardless of whether they have placed an -order or not) we can do a `right` join: +In a situation like this, where we may want to perform the same join as before, we can leverage `join`'s flexibility and specify arbitrary expressions to compute the joining key on the left and on the right, allowing one to compute row keys dynamically: -{{code_block('user-guide/transformations/joins','right',['join'])}} +{{code_block('user-guide/transformations/joins', 'join-key-expression', ['join', 'Expr.str'])}} -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:right" +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:join-key-expression" ``` -Notice, that the fields for the customer with the `customer_id` of `3` are null, as there are no orders for this -customer. +Because we are joining on the right with an expression, Polars preserves the column “property_name” from the left and the column “name” from the right so we can have access to the original values that the key expressions were applied to. -### Outer join +## Join strategies -The `full` outer join produces a `DataFrame` that contains all the rows from both `DataFrames`. Columns are null, if the -join key does not exist in the source `DataFrame`. Doing a `full` outer join on the two `DataFrames` from above produces -a similar `DataFrame` to the `left` join: +When computing a join with `df1.join(df2, ...)`, we can specify one of many different join strategies. +A join strategy specifies what rows to keep from each dataframe based on whether they match rows from the other dataframe. 
+ +### Inner join -{{code_block('user-guide/transformations/joins','full',['join'])}} +In an inner join the resulting dataframe only contains the rows from the left and right dataframes that matched. +That is the default strategy used by `join` and above we can see an example of that. +We repeat the example here and explicitly specify the join strategy: -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:full" +{{code_block('user-guide/transformations/joins','inner-join',['join'])}} + +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:inner-join" ``` -{{code_block('user-guide/transformations/joins','full_coalesce',['join'])}} +The result does not include the row from `props_groups` that contains “The Shire” and the result also does not include the row from `props_prices` that contains “Sesame Street”. + +### Left join + +A left outer join is a join where the result contains all the rows from the left dataframe and the rows of the right dataframe that matched any rows from the left dataframe. -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:full_coalesce" +{{code_block('user-guide/transformations/joins','left-join',['join'])}} + +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:left-join" ``` -### Cross join +If there are any rows from the left dataframe that have no matching rows on the right dataframe, they get the value `null` on the new columns. + +### Right join -A `cross` join is a Cartesian product of the two `DataFrames`. This means that every row in the left `DataFrame` is -joined with every row in the right `DataFrame`. The `cross` join is useful for creating a `DataFrame` with all possible -combinations of the columns in two `DataFrames`. 
Let's take for example the following two `DataFrames`. +Computationally speaking, a right outer join is exactly the same as a left outer join, but with the arguments swapped. +Here is an example: -{{code_block('user-guide/transformations/joins','df3',['DataFrame'])}} +{{code_block('user-guide/transformations/joins','right-join',['join'])}} -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:df3" +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:right-join" ``` -

+We show that `df1.join(df2, how="right", ...)` is the same as `df2.join(df1, how="left", ...)`, up to the order of the columns of the result, with the computation below: -{{code_block('user-guide/transformations/joins','df4',['DataFrame'])}} +{{code_block('user-guide/transformations/joins','left-right-join-equals',['join'])}} -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:df4" +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:left-right-join-equals" ``` -We can now create a `DataFrame` containing all possible combinations of the colors and sizes with a `cross` join: +### Full join -{{code_block('user-guide/transformations/joins','cross',['join'])}} +A full outer join will keep all of the rows from the left and right dataframes, even if they don't have matching rows in the other dataframe: -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:cross" +{{code_block('user-guide/transformations/joins','full-join',['join'])}} + +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:full-join" ``` -
+In this case, we see that we get two columns `property_name` and `property_name_right` to make up for the fact that we are matching on the column `property_name` of both dataframes and there are some names for which there are no matches. +The two columns help differentiate the source of each row's data. +If we wanted to force `join` to coalesce the two columns `property_name` into a single column, we could set `coalesce=True` explicitly: -The `inner`, `left`, `right`, `full` and `cross` join strategies are standard amongst dataframe libraries. We provide more -details on the less familiar `semi`, `anti` and `asof` join strategies below. +{{code_block('user-guide/transformations/joins','full-join-coalesce',['join'])}} + +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:full-join-coalesce" +``` + +When not set, the parameter `coalesce` is determined automatically from the join strategy and the key(s) specified, which is why the inner, left, and right joins acted as if `coalesce=True`, even though we didn't set it. ### Semi join -The `semi` join returns all rows from the left frame in which the join key is also present in the right frame. Consider -the following scenario: a car rental company has a `DataFrame` showing the cars that it owns with each car having a -unique `id`. 
+A semi join will return the rows of the left dataframe that have a match in the right dataframe, but we do not actually join the matching rows: -{{code_block('user-guide/transformations/joins','df5',['DataFrame'])}} +{{code_block('user-guide/transformations/joins', 'semi-join', [], ['join'], ['join-semi_anti_join_flag'])}} -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:df5" +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:semi-join" ``` -The company has another `DataFrame` showing each repair job carried out on a vehicle. +A semi join acts as a sort of row filter based on a second dataframe. + +### Anti join + +Conversely, an anti join will return the rows of the left dataframe that do not have a match in the right dataframe: -{{code_block('user-guide/transformations/joins','df6',['DataFrame'])}} +{{code_block('user-guide/transformations/joins', 'anti-join', [], ['join'], ['join-semi_anti_join_flag'])}} -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:df6" +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:anti-join" ``` -You want to answer this question: which of the cars have had repairs carried out? +## Non-equi joins + +In a non-equi join, matches between the left and right dataframes are computed differently. +Instead of looking for matches on key expressions, we provide a single predicate that determines what rows of the left dataframe can be paired up with what rows of the right dataframe. 
-An inner join does not answer this question directly as it produces a `DataFrame` with multiple rows for each car that -has had multiple repair jobs: +For example, consider the following Monopoly players and their current cash: -{{code_block('user-guide/transformations/joins','inner2',['join'])}} +{{code_block('user-guide/transformations/joins','players',[])}} -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:inner2" +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:players" ``` -However, a semi join produces a single row for each car that has had a repair job carried out. +Using a non-equi join we can easily build a dataframe with all the possible properties that each player could be interested in buying. +We use the function `join_where` to compute a non-equi join: -{{code_block('user-guide/transformations/joins','semi',['join'])}} +{{code_block('user-guide/transformations/joins','non-equi',['join_where'])}} -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:semi" +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:non-equi" ``` -### Anti join +You can provide multiple expressions as predicates but they all must use comparison operators that evaluate to a Boolean result and must refer to columns from both dataframes. -Continuing this example, an alternative question might be: which of the cars have **not** had a repair job carried out? -An anti join produces a `DataFrame` showing all the cars from `df_cars` where the `id` is not present in -the `df_repairs` `DataFrame`. +!!! 
note -{{code_block('user-guide/transformations/joins','anti',['join'])}} - -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:anti" -``` + `join_where` is still experimental and doesn't yet support arbitrary Boolean expressions as predicates. ## Asof join An `asof` join is like a left join except that we match on nearest key rather than equal keys. In Polars we can do an asof join with the `join_asof` method. -Consider the following scenario: a stock market broker has a `DataFrame` called `df_trades` showing transactions it has -made for different stocks. +For the asof join we will consider a scenario inspired by the stock market. +Suppose a stock market broker has a dataframe called `df_trades` showing transactions it has made for different stocks. -{{code_block('user-guide/transformations/joins','df7',['DataFrame'])}} +{{code_block('user-guide/transformations/joins','df_trades',[])}} -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:df7" +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:df_trades" ``` -The broker has another `DataFrame` called `df_quotes` showing prices it has quoted for these stocks. +The broker has another dataframe called `df_quotes` showing prices it has quoted for these stocks: -{{code_block('user-guide/transformations/joins','df8',['DataFrame'])}} +{{code_block('user-guide/transformations/joins','df_quotes',[])}} -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:df8" +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:df_quotes" ``` -You want to produce a `DataFrame` showing for each trade the most recent quote provided _before_ the trade. 
You do this -with `join_asof` (using the default `strategy = "backward"`). -To avoid joining between trades on one stock with a quote on another you must specify an exact preliminary join on the -stock column with `by="stock"`. +You want to produce a dataframe showing for each trade the most recent quote provided _before_ the trade. You do this with `join_asof` (using the default `strategy = "backward"`). +To avoid joining between trades on one stock with a quote on another you must specify an exact preliminary join on the stock column with `by="stock"`. -{{code_block('user-guide/transformations/joins','asof',['join_asof'])}} +{{code_block('user-guide/transformations/joins','asof', [], ['join_asof'], ['join_asof_by'])}} -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:asofpre" +```python exec="on" result="text" session="transformations/joins" --8<-- "python/user-guide/transformations/joins.py:asof" ``` -If you want to make sure that only quotes within a certain time range are joined to the trades you can specify -the `tolerance` argument. In this case we want to make sure that the last preceding quote is within 1 minute of the -trade so we set `tolerance = "1m"`. +If you want to make sure that only quotes within a certain time range are joined to the trades you can specify the `tolerance` argument. +In this case we want to make sure that the last preceding quote is within 1 minute of the trade so we set `tolerance = "1m"`. 
-=== ":fontawesome-brands-python: Python" +{{code_block('user-guide/transformations/joins','asof-tolerance', [], ['join_asof'], ['join_asof_by'])}} -```python ---8<-- "python/user-guide/transformations/joins.py:asof2" +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:asof-tolerance" ``` -```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:asof2" +## Cartesian product + +Polars allows you to compute the [Cartesian product](https://en.wikipedia.org/wiki/Cartesian_product) of two dataframes, producing a dataframe where all rows of the left dataframe are paired up with all the rows of the right dataframe. +To compute the Cartesian product of two dataframes, you can pass the strategy `how="cross"` to the function `join` without specifying any of `on`, `left_on`, and `right_on`: + +{{code_block('user-guide/transformations/joins','cartesian-product',[],['join'],['cross_join'])}} + +```python exec="on" result="text" session="transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:cartesian-product" ```