From 4b5d9b453d7b19250beca669e25d88186f747c74 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 10 Apr 2024 14:00:15 +0100 Subject: [PATCH] feat: add holidays argument to business_day_count --- crates/polars-ops/src/series/ops/business.rs | 30 ++++++- .../src/dsl/function_expr/business.rs | 20 +++-- .../polars-plan/src/dsl/functions/business.rs | 12 ++- py-polars/polars/functions/business.py | 88 +++++++++++++------ py-polars/src/functions/business.rs | 9 +- .../time_series/test_business_day_count.py | 17 ++-- 6 files changed, 135 insertions(+), 41 deletions(-) diff --git a/crates/polars-ops/src/series/ops/business.rs b/crates/polars-ops/src/series/ops/business.rs index 115ccf8ae389..19562072fa95 100644 --- a/crates/polars-ops/src/series/ops/business.rs +++ b/crates/polars-ops/src/series/ops/business.rs @@ -1,3 +1,4 @@ +use ahash::HashSet; use polars_core::prelude::arity::binary_elementwise_values; use polars_core::prelude::*; @@ -7,14 +8,28 @@ use polars_core::prelude::*; /// - `start`: Series holding start dates. /// - `end`: Series holding end dates. /// - `week_mask`: A boolean array of length 7, where `true` indicates that the day is a business day. +/// - `holidays`: timestamps that are holidays. Must be provided as i32, i.e. the number of +/// days since the UNIX epoch. pub fn business_day_count( start: &Series, end: &Series, week_mask: [bool; 7], + holidays: &[i32], ) -> PolarsResult { if !week_mask.iter().any(|&x| x) { polars_bail!(ComputeError:"`week_mask` must have at least one business day"); } + + // De-dupe and sort holidays, and exclude non-business days. 
+ let mut holidays: Vec = holidays + .iter() + .filter(|&x| *unsafe { week_mask.get_unchecked(weekday(*x)) }) + .cloned() + .collect::>() + .into_iter() + .collect(); + holidays.sort_unstable(); + let start_dates = start.date()?; let end_dates = end.date()?; let n_business_days_in_week_mask = week_mask.iter().filter(|&x| *x).count() as i32; @@ -28,6 +43,7 @@ pub fn business_day_count( end_date, &week_mask, n_business_days_in_week_mask, + &holidays, ) }) } else { @@ -42,6 +58,7 @@ pub fn business_day_count( end_date, &week_mask, n_business_days_in_week_mask, + &holidays, ) }) } else { @@ -54,6 +71,7 @@ pub fn business_day_count( end_date, &week_mask, n_business_days_in_week_mask, + &holidays, ) }), }; @@ -67,6 +85,7 @@ fn business_day_count_impl( mut end_date: i32, week_mask: &[bool; 7], n_business_days_in_week_mask: i32, + holidays: &[i32], ) -> i32 { let swapped = start_date > end_date; if swapped { @@ -75,10 +94,19 @@ fn business_day_count_impl( end_date += 1; } + let holidays_begin = match holidays.binary_search(&start_date) { + Ok(x) => x, + Err(x) => x, + } as i32; + let holidays_end = match holidays.binary_search(&end_date) { + Ok(x) => x, + Err(x) => x, + } as i32; + let mut start_weekday = weekday(start_date); let diff = end_date - start_date; let whole_weeks = diff / 7; - let mut count = 0; + let mut count = -(holidays_end - holidays_begin); count += whole_weeks * n_business_days_in_week_mask; start_date += whole_weeks * 7; while start_date < end_date { diff --git a/crates/polars-plan/src/dsl/function_expr/business.rs b/crates/polars-plan/src/dsl/function_expr/business.rs index 745dcfdff8f5..2740aa856ac5 100644 --- a/crates/polars-plan/src/dsl/function_expr/business.rs +++ b/crates/polars-plan/src/dsl/function_expr/business.rs @@ -12,7 +12,10 @@ use crate::prelude::SeriesUdf; #[derive(Clone, PartialEq, Debug, Eq, Hash)] pub enum BusinessFunction { #[cfg(feature = "business")] - BusinessDayCount { week_mask: [bool; 7] }, + BusinessDayCount { + week_mask: 
[bool; 7], + holidays: Vec, + }, } impl Display for BusinessFunction { @@ -30,16 +33,23 @@ impl From for SpecialEq> { use BusinessFunction::*; match func { #[cfg(feature = "business")] - BusinessDayCount { week_mask } => { - map_as_slice!(business_day_count, week_mask) + BusinessDayCount { + week_mask, + holidays, + } => { + map_as_slice!(business_day_count, week_mask, &holidays) }, } } } #[cfg(feature = "business")] -pub(super) fn business_day_count(s: &[Series], week_mask: [bool; 7]) -> PolarsResult { +pub(super) fn business_day_count( + s: &[Series], + week_mask: [bool; 7], + holidays: &[i32], +) -> PolarsResult { let start = &s[0]; let end = &s[1]; - polars_ops::prelude::business_day_count(start, end, week_mask) + polars_ops::prelude::business_day_count(start, end, week_mask, holidays) } diff --git a/crates/polars-plan/src/dsl/functions/business.rs b/crates/polars-plan/src/dsl/functions/business.rs index 0a0210ced57f..2aa21727e8f1 100644 --- a/crates/polars-plan/src/dsl/functions/business.rs +++ b/crates/polars-plan/src/dsl/functions/business.rs @@ -1,12 +1,20 @@ use super::*; #[cfg(feature = "dtype-date")] -pub fn business_day_count(start: Expr, end: Expr, week_mask: [bool; 7]) -> Expr { +pub fn business_day_count( + start: Expr, + end: Expr, + week_mask: [bool; 7], + holidays: Vec, +) -> Expr { let input = vec![start, end]; Expr::Function { input, - function: FunctionExpr::Business(BusinessFunction::BusinessDayCount { week_mask }), + function: FunctionExpr::Business(BusinessFunction::BusinessDayCount { + week_mask, + holidays, + }), options: FunctionOptions { allow_rename: true, ..Default::default() diff --git a/py-polars/polars/functions/business.py b/py-polars/polars/functions/business.py index 125bda15113e..0c468d84344a 100644 --- a/py-polars/polars/functions/business.py +++ b/py-polars/polars/functions/business.py @@ -1,6 +1,7 @@ from __future__ import annotations import contextlib +from datetime import date from typing import TYPE_CHECKING, Iterable from 
polars._utils.parse_expr_input import parse_as_expression @@ -10,8 +11,6 @@ import polars.polars as plr if TYPE_CHECKING: - from datetime import date - from polars import Expr from polars.type_aliases import IntoExprColumn @@ -20,6 +19,7 @@ def business_day_count( start: date | IntoExprColumn, end: date | IntoExprColumn, week_mask: Iterable[bool] = (True, True, True, True, True, False, False), + holidays: Iterable[date] = (), ) -> Expr: """ Count the number of business days between `start` and `end` (not including `end`). @@ -34,6 +34,19 @@ def business_day_count( Which days of the week to count. The default is Monday to Friday. If you wanted to count only Monday to Thursday, you would pass `(True, True, True, True, False, False, False)`. + holidays + Holidays to exclude from the count. The Python package + `python-holidays `_ + may come in handy here. You can install it with ``pip install holidays``, + and then, to get all Dutch holidays for years 2020-2024: + + .. code-block:: python + + import holidays + + my_holidays = holidays.country_holidays("NL", years=range(2020, 2025)) + + and pass `holidays=my_holidays` when you call `business_day_count`. Returns ------- @@ -49,39 +62,62 @@ def business_day_count( ... } ... ) >>> df.with_columns( - ... total_day_count=(pl.col("end") - pl.col("start")).dt.total_days(), ... business_day_count=pl.business_day_count("start", "end"), ... ) - shape: (2, 4) - ┌────────────┬────────────┬─────────────────┬────────────────────┐ - │ start ┆ end ┆ total_day_count ┆ business_day_count │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ date ┆ date ┆ i64 ┆ i32 │ - ╞════════════╪════════════╪═════════════════╪════════════════════╡ - │ 2020-01-01 ┆ 2020-01-02 ┆ 1 ┆ 1 │ - │ 2020-01-02 ┆ 2020-01-10 ┆ 8 ┆ 6 │ - └────────────┴────────────┴─────────────────┴────────────────────┘ - - Note how the two "count" columns differ due to the weekend (2020-01-04 - 2020-01-05) - not being counted by `business_day_count`. 
+ shape: (2, 3) + ┌────────────┬────────────┬────────────────────┐ + │ start ┆ end ┆ business_day_count │ + │ --- ┆ --- ┆ --- │ + │ date ┆ date ┆ i32 │ + ╞════════════╪════════════╪════════════════════╡ + │ 2020-01-01 ┆ 2020-01-02 ┆ 1 │ + │ 2020-01-02 ┆ 2020-01-10 ┆ 6 │ + └────────────┴────────────┴────────────────────┘ + + Note how the business day count is 6 (as opposed to a regular day count of 8) + due to the weekend (2020-01-04 - 2020-01-05) not being counted. You can pass a custom weekend - for example, if you only take Sunday off: >>> week_mask = (True, True, True, True, True, True, False) >>> df.with_columns( - ... total_day_count=(pl.col("end") - pl.col("start")).dt.total_days(), ... business_day_count=pl.business_day_count("start", "end", week_mask), ... ) - shape: (2, 4) - ┌────────────┬────────────┬─────────────────┬────────────────────┐ - │ start ┆ end ┆ total_day_count ┆ business_day_count │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ date ┆ date ┆ i64 ┆ i32 │ - ╞════════════╪════════════╪═════════════════╪════════════════════╡ - │ 2020-01-01 ┆ 2020-01-02 ┆ 1 ┆ 1 │ - │ 2020-01-02 ┆ 2020-01-10 ┆ 8 ┆ 7 │ - └────────────┴────────────┴─────────────────┴────────────────────┘ + shape: (2, 3) + ┌────────────┬────────────┬────────────────────┐ + │ start ┆ end ┆ business_day_count │ + │ --- ┆ --- ┆ --- │ + │ date ┆ date ┆ i32 │ + ╞════════════╪════════════╪════════════════════╡ + │ 2020-01-01 ┆ 2020-01-02 ┆ 1 │ + │ 2020-01-02 ┆ 2020-01-10 ┆ 7 │ + └────────────┴────────────┴────────────────────┘ + + You can also pass a list of holidays to exclude from the count: + + >>> from datetime import date + >>> holidays = [date(2020, 1, 1), date(2020, 1, 2)] + >>> df.with_columns( + ... business_day_count=pl.business_day_count("start", "end", holidays=holidays) + ... 
) + shape: (2, 3) + ┌────────────┬────────────┬────────────────────┐ + │ start ┆ end ┆ business_day_count │ + │ --- ┆ --- ┆ --- │ + │ date ┆ date ┆ i32 │ + ╞════════════╪════════════╪════════════════════╡ + │ 2020-01-01 ┆ 2020-01-02 ┆ 0 │ + │ 2020-01-02 ┆ 2020-01-10 ┆ 5 │ + └────────────┴────────────┴────────────────────┘ """ start_pyexpr = parse_as_expression(start) end_pyexpr = parse_as_expression(end) - return wrap_expr(plr.business_day_count(start_pyexpr, end_pyexpr, week_mask)) + unix_epoch = date(1970, 1, 1) + return wrap_expr( + plr.business_day_count( + start_pyexpr, + end_pyexpr, + week_mask, + [(holiday - unix_epoch).days for holiday in holidays], + ) + ) diff --git a/py-polars/src/functions/business.rs b/py-polars/src/functions/business.rs index 0ca6ec058d4a..a0310492ba7e 100644 --- a/py-polars/src/functions/business.rs +++ b/py-polars/src/functions/business.rs @@ -4,8 +4,13 @@ use pyo3::prelude::*; use crate::PyExpr; #[pyfunction] -pub fn business_day_count(start: PyExpr, end: PyExpr, week_mask: [bool; 7]) -> PyExpr { +pub fn business_day_count( + start: PyExpr, + end: PyExpr, + week_mask: [bool; 7], + holidays: Vec, +) -> PyExpr { let start = start.inner; let end = end.inner; - dsl::business_day_count(start, end, week_mask).into() + dsl::business_day_count(start, end, week_mask, holidays).into() } diff --git a/py-polars/tests/parametric/time_series/test_business_day_count.py b/py-polars/tests/parametric/time_series/test_business_day_count.py index 7d9d61fbc5cb..437e8a7208a8 100644 --- a/py-polars/tests/parametric/time_series/test_business_day_count.py +++ b/py-polars/tests/parametric/time_series/test_business_day_count.py @@ -18,19 +18,26 @@ min_size=7, max_size=7, ), + holidays=st.lists( + st.dates(min_value=dt.date(1969, 1, 1), max_value=dt.date(1970, 12, 31)), + min_size=0, + max_size=100, + ), ) def test_against_np_busday_count( - start: dt.date, - end: dt.date, - week_mask: tuple[bool, ...], + start: dt.date, end: dt.date, week_mask: tuple[bool, 
...], holidays: list[dt.date] ) -> None: assume(any(week_mask)) result = ( pl.DataFrame({"start": [start], "end": [end]}) - .select(n=pl.business_day_count("start", "end", week_mask=week_mask))["n"] + .select( + n=pl.business_day_count( + "start", "end", week_mask=week_mask, holidays=holidays + ) + )["n"] .item() ) - expected = np.busday_count(start, end, weekmask=week_mask) + expected = np.busday_count(start, end, weekmask=week_mask, holidays=holidays) if start > end and parse_version(np.__version__) < parse_version("1.25"): # Bug in old versions of numpy reject()