From 0b413219e81f376a4c698ddb2d09adff47bc57d8 Mon Sep 17 00:00:00 2001 From: Eric Jolibois Date: Mon, 1 Jul 2024 12:14:37 +0200 Subject: [PATCH] feat: `use_columns` can now be a callable (#243) * test: move and fix name of test * feat: prepare API and tests * feat: implement feature * ci: fix --------- Co-authored-by: Luka Peschke --- python/fastexcel/__init__.py | 10 +- python/fastexcel/_fastexcel.pyi | 4 +- python/tests/test_column_selection.py | 136 +++++++++++++++++++++++++- python/tests/test_use_columns.py | 74 -------------- src/types/python/excelreader.rs | 2 +- src/types/python/excelsheet/mod.rs | 59 +++++++++-- 6 files changed, 196 insertions(+), 89 deletions(-) delete mode 100644 python/tests/test_use_columns.py diff --git a/python/fastexcel/__init__.py b/python/fastexcel/__init__.py index 60415cd..6529195 100644 --- a/python/fastexcel/__init__.py +++ b/python/fastexcel/__init__.py @@ -1,7 +1,7 @@ from __future__ import annotations import sys -from typing import TYPE_CHECKING, Literal +from typing import TYPE_CHECKING, Callable, Literal if sys.version_info < (3, 10): from typing_extensions import TypeAlias @@ -128,7 +128,7 @@ def load_sheet( skip_rows: int = 0, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, - use_columns: list[str] | list[int] | str | None = None, + use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, dtypes: DTypeMap | None = None, ) -> ExcelSheet: """Loads a sheet lazily by index or name. @@ -153,6 +153,8 @@ def load_sheet( - A string, a comma separated list of Excel column letters and column ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in `A,B,C,D,E` and `A,C,E,F`) + - A callable, a function that takes a column and returns a boolean + indicating whether the column should be used :param dtypes: An optional dict of dtypes. Keys can be column indices or names """ return ExcelSheet( @@ -209,7 +211,7 @@ def load_sheet_by_name( skip_rows: int = 0, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, - use_columns: list[str] | list[int] | str | None = None, + use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, dtypes: DTypeMap | None = None, ) -> ExcelSheet: """Loads a sheet by name. @@ -236,7 +238,7 @@ def load_sheet_by_idx( skip_rows: int = 0, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, - use_columns: list[str] | list[int] | str | None = None, + use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, dtypes: DTypeMap | None = None, ) -> ExcelSheet: """Loads a sheet by index. diff --git a/python/fastexcel/_fastexcel.pyi b/python/fastexcel/_fastexcel.pyi index 59e892e..9efe012 100644 --- a/python/fastexcel/_fastexcel.pyi +++ b/python/fastexcel/_fastexcel.pyi @@ -1,7 +1,7 @@ from __future__ import annotations import typing -from typing import Literal +from typing import Callable, Literal import pyarrow as pa @@ -72,7 +72,7 @@ class _ExcelReader: skip_rows: int = 0, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, - use_columns: list[str] | list[int] | str | None = None, + use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, dtypes: DTypeMap | None = None, eager: Literal[False] = ..., ) -> _ExcelSheet: ... diff --git a/python/tests/test_column_selection.py b/python/tests/test_column_selection.py index 14a5387..588d575 100644 --- a/python/tests/test_column_selection.py +++ b/python/tests/test_column_selection.py @@ -1,3 +1,4 @@ +# ruff: noqa: E501 from __future__ import annotations import re @@ -307,7 +308,7 @@ def test_single_sheet_invalid_column_indices_negative_integer( expected_message = """invalid parameters: expected list[int] | list[str], got [-2] Context: 0: could not determine selected columns from provided object: [-2] - 1: expected selected columns to be list[str] | list[int] | str | None, got Some([-2]) + 1: expected selected columns to be list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None, got Some([-2]) """ with pytest.raises(fastexcel.InvalidParametersError, match=re.escape(expected_message)): excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[-2]) @@ -319,7 +320,7 @@ def test_single_sheet_invalid_column_indices_empty_list( expected_message = """invalid parameters: list of selected columns is empty Context: 0: could not determine selected columns from provided object: [] - 1: expected selected columns to be list[str] | list[int] | str | None, got Some([]) + 1: expected selected columns to be list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None, got Some([]) """ with pytest.raises(fastexcel.InvalidParametersError, match=re.escape(expected_message)): excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[]) @@ -345,3 +346,134 @@ def test_single_sheet_invalid_column_indices_column_does_not_exist_int( """ with pytest.raises(fastexcel.ColumnNotFoundError, match=expected_message): excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[42]) + + +def test_use_columns_with_column_names() -> None: + excel_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet-with-types.xlsx")) + + sheet = excel_reader.load_sheet( + 0, + use_columns=[1, 2], + header_row=None, + skip_rows=1, + column_names=["bools_renamed", "dates_renamed"], + ) + + assert sheet.available_columns == [ + fastexcel.ColumnInfo( + name="__UNNAMED__0", + column_name_from="generated", + index=0, + dtype="float", + dtype_from="guessed", + ), + fastexcel.ColumnInfo( + name="bools_renamed", + index=1, + dtype="boolean", + dtype_from="guessed", + column_name_from="provided", + ), + fastexcel.ColumnInfo( + name="dates_renamed", + index=2, + dtype="datetime", + dtype_from="guessed", + column_name_from="provided", + ), + fastexcel.ColumnInfo( + name="__UNNAMED__3", + index=3, + dtype="float", + dtype_from="guessed", + column_name_from="generated", + ), + ] + + pd_assert_frame_equal( + sheet.to_pandas(), + pd.DataFrame( + { + "bools_renamed": [True, False, True], + "dates_renamed": pd.Series([pd.Timestamp("2022-03-02 05:43:04")] * 3).astype( + "datetime64[ms]" + ), + } + ), + ) + pl_assert_frame_equal( + sheet.to_polars(), + pl.DataFrame( + { + "bools_renamed": [True, False, True], + "dates_renamed": ["2022-03-02 05:43:04"] * 3, + } + ).with_columns( + pl.col("dates_renamed").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms") + ), + ) + + +def test_use_columns_with_callable() -> None: + excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-sheet.xlsx")) + + sheet = excel_reader.load_sheet(2) + assert ( + [(c.name, c.dtype) for c in sheet.available_columns] + == [(c.name, c.dtype) for c in sheet.selected_columns] + == [ + ("col1", "float"), + ("__UNNAMED__1", "float"), + ("col3", "string"), + ("__UNNAMED__3", "float"), + ("col5", "string"), + ] + ) + + sheet = excel_reader.load_sheet( + 2, + use_columns=lambda col: col.name.startswith("col"), + ) + assert [(c.name, c.dtype) for c in sheet.selected_columns] == [ + ("col1", "float"), + ("col3", "string"), + ("col5", "string"), + ] + + sheet = excel_reader.load_sheet( + 2, + use_columns=lambda col: col.index % 2 == 1, + ) + assert [(c.name, c.dtype) for c in sheet.selected_columns] == [ + ("__UNNAMED__1", "float"), + ("__UNNAMED__3", "float"), + ] + + sheet = excel_reader.load_sheet( + 2, + use_columns=lambda col: col.dtype == "string", + ) + assert [(c.name, c.dtype) for c in sheet.selected_columns] == [ + ("col3", "string"), + ("col5", "string"), + ] + + +def test_use_columns_with_bad_callable() -> None: + excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-sheet.xlsx")) + with pytest.raises( + fastexcel.InvalidParametersError, + match=re.escape("`use_columns` callable could not be called (TypeError: "), + ): + excel_reader.load_sheet( + 2, + use_columns=lambda: True, # type: ignore + ) + + with pytest.raises( + fastexcel.InvalidParametersError, match="`use_columns` callable should return a boolean" + ): + excel_reader.load_sheet( + 2, + use_columns=lambda _: 42, # type: ignore + ) diff --git a/python/tests/test_use_columns.py b/python/tests/test_use_columns.py deleted file mode 100644 index 4115c08..0000000 --- a/python/tests/test_use_columns.py +++ /dev/null @@ -1,74 +0,0 @@ -from __future__ import annotations - -import fastexcel -import pandas as pd -import polars as pl -from pandas.testing import assert_frame_equal as pd_assert_frame_equal -from polars.testing import assert_frame_equal as pl_assert_frame_equal -from utils import path_for_fixture - - -def test_use_columns_with_use_columns() -> None: - excel_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet-with-types.xlsx")) - - sheet = excel_reader.load_sheet( - 0, - use_columns=[1, 2], - header_row=None, - skip_rows=1, - column_names=["bools_renamed", "dates_renamed"], - ) - - assert sheet.available_columns == [ - fastexcel.ColumnInfo( - name="__UNNAMED__0", - column_name_from="generated", - index=0, - dtype="float", - dtype_from="guessed", - ), - fastexcel.ColumnInfo( - name="bools_renamed", - index=1, - dtype="boolean", - dtype_from="guessed", - column_name_from="provided", - ), - fastexcel.ColumnInfo( - name="dates_renamed", - index=2, - dtype="datetime", - dtype_from="guessed", - column_name_from="provided", - ), - fastexcel.ColumnInfo( - name="__UNNAMED__3", - index=3, - dtype="float", - dtype_from="guessed", - column_name_from="generated", - ), - ] - - pd_assert_frame_equal( - sheet.to_pandas(), - pd.DataFrame( - { - "bools_renamed": [True, False, True], - "dates_renamed": pd.Series([pd.Timestamp("2022-03-02 05:43:04")] * 3).astype( - "datetime64[ms]" - ), - } - ), - ) - pl_assert_frame_equal( - sheet.to_polars(), - pl.DataFrame( - { - "bools_renamed": [True, False, True], - "dates_renamed": ["2022-03-02 05:43:04"] * 3, - } - ).with_columns( - pl.col("dates_renamed").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms") - ), - ) diff --git a/src/types/python/excelreader.rs b/src/types/python/excelreader.rs index 6089750..c1468b8 100644 --- a/src/types/python/excelreader.rs +++ b/src/types/python/excelreader.rs @@ -84,7 +84,7 @@ impl ExcelReader { fn build_selected_columns( use_columns: Option<&Bound<'_, PyAny>>, ) -> FastExcelResult { - use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | str | None, got {use_columns:?}")) + use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None, got {use_columns:?}")) } // NOTE: Not implementing TryFrom here, because we're aren't building the file from the passed diff --git a/src/types/python/excelsheet/mod.rs b/src/types/python/excelsheet/mod.rs index 543c5a1..5179d99 100644 --- a/src/types/python/excelsheet/mod.rs +++ b/src/types/python/excelsheet/mod.rs @@ -114,27 +114,53 @@ impl TryFrom<&Bound<'_, PyList>> for SelectedColumns { } } -#[derive(Debug, PartialEq)] pub(crate) enum SelectedColumns { All, Selection(Vec), + DynamicSelection(PyObject), +} + +impl std::fmt::Debug for SelectedColumns { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::All => write!(f, "All"), + Self::Selection(selection) => write!(f, "Selection({selection:?})"), + Self::DynamicSelection(func) => { + let addr = func as *const _ as usize; + write!(f, "DynamicSelection({addr})") + } + } + } +} + +impl PartialEq for SelectedColumns { + fn eq(&self, other: &Self) -> bool { + match (self, other) { + (Self::All, Self::All) => true, + (Self::Selection(selection), Self::Selection(other_selection)) => { + selection == other_selection + } + (Self::DynamicSelection(f1), Self::DynamicSelection(f2)) => std::ptr::eq(f1, f2), + _ => false, + } + } } impl SelectedColumns { pub(super) fn select_columns( &self, - column_info: &[ColumnInfo], + available_columns: &[ColumnInfo], ) -> FastExcelResult> { match self { - SelectedColumns::All => Ok(column_info.to_vec()), + SelectedColumns::All => Ok(available_columns.to_vec()), SelectedColumns::Selection(selection) => selection .iter() .map(|selected_column| { match selected_column { - IdxOrName::Idx(index) => column_info + IdxOrName::Idx(index) => available_columns .iter() .find(|col_info| &col_info.index() == index), - IdxOrName::Name(name) => column_info + IdxOrName::Name(name) => available_columns .iter() .find(|col_info| col_info.name() == name.as_str()), } @@ -142,9 +168,28 @@ impl SelectedColumns { FastExcelErrorKind::ColumnNotFound(selected_column.clone()).into() }) .cloned() - .with_context(|| format!("available columns are: {column_info:?}")) + .with_context(|| format!("available columns are: {available_columns:?}")) }) .collect(), + SelectedColumns::DynamicSelection(use_col_func) => Python::with_gil(|py| { + Ok(available_columns + .iter() + .filter_map( + |col_info| match use_col_func.call1(py, (col_info.clone(),)) { + Err(err) => Some(Err(FastExcelErrorKind::InvalidParameters(format!( + "`use_columns` callable could not be called ({err})" + )))), + Ok(should_use_col) => match should_use_col.extract::(py) { + Err(_) => Some(Err(FastExcelErrorKind::InvalidParameters( + "`use_columns` callable should return a boolean".to_string(), + ))), + Ok(true) => Some(Ok(col_info.clone())), + Ok(false) => None, + }, + }, + ) + .collect::, _>>()?) + }), } } @@ -272,6 +317,8 @@ impl TryFrom>> for SelectedColumns { .parse() } else if let Ok(py_list) = py_any.downcast::() { py_list.try_into() + } else if let Ok(py_function) = py_any.extract::() { + Ok(Self::DynamicSelection(py_function)) } else { Err(FastExcelErrorKind::InvalidParameters(format!( "unsupported object type {object_type}",