From 162cde36ad5a84d3fe6df70f3239614b15c23ada Mon Sep 17 00:00:00 2001 From: Eric Jolibois Date: Sun, 30 Jun 2024 14:51:22 +0200 Subject: [PATCH 1/4] test: move and fix name of test --- python/tests/test_column_selection.py | 66 ++++++++++++++++++++++++ python/tests/test_use_columns.py | 74 --------------------------- 2 files changed, 66 insertions(+), 74 deletions(-) delete mode 100644 python/tests/test_use_columns.py diff --git a/python/tests/test_column_selection.py b/python/tests/test_column_selection.py index 14a5387..2e9a5e6 100644 --- a/python/tests/test_column_selection.py +++ b/python/tests/test_column_selection.py @@ -345,3 +345,69 @@ def test_single_sheet_invalid_column_indices_column_does_not_exist_int( """ with pytest.raises(fastexcel.ColumnNotFoundError, match=expected_message): excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[42]) + + +def test_use_columns_with_column_names() -> None: + excel_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet-with-types.xlsx")) + + sheet = excel_reader.load_sheet( + 0, + use_columns=[1, 2], + header_row=None, + skip_rows=1, + column_names=["bools_renamed", "dates_renamed"], + ) + + assert sheet.available_columns == [ + fastexcel.ColumnInfo( + name="__UNNAMED__0", + column_name_from="generated", + index=0, + dtype="float", + dtype_from="guessed", + ), + fastexcel.ColumnInfo( + name="bools_renamed", + index=1, + dtype="boolean", + dtype_from="guessed", + column_name_from="provided", + ), + fastexcel.ColumnInfo( + name="dates_renamed", + index=2, + dtype="datetime", + dtype_from="guessed", + column_name_from="provided", + ), + fastexcel.ColumnInfo( + name="__UNNAMED__3", + index=3, + dtype="float", + dtype_from="guessed", + column_name_from="generated", + ), + ] + + pd_assert_frame_equal( + sheet.to_pandas(), + pd.DataFrame( + { + "bools_renamed": [True, False, True], + "dates_renamed": pd.Series([pd.Timestamp("2022-03-02 05:43:04")] * 3).astype( + "datetime64[ms]" + ), + } + 
), + ) + pl_assert_frame_equal( + sheet.to_polars(), + pl.DataFrame( + { + "bools_renamed": [True, False, True], + "dates_renamed": ["2022-03-02 05:43:04"] * 3, + } + ).with_columns( + pl.col("dates_renamed").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms") + ), + ) diff --git a/python/tests/test_use_columns.py b/python/tests/test_use_columns.py deleted file mode 100644 index 4115c08..0000000 --- a/python/tests/test_use_columns.py +++ /dev/null @@ -1,74 +0,0 @@ -from __future__ import annotations - -import fastexcel -import pandas as pd -import polars as pl -from pandas.testing import assert_frame_equal as pd_assert_frame_equal -from polars.testing import assert_frame_equal as pl_assert_frame_equal -from utils import path_for_fixture - - -def test_use_columns_with_use_columns() -> None: - excel_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet-with-types.xlsx")) - - sheet = excel_reader.load_sheet( - 0, - use_columns=[1, 2], - header_row=None, - skip_rows=1, - column_names=["bools_renamed", "dates_renamed"], - ) - - assert sheet.available_columns == [ - fastexcel.ColumnInfo( - name="__UNNAMED__0", - column_name_from="generated", - index=0, - dtype="float", - dtype_from="guessed", - ), - fastexcel.ColumnInfo( - name="bools_renamed", - index=1, - dtype="boolean", - dtype_from="guessed", - column_name_from="provided", - ), - fastexcel.ColumnInfo( - name="dates_renamed", - index=2, - dtype="datetime", - dtype_from="guessed", - column_name_from="provided", - ), - fastexcel.ColumnInfo( - name="__UNNAMED__3", - index=3, - dtype="float", - dtype_from="guessed", - column_name_from="generated", - ), - ] - - pd_assert_frame_equal( - sheet.to_pandas(), - pd.DataFrame( - { - "bools_renamed": [True, False, True], - "dates_renamed": pd.Series([pd.Timestamp("2022-03-02 05:43:04")] * 3).astype( - "datetime64[ms]" - ), - } - ), - ) - pl_assert_frame_equal( - sheet.to_polars(), - pl.DataFrame( - { - "bools_renamed": [True, False, True], - "dates_renamed": 
["2022-03-02 05:43:04"] * 3, - } - ).with_columns( - pl.col("dates_renamed").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms") - ), - ) From 1a3990b712c5442781ad64f8b7b20b32daf1a2a1 Mon Sep 17 00:00:00 2001 From: Eric Jolibois Date: Sun, 30 Jun 2024 15:11:57 +0200 Subject: [PATCH 2/4] feat: prepare API and tests --- python/fastexcel/__init__.py | 10 +++--- python/fastexcel/_fastexcel.pyi | 4 +-- python/tests/test_column_selection.py | 46 +++++++++++++++++++++++++-- src/types/python/excelreader.rs | 2 +- 4 files changed, 53 insertions(+), 9 deletions(-) diff --git a/python/fastexcel/__init__.py b/python/fastexcel/__init__.py index 8717e1b..2bd70e4 100644 --- a/python/fastexcel/__init__.py +++ b/python/fastexcel/__init__.py @@ -1,7 +1,7 @@ from __future__ import annotations import sys -from typing import TYPE_CHECKING, Literal +from typing import TYPE_CHECKING, Callable, Literal if sys.version_info < (3, 10): from typing_extensions import TypeAlias @@ -128,7 +128,7 @@ def load_sheet( skip_rows: int = 0, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, - use_columns: list[str] | list[int] | str | None = None, + use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, dtypes: DTypeMap | None = None, ) -> ExcelSheet: """Loads a sheet by index or name. @@ -153,6 +153,8 @@ def load_sheet( - A string, a comma separated list of Excel column letters and column ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in `A,B,C,D,E` and `A,C,E,F`) + - A callable, a function that takes a column and returns a boolean + indicating whether the column should be used :param dtypes: An optional dict of dtypes. 
Keys can be column indices or names """ return ExcelSheet( @@ -177,7 +179,7 @@ def load_sheet_by_name( skip_rows: int = 0, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, - use_columns: list[str] | list[int] | str | None = None, + use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, dtypes: DTypeMap | None = None, ) -> ExcelSheet: """Loads a sheet by name. @@ -206,7 +208,7 @@ def load_sheet_by_idx( skip_rows: int = 0, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, - use_columns: list[str] | list[int] | str | None = None, + use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, dtypes: DTypeMap | None = None, ) -> ExcelSheet: """Loads a sheet by index. diff --git a/python/fastexcel/_fastexcel.pyi b/python/fastexcel/_fastexcel.pyi index 0d6d3e1..24ac56e 100644 --- a/python/fastexcel/_fastexcel.pyi +++ b/python/fastexcel/_fastexcel.pyi @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Literal +from typing import Callable, Literal import pyarrow as pa @@ -70,7 +70,7 @@ class _ExcelReader: skip_rows: int = 0, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, - use_columns: list[str] | list[int] | str | None = None, + use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, dtypes: DTypeMap | None = None, ) -> _ExcelSheet: ... 
@property diff --git a/python/tests/test_column_selection.py b/python/tests/test_column_selection.py index 2e9a5e6..cf82de5 100644 --- a/python/tests/test_column_selection.py +++ b/python/tests/test_column_selection.py @@ -1,3 +1,4 @@ +# ruff: noqa: E501 from __future__ import annotations import re @@ -307,7 +308,7 @@ def test_single_sheet_invalid_column_indices_negative_integer( expected_message = """invalid parameters: expected list[int] | list[str], got [-2] Context: 0: could not determine selected columns from provided object: [-2] - 1: expected selected columns to be list[str] | list[int] | str | None, got Some([-2]) + 1: expected selected columns to be list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None, got Some([-2]) """ with pytest.raises(fastexcel.InvalidParametersError, match=re.escape(expected_message)): excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[-2]) @@ -319,7 +320,7 @@ def test_single_sheet_invalid_column_indices_empty_list( expected_message = """invalid parameters: list of selected columns is empty Context: 0: could not determine selected columns from provided object: [] - 1: expected selected columns to be list[str] | list[int] | str | None, got Some([]) + 1: expected selected columns to be list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None, got Some([]) """ with pytest.raises(fastexcel.InvalidParametersError, match=re.escape(expected_message)): excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[]) @@ -411,3 +412,44 @@ def test_use_columns_with_column_names() -> None: pl.col("dates_renamed").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms") ), ) + + +def test_use_columns_with_callable() -> None: + excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-sheet.xlsx")) + + sheet = excel_reader.load_sheet(2) + assert [(c.name, c.dtype) for c in sheet.available_columns] == [ + ("col1", "float"), + ("__UNNAMED__1", "float"), + ("col3", "string"), + 
("__UNNAMED__3", "float"), + ("col5", "string"), + ] + + sheet = excel_reader.load_sheet( + 2, + use_columns=lambda col: col.name.startswith("col"), + ) + assert [(c.name, c.dtype) for c in sheet.available_columns] == [ + ("col1", "float"), + ("col3", "string"), + ("col5", "string"), + ] + + sheet = excel_reader.load_sheet( + 2, + use_columns=lambda col: col.index % 2 == 1, + ) + assert [(c.name, c.dtype) for c in sheet.available_columns] == [ + ("__UNNAMED__1", "float"), + ("__UNNAMED__3", "float"), + ] + + sheet = excel_reader.load_sheet( + 2, + use_columns=lambda col: col.dtype == "string", + ) + assert [(c.name, c.dtype) for c in sheet.available_columns] == [ + ("col3", "string"), + ("col5", "string"), + ] diff --git a/src/types/python/excelreader.rs b/src/types/python/excelreader.rs index e295e6c..574e2b4 100644 --- a/src/types/python/excelreader.rs +++ b/src/types/python/excelreader.rs @@ -63,7 +63,7 @@ impl ExcelReader { } fn build_selected_columns(use_columns: Option<&PyAny>) -> FastExcelResult { - use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | str | None, got {use_columns:?}")) + use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None, got {use_columns:?}")) } #[allow(clippy::too_many_arguments)] From 3f38ba7c2221a02cdef710f20b4c576efe70faa9 Mon Sep 17 00:00:00 2001 From: Eric Jolibois Date: Sun, 30 Jun 2024 16:34:06 +0200 Subject: [PATCH 3/4] feat: implement feature --- python/tests/test_column_selection.py | 46 ++++++++++++++++----- src/types/python/excelsheet/mod.rs | 59 ++++++++++++++++++++++++--- 2 files changed, 89 insertions(+), 16 deletions(-) diff --git a/python/tests/test_column_selection.py b/python/tests/test_column_selection.py index cf82de5..a8d6ea0 100644 --- a/python/tests/test_column_selection.py +++ b/python/tests/test_column_selection.py @@ -418,19 +418,23 @@ def 
test_use_columns_with_callable() -> None: excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-sheet.xlsx")) sheet = excel_reader.load_sheet(2) - assert [(c.name, c.dtype) for c in sheet.available_columns] == [ - ("col1", "float"), - ("__UNNAMED__1", "float"), - ("col3", "string"), - ("__UNNAMED__3", "float"), - ("col5", "string"), - ] + assert ( + [(c.name, c.dtype) for c in sheet.available_columns] + == [(c.name, c.dtype) for c in sheet.selected_columns] + == [ + ("col1", "float"), + ("__UNNAMED__1", "float"), + ("col3", "string"), + ("__UNNAMED__3", "float"), + ("col5", "string"), + ] + ) sheet = excel_reader.load_sheet( 2, use_columns=lambda col: col.name.startswith("col"), ) - assert [(c.name, c.dtype) for c in sheet.available_columns] == [ + assert [(c.name, c.dtype) for c in sheet.selected_columns] == [ ("col1", "float"), ("col3", "string"), ("col5", "string"), @@ -440,7 +444,7 @@ def test_use_columns_with_callable() -> None: 2, use_columns=lambda col: col.index % 2 == 1, ) - assert [(c.name, c.dtype) for c in sheet.available_columns] == [ + assert [(c.name, c.dtype) for c in sheet.selected_columns] == [ ("__UNNAMED__1", "float"), ("__UNNAMED__3", "float"), ] @@ -449,7 +453,29 @@ def test_use_columns_with_callable() -> None: 2, use_columns=lambda col: col.dtype == "string", ) - assert [(c.name, c.dtype) for c in sheet.available_columns] == [ + assert [(c.name, c.dtype) for c in sheet.selected_columns] == [ ("col3", "string"), ("col5", "string"), ] + + +def test_use_columns_with_bad_callable() -> None: + excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-sheet.xlsx")) + with pytest.raises( + fastexcel.InvalidParametersError, + match=re.escape( + "`use_columns` callable could not be called (TypeError: test_use_columns_with_bad_callable..() takes 0 positional arguments but 1 was given)", + ), + ): + excel_reader.load_sheet( + 2, + use_columns=lambda: True, # type: ignore + ) + + with pytest.raises( + 
fastexcel.InvalidParametersError, match="`use_columns` callable should return a boolean" + ): + excel_reader.load_sheet( + 2, + use_columns=lambda _: 42, # type: ignore + ) diff --git a/src/types/python/excelsheet/mod.rs b/src/types/python/excelsheet/mod.rs index 4792891..a1c9a61 100644 --- a/src/types/python/excelsheet/mod.rs +++ b/src/types/python/excelsheet/mod.rs @@ -104,27 +104,53 @@ impl TryFrom<&PyList> for SelectedColumns { } } -#[derive(Debug, PartialEq)] pub(crate) enum SelectedColumns { All, Selection(Vec<IdxOrName>), + DynamicSelection(PyObject), +} + +impl std::fmt::Debug for SelectedColumns { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::All => write!(f, "All"), + Self::Selection(selection) => write!(f, "Selection({selection:?})"), + Self::DynamicSelection(func) => { + let addr = func as *const _ as usize; + write!(f, "DynamicSelection({addr})") + } + } + } +} + +impl PartialEq for SelectedColumns { + fn eq(&self, other: &Self) -> bool { + match (self, other) { + (Self::All, Self::All) => true, + (Self::Selection(selection), Self::Selection(other_selection)) => { + selection == other_selection + } + (Self::DynamicSelection(f1), Self::DynamicSelection(f2)) => std::ptr::eq(f1, f2), + _ => false, + } + } } impl SelectedColumns { pub(super) fn select_columns( &self, - column_info: &[ColumnInfo], + available_columns: &[ColumnInfo], ) -> FastExcelResult<Vec<ColumnInfo>> { match self { - SelectedColumns::All => Ok(column_info.to_vec()), + SelectedColumns::All => Ok(available_columns.to_vec()), SelectedColumns::Selection(selection) => selection .iter() .map(|selected_column| { match selected_column { - IdxOrName::Idx(index) => column_info + IdxOrName::Idx(index) => available_columns .iter() .find(|col_info| &col_info.index() == index), - IdxOrName::Name(name) => column_info + IdxOrName::Name(name) => available_columns .iter() .find(|col_info| col_info.name() == name.as_str()), } @@ -132,9 +158,28 @@ impl SelectedColumns {
FastExcelErrorKind::ColumnNotFound(selected_column.clone()).into() }) .cloned() - .with_context(|| format!("available columns are: {column_info:?}")) + .with_context(|| format!("available columns are: {available_columns:?}")) }) .collect(), + SelectedColumns::DynamicSelection(use_col_func) => Python::with_gil(|py| { + Ok(available_columns + .iter() + .filter_map( + |col_info| match use_col_func.call1(py, (col_info.clone(),)) { + Err(err) => Some(Err(FastExcelErrorKind::InvalidParameters(format!( + "`use_columns` callable could not be called ({err})" + )))), + Ok(should_use_col) => match should_use_col.extract::<bool>(py) { + Err(_) => Some(Err(FastExcelErrorKind::InvalidParameters( + "`use_columns` callable should return a boolean".to_string(), + ))), + Ok(true) => Some(Ok(col_info.clone())), + Ok(false) => None, + }, + }, + ) + .collect::<Result<Vec<_>, _>>()?) + }), } } const ALPHABET: [char; 26] = [ @@ -261,6 +306,8 @@ impl TryFrom<Option<&PyAny>> for SelectedColumns { .parse() } else if let Ok(py_list) = py_any.downcast::<PyList>() { py_list.try_into() + } else if let Ok(py_function) = py_any.extract::<PyObject>() { + Ok(Self::DynamicSelection(py_function)) } else { Err(FastExcelErrorKind::InvalidParameters(format!( "unsupported object type {object_type}", From 97fadffe33434f30e0b7511a5da82dd79600a868 Mon Sep 17 00:00:00 2001 From: Eric Jolibois Date: Sun, 30 Jun 2024 17:22:26 +0200 Subject: [PATCH 4/4] ci: fix --- python/tests/test_column_selection.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/tests/test_column_selection.py b/python/tests/test_column_selection.py index a8d6ea0..588d575 100644 --- a/python/tests/test_column_selection.py +++ b/python/tests/test_column_selection.py @@ -463,9 +463,7 @@ def test_use_columns_with_bad_callable() -> None: excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-sheet.xlsx")) with pytest.raises( fastexcel.InvalidParametersError, - match=re.escape( - "`use_columns` callable could not be called (TypeError: 
test_use_columns_with_bad_callable.<locals>.<lambda>() takes 0 positional arguments but 1 was given)", - ), + match=re.escape("`use_columns` callable could not be called (TypeError: "), ): excel_reader.load_sheet( 2,