Skip to content

Commit

Permalink
feat: use_columns can now be a callable (#243)
Browse files Browse the repository at this point in the history
* test: move and fix name of test

* feat: prepare API and tests

* feat: implement feature

* ci: fix

---------

Co-authored-by: Luka Peschke <[email protected]>
  • Loading branch information
PrettyWood and lukapeschke authored Jul 1, 2024
1 parent b93636e commit 0b41321
Show file tree
Hide file tree
Showing 6 changed files with 196 additions and 89 deletions.
10 changes: 6 additions & 4 deletions python/fastexcel/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

import sys
from typing import TYPE_CHECKING, Literal
from typing import TYPE_CHECKING, Callable, Literal

if sys.version_info < (3, 10):
from typing_extensions import TypeAlias
Expand Down Expand Up @@ -128,7 +128,7 @@ def load_sheet(
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | str | None = None,
use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
dtypes: DTypeMap | None = None,
) -> ExcelSheet:
"""Loads a sheet lazily by index or name.
Expand All @@ -153,6 +153,8 @@ def load_sheet(
- A string, a comma separated list of Excel column letters and column
ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in
`A,B,C,D,E` and `A,C,E,F`)
- A callable, a function that takes a `ColumnInfo` and returns a boolean
indicating whether the column should be used
:param dtypes: An optional dict of dtypes. Keys can be column indices or names
"""
return ExcelSheet(
Expand Down Expand Up @@ -209,7 +211,7 @@ def load_sheet_by_name(
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | str | None = None,
use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
dtypes: DTypeMap | None = None,
) -> ExcelSheet:
"""Loads a sheet by name.
Expand All @@ -236,7 +238,7 @@ def load_sheet_by_idx(
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | str | None = None,
use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
dtypes: DTypeMap | None = None,
) -> ExcelSheet:
"""Loads a sheet by index.
Expand Down
4 changes: 2 additions & 2 deletions python/fastexcel/_fastexcel.pyi
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

import typing
from typing import Literal
from typing import Callable, Literal

import pyarrow as pa

Expand Down Expand Up @@ -72,7 +72,7 @@ class _ExcelReader:
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | str | None = None,
use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
dtypes: DTypeMap | None = None,
eager: Literal[False] = ...,
) -> _ExcelSheet: ...
Expand Down
136 changes: 134 additions & 2 deletions python/tests/test_column_selection.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# ruff: noqa: E501
from __future__ import annotations

import re
Expand Down Expand Up @@ -307,7 +308,7 @@ def test_single_sheet_invalid_column_indices_negative_integer(
expected_message = """invalid parameters: expected list[int] | list[str], got [-2]
Context:
0: could not determine selected columns from provided object: [-2]
1: expected selected columns to be list[str] | list[int] | str | None, got Some([-2])
1: expected selected columns to be list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None, got Some([-2])
"""
with pytest.raises(fastexcel.InvalidParametersError, match=re.escape(expected_message)):
excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[-2])
Expand All @@ -319,7 +320,7 @@ def test_single_sheet_invalid_column_indices_empty_list(
expected_message = """invalid parameters: list of selected columns is empty
Context:
0: could not determine selected columns from provided object: []
1: expected selected columns to be list[str] | list[int] | str | None, got Some([])
1: expected selected columns to be list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None, got Some([])
"""
with pytest.raises(fastexcel.InvalidParametersError, match=re.escape(expected_message)):
excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[])
Expand All @@ -345,3 +346,134 @@ def test_single_sheet_invalid_column_indices_column_does_not_exist_int(
"""
with pytest.raises(fastexcel.ColumnNotFoundError, match=expected_message):
excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[42])


def test_use_columns_with_column_names() -> None:
    """Check how `column_names` combines with an index-based `use_columns`.

    Columns 1 and 2 of a header-less sheet are selected and given explicit
    names. The test expects `available_columns` to still list all four
    columns (the unselected ones with generated names), while the pandas and
    polars outputs only contain the two renamed columns.
    """
    reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet-with-types.xlsx"))
    sheet = reader.load_sheet(
        0,
        use_columns=[1, 2],
        header_row=None,
        skip_rows=1,
        column_names=["bools_renamed", "dates_renamed"],
    )

    # Every column of the sheet is reported, whether selected or not.
    expected_columns = [
        fastexcel.ColumnInfo(
            name="__UNNAMED__0",
            index=0,
            dtype="float",
            dtype_from="guessed",
            column_name_from="generated",
        ),
        fastexcel.ColumnInfo(
            name="bools_renamed",
            index=1,
            dtype="boolean",
            dtype_from="guessed",
            column_name_from="provided",
        ),
        fastexcel.ColumnInfo(
            name="dates_renamed",
            index=2,
            dtype="datetime",
            dtype_from="guessed",
            column_name_from="provided",
        ),
        fastexcel.ColumnInfo(
            name="__UNNAMED__3",
            index=3,
            dtype="float",
            dtype_from="guessed",
            column_name_from="generated",
        ),
    ]
    assert sheet.available_columns == expected_columns

    # Only the selected (and renamed) columns end up in the dataframes.
    expected_pandas = pd.DataFrame(
        {
            "bools_renamed": [True, False, True],
            "dates_renamed": pd.Series([pd.Timestamp("2022-03-02 05:43:04")] * 3).astype(
                "datetime64[ms]"
            ),
        }
    )
    pd_assert_frame_equal(sheet.to_pandas(), expected_pandas)

    parse_dates = pl.col("dates_renamed").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms")
    expected_polars = pl.DataFrame(
        {
            "bools_renamed": [True, False, True],
            "dates_renamed": ["2022-03-02 05:43:04"] * 3,
        }
    ).with_columns(parse_dates)
    pl_assert_frame_equal(sheet.to_polars(), expected_polars)


def test_use_columns_with_callable() -> None:
    """Check column selection when `use_columns` is a predicate.

    The sheet is first loaded without a filter to pin down the full set of
    columns, then loaded three more times with predicates on the column's
    name, index and dtype respectively.
    """
    reader = fastexcel.read_excel(path_for_fixture("fixture-multi-sheet.xlsx"))

    def names_and_dtypes(columns: list[fastexcel.ColumnInfo]) -> list[tuple[str, str]]:
        # Compare on (name, dtype) only, ignoring the other column attributes.
        return [(column.name, column.dtype) for column in columns]

    # Without a filter, every available column is also a selected column.
    unfiltered = reader.load_sheet(2)
    every_column = [
        ("col1", "float"),
        ("__UNNAMED__1", "float"),
        ("col3", "string"),
        ("__UNNAMED__3", "float"),
        ("col5", "string"),
    ]
    assert (
        names_and_dtypes(unfiltered.available_columns)
        == names_and_dtypes(unfiltered.selected_columns)
        == every_column
    )

    # Predicate on the column name.
    by_name = reader.load_sheet(2, use_columns=lambda col: col.name.startswith("col"))
    assert names_and_dtypes(by_name.selected_columns) == [
        ("col1", "float"),
        ("col3", "string"),
        ("col5", "string"),
    ]

    # Predicate on the column index.
    by_index = reader.load_sheet(2, use_columns=lambda col: col.index % 2 == 1)
    assert names_and_dtypes(by_index.selected_columns) == [
        ("__UNNAMED__1", "float"),
        ("__UNNAMED__3", "float"),
    ]

    # Predicate on the column dtype.
    by_dtype = reader.load_sheet(2, use_columns=lambda col: col.dtype == "string")
    assert names_and_dtypes(by_dtype.selected_columns) == [
        ("col3", "string"),
        ("col5", "string"),
    ]


def test_use_columns_with_bad_callable() -> None:
    """Check that unusable `use_columns` callables are rejected.

    Two cases are covered: a callable that accepts no argument, and a callable
    that returns a non-boolean value. Both are expected to raise
    `fastexcel.InvalidParametersError` with a dedicated message.
    """
    reader = fastexcel.read_excel(path_for_fixture("fixture-multi-sheet.xlsx"))

    # The message contains a parenthesis, so it is escaped before being used as a regex.
    uncallable_pattern = re.escape("`use_columns` callable could not be called (TypeError: ")
    with pytest.raises(fastexcel.InvalidParametersError, match=uncallable_pattern):
        reader.load_sheet(2, use_columns=lambda: True)  # type: ignore

    non_boolean_pattern = "`use_columns` callable should return a boolean"
    with pytest.raises(fastexcel.InvalidParametersError, match=non_boolean_pattern):
        reader.load_sheet(2, use_columns=lambda _: 42)  # type: ignore
74 changes: 0 additions & 74 deletions python/tests/test_use_columns.py

This file was deleted.

2 changes: 1 addition & 1 deletion src/types/python/excelreader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ impl ExcelReader {
fn build_selected_columns(
use_columns: Option<&Bound<'_, PyAny>>,
) -> FastExcelResult<SelectedColumns> {
use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | str | None, got {use_columns:?}"))
use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None, got {use_columns:?}"))
}

// NOTE: Not implementing TryFrom here, because we aren't building the file from the passed
Expand Down
Loading

0 comments on commit 0b41321

Please sign in to comment.