Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: use_columns can now be a callable #243

Merged
merged 5 commits into from
Jul 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions python/fastexcel/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

import sys
from typing import TYPE_CHECKING, Literal
from typing import TYPE_CHECKING, Callable, Literal

if sys.version_info < (3, 10):
from typing_extensions import TypeAlias
Expand Down Expand Up @@ -128,7 +128,7 @@ def load_sheet(
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | str | None = None,
use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
dtypes: DTypeMap | None = None,
) -> ExcelSheet:
"""Loads a sheet lazily by index or name.
Expand All @@ -153,6 +153,8 @@ def load_sheet(
- A string, a comma separated list of Excel column letters and column
ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in
`A,B,C,D,E` and `A,C,E,F`)
- A callable, a function that takes a `ColumnInfo` and returns a boolean
indicating whether the column should be used
:param dtypes: An optional dict of dtypes. Keys can be column indices or names
"""
return ExcelSheet(
Expand Down Expand Up @@ -209,7 +211,7 @@ def load_sheet_by_name(
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | str | None = None,
use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
dtypes: DTypeMap | None = None,
) -> ExcelSheet:
"""Loads a sheet by name.
Expand All @@ -236,7 +238,7 @@ def load_sheet_by_idx(
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | str | None = None,
use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
dtypes: DTypeMap | None = None,
) -> ExcelSheet:
"""Loads a sheet by index.
Expand Down
4 changes: 2 additions & 2 deletions python/fastexcel/_fastexcel.pyi
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

import typing
from typing import Literal
from typing import Callable, Literal

import pyarrow as pa

Expand Down Expand Up @@ -72,7 +72,7 @@ class _ExcelReader:
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | str | None = None,
use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None,
dtypes: DTypeMap | None = None,
eager: Literal[False] = ...,
) -> _ExcelSheet: ...
Expand Down
136 changes: 134 additions & 2 deletions python/tests/test_column_selection.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# ruff: noqa: E501
from __future__ import annotations

import re
Expand Down Expand Up @@ -307,7 +308,7 @@ def test_single_sheet_invalid_column_indices_negative_integer(
expected_message = """invalid parameters: expected list[int] | list[str], got [-2]
Context:
0: could not determine selected columns from provided object: [-2]
1: expected selected columns to be list[str] | list[int] | str | None, got Some([-2])
1: expected selected columns to be list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None, got Some([-2])
"""
with pytest.raises(fastexcel.InvalidParametersError, match=re.escape(expected_message)):
excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[-2])
Expand All @@ -319,7 +320,7 @@ def test_single_sheet_invalid_column_indices_empty_list(
expected_message = """invalid parameters: list of selected columns is empty
Context:
0: could not determine selected columns from provided object: []
1: expected selected columns to be list[str] | list[int] | str | None, got Some([])
1: expected selected columns to be list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None, got Some([])
"""
with pytest.raises(fastexcel.InvalidParametersError, match=re.escape(expected_message)):
excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[])
Expand All @@ -345,3 +346,134 @@ def test_single_sheet_invalid_column_indices_column_does_not_exist_int(
"""
with pytest.raises(fastexcel.ColumnNotFoundError, match=expected_message):
excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[42])


def test_use_columns_with_column_names() -> None:
    """Check index-based column selection combined with provided column names.

    The sheet is loaded without a header row (skipping the first row), with
    columns 1 and 2 selected and renamed through ``column_names``. The test
    verifies that:

    - ``available_columns`` still lists all four columns, where only the
      selected ones carry a provided name and the others a generated one;
    - both the pandas and polars conversions contain only the renamed columns.
    """
    reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet-with-types.xlsx"))
    loaded = reader.load_sheet(
        0,
        use_columns=[1, 2],
        header_row=None,
        skip_rows=1,
        column_names=["bools_renamed", "dates_renamed"],
    )

    # Unselected columns keep generated names; selected ones get the provided names.
    expected_columns = [
        fastexcel.ColumnInfo(
            name="__UNNAMED__0",
            index=0,
            dtype="float",
            dtype_from="guessed",
            column_name_from="generated",
        ),
        fastexcel.ColumnInfo(
            name="bools_renamed",
            index=1,
            dtype="boolean",
            dtype_from="guessed",
            column_name_from="provided",
        ),
        fastexcel.ColumnInfo(
            name="dates_renamed",
            index=2,
            dtype="datetime",
            dtype_from="guessed",
            column_name_from="provided",
        ),
        fastexcel.ColumnInfo(
            name="__UNNAMED__3",
            index=3,
            dtype="float",
            dtype_from="guessed",
            column_name_from="generated",
        ),
    ]
    assert loaded.available_columns == expected_columns

    expected_pandas = pd.DataFrame(
        {
            "bools_renamed": [True, False, True],
            "dates_renamed": pd.Series([pd.Timestamp("2022-03-02 05:43:04")] * 3).astype(
                "datetime64[ms]"
            ),
        }
    )
    pd_assert_frame_equal(loaded.to_pandas(), expected_pandas)

    # Dates are built as strings first, then parsed and cast to millisecond precision.
    parsed_dates = pl.col("dates_renamed").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms")
    expected_polars = pl.DataFrame(
        {
            "bools_renamed": [True, False, True],
            "dates_renamed": ["2022-03-02 05:43:04"] * 3,
        }
    ).with_columns(parsed_dates)
    pl_assert_frame_equal(loaded.to_polars(), expected_polars)


def test_use_columns_with_callable() -> None:
    """Check that ``use_columns`` accepts a predicate applied to each column.

    The sheet at index 2 is first loaded without any selection to pin down the
    full set of columns, then reloaded with predicates filtering on the
    column's name, index and dtype.
    """
    reader = fastexcel.read_excel(path_for_fixture("fixture-multi-sheet.xlsx"))

    def names_and_dtypes(columns):
        # Reduce column info objects to the two fields compared in this test.
        return [(column.name, column.dtype) for column in columns]

    # Without a selection, every available column is also selected.
    full_sheet = reader.load_sheet(2)
    all_columns = [
        ("col1", "float"),
        ("__UNNAMED__1", "float"),
        ("col3", "string"),
        ("__UNNAMED__3", "float"),
        ("col5", "string"),
    ]
    assert names_and_dtypes(full_sheet.available_columns) == all_columns
    assert names_and_dtypes(full_sheet.selected_columns) == all_columns

    # Each case pairs a predicate with the columns it is expected to keep.
    cases = [
        (
            lambda col: col.name.startswith("col"),
            [("col1", "float"), ("col3", "string"), ("col5", "string")],
        ),
        (
            lambda col: col.index % 2 == 1,
            [("__UNNAMED__1", "float"), ("__UNNAMED__3", "float")],
        ),
        (
            lambda col: col.dtype == "string",
            [("col3", "string"), ("col5", "string")],
        ),
    ]
    for predicate, expected in cases:
        filtered_sheet = reader.load_sheet(2, use_columns=predicate)
        assert names_and_dtypes(filtered_sheet.selected_columns) == expected


def test_use_columns_with_bad_callable() -> None:
    """Check that unusable ``use_columns`` callables raise ``InvalidParametersError``.

    Two failure modes are covered: a callable that cannot be called with the
    column argument, and a callable that returns a non-boolean value.
    """
    reader = fastexcel.read_excel(path_for_fixture("fixture-multi-sheet.xlsx"))

    def accepts_no_argument():
        # Takes no parameter, so calling it with a column raises TypeError.
        return True

    def returns_non_boolean(_):
        return 42

    # The pattern is escaped because the expected message contains a parenthesis.
    not_callable_pattern = re.escape("`use_columns` callable could not be called (TypeError: ")
    with pytest.raises(fastexcel.InvalidParametersError, match=not_callable_pattern):
        reader.load_sheet(2, use_columns=accepts_no_argument)  # type: ignore

    not_boolean_pattern = "`use_columns` callable should return a boolean"
    with pytest.raises(fastexcel.InvalidParametersError, match=not_boolean_pattern):
        reader.load_sheet(2, use_columns=returns_non_boolean)  # type: ignore
74 changes: 0 additions & 74 deletions python/tests/test_use_columns.py

This file was deleted.

2 changes: 1 addition & 1 deletion src/types/python/excelreader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ impl ExcelReader {
fn build_selected_columns(
use_columns: Option<&Bound<'_, PyAny>>,
) -> FastExcelResult<SelectedColumns> {
use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | str | None, got {use_columns:?}"))
use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None, got {use_columns:?}"))
}

// NOTE: Not implementing TryFrom here, because we aren't building the file from the passed
Expand Down
Loading
Loading