Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: allow to select a subset of columns #189

Merged
merged 14 commits into from
Feb 27, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ default-features = false
features = ["pyarrow"]

[dev-dependencies]
pretty_assertions = "1.4.0"
rstest = { version = "0.18.2", default-features = false }

# NOTE: This is a hack to bypass pyo3 limitations when testing:
Expand Down
7 changes: 7 additions & 0 deletions python/fastexcel/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ def load_sheet_by_name(
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | None = None,
) -> ExcelSheet:
"""Loads a sheet by name.

Expand All @@ -127,6 +128,7 @@ def load_sheet_by_name(
skip_rows=skip_rows,
n_rows=n_rows,
schema_sample_rows=schema_sample_rows,
use_columns=use_columns,
)
)

Expand All @@ -139,6 +141,7 @@ def load_sheet_by_idx(
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | None = None,
) -> ExcelSheet:
"""Loads a sheet by index.

Expand Down Expand Up @@ -166,6 +169,7 @@ def load_sheet_by_idx(
skip_rows=skip_rows,
n_rows=n_rows,
schema_sample_rows=schema_sample_rows,
use_columns=use_columns,
)
)

Expand All @@ -178,6 +182,7 @@ def load_sheet(
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | None = None,
PrettyWood marked this conversation as resolved.
Show resolved Hide resolved
) -> ExcelSheet:
"""Loads a sheet by name if a string is passed or by index if an integer is passed.

Expand All @@ -191,6 +196,7 @@ def load_sheet(
skip_rows=skip_rows,
n_rows=n_rows,
schema_sample_rows=schema_sample_rows,
use_columns=use_columns,
)
if isinstance(idx_or_name, int)
else self.load_sheet_by_name(
Expand All @@ -200,6 +206,7 @@ def load_sheet(
skip_rows=skip_rows,
n_rows=n_rows,
schema_sample_rows=schema_sample_rows,
use_columns=use_columns,
)
)

Expand Down
12 changes: 2 additions & 10 deletions python/fastexcel/_fastexcel.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class _ExcelReader:
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | None = None,
) -> _ExcelSheet: ...
def load_sheet_by_idx(
self,
Expand All @@ -43,16 +44,7 @@ class _ExcelReader:
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
) -> _ExcelSheet: ...
def load_sheet(
self,
idx_or_name: int | str,
*,
header_row: int | None = 0,
column_names: list[str] | None = None,
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | None = None,
) -> _ExcelSheet: ...
@property
def sheet_names(self) -> list[str]: ...
Expand Down
251 changes: 251 additions & 0 deletions python/tests/test_column_selection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,251 @@
from __future__ import annotations

import re
from typing import Any

import fastexcel
import pandas as pd
import polars as pl
import pytest
from pandas.testing import assert_frame_equal as pd_assert_frame_equal
from polars.testing import assert_frame_equal as pl_assert_frame_equal
from utils import path_for_fixture


@pytest.fixture
def excel_reader_single_sheet() -> fastexcel.ExcelReader:
    """Provide a reader opened on the ``fixture-single-sheet.xlsx`` workbook."""
    workbook_path = path_for_fixture("fixture-single-sheet.xlsx")
    return fastexcel.read_excel(workbook_path)


def test_single_sheet_all_columns(excel_reader_single_sheet: fastexcel.ExcelReader) -> None:
    """Check that omitting ``use_columns`` and passing ``None`` both yield every column."""
    default_sheet = excel_reader_single_sheet.load_sheet(0)
    explicit_none_sheet = excel_reader_single_sheet.load_sheet(0, use_columns=None)
    loaded_sheets = (default_sheet, explicit_none_sheet)

    expected = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]}
    expected_pd_df = pd.DataFrame(expected)
    expected_pl_df = pl.DataFrame(expected)

    # pandas conversions first, then polars, for both ways of loading the sheet.
    for loaded in loaded_sheets:
        pd_assert_frame_equal(loaded.to_pandas(), expected_pd_df)

    for loaded in loaded_sheets:
        pl_assert_frame_equal(loaded.to_polars(), expected_pl_df)


def test_single_sheet_subset_by_str(excel_reader_single_sheet: fastexcel.ExcelReader) -> None:
    """Select each column by name, addressing the sheet both by index and by name.

    Review note (raised on this PR, acknowledged by the author with commit 8ff4192):
    the sheet must be loaded through ``sheet_name_or_idx``. With a hard-coded ``0``
    the outer loop ran twice against the same sheet index and the by-name lookup
    was never exercised.
    """
    expected = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]}

    # Annotated explicitly so the mixed int/str list matches load_sheet's `int | str` argument.
    sheets: list[int | str] = [0, "January"]
    for sheet_name_or_idx in sheets:
        for col in ["Month", "Year"]:
            sheet = excel_reader_single_sheet.load_sheet(sheet_name_or_idx, use_columns=[col])

            pd_df = sheet.to_pandas()
            pd_assert_frame_equal(pd_df, pd.DataFrame({col: expected[col]}))

            pl_df = sheet.to_polars()
            pl_assert_frame_equal(pl_df, pl.DataFrame({col: expected[col]}))


def test_single_sheet_subset_by_index(excel_reader_single_sheet: fastexcel.ExcelReader) -> None:
    """Select each column by position, addressing the sheet both by index and by name.

    The sheet is loaded through ``sheet_name_or_idx`` so that both the index and the
    name lookup are exercised; a hard-coded ``0`` would leave the loop variable unused
    and test the same path twice.
    """
    expected = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]}

    # Annotated explicitly so the mixed int/str list matches load_sheet's `int | str` argument.
    sheets: list[int | str] = [0, "January"]
    for sheet_name_or_idx in sheets:
        for idx, col_name in enumerate(["Month", "Year"]):
            sheet = excel_reader_single_sheet.load_sheet(sheet_name_or_idx, use_columns=[idx])

            pd_df = sheet.to_pandas()
            pd_assert_frame_equal(pd_df, pd.DataFrame({col_name: expected[col_name]}))

            pl_df = sheet.to_polars()
            pl_assert_frame_equal(pl_df, pl.DataFrame({col_name: expected[col_name]}))


@pytest.fixture
def excel_reader_single_sheet_with_unnamed_columns() -> fastexcel.ExcelReader:
    """Provide a reader opened on the ``fixture-multi-sheet.xlsx`` workbook."""
    workbook_path = path_for_fixture("fixture-multi-sheet.xlsx")
    return fastexcel.read_excel(workbook_path)


@pytest.fixture
def single_sheet_with_unnamed_columns_expected() -> dict[str, list[Any]]:
    """Return the expected column data, keyed by column name, in column order."""
    columns: dict[str, list[Any]] = {}
    columns["col1"] = [2.0, 3.0]
    columns["__UNNAMED__1"] = [1.5, 2.5]
    columns["col3"] = ["hello", "world"]
    columns["__UNNAMED__3"] = [-5.0, -6.0]
    columns["col5"] = ["a", "b"]
    return columns


def test_single_sheet_with_unnamed_columns(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
    single_sheet_with_unnamed_columns_expected: dict[str, list[Any]],
) -> None:
    """Select the same three columns by name and by index and compare both results."""
    use_columns_str = ["col1", "col3", "__UNNAMED__3"]
    use_columns_idx = [0, 2, 3]
    expected = {
        name: values
        for name, values in single_sheet_with_unnamed_columns_expected.items()
        if name in use_columns_str
    }

    # Name-based selection first, then the equivalent index-based selection.
    selections: list[list[str] | list[int]] = [use_columns_str, use_columns_idx]
    for use_columns in selections:
        sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
            "With unnamed columns", use_columns=use_columns
        )

        pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
        pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))


def test_single_sheet_with_unnamed_columns_and_pagination(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
    single_sheet_with_unnamed_columns_expected: dict[str, list[Any]],
) -> None:
    """Combine column selection with ``n_rows`` / ``skip_rows`` pagination."""
    use_columns_str = ["col1", "col3", "__UNNAMED__3"]
    use_columns_idx = [0, 2, 3]
    selections: list[list[str] | list[int]] = [use_columns_str, use_columns_idx]

    selected = {
        name: values
        for name, values in single_sheet_with_unnamed_columns_expected.items()
        if name in use_columns_str
    }

    # Each case is (n_rows, skip_rows, slice of the data rows expected back).
    # The values not under test are load_sheet's defaults (n_rows=None, skip_rows=0).
    cases: list[tuple[int | None, int, slice]] = [
        (1, 0, slice(None, 1)),  # first row only
        (None, 1, slice(1, None)),  # second row
    ]

    for n_rows, skip_rows, rows in cases:
        expected = {name: values[rows] for name, values in selected.items()}

        for use_columns in selections:
            sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
                "With unnamed columns",
                use_columns=use_columns,
                n_rows=n_rows,
                skip_rows=skip_rows,
            )

            pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
            pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))


def test_single_sheet_with_unnamed_columns_and_pagination_and_column_names(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
) -> None:
    """Combine column selection with ``skip_rows`` and caller-supplied ``column_names``."""
    column_names = [f"col{i}" for i in range(5)]
    use_columns_str = ["col0", "col2", "col3"]
    use_columns_idx = [0, 2, 3]
    selections: list[list[str] | list[int]] = [use_columns_str, use_columns_idx]

    expected: dict[str, list[Any]] = {
        "col0": [2.0, 3.0],
        "col2": ["hello", "world"],
        "col3": [-5.0, -6.0],
    }
    expected_first_row_skipped = {name: values[1:] for name, values in expected.items()}

    # Each case is (skip_rows, expected data).
    cases = [
        (1, expected),  # skipping the header row only
        (2, expected_first_row_skipped),  # skipping the header row + first data row
    ]

    for skip_rows, expected_data in cases:
        for use_columns in selections:
            sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
                "With unnamed columns",
                use_columns=use_columns,
                skip_rows=skip_rows,
                column_names=column_names,
            )

            pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected_data))
            pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected_data))


def test_single_sheet_invalid_column_indices_negative_integer(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
) -> None:
    """Check that a negative column index is rejected with ``InvalidParametersError``."""
    reader = excel_reader_single_sheet_with_unnamed_columns
    # NOTE(review): leading whitespace inside this literal may have been lost when the
    # page was extracted -- confirm against the actual error text before relying on it.
    expected_message = """invalid parameters: expected list[int] | list[str], got [-2]
Context:
0: expected selected columns to be list[str] | list[int] | None, got Some([-2])
"""
    # The message contains regex metacharacters ([, |, (), so it is escaped for `match`.
    pattern = re.escape(expected_message)
    with pytest.raises(fastexcel.InvalidParametersError, match=pattern):
        reader.load_sheet(0, use_columns=[-2])


def test_single_sheet_invalid_column_indices_empty_list(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
) -> None:
    """Check that an empty ``use_columns`` list is rejected with ``InvalidParametersError``."""
    reader = excel_reader_single_sheet_with_unnamed_columns
    # NOTE(review): leading whitespace inside this literal may have been lost when the
    # page was extracted -- confirm against the actual error text before relying on it.
    expected_message = """invalid parameters: list of select columns is empty
Context:
0: expected selected columns to be list[str] | list[int] | None, got Some([])
"""
    # The message contains regex metacharacters ([, |, (), so it is escaped for `match`.
    pattern = re.escape(expected_message)
    with pytest.raises(fastexcel.InvalidParametersError, match=pattern):
        reader.load_sheet(0, use_columns=[])


def test_single_sheet_invalid_column_indices_column_does_not_exist_str(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
) -> None:
    """Check that selecting an unknown column name raises ``ColumnNotFoundError``.

    The error is expected inside the ``with`` block that loads the sheet and calls
    ``to_arrow()`` on it.
    """
    reader = excel_reader_single_sheet_with_unnamed_columns
    # NOTE(review): leading whitespace inside this literal may have been lost when the
    # page was extracted -- confirm against the actual error text before relying on it.
    expected_message = """column with name "nope" not found
Context:
0: selected columns are invalid
1: could not create RecordBatch from sheet "January"
2: could not convert RecordBatch to pyarrow for sheet "January"
"""
    pattern = re.escape(expected_message)
    with pytest.raises(fastexcel.ColumnNotFoundError, match=pattern):
        reader.load_sheet(0, use_columns=["nope"]).to_arrow()


def test_single_sheet_invalid_column_indices_column_does_not_exist_int(
excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
) -> None:
expected_message = """column at index 42 not found
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would be nice to add to ExcelSheet the range of indices and list of columns. Currently we only have the width

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will add 👍

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

045a114. I wonder whether the property should be called use_columns to match the parameter though 🤔

Context:
0: selected columns are invalid
1: could not create RecordBatch from sheet "January"
2: could not convert RecordBatch to pyarrow for sheet "January"
"""
with pytest.raises(fastexcel.ColumnNotFoundError, match=re.escape(expected_message)):
excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[42]).to_arrow()
Loading
Loading