Skip to content

Commit c11fc28

Browse files
committed
feat: allow to select a subset of columns
closes #172 Signed-off-by: Luka Peschke <[email protected]>
1 parent 35bb5a6 commit c11fc28

File tree

9 files changed

+540
-56
lines changed

9 files changed

+540
-56
lines changed

Cargo.lock

Lines changed: 23 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ default-features = false
2121
features = ["pyarrow"]
2222

2323
[dev-dependencies]
24+
pretty_assertions = "1.4.0"
2425
rstest = { version = "0.18.2", default-features = false }
2526

2627
# NOTE: This is a hack to bypass pyo3 limitations when testing:

python/fastexcel/__init__.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ def load_sheet_by_name(
102102
skip_rows: int = 0,
103103
n_rows: int | None = None,
104104
schema_sample_rows: int | None = 1_000,
105+
use_columns: list[str] | list[int] | None = None,
105106
) -> ExcelSheet:
106107
"""Loads a sheet by name.
107108
@@ -127,6 +128,7 @@ def load_sheet_by_name(
127128
skip_rows=skip_rows,
128129
n_rows=n_rows,
129130
schema_sample_rows=schema_sample_rows,
131+
use_columns=use_columns,
130132
)
131133
)
132134

@@ -139,6 +141,7 @@ def load_sheet_by_idx(
139141
skip_rows: int = 0,
140142
n_rows: int | None = None,
141143
schema_sample_rows: int | None = 1_000,
144+
use_columns: list[str] | list[int] | None = None,
142145
) -> ExcelSheet:
143146
"""Loads a sheet by index.
144147
@@ -166,6 +169,7 @@ def load_sheet_by_idx(
166169
skip_rows=skip_rows,
167170
n_rows=n_rows,
168171
schema_sample_rows=schema_sample_rows,
172+
use_columns=use_columns,
169173
)
170174
)
171175

@@ -178,6 +182,7 @@ def load_sheet(
178182
skip_rows: int = 0,
179183
n_rows: int | None = None,
180184
schema_sample_rows: int | None = 1_000,
185+
use_columns: list[str] | list[int] | None = None,
181186
) -> ExcelSheet:
182187
"""Loads a sheet by name if a string is passed or by index if an integer is passed.
183188
@@ -191,6 +196,7 @@ def load_sheet(
191196
skip_rows=skip_rows,
192197
n_rows=n_rows,
193198
schema_sample_rows=schema_sample_rows,
199+
use_columns=use_columns,
194200
)
195201
if isinstance(idx_or_name, int)
196202
else self.load_sheet_by_name(
@@ -200,6 +206,7 @@ def load_sheet(
200206
skip_rows=skip_rows,
201207
n_rows=n_rows,
202208
schema_sample_rows=schema_sample_rows,
209+
use_columns=use_columns,
203210
)
204211
)
205212

python/fastexcel/_fastexcel.pyi

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ class _ExcelReader:
3333
skip_rows: int = 0,
3434
n_rows: int | None = None,
3535
schema_sample_rows: int | None = 1_000,
36+
use_columns: list[str] | list[int] | None = None,
3637
) -> _ExcelSheet: ...
3738
def load_sheet_by_idx(
3839
self,
@@ -43,16 +44,7 @@ class _ExcelReader:
4344
skip_rows: int = 0,
4445
n_rows: int | None = None,
4546
schema_sample_rows: int | None = 1_000,
46-
) -> _ExcelSheet: ...
47-
def load_sheet(
48-
self,
49-
idx_or_name: int | str,
50-
*,
51-
header_row: int | None = 0,
52-
column_names: list[str] | None = None,
53-
skip_rows: int = 0,
54-
n_rows: int | None = None,
55-
schema_sample_rows: int | None = 1_000,
47+
use_columns: list[str] | list[int] | None = None,
5648
) -> _ExcelSheet: ...
5749
@property
5850
def sheet_names(self) -> list[str]: ...

python/tests/test_column_selection.py

Lines changed: 251 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,251 @@
1+
from __future__ import annotations
2+
3+
import re
4+
from typing import Any
5+
6+
import fastexcel
7+
import pandas as pd
8+
import polars as pl
9+
import pytest
10+
from pandas.testing import assert_frame_equal as pd_assert_frame_equal
11+
from polars.testing import assert_frame_equal as pl_assert_frame_equal
12+
from utils import path_for_fixture
13+
14+
15+
@pytest.fixture
def excel_reader_single_sheet() -> fastexcel.ExcelReader:
    """Provide a reader opened on the ``fixture-single-sheet.xlsx`` fixture file."""
    fixture_path = path_for_fixture("fixture-single-sheet.xlsx")
    return fastexcel.read_excel(fixture_path)
18+
19+
20+
def test_single_sheet_all_columns(excel_reader_single_sheet: fastexcel.ExcelReader) -> None:
    """Omitting ``use_columns`` and passing ``use_columns=None`` both return every column."""
    implicit_sheet = excel_reader_single_sheet.load_sheet(0)
    explicit_sheet = excel_reader_single_sheet.load_sheet(0, use_columns=None)
    loaded_sheets = (implicit_sheet, explicit_sheet)

    data = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]}
    want_pandas = pd.DataFrame(data)
    want_polars = pl.DataFrame(data)

    # Both sheets must convert to the same pandas frame...
    for loaded in loaded_sheets:
        pd_assert_frame_equal(loaded.to_pandas(), want_pandas)

    # ...and to the same polars frame.
    for loaded in loaded_sheets:
        pl_assert_frame_equal(loaded.to_polars(), want_polars)
38+
39+
40+
def test_single_sheet_subset_by_str(excel_reader_single_sheet: fastexcel.ExcelReader) -> None:
    """Selecting a single column by name returns a frame holding only that column.

    The selection is checked for each column, with the sheet addressed both by
    index and by name.
    """
    expected = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]}

    # NOTE(review): assumes the fixture's sheet at index 0 is named "January" — confirm.
    for sheet_name_or_idx in [0, "January"]:
        for col in ["Month", "Year"]:
            # Pass the loop variable (not a hard-coded 0) so the lookup by name is exercised too.
            sheet = excel_reader_single_sheet.load_sheet(sheet_name_or_idx, use_columns=[col])

            pd_df = sheet.to_pandas()
            pd_assert_frame_equal(pd_df, pd.DataFrame({col: expected[col]}))

            pl_df = sheet.to_polars()
            pl_assert_frame_equal(pl_df, pl.DataFrame({col: expected[col]}))
52+
53+
54+
def test_single_sheet_subset_by_index(excel_reader_single_sheet: fastexcel.ExcelReader) -> None:
    """Selecting a single column by position returns a frame holding only that column.

    The selection is checked for each column index, with the sheet addressed
    both by index and by name.
    """
    expected = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]}

    # NOTE(review): assumes the fixture's sheet at index 0 is named "January" — confirm.
    for sheet_name_or_idx in [0, "January"]:
        for idx, col_name in enumerate(["Month", "Year"]):
            # Pass the loop variable (not a hard-coded 0) so the lookup by name is exercised too.
            sheet = excel_reader_single_sheet.load_sheet(sheet_name_or_idx, use_columns=[idx])

            pd_df = sheet.to_pandas()
            pd_assert_frame_equal(pd_df, pd.DataFrame({col_name: expected[col_name]}))

            pl_df = sheet.to_polars()
            pl_assert_frame_equal(pl_df, pl.DataFrame({col_name: expected[col_name]}))
66+
67+
68+
@pytest.fixture
def excel_reader_single_sheet_with_unnamed_columns() -> fastexcel.ExcelReader:
    """Provide a reader opened on the ``fixture-multi-sheet.xlsx`` fixture file."""
    fixture_path = path_for_fixture("fixture-multi-sheet.xlsx")
    return fastexcel.read_excel(fixture_path)
71+
72+
73+
@pytest.fixture
def single_sheet_with_unnamed_columns_expected() -> dict[str, list[Any]]:
    """Return the expected column name -> values mapping used by the unnamed-columns tests."""
    # Key order matters: it becomes the column order of the frames built from this mapping.
    columns: dict[str, list[Any]] = {}
    columns["col1"] = [2.0, 3.0]
    columns["__UNNAMED__1"] = [1.5, 2.5]
    columns["col3"] = ["hello", "world"]
    columns["__UNNAMED__3"] = [-5.0, -6.0]
    columns["col5"] = ["a", "b"]
    return columns
82+
83+
84+
def test_single_sheet_with_unnamed_columns(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
    single_sheet_with_unnamed_columns_expected: dict[str, list[Any]],
) -> None:
    """Selecting columns by name or by index yields the same subset of the sheet."""
    reader = excel_reader_single_sheet_with_unnamed_columns
    selected_names = ["col1", "col3", "__UNNAMED__3"]
    selected_indices = [0, 2, 3]

    # Keep only the selected columns of the full expected mapping.
    expected = {
        name: values
        for name, values in single_sheet_with_unnamed_columns_expected.items()
        if name in selected_names
    }

    # The name-based selection is checked first, then the index-based one.
    for selection in (selected_names, selected_indices):
        sheet = reader.load_sheet("With unnamed columns", use_columns=selection)

        pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
        pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
107+
108+
109+
def test_single_sheet_with_unnamed_columns_and_pagination(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
    single_sheet_with_unnamed_columns_expected: dict[str, list[Any]],
) -> None:
    """Column selection combined with ``n_rows`` / ``skip_rows`` returns the matching rows.

    Each pagination case is checked with the columns selected by name and by index.
    """
    reader = excel_reader_single_sheet_with_unnamed_columns
    selected_names = ["col1", "col3", "__UNNAMED__3"]
    selected_indices = [0, 2, 3]

    pagination_cases: list[tuple[slice, dict[str, int]]] = [
        (slice(None, 1), {"n_rows": 1}),  # first row only
        (slice(1, None), {"skip_rows": 1}),  # second row
    ]

    for row_slice, paging in pagination_cases:
        # Slice every selected column of the full expected mapping to the rows being read.
        expected = {
            name: values[row_slice]
            for name, values in single_sheet_with_unnamed_columns_expected.items()
            if name in selected_names
        }

        for selection in (selected_names, selected_indices):
            sheet = reader.load_sheet("With unnamed columns", use_columns=selection, **paging)

            pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
            pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
157+
158+
159+
def test_single_sheet_with_unnamed_columns_and_pagination_and_column_names(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
) -> None:
    """Column selection works against user-provided ``column_names`` with ``skip_rows``.

    Columns are selected both by the custom names and by index, for two values
    of ``skip_rows``.
    """
    reader = excel_reader_single_sheet_with_unnamed_columns
    column_names = [f"col{i}" for i in range(5)]
    selections = (["col0", "col2", "col3"], [0, 2, 3])

    all_rows: dict[str, list[Any]] = {
        "col0": [2.0, 3.0],
        "col2": ["hello", "world"],
        "col3": [-5.0, -6.0],
    }
    first_row_dropped = {name: values[1:] for name, values in all_rows.items()}

    # skip_rows=1 skips the header row only; skip_rows=2 also skips the first data row.
    for skip_rows, expected in ((1, all_rows), (2, first_row_dropped)):
        for use_columns in selections:
            sheet = reader.load_sheet(
                "With unnamed columns",
                use_columns=use_columns,
                skip_rows=skip_rows,
                column_names=column_names,
            )

            pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
            pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
202+
203+
204+
# Checks that passing a negative index in `use_columns` is rejected: the call inside the
# `with` block must raise `fastexcel.InvalidParametersError` carrying the message below.
# NOTE(review): leading whitespace inside the expected message is not visible in this
# view — confirm the literal against the real error text before editing it.
def test_single_sheet_invalid_column_indices_negative_integer(
205+
excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
206+
) -> None:
207+
expected_message = """invalid parameters: expected list[int] | list[str], got [-2]
208+
Context:
209+
0: expected selected columns to be list[str] | list[int] | None, got Some([-2])
210+
"""
211+
# re.escape makes the message match literally rather than as a regular expression.
with pytest.raises(fastexcel.InvalidParametersError, match=re.escape(expected_message)):
212+
excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[-2])
213+
214+
215+
# Checks that passing an empty list as `use_columns` is rejected: the call inside the
# `with` block must raise `fastexcel.InvalidParametersError` carrying the message below.
# NOTE(review): leading whitespace inside the expected message is not visible in this
# view — confirm the literal against the real error text before editing it.
def test_single_sheet_invalid_column_indices_empty_list(
216+
excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
217+
) -> None:
218+
expected_message = """invalid parameters: list of select columns is empty
219+
Context:
220+
0: expected selected columns to be list[str] | list[int] | None, got Some([])
221+
"""
222+
# re.escape makes the message match literally rather than as a regular expression.
with pytest.raises(fastexcel.InvalidParametersError, match=re.escape(expected_message)):
223+
excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[])
224+
225+
226+
# Checks that selecting a column name that is not in the sheet fails: loading sheet 0 with
# `use_columns=["nope"]` and converting it with `to_arrow()` must raise
# `fastexcel.ColumnNotFoundError` carrying the message below.
# NOTE(review): leading whitespace inside the expected message is not visible in this
# view — confirm the literal against the real error text before editing it.
def test_single_sheet_invalid_column_indices_column_does_not_exist_str(
227+
excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
228+
) -> None:
229+
expected_message = """column with name "nope" not found
230+
Context:
231+
0: selected columns are invalid
232+
1: could not create RecordBatch from sheet "January"
233+
2: could not convert RecordBatch to pyarrow for sheet "January"
234+
"""
235+
# re.escape makes the message match literally rather than as a regular expression.
with pytest.raises(fastexcel.ColumnNotFoundError, match=re.escape(expected_message)):
236+
excel_reader_single_sheet_with_unnamed_columns.load_sheet(
237+
0, use_columns=["nope"]
238+
).to_arrow()
239+
240+
241+
# Checks that selecting a column index that is not in the sheet fails: loading sheet 0 with
# `use_columns=[42]` and converting it with `to_arrow()` must raise
# `fastexcel.ColumnNotFoundError` carrying the message below.
# NOTE(review): leading whitespace inside the expected message is not visible in this
# view — confirm the literal against the real error text before editing it.
def test_single_sheet_invalid_column_indices_column_does_not_exist_int(
242+
excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
243+
) -> None:
244+
expected_message = """column at index 42 not found
245+
Context:
246+
0: selected columns are invalid
247+
1: could not create RecordBatch from sheet "January"
248+
2: could not convert RecordBatch to pyarrow for sheet "January"
249+
"""
250+
# re.escape makes the message match literally rather than as a regular expression.
with pytest.raises(fastexcel.ColumnNotFoundError, match=re.escape(expected_message)):
251+
excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[42]).to_arrow()

0 commit comments

Comments
 (0)