Skip to content

Commit 2147bb5

Browse files
authored
feat: introduce eager loading functions (#147)
* chore(deps): Upgrade calamine 0.22.1 -> 0.23.0
  Signed-off-by: Luka Peschke <[email protected]>
* feat: introduce eager loading functions
  Signed-off-by: Luka Peschke <[email protected]>
* adapt to recent changes
  Signed-off-by: Luka Peschke <[email protected]>
* feat: added support for schema_sample_rows
  Signed-off-by: Luka Peschke <[email protected]>
* solve merge conflicts
  Signed-off-by: Luka Peschke <[email protected]>
* adapt to recent changes on main
  Signed-off-by: Luka Peschke <[email protected]>
* adapt to recent changes on main
  Signed-off-by: Luka Peschke <[email protected]>
* adapt error message
  Signed-off-by: Luka Peschke <[email protected]>
* adapt to recent changes on main
  Signed-off-by: Luka Peschke <[email protected]>
* fat refactor, might support non-eager by-ref
  Signed-off-by: Luka Peschke <[email protected]>
* add iterations to test.py
  Signed-off-by: Luka Peschke <[email protected]>
* remove unused file
  Signed-off-by: Luka Peschke <[email protected]>
* adapt to recent changes on main
  Signed-off-by: Luka Peschke <[email protected]>
* fix: ensure eager=True always returns a RecordBatch
  Signed-off-by: Luka Peschke <[email protected]>
* remove commented out code
  Signed-off-by: Luka Peschke <[email protected]>
* simplify lifetime annotations
  Signed-off-by: Luka Peschke <[email protected]>
* adapt code to recent changes
  Signed-off-by: Luka Peschke <[email protected]>
* remove dbg!
  Signed-off-by: Luka Peschke <[email protected]>
* fix typing
  Signed-off-by: Luka Peschke <[email protected]>
* chore: clippy rust 1.79
  Signed-off-by: Luka Peschke <[email protected]>
* docs: improve docstrings
  Signed-off-by: Luka Peschke <[email protected]>
---------
Signed-off-by: Luka Peschke <[email protected]>
1 parent 4332278 commit 2147bb5

File tree

13 files changed

+831
-361
lines changed

13 files changed

+831
-361
lines changed

python/fastexcel/__init__.py

Lines changed: 51 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ def load_sheet(
131131
use_columns: list[str] | list[int] | str | None = None,
132132
dtypes: DTypeMap | None = None,
133133
) -> ExcelSheet:
134-
"""Loads a sheet by index or name.
134+
"""Loads a sheet lazily by index or name.
135135
136136
:param idx_or_name: The index (starting at 0) or the name of the sheet to load.
137137
:param header_row: The index of the row containing the column labels, default index is 0.
@@ -165,9 +165,41 @@ def load_sheet(
165165
schema_sample_rows=schema_sample_rows,
166166
use_columns=use_columns,
167167
dtypes=dtypes,
168+
eager=False,
168169
)
169170
)
170171

172+
def load_sheet_eager(
    self,
    idx_or_name: int | str,
    *,
    header_row: int | None = 0,
    column_names: list[str] | None = None,
    skip_rows: int = 0,
    n_rows: int | None = None,
    schema_sample_rows: int | None = 1_000,
    use_columns: list[str] | list[int] | str | None = None,
    dtypes: DTypeMap | None = None,
) -> pa.RecordBatch:
    """Loads a sheet eagerly by index or name.

    For xlsx files, this will be faster and more memory-efficient, as it will use
    `worksheet_range_ref` under the hood, which returns borrowed types.

    The call is forwarded to the underlying reader with `eager=True`, and the
    reader's result is returned as-is (it is not wrapped in an `ExcelSheet`).

    Refer to `load_sheet` for parameter documentation

    :return: The content of the sheet as a `pyarrow.RecordBatch`.
    """
    return self._reader.load_sheet(
        idx_or_name=idx_or_name,
        header_row=header_row,
        column_names=column_names,
        skip_rows=skip_rows,
        n_rows=n_rows,
        schema_sample_rows=schema_sample_rows,
        use_columns=use_columns,
        dtypes=dtypes,
        # The only difference with the lazy `load_sheet` call path
        eager=True,
    )
202+
171203
def load_sheet_by_name(
172204
self,
173205
name: str,
@@ -184,17 +216,15 @@ def load_sheet_by_name(
184216
185217
Refer to `load_sheet` for parameter documentation
186218
"""
187-
return ExcelSheet(
188-
self._reader.load_sheet(
189-
name,
190-
header_row=header_row,
191-
column_names=column_names,
192-
skip_rows=skip_rows,
193-
n_rows=n_rows,
194-
schema_sample_rows=schema_sample_rows,
195-
use_columns=use_columns,
196-
dtypes=dtypes,
197-
)
219+
return self.load_sheet(
220+
name,
221+
header_row=header_row,
222+
column_names=column_names,
223+
skip_rows=skip_rows,
224+
n_rows=n_rows,
225+
schema_sample_rows=schema_sample_rows,
226+
use_columns=use_columns,
227+
dtypes=dtypes,
198228
)
199229

200230
def load_sheet_by_idx(
@@ -213,17 +243,15 @@ def load_sheet_by_idx(
213243
214244
Refer to `load_sheet` for parameter documentation
215245
"""
216-
return ExcelSheet(
217-
self._reader.load_sheet(
218-
idx,
219-
header_row=header_row,
220-
column_names=column_names,
221-
skip_rows=skip_rows,
222-
n_rows=n_rows,
223-
schema_sample_rows=schema_sample_rows,
224-
use_columns=use_columns,
225-
dtypes=dtypes,
226-
)
246+
return self.load_sheet(
247+
idx,
248+
header_row=header_row,
249+
column_names=column_names,
250+
skip_rows=skip_rows,
251+
n_rows=n_rows,
252+
schema_sample_rows=schema_sample_rows,
253+
use_columns=use_columns,
254+
dtypes=dtypes,
227255
)
228256

229257
def __repr__(self) -> str:

python/fastexcel/_fastexcel.pyi

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22

3+
import typing
34
from typing import Literal
45

56
import pyarrow as pa
@@ -61,6 +62,7 @@ class _ExcelSheet:
6162
class _ExcelReader:
6263
"""A class representing an open Excel file and allowing to read its sheets"""
6364

65+
@typing.overload
6466
def load_sheet(
6567
self,
6668
idx_or_name: str | int,
@@ -72,7 +74,22 @@ class _ExcelReader:
7274
schema_sample_rows: int | None = 1_000,
7375
use_columns: list[str] | list[int] | str | None = None,
7476
dtypes: DTypeMap | None = None,
77+
eager: Literal[False] = ...,
7578
) -> _ExcelSheet: ...
79+
# Eager overload: with `eager=True`, `load_sheet` is typed as returning a
# `pa.RecordBatch` rather than an `_ExcelSheet`.
@typing.overload
def load_sheet(
    self,
    idx_or_name: str | int,
    *,
    header_row: int | None = 0,
    column_names: list[str] | None = None,
    skip_rows: int = 0,
    n_rows: int | None = None,
    schema_sample_rows: int | None = 1_000,
    use_columns: list[str] | list[int] | str | None = None,
    dtypes: DTypeMap | None = None,
    eager: Literal[True] = ...,
) -> pa.RecordBatch: ...
7693
@property
7794
def sheet_names(self) -> list[str]: ...
7895

python/tests/test_eagerness.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
from datetime import date, datetime, timedelta
2+
3+
import fastexcel
4+
import polars as pl
5+
from pandas.testing import assert_frame_equal as pd_assert_frame_equal
6+
from polars.testing import assert_frame_equal as pl_assert_frame_equal
7+
from pyarrow import RecordBatch
8+
from utils import path_for_fixture
9+
10+
11+
def test_load_sheet_eager_single_sheet() -> None:
    """Eager and lazy loading of the first sheet must yield equal pandas and polars frames."""
    excel_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx"))

    eager_pandas = excel_reader.load_sheet_eager(0).to_pandas()
    lazy_pandas = excel_reader.load_sheet(0).to_pandas()
    pd_assert_frame_equal(eager_pandas, lazy_pandas)

    eager_polars = pl.from_arrow(data=excel_reader.load_sheet_eager(0))
    # `pl.from_arrow` may also return a Series: narrow the type before comparing
    assert isinstance(eager_polars, pl.DataFrame)
    lazy_polars = excel_reader.load_sheet(0).to_polars()
    pl_assert_frame_equal(eager_polars, lazy_polars)
22+
23+
24+
def test_multiple_sheets_with_unnamed_columns() -> None:
    """Eager and lazy loading of the "With unnamed columns" sheet must yield equal frames."""
    excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-sheet.xlsx"))

    eager_pandas = excel_reader.load_sheet_eager("With unnamed columns").to_pandas()
    lazy_pandas = excel_reader.load_sheet("With unnamed columns").to_pandas()
    pd_assert_frame_equal(eager_pandas, lazy_pandas)

    eager_polars = pl.from_arrow(data=excel_reader.load_sheet_eager("With unnamed columns"))
    # `pl.from_arrow` may also return a Series: narrow the type before comparing
    assert isinstance(eager_polars, pl.DataFrame)
    lazy_polars = excel_reader.load_sheet("With unnamed columns").to_polars()
    pl_assert_frame_equal(eager_polars, lazy_polars)
35+
36+
37+
def test_eager_with_an_ods_file_should_return_a_recordbatch() -> None:
    """Eager loading of an .ods file must return a `RecordBatch` with the expected content."""
    ods_reader = fastexcel.read_excel(path_for_fixture("dates.ods"))

    record_batch = ods_reader.load_sheet_eager(0)
    assert isinstance(record_batch, RecordBatch)
    pl_df = pl.from_arrow(record_batch)
    assert isinstance(pl_df, pl.DataFrame)
    pl_assert_frame_equal(
        pl_df,
        pl.DataFrame(
            {
                "date": [date(2023, 6, 1)],
                "datestr": ["2023-06-01T02:03:04+02:00"],
                "time": [timedelta(hours=1, minutes=2, seconds=3)],
                "datetime": [datetime(2023, 6, 1, 2, 3, 4)],
            }
            # Align the expected temporal columns on millisecond precision before comparing
        ).with_columns(*(pl.col(col).dt.cast_time_unit("ms") for col in ("datetime", "time"))),
    )

src/error.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
use std::{error::Error, fmt::Display};
22

3+
use calamine::XlsxError;
4+
35
use crate::types::idx_or_name::IdxOrName;
46

57
#[derive(Debug)]
@@ -14,6 +16,7 @@ pub(crate) enum FastExcelErrorKind {
1416
// the actual type has not much value for us, so we just store a string context
1517
ArrowError(String),
1618
InvalidParameters(String),
19+
Internal(String),
1720
}
1821

1922
impl Display for FastExcelErrorKind {
@@ -41,6 +44,7 @@ impl Display for FastExcelErrorKind {
4144
}
4245
FastExcelErrorKind::ArrowError(err) => write!(f, "arrow error: {err}"),
4346
FastExcelErrorKind::InvalidParameters(err) => write!(f, "invalid parameters: {err}"),
47+
FastExcelErrorKind::Internal(err) => write!(f, "fastexcel error: {err}"),
4448
}
4549
}
4650
}
@@ -99,6 +103,12 @@ impl From<FastExcelErrorKind> for FastExcelError {
99103
}
100104
}
101105

106+
impl From<XlsxError> for FastExcelError {
107+
fn from(err: XlsxError) -> Self {
108+
FastExcelErrorKind::CalamineError(calamine::Error::Xlsx(err)).into()
109+
}
110+
}
111+
102112
pub(crate) type FastExcelResult<T> = Result<T, FastExcelError>;
103113

104114
impl<T> ErrorContext for FastExcelResult<T> {
@@ -181,6 +191,13 @@ pub(crate) mod py_errors {
181191
FastExcelError,
182192
"Provided parameters are invalid"
183193
);
194+
// Internal error
195+
create_exception!(
196+
_fastexcel,
197+
InternalError,
198+
FastExcelError,
199+
"Internal fastexcel error"
200+
);
184201

185202
pub(crate) trait IntoPyResult {
186203
type Inner;
@@ -217,6 +234,7 @@ pub(crate) mod py_errors {
217234
FastExcelErrorKind::InvalidParameters(_) => {
218235
InvalidParametersError::new_err(message)
219236
}
237+
// Internal failures are surfaced as the dedicated `InternalError` Python exception
// rather than being reported as arrow errors
FastExcelErrorKind::Internal(_) => InternalError::new_err(message),
220238
})
221239
}
222240
}

src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
mod error;
22
mod types;
3+
mod utils;
34

45
use error::{py_errors, ErrorContext};
56
use pyo3::prelude::*;

0 commit comments

Comments
 (0)