Skip to content

Commit

Permalink
feat(excelsheet): add support for multi-dtype columns (#164)
Browse files Browse the repository at this point in the history
* feat(deps-dev): add rstest as a dev dependency

Signed-off-by: Luka Peschke <[email protected]>

* feat(excelsheet): add support for multi-dtype columns

closes #160

Signed-off-by: Luka Peschke <[email protected]>

* fix: use as_f64 rather than get_float

Signed-off-by: Luka Peschke <[email protected]>

* test: add null + int and null + int + float test case

Signed-off-by: Luka Peschke <[email protected]>

* feat: add support for bools when determining the dtype of a column

Signed-off-by: Luka Peschke <[email protected]>

* feat: add support for int columns

Signed-off-by: Luka Peschke <[email protected]>

* feat: added a schema_sample_rows param

Signed-off-by: Luka Peschke <[email protected]>

* chore: doc

---------

Signed-off-by: Luka Peschke <[email protected]>
Co-authored-by: Eric Jolibois <[email protected]>
  • Loading branch information
lukapeschke and PrettyWood authored Feb 13, 2024
1 parent cc56cef commit e243719
Show file tree
Hide file tree
Showing 10 changed files with 412 additions and 45 deletions.
118 changes: 98 additions & 20 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,6 @@ version = "40.0.0"
# There's a lot of stuff we don't want here, such as serde support
default-features = false
features = ["pyarrow"]

[dev-dependencies]
rstest = { version = "0.18.2", default-features = false }
6 changes: 4 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@ format = ruff format python/ *.py
mypy = mypy python/ *.py
pytest = pytest -v
## Rust
clippy = cargo clippy
fmt = cargo fmt
clippy = cargo clippy
fmt = cargo fmt
cargo-test = cargo test
## Docs
pdoc = pdoc -o docs python/fastexcel

Expand Down Expand Up @@ -38,6 +39,7 @@ prod-install:
./prod_install.sh

test:
$(cargo-test)
$(pytest)

doc:
Expand Down
37 changes: 27 additions & 10 deletions python/fastexcel/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,17 +88,23 @@ def load_sheet_by_name(
column_names: list[str] | None = None,
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
) -> ExcelSheet:
"""Loads a sheet by name.
:param name: The name of the sheet to load.
:param header_row: The index of the row containing the column labels, default index is 0.
If `None`, the sheet does not have any column labels.
:param column_names: Overrides headers found in the document. If `column_names` is used,
`header_row` will be ignored.
:param n_rows: Specifies how many rows should be loaded. If `None`, all rows are loaded
:param skip_rows: Specifies how many should be skipped after the header. If `header_row` is
`None`, it skips the number of rows from the sheet's start.
:param column_names: Overrides headers found in the document.
If `column_names` is used, `header_row` will be ignored.
:param n_rows: Specifies how many rows should be loaded.
If `None`, all rows are loaded
:param skip_rows: Specifies how many rows should be skipped after the header.
If `header_row` is `None`, it skips the number of rows from the
start of the sheet.
:param schema_sample_rows: Specifies how many rows should be used to determine
the dtype of a column.
If `None`, all rows will be used.
"""
return ExcelSheet(
self._reader.load_sheet_by_name(
Expand All @@ -107,6 +113,7 @@ def load_sheet_by_name(
column_names=column_names,
skip_rows=skip_rows,
n_rows=n_rows,
schema_sample_rows=schema_sample_rows,
)
)

Expand All @@ -118,17 +125,23 @@ def load_sheet_by_idx(
column_names: list[str] | None = None,
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
) -> ExcelSheet:
"""Loads a sheet by index.
:param idx: The index (starting at 0) of the sheet to load.
:param header_row: The index of the row containing the column labels, default index is 0.
If `None`, the sheet does not have any column labels.
:param column_names: Overrides headers found in the document. If `column_names` is used,
`header_row` will be ignored.
:param n_rows: Specifies how many rows should be loaded. If `None`, all rows are loaded
:param skip_rows: Specifies how many should be skipped after the header. If `header_row` is
`None`, it skips the number of rows from the sheet's start.
:param column_names: Overrides headers found in the document.
If `column_names` is used, `header_row` will be ignored.
:param n_rows: Specifies how many rows should be loaded.
If `None`, all rows are loaded
:param skip_rows: Specifies how many rows should be skipped after the header.
If `header_row` is `None`, it skips the number of rows from the
start of the sheet.
:param schema_sample_rows: Specifies how many rows should be used to determine
the dtype of a column.
If `None`, all rows will be used.
"""
if idx < 0:
raise ValueError(f"Expected idx to be > 0, got {idx}")
Expand All @@ -139,6 +152,7 @@ def load_sheet_by_idx(
column_names=column_names,
skip_rows=skip_rows,
n_rows=n_rows,
schema_sample_rows=schema_sample_rows,
)
)

Expand All @@ -150,6 +164,7 @@ def load_sheet(
column_names: list[str] | None = None,
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
) -> ExcelSheet:
"""Loads a sheet by name if a string is passed or by index if an integer is passed.
Expand All @@ -162,6 +177,7 @@ def load_sheet(
column_names=column_names,
skip_rows=skip_rows,
n_rows=n_rows,
schema_sample_rows=schema_sample_rows,
)
if isinstance(idx_or_name, int)
else self.load_sheet_by_name(
Expand All @@ -170,6 +186,7 @@ def load_sheet(
column_names=column_names,
skip_rows=skip_rows,
n_rows=n_rows,
schema_sample_rows=schema_sample_rows,
)
)

Expand Down
3 changes: 3 additions & 0 deletions python/fastexcel/_fastexcel.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ class _ExcelReader:
column_names: list[str] | None = None,
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
) -> _ExcelSheet: ...
def load_sheet_by_idx(
self,
Expand All @@ -41,6 +42,7 @@ class _ExcelReader:
column_names: list[str] | None = None,
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
) -> _ExcelSheet: ...
def load_sheet(
self,
Expand All @@ -50,6 +52,7 @@ class _ExcelReader:
column_names: list[str] | None = None,
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
) -> _ExcelSheet: ...
@property
def sheet_names(self) -> list[str]: ...
Expand Down
Binary file not shown.
Loading

0 comments on commit e243719

Please sign in to comment.