Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: allow selecting a subset of columns #189

Merged
merged 14 commits into from
Feb 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 28 additions & 28 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,34 @@ jobs:
source .venv/bin/activate
make lint

check-docs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Set up rust toolchain
uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: stable
override: true
- run: |
git config user.name github-actions
git config user.email github-actions@github.com

# venv required by maturin
python3 -m venv .venv
source .venv/bin/activate

make install-test-requirements
make install-doc-requirements
# Required for pdoc to be able to import the sources
make dev-install
make doc

# GitHub provides only x86_64 runners, so we cannot test on arm architecture
test:
runs-on: ${{ matrix.os }}
Expand Down Expand Up @@ -110,31 +138,3 @@ jobs:
command: build
args: "-o dist --interpreter python${{ matrix.python-version }}"
target: ${{ steps.target.outputs.target }}

check-docs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Set up rust toolchain
uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: stable
override: true
- run: |
git config user.name github-actions
git config user.email github-actions@github.com

# venv required by maturin
python3 -m venv .venv
source .venv/bin/activate

make install-test-requirements
make install-doc-requirements
# Required for pdoc to be able to import the sources
make dev-install
make doc
23 changes: 23 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 12 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ crate-type = ["cdylib"]
[dependencies]
calamine = { version = "0.24.0", features = ["dates"] }
chrono = { version = "0.4.34", default-features = false }
pyo3 = { version = "0.20.3", features = ["extension-module", "abi3-py38"] }
# NOTE: "extension-module" is actually required, see comments on features below
pyo3 = { version = "0.20.3", features = ["abi3-py38"] }

[dependencies.arrow]
version = "50.0.0"
Expand All @@ -20,4 +21,14 @@ default-features = false
features = ["pyarrow"]

[dev-dependencies]
pretty_assertions = "1.4.0"
rstest = { version = "0.18.2", default-features = false }

# NOTE: This is a hack to bypass pyo3 limitations when testing:
# https://pyo3.rs/v0.20.3/faq.html#i-cant-run-cargo-test-or-i-cant-build-in-a-cargo-workspace-im-having-linker-issues-like-symbol-not-found-or-undefined-reference-to-_pyexc_systemerror
lukapeschke marked this conversation as resolved.
Show resolved Hide resolved
[features]
extension-module = ["pyo3/extension-module"]
default = ["extension-module"]
# feature for tests only. This makes Python::with_gil auto-initialize Python
# interpreters, which allows us to instantiate Python objects in tests
tests = ["pyo3/auto-initialize"]
21 changes: 17 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,30 @@ pytest = pytest -v
## Rust
clippy = cargo clippy
fmt = cargo fmt
cargo-test = cargo test
cargo-test = cargo test --no-default-features --features tests
## Docs
pdoc = pdoc -o docs python/fastexcel

lint:
lint-python:
$(ruff)
$(format) --check --diff
$(mypy)

lint-rust:
$(clippy)
format:

lint: lint-rust lint-python

format-python:
$(ruff) --fix
$(format)

format-rust:
$(fmt)
$(clippy) --fix --lib -p fastexcel --allow-dirty --allow-staged

format: format-rust format-python

install-test-requirements:
pip install -U -r test-requirements.txt -r build-requirements.txt

Expand All @@ -39,10 +48,14 @@ dev-install:
prod-install:
./prod_install.sh

test:
test-rust:
$(cargo-test)

test-python:
$(pytest)

test: test-rust test-python

doc:
$(pdoc)

Expand Down
25 changes: 25 additions & 0 deletions python/fastexcel/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
CalamineCellError,
CalamineError,
CannotRetrieveCellDataError,
ColumnNotFoundError,
FastExcelError,
InvalidParametersError,
SheetNotFoundError,
Expand Down Expand Up @@ -54,6 +55,16 @@ def total_height(self) -> int:
"""The sheet's total height"""
return self._sheet.total_height

@property
def selected_columns(self) -> list[str] | list[int] | None:
"""The sheet's selected columns"""
return self._sheet.selected_columns

@property
def available_columns(self) -> list[str]:
"""The columns available for the given sheet"""
return self._sheet.available_columns

def to_arrow(self) -> pa.RecordBatch:
"""Converts the sheet to a pyarrow `RecordBatch`"""
return self._sheet.to_arrow()
Expand Down Expand Up @@ -101,6 +112,7 @@ def load_sheet_by_name(
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | None = None,
) -> ExcelSheet:
"""Loads a sheet by name.

Expand All @@ -117,6 +129,9 @@ def load_sheet_by_name(
:param schema_sample_rows: Specifies how many rows should be used to determine
the dtype of a column.
If `None`, all rows will be used.
:param use_columns: Specifies the columns to use. Can either be a list of column names, or
a list of column indices (starting at 0).
If `None`, all columns will be used.
"""
return ExcelSheet(
self._reader.load_sheet_by_name(
Expand All @@ -126,6 +141,7 @@ def load_sheet_by_name(
skip_rows=skip_rows,
n_rows=n_rows,
schema_sample_rows=schema_sample_rows,
use_columns=use_columns,
)
)

Expand All @@ -138,6 +154,7 @@ def load_sheet_by_idx(
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | None = None,
) -> ExcelSheet:
"""Loads a sheet by index.

Expand All @@ -154,6 +171,9 @@ def load_sheet_by_idx(
:param schema_sample_rows: Specifies how many rows should be used to determine
the dtype of a column.
If `None`, all rows will be used.
:param use_columns: Specifies the columns to use. Can either be a list of column names, or
a list of column indices (starting at 0).
If `None`, all columns will be used.
"""
if idx < 0:
raise ValueError(f"Expected idx to be > 0, got {idx}")
Expand All @@ -165,6 +185,7 @@ def load_sheet_by_idx(
skip_rows=skip_rows,
n_rows=n_rows,
schema_sample_rows=schema_sample_rows,
use_columns=use_columns,
)
)

Expand All @@ -177,6 +198,7 @@ def load_sheet(
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | None = None,
PrettyWood marked this conversation as resolved.
Show resolved Hide resolved
) -> ExcelSheet:
"""Loads a sheet by name if a string is passed or by index if an integer is passed.

Expand All @@ -190,6 +212,7 @@ def load_sheet(
skip_rows=skip_rows,
n_rows=n_rows,
schema_sample_rows=schema_sample_rows,
use_columns=use_columns,
)
if isinstance(idx_or_name, int)
else self.load_sheet_by_name(
Expand All @@ -199,6 +222,7 @@ def load_sheet(
skip_rows=skip_rows,
n_rows=n_rows,
schema_sample_rows=schema_sample_rows,
use_columns=use_columns,
)
)

Expand All @@ -224,6 +248,7 @@ def read_excel(path: Path | str) -> ExcelReader:
"CalamineCellError",
"CalamineError",
"SheetNotFoundError",
"ColumnNotFoundError",
"ArrowError",
"InvalidParametersError",
"UnsupportedColumnTypeCombinationError",
Expand Down
19 changes: 9 additions & 10 deletions python/fastexcel/_fastexcel.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@ class _ExcelSheet:
@property
def offset(self) -> int:
"""The sheet's offset before data starts"""
@property
def selected_columns(self) -> list[str] | list[int] | None:
"""The sheet's selected columns"""
@property
def available_columns(self) -> list[str]:
"""The columns available for the given sheet"""
def to_arrow(self) -> pa.RecordBatch:
"""Converts the sheet to a pyarrow `RecordBatch`"""

Expand All @@ -33,6 +39,7 @@ class _ExcelReader:
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | None = None,
) -> _ExcelSheet: ...
def load_sheet_by_idx(
self,
Expand All @@ -43,16 +50,7 @@ class _ExcelReader:
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
) -> _ExcelSheet: ...
def load_sheet(
self,
idx_or_name: int | str,
*,
header_row: int | None = 0,
column_names: list[str] | None = None,
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | None = None,
) -> _ExcelSheet: ...
@property
def sheet_names(self) -> list[str]: ...
Expand All @@ -69,5 +67,6 @@ class CannotRetrieveCellDataError(FastExcelError): ...
class CalamineCellError(FastExcelError): ...
class CalamineError(FastExcelError): ...
class SheetNotFoundError(FastExcelError): ...
class ColumnNotFoundError(FastExcelError): ...
class ArrowError(FastExcelError): ...
class InvalidParametersError(FastExcelError): ...
Loading
Loading