diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 732fdc2..f937e1f 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -46,6 +46,34 @@ jobs: source .venv/bin/activate make lint + check-docs: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + - name: Set up rust toolchain + uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + - run: | + git config user.name github-actions + git config user.email github-actions@github.com + + # venv required by maturin + python3 -m venv .venv + source .venv/bin/activate + + make install-test-requirements + make install-doc-requirements + # Required for pdoc to be able to import the sources + make dev-install + make doc + # GitHub provides only x86_64 runners, so we cannot test on arm architecture test: runs-on: ${{ matrix.os }} @@ -110,31 +138,3 @@ jobs: command: build args: "-o dist --interpreter python${{ matrix.python-version }}" target: ${{ steps.target.outputs.target }} - - check-docs: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.11" - - name: Set up rust toolchain - uses: actions-rs/toolchain@v1 - with: - profile: minimal - toolchain: stable - override: true - - run: | - git config user.name github-actions - git config user.email github-actions@github.com - - # venv required by maturin - python3 -m venv .venv - source .venv/bin/activate - - make install-test-requirements - make install-doc-requirements - # Required for pdoc to be able to import the sources - make dev-install - make doc diff --git a/Cargo.lock b/Cargo.lock index 8c44037..e7c0053 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -344,6 +344,12 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" +[[package]] +name = "diff" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8" + [[package]] name = "encoding_rs" version = "0.8.31" @@ -360,6 +366,7 @@ dependencies = [ "arrow", "calamine", "chrono", + "pretty_assertions", "pyo3", "rstest", ] @@ -678,6 +685,16 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0" +[[package]] +name = "pretty_assertions" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af7cee1a6c8a5b9208b3cb1061f10c0cb689087b3d8ce85fb9d2dd7a29b6ba66" +dependencies = [ + "diff", + "yansi", +] + [[package]] name = "proc-macro-hack" version = "0.5.19" @@ -1122,6 +1139,12 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" +[[package]] +name = "yansi" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09041cd90cf85f7f8b2df60c646f853b7f535ce68f85244eb6731cf89fa498ec" + [[package]] name = "zip" version = "0.6.3" diff --git a/Cargo.toml b/Cargo.toml index c62376d..1210ccf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,7 +11,8 @@ crate-type = ["cdylib"] [dependencies] calamine = { version = "0.24.0", features = ["dates"] } chrono = { version = "0.4.34", default-features = false } -pyo3 = { version = "0.20.3", features = ["extension-module", "abi3-py38"] } +# NOTE: "extension-module" is actually required, see comments on features below +pyo3 = { version = "0.20.3", features = ["abi3-py38"] } [dependencies.arrow] version = "50.0.0" @@ -20,4 +21,14 @@ default-features = false features = ["pyarrow"] [dev-dependencies] +pretty_assertions = "1.4.0" rstest = { version = 
"0.18.2", default-features = false } + +# NOTE: This is a hack to bypass pyo3 limitations when testing: +# https://pyo3.rs/v0.20.3/faq.html#i-cant-run-cargo-test-or-i-cant-build-in-a-cargo-workspace-im-having-linker-issues-like-symbol-not-found-or-undefined-reference-to-_pyexc_systemerror +[features] +extension-module = ["pyo3/extension-module"] +default = ["extension-module"] +# feature for tests only. This makes Python::with_gil auto-initialize Python +# interpreters, which allows us to instantiate Python objects in tests +tests = ["pyo3/auto-initialize"] diff --git a/Makefile b/Makefile index d4e2ee7..f7c29fe 100644 --- a/Makefile +++ b/Makefile @@ -9,21 +9,30 @@ pytest = pytest -v ## Rust clippy = cargo clippy fmt = cargo fmt -cargo-test = cargo test +cargo-test = cargo test --no-default-features --features tests ## Docs pdoc = pdoc -o docs python/fastexcel -lint: +lint-python: $(ruff) $(format) --check --diff $(mypy) + +lint-rust: $(clippy) -format: + +lint: lint-rust lint-python + +format-python: $(ruff) --fix $(format) + +format-rust: $(fmt) $(clippy) --fix --lib -p fastexcel --allow-dirty --allow-staged +format: format-rust format-python + install-test-requirements: pip install -U -r test-requirements.txt -r build-requirements.txt @@ -39,10 +48,14 @@ dev-install: prod-install: ./prod_install.sh -test: +test-rust: $(cargo-test) + +test-python: $(pytest) +test: test-rust test-python + doc: $(pdoc) diff --git a/python/fastexcel/__init__.py b/python/fastexcel/__init__.py index fb00d9b..b57f8f4 100644 --- a/python/fastexcel/__init__.py +++ b/python/fastexcel/__init__.py @@ -17,6 +17,7 @@ CalamineCellError, CalamineError, CannotRetrieveCellDataError, + ColumnNotFoundError, FastExcelError, InvalidParametersError, SheetNotFoundError, @@ -54,6 +55,16 @@ def total_height(self) -> int: """The sheet's total height""" return self._sheet.total_height + @property + def selected_columns(self) -> list[str] | list[int] | None: + """The sheet's selected columns""" + return 
self._sheet.selected_columns + + @property + def available_columns(self) -> list[str]: + """The columns available for the given sheet""" + return self._sheet.available_columns + def to_arrow(self) -> pa.RecordBatch: """Converts the sheet to a pyarrow `RecordBatch`""" return self._sheet.to_arrow() @@ -101,6 +112,7 @@ def load_sheet_by_name( skip_rows: int = 0, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, + use_columns: list[str] | list[int] | None = None, ) -> ExcelSheet: """Loads a sheet by name. @@ -117,6 +129,9 @@ def load_sheet_by_name( :param schema_sample_rows: Specifies how many rows should be used to determine the dtype of a column. If `None`, all rows will be used. + :param use_columns: Specifies the columns to use. Can either be a list of column names, or + a list of column indices (starting at 0). + If `None`, all columns will be used. """ return ExcelSheet( self._reader.load_sheet_by_name( @@ -126,6 +141,7 @@ def load_sheet_by_name( skip_rows=skip_rows, n_rows=n_rows, schema_sample_rows=schema_sample_rows, + use_columns=use_columns, ) ) @@ -138,6 +154,7 @@ def load_sheet_by_idx( skip_rows: int = 0, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, + use_columns: list[str] | list[int] | None = None, ) -> ExcelSheet: """Loads a sheet by index. @@ -154,6 +171,9 @@ def load_sheet_by_idx( :param schema_sample_rows: Specifies how many rows should be used to determine the dtype of a column. If `None`, all rows will be used. + :param use_columns: Specifies the columns to use. Can either be a list of column names, or + a list of column indices (starting at 0). + If `None`, all columns will be used. 
""" if idx < 0: raise ValueError(f"Expected idx to be > 0, got {idx}") @@ -165,6 +185,7 @@ def load_sheet_by_idx( skip_rows=skip_rows, n_rows=n_rows, schema_sample_rows=schema_sample_rows, + use_columns=use_columns, ) ) @@ -177,6 +198,7 @@ def load_sheet( skip_rows: int = 0, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, + use_columns: list[str] | list[int] | None = None, ) -> ExcelSheet: """Loads a sheet by name if a string is passed or by index if an integer is passed. @@ -190,6 +212,7 @@ def load_sheet( skip_rows=skip_rows, n_rows=n_rows, schema_sample_rows=schema_sample_rows, + use_columns=use_columns, ) if isinstance(idx_or_name, int) else self.load_sheet_by_name( @@ -199,6 +222,7 @@ def load_sheet( skip_rows=skip_rows, n_rows=n_rows, schema_sample_rows=schema_sample_rows, + use_columns=use_columns, ) ) @@ -224,6 +248,7 @@ def read_excel(path: Path | str) -> ExcelReader: "CalamineCellError", "CalamineError", "SheetNotFoundError", + "ColumnNotFoundError", "ArrowError", "InvalidParametersError", "UnsupportedColumnTypeCombinationError", diff --git a/python/fastexcel/_fastexcel.pyi b/python/fastexcel/_fastexcel.pyi index 865f4e7..b4e2c36 100644 --- a/python/fastexcel/_fastexcel.pyi +++ b/python/fastexcel/_fastexcel.pyi @@ -18,6 +18,12 @@ class _ExcelSheet: @property def offset(self) -> int: """The sheet's offset before data starts""" + @property + def selected_columns(self) -> list[str] | list[int] | None: + """The sheet's selected columns""" + @property + def available_columns(self) -> list[str]: + """The columns available for the given sheet""" def to_arrow(self) -> pa.RecordBatch: """Converts the sheet to a pyarrow `RecordBatch`""" @@ -33,6 +39,7 @@ class _ExcelReader: skip_rows: int = 0, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, + use_columns: list[str] | list[int] | None = None, ) -> _ExcelSheet: ... 
def load_sheet_by_idx( self, @@ -43,16 +50,7 @@ class _ExcelReader: skip_rows: int = 0, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, - ) -> _ExcelSheet: ... - def load_sheet( - self, - idx_or_name: int | str, - *, - header_row: int | None = 0, - column_names: list[str] | None = None, - skip_rows: int = 0, - n_rows: int | None = None, - schema_sample_rows: int | None = 1_000, + use_columns: list[str] | list[int] | None = None, ) -> _ExcelSheet: ... @property def sheet_names(self) -> list[str]: ... @@ -69,5 +67,6 @@ class CannotRetrieveCellDataError(FastExcelError): ... class CalamineCellError(FastExcelError): ... class CalamineError(FastExcelError): ... class SheetNotFoundError(FastExcelError): ... +class ColumnNotFoundError(FastExcelError): ... class ArrowError(FastExcelError): ... class InvalidParametersError(FastExcelError): ... diff --git a/python/tests/test_column_selection.py b/python/tests/test_column_selection.py new file mode 100644 index 0000000..8fa363c --- /dev/null +++ b/python/tests/test_column_selection.py @@ -0,0 +1,266 @@ +from __future__ import annotations + +import re +from typing import Any + +import fastexcel +import pandas as pd +import polars as pl +import pytest +from pandas.testing import assert_frame_equal as pd_assert_frame_equal +from polars.testing import assert_frame_equal as pl_assert_frame_equal +from utils import path_for_fixture + + +@pytest.fixture +def excel_reader_single_sheet() -> fastexcel.ExcelReader: + return fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx")) + + +def test_single_sheet_all_columns(excel_reader_single_sheet: fastexcel.ExcelReader) -> None: + sheet = excel_reader_single_sheet.load_sheet(0) + + sheet_explicit_arg = excel_reader_single_sheet.load_sheet(0, use_columns=None) + assert sheet.selected_columns is None + assert sheet.available_columns == ["Month", "Year"] + + expected = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]} + expected_pd_df = pd.DataFrame(expected) + 
expected_pl_df = pl.DataFrame(expected) + + pd_df = sheet.to_pandas() + pd_assert_frame_equal(pd_df, expected_pd_df) + pd_df_explicit_arg = sheet_explicit_arg.to_pandas() + pd_assert_frame_equal(pd_df_explicit_arg, expected_pd_df) + + pl_df = sheet.to_polars() + pl_assert_frame_equal(pl_df, expected_pl_df) + pl_df_explicit_arg = sheet_explicit_arg.to_polars() + pl_assert_frame_equal(pl_df_explicit_arg, expected_pl_df) + + +def test_single_sheet_subset_by_str(excel_reader_single_sheet: fastexcel.ExcelReader) -> None: + expected = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]} + + # looks like mypy 1.8 became more stupid + sheets: list[str | int] = [0, "January"] + for sheet_name_or_idx in sheets: + for col in ["Month", "Year"]: + sheet = excel_reader_single_sheet.load_sheet(sheet_name_or_idx, use_columns=[col]) + assert sheet.selected_columns == [col] + assert sheet.available_columns == ["Month", "Year"] + + pd_df = sheet.to_pandas() + pd_assert_frame_equal(pd_df, pd.DataFrame({col: expected[col]})) + + pl_df = sheet.to_polars() + pl_assert_frame_equal(pl_df, pl.DataFrame({col: expected[col]})) + + +def test_single_sheet_subset_by_index(excel_reader_single_sheet: fastexcel.ExcelReader) -> None: + expected = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]} + + sheets: list[str | int] = [0, "January"] + for sheet_name_or_idx in sheets: + for idx, col_name in enumerate(["Month", "Year"]): + sheet = excel_reader_single_sheet.load_sheet(sheet_name_or_idx, use_columns=[idx]) + assert sheet.selected_columns == [idx] + assert sheet.available_columns == ["Month", "Year"] + + pd_df = sheet.to_pandas() + pd_assert_frame_equal(pd_df, pd.DataFrame({col_name: expected[col_name]})) + + pl_df = sheet.to_polars() + pl_assert_frame_equal(pl_df, pl.DataFrame({col_name: expected[col_name]})) + + +@pytest.fixture +def excel_reader_single_sheet_with_unnamed_columns() -> fastexcel.ExcelReader: + return fastexcel.read_excel(path_for_fixture("fixture-multi-sheet.xlsx")) + + +@pytest.fixture 
+def single_sheet_with_unnamed_columns_expected() -> dict[str, list[Any]]: + return { + "col1": [2.0, 3.0], + "__UNNAMED__1": [1.5, 2.5], + "col3": ["hello", "world"], + "__UNNAMED__3": [-5.0, -6.0], + "col5": ["a", "b"], + } + + +def test_single_sheet_with_unnamed_columns( + excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader, + single_sheet_with_unnamed_columns_expected: dict[str, list[Any]], +) -> None: + use_columns_str = ["col1", "col3", "__UNNAMED__3"] + use_columns_idx = [0, 2, 3] + expected = { + k: v for k, v in single_sheet_with_unnamed_columns_expected.items() if k in use_columns_str + } + + sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( + "With unnamed columns", use_columns=use_columns_str + ) + assert sheet.selected_columns == use_columns_str + assert sheet.available_columns == ["col1", "__UNNAMED__1", "col3", "__UNNAMED__3", "col5"] + + pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) + pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) + + sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( + "With unnamed columns", use_columns=use_columns_idx + ) + assert sheet.selected_columns == use_columns_idx + assert sheet.available_columns == ["col1", "__UNNAMED__1", "col3", "__UNNAMED__3", "col5"] + + pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) + pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) + + +def test_single_sheet_with_unnamed_columns_and_pagination( + excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader, + single_sheet_with_unnamed_columns_expected: dict[str, list[Any]], +) -> None: + use_columns_str = ["col1", "col3", "__UNNAMED__3"] + use_columns_idx = [0, 2, 3] + + # first row only + expected = { + k: v[:1] + for k, v in single_sheet_with_unnamed_columns_expected.items() + if k in use_columns_str + } + + sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( + "With unnamed columns", 
use_columns=use_columns_str, n_rows=1 + ) + assert sheet.available_columns == ["col1", "__UNNAMED__1", "col3", "__UNNAMED__3", "col5"] + + pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) + pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) + + sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( + "With unnamed columns", use_columns=use_columns_idx, n_rows=1 + ) + assert sheet.available_columns == ["col1", "__UNNAMED__1", "col3", "__UNNAMED__3", "col5"] + + pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) + pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) + + # second row + expected = { + k: v[1:] + for k, v in single_sheet_with_unnamed_columns_expected.items() + if k in use_columns_str + } + + sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( + "With unnamed columns", use_columns=use_columns_str, skip_rows=1 + ) + assert sheet.available_columns == ["col1", "__UNNAMED__1", "col3", "__UNNAMED__3", "col5"] + + pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) + pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) + + sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( + "With unnamed columns", use_columns=use_columns_idx, skip_rows=1 + ) + assert sheet.available_columns == ["col1", "__UNNAMED__1", "col3", "__UNNAMED__3", "col5"] + + pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) + pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) + + +def test_single_sheet_with_unnamed_columns_and_pagination_and_column_names( + excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader, +) -> None: + use_columns_str = ["col0", "col2", "col3"] + use_columns_idx = [0, 2, 3] + expected: dict[str, list[Any]] = { + "col0": [2.0, 3.0], + "col2": ["hello", "world"], + "col3": [-5.0, -6.0], + } + column_names = [f"col{i}" for i in range(5)] + + # skipping the header row only + sheet = 
excel_reader_single_sheet_with_unnamed_columns.load_sheet( + "With unnamed columns", use_columns=use_columns_str, skip_rows=1, column_names=column_names + ) + assert sheet.available_columns == column_names + + pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) + pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) + + sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( + "With unnamed columns", use_columns=use_columns_idx, skip_rows=1, column_names=column_names + ) + assert sheet.available_columns == column_names + + pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) + pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) + + # skipping the header row + first data row + expected_first_row_skipped = {k: v[1:] for k, v in expected.items()} + + sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( + "With unnamed columns", use_columns=use_columns_str, skip_rows=2, column_names=column_names + ) + assert sheet.available_columns == column_names + + pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected_first_row_skipped)) + pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected_first_row_skipped)) + + sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( + "With unnamed columns", use_columns=use_columns_idx, skip_rows=2, column_names=column_names + ) + assert sheet.available_columns == column_names + + pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected_first_row_skipped)) + pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected_first_row_skipped)) + + +def test_single_sheet_invalid_column_indices_negative_integer( + excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader, +) -> None: + expected_message = """invalid parameters: expected list[int] | list[str], got [-2] +Context: + 0: expected selected columns to be list[str] | list[int] | None, got Some([-2]) +""" + with pytest.raises(fastexcel.InvalidParametersError, 
match=re.escape(expected_message)): + excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[-2]) + + +def test_single_sheet_invalid_column_indices_empty_list( + excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader, +) -> None: + expected_message = """invalid parameters: list of selected columns is empty +Context: + 0: expected selected columns to be list[str] | list[int] | None, got Some([]) +""" + with pytest.raises(fastexcel.InvalidParametersError, match=re.escape(expected_message)): + excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[]) + + +def test_single_sheet_invalid_column_indices_column_does_not_exist_str( + excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader, +) -> None: + expected_message = """column with name "nope" not found +Context: + 0: selected columns are invalid, available columns are: ["Month", "Year"] +""" + with pytest.raises(fastexcel.ColumnNotFoundError, match=re.escape(expected_message)): + excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=["nope"]) + + +def test_single_sheet_invalid_column_indices_column_does_not_exist_int( + excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader, +) -> None: + expected_message = """column at index 42 not found +Context: + 0: selected columns are invalid, available columns are: ["Month", "Year"] +""" + with pytest.raises(fastexcel.ColumnNotFoundError, match=re.escape(expected_message)): + excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[42]) diff --git a/python/tests/test_errors.py b/python/tests/test_errors.py index 1b1c33e..4897158 100644 --- a/python/tests/test_errors.py +++ b/python/tests/test_errors.py @@ -51,6 +51,7 @@ def test_sheet_not_found_error() -> None: "calamine returned an error regarding the content of the cell", ), (fastexcel.CalamineError, "Generic calamine error"), + (fastexcel.ColumnNotFoundError, "Column was not found"), 
(fastexcel.SheetNotFoundError, "Sheet was not found"), (fastexcel.ArrowError, "Generic arrow error"), (fastexcel.InvalidParametersError, "Provided parameters are invalid"), diff --git a/src/error.rs b/src/error.rs index 4c6de09..48e6bdb 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,20 +1,28 @@ use std::{error::Error, fmt::Display}; #[derive(Debug)] -pub(crate) enum SheetIdxOrName { +pub(crate) enum IdxOrName { Idx(usize), - // Leaving this variant if someday we want to check if a name exists before calling worksheet_range - #[allow(dead_code)] Name(String), } +impl IdxOrName { + pub(super) fn format_message(&self) -> String { + match self { + Self::Idx(idx) => format!("at index {idx}"), + Self::Name(name) => format!("with name \"{name}\""), + } + } +} + #[derive(Debug)] pub(crate) enum FastExcelErrorKind { UnsupportedColumnTypeCombination(String), CannotRetrieveCellData(usize, usize), CalamineCellError(calamine::CellErrorType), CalamineError(calamine::Error), - SheetNotFound(SheetIdxOrName), + SheetNotFound(IdxOrName), + ColumnNotFound(IdxOrName), // Arrow errors can be of several different types (arrow::error::Error, PyError), and having // the actual type has not much value for us, so we just store a string context ArrowError(String), @@ -37,14 +45,13 @@ impl Display for FastExcelErrorKind { write!(f, "calamine error: {calamine_error}") } FastExcelErrorKind::SheetNotFound(idx_or_name) => { - let message = { - match idx_or_name { - SheetIdxOrName::Idx(idx) => format!("at index {idx}"), - SheetIdxOrName::Name(name) => format!("with name \"{name}\" not found"), - } - }; + let message = idx_or_name.format_message(); write!(f, "sheet {message} not found") } + FastExcelErrorKind::ColumnNotFound(idx_or_name) => { + let message = idx_or_name.format_message(); + write!(f, "column {message} not found") + } FastExcelErrorKind::ArrowError(err) => write!(f, "arrow error: {err}"), FastExcelErrorKind::InvalidParameters(err) => write!(f, "invalid parameters: {err}"), } @@ 
-53,7 +60,7 @@ impl Display for FastExcelErrorKind { #[derive(Debug)] pub(crate) struct FastExcelError { - kind: FastExcelErrorKind, + pub kind: FastExcelErrorKind, context: Vec, } @@ -166,6 +173,13 @@ pub(crate) mod py_errors { FastExcelError, "Sheet was not found" ); + // Column not found + create_exception!( + _fastexcel, + ColumnNotFoundError, + FastExcelError, + "Column was not found" + ); // Arrow error create_exception!( _fastexcel, @@ -209,6 +223,9 @@ pub(crate) mod py_errors { FastExcelErrorKind::SheetNotFound(_) => { SheetNotFoundError::new_err(message) } + FastExcelErrorKind::ColumnNotFound(_) => { + ColumnNotFoundError::new_err(message) + } FastExcelErrorKind::ArrowError(_) => ArrowError::new_err(message), FastExcelErrorKind::InvalidParameters(_) => { InvalidParametersError::new_err(message) diff --git a/src/lib.rs b/src/lib.rs index 7740754..1537816 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -55,6 +55,10 @@ fn _fastexcel(py: Python, m: &PyModule) -> PyResult<()> { "SheetNotFoundError", py.get_type::(), ), + ( + "ColumnNotFoundError", + py.get_type::(), + ), ("ArrowError", py.get_type::()), ( "InvalidParametersError", diff --git a/src/types/excelreader.rs b/src/types/excelreader.rs index 6424db1..f198061 100644 --- a/src/types/excelreader.rs +++ b/src/types/excelreader.rs @@ -1,10 +1,10 @@ use std::{fs::File, io::BufReader}; use calamine::{open_workbook_auto, Reader, Sheets}; -use pyo3::{pyclass, pymethods, PyResult}; +use pyo3::{pyclass, pymethods, types::PyList, PyResult}; use crate::error::{ - py_errors::IntoPyResult, ErrorContext, FastExcelErrorKind, FastExcelResult, SheetIdxOrName, + py_errors::IntoPyResult, ErrorContext, FastExcelErrorKind, FastExcelResult, IdxOrName, }; use super::{ @@ -50,7 +50,9 @@ impl ExcelReader { skip_rows = 0, n_rows = None, schema_sample_rows = 1_000, + use_columns = None ))] + #[allow(clippy::too_many_arguments)] pub fn load_sheet_by_name( &mut self, name: String, @@ -59,6 +61,7 @@ impl ExcelReader { skip_rows: usize, 
n_rows: Option, schema_sample_rows: Option, + use_columns: Option<&PyList>, ) -> PyResult { let range = self .sheets @@ -69,13 +72,16 @@ impl ExcelReader { let header = Header::new(header_row, column_names); let pagination = Pagination::new(skip_rows, n_rows, &range).into_pyresult()?; - Ok(ExcelSheet::new( + let selected_columns = use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | None, got {use_columns:?}")).into_pyresult()?; + ExcelSheet::try_new( name, range, header, pagination, schema_sample_rows, - )) + selected_columns, + ) + .into_pyresult() } #[pyo3(signature = ( @@ -86,7 +92,9 @@ impl ExcelReader { skip_rows = 0, n_rows = None, schema_sample_rows = 1_000, + use_columns = None ))] + #[allow(clippy::too_many_arguments)] pub fn load_sheet_by_idx( &mut self, idx: usize, @@ -95,11 +103,12 @@ impl ExcelReader { skip_rows: usize, n_rows: Option, schema_sample_rows: Option, + use_columns: Option<&PyList>, ) -> PyResult { let name = self .sheet_names .get(idx) - .ok_or_else(|| FastExcelErrorKind::SheetNotFound(SheetIdxOrName::Idx(idx)).into()) + .ok_or_else(|| FastExcelErrorKind::SheetNotFound(IdxOrName::Idx(idx)).into()) .with_context(|| { format!( "Sheet index {idx} is out of range. File has {} sheets", @@ -114,7 +123,7 @@ impl ExcelReader { .worksheet_range_at(idx) // Returns Option, Self::Error>>, so we convert the Option into a // SheetNotFoundError and unwrap it - .ok_or_else(|| FastExcelErrorKind::SheetNotFound(SheetIdxOrName::Idx(idx)).into()) + .ok_or_else(|| FastExcelErrorKind::SheetNotFound(IdxOrName::Idx(idx)).into()) .into_pyresult()? 
// And here, we convert the calamine error in an owned error and unwrap it .map_err(|err| FastExcelErrorKind::CalamineError(err).into()) @@ -122,12 +131,15 @@ impl ExcelReader { let header = Header::new(header_row, column_names); let pagination = Pagination::new(skip_rows, n_rows, &range).into_pyresult()?; - Ok(ExcelSheet::new( + let selected_columns = use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | None, got {use_columns:?}")).into_pyresult()?; + ExcelSheet::try_new( name, range, header, pagination, schema_sample_rows, - )) + selected_columns, + ) + .into_pyresult() } } diff --git a/src/types/excelsheet.rs b/src/types/excelsheet.rs index d950eec..5f42dff 100644 --- a/src/types/excelsheet.rs +++ b/src/types/excelsheet.rs @@ -2,6 +2,7 @@ use std::sync::Arc; use crate::error::{ py_errors::IntoPyResult, ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult, + IdxOrName, }; use arrow::{ @@ -18,11 +19,13 @@ use chrono::NaiveDate; use pyo3::{ prelude::{pyclass, pymethods, PyObject, Python}, + types::PyList, PyResult, }; use crate::utils::arrow::arrow_schema_from_column_names_and_range; +#[derive(Debug)] pub(crate) enum Header { None, At(usize), @@ -76,6 +79,102 @@ impl Pagination { } } +#[derive(Debug, PartialEq)] +pub(crate) enum SelectedColumns { + All, + ByIndex(Vec), + ByName(Vec), +} + +impl SelectedColumns { + pub(crate) fn validate_columns(&self, column_names: &[String]) -> FastExcelResult<()> { + match self { + SelectedColumns::All => Ok(()), + // If no selected indice is >= to the len of column_names, we're good + SelectedColumns::ByIndex(indices) => indices.iter().try_for_each(|idx| { + if idx >= &column_names.len() { + Err(FastExcelErrorKind::ColumnNotFound(IdxOrName::Idx(*idx)).into()) + } else { + Ok(()) + } + }), + // Every selected column must be in the provided column_names + SelectedColumns::ByName(selected_names) => { + selected_names.iter().try_for_each(|selected_name| { + if 
column_names.contains(selected_name) { + Ok(()) + } else { + Err(FastExcelErrorKind::ColumnNotFound(IdxOrName::Name( + selected_name.to_string(), + )) + .into()) + } + }) + } + } + } + + pub(crate) fn idx_for_column( + &self, + col_names: &[String], + col_name: &str, + col_idx: usize, + ) -> Option { + match self { + SelectedColumns::All => None, + SelectedColumns::ByIndex(indices) => { + if indices.contains(&col_idx) { + Some(col_idx) + } else { + None + } + } + SelectedColumns::ByName(names) => { + // cannot use .contains() because we have &String and &str + if names.iter().any(|name| name == col_name) { + col_names.iter().position(|name| name == col_name) + } else { + None + } + } + } + } + + pub(crate) fn to_python<'p>(&self, py: Python<'p>) -> Option<&'p PyList> { + match self { + SelectedColumns::All => None, + SelectedColumns::ByIndex(idx_vec) => Some(PyList::new(py, idx_vec)), + SelectedColumns::ByName(name_vec) => Some(PyList::new(py, name_vec)), + } + } +} + +impl TryFrom> for SelectedColumns { + type Error = FastExcelError; + + fn try_from(value: Option<&PyList>) -> FastExcelResult { + use FastExcelErrorKind::InvalidParameters; + + match value { + None => Ok(Self::All), + Some(py_list) => { + if py_list.is_empty() { + Err(InvalidParameters("list of selected columns is empty".to_string()).into()) + } else if let Ok(name_vec) = py_list.extract::>() { + Ok(Self::ByName(name_vec)) + } else if let Ok(index_vec) = py_list.extract::>() { + Ok(Self::ByIndex(index_vec)) + } else { + Err(InvalidParameters(format!( + "expected list[int] | list[str], got {py_list:?}" + )) + .into()) + } + } + } + } +} + #[pyclass(name = "_ExcelSheet")] pub(crate) struct ExcelSheet { #[pyo3(get)] @@ -87,6 +186,8 @@ pub(crate) struct ExcelSheet { total_height: Option, width: Option, schema_sample_rows: Option, + selected_columns: SelectedColumns, + available_columns: Vec, } impl ExcelSheet { @@ -94,26 +195,45 @@ impl ExcelSheet { &self.data } - pub(crate) fn new( + pub(crate) fn 
try_new( name: String, data: Range, header: Header, pagination: Pagination, schema_sample_rows: Option, - ) -> Self { - ExcelSheet { + selected_columns: SelectedColumns, + ) -> FastExcelResult { + let mut sheet = ExcelSheet { name, header, pagination, data, schema_sample_rows, + selected_columns, height: None, total_height: None, width: None, - } + // an empty vec as it will be replaced + available_columns: Vec::with_capacity(0), + }; + + let available_columns = sheet.get_available_columns(); + + // Ensuring selected columns are valid + sheet + .selected_columns + .validate_columns(&available_columns) + .with_context(|| { + format!( + "selected columns are invalid, available columns are: {available_columns:?}" + ) + })?; + + sheet.available_columns = available_columns; + Ok(sheet) } - pub(crate) fn column_names(&self) -> Vec { + fn get_available_columns(&self) -> Vec { let width = self.data.width(); match &self.header { Header::None => (0..width) @@ -263,10 +383,11 @@ impl TryFrom<&ExcelSheet> for Schema { arrow_schema_from_column_names_and_range( sheet.data(), - &sheet.column_names(), + &sheet.available_columns, sheet.offset(), // If sample_rows is higher than the sheet's limit, use the limit instead std::cmp::min(sample_rows, sheet.limit()), + &sheet.selected_columns, ) } } @@ -277,50 +398,71 @@ impl TryFrom<&ExcelSheet> for RecordBatch { fn try_from(sheet: &ExcelSheet) -> FastExcelResult { let offset = sheet.offset(); let limit = sheet.limit(); + let schema = Schema::try_from(sheet) - .with_context(|| format!("Could not build schema for sheet {}", sheet.name))?; - let mut iter = schema - .fields() + .with_context(|| format!("could not build schema for sheet {}", sheet.name))?; + + let mut iter = sheet + .available_columns .iter() .enumerate() - .map(|(col_idx, field)| { - ( - field.name(), - match field.data_type() { - ArrowDataType::Boolean => { - create_boolean_array(sheet.data(), col_idx, offset, limit) - } - ArrowDataType::Int64 => { - 
create_int_array(sheet.data(), col_idx, offset, limit) - } - ArrowDataType::Float64 => { - create_float_array(sheet.data(), col_idx, offset, limit) - } - ArrowDataType::Utf8 => { - create_string_array(sheet.data(), col_idx, offset, limit) - } - ArrowDataType::Timestamp(TimeUnit::Millisecond, None) => { - create_datetime_array(sheet.data(), col_idx, offset, limit) - } - ArrowDataType::Date32 => { - create_date_array(sheet.data(), col_idx, offset, limit) - } - ArrowDataType::Duration(TimeUnit::Millisecond) => { - create_duration_array(sheet.data(), col_idx, offset, limit) - } - ArrowDataType::Null => Arc::new(NullArray::new(limit - offset)), - _ => unreachable!(), - }, - ) + .filter_map(|(idx, column_name)| { + // checking if the current column has been selected + if let Some(col_idx) = match sheet.selected_columns { + // All columns selected, return the current index + SelectedColumns::All => Some(idx), + // Otherwise, return its index. If None is found, it means the column was not + // selected, and we will just continue + _ => sheet.selected_columns.idx_for_column( + &sheet.available_columns, + column_name, + idx, + ), + } { + // At this point, we know for sure that the column is in the schema so we can + // safely unwrap + let field = schema.field_with_name(column_name).unwrap(); + Some(( + field.name(), + match field.data_type() { + ArrowDataType::Boolean => { + create_boolean_array(sheet.data(), col_idx, offset, limit) + } + ArrowDataType::Int64 => { + create_int_array(sheet.data(), col_idx, offset, limit) + } + ArrowDataType::Float64 => { + create_float_array(sheet.data(), col_idx, offset, limit) + } + ArrowDataType::Utf8 => { + create_string_array(sheet.data(), col_idx, offset, limit) + } + ArrowDataType::Timestamp(TimeUnit::Millisecond, None) => { + create_datetime_array(sheet.data(), col_idx, offset, limit) + } + ArrowDataType::Date32 => { + create_date_array(sheet.data(), col_idx, offset, limit) + } + ArrowDataType::Duration(TimeUnit::Millisecond) => { + 
create_duration_array(sheet.data(), col_idx, offset, limit) + } + ArrowDataType::Null => Arc::new(NullArray::new(limit - offset)), + _ => unreachable!(), + }, + )) + } else { + None + } }) .peekable(); + // If the iterable is empty, try_from_iter returns an Err if iter.peek().is_none() { Ok(RecordBatch::new_empty(Arc::new(schema))) } else { RecordBatch::try_from_iter(iter) .map_err(|err| FastExcelErrorKind::ArrowError(err.to_string()).into()) - .with_context(|| format!("Could not convert sheet {} to RecordBatch", sheet.name)) + .with_context(|| format!("could not convert sheet {} to RecordBatch", sheet.name)) } } } @@ -359,16 +501,26 @@ impl ExcelSheet { self.header.offset() + self.pagination.offset() } + #[getter] + pub fn selected_columns<'p>(&'p self, py: Python<'p>) -> Option<&PyList> { + self.selected_columns.to_python(py) + } + + #[getter] + pub fn available_columns<'p>(&'p self, py: Python<'p>) -> &PyList { + PyList::new(py, &self.available_columns) + } + pub fn to_arrow(&self, py: Python<'_>) -> PyResult { RecordBatch::try_from(self) - .with_context(|| format!("Could not create RecordBatch from sheet {}", self.name)) + .with_context(|| format!("could not create RecordBatch from sheet \"{}\"", &self.name)) .and_then(|rb| { rb.to_pyarrow(py) .map_err(|err| FastExcelErrorKind::ArrowError(err.to_string()).into()) }) .with_context(|| { format!( - "Could not convert RecordBatch to pyarrow for sheet {}", + "could not convert RecordBatch to pyarrow for sheet \"{}\"", self.name ) }) @@ -379,3 +531,69 @@ impl ExcelSheet { format!("ExcelSheet<{}>", self.name) } } + +#[cfg(test)] +mod tests { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn selected_columns_from_none() { + assert_eq!( + TryInto::::try_into(None).unwrap(), + SelectedColumns::All + ) + } + + #[test] + fn selected_columns_from_list_of_valid_ints() { + Python::with_gil(|py| { + let py_list = PyList::new(py, vec![0, 1, 2]); + assert_eq!( + TryInto::::try_into(Some(py_list)).unwrap(), + 
SelectedColumns::ByIndex(vec![0, 1, 2]) + ) + }); + } + + #[test] + fn selected_columns_from_list_of_valid_strings() { + Python::with_gil(|py| { + let py_list = PyList::new(py, vec!["foo", "bar"]); + assert_eq!( + TryInto::::try_into(Some(py_list)).unwrap(), + SelectedColumns::ByName(vec!["foo".to_string(), "bar".to_string()]) + ) + }); + } + + #[test] + fn selected_columns_from_invalid_ints() { + Python::with_gil(|py| { + let py_list = PyList::new(py, vec![0, 2, -1]); + let err = TryInto::::try_into(Some(py_list)).unwrap_err(); + + assert!(matches!(err.kind, FastExcelErrorKind::InvalidParameters(_))); + }); + } + + #[test] + fn selected_columns_from_empty_int_list() { + Python::with_gil(|py| { + let py_list = PyList::new(py, Vec::::new()); + let err = TryInto::::try_into(Some(py_list)).unwrap_err(); + + assert!(matches!(err.kind, FastExcelErrorKind::InvalidParameters(_))); + }); + } + + #[test] + fn selected_columns_from_empty_string_list() { + Python::with_gil(|py| { + let py_list = PyList::new(py, Vec::::new()); + let err = TryInto::::try_into(Some(py_list)).unwrap_err(); + + assert!(matches!(err.kind, FastExcelErrorKind::InvalidParameters(_))); + }); + } +} diff --git a/src/utils/arrow.rs b/src/utils/arrow.rs index 33501d9..7da209b 100644 --- a/src/utils/arrow.rs +++ b/src/utils/arrow.rs @@ -3,7 +3,10 @@ use std::{collections::HashSet, sync::OnceLock}; use arrow::datatypes::{DataType as ArrowDataType, Field, Schema, TimeUnit}; use calamine::{CellErrorType, Data as CalData, DataType, Range}; -use crate::error::{FastExcelErrorKind, FastExcelResult}; +use crate::{ + error::{FastExcelErrorKind, FastExcelResult}, + types::excelsheet::SelectedColumns, +}; /// All the possible string values that should be considered as NULL const NULL_STRING_VALUES: [&str; 19] = [ @@ -136,12 +139,20 @@ pub(crate) fn arrow_schema_from_column_names_and_range( column_names: &[String], row_idx: usize, row_limit: usize, + selected_columns: &SelectedColumns, ) -> FastExcelResult { let mut 
fields = Vec::with_capacity(column_names.len()); - for (col_idx, name) in column_names.iter().enumerate() { - let col_type = get_arrow_column_type(range, row_idx, row_limit, col_idx)?; - fields.push(Field::new(&alias_for_name(name, &fields), col_type, true)); + for (idx, name) in column_names.iter().enumerate() { + // If we have an index for the given column, extract it and add it to the schema. Otherwise, + // just ignore it + if let Some(col_idx) = match selected_columns { + SelectedColumns::All => Some(idx), + _ => selected_columns.idx_for_column(column_names, name, idx), + } { + let col_type = get_arrow_column_type(range, row_idx, row_limit, col_idx)?; + fields.push(Field::new(&alias_for_name(name, &fields), col_type, true)); + } } Ok(Schema::new(fields)) diff --git a/test.py b/test.py index 163f028..7ce0f28 100644 --- a/test.py +++ b/test.py @@ -6,14 +6,17 @@ def get_args() -> argparse.Namespace: parser = argparse.ArgumentParser() parser.add_argument("file") + parser.add_argument("-c", "--column", type=str, nargs="+", help="the columns to use") return parser.parse_args() def main(): args = get_args() excel_file = fastexcel.read_excel(args.file) + use_columns = args.column or None + for sheet_name in excel_file.sheet_names: - excel_file.load_sheet_by_name(sheet_name).to_pandas() + excel_file.load_sheet_by_name(sheet_name, use_columns=use_columns).to_arrow() if __name__ == "__main__":