From 39491322b5be423cd25c386d8e2e707cc3f0f28a Mon Sep 17 00:00:00 2001 From: Luka Peschke Date: Sun, 25 Feb 2024 13:31:02 +0100 Subject: [PATCH 01/14] feat(Makefile): split test and lint targets between rust and python Signed-off-by: Luka Peschke --- Makefile | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index d4e2ee7..defe771 100644 --- a/Makefile +++ b/Makefile @@ -13,17 +13,26 @@ cargo-test = cargo test ## Docs pdoc = pdoc -o docs python/fastexcel -lint: +lint-python: $(ruff) $(format) --check --diff + +lint-rust: $(mypy) $(clippy) -format: + +lint: lint-rust lint-python + +format-python: $(ruff) --fix $(format) + +format-rust: $(fmt) $(clippy) --fix --lib -p fastexcel --allow-dirty --allow-staged +format: format-rust format-python + install-test-requirements: pip install -U -r test-requirements.txt -r build-requirements.txt @@ -39,10 +48,14 @@ dev-install: prod-install: ./prod_install.sh -test: +test-rust: $(cargo-test) + +test-python: $(pytest) +test: test-rust test-python + doc: $(pdoc) From 048405d07b53c0813f92f3d2122cad34707e0731 Mon Sep 17 00:00:00 2001 From: Luka Peschke Date: Sun, 25 Feb 2024 13:31:27 +0100 Subject: [PATCH 02/14] fix(build): allow tests in pyo3 modules to be run Signed-off-by: Luka Peschke --- Cargo.toml | 9 ++++++++- Makefile | 2 +- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index c62376d..62d12c4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,7 +11,8 @@ crate-type = ["cdylib"] [dependencies] calamine = { version = "0.24.0", features = ["dates"] } chrono = { version = "0.4.34", default-features = false } -pyo3 = { version = "0.20.3", features = ["extension-module", "abi3-py38"] } +# NOTE: "extension-module" is actually required, see comments on features below +pyo3 = { version = "0.20.3", features = ["abi3-py38"] } [dependencies.arrow] version = "50.0.0" @@ -21,3 +22,9 @@ features = ["pyarrow"] [dev-dependencies] rstest = { version = 
"0.18.2", default-features = false } + +# NOTE: This is a hack to bypass pyo3 limitations when testing: +# https://pyo3.rs/v0.20.3/faq.html#i-cant-run-cargo-test-or-i-cant-build-in-a-cargo-workspace-im-having-linker-issues-like-symbol-not-found-or-undefined-reference-to-_pyexc_systemerror +[features] +extension-module = ["pyo3/extension-module"] +default = ["extension-module"] diff --git a/Makefile b/Makefile index defe771..eb4295d 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ pytest = pytest -v ## Rust clippy = cargo clippy fmt = cargo fmt -cargo-test = cargo test +cargo-test = cargo test --no-default-features ## Docs pdoc = pdoc -o docs python/fastexcel From 5e0bd762e430a3bbc59353658ca43ec4a1a682f5 Mon Sep 17 00:00:00 2001 From: Luka Peschke Date: Sun, 25 Feb 2024 13:40:39 +0100 Subject: [PATCH 03/14] refactor(build): Allow rust tests to spin up python interpreters Signed-off-by: Luka Peschke --- Cargo.toml | 3 +++ Makefile | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 62d12c4..f2f85e1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,3 +28,6 @@ rstest = { version = "0.18.2", default-features = false } [features] extension-module = ["pyo3/extension-module"] default = ["extension-module"] +# feature for tests only. 
This makes Python::with_gil auto-initialize Python +# interpreters, which allows us to instantiate Python objects in tests +tests = ["pyo3/auto-initialize"] diff --git a/Makefile b/Makefile index eb4295d..46ad571 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ pytest = pytest -v ## Rust clippy = cargo clippy fmt = cargo fmt -cargo-test = cargo test --no-default-features +cargo-test = cargo test --no-default-features --features tests ## Docs pdoc = pdoc -o docs python/fastexcel From f044a51c9964459fe371838f1e900cf5818565ec Mon Sep 17 00:00:00 2001 From: Luka Peschke Date: Sun, 25 Feb 2024 14:14:17 +0100 Subject: [PATCH 04/14] feat: introduced a ColumnNotFoundError exception Signed-off-by: Luka Peschke --- python/fastexcel/__init__.py | 2 ++ python/fastexcel/_fastexcel.pyi | 1 + python/tests/test_errors.py | 1 + src/error.rs | 41 +++++++++++++++++++++++++-------- src/lib.rs | 4 ++++ src/types/excelreader.rs | 6 ++--- 6 files changed, 42 insertions(+), 13 deletions(-) diff --git a/python/fastexcel/__init__.py b/python/fastexcel/__init__.py index fb00d9b..232ce4f 100644 --- a/python/fastexcel/__init__.py +++ b/python/fastexcel/__init__.py @@ -17,6 +17,7 @@ CalamineCellError, CalamineError, CannotRetrieveCellDataError, + ColumnNotFoundError, FastExcelError, InvalidParametersError, SheetNotFoundError, @@ -224,6 +225,7 @@ def read_excel(path: Path | str) -> ExcelReader: "CalamineCellError", "CalamineError", "SheetNotFoundError", + "ColumnNotFoundError", "ArrowError", "InvalidParametersError", "UnsupportedColumnTypeCombinationError", diff --git a/python/fastexcel/_fastexcel.pyi b/python/fastexcel/_fastexcel.pyi index 865f4e7..6524559 100644 --- a/python/fastexcel/_fastexcel.pyi +++ b/python/fastexcel/_fastexcel.pyi @@ -69,5 +69,6 @@ class CannotRetrieveCellDataError(FastExcelError): ... class CalamineCellError(FastExcelError): ... class CalamineError(FastExcelError): ... class SheetNotFoundError(FastExcelError): ... +class ColumnNotFoundError(FastExcelError): ... 
class ArrowError(FastExcelError): ... class InvalidParametersError(FastExcelError): ... diff --git a/python/tests/test_errors.py b/python/tests/test_errors.py index 1b1c33e..4897158 100644 --- a/python/tests/test_errors.py +++ b/python/tests/test_errors.py @@ -51,6 +51,7 @@ def test_sheet_not_found_error() -> None: "calamine returned an error regarding the content of the cell", ), (fastexcel.CalamineError, "Generic calamine error"), + (fastexcel.ColumnNotFoundError, "Column was not found"), (fastexcel.SheetNotFoundError, "Sheet was not found"), (fastexcel.ArrowError, "Generic arrow error"), (fastexcel.InvalidParametersError, "Provided parameters are invalid"), diff --git a/src/error.rs b/src/error.rs index 4c6de09..2f056f7 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,20 +1,28 @@ use std::{error::Error, fmt::Display}; #[derive(Debug)] -pub(crate) enum SheetIdxOrName { +pub(crate) enum IdxOrName { Idx(usize), - // Leaving this variant if someday we want to check if a name exists before calling worksheet_range - #[allow(dead_code)] Name(String), } +impl IdxOrName { + pub(super) fn format_message(&self) -> String { + match self { + Self::Idx(idx) => format!("at index {idx}"), + Self::Name(name) => format!("with name \"{name}\" not found"), + } + } +} + #[derive(Debug)] pub(crate) enum FastExcelErrorKind { UnsupportedColumnTypeCombination(String), CannotRetrieveCellData(usize, usize), CalamineCellError(calamine::CellErrorType), CalamineError(calamine::Error), - SheetNotFound(SheetIdxOrName), + SheetNotFound(IdxOrName), + ColumnNotFound(IdxOrName), // Arrow errors can be of several different types (arrow::error::Error, PyError), and having // the actual type has not much value for us, so we just store a string context ArrowError(String), @@ -37,14 +45,13 @@ impl Display for FastExcelErrorKind { write!(f, "calamine error: {calamine_error}") } FastExcelErrorKind::SheetNotFound(idx_or_name) => { - let message = { - match idx_or_name { - SheetIdxOrName::Idx(idx) => 
format!("at index {idx}"), - SheetIdxOrName::Name(name) => format!("with name \"{name}\" not found"), - } - }; + let message = idx_or_name.format_message(); write!(f, "sheet {message} not found") } + FastExcelErrorKind::ColumnNotFound(idx_or_name) => { + let message = idx_or_name.format_message(); + write!(f, "column {message} not found") + } FastExcelErrorKind::ArrowError(err) => write!(f, "arrow error: {err}"), FastExcelErrorKind::InvalidParameters(err) => write!(f, "invalid parameters: {err}"), } @@ -70,6 +77,10 @@ impl FastExcelError { context: vec![], } } + + pub(crate) fn kind(&self) -> &FastExcelErrorKind { + &self.kind + } } impl Display for FastExcelError { @@ -166,6 +177,13 @@ pub(crate) mod py_errors { FastExcelError, "Sheet was not found" ); + // Column not found + create_exception!( + _fastexcel, + ColumnNotFoundError, + FastExcelError, + "Column was not found" + ); // Arrow error create_exception!( _fastexcel, @@ -209,6 +227,9 @@ pub(crate) mod py_errors { FastExcelErrorKind::SheetNotFound(_) => { SheetNotFoundError::new_err(message) } + FastExcelErrorKind::ColumnNotFound(_) => { + ColumnNotFoundError::new_err(message) + } FastExcelErrorKind::ArrowError(_) => ArrowError::new_err(message), FastExcelErrorKind::InvalidParameters(_) => { InvalidParametersError::new_err(message) diff --git a/src/lib.rs b/src/lib.rs index 7740754..1537816 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -55,6 +55,10 @@ fn _fastexcel(py: Python, m: &PyModule) -> PyResult<()> { "SheetNotFoundError", py.get_type::(), ), + ( + "ColumnNotFoundError", + py.get_type::(), + ), ("ArrowError", py.get_type::()), ( "InvalidParametersError", diff --git a/src/types/excelreader.rs b/src/types/excelreader.rs index 6424db1..976bddb 100644 --- a/src/types/excelreader.rs +++ b/src/types/excelreader.rs @@ -4,7 +4,7 @@ use calamine::{open_workbook_auto, Reader, Sheets}; use pyo3::{pyclass, pymethods, PyResult}; use crate::error::{ - py_errors::IntoPyResult, ErrorContext, FastExcelErrorKind, 
FastExcelResult, SheetIdxOrName, + py_errors::IntoPyResult, ErrorContext, FastExcelErrorKind, FastExcelResult, IdxOrName, }; use super::{ @@ -99,7 +99,7 @@ impl ExcelReader { let name = self .sheet_names .get(idx) - .ok_or_else(|| FastExcelErrorKind::SheetNotFound(SheetIdxOrName::Idx(idx)).into()) + .ok_or_else(|| FastExcelErrorKind::SheetNotFound(IdxOrName::Idx(idx)).into()) .with_context(|| { format!( "Sheet index {idx} is out of range. File has {} sheets", @@ -114,7 +114,7 @@ impl ExcelReader { .worksheet_range_at(idx) // Returns Option, Self::Error>>, so we convert the Option into a // SheetNotFoundError and unwrap it - .ok_or_else(|| FastExcelErrorKind::SheetNotFound(SheetIdxOrName::Idx(idx)).into()) + .ok_or_else(|| FastExcelErrorKind::SheetNotFound(IdxOrName::Idx(idx)).into()) .into_pyresult()? // And here, we convert the calamine error in an owned error and unwrap it .map_err(|err| FastExcelErrorKind::CalamineError(err).into()) From c5e6f682af22cacf6b20c719ff443d76f41e8617 Mon Sep 17 00:00:00 2001 From: Luka Peschke Date: Sun, 25 Feb 2024 17:19:35 +0100 Subject: [PATCH 05/14] feat: allow to select a subset of columns closes #172 Signed-off-by: Luka Peschke --- Cargo.lock | 23 +++ Cargo.toml | 1 + python/fastexcel/__init__.py | 7 + python/fastexcel/_fastexcel.pyi | 12 +- python/tests/test_column_selection.py | 251 ++++++++++++++++++++++++ src/error.rs | 8 +- src/types/excelreader.rs | 12 +- src/types/excelsheet.rs | 263 ++++++++++++++++++++++---- src/utils/arrow.rs | 19 +- 9 files changed, 540 insertions(+), 56 deletions(-) create mode 100644 python/tests/test_column_selection.py diff --git a/Cargo.lock b/Cargo.lock index 8c44037..e7c0053 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -344,6 +344,12 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" +[[package]] +name = "diff" +version = "0.1.13" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8" + [[package]] name = "encoding_rs" version = "0.8.31" @@ -360,6 +366,7 @@ dependencies = [ "arrow", "calamine", "chrono", + "pretty_assertions", "pyo3", "rstest", ] @@ -678,6 +685,16 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0" +[[package]] +name = "pretty_assertions" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af7cee1a6c8a5b9208b3cb1061f10c0cb689087b3d8ce85fb9d2dd7a29b6ba66" +dependencies = [ + "diff", + "yansi", +] + [[package]] name = "proc-macro-hack" version = "0.5.19" @@ -1122,6 +1139,12 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" +[[package]] +name = "yansi" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09041cd90cf85f7f8b2df60c646f853b7f535ce68f85244eb6731cf89fa498ec" + [[package]] name = "zip" version = "0.6.3" diff --git a/Cargo.toml b/Cargo.toml index f2f85e1..96e8512 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,7 @@ default-features = false features = ["pyarrow"] [dev-dependencies] +pretty_assertions = "1.4.0" rstest = { version = "0.18.2", default-features = false } # NOTE: This is a hack to bypass pyo3 limitations when testing: diff --git a/python/fastexcel/__init__.py b/python/fastexcel/__init__.py index 232ce4f..4776c21 100644 --- a/python/fastexcel/__init__.py +++ b/python/fastexcel/__init__.py @@ -102,6 +102,7 @@ def load_sheet_by_name( skip_rows: int = 0, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, + use_columns: list[str] | list[int] | None = None, ) -> ExcelSheet: """Loads a sheet by name. 
@@ -127,6 +128,7 @@ def load_sheet_by_name( skip_rows=skip_rows, n_rows=n_rows, schema_sample_rows=schema_sample_rows, + use_columns=use_columns, ) ) @@ -139,6 +141,7 @@ def load_sheet_by_idx( skip_rows: int = 0, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, + use_columns: list[str] | list[int] | None = None, ) -> ExcelSheet: """Loads a sheet by index. @@ -166,6 +169,7 @@ def load_sheet_by_idx( skip_rows=skip_rows, n_rows=n_rows, schema_sample_rows=schema_sample_rows, + use_columns=use_columns, ) ) @@ -178,6 +182,7 @@ def load_sheet( skip_rows: int = 0, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, + use_columns: list[str] | list[int] | None = None, ) -> ExcelSheet: """Loads a sheet by name if a string is passed or by index if an integer is passed. @@ -191,6 +196,7 @@ def load_sheet( skip_rows=skip_rows, n_rows=n_rows, schema_sample_rows=schema_sample_rows, + use_columns=use_columns, ) if isinstance(idx_or_name, int) else self.load_sheet_by_name( @@ -200,6 +206,7 @@ def load_sheet( skip_rows=skip_rows, n_rows=n_rows, schema_sample_rows=schema_sample_rows, + use_columns=use_columns, ) ) diff --git a/python/fastexcel/_fastexcel.pyi b/python/fastexcel/_fastexcel.pyi index 6524559..7db8493 100644 --- a/python/fastexcel/_fastexcel.pyi +++ b/python/fastexcel/_fastexcel.pyi @@ -33,6 +33,7 @@ class _ExcelReader: skip_rows: int = 0, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, + use_columns: list[str] | list[int] | None = None, ) -> _ExcelSheet: ... def load_sheet_by_idx( self, @@ -43,16 +44,7 @@ class _ExcelReader: skip_rows: int = 0, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, - ) -> _ExcelSheet: ... 
- def load_sheet( - self, - idx_or_name: int | str, - *, - header_row: int | None = 0, - column_names: list[str] | None = None, - skip_rows: int = 0, - n_rows: int | None = None, - schema_sample_rows: int | None = 1_000, + use_columns: list[str] | list[int] | None = None, ) -> _ExcelSheet: ... @property def sheet_names(self) -> list[str]: ... diff --git a/python/tests/test_column_selection.py b/python/tests/test_column_selection.py new file mode 100644 index 0000000..4107e07 --- /dev/null +++ b/python/tests/test_column_selection.py @@ -0,0 +1,251 @@ +from __future__ import annotations + +import re +from typing import Any + +import fastexcel +import pandas as pd +import polars as pl +import pytest +from pandas.testing import assert_frame_equal as pd_assert_frame_equal +from polars.testing import assert_frame_equal as pl_assert_frame_equal +from utils import path_for_fixture + + +@pytest.fixture +def excel_reader_single_sheet() -> fastexcel.ExcelReader: + return fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx")) + + +def test_single_sheet_all_columns(excel_reader_single_sheet: fastexcel.ExcelReader) -> None: + sheet = excel_reader_single_sheet.load_sheet(0) + + sheet_explicit_arg = excel_reader_single_sheet.load_sheet(0, use_columns=None) + + expected = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]} + expected_pd_df = pd.DataFrame(expected) + expected_pl_df = pl.DataFrame(expected) + + pd_df = sheet.to_pandas() + pd_assert_frame_equal(pd_df, expected_pd_df) + pd_df_explicit_arg = sheet_explicit_arg.to_pandas() + pd_assert_frame_equal(pd_df_explicit_arg, expected_pd_df) + + pl_df = sheet.to_polars() + pl_assert_frame_equal(pl_df, expected_pl_df) + pl_df_explicit_arg = sheet_explicit_arg.to_polars() + pl_assert_frame_equal(pl_df_explicit_arg, expected_pl_df) + + +def test_single_sheet_subset_by_str(excel_reader_single_sheet: fastexcel.ExcelReader) -> None: + expected = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]} + + for sheet_name_or_idx in [0, 
"January"]: + for col in ["Month", "Year"]: + sheet = excel_reader_single_sheet.load_sheet(0, use_columns=[col]) + + pd_df = sheet.to_pandas() + pd_assert_frame_equal(pd_df, pd.DataFrame({col: expected[col]})) + + pl_df = sheet.to_polars() + pl_assert_frame_equal(pl_df, pl.DataFrame({col: expected[col]})) + + +def test_single_sheet_subset_by_index(excel_reader_single_sheet: fastexcel.ExcelReader) -> None: + expected = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]} + + for sheet_name_or_idx in [0, "January"]: + for idx, col_name in enumerate(["Month", "Year"]): + sheet = excel_reader_single_sheet.load_sheet(0, use_columns=[idx]) + + pd_df = sheet.to_pandas() + pd_assert_frame_equal(pd_df, pd.DataFrame({col_name: expected[col_name]})) + + pl_df = sheet.to_polars() + pl_assert_frame_equal(pl_df, pl.DataFrame({col_name: expected[col_name]})) + + +@pytest.fixture +def excel_reader_single_sheet_with_unnamed_columns() -> fastexcel.ExcelReader: + return fastexcel.read_excel(path_for_fixture("fixture-multi-sheet.xlsx")) + + +@pytest.fixture +def single_sheet_with_unnamed_columns_expected() -> dict[str, list[Any]]: + return { + "col1": [2.0, 3.0], + "__UNNAMED__1": [1.5, 2.5], + "col3": ["hello", "world"], + "__UNNAMED__3": [-5.0, -6.0], + "col5": ["a", "b"], + } + + +def test_single_sheet_with_unnamed_columns( + excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader, + single_sheet_with_unnamed_columns_expected: dict[str, list[Any]], +) -> None: + use_columns_str = ["col1", "col3", "__UNNAMED__3"] + use_columns_idx = [0, 2, 3] + expected = { + k: v for k, v in single_sheet_with_unnamed_columns_expected.items() if k in use_columns_str + } + + sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( + "With unnamed columns", use_columns=use_columns_str + ) + + pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) + pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) + + sheet = 
excel_reader_single_sheet_with_unnamed_columns.load_sheet( + "With unnamed columns", use_columns=use_columns_idx + ) + + pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) + pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) + + +def test_single_sheet_with_unnamed_columns_and_pagination( + excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader, + single_sheet_with_unnamed_columns_expected: dict[str, list[Any]], +) -> None: + use_columns_str = ["col1", "col3", "__UNNAMED__3"] + use_columns_idx = [0, 2, 3] + + # first row only + expected = { + k: v[:1] + for k, v in single_sheet_with_unnamed_columns_expected.items() + if k in use_columns_str + } + + sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( + "With unnamed columns", use_columns=use_columns_str, n_rows=1 + ) + + pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) + pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) + + sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( + "With unnamed columns", use_columns=use_columns_idx, n_rows=1 + ) + + pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) + pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) + + # second row + expected = { + k: v[1:] + for k, v in single_sheet_with_unnamed_columns_expected.items() + if k in use_columns_str + } + + sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( + "With unnamed columns", use_columns=use_columns_str, skip_rows=1 + ) + + pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) + pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) + + sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( + "With unnamed columns", use_columns=use_columns_idx, skip_rows=1 + ) + + pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) + pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) + + +def 
test_single_sheet_with_unnamed_columns_and_pagination_and_column_names( + excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader, +) -> None: + use_columns_str = ["col0", "col2", "col3"] + use_columns_idx = [0, 2, 3] + expected: dict[str, list[Any]] = { + "col0": [2.0, 3.0], + "col2": ["hello", "world"], + "col3": [-5.0, -6.0], + } + column_names = [f"col{i}" for i in range(5)] + + # skipping the header row only + sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( + "With unnamed columns", use_columns=use_columns_str, skip_rows=1, column_names=column_names + ) + + pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) + pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) + + sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( + "With unnamed columns", use_columns=use_columns_idx, skip_rows=1, column_names=column_names + ) + + pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) + pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) + + # skipping the header row + first data row + expected_first_row_skipped = {k: v[1:] for k, v in expected.items()} + + sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( + "With unnamed columns", use_columns=use_columns_str, skip_rows=2, column_names=column_names + ) + + pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected_first_row_skipped)) + pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected_first_row_skipped)) + + sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( + "With unnamed columns", use_columns=use_columns_idx, skip_rows=2, column_names=column_names + ) + + pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected_first_row_skipped)) + pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected_first_row_skipped)) + + +def test_single_sheet_invalid_column_indices_negative_integer( + excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader, +) -> None: + 
expected_message = """invalid parameters: expected list[int] | list[str], got [-2] +Context: + 0: expected selected columns to be list[str] | list[int] | None, got Some([-2]) +""" + with pytest.raises(fastexcel.InvalidParametersError, match=re.escape(expected_message)): + excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[-2]) + + +def test_single_sheet_invalid_column_indices_empty_list( + excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader, +) -> None: + expected_message = """invalid parameters: list of select columns is empty +Context: + 0: expected selected columns to be list[str] | list[int] | None, got Some([]) +""" + with pytest.raises(fastexcel.InvalidParametersError, match=re.escape(expected_message)): + excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[]) + + +def test_single_sheet_invalid_column_indices_column_does_not_exist_str( + excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader, +) -> None: + expected_message = """column with name "nope" not found +Context: + 0: selected columns are invalid + 1: could not create RecordBatch from sheet "January" + 2: could not convert RecordBatch to pyarrow for sheet "January" +""" + with pytest.raises(fastexcel.ColumnNotFoundError, match=re.escape(expected_message)): + excel_reader_single_sheet_with_unnamed_columns.load_sheet( + 0, use_columns=["nope"] + ).to_arrow() + + +def test_single_sheet_invalid_column_indices_column_does_not_exist_int( + excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader, +) -> None: + expected_message = """column at index 42 not found +Context: + 0: selected columns are invalid + 1: could not create RecordBatch from sheet "January" + 2: could not convert RecordBatch to pyarrow for sheet "January" +""" + with pytest.raises(fastexcel.ColumnNotFoundError, match=re.escape(expected_message)): + excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[42]).to_arrow() diff --git 
a/src/error.rs b/src/error.rs index 2f056f7..48e6bdb 100644 --- a/src/error.rs +++ b/src/error.rs @@ -10,7 +10,7 @@ impl IdxOrName { pub(super) fn format_message(&self) -> String { match self { Self::Idx(idx) => format!("at index {idx}"), - Self::Name(name) => format!("with name \"{name}\" not found"), + Self::Name(name) => format!("with name \"{name}\""), } } } @@ -60,7 +60,7 @@ impl Display for FastExcelErrorKind { #[derive(Debug)] pub(crate) struct FastExcelError { - kind: FastExcelErrorKind, + pub kind: FastExcelErrorKind, context: Vec, } @@ -77,10 +77,6 @@ impl FastExcelError { context: vec![], } } - - pub(crate) fn kind(&self) -> &FastExcelErrorKind { - &self.kind - } } impl Display for FastExcelError { diff --git a/src/types/excelreader.rs b/src/types/excelreader.rs index 976bddb..e6986da 100644 --- a/src/types/excelreader.rs +++ b/src/types/excelreader.rs @@ -1,7 +1,7 @@ use std::{fs::File, io::BufReader}; use calamine::{open_workbook_auto, Reader, Sheets}; -use pyo3::{pyclass, pymethods, PyResult}; +use pyo3::{pyclass, pymethods, types::PyList, PyResult}; use crate::error::{ py_errors::IntoPyResult, ErrorContext, FastExcelErrorKind, FastExcelResult, IdxOrName, @@ -50,7 +50,9 @@ impl ExcelReader { skip_rows = 0, n_rows = None, schema_sample_rows = 1_000, + use_columns = None ))] + #[allow(clippy::too_many_arguments)] pub fn load_sheet_by_name( &mut self, name: String, @@ -59,6 +61,7 @@ impl ExcelReader { skip_rows: usize, n_rows: Option, schema_sample_rows: Option, + use_columns: Option<&PyList>, ) -> PyResult { let range = self .sheets @@ -69,12 +72,14 @@ impl ExcelReader { let header = Header::new(header_row, column_names); let pagination = Pagination::new(skip_rows, n_rows, &range).into_pyresult()?; + let selected_columns = use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | None, got {use_columns:?}")).into_pyresult()?; Ok(ExcelSheet::new( name, range, header, pagination, schema_sample_rows, + 
selected_columns, )) } @@ -86,7 +91,9 @@ impl ExcelReader { skip_rows = 0, n_rows = None, schema_sample_rows = 1_000, + use_columns = None ))] + #[allow(clippy::too_many_arguments)] pub fn load_sheet_by_idx( &mut self, idx: usize, @@ -95,6 +102,7 @@ impl ExcelReader { skip_rows: usize, n_rows: Option, schema_sample_rows: Option, + use_columns: Option<&PyList>, ) -> PyResult { let name = self .sheet_names @@ -122,12 +130,14 @@ impl ExcelReader { let header = Header::new(header_row, column_names); let pagination = Pagination::new(skip_rows, n_rows, &range).into_pyresult()?; + let selected_columns = use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | None, got {use_columns:?}")).into_pyresult()?; Ok(ExcelSheet::new( name, range, header, pagination, schema_sample_rows, + selected_columns, )) } } diff --git a/src/types/excelsheet.rs b/src/types/excelsheet.rs index d950eec..964f07a 100644 --- a/src/types/excelsheet.rs +++ b/src/types/excelsheet.rs @@ -2,6 +2,7 @@ use std::sync::Arc; use crate::error::{ py_errors::IntoPyResult, ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult, + IdxOrName, }; use arrow::{ @@ -18,11 +19,13 @@ use chrono::NaiveDate; use pyo3::{ prelude::{pyclass, pymethods, PyObject, Python}, + types::PyList, PyResult, }; use crate::utils::arrow::arrow_schema_from_column_names_and_range; +#[derive(Debug)] pub(crate) enum Header { None, At(usize), @@ -76,6 +79,100 @@ impl Pagination { } } +#[derive(Debug, PartialEq)] +pub(crate) enum SelectedColumns { + All, + ByIndex(Vec), + ByName(Vec), +} + +impl SelectedColumns { + pub(crate) fn validate_columns(&self, column_names: &[String]) -> FastExcelResult<()> { + match self { + SelectedColumns::All => Ok(()), + // If no selected indice is >= to the len of column_names, we're good + SelectedColumns::ByIndex(indices) => indices.iter().try_for_each(|idx| { + if idx >= &column_names.len() { + 
Err(FastExcelErrorKind::ColumnNotFound(IdxOrName::Idx(*idx)).into()) + } else { + Ok(()) + } + }), + // Every selected column must be in the provided column_names + SelectedColumns::ByName(selected_names) => { + selected_names.iter().try_for_each(|selected_name| { + if column_names.contains(selected_name) { + Ok(()) + } else { + Err(FastExcelErrorKind::ColumnNotFound(IdxOrName::Name( + selected_name.to_string(), + )) + .into()) + } + }) + } + } + } + + pub(crate) fn idx_for_column( + &self, + col_names: &[String], + col_name: &str, + col_idx: usize, + ) -> Option { + match self { + SelectedColumns::All => None, + SelectedColumns::ByIndex(indices) => { + if indices.contains(&col_idx) { + Some(col_idx) + } else { + None + } + } + SelectedColumns::ByName(names) => { + // cannot use .contains() because we have &String and &str + if names.iter().any(|name| name == col_name) { + col_names.iter().position(|name| name == col_name) + } else { + None + } + } + } + } +} + +impl TryFrom> for SelectedColumns { + type Error = FastExcelError; + + fn try_from(value: Option<&PyList>) -> FastExcelResult { + use FastExcelErrorKind::InvalidParameters; + + match value { + None => Ok(Self::All), + Some(py_list) => { + if let Ok(name_vec) = py_list.extract::>() { + if name_vec.is_empty() { + Err(InvalidParameters("list of select columns is empty".to_string()).into()) + } else { + Ok(Self::ByName(name_vec)) + } + } else if let Ok(index_vec) = py_list.extract::>() { + if index_vec.is_empty() { + Err(InvalidParameters("list of select columns is empty".to_string()).into()) + } else { + Ok(Self::ByIndex(index_vec)) + } + } else { + Err(InvalidParameters(format!( + "expected list[int] | list[str], got {py_list:?}" + )) + .into()) + } + } + } + } +} + #[pyclass(name = "_ExcelSheet")] pub(crate) struct ExcelSheet { #[pyo3(get)] @@ -87,6 +184,7 @@ pub(crate) struct ExcelSheet { total_height: Option, width: Option, schema_sample_rows: Option, + selected_columns: SelectedColumns, } impl ExcelSheet 
{ @@ -100,6 +198,7 @@ impl ExcelSheet { header: Header, pagination: Pagination, schema_sample_rows: Option, + selected_columns: SelectedColumns, ) -> Self { ExcelSheet { name, @@ -107,6 +206,7 @@ impl ExcelSheet { pagination, data, schema_sample_rows, + selected_columns, height: None, total_height: None, width: None, @@ -267,6 +367,7 @@ impl TryFrom<&ExcelSheet> for Schema { sheet.offset(), // If sample_rows is higher than the sheet's limit, use the limit instead std::cmp::min(sample_rows, sheet.limit()), + &sheet.selected_columns, ) } } @@ -277,50 +378,76 @@ impl TryFrom<&ExcelSheet> for RecordBatch { fn try_from(sheet: &ExcelSheet) -> FastExcelResult { let offset = sheet.offset(); let limit = sheet.limit(); + + let column_names = sheet.column_names(); + + // Ensuring selected columns are valid + sheet + .selected_columns + .validate_columns(&column_names) + .with_context(|| "selected columns are invalid")?; + let schema = Schema::try_from(sheet) - .with_context(|| format!("Could not build schema for sheet {}", sheet.name))?; - let mut iter = schema - .fields() + .with_context(|| format!("could not build schema for sheet {}", sheet.name))?; + + let mut iter = column_names .iter() .enumerate() - .map(|(col_idx, field)| { - ( - field.name(), - match field.data_type() { - ArrowDataType::Boolean => { - create_boolean_array(sheet.data(), col_idx, offset, limit) - } - ArrowDataType::Int64 => { - create_int_array(sheet.data(), col_idx, offset, limit) - } - ArrowDataType::Float64 => { - create_float_array(sheet.data(), col_idx, offset, limit) - } - ArrowDataType::Utf8 => { - create_string_array(sheet.data(), col_idx, offset, limit) - } - ArrowDataType::Timestamp(TimeUnit::Millisecond, None) => { - create_datetime_array(sheet.data(), col_idx, offset, limit) - } - ArrowDataType::Date32 => { - create_date_array(sheet.data(), col_idx, offset, limit) - } - ArrowDataType::Duration(TimeUnit::Millisecond) => { - create_duration_array(sheet.data(), col_idx, offset, limit) - } - 
ArrowDataType::Null => Arc::new(NullArray::new(limit - offset)), - _ => unreachable!(), - }, - ) + .filter_map(|(idx, column_name)| { + // checking if the current column has been selected + if let Some(col_idx) = match sheet.selected_columns { + // All columns selected, return the current index + SelectedColumns::All => Some(idx), + // Otherwise, return its index. If None is found, it means the column was not + // selected, and we will just continue + _ => sheet + .selected_columns + .idx_for_column(&column_names, column_name, idx), + } { + // At this point, we know for sure that the column is in the schema so we can + // safely unwrap + let field = schema.field_with_name(column_name).unwrap(); + Some(( + field.name(), + match field.data_type() { + ArrowDataType::Boolean => { + create_boolean_array(sheet.data(), col_idx, offset, limit) + } + ArrowDataType::Int64 => { + create_int_array(sheet.data(), col_idx, offset, limit) + } + ArrowDataType::Float64 => { + create_float_array(sheet.data(), col_idx, offset, limit) + } + ArrowDataType::Utf8 => { + create_string_array(sheet.data(), col_idx, offset, limit) + } + ArrowDataType::Timestamp(TimeUnit::Millisecond, None) => { + create_datetime_array(sheet.data(), col_idx, offset, limit) + } + ArrowDataType::Date32 => { + create_date_array(sheet.data(), col_idx, offset, limit) + } + ArrowDataType::Duration(TimeUnit::Millisecond) => { + create_duration_array(sheet.data(), col_idx, offset, limit) + } + ArrowDataType::Null => Arc::new(NullArray::new(limit - offset)), + _ => unreachable!(), + }, + )) + } else { + None + } }) .peekable(); + // If the iterable is empty, try_from_iter returns an Err if iter.peek().is_none() { Ok(RecordBatch::new_empty(Arc::new(schema))) } else { RecordBatch::try_from_iter(iter) .map_err(|err| FastExcelErrorKind::ArrowError(err.to_string()).into()) - .with_context(|| format!("Could not convert sheet {} to RecordBatch", sheet.name)) + .with_context(|| format!("could not convert sheet {} to 
RecordBatch", sheet.name)) } } } @@ -361,14 +488,14 @@ impl ExcelSheet { pub fn to_arrow(&self, py: Python<'_>) -> PyResult { RecordBatch::try_from(self) - .with_context(|| format!("Could not create RecordBatch from sheet {}", self.name)) + .with_context(|| format!("could not create RecordBatch from sheet \"{}\"", &self.name)) .and_then(|rb| { rb.to_pyarrow(py) .map_err(|err| FastExcelErrorKind::ArrowError(err.to_string()).into()) }) .with_context(|| { format!( - "Could not convert RecordBatch to pyarrow for sheet {}", + "could not convert RecordBatch to pyarrow for sheet \"{}\"", self.name ) }) @@ -379,3 +506,69 @@ impl ExcelSheet { format!("ExcelSheet<{}>", self.name) } } + +#[cfg(test)] +mod tests { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn selected_columns_from_none() { + assert_eq!( + TryInto::::try_into(None).unwrap(), + SelectedColumns::All + ) + } + + #[test] + fn selected_columns_from_list_of_valid_ints() { + Python::with_gil(|py| { + let py_list = PyList::new(py, vec![0, 1, 2]); + assert_eq!( + TryInto::::try_into(Some(py_list)).unwrap(), + SelectedColumns::ByIndex(vec![0, 1, 2]) + ) + }); + } + + #[test] + fn selected_columns_from_list_of_valid_strings() { + Python::with_gil(|py| { + let py_list = PyList::new(py, vec!["foo", "bar"]); + assert_eq!( + TryInto::::try_into(Some(py_list)).unwrap(), + SelectedColumns::ByName(vec!["foo".to_string(), "bar".to_string()]) + ) + }); + } + + #[test] + fn selected_columns_from_invalid_ints() { + Python::with_gil(|py| { + let py_list = PyList::new(py, vec![0, 2, -1]); + let err = TryInto::::try_into(Some(py_list)).unwrap_err(); + + assert!(matches!(err.kind, FastExcelErrorKind::InvalidParameters(_))); + }); + } + + #[test] + fn selected_columns_from_empty_int_list() { + Python::with_gil(|py| { + let py_list = PyList::new(py, Vec::::new()); + let err = TryInto::::try_into(Some(py_list)).unwrap_err(); + + assert!(matches!(err.kind, FastExcelErrorKind::InvalidParameters(_))); + }); + } + + 
#[test] + fn selected_columns_from_empty_string_list() { + Python::with_gil(|py| { + let py_list = PyList::new(py, Vec::::new()); + let err = TryInto::::try_into(Some(py_list)).unwrap_err(); + + assert!(matches!(err.kind, FastExcelErrorKind::InvalidParameters(_))); + }); + } +} diff --git a/src/utils/arrow.rs b/src/utils/arrow.rs index 33501d9..7da209b 100644 --- a/src/utils/arrow.rs +++ b/src/utils/arrow.rs @@ -3,7 +3,10 @@ use std::{collections::HashSet, sync::OnceLock}; use arrow::datatypes::{DataType as ArrowDataType, Field, Schema, TimeUnit}; use calamine::{CellErrorType, Data as CalData, DataType, Range}; -use crate::error::{FastExcelErrorKind, FastExcelResult}; +use crate::{ + error::{FastExcelErrorKind, FastExcelResult}, + types::excelsheet::SelectedColumns, +}; /// All the possible string values that should be considered as NULL const NULL_STRING_VALUES: [&str; 19] = [ @@ -136,12 +139,20 @@ pub(crate) fn arrow_schema_from_column_names_and_range( column_names: &[String], row_idx: usize, row_limit: usize, + selected_columns: &SelectedColumns, ) -> FastExcelResult { let mut fields = Vec::with_capacity(column_names.len()); - for (col_idx, name) in column_names.iter().enumerate() { - let col_type = get_arrow_column_type(range, row_idx, row_limit, col_idx)?; - fields.push(Field::new(&alias_for_name(name, &fields), col_type, true)); + for (idx, name) in column_names.iter().enumerate() { + // If we have an index for the given column, extract it and add it to the schema. 
Otherwise, + // just ignore it + if let Some(col_idx) = match selected_columns { + SelectedColumns::All => Some(idx), + _ => selected_columns.idx_for_column(column_names, name, idx), + } { + let col_type = get_arrow_column_type(range, row_idx, row_limit, col_idx)?; + fields.push(Field::new(&alias_for_name(name, &fields), col_type, true)); + } } Ok(Schema::new(fields)) From 309f8b43732f17c47a57d6be636e9fcaf1fe377f Mon Sep 17 00:00:00 2001 From: Luka Peschke Date: Mon, 26 Feb 2024 11:49:27 +0100 Subject: [PATCH 06/14] fix: mypy should be in lint-python make target Signed-off-by: Luka Peschke --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 46ad571..f7c29fe 100644 --- a/Makefile +++ b/Makefile @@ -16,9 +16,9 @@ pdoc = pdoc -o docs python/fastexcel lint-python: $(ruff) $(format) --check --diff + $(mypy) lint-rust: - $(mypy) $(clippy) lint: lint-rust lint-python From 3acd8c81a8d8ab158981c633ce894b4da0bafd8e Mon Sep 17 00:00:00 2001 From: Luka Peschke Date: Mon, 26 Feb 2024 11:50:21 +0100 Subject: [PATCH 07/14] typo Signed-off-by: Luka Peschke --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 96e8512..1210ccf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,5 +30,5 @@ rstest = { version = "0.18.2", default-features = false } extension-module = ["pyo3/extension-module"] default = ["extension-module"] # feature for tests only. 
This makes Python::with_gil auto-initialize Python -# interpreters, which allows us ot instantiate Python objects in tests +# interpreters, which allows us to instantiate Python objects in tests tests = ["pyo3/auto-initialize"] From 8ff4192db63953845b904a4a92fae47d0ced7787 Mon Sep 17 00:00:00 2001 From: Luka Peschke Date: Mon, 26 Feb 2024 11:54:16 +0100 Subject: [PATCH 08/14] fix(tests): use sheet_name_or_idx Signed-off-by: Luka Peschke --- python/tests/test_column_selection.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/python/tests/test_column_selection.py b/python/tests/test_column_selection.py index 4107e07..230872c 100644 --- a/python/tests/test_column_selection.py +++ b/python/tests/test_column_selection.py @@ -40,9 +40,11 @@ def test_single_sheet_all_columns(excel_reader_single_sheet: fastexcel.ExcelRead def test_single_sheet_subset_by_str(excel_reader_single_sheet: fastexcel.ExcelReader) -> None: expected = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]} - for sheet_name_or_idx in [0, "January"]: + # looks like mypy 1.8 became more stupid + sheets: list[str | int] = [0, "January"] + for sheet_name_or_idx in sheets: for col in ["Month", "Year"]: - sheet = excel_reader_single_sheet.load_sheet(0, use_columns=[col]) + sheet = excel_reader_single_sheet.load_sheet(sheet_name_or_idx, use_columns=[col]) pd_df = sheet.to_pandas() pd_assert_frame_equal(pd_df, pd.DataFrame({col: expected[col]})) @@ -54,9 +56,10 @@ def test_single_sheet_subset_by_str(excel_reader_single_sheet: fastexcel.ExcelRe def test_single_sheet_subset_by_index(excel_reader_single_sheet: fastexcel.ExcelReader) -> None: expected = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]} - for sheet_name_or_idx in [0, "January"]: + sheets: list[str | int] = [0, "January"] + for sheet_name_or_idx in sheets: for idx, col_name in enumerate(["Month", "Year"]): - sheet = excel_reader_single_sheet.load_sheet(0, use_columns=[idx]) + sheet = 
excel_reader_single_sheet.load_sheet(sheet_name_or_idx, use_columns=[idx]) pd_df = sheet.to_pandas() pd_assert_frame_equal(pd_df, pd.DataFrame({col_name: expected[col_name]})) From bbb35423ec6e13ea855747ca7c73abbe80978ce8 Mon Sep 17 00:00:00 2001 From: Luka Peschke Date: Mon, 26 Feb 2024 12:11:56 +0100 Subject: [PATCH 09/14] refactor: only check for py_list lenght once Signed-off-by: Luka Peschke --- python/tests/test_column_selection.py | 2 +- src/types/excelsheet.rs | 16 +++++----------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/python/tests/test_column_selection.py b/python/tests/test_column_selection.py index 230872c..eff55f0 100644 --- a/python/tests/test_column_selection.py +++ b/python/tests/test_column_selection.py @@ -218,7 +218,7 @@ def test_single_sheet_invalid_column_indices_negative_integer( def test_single_sheet_invalid_column_indices_empty_list( excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader, ) -> None: - expected_message = """invalid parameters: list of select columns is empty + expected_message = """invalid parameters: list of selected columns is empty Context: 0: expected selected columns to be list[str] | list[int] | None, got Some([]) """ diff --git a/src/types/excelsheet.rs b/src/types/excelsheet.rs index 964f07a..8da0c32 100644 --- a/src/types/excelsheet.rs +++ b/src/types/excelsheet.rs @@ -150,18 +150,12 @@ impl TryFrom> for SelectedColumns { match value { None => Ok(Self::All), Some(py_list) => { - if let Ok(name_vec) = py_list.extract::>() { - if name_vec.is_empty() { - Err(InvalidParameters("list of select columns is empty".to_string()).into()) - } else { - Ok(Self::ByName(name_vec)) - } + if py_list.is_empty() { + Err(InvalidParameters("list of selected columns is empty".to_string()).into()) + } else if let Ok(name_vec) = py_list.extract::>() { + Ok(Self::ByName(name_vec)) } else if let Ok(index_vec) = py_list.extract::>() { - if index_vec.is_empty() { - Err(InvalidParameters("list of select 
columns is empty".to_string()).into()) - } else { - Ok(Self::ByIndex(index_vec)) - } + Ok(Self::ByIndex(index_vec)) } else { Err(InvalidParameters(format!( "expected list[int] | list[str], got {py_list:?}" From 36c054d59c2a3d81c1c6888b54c27d02c8d4f6e6 Mon Sep 17 00:00:00 2001 From: Luka Peschke Date: Mon, 26 Feb 2024 12:15:19 +0100 Subject: [PATCH 10/14] docs: documented use_columns param Signed-off-by: Luka Peschke --- python/fastexcel/__init__.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/fastexcel/__init__.py b/python/fastexcel/__init__.py index 4776c21..3dab411 100644 --- a/python/fastexcel/__init__.py +++ b/python/fastexcel/__init__.py @@ -119,6 +119,9 @@ def load_sheet_by_name( :param schema_sample_rows: Specifies how many rows should be used to determine the dtype of a column. If `None`, all rows will be used. + :param use_columns: Specifies the columns to use. Can either be a list of column names, or + a list of column indices (starting at 0). + If `None`, all columns will be used. """ return ExcelSheet( self._reader.load_sheet_by_name( @@ -158,6 +161,9 @@ def load_sheet_by_idx( :param schema_sample_rows: Specifies how many rows should be used to determine the dtype of a column. If `None`, all rows will be used. + :param use_columns: Specifies the columns to use. Can either be a list of column names, or + a list of column indices (starting at 0). + If `None`, all columns will be used. 
""" if idx < 0: raise ValueError(f"Expected idx to be > 0, got {idx}") From f5a8d4dba834f53a53e4983cc457fd133801217d Mon Sep 17 00:00:00 2001 From: Luka Peschke Date: Mon, 26 Feb 2024 12:16:19 +0100 Subject: [PATCH 11/14] ci: move check-docs jobs higher in the CI file to get the result on top in PR previews Signed-off-by: Luka Peschke --- .github/workflows/CI.yml | 56 ++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 732fdc2..f937e1f 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -46,6 +46,34 @@ jobs: source .venv/bin/activate make lint + check-docs: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + - name: Set up rust toolchain + uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + - run: | + git config user.name github-actions + git config user.email github-actions@github.com + + # venv required by maturin + python3 -m venv .venv + source .venv/bin/activate + + make install-test-requirements + make install-doc-requirements + # Required for pdoc to be able to import the sources + make dev-install + make doc + # GitHub provides only x86_64 runners, so we cannot test on arm architecture test: runs-on: ${{ matrix.os }} @@ -110,31 +138,3 @@ jobs: command: build args: "-o dist --interpreter python${{ matrix.python-version }}" target: ${{ steps.target.outputs.target }} - - check-docs: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.11" - - name: Set up rust toolchain - uses: actions-rs/toolchain@v1 - with: - profile: minimal - toolchain: stable - override: true - - run: | - git config user.name github-actions - git config user.email github-actions@github.com - - # venv required by maturin - 
python3 -m venv .venv - source .venv/bin/activate - - make install-test-requirements - make install-doc-requirements - # Required for pdoc to be able to import the sources - make dev-install - make doc From 045a11428f1dfa2a011b719436b07cf7b05fa91c Mon Sep 17 00:00:00 2001 From: Luka Peschke Date: Mon, 26 Feb 2024 12:30:49 +0100 Subject: [PATCH 12/14] feat: added a selected_columns property to ExcelSheet Signed-off-by: Luka Peschke --- python/fastexcel/__init__.py | 5 +++++ python/fastexcel/_fastexcel.pyi | 3 +++ python/tests/test_column_selection.py | 5 +++++ src/types/excelsheet.rs | 13 +++++++++++++ 4 files changed, 26 insertions(+) diff --git a/python/fastexcel/__init__.py b/python/fastexcel/__init__.py index 3dab411..18d4992 100644 --- a/python/fastexcel/__init__.py +++ b/python/fastexcel/__init__.py @@ -55,6 +55,11 @@ def total_height(self) -> int: """The sheet's total height""" return self._sheet.total_height + @property + def selected_columns(self) -> list[str] | list[int] | None: + """The sheet's selected columns""" + return self._sheet.selected_columns + def to_arrow(self) -> pa.RecordBatch: """Converts the sheet to a pyarrow `RecordBatch`""" return self._sheet.to_arrow() diff --git a/python/fastexcel/_fastexcel.pyi b/python/fastexcel/_fastexcel.pyi index 7db8493..df7ead0 100644 --- a/python/fastexcel/_fastexcel.pyi +++ b/python/fastexcel/_fastexcel.pyi @@ -18,6 +18,9 @@ class _ExcelSheet: @property def offset(self) -> int: """The sheet's offset before data starts""" + @property + def selected_columns(self) -> list[str] | list[int] | None: + """The sheet's selected columns""" def to_arrow(self) -> pa.RecordBatch: """Converts the sheet to a pyarrow `RecordBatch`""" diff --git a/python/tests/test_column_selection.py b/python/tests/test_column_selection.py index eff55f0..ab0cd07 100644 --- a/python/tests/test_column_selection.py +++ b/python/tests/test_column_selection.py @@ -21,6 +21,7 @@ def test_single_sheet_all_columns(excel_reader_single_sheet: 
fastexcel.ExcelRead sheet = excel_reader_single_sheet.load_sheet(0) sheet_explicit_arg = excel_reader_single_sheet.load_sheet(0, use_columns=None) + assert sheet.selected_columns is None expected = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]} expected_pd_df = pd.DataFrame(expected) @@ -45,6 +46,7 @@ def test_single_sheet_subset_by_str(excel_reader_single_sheet: fastexcel.ExcelRe for sheet_name_or_idx in sheets: for col in ["Month", "Year"]: sheet = excel_reader_single_sheet.load_sheet(sheet_name_or_idx, use_columns=[col]) + assert sheet.selected_columns == [col] pd_df = sheet.to_pandas() pd_assert_frame_equal(pd_df, pd.DataFrame({col: expected[col]})) @@ -60,6 +62,7 @@ def test_single_sheet_subset_by_index(excel_reader_single_sheet: fastexcel.Excel for sheet_name_or_idx in sheets: for idx, col_name in enumerate(["Month", "Year"]): sheet = excel_reader_single_sheet.load_sheet(sheet_name_or_idx, use_columns=[idx]) + assert sheet.selected_columns == [idx] pd_df = sheet.to_pandas() pd_assert_frame_equal(pd_df, pd.DataFrame({col_name: expected[col_name]})) @@ -97,6 +100,7 @@ def test_single_sheet_with_unnamed_columns( sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_str ) + assert sheet.selected_columns == use_columns_str pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) @@ -104,6 +108,7 @@ def test_single_sheet_with_unnamed_columns( sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_idx ) + assert sheet.selected_columns == use_columns_idx pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) diff --git a/src/types/excelsheet.rs b/src/types/excelsheet.rs index 8da0c32..b765306 100644 --- a/src/types/excelsheet.rs +++ b/src/types/excelsheet.rs @@ -139,6 +139,14 @@ impl SelectedColumns { 
} } } + + pub(crate) fn to_python<'p>(&self, py: Python<'p>) -> Option<&'p PyList> { + match self { + SelectedColumns::All => None, + SelectedColumns::ByIndex(idx_vec) => Some(PyList::new(py, idx_vec)), + SelectedColumns::ByName(name_vec) => Some(PyList::new(py, name_vec)), + } + } } impl TryFrom> for SelectedColumns { @@ -480,6 +488,11 @@ impl ExcelSheet { self.header.offset() + self.pagination.offset() } + #[getter] + pub fn selected_columns<'p>(&'p self, py: Python<'p>) -> Option<&PyList> { + self.selected_columns.to_python(py) + } + pub fn to_arrow(&self, py: Python<'_>) -> PyResult { RecordBatch::try_from(self) .with_context(|| format!("could not create RecordBatch from sheet \"{}\"", &self.name)) From 1edc77a6af945613ab3790d431dd5be6eee4bd9c Mon Sep 17 00:00:00 2001 From: Luka Peschke Date: Mon, 26 Feb 2024 14:27:39 +0100 Subject: [PATCH 13/14] refactor: validate columns on sheet instantiation Signed-off-by: Luka Peschke --- python/tests/test_column_selection.py | 14 +++----- src/types/excelreader.rs | 10 +++--- src/types/excelsheet.rs | 49 +++++++++++++++++---------- test.py | 5 ++- 4 files changed, 45 insertions(+), 33 deletions(-) diff --git a/python/tests/test_column_selection.py b/python/tests/test_column_selection.py index ab0cd07..2129173 100644 --- a/python/tests/test_column_selection.py +++ b/python/tests/test_column_selection.py @@ -236,14 +236,10 @@ def test_single_sheet_invalid_column_indices_column_does_not_exist_str( ) -> None: expected_message = """column with name "nope" not found Context: - 0: selected columns are invalid - 1: could not create RecordBatch from sheet "January" - 2: could not convert RecordBatch to pyarrow for sheet "January" + 0: selected columns are invalid, available columns are: ["Month", "Year"] """ with pytest.raises(fastexcel.ColumnNotFoundError, match=re.escape(expected_message)): - excel_reader_single_sheet_with_unnamed_columns.load_sheet( - 0, use_columns=["nope"] - ).to_arrow() + 
excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=["nope"]) def test_single_sheet_invalid_column_indices_column_does_not_exist_int( @@ -251,9 +247,7 @@ def test_single_sheet_invalid_column_indices_column_does_not_exist_int( ) -> None: expected_message = """column at index 42 not found Context: - 0: selected columns are invalid - 1: could not create RecordBatch from sheet "January" - 2: could not convert RecordBatch to pyarrow for sheet "January" + 0: selected columns are invalid, available columns are: ["Month", "Year"] """ with pytest.raises(fastexcel.ColumnNotFoundError, match=re.escape(expected_message)): - excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[42]).to_arrow() + excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[42]) diff --git a/src/types/excelreader.rs b/src/types/excelreader.rs index e6986da..f198061 100644 --- a/src/types/excelreader.rs +++ b/src/types/excelreader.rs @@ -73,14 +73,15 @@ impl ExcelReader { let header = Header::new(header_row, column_names); let pagination = Pagination::new(skip_rows, n_rows, &range).into_pyresult()?; let selected_columns = use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | None, got {use_columns:?}")).into_pyresult()?; - Ok(ExcelSheet::new( + ExcelSheet::try_new( name, range, header, pagination, schema_sample_rows, selected_columns, - )) + ) + .into_pyresult() } #[pyo3(signature = ( @@ -131,13 +132,14 @@ impl ExcelReader { let header = Header::new(header_row, column_names); let pagination = Pagination::new(skip_rows, n_rows, &range).into_pyresult()?; let selected_columns = use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | None, got {use_columns:?}")).into_pyresult()?; - Ok(ExcelSheet::new( + ExcelSheet::try_new( name, range, header, pagination, schema_sample_rows, selected_columns, - )) + ) + .into_pyresult() } } diff --git 
a/src/types/excelsheet.rs b/src/types/excelsheet.rs index b765306..6da29fa 100644 --- a/src/types/excelsheet.rs +++ b/src/types/excelsheet.rs @@ -187,6 +187,7 @@ pub(crate) struct ExcelSheet { width: Option, schema_sample_rows: Option, selected_columns: SelectedColumns, + available_columns: Vec, } impl ExcelSheet { @@ -194,15 +195,15 @@ impl ExcelSheet { &self.data } - pub(crate) fn new( + pub(crate) fn try_new( name: String, data: Range, header: Header, pagination: Pagination, schema_sample_rows: Option, selected_columns: SelectedColumns, - ) -> Self { - ExcelSheet { + ) -> FastExcelResult { + let mut sheet = ExcelSheet { name, header, pagination, @@ -212,10 +213,27 @@ impl ExcelSheet { height: None, total_height: None, width: None, - } + // an empty vec as it will be replaced + available_columns: Vec::with_capacity(0), + }; + + let available_columns = sheet.get_available_columns(); + + // Ensuring selected columns are valid + sheet + .selected_columns + .validate_columns(&available_columns) + .with_context(|| { + format!( + "selected columns are invalid, available columns are: {available_columns:?}" + ) + })?; + + sheet.available_columns = available_columns; + Ok(sheet) } - pub(crate) fn column_names(&self) -> Vec { + fn get_available_columns(&self) -> Vec { let width = self.data.width(); match &self.header { Header::None => (0..width) @@ -365,7 +383,7 @@ impl TryFrom<&ExcelSheet> for Schema { arrow_schema_from_column_names_and_range( sheet.data(), - &sheet.column_names(), + &sheet.available_columns, sheet.offset(), // If sample_rows is higher than the sheet's limit, use the limit instead std::cmp::min(sample_rows, sheet.limit()), @@ -381,18 +399,11 @@ impl TryFrom<&ExcelSheet> for RecordBatch { let offset = sheet.offset(); let limit = sheet.limit(); - let column_names = sheet.column_names(); - - // Ensuring selected columns are valid - sheet - .selected_columns - .validate_columns(&column_names) - .with_context(|| "selected columns are invalid")?; - let schema = 
Schema::try_from(sheet) .with_context(|| format!("could not build schema for sheet {}", sheet.name))?; - let mut iter = column_names + let mut iter = sheet + .available_columns .iter() .enumerate() .filter_map(|(idx, column_name)| { @@ -402,9 +413,11 @@ impl TryFrom<&ExcelSheet> for RecordBatch { SelectedColumns::All => Some(idx), // Otherwise, return its index. If None is found, it means the column was not // selected, and we will just continue - _ => sheet - .selected_columns - .idx_for_column(&column_names, column_name, idx), + _ => sheet.selected_columns.idx_for_column( + &sheet.available_columns, + column_name, + idx, + ), } { // At this point, we know for sure that the column is in the schema so we can // safely unwrap diff --git a/test.py b/test.py index 163f028..7ce0f28 100644 --- a/test.py +++ b/test.py @@ -6,14 +6,17 @@ def get_args() -> argparse.Namespace: parser = argparse.ArgumentParser() parser.add_argument("file") + parser.add_argument("-c", "--column", type=str, nargs="+", help="the columns to use") return parser.parse_args() def main(): args = get_args() excel_file = fastexcel.read_excel(args.file) + use_columns = args.column or None + for sheet_name in excel_file.sheet_names: - excel_file.load_sheet_by_name(sheet_name).to_pandas() + excel_file.load_sheet_by_name(sheet_name, use_columns=use_columns).to_arrow() if __name__ == "__main__": From 61cfc3936b29cb4a7908c593ec9feadaa70911bf Mon Sep 17 00:00:00 2001 From: Luka Peschke Date: Mon, 26 Feb 2024 18:06:58 +0100 Subject: [PATCH 14/14] feat: added an available_columns property Signed-off-by: Luka Peschke --- python/fastexcel/__init__.py | 5 +++++ python/fastexcel/_fastexcel.pyi | 3 +++ python/tests/test_column_selection.py | 13 +++++++++++++ src/types/excelsheet.rs | 5 +++++ 4 files changed, 26 insertions(+) diff --git a/python/fastexcel/__init__.py b/python/fastexcel/__init__.py index 18d4992..b57f8f4 100644 --- a/python/fastexcel/__init__.py +++ b/python/fastexcel/__init__.py @@ -60,6 +60,11 @@ 
def selected_columns(self) -> list[str] | list[int] | None: """The sheet's selected columns""" return self._sheet.selected_columns + @property + def available_columns(self) -> list[str]: + """The columns available for the given sheet""" + return self._sheet.available_columns + def to_arrow(self) -> pa.RecordBatch: """Converts the sheet to a pyarrow `RecordBatch`""" return self._sheet.to_arrow() diff --git a/python/fastexcel/_fastexcel.pyi b/python/fastexcel/_fastexcel.pyi index df7ead0..b4e2c36 100644 --- a/python/fastexcel/_fastexcel.pyi +++ b/python/fastexcel/_fastexcel.pyi @@ -21,6 +21,9 @@ class _ExcelSheet: @property def selected_columns(self) -> list[str] | list[int] | None: """The sheet's selected columns""" + @property + def available_columns(self) -> list[str]: + """The columns available for the given sheet""" def to_arrow(self) -> pa.RecordBatch: """Converts the sheet to a pyarrow `RecordBatch`""" diff --git a/python/tests/test_column_selection.py b/python/tests/test_column_selection.py index 2129173..8fa363c 100644 --- a/python/tests/test_column_selection.py +++ b/python/tests/test_column_selection.py @@ -22,6 +22,7 @@ def test_single_sheet_all_columns(excel_reader_single_sheet: fastexcel.ExcelRead sheet_explicit_arg = excel_reader_single_sheet.load_sheet(0, use_columns=None) assert sheet.selected_columns is None + assert sheet.available_columns == ["Month", "Year"] expected = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]} expected_pd_df = pd.DataFrame(expected) @@ -47,6 +48,7 @@ def test_single_sheet_subset_by_str(excel_reader_single_sheet: fastexcel.ExcelRe for col in ["Month", "Year"]: sheet = excel_reader_single_sheet.load_sheet(sheet_name_or_idx, use_columns=[col]) assert sheet.selected_columns == [col] + assert sheet.available_columns == ["Month", "Year"] pd_df = sheet.to_pandas() pd_assert_frame_equal(pd_df, pd.DataFrame({col: expected[col]})) @@ -63,6 +65,7 @@ def test_single_sheet_subset_by_index(excel_reader_single_sheet: fastexcel.Excel for 
idx, col_name in enumerate(["Month", "Year"]): sheet = excel_reader_single_sheet.load_sheet(sheet_name_or_idx, use_columns=[idx]) assert sheet.selected_columns == [idx] + assert sheet.available_columns == ["Month", "Year"] pd_df = sheet.to_pandas() pd_assert_frame_equal(pd_df, pd.DataFrame({col_name: expected[col_name]})) @@ -101,6 +104,7 @@ def test_single_sheet_with_unnamed_columns( "With unnamed columns", use_columns=use_columns_str ) assert sheet.selected_columns == use_columns_str + assert sheet.available_columns == ["col1", "__UNNAMED__1", "col3", "__UNNAMED__3", "col5"] pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) @@ -109,6 +113,7 @@ def test_single_sheet_with_unnamed_columns( "With unnamed columns", use_columns=use_columns_idx ) assert sheet.selected_columns == use_columns_idx + assert sheet.available_columns == ["col1", "__UNNAMED__1", "col3", "__UNNAMED__3", "col5"] pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) @@ -131,6 +136,7 @@ def test_single_sheet_with_unnamed_columns_and_pagination( sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_str, n_rows=1 ) + assert sheet.available_columns == ["col1", "__UNNAMED__1", "col3", "__UNNAMED__3", "col5"] pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) @@ -138,6 +144,7 @@ def test_single_sheet_with_unnamed_columns_and_pagination( sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_idx, n_rows=1 ) + assert sheet.available_columns == ["col1", "__UNNAMED__1", "col3", "__UNNAMED__3", "col5"] pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) @@ -152,6 +159,7 @@ def 
test_single_sheet_with_unnamed_columns_and_pagination( sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_str, skip_rows=1 ) + assert sheet.available_columns == ["col1", "__UNNAMED__1", "col3", "__UNNAMED__3", "col5"] pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) @@ -159,6 +167,7 @@ def test_single_sheet_with_unnamed_columns_and_pagination( sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_idx, skip_rows=1 ) + assert sheet.available_columns == ["col1", "__UNNAMED__1", "col3", "__UNNAMED__3", "col5"] pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) @@ -180,6 +189,7 @@ def test_single_sheet_with_unnamed_columns_and_pagination_and_column_names( sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_str, skip_rows=1, column_names=column_names ) + assert sheet.available_columns == column_names pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) @@ -187,6 +197,7 @@ def test_single_sheet_with_unnamed_columns_and_pagination_and_column_names( sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_idx, skip_rows=1, column_names=column_names ) + assert sheet.available_columns == column_names pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) @@ -197,6 +208,7 @@ def test_single_sheet_with_unnamed_columns_and_pagination_and_column_names( sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_str, skip_rows=2, column_names=column_names ) + assert 
sheet.available_columns == column_names pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected_first_row_skipped)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected_first_row_skipped)) @@ -204,6 +216,7 @@ def test_single_sheet_with_unnamed_columns_and_pagination_and_column_names( sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_idx, skip_rows=2, column_names=column_names ) + assert sheet.available_columns == column_names pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected_first_row_skipped)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected_first_row_skipped)) diff --git a/src/types/excelsheet.rs b/src/types/excelsheet.rs index 6da29fa..5f42dff 100644 --- a/src/types/excelsheet.rs +++ b/src/types/excelsheet.rs @@ -506,6 +506,11 @@ impl ExcelSheet { self.selected_columns.to_python(py) } + #[getter] + pub fn available_columns<'p>(&'p self, py: Python<'p>) -> &PyList { + PyList::new(py, &self.available_columns) + } + pub fn to_arrow(&self, py: Python<'_>) -> PyResult { RecordBatch::try_from(self) .with_context(|| format!("could not create RecordBatch from sheet \"{}\"", &self.name))