Skip to content

Commit

Permalink
feat: support column ranges in string format in use_columns (#190)
Browse files Browse the repository at this point in the history
* feat: support column ranges in string format in use_columns

Signed-off-by: Luka Peschke <[email protected]>

* support ranges beyond Z

Signed-off-by: Luka Peschke <[email protected]>

* adapt docstrings

Signed-off-by: Luka Peschke <[email protected]>

* remove outdated comment

Signed-off-by: Luka Peschke <[email protected]>

* refactor: make end of range inclusive

Signed-off-by: Luka Peschke <[email protected]>

* fix python test

Signed-off-by: Luka Peschke <[email protected]>

---------

Signed-off-by: Luka Peschke <[email protected]>
  • Loading branch information
lukapeschke authored Feb 27, 2024
1 parent 5ac369e commit e1fcd7c
Show file tree
Hide file tree
Showing 5 changed files with 257 additions and 39 deletions.
26 changes: 17 additions & 9 deletions python/fastexcel/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def load_sheet_by_name(
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | None = None,
use_columns: list[str] | list[int] | str | None = None,
) -> ExcelSheet:
"""Loads a sheet by name.
Expand All @@ -129,9 +129,13 @@ def load_sheet_by_name(
:param schema_sample_rows: Specifies how many rows should be used to determine
the dtype of a column.
If `None`, all rows will be used.
:param use_columns: Specifies the columns to use. Can either be a list of column names, or
a list of column indices (starting at 0).
If `None`, all columns will be used.
:param use_columns: Specifies the columns to use. Can either be:
- `None` to select all columns
- a list of strings, the column names
- a list of ints, the column indices (starting at 0)
- a string, a comma-separated list of Excel column letters and column
ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in
`A,B,C,D,E` and `A,C,E,F` respectively)
"""
return ExcelSheet(
self._reader.load_sheet_by_name(
Expand All @@ -154,7 +158,7 @@ def load_sheet_by_idx(
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | None = None,
use_columns: list[str] | list[int] | str | None = None,
) -> ExcelSheet:
"""Loads a sheet by index.
Expand All @@ -171,9 +175,13 @@ def load_sheet_by_idx(
:param schema_sample_rows: Specifies how many rows should be used to determine
the dtype of a column.
If `None`, all rows will be used.
:param use_columns: Specifies the columns to use. Can either be a list of column names, or
a list of column indices (starting at 0).
If `None`, all columns will be used.
:param use_columns: Specifies the columns to use. Can either be:
- `None` to select all columns
- a list of strings, the column names
- a list of ints, the column indices (starting at 0)
- a string, a comma-separated list of Excel column letters and column
ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in
`A,B,C,D,E` and `A,C,E,F` respectively)
"""
if idx < 0:
raise ValueError(f"Expected idx to be > 0, got {idx}")
Expand All @@ -198,7 +206,7 @@ def load_sheet(
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | None = None,
use_columns: list[str] | list[int] | str | None = None,
) -> ExcelSheet:
"""Loads a sheet by name if a string is passed or by index if an integer is passed.
Expand Down
4 changes: 2 additions & 2 deletions python/fastexcel/_fastexcel.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ class _ExcelReader:
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | None = None,
use_columns: list[str] | list[int] | str | None = None,
) -> _ExcelSheet: ...
def load_sheet_by_idx(
self,
Expand All @@ -50,7 +50,7 @@ class _ExcelReader:
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | None = None,
use_columns: list[str] | list[int] | str | None = None,
) -> _ExcelSheet: ...
@property
def sheet_names(self) -> list[str]: ...
Expand Down
26 changes: 24 additions & 2 deletions python/tests/test_column_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,12 +222,33 @@ def test_single_sheet_with_unnamed_columns_and_pagination_and_column_names(
pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected_first_row_skipped))


def test_single_sheet_with_unnamed_columns_and_str_range(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
    single_sheet_with_unnamed_columns_expected: dict[str, list[Any]],
) -> None:
    """Check column selection through an Excel-style letter/range string.

    The sheet is loaded with ``use_columns="A,C:E"``; the test asserts that
    this resolves to column indices 0, 2, 3 and 4, that every column is still
    reported as available, and that both the pandas and the polars outputs
    contain only the selected columns.
    """
    column_spec = "A,C:E"
    expected_indices = [0, 2, 3, 4]
    kept_names = ["col1", "col3", "__UNNAMED__3", "col5"]

    # Build the expected frame data by filtering the fixture, preserving its key order.
    expected_data: dict[str, list[Any]] = {}
    for name, values in single_sheet_with_unnamed_columns_expected.items():
        if name in kept_names:
            expected_data[name] = values

    sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
        "With unnamed columns", use_columns=column_spec
    )

    assert sheet.selected_columns == expected_indices
    # Selection must not shrink the list of columns the sheet advertises.
    assert sheet.available_columns == ["col1", "__UNNAMED__1", "col3", "__UNNAMED__3", "col5"]
    pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected_data))
    pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected_data))


def test_single_sheet_invalid_column_indices_negative_integer(
excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
) -> None:
expected_message = """invalid parameters: expected list[int] | list[str], got [-2]
Context:
0: expected selected columns to be list[str] | list[int] | None, got Some([-2])
0: could not determine selected columns from provided object: [-2]
1: expected selected columns to be list[str] | list[int] | str | None, got Some([-2])
"""
with pytest.raises(fastexcel.InvalidParametersError, match=re.escape(expected_message)):
excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[-2])
Expand All @@ -238,7 +259,8 @@ def test_single_sheet_invalid_column_indices_empty_list(
) -> None:
expected_message = """invalid parameters: list of selected columns is empty
Context:
0: expected selected columns to be list[str] | list[int] | None, got Some([])
0: could not determine selected columns from provided object: []
1: expected selected columns to be list[str] | list[int] | str | None, got Some([])
"""
with pytest.raises(fastexcel.InvalidParametersError, match=re.escape(expected_message)):
excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[])
Expand Down
11 changes: 6 additions & 5 deletions src/types/excelreader.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use std::{fs::File, io::BufReader};

use calamine::{open_workbook_auto, Reader, Sheets};
use pyo3::{pyclass, pymethods, types::PyList, PyResult};
use pyo3::{pyclass, pymethods, PyAny, PyResult};

use crate::error::{
py_errors::IntoPyResult, ErrorContext, FastExcelErrorKind, FastExcelResult, IdxOrName,
Expand Down Expand Up @@ -61,7 +61,8 @@ impl ExcelReader {
skip_rows: usize,
n_rows: Option<usize>,
schema_sample_rows: Option<usize>,
use_columns: Option<&PyList>,
// pyo3 requires this to be an `Option` because the Python-side default value is `None`
use_columns: Option<&PyAny>,
) -> PyResult<ExcelSheet> {
let range = self
.sheets
Expand All @@ -72,7 +73,7 @@ impl ExcelReader {

let header = Header::new(header_row, column_names);
let pagination = Pagination::new(skip_rows, n_rows, &range).into_pyresult()?;
let selected_columns = use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | None, got {use_columns:?}")).into_pyresult()?;
let selected_columns = use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | str | None, got {use_columns:?}")).into_pyresult()?;
ExcelSheet::try_new(
name,
range,
Expand Down Expand Up @@ -103,7 +104,7 @@ impl ExcelReader {
skip_rows: usize,
n_rows: Option<usize>,
schema_sample_rows: Option<usize>,
use_columns: Option<&PyList>,
use_columns: Option<&PyAny>,
) -> PyResult<ExcelSheet> {
let name = self
.sheet_names
Expand Down Expand Up @@ -131,7 +132,7 @@ impl ExcelReader {

let header = Header::new(header_row, column_names);
let pagination = Pagination::new(skip_rows, n_rows, &range).into_pyresult()?;
let selected_columns = use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | None, got {use_columns:?}")).into_pyresult()?;
let selected_columns = use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | str | None, got {use_columns:?}")).into_pyresult()?;
ExcelSheet::try_new(
name,
range,
Expand Down
Loading

0 comments on commit e1fcd7c

Please sign in to comment.