feat: support column ranges in string format in use_columns (#190)

* feat: support column ranges in string format in use_columns
  Signed-off-by: Luka Peschke <[email protected]>
* support ranges beyond Z
  Signed-off-by: Luka Peschke <[email protected]>
* adapt docstrings
  Signed-off-by: Luka Peschke <[email protected]>
* remove outdated comment
  Signed-off-by: Luka Peschke <[email protected]>
* refactor: make end of range inclusive
  Signed-off-by: Luka Peschke <[email protected]>
* fix python test
  Signed-off-by: Luka Peschke <[email protected]>

---------

Signed-off-by: Luka Peschke <[email protected]>
ToucanToco · Feb 27, 2024 · e1fcd7c · e1fcd7c
1 parent 5ac369e
commit e1fcd7c
Show file tree

Hide file tree

Showing 5 changed files with 257 additions and 39 deletions.
diff --git a/python/fastexcel/__init__.py b/python/fastexcel/__init__.py
@@ -112,7 +112,7 @@ def load_sheet_by_name(
         skip_rows: int = 0,
         n_rows: int | None = None,
         schema_sample_rows: int | None = 1_000,
-        use_columns: list[str] | list[int] | None = None,
+        use_columns: list[str] | list[int] | str | None = None,
     ) -> ExcelSheet:
         """Loads a sheet by name.
 
@@ -129,9 +129,13 @@ def load_sheet_by_name(
         :param schema_sample_rows: Specifies how many rows should be used to determine
                                    the dtype of a column.
                                    If `None`, all rows will be used.
-        :param use_columns: Specifies the columns to use. Can either be a list of column names, or
-                            a list of column indices (starting at 0).
-                            If `None`, all columns will be used.
+        :param use_columns: Specifies the columns to use. Can either be:
+                            - `None` to select all columns
+                            - a list of strings, the column names
+                            - a list of ints, the column indices (starting at 0)
+                            - a string, a comma separated list of Excel column letters and column
+                              ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in
+                              `A,B,C,D,E` and `A,C,E,F`)
         """
         return ExcelSheet(
             self._reader.load_sheet_by_name(
@@ -154,7 +158,7 @@ def load_sheet_by_idx(
         skip_rows: int = 0,
         n_rows: int | None = None,
         schema_sample_rows: int | None = 1_000,
-        use_columns: list[str] | list[int] | None = None,
+        use_columns: list[str] | list[int] | str | None = None,
     ) -> ExcelSheet:
         """Loads a sheet by index.
 
@@ -171,9 +175,13 @@ def load_sheet_by_idx(
         :param schema_sample_rows: Specifies how many rows should be used to determine
                                    the dtype of a column.
                                    If `None`, all rows will be used.
-        :param use_columns: Specifies the columns to use. Can either be a list of column names, or
-                            a list of column indices (starting at 0).
-                            If `None`, all columns will be used.
+        :param use_columns: Specifies the columns to use. Can either be:
+                            - `None` to select all columns
+                            - a list of strings, the column names
+                            - a list of ints, the column indices (starting at 0)
+                            - a string, a comma separated list of Excel column letters and column
+                              ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in
+                              `A,B,C,D,E` and `A,C,E,F`)
         """
         if idx < 0:
             raise ValueError(f"Expected idx to be > 0, got {idx}")
@@ -198,7 +206,7 @@ def load_sheet(
         skip_rows: int = 0,
         n_rows: int | None = None,
         schema_sample_rows: int | None = 1_000,
-        use_columns: list[str] | list[int] | None = None,
+        use_columns: list[str] | list[int] | str | None = None,
     ) -> ExcelSheet:
         """Loads a sheet by name if a string is passed or by index if an integer is passed.
 

diff --git a/python/fastexcel/_fastexcel.pyi b/python/fastexcel/_fastexcel.pyi
@@ -39,7 +39,7 @@ class _ExcelReader:
         skip_rows: int = 0,
         n_rows: int | None = None,
         schema_sample_rows: int | None = 1_000,
-        use_columns: list[str] | list[int] | None = None,
+        use_columns: list[str] | list[int] | str | None = None,
     ) -> _ExcelSheet: ...
     def load_sheet_by_idx(
         self,
@@ -50,7 +50,7 @@ class _ExcelReader:
         skip_rows: int = 0,
         n_rows: int | None = None,
         schema_sample_rows: int | None = 1_000,
-        use_columns: list[str] | list[int] | None = None,
+        use_columns: list[str] | list[int] | str | None = None,
     ) -> _ExcelSheet: ...
     @property
     def sheet_names(self) -> list[str]: ...

diff --git a/python/tests/test_column_selection.py b/python/tests/test_column_selection.py
@@ -222,12 +222,33 @@ def test_single_sheet_with_unnamed_columns_and_pagination_and_column_names(
     pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected_first_row_skipped))
 
 
+def test_single_sheet_with_unnamed_columns_and_str_range(
+    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
+    single_sheet_with_unnamed_columns_expected: dict[str, list[Any]],
+) -> None:
+    use_columns_str = "A,C:E"
+    use_columns_idx = [0, 2, 3, 4]
+    expected = {
+        k: v
+        for k, v in single_sheet_with_unnamed_columns_expected.items()
+        if k in ["col1", "col3", "__UNNAMED__3", "col5"]
+    }
+    sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
+        "With unnamed columns", use_columns=use_columns_str
+    )
+    assert sheet.selected_columns == use_columns_idx
+    assert sheet.available_columns == ["col1", "__UNNAMED__1", "col3", "__UNNAMED__3", "col5"]
+    pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
+    pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
+
+
 def test_single_sheet_invalid_column_indices_negative_integer(
     excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
 ) -> None:
     expected_message = """invalid parameters: expected list[int] | list[str], got [-2]
 Context:
-    0: expected selected columns to be list[str] | list[int] | None, got Some([-2])
+    0: could not determine selected columns from provided object: [-2]
+    1: expected selected columns to be list[str] | list[int] | str | None, got Some([-2])
 """
     with pytest.raises(fastexcel.InvalidParametersError, match=re.escape(expected_message)):
         excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[-2])
@@ -238,7 +259,8 @@ def test_single_sheet_invalid_column_indices_empty_list(
 ) -> None:
     expected_message = """invalid parameters: list of selected columns is empty
 Context:
-    0: expected selected columns to be list[str] | list[int] | None, got Some([])
+    0: could not determine selected columns from provided object: []
+    1: expected selected columns to be list[str] | list[int] | str | None, got Some([])
 """
     with pytest.raises(fastexcel.InvalidParametersError, match=re.escape(expected_message)):
         excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[])

diff --git a/src/types/excelreader.rs b/src/types/excelreader.rs
@@ -1,7 +1,7 @@
 use std::{fs::File, io::BufReader};
 
 use calamine::{open_workbook_auto, Reader, Sheets};
-use pyo3::{pyclass, pymethods, types::PyList, PyResult};
+use pyo3::{pyclass, pymethods, PyAny, PyResult};
 
 use crate::error::{
     py_errors::IntoPyResult, ErrorContext, FastExcelErrorKind, FastExcelResult, IdxOrName,
@@ -61,7 +61,8 @@ impl ExcelReader {
         skip_rows: usize,
         n_rows: Option<usize>,
         schema_sample_rows: Option<usize>,
-        use_columns: Option<&PyList>,
+        // pyo3 forces us to take an Option in case the default value is None
+        use_columns: Option<&PyAny>,
     ) -> PyResult<ExcelSheet> {
         let range = self
             .sheets
@@ -72,7 +73,7 @@ impl ExcelReader {
 
         let header = Header::new(header_row, column_names);
         let pagination = Pagination::new(skip_rows, n_rows, &range).into_pyresult()?;
-        let selected_columns = use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | None, got {use_columns:?}")).into_pyresult()?;
+        let selected_columns = use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | str | None, got {use_columns:?}")).into_pyresult()?;
         ExcelSheet::try_new(
             name,
             range,
@@ -103,7 +104,7 @@ impl ExcelReader {
         skip_rows: usize,
         n_rows: Option<usize>,
         schema_sample_rows: Option<usize>,
-        use_columns: Option<&PyList>,
+        use_columns: Option<&PyAny>,
     ) -> PyResult<ExcelSheet> {
         let name = self
             .sheet_names
@@ -131,7 +132,7 @@ impl ExcelReader {
 
         let header = Header::new(header_row, column_names);
         let pagination = Pagination::new(skip_rows, n_rows, &range).into_pyresult()?;
-        let selected_columns = use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | None, got {use_columns:?}")).into_pyresult()?;
+        let selected_columns = use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | str | None, got {use_columns:?}")).into_pyresult()?;
         ExcelSheet::try_new(
             name,
             range,