From e1fcd7cc93244b90a4fbee2f0a55bc2c47202280 Mon Sep 17 00:00:00 2001 From: Luka Peschke Date: Tue, 27 Feb 2024 21:40:32 +0100 Subject: [PATCH] feat: support column ranges in string format in use_columns (#190) * feat: support column ranges in string format in use_columns Signed-off-by: Luka Peschke * support ranges beyond Z Signed-off-by: Luka Peschke * adapt docstrings Signed-off-by: Luka Peschke * remove outdated comment Signed-off-by: Luka Peschke * refactor: make end of range inclusive Signed-off-by: Luka Peschke * fix python test Signed-off-by: Luka Peschke --------- Signed-off-by: Luka Peschke --- python/fastexcel/__init__.py | 26 ++- python/fastexcel/_fastexcel.pyi | 4 +- python/tests/test_column_selection.py | 26 ++- src/types/excelreader.rs | 11 +- src/types/excelsheet.rs | 229 +++++++++++++++++++++++--- 5 files changed, 257 insertions(+), 39 deletions(-) diff --git a/python/fastexcel/__init__.py b/python/fastexcel/__init__.py index b57f8f4..4c519a4 100644 --- a/python/fastexcel/__init__.py +++ b/python/fastexcel/__init__.py @@ -112,7 +112,7 @@ def load_sheet_by_name( skip_rows: int = 0, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, - use_columns: list[str] | list[int] | None = None, + use_columns: list[str] | list[int] | str | None = None, ) -> ExcelSheet: """Loads a sheet by name. @@ -129,9 +129,13 @@ def load_sheet_by_name( :param schema_sample_rows: Specifies how many rows should be used to determine the dtype of a column. If `None`, all rows will be used. - :param use_columns: Specifies the columns to use. Can either be a list of column names, or - a list of column indices (starting at 0). - If `None`, all columns will be used. + :param use_columns: Specifies the columns to use. Can either be: + - `None` to select all columns + - a list of strings, the column names + - a list of ints, the column indices (starting at 0) + - a string, a comma separated list of Excel column letters and column + ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in + `A,B,C,D,E` and `A,C,E,F`) """ return ExcelSheet( self._reader.load_sheet_by_name( @@ -154,7 +158,7 @@ def load_sheet_by_idx( skip_rows: int = 0, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, - use_columns: list[str] | list[int] | None = None, + use_columns: list[str] | list[int] | str | None = None, ) -> ExcelSheet: """Loads a sheet by index. @@ -171,9 +175,13 @@ def load_sheet_by_idx( :param schema_sample_rows: Specifies how many rows should be used to determine the dtype of a column. If `None`, all rows will be used. - :param use_columns: Specifies the columns to use. Can either be a list of column names, or - a list of column indices (starting at 0). - If `None`, all columns will be used. + :param use_columns: Specifies the columns to use. Can either be: + - `None` to select all columns + - a list of strings, the column names + - a list of ints, the column indices (starting at 0) + - a string, a comma separated list of Excel column letters and column + ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in + `A,B,C,D,E` and `A,C,E,F`) """ if idx < 0: raise ValueError(f"Expected idx to be > 0, got {idx}") @@ -198,7 +206,7 @@ def load_sheet( skip_rows: int = 0, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, - use_columns: list[str] | list[int] | None = None, + use_columns: list[str] | list[int] | str | None = None, ) -> ExcelSheet: """Loads a sheet by name if a string is passed or by index if an integer is passed. diff --git a/python/fastexcel/_fastexcel.pyi b/python/fastexcel/_fastexcel.pyi index b4e2c36..01fd90d 100644 --- a/python/fastexcel/_fastexcel.pyi +++ b/python/fastexcel/_fastexcel.pyi @@ -39,7 +39,7 @@ class _ExcelReader: skip_rows: int = 0, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, - use_columns: list[str] | list[int] | None = None, + use_columns: list[str] | list[int] | str | None = None, ) -> _ExcelSheet: ... def load_sheet_by_idx( self, @@ -50,7 +50,7 @@ class _ExcelReader: skip_rows: int = 0, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, - use_columns: list[str] | list[int] | None = None, + use_columns: list[str] | list[int] | str | None = None, ) -> _ExcelSheet: ... @property def sheet_names(self) -> list[str]: ... diff --git a/python/tests/test_column_selection.py b/python/tests/test_column_selection.py index 8fa363c..a4cade3 100644 --- a/python/tests/test_column_selection.py +++ b/python/tests/test_column_selection.py @@ -222,12 +222,33 @@ def test_single_sheet_with_unnamed_columns_and_pagination_and_column_names( pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected_first_row_skipped)) +def test_single_sheet_with_unnamed_columns_and_str_range( + excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader, + single_sheet_with_unnamed_columns_expected: dict[str, list[Any]], +) -> None: + use_columns_str = "A,C:E" + use_columns_idx = [0, 2, 3, 4] + expected = { + k: v + for k, v in single_sheet_with_unnamed_columns_expected.items() + if k in ["col1", "col3", "__UNNAMED__3", "col5"] + } + sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( + "With unnamed columns", use_columns=use_columns_str + ) + assert sheet.selected_columns == use_columns_idx + assert sheet.available_columns == ["col1", "__UNNAMED__1", "col3", "__UNNAMED__3", "col5"] + pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) + pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) + + def test_single_sheet_invalid_column_indices_negative_integer( excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader, ) -> None: expected_message = """invalid parameters: expected list[int] | list[str], got [-2] Context: - 0: expected selected columns to be list[str] | list[int] | None, got Some([-2]) + 0: could not determine selected columns from provided object: [-2] + 1: expected selected columns to be list[str] | list[int] | str | None, got Some([-2]) """ with pytest.raises(fastexcel.InvalidParametersError, match=re.escape(expected_message)): excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[-2]) @@ -238,7 +259,8 @@ def test_single_sheet_invalid_column_indices_empty_list( ) -> None: expected_message = """invalid parameters: list of selected columns is empty Context: - 0: expected selected columns to be list[str] | list[int] | None, got Some([]) + 0: could not determine selected columns from provided object: [] + 1: expected selected columns to be list[str] | list[int] | str | None, got Some([]) """ with pytest.raises(fastexcel.InvalidParametersError, match=re.escape(expected_message)): excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[]) diff --git a/src/types/excelreader.rs b/src/types/excelreader.rs index f198061..6a15ce5 100644 --- a/src/types/excelreader.rs +++ b/src/types/excelreader.rs @@ -1,7 +1,7 @@ use std::{fs::File, io::BufReader}; use calamine::{open_workbook_auto, Reader, Sheets}; -use pyo3::{pyclass, pymethods, types::PyList, PyResult}; +use pyo3::{pyclass, pymethods, PyAny, PyResult}; use crate::error::{ py_errors::IntoPyResult, ErrorContext, FastExcelErrorKind, FastExcelResult, IdxOrName, @@ -61,7 +61,8 @@ impl ExcelReader { skip_rows: usize, n_rows: Option, schema_sample_rows: Option, - use_columns: Option<&PyList>, + // pyo3 forces us to take an Option in case the default value is None + use_columns: Option<&PyAny>, ) -> PyResult { let range = self .sheets @@ -72,7 +73,7 @@ impl ExcelReader { let header = Header::new(header_row, column_names); let pagination = Pagination::new(skip_rows, n_rows, &range).into_pyresult()?; - let selected_columns = use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | None, got {use_columns:?}")).into_pyresult()?; + let selected_columns = use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | str | None, got {use_columns:?}")).into_pyresult()?; ExcelSheet::try_new( name, range, @@ -103,7 +104,7 @@ impl ExcelReader { skip_rows: usize, n_rows: Option, schema_sample_rows: Option, - use_columns: Option<&PyList>, + use_columns: Option<&PyAny>, ) -> PyResult { let name = self .sheet_names @@ -131,7 +132,7 @@ impl ExcelReader { let header = Header::new(header_row, column_names); let pagination = Pagination::new(skip_rows, n_rows, &range).into_pyresult()?; - let selected_columns = use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | None, got {use_columns:?}")).into_pyresult()?; + let selected_columns = use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | str | None, got {use_columns:?}")).into_pyresult()?; ExcelSheet::try_new( name, range, diff --git a/src/types/excelsheet.rs b/src/types/excelsheet.rs index 5f42dff..2596dd1 100644 --- a/src/types/excelsheet.rs +++ b/src/types/excelsheet.rs @@ -1,4 +1,4 @@ -use std::sync::Arc; +use std::{cmp, collections::HashSet, str::FromStr, sync::Arc}; use crate::error::{ py_errors::IntoPyResult, ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult, @@ -19,8 +19,8 @@ use chrono::NaiveDate; use pyo3::{ prelude::{pyclass, pymethods, PyObject, Python}, - types::PyList, - PyResult, + types::{PyList, PyString}, + PyAny, PyResult, }; use crate::utils::arrow::arrow_schema_from_column_names_and_range; @@ -149,28 +149,161 @@ impl SelectedColumns { } } -impl TryFrom> for SelectedColumns { +impl TryFrom<&PyList> for SelectedColumns { type Error = FastExcelError; - fn try_from(value: Option<&PyList>) -> FastExcelResult { + fn try_from(py_list: &PyList) -> FastExcelResult { use FastExcelErrorKind::InvalidParameters; - match value { + if py_list.is_empty() { + Err(InvalidParameters("list of selected columns is empty".to_string()).into()) + } else if let Ok(name_vec) = py_list.extract::>() { + Ok(Self::ByName(name_vec)) + } else if let Ok(index_vec) = py_list.extract::>() { + Ok(Self::ByIndex(index_vec)) + } else { + Err( + InvalidParameters(format!("expected list[int] | list[str], got {py_list:?}")) + .into(), + ) + } + } +} + +impl SelectedColumns { + const ALPHABET: [char; 26] = [ + 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', + 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', + ]; + + fn col_idx_for_col_as_letter(col: &str) -> FastExcelResult { + use FastExcelErrorKind::InvalidParameters; + + if col.is_empty() { + return Err(InvalidParameters( + "a column should have at least one character, got none".to_string(), + ) + .into()); + } + + col.chars() + // iterating over all chars reversed, to have a power based on their rank + .rev() + .enumerate() + // Parses every char, checks its position and returns its numeric equivalent based on + // its rank. For example, AB becomes 27 (26 + 1) + .map(|(idx, col_chr)| { + let pos_in_alphabet = Self::ALPHABET + .iter() + .position(|chr| chr == &col_chr) + .ok_or_else(|| { + FastExcelError::from(InvalidParameters(format!( + "Char is not a valid column name: {col_chr}" + ))) + })?; + + Ok(match idx { + // in case it's the last char, just return its position + 0 => pos_in_alphabet, + // otherwise, 26^idx * (position + 1) + // For example, CBA is 2081: + // A -> 0 + // B -> 26 (53^1 * (1 + 1)) + // C -> 2028 (26^2 * (2 + 1)) + _ => 26usize.pow(idx as u32) * (pos_in_alphabet + 1), + }) + }) + // Sums all previously obtained ranks + .try_fold(0usize, |acc, elem_result| { + elem_result.map(|elem| acc + elem) + }) + } + + fn col_indices_for_letter_range(col_range: &str) -> FastExcelResult> { + use FastExcelErrorKind::InvalidParameters; + + let col_elements = col_range.split(':').collect::>(); + if col_elements.len() == 2 { + let start = Self::col_idx_for_col_as_letter(col_elements[0]) + .with_context(|| format!("invalid start element for range \"{col_range}\""))?; + let end = Self::col_idx_for_col_as_letter(col_elements[1]) + .with_context(|| format!("invalid end element for range \"{col_range}\""))?; + + match start.cmp(&end) { + cmp::Ordering::Less => Ok((start..=end).collect()), + cmp::Ordering::Greater => Err(InvalidParameters(format!( + "end of range is before start: \"{col_range}\"" + )) + .into()), + cmp::Ordering::Equal => { + Err(InvalidParameters(format!("empty range: \"{col_range}\"")).into()) + } + } + } else { + Err(InvalidParameters(format!( + "expected range to contain exactly 2 elements, got {n_elements}: \"{col_range}\"", + n_elements = col_elements.len() + )) + .into()) + } + } +} + +impl FromStr for SelectedColumns { + type Err = FastExcelError; + + fn from_str(s: &str) -> FastExcelResult { + let unique_col_indices: HashSet = s + .to_uppercase() + .split(',') + .map(|col_or_range| { + if col_or_range.contains(':') { + Self::col_indices_for_letter_range(col_or_range) + } else { + Self::col_idx_for_col_as_letter(col_or_range).map(|idx| vec![idx]) + } + }) + .collect::>>()? + .into_iter() + .flatten() + .collect(); + let mut sorted_col_indices: Vec = unique_col_indices.into_iter().collect(); + sorted_col_indices.sort(); + Ok(Self::ByIndex(sorted_col_indices)) + } +} + +impl TryFrom> for SelectedColumns { + type Error = FastExcelError; + + fn try_from(py_any_opt: Option<&PyAny>) -> FastExcelResult { + match py_any_opt { None => Ok(Self::All), - Some(py_list) => { - if py_list.is_empty() { - Err(InvalidParameters("list of selected columns is empty".to_string()).into()) - } else if let Ok(name_vec) = py_list.extract::>() { - Ok(Self::ByName(name_vec)) - } else if let Ok(index_vec) = py_list.extract::>() { - Ok(Self::ByIndex(index_vec)) + Some(py_any) => { + // Not trying to downcast to PyNone here as we assume that this would result in + // py_any_opt being None + if let Ok(py_str) = py_any.downcast::() { + py_str + .to_str() + .map_err(|err| { + FastExcelErrorKind::InvalidParameters(format!( + "provided string is not valid unicode: {err}" + )) + })? + .parse() + } else if let Ok(py_list) = py_any.downcast::() { + py_list.try_into() } else { - Err(InvalidParameters(format!( - "expected list[int] | list[str], got {py_list:?}" + Err(FastExcelErrorKind::InvalidParameters(format!( + "unsupported object type {object_type}", + object_type = py_any.get_type() )) .into()) } } + .with_context(|| { + format!("could not determine selected columns from provided object: {py_any}") + }), } } } @@ -386,7 +519,7 @@ impl TryFrom<&ExcelSheet> for Schema { &sheet.available_columns, sheet.offset(), // If sample_rows is higher than the sheet's limit, use the limit instead - std::cmp::min(sample_rows, sheet.limit()), + cmp::min(sample_rows, sheet.limit()), &sheet.selected_columns, ) } @@ -536,6 +669,7 @@ impl ExcelSheet { mod tests { use super::*; use pretty_assertions::assert_eq; + use rstest::rstest; #[test] fn selected_columns_from_none() { @@ -548,7 +682,7 @@ mod tests { #[test] fn selected_columns_from_list_of_valid_ints() { Python::with_gil(|py| { - let py_list = PyList::new(py, vec![0, 1, 2]); + let py_list = PyList::new(py, vec![0, 1, 2]).as_ref(); assert_eq!( TryInto::::try_into(Some(py_list)).unwrap(), SelectedColumns::ByIndex(vec![0, 1, 2]) @@ -559,7 +693,7 @@ mod tests { #[test] fn selected_columns_from_list_of_valid_strings() { Python::with_gil(|py| { - let py_list = PyList::new(py, vec!["foo", "bar"]); + let py_list = PyList::new(py, vec!["foo", "bar"]).as_ref(); assert_eq!( TryInto::::try_into(Some(py_list)).unwrap(), SelectedColumns::ByName(vec!["foo".to_string(), "bar".to_string()]) @@ -570,7 +704,7 @@ mod tests { #[test] fn selected_columns_from_invalid_ints() { Python::with_gil(|py| { - let py_list = PyList::new(py, vec![0, 2, -1]); + let py_list = PyList::new(py, vec![0, 2, -1]).as_ref(); let err = TryInto::::try_into(Some(py_list)).unwrap_err(); assert!(matches!(err.kind, FastExcelErrorKind::InvalidParameters(_))); @@ -580,7 +714,7 @@ mod tests { #[test] fn selected_columns_from_empty_int_list() { Python::with_gil(|py| { - let py_list = PyList::new(py, Vec::::new()); + let py_list = PyList::new(py, Vec::::new()).as_ref(); let err = TryInto::::try_into(Some(py_list)).unwrap_err(); assert!(matches!(err.kind, FastExcelErrorKind::InvalidParameters(_))); @@ -590,10 +724,63 @@ mod tests { #[test] fn selected_columns_from_empty_string_list() { Python::with_gil(|py| { - let py_list = PyList::new(py, Vec::::new()); + let py_list = PyList::new(py, Vec::::new()).as_ref(); let err = TryInto::::try_into(Some(py_list)).unwrap_err(); assert!(matches!(err.kind, FastExcelErrorKind::InvalidParameters(_))); }); } + + #[rstest] + // Standard unique columns + #[case("A,B,D", vec![0, 1, 3])] + // Standard unique columns + range + #[case("A,B:E,Y", vec![0, 1, 2, 3, 4, 24])] + // Standard unique column + ranges with mixed case + #[case("A:c,b:E,w,Y:z", vec![0, 1, 2, 3, 4, 22, 24, 25])] + // Ranges beyond Z + #[case("A,y:AB", vec![0, 24, 25, 26, 27])] + #[case("BB:BE,DDC:DDF", vec![53, 54, 55, 56, 2810, 2811, 2812, 2813])] + fn selected_columns_from_valid_ranges(#[case] raw: &str, #[case] expected: Vec) { + Python::with_gil(|py| { + let expected_range = SelectedColumns::ByIndex(expected); + let input = PyString::new(py, raw).as_ref(); + + let range = TryInto::::try_into(Some(input)) + .expect("expected a valid column selection"); + + assert_eq!(range, expected_range) + }) + } + + #[rstest] + // Standard unique columns + #[case("", "at least one character")] + // empty range + #[case("a:a,b:d,e", "empty range")] + // end before start + #[case("b:a", "end of range is before start")] + // no start + #[case(":a", "at least one character, got none")] + // no end + #[case("a:", "at least one character, got none")] + // too many elements + #[case("a:b:e", "exactly 2 elements, got 3")] + fn selected_columns_from_invalid_ranges(#[case] raw: &str, #[case] message: &str) { + Python::with_gil(|py| { + let input = PyString::new(py, raw).as_ref(); + + let err = + TryInto::::try_into(Some(input)).expect_err("expected an error"); + + match err.kind { + FastExcelErrorKind::InvalidParameters(detail) => { + if !detail.contains(message) { + panic!("expected \"{detail}\" to contain \"{message}\"") + } + } + _ => panic!("Expected error to be InvalidParameters, got {err:?}"), + } + }) + } }