diff --git a/python/tests/test_column_selection.py b/python/tests/test_column_selection.py index ab0cd07..a55e567 100644 --- a/python/tests/test_column_selection.py +++ b/python/tests/test_column_selection.py @@ -237,13 +237,9 @@ def test_single_sheet_invalid_column_indices_column_does_not_exist_str( expected_message = """column with name "nope" not found Context: 0: selected columns are invalid - 1: could not create RecordBatch from sheet "January" - 2: could not convert RecordBatch to pyarrow for sheet "January" """ with pytest.raises(fastexcel.ColumnNotFoundError, match=re.escape(expected_message)): - excel_reader_single_sheet_with_unnamed_columns.load_sheet( - 0, use_columns=["nope"] - ).to_arrow() + excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=["nope"]) def test_single_sheet_invalid_column_indices_column_does_not_exist_int( @@ -252,8 +248,6 @@ def test_single_sheet_invalid_column_indices_column_does_not_exist_int( expected_message = """column at index 42 not found Context: 0: selected columns are invalid - 1: could not create RecordBatch from sheet "January" - 2: could not convert RecordBatch to pyarrow for sheet "January" """ with pytest.raises(fastexcel.ColumnNotFoundError, match=re.escape(expected_message)): - excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[42]).to_arrow() + excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[42]) diff --git a/src/types/excelreader.rs b/src/types/excelreader.rs index e6986da..f198061 100644 --- a/src/types/excelreader.rs +++ b/src/types/excelreader.rs @@ -73,14 +73,15 @@ impl ExcelReader { let header = Header::new(header_row, column_names); let pagination = Pagination::new(skip_rows, n_rows, &range).into_pyresult()?; let selected_columns = use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | None, got {use_columns:?}")).into_pyresult()?; - Ok(ExcelSheet::new( + ExcelSheet::try_new( name, range, header, pagination, schema_sample_rows, selected_columns, - )) + ) + .into_pyresult() } #[pyo3(signature = ( @@ -131,13 +132,14 @@ impl ExcelReader { let header = Header::new(header_row, column_names); let pagination = Pagination::new(skip_rows, n_rows, &range).into_pyresult()?; let selected_columns = use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | None, got {use_columns:?}")).into_pyresult()?; - Ok(ExcelSheet::new( + ExcelSheet::try_new( name, range, header, pagination, schema_sample_rows, selected_columns, - )) + ) + .into_pyresult() } } diff --git a/src/types/excelsheet.rs b/src/types/excelsheet.rs index b765306..6da29fa 100644 --- a/src/types/excelsheet.rs +++ b/src/types/excelsheet.rs @@ -187,6 +187,7 @@ pub(crate) struct ExcelSheet { width: Option, schema_sample_rows: Option, selected_columns: SelectedColumns, + available_columns: Vec, } impl ExcelSheet { @@ -194,15 +195,15 @@ impl ExcelSheet { &self.data } - pub(crate) fn new( + pub(crate) fn try_new( name: String, data: Range, header: Header, pagination: Pagination, schema_sample_rows: Option, selected_columns: SelectedColumns, - ) -> Self { - ExcelSheet { + ) -> FastExcelResult { + let mut sheet = ExcelSheet { name, header, pagination, @@ -212,10 +213,27 @@ impl ExcelSheet { height: None, total_height: None, width: None, - } + // an empty vec as it will be replaced + available_columns: Vec::with_capacity(0), + }; + + let available_columns = sheet.get_available_columns(); + + // Ensuring selected columns are valid + sheet + .selected_columns + .validate_columns(&available_columns) + .with_context(|| { + format!( + "selected columns are invalid, available columns are: {available_columns:?}" + ) + })?; + + sheet.available_columns = available_columns; + Ok(sheet) } - pub(crate) fn column_names(&self) -> Vec { + fn get_available_columns(&self) -> Vec { let width = self.data.width(); match &self.header { Header::None => (0..width) @@ -365,7 +383,7 @@ impl TryFrom<&ExcelSheet> for Schema { arrow_schema_from_column_names_and_range( sheet.data(), - &sheet.column_names(), + &sheet.available_columns, sheet.offset(), // If sample_rows is higher than the sheet's limit, use the limit instead std::cmp::min(sample_rows, sheet.limit()), @@ -381,18 +399,11 @@ impl TryFrom<&ExcelSheet> for RecordBatch { let offset = sheet.offset(); let limit = sheet.limit(); - let column_names = sheet.column_names(); - - // Ensuring selected columns are valid - sheet - .selected_columns - .validate_columns(&column_names) - .with_context(|| "selected columns are invalid")?; - let schema = Schema::try_from(sheet) .with_context(|| format!("could not build schema for sheet {}", sheet.name))?; - let mut iter = column_names + let mut iter = sheet + .available_columns .iter() .enumerate() .filter_map(|(idx, column_name)| { @@ -402,9 +413,11 @@ impl TryFrom<&ExcelSheet> for RecordBatch { SelectedColumns::All => Some(idx), // Otherwise, return its index. If None is found, it means the column was not // selected, and we will just continue - _ => sheet - .selected_columns - .idx_for_column(&column_names, column_name, idx), + _ => sheet.selected_columns.idx_for_column( + &sheet.available_columns, + column_name, + idx, + ), } { // At this point, we know for sure that the column is in the schema so we can // safely unwrap diff --git a/test.py b/test.py index 163f028..7ce0f28 100644 --- a/test.py +++ b/test.py @@ -6,14 +6,17 @@ def get_args() -> argparse.Namespace: parser = argparse.ArgumentParser() parser.add_argument("file") + parser.add_argument("-c", "--column", type=str, nargs="+", help="the columns to use") return parser.parse_args() def main(): args = get_args() excel_file = fastexcel.read_excel(args.file) + use_columns = args.column or None + for sheet_name in excel_file.sheet_names: - excel_file.load_sheet_by_name(sheet_name).to_pandas() + excel_file.load_sheet_by_name(sheet_name, use_columns=use_columns).to_arrow() if __name__ == "__main__":