refactor: validate columns on sheet instantiation

Signed-off-by: Luka Peschke <[email protected]>
ToucanToco · Feb 26, 2024 · 1edc77a · 1edc77a
1 parent 045a114
commit 1edc77a
Show file tree

Hide file tree

Showing 4 changed files with 45 additions and 33 deletions.
diff --git a/python/tests/test_column_selection.py b/python/tests/test_column_selection.py
@@ -236,24 +236,18 @@ def test_single_sheet_invalid_column_indices_column_does_not_exist_str(
 ) -> None:
     expected_message = """column with name "nope" not found
 Context:
-    0: selected columns are invalid
-    1: could not create RecordBatch from sheet "January"
-    2: could not convert RecordBatch to pyarrow for sheet "January"
+    0: selected columns are invalid, available columns are: ["Month", "Year"]
 """
     with pytest.raises(fastexcel.ColumnNotFoundError, match=re.escape(expected_message)):
-        excel_reader_single_sheet_with_unnamed_columns.load_sheet(
-            0, use_columns=["nope"]
-        ).to_arrow()
+        excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=["nope"])
 
 
 def test_single_sheet_invalid_column_indices_column_does_not_exist_int(
     excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
 ) -> None:
     expected_message = """column at index 42 not found
 Context:
-    0: selected columns are invalid
-    1: could not create RecordBatch from sheet "January"
-    2: could not convert RecordBatch to pyarrow for sheet "January"
+    0: selected columns are invalid, available columns are: ["Month", "Year"]
 """
     with pytest.raises(fastexcel.ColumnNotFoundError, match=re.escape(expected_message)):
-        excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[42]).to_arrow()
+        excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[42])
diff --git a/src/types/excelreader.rs b/src/types/excelreader.rs
@@ -73,14 +73,15 @@ impl ExcelReader {
         let header = Header::new(header_row, column_names);
         let pagination = Pagination::new(skip_rows, n_rows, &range).into_pyresult()?;
         let selected_columns = use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | None, got {use_columns:?}")).into_pyresult()?;
-        Ok(ExcelSheet::new(
+        ExcelSheet::try_new(
             name,
             range,
             header,
             pagination,
             schema_sample_rows,
             selected_columns,
-        ))
+        )
+        .into_pyresult()
     }
 
     #[pyo3(signature = (
@@ -131,13 +132,14 @@ impl ExcelReader {
         let header = Header::new(header_row, column_names);
         let pagination = Pagination::new(skip_rows, n_rows, &range).into_pyresult()?;
         let selected_columns = use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | None, got {use_columns:?}")).into_pyresult()?;
-        Ok(ExcelSheet::new(
+        ExcelSheet::try_new(
             name,
             range,
             header,
             pagination,
             schema_sample_rows,
             selected_columns,
-        ))
+        )
+        .into_pyresult()
     }
 }
diff --git a/src/types/excelsheet.rs b/src/types/excelsheet.rs
@@ -187,22 +187,23 @@ pub(crate) struct ExcelSheet {
     width: Option<usize>,
     schema_sample_rows: Option<usize>,
     selected_columns: SelectedColumns,
+    available_columns: Vec<String>,
 }
 
 impl ExcelSheet {
     pub(crate) fn data(&self) -> &Range<CalData> {
         &self.data
     }
 
-    pub(crate) fn new(
+    pub(crate) fn try_new(
         name: String,
         data: Range<CalData>,
         header: Header,
         pagination: Pagination,
         schema_sample_rows: Option<usize>,
         selected_columns: SelectedColumns,
-    ) -> Self {
-        ExcelSheet {
+    ) -> FastExcelResult<Self> {
+        let mut sheet = ExcelSheet {
             name,
             header,
             pagination,
@@ -212,10 +213,27 @@ impl ExcelSheet {
             height: None,
             total_height: None,
             width: None,
-        }
+            // an empty vec as it will be replaced
+            available_columns: Vec::with_capacity(0),
+        };
+
+        let available_columns = sheet.get_available_columns();
+
+        // Ensuring selected columns are valid
+        sheet
+            .selected_columns
+            .validate_columns(&available_columns)
+            .with_context(|| {
+                format!(
+                    "selected columns are invalid, available columns are: {available_columns:?}"
+                )
+            })?;
+
+        sheet.available_columns = available_columns;
+        Ok(sheet)
     }
 
-    pub(crate) fn column_names(&self) -> Vec<String> {
+    fn get_available_columns(&self) -> Vec<String> {
         let width = self.data.width();
         match &self.header {
             Header::None => (0..width)
@@ -365,7 +383,7 @@ impl TryFrom<&ExcelSheet> for Schema {
 
         arrow_schema_from_column_names_and_range(
             sheet.data(),
-            &sheet.column_names(),
+            &sheet.available_columns,
             sheet.offset(),
             // If sample_rows is higher than the sheet's limit, use the limit instead
             std::cmp::min(sample_rows, sheet.limit()),
@@ -381,18 +399,11 @@ impl TryFrom<&ExcelSheet> for RecordBatch {
         let offset = sheet.offset();
         let limit = sheet.limit();
 
-        let column_names = sheet.column_names();
-
-        // Ensuring selected columns are valid
-        sheet
-            .selected_columns
-            .validate_columns(&column_names)
-            .with_context(|| "selected columns are invalid")?;
-
         let schema = Schema::try_from(sheet)
             .with_context(|| format!("could not build schema for sheet {}", sheet.name))?;
 
-        let mut iter = column_names
+        let mut iter = sheet
+            .available_columns
             .iter()
             .enumerate()
             .filter_map(|(idx, column_name)| {
@@ -402,9 +413,11 @@ impl TryFrom<&ExcelSheet> for RecordBatch {
                     SelectedColumns::All => Some(idx),
                     // Otherwise, return its index. If None is found, it means the column was not
                     // selected, and we will just continue
-                    _ => sheet
-                        .selected_columns
-                        .idx_for_column(&column_names, column_name, idx),
+                    _ => sheet.selected_columns.idx_for_column(
+                        &sheet.available_columns,
+                        column_name,
+                        idx,
+                    ),
                 } {
                     // At this point, we know for sure that the column is in the schema so we can
                     // safely unwrap

diff --git a/test.py b/test.py
@@ -6,14 +6,17 @@
 def get_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser()
     parser.add_argument("file")
+    parser.add_argument("-c", "--column", type=str, nargs="+", help="the columns to use")
     return parser.parse_args()
 
 
 def main():
     args = get_args()
     excel_file = fastexcel.read_excel(args.file)
+    use_columns = args.column or None
+
     for sheet_name in excel_file.sheet_names:
-        excel_file.load_sheet_by_name(sheet_name).to_pandas()
+        excel_file.load_sheet_by_name(sheet_name, use_columns=use_columns).to_arrow()
 
 
 if __name__ == "__main__":