Skip to content

Commit

Permalink
refactor: validate columns on sheet instantiation
Browse files Browse the repository at this point in the history
Signed-off-by: Luka Peschke <[email protected]>
  • Loading branch information
lukapeschke committed Feb 26, 2024
1 parent 045a114 commit 1edc77a
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 33 deletions.
14 changes: 4 additions & 10 deletions python/tests/test_column_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,24 +236,18 @@ def test_single_sheet_invalid_column_indices_column_does_not_exist_str(
) -> None:
expected_message = """column with name "nope" not found
Context:
0: selected columns are invalid
1: could not create RecordBatch from sheet "January"
2: could not convert RecordBatch to pyarrow for sheet "January"
0: selected columns are invalid, available columns are: ["Month", "Year"]
"""
with pytest.raises(fastexcel.ColumnNotFoundError, match=re.escape(expected_message)):
excel_reader_single_sheet_with_unnamed_columns.load_sheet(
0, use_columns=["nope"]
).to_arrow()
excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=["nope"])


def test_single_sheet_invalid_column_indices_column_does_not_exist_int(
excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
) -> None:
expected_message = """column at index 42 not found
Context:
0: selected columns are invalid
1: could not create RecordBatch from sheet "January"
2: could not convert RecordBatch to pyarrow for sheet "January"
0: selected columns are invalid, available columns are: ["Month", "Year"]
"""
with pytest.raises(fastexcel.ColumnNotFoundError, match=re.escape(expected_message)):
excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[42]).to_arrow()
excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[42])
10 changes: 6 additions & 4 deletions src/types/excelreader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,14 +73,15 @@ impl ExcelReader {
let header = Header::new(header_row, column_names);
let pagination = Pagination::new(skip_rows, n_rows, &range).into_pyresult()?;
let selected_columns = use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | None, got {use_columns:?}")).into_pyresult()?;
Ok(ExcelSheet::new(
ExcelSheet::try_new(
name,
range,
header,
pagination,
schema_sample_rows,
selected_columns,
))
)
.into_pyresult()
}

#[pyo3(signature = (
Expand Down Expand Up @@ -131,13 +132,14 @@ impl ExcelReader {
let header = Header::new(header_row, column_names);
let pagination = Pagination::new(skip_rows, n_rows, &range).into_pyresult()?;
let selected_columns = use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | None, got {use_columns:?}")).into_pyresult()?;
Ok(ExcelSheet::new(
ExcelSheet::try_new(
name,
range,
header,
pagination,
schema_sample_rows,
selected_columns,
))
)
.into_pyresult()
}
}
49 changes: 31 additions & 18 deletions src/types/excelsheet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -187,22 +187,23 @@ pub(crate) struct ExcelSheet {
width: Option<usize>,
schema_sample_rows: Option<usize>,
selected_columns: SelectedColumns,
available_columns: Vec<String>,
}

impl ExcelSheet {
pub(crate) fn data(&self) -> &Range<CalData> {
&self.data
}

pub(crate) fn new(
pub(crate) fn try_new(
name: String,
data: Range<CalData>,
header: Header,
pagination: Pagination,
schema_sample_rows: Option<usize>,
selected_columns: SelectedColumns,
) -> Self {
ExcelSheet {
) -> FastExcelResult<Self> {
let mut sheet = ExcelSheet {
name,
header,
pagination,
Expand All @@ -212,10 +213,27 @@ impl ExcelSheet {
height: None,
total_height: None,
width: None,
}
// an empty vec as it will be replaced
available_columns: Vec::with_capacity(0),
};

let available_columns = sheet.get_available_columns();

// Ensuring selected columns are valid
sheet
.selected_columns
.validate_columns(&available_columns)
.with_context(|| {
format!(
"selected columns are invalid, available columns are: {available_columns:?}"
)
})?;

sheet.available_columns = available_columns;
Ok(sheet)
}

pub(crate) fn column_names(&self) -> Vec<String> {
fn get_available_columns(&self) -> Vec<String> {
let width = self.data.width();
match &self.header {
Header::None => (0..width)
Expand Down Expand Up @@ -365,7 +383,7 @@ impl TryFrom<&ExcelSheet> for Schema {

arrow_schema_from_column_names_and_range(
sheet.data(),
&sheet.column_names(),
&sheet.available_columns,
sheet.offset(),
// If sample_rows is higher than the sheet's limit, use the limit instead
std::cmp::min(sample_rows, sheet.limit()),
Expand All @@ -381,18 +399,11 @@ impl TryFrom<&ExcelSheet> for RecordBatch {
let offset = sheet.offset();
let limit = sheet.limit();

let column_names = sheet.column_names();

// Ensuring selected columns are valid
sheet
.selected_columns
.validate_columns(&column_names)
.with_context(|| "selected columns are invalid")?;

let schema = Schema::try_from(sheet)
.with_context(|| format!("could not build schema for sheet {}", sheet.name))?;

let mut iter = column_names
let mut iter = sheet
.available_columns
.iter()
.enumerate()
.filter_map(|(idx, column_name)| {
Expand All @@ -402,9 +413,11 @@ impl TryFrom<&ExcelSheet> for RecordBatch {
SelectedColumns::All => Some(idx),
// Otherwise, return its index. If None is found, it means the column was not
// selected, and we will just continue
_ => sheet
.selected_columns
.idx_for_column(&column_names, column_name, idx),
_ => sheet.selected_columns.idx_for_column(
&sheet.available_columns,
column_name,
idx,
),
} {
// At this point, we know for sure that the column is in the schema so we can
// safely unwrap
Expand Down
5 changes: 4 additions & 1 deletion test.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,17 @@
def get_args() -> argparse.Namespace:
parser = argparse.ArgumentParser()
parser.add_argument("file")
parser.add_argument("-c", "--column", type=str, nargs="+", help="the columns to use")
return parser.parse_args()


def main():
args = get_args()
excel_file = fastexcel.read_excel(args.file)
use_columns = args.column or None

for sheet_name in excel_file.sheet_names:
excel_file.load_sheet_by_name(sheet_name).to_pandas()
excel_file.load_sheet_by_name(sheet_name, use_columns=use_columns).to_arrow()


if __name__ == "__main__":
Expand Down

0 comments on commit 1edc77a

Please sign in to comment.