diff --git a/python/tests/fixtures/sheet-with-na.xlsx b/python/tests/fixtures/sheet-with-na.xlsx new file mode 100644 index 0000000..e098fc9 Binary files /dev/null and b/python/tests/fixtures/sheet-with-na.xlsx differ diff --git a/python/tests/test_fastexcel.py b/python/tests/test_fastexcel.py index a97e5e8..d033394 100644 --- a/python/tests/test_fastexcel.py +++ b/python/tests/test_fastexcel.py @@ -431,3 +431,20 @@ def test_sheet_with_pagination_out_of_bound(): pl.col("Amazing").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms") ), ) + + +def test_sheet_with_na(): + """Test reading a sheet with #N/A cells. For now, we consider them as null""" + excel_reader = fastexcel.read_excel(path_for_fixture("sheet-with-na.xlsx")) + sheet = excel_reader.load_sheet(0) + + assert sheet.name == "Sheet1" + assert sheet.height == sheet.total_height == 2 + assert sheet.width == 2 + + expected = { + "Title": ["A", "B"], + "Amount": [None, 100.0], + } + pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) + pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) diff --git a/src/types/excelsheet.rs b/src/types/excelsheet.rs index e7ec2eb..bb55775 100644 --- a/src/types/excelsheet.rs +++ b/src/types/excelsheet.rs @@ -246,17 +246,17 @@ fn create_duration_array( impl TryFrom<&ExcelSheet> for Schema { type Error = anyhow::Error; - fn try_from(value: &ExcelSheet) -> Result { + fn try_from(sheet: &ExcelSheet) -> Result { // Checking how many rows we want to use to determine the dtype for a column. If sample_rows is // not provided, we sample limit rows, i.e on the entire column - let sample_rows = value.offset() + value.schema_sample_rows().unwrap_or(value.limit()); + let sample_rows = sheet.offset() + sheet.schema_sample_rows().unwrap_or(sheet.limit()); arrow_schema_from_column_names_and_range( - value.data(), - &value.column_names(), - value.offset(), + sheet.data(), + &sheet.column_names(), + sheet.offset(), // If sample_rows is higher than the sheet's limit, use the limit instead - std::cmp::min(sample_rows, value.limit()), + std::cmp::min(sample_rows, sheet.limit()), ) } } @@ -264,11 +264,11 @@ impl TryFrom<&ExcelSheet> for Schema { impl TryFrom<&ExcelSheet> for RecordBatch { type Error = anyhow::Error; - fn try_from(value: &ExcelSheet) -> Result { - let offset = value.offset(); - let limit = value.limit(); - let schema = Schema::try_from(value) - .with_context(|| format!("Could not build schema for sheet {}", value.name))?; + fn try_from(sheet: &ExcelSheet) -> Result { + let offset = sheet.offset(); + let limit = sheet.limit(); + let schema = Schema::try_from(sheet) + .with_context(|| format!("Could not build schema for sheet {}", sheet.name))?; let mut iter = schema .fields() .iter() @@ -278,25 +278,25 @@ impl TryFrom<&ExcelSheet> for RecordBatch { field.name(), match field.data_type() { ArrowDataType::Boolean => { - create_boolean_array(value.data(), col_idx, offset, limit) + create_boolean_array(sheet.data(), col_idx, offset, limit) } ArrowDataType::Int64 => { - create_int_array(value.data(), col_idx, offset, limit) + create_int_array(sheet.data(), col_idx, offset, limit) } ArrowDataType::Float64 => { - create_float_array(value.data(), col_idx, offset, limit) + create_float_array(sheet.data(), col_idx, offset, limit) } ArrowDataType::Utf8 => { - create_string_array(value.data(), col_idx, offset, limit) + create_string_array(sheet.data(), col_idx, offset, limit) } ArrowDataType::Timestamp(TimeUnit::Millisecond, None) => { - create_datetime_array(value.data(), col_idx, offset, limit) + create_datetime_array(sheet.data(), col_idx, offset, limit) } ArrowDataType::Date32 => { - create_date_array(value.data(), col_idx, offset, limit) + create_date_array(sheet.data(), col_idx, offset, limit) } ArrowDataType::Duration(TimeUnit::Millisecond) => { - create_duration_array(value.data(), col_idx, offset, limit) + create_duration_array(sheet.data(), col_idx, offset, limit) } ArrowDataType::Null => Arc::new(NullArray::new(limit - offset)), _ => unreachable!(), @@ -309,7 +309,7 @@ impl TryFrom<&ExcelSheet> for RecordBatch { Ok(RecordBatch::new_empty(Arc::new(schema))) } else { RecordBatch::try_from_iter(iter) - .with_context(|| format!("Could not convert sheet {} to RecordBatch", value.name)) + .with_context(|| format!("Could not convert sheet {} to RecordBatch", sheet.name)) } } } diff --git a/src/utils/arrow.rs b/src/utils/arrow.rs index 12b2df9..3523e03 100644 --- a/src/utils/arrow.rs +++ b/src/utils/arrow.rs @@ -2,7 +2,7 @@ use std::{collections::HashSet, sync::OnceLock}; use anyhow::{anyhow, Context, Result}; use arrow::datatypes::{DataType as ArrowDataType, Field, Schema, TimeUnit}; -use calamine::{Data as CalData, DataType, Range}; +use calamine::{CellErrorType, Data as CalData, DataType, Range}; fn get_cell_type(data: &Range, row: usize, col: usize) -> Result { let cell = data @@ -31,7 +31,10 @@ fn get_cell_type(data: &Range, row: usize, col: usize) -> Result Ok(ArrowDataType::Duration(TimeUnit::Millisecond)), // Errors and nulls - CalData::Error(err) => Err(anyhow!("Error in calamine cell: {err:?}")), + CalData::Error(err) => match err { + CellErrorType::NA => Ok(ArrowDataType::Null), + _ => Err(anyhow!("Error in calamine cell: {err:?}")), + }, CalData::Empty => Ok(ArrowDataType::Null), } }