diff --git a/python/fastexcel/__init__.py b/python/fastexcel/__init__.py index 6529195..cf4a761 100644 --- a/python/fastexcel/__init__.py +++ b/python/fastexcel/__init__.py @@ -128,6 +128,7 @@ def load_sheet( skip_rows: int = 0, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, + dtype_coercion: Literal["coerce", "strict"] = "coerce", use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, dtypes: DTypeMap | None = None, ) -> ExcelSheet: @@ -146,6 +147,11 @@ def load_sheet( :param schema_sample_rows: Specifies how many rows should be used to determine the dtype of a column. If `None`, all rows will be used. + :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) + will try to coerce different dtypes in a column to the same one, + whereas `strict` will raise an error in case a column contains + several dtypes. Note that this only applies to columns whose dtype + is guessed, i.e. not specified via `dtypes`. :param use_columns: Specifies the columns to use. Can either be: - `None` to select all columns - A list of strings and ints, the column names and/or indices @@ -165,6 +171,7 @@ def load_sheet( skip_rows=skip_rows, n_rows=n_rows, schema_sample_rows=schema_sample_rows, + dtype_coercion=dtype_coercion, use_columns=use_columns, dtypes=dtypes, eager=False, @@ -180,6 +187,7 @@ def load_sheet_eager( skip_rows: int = 0, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, + dtype_coercion: Literal["coerce", "strict"] = "coerce", use_columns: list[str] | list[int] | str | None = None, dtypes: DTypeMap | None = None, ) -> pa.RecordBatch: @@ -197,6 +205,7 @@ def load_sheet_eager( skip_rows=skip_rows, n_rows=n_rows, schema_sample_rows=schema_sample_rows, + dtype_coercion=dtype_coercion, use_columns=use_columns, dtypes=dtypes, eager=True, @@ -211,6 +220,7 @@ def load_sheet_by_name( skip_rows: int = 0, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, + dtype_coercion: Literal["coerce", "strict"] = "coerce", use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, dtypes: DTypeMap | None = None, ) -> ExcelSheet: @@ -225,6 +235,7 @@ def load_sheet_by_name( skip_rows=skip_rows, n_rows=n_rows, schema_sample_rows=schema_sample_rows, + dtype_coercion=dtype_coercion, use_columns=use_columns, dtypes=dtypes, ) @@ -238,6 +249,7 @@ def load_sheet_by_idx( skip_rows: int = 0, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, + dtype_coercion: Literal["coerce", "strict"] = "coerce", use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, dtypes: DTypeMap | None = None, ) -> ExcelSheet: @@ -252,6 +264,7 @@ def load_sheet_by_idx( skip_rows=skip_rows, n_rows=n_rows, schema_sample_rows=schema_sample_rows, + dtype_coercion=dtype_coercion, use_columns=use_columns, dtypes=dtypes, ) diff --git a/python/fastexcel/_fastexcel.pyi b/python/fastexcel/_fastexcel.pyi index 9efe012..b845691 100644 --- a/python/fastexcel/_fastexcel.pyi +++ b/python/fastexcel/_fastexcel.pyi @@ -72,6 +72,7 @@ class _ExcelReader: skip_rows: int = 0, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, + dtype_coercion: Literal["coerce", "strict"] = "coerce", use_columns: list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None = None, dtypes: DTypeMap | None = None, eager: Literal[False] = ..., @@ -86,6 +87,7 @@ class _ExcelReader: skip_rows: int = 0, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, + dtype_coercion: Literal["coerce", "strict"] = "coerce", use_columns: list[str] | list[int] | str | None = None, dtypes: DTypeMap | None = None, eager: Literal[True] = ..., diff --git a/python/tests/test_dtypes.py b/python/tests/test_dtypes.py index e155864..7dc974f 100644 --- a/python/tests/test_dtypes.py +++ b/python/tests/test_dtypes.py @@ -1,9 +1,10 @@ from __future__ import annotations from datetime import date, datetime -from typing import Any +from typing import Any, Literal import fastexcel +import numpy as np import pandas as pd import polars as pl import pytest @@ -190,3 +191,66 @@ def test_sheet_datetime_conversion( pl_df = sheet.to_polars() assert pl_df["Date"].dtype == expected_pl_dtype assert pl_df["Date"].to_list() == [expected] * 9 + + +@pytest.mark.parametrize("eager", [True, False]) +@pytest.mark.parametrize("dtype_coercion", ["coerce", None]) +def test_dtype_coercion_behavior__coerce( + dtype_coercion: Literal["coerce"] | None, eager: bool +) -> None: + excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx")) + + kwargs = {"dtype_coercion": dtype_coercion} if dtype_coercion else {} + sheet = ( + excel_reader.load_sheet_eager(0, **kwargs) # type:ignore[arg-type] + if eager + else excel_reader.load_sheet(0, **kwargs).to_arrow() # type:ignore[arg-type] + ) + + pd_df = sheet.to_pandas() + assert pd_df["Mixed dates"].dtype == "object" + assert pd_df["Mixed dates"].to_list() == ["2023-07-21 00:00:00"] * 6 + ["July 23rd"] * 3 + + pl_df = pl.from_arrow(data=sheet) + assert isinstance(pl_df, pl.DataFrame) + assert pl_df["Mixed dates"].dtype == pl.Utf8 + assert pl_df["Mixed dates"].to_list() == ["2023-07-21 00:00:00"] * 6 + ["July 23rd"] * 3 + + +@pytest.mark.parametrize("eager", [True, False]) +def test_dtype_coercion_behavior__strict_sampling_eveything(eager: bool) -> None: + excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx")) + + with pytest.raises( + fastexcel.UnsupportedColumnTypeCombinationError, match="type coercion is strict" + ): + if eager: + excel_reader.load_sheet_eager(0, dtype_coercion="strict") + else: + excel_reader.load_sheet(0, dtype_coercion="strict").to_arrow() + + +@pytest.mark.parametrize("eager", [True, False]) +def test_dtype_coercion_behavior__strict_sampling_limit(eager: bool) -> None: + excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx")) + + sheet = ( + excel_reader.load_sheet_eager(0, dtype_coercion="strict", schema_sample_rows=5) + if eager + else excel_reader.load_sheet(0, dtype_coercion="strict", schema_sample_rows=5).to_arrow() + ) + + pd_df = sheet.to_pandas() + assert pd_df["Mixed dates"].dtype == "datetime64[ms]" + assert ( + pd_df["Mixed dates"].to_list() == [pd.Timestamp("2023-07-21 00:00:00")] * 6 + [pd.NaT] * 3 + ) + assert pd_df["Asset ID"].dtype == "float64" + assert pd_df["Asset ID"].replace(np.nan, None).to_list() == [84444.0] * 7 + [None] * 2 + + pl_df = pl.from_arrow(data=sheet) + assert isinstance(pl_df, pl.DataFrame) + assert pl_df["Mixed dates"].dtype == pl.Datetime + assert pl_df["Mixed dates"].to_list() == [datetime(2023, 7, 21)] * 6 + [None] * 3 + assert pl_df["Asset ID"].dtype == pl.Float64 + assert pl_df["Asset ID"].to_list() == [84444.0] * 7 + [None] * 2 diff --git a/src/types/dtype.rs b/src/types/dtype.rs index 5be6596..e264949 100644 --- a/src/types/dtype.rs +++ b/src/types/dtype.rs @@ -101,6 +101,41 @@ impl From<&DType> for ArrowDataType { } } +#[derive(Debug, Clone, PartialEq, Eq, Hash, Copy)] +pub(crate) enum DTypeCoercion { + Coerce, + Strict, +} + +impl FromStr for DTypeCoercion { + type Err = FastExcelError; + + fn from_str(raw_dtype_coercion: &str) -> FastExcelResult { + match raw_dtype_coercion { + "coerce" => Ok(Self::Coerce), + "strict" => Ok(Self::Strict), + _ => Err(FastExcelErrorKind::InvalidParameters(format!( + "unsupported dtype_coercion: \"{raw_dtype_coercion}\"" + )) + .into()), + } + } +} + +impl FromPyObject<'_> for DTypeCoercion { + fn extract_bound(py_dtype_coercion: &Bound<'_, PyAny>) -> PyResult { + if let Ok(dtype_coercion_pystr) = py_dtype_coercion.extract::<&PyString>() { + dtype_coercion_pystr.to_str()?.parse() + } else { + Err(FastExcelErrorKind::InvalidParameters(format!( + "{py_dtype_coercion:?} cannot be converted to str" + )) + .into()) + } + .into_pyresult() + } +} + /// All the possible string values that should be considered as NULL const NULL_STRING_VALUES: [&str; 19] = [ "", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND", "1.#QNAN", @@ -203,6 +238,7 @@ pub(crate) fn get_dtype_for_column( start_row: usize, end_row: usize, col: usize, + dtype_coercion: &DTypeCoercion, ) -> FastExcelResult { let mut column_types = (start_row..end_row) .map(|row| get_cell_dtype(data, row, col)) @@ -214,6 +250,14 @@ pub(crate) fn get_dtype_for_column( if column_types.is_empty() { // If no type apart from NULL was found, it's a NULL column Ok(DType::Null) + } else if matches!(dtype_coercion, &DTypeCoercion::Strict) && column_types.len() != 1 { + // If dtype coercion is strict and we do not have a single dtype, it's an error + Err( + FastExcelErrorKind::UnsupportedColumnTypeCombination(format!( + "type coercion is strict and column contains {column_types:?}" + )) + .into(), + ) } else if column_types.len() == 1 { // If a single non-null type was found, return it Ok(column_types.into_iter().next().unwrap()) @@ -288,15 +332,65 @@ mod tests { #[case(7, 11, DType::Float)] // int + bool #[case(10, 12, DType::Int)] - fn get_arrow_column_type_multi_dtype_ok( + fn get_arrow_column_type_multi_dtype_ok_coerce( range: Range, #[case] start_row: usize, #[case] end_row: usize, #[case] expected: DType, ) { assert_eq!( - get_dtype_for_column(&range, start_row, end_row, 0).unwrap(), + get_dtype_for_column(&range, start_row, end_row, 0, &DTypeCoercion::Coerce).unwrap(), expected ); } + + #[rstest] + // pure bool + #[case(0, 2, DType::Bool)] + // pure int + #[case(3, 4, DType::Int)] + // pure float + #[case(4, 5, DType::Float)] + // pure string + #[case(5, 6, DType::String)] + // empty + null + int + #[case(6, 9, DType::Int)] + fn get_arrow_column_type_multi_dtype_ok_strict( + range: Range, + #[case] start_row: usize, + #[case] end_row: usize, + #[case] expected: DType, + ) { + assert_eq!( + get_dtype_for_column(&range, start_row, end_row, 0, &DTypeCoercion::Strict).unwrap(), + expected + ); + } + + #[rstest] + // pure int + float + #[case(3, 5)] + // float + string + #[case(4, 6)] + // int + float + string + #[case(3, 6)] + // null + int + float + string + empty + null + #[case(2, 8)] + // int + float + null + #[case(7, 10)] + // int + float + bool + null + #[case(7, 11)] + // int + bool + #[case(10, 12)] + fn get_arrow_column_type_multi_dtype_ko_strict( + range: Range, + #[case] start_row: usize, + #[case] end_row: usize, + ) { + let result = get_dtype_for_column(&range, start_row, end_row, 0, &DTypeCoercion::Strict); + assert!(matches!( + result.unwrap_err().kind, + FastExcelErrorKind::UnsupportedColumnTypeCombination(_) + )); + } } diff --git a/src/types/python/excelreader.rs b/src/types/python/excelreader.rs index c1468b8..1837fd9 100644 --- a/src/types/python/excelreader.rs +++ b/src/types/python/excelreader.rs @@ -17,7 +17,10 @@ use crate::{ error::{ py_errors::IntoPyResult, ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult, }, - types::{dtype::DTypeMap, idx_or_name::IdxOrName}, + types::{ + dtype::{DTypeCoercion, DTypeMap}, + idx_or_name::IdxOrName, + }, }; use crate::utils::schema::get_schema_sample_rows; @@ -108,6 +111,7 @@ impl ExcelReader { sample_rows: Option, selected_columns: &SelectedColumns, dtypes: Option<&DTypeMap>, + dtype_coercion: &DTypeCoercion, ) -> FastExcelResult { let offset = header.offset() + pagination.offset(); let limit = { @@ -129,6 +133,7 @@ impl ExcelReader { offset, sample_rows_limit, dtypes, + dtype_coercion, )?; let fields = available_columns @@ -150,6 +155,7 @@ impl ExcelReader { skip_rows: usize, n_rows: Option, schema_sample_rows: Option, + dtype_coercion: DTypeCoercion, use_columns: Option<&Bound<'_, PyAny>>, dtypes: Option, eager: bool, @@ -167,6 +173,7 @@ impl ExcelReader { schema_sample_rows, &selected_columns, dtypes.as_ref(), + &dtype_coercion, ) .into_pyresult() .and_then(|rb| rb.to_pyarrow(py)) @@ -179,6 +186,7 @@ impl ExcelReader { header, pagination, schema_sample_rows, + dtype_coercion, selected_columns, dtypes, ) @@ -224,6 +232,7 @@ impl ExcelReader { skip_rows = 0, n_rows = None, schema_sample_rows = 1_000, + dtype_coercion = DTypeCoercion::Coerce, use_columns = None, dtypes = None, eager = false, @@ -237,6 +246,7 @@ impl ExcelReader { skip_rows: usize, n_rows: Option, schema_sample_rows: Option, + dtype_coercion: DTypeCoercion, use_columns: Option<&Bound<'_, PyAny>>, dtypes: Option, eager: bool, @@ -278,6 +288,7 @@ impl ExcelReader { skip_rows, n_rows, schema_sample_rows, + dtype_coercion, use_columns, dtypes, eager, diff --git a/src/types/python/excelsheet/column_info.rs b/src/types/python/excelsheet/column_info.rs index 0947d22..d7bd903 100644 --- a/src/types/python/excelsheet/column_info.rs +++ b/src/types/python/excelsheet/column_info.rs @@ -8,7 +8,7 @@ use crate::{ py_errors::IntoPyResult, ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult, }, types::{ - dtype::{DType, DTypeMap}, + dtype::{DType, DTypeCoercion, DTypeMap}, idx_or_name::IdxOrName, }, }; @@ -237,6 +237,7 @@ impl ColumnInfoBuilder { start_row: usize, end_row: usize, specified_dtypes: Option<&DTypeMap>, + dtype_coercion: &DTypeCoercion, ) -> FastExcelResult<(DType, DTypeFrom)> { specified_dtypes .and_then(|dtypes| { @@ -253,7 +254,7 @@ impl ColumnInfoBuilder { .map(FastExcelResult::Ok) // If we could not look up a dtype, guess it from the data .unwrap_or_else(|| { - data.dtype_for_column(start_row, end_row, self.index) + data.dtype_for_column(start_row, end_row, self.index, dtype_coercion) .map(|dtype| (dtype, DTypeFrom::Guessed)) }) } @@ -264,9 +265,10 @@ impl ColumnInfoBuilder { start_row: usize, end_row: usize, specified_dtypes: Option<&DTypeMap>, + dtype_coercion: &DTypeCoercion, ) -> FastExcelResult { let (dtype, dtype_from) = self - .dtype_info(data, start_row, end_row, specified_dtypes) + .dtype_info(data, start_row, end_row, specified_dtypes, dtype_coercion) .with_context(|| format!("could not determine dtype for column {}", self.name))?; Ok(ColumnInfo::new( self.name, @@ -401,6 +403,7 @@ pub(crate) fn build_available_columns( start_row: usize, end_row: usize, specified_dtypes: Option<&DTypeMap>, + dtype_coercion: &DTypeCoercion, ) -> FastExcelResult> { let mut aliased_available_columns = Vec::with_capacity(available_columns_info.len()); @@ -414,7 +417,7 @@ pub(crate) fn build_available_columns( } aliased_available_columns.push(alias); // Setting the dtype info - column_info_builder.finish(data, start_row, end_row, specified_dtypes) + column_info_builder.finish(data, start_row, end_row, specified_dtypes, dtype_coercion) }) .collect() } diff --git a/src/types/python/excelsheet/mod.rs b/src/types/python/excelsheet/mod.rs index 5179d99..f274b68 100644 --- a/src/types/python/excelsheet/mod.rs +++ b/src/types/python/excelsheet/mod.rs @@ -18,7 +18,6 @@ use pyo3::{ Bound, PyAny, PyObject, PyResult, ToPyObject, }; -use crate::utils::schema::get_schema_sample_rows; use crate::{ error::{ py_errors::IntoPyResult, ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult, @@ -28,6 +27,7 @@ use crate::{ idx_or_name::IdxOrName, }, }; +use crate::{types::dtype::DTypeCoercion, utils::schema::get_schema_sample_rows}; use self::column_info::{build_available_columns, build_available_columns_info, ColumnInfo}; use self::sheet_data::{ @@ -345,7 +345,7 @@ pub(crate) struct ExcelSheet { total_height: Option, width: Option, schema_sample_rows: Option, - // selected_columns: SelectedColumns, + dtype_coercion: DTypeCoercion, selected_columns: Vec, available_columns: Vec, dtypes: Option, @@ -356,12 +356,14 @@ impl ExcelSheet { &self.data } + #[allow(clippy::too_many_arguments)] pub(crate) fn try_new( name: String, data: ExcelSheetData<'static>, header: Header, pagination: Pagination, schema_sample_rows: Option, + dtype_coercion: DTypeCoercion, selected_columns: SelectedColumns, dtypes: Option, ) -> FastExcelResult { @@ -373,6 +375,7 @@ impl ExcelSheet { pagination, data, schema_sample_rows, + dtype_coercion, dtypes, height: None, total_height: None, @@ -391,6 +394,7 @@ impl ExcelSheet { sheet.offset(), row_limit, sheet.dtypes.as_ref(), + &sheet.dtype_coercion, )?; let selected_columns = selected_columns.select_columns(&available_columns)?; diff --git a/src/types/python/excelsheet/sheet_data.rs b/src/types/python/excelsheet/sheet_data.rs index faa343c..5a4b337 100644 --- a/src/types/python/excelsheet/sheet_data.rs +++ b/src/types/python/excelsheet/sheet_data.rs @@ -5,7 +5,7 @@ use calamine::{Data as CalData, DataRef as CalDataRef, DataType, Range}; use crate::{ error::FastExcelResult, - types::dtype::{get_dtype_for_column, DType}, + types::dtype::{get_dtype_for_column, DType, DTypeCoercion}, }; pub(crate) enum ExcelSheetData<'r> { @@ -40,10 +40,15 @@ impl ExcelSheetData<'_> { start_row: usize, end_row: usize, col: usize, + dtype_coercion: &DTypeCoercion, ) -> FastExcelResult { match self { - ExcelSheetData::Owned(data) => get_dtype_for_column(data, start_row, end_row, col), - ExcelSheetData::Ref(data) => get_dtype_for_column(data, start_row, end_row, col), + ExcelSheetData::Owned(data) => { + get_dtype_for_column(data, start_row, end_row, col, dtype_coercion) + } + ExcelSheetData::Ref(data) => { + get_dtype_for_column(data, start_row, end_row, col, dtype_coercion) + } } } } diff --git a/test.py b/test.py old mode 100644 new mode 100755 index 5fa3c4a..c17600d --- a/test.py +++ b/test.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import fastexcel