Skip to content

Commit

Permalink
feat: fallback to string if dtype cannot be infered (#307)
Browse files Browse the repository at this point in the history
* feat: fallback to string if dtype cannot be infered

closes #277

* feat: add warning message

* test: fix for windows (we dont care about the path)
  • Loading branch information
PrettyWood authored Oct 21, 2024
1 parent 0fe8bc2 commit fb3d540
Show file tree
Hide file tree
Showing 8 changed files with 73 additions and 2 deletions.
19 changes: 19 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@ crate-type = ["cdylib"]
arrow = { version = "53.1.0", default-features = false, features = ["pyarrow"] }
calamine = { version = "0.26.1", features = ["dates"] }
chrono = { version = "0.4.38", default-features = false }
log = "0.4.22"
pyo3 = { version = "0.22.5", features = ["abi3-py38"] }
pyo3-log = "0.11.0"

[dev-dependencies]
pretty_assertions = "1.4.1"
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ warn_unused_ignores = true

[tool.pytest.ini_options]
testpaths = ["python/tests"]
log_cli = true
log_cli_level = "INFO"

[tool.ruff]
line-length = 100
Expand Down
Binary file added python/tests/fixtures/infer-dtypes-fallback.xlsx
Binary file not shown.
39 changes: 39 additions & 0 deletions python/tests/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,3 +314,42 @@ def test_one_dtype_for_all() -> None:
),
]
assert sheet.to_polars().dtypes == [pl.String] * 7


def test_fallback_infer_dtypes(mocker) -> None:
"""it should fallback to string if it can't infer the dtype"""
import logging

logger_instance_mock = mocker.patch("logging.getLogger", autospec=True).return_value

excel_reader = fastexcel.read_excel(path_for_fixture("infer-dtypes-fallback.xlsx"))
sheet = excel_reader.load_sheet(0)

# Ensure a warning message was logged to explain the fallback to string
logger_instance_mock.makeRecord.assert_called_once_with(
"fastexcel.types.dtype",
logging.WARNING,
mocker.ANY,
mocker.ANY,
"Could not determine dtype for column 1, falling back to string",
mocker.ANY,
mocker.ANY,
)

assert sheet.available_columns == [
fastexcel.ColumnInfo(
name="id",
index=0,
dtype="float",
dtype_from="guessed",
column_name_from="looked_up",
),
fastexcel.ColumnInfo(
name="label",
index=1,
dtype="string",
dtype_from="guessed",
column_name_from="looked_up",
),
]
assert sheet.to_polars().dtypes == [pl.Float64, pl.String]
2 changes: 2 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ fn get_version() -> String {

#[pymodule]
fn _fastexcel(m: &Bound<'_, PyModule>) -> PyResult<()> {
pyo3_log::init();

let py = m.py();
m.add_function(wrap_pyfunction!(read_excel, m)?)?;
m.add_class::<ColumnInfo>()?;
Expand Down
10 changes: 8 additions & 2 deletions src/types/dtype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use std::{

use arrow::datatypes::{DataType as ArrowDataType, TimeUnit};
use calamine::{CellErrorType, CellType, DataType, Range};
use log::warn;
use pyo3::{
prelude::PyAnyMethods, Bound, FromPyObject, PyAny, PyObject, PyResult, Python, ToPyObject,
};
Expand Down Expand Up @@ -281,8 +282,13 @@ pub(crate) fn get_dtype_for_column<DT: CellType + Debug + DataType>(
column_types.remove(&DType::Null);

if column_types.is_empty() {
// If no type apart from NULL was found, it's a NULL column
Ok(DType::Null)
// If no type apart from NULL was found, fallback to string except if the column is empty
if start_row == end_row {
Ok(DType::Null)
} else {
warn!("Could not determine dtype for column {col}, falling back to string");
Ok(DType::String)
}
} else if matches!(dtype_coercion, &DTypeCoercion::Strict) && column_types.len() != 1 {
// If dtype coercion is strict and we do not have a single dtype, it's an error
Err(
Expand Down
1 change: 1 addition & 0 deletions test-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,6 @@ openpyxl>=3.1.2,<4
pre-commit>=2.20.0,<5
pytest>=7.1.3
pytest-benchmark>=4.0.0,<5
pytest-mock>=3.1
ruff>=0.7,<0.8
xlrd>=2.0.1,<3

0 comments on commit fb3d540

Please sign in to comment.