Skip to content

Commit

Permalink
feat: add nullish strings (#182)
Browse files Browse the repository at this point in the history
  • Loading branch information
PrettyWood authored Feb 19, 2024
1 parent 7dd5058 commit 3d68ebc
Show file tree
Hide file tree
Showing 5 changed files with 110 additions and 23 deletions.
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ dependencies = [
pandas = ["pandas>=1.4.4"]
polars = ["polars>=0.16.14"]


[project.urls]
"Source Code" = "https://github.com/ToucanToco/fastexcel"
Issues = "https://github.com/ToucanToco/fastexcel"
Expand Down
Binary file not shown.
Binary file added python/tests/fixtures/sheet-null-strings.xlsx
Binary file not shown.
75 changes: 75 additions & 0 deletions python/tests/test_fastexcel.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from datetime import datetime

import fastexcel
import pandas as pd
import polars as pl
Expand Down Expand Up @@ -448,3 +450,76 @@ def test_sheet_with_na():
}
pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))


@pytest.mark.parametrize("excel_file", ["sheet-null-strings.xlsx", "sheet-null-strings-empty.xlsx"])
def test_null_strings(excel_file: str):
    """Load the first sheet of a null-string fixture and compare it to the expected frames.

    The sheet must expose 10 rows and 6 columns, and both its pandas and polars
    conversions must equal the expected data once the expected datetime columns
    have been cast to millisecond resolution.
    """
    first_sheet = fastexcel.read_excel(path_for_fixture(excel_file)).load_sheet(0)

    assert first_sheet.height == first_sheet.total_height == 10
    assert first_sheet.width == 6

    # Columns holding datetimes; the expected frames are cast to "ms" for these.
    datetime_columns = ("DATES_AND_NULLS", "TIMESTAMPS_AND_NULLS")

    expected_columns = {
        "FIRST_LABEL": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
        "SECOND_LABEL": ["AA", "BB", "CC", "DD", "EE", "FF", "GG", "HH", "II", "JJ"],
        "DATES_AND_NULLS": [
            None,
            None,
            None,
            datetime(2022, 12, 19, 0, 0),
            datetime(2022, 8, 26, 0, 0),
            datetime(2023, 5, 6, 0, 0),
            datetime(2023, 3, 20, 0, 0),
            datetime(2022, 8, 29, 0, 0),
            None,
            None,
        ],
        "TIMESTAMPS_AND_NULLS": [
            None,
            None,
            datetime(2023, 2, 18, 6, 13, 56, 730000),
            datetime(2022, 9, 20, 20, 0, 7, 50000),
            datetime(2022, 9, 24, 17, 4, 31, 236000),
            None,
            None,
            None,
            datetime(2022, 9, 14, 1, 50, 58, 390000),
            datetime(2022, 10, 21, 17, 20, 12, 223000),
        ],
        "INTS_AND_NULLS": [
            2076.0,
            2285.0,
            39323.0,
            None,
            None,
            None,
            11953.0,
            None,
            30192.0,
            None,
        ],
        "FLOATS_AND_NULLS": [
            141.02023312814603,
            778.0655928608671,
            None,
            497.60307287584106,
            627.446112513911,
            None,
            None,
            None,
            488.3509486743364,
            None,
        ],
    }

    # pandas: build the expected frame, then cast each datetime column to "ms".
    expected_pd = pd.DataFrame(expected_columns)
    for column in datetime_columns:
        expected_pd[column] = expected_pd[column].dt.as_unit("ms")
    pd_assert_frame_equal(first_sheet.to_pandas(), expected_pd)

    # polars: same cast, expressed as one expression per datetime column.
    ms_casts = [pl.col(column).dt.cast_time_unit("ms") for column in datetime_columns]
    expected_pl = pl.DataFrame(expected_columns).with_columns(*ms_casts)
    pl_assert_frame_equal(first_sheet.to_polars(), expected_pl)
57 changes: 35 additions & 22 deletions src/utils/arrow.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,23 @@ use anyhow::{anyhow, Context, Result};
use arrow::datatypes::{DataType as ArrowDataType, Field, Schema, TimeUnit};
use calamine::{CellErrorType, Data as CalData, DataType, Range};

/// String cell contents that are treated as NULL rather than as text.
///
/// `get_cell_type` maps a string cell to `ArrowDataType::Null` when its value
/// is found in this list. The lookup is an exact, case-sensitive comparison,
/// so every accepted spelling is listed explicitly.
///
/// NOTE(review): the entries look like pandas' default `na_values` set;
/// confirm before relying on parity.
const NULL_STRING_VALUES: [&str; 19] = [
    "",
    "#N/A",
    "#N/A N/A",
    "#NA",
    "-1.#IND",
    "-1.#QNAN",
    "-NaN",
    "-nan",
    "1.#IND",
    "1.#QNAN",
    "<NA>",
    "N/A",
    "NA",
    "NULL",
    "NaN",
    "None",
    "n/a",
    "nan",
    "null",
];

fn get_cell_type(data: &Range<CalData>, row: usize, col: usize) -> Result<ArrowDataType> {
let cell = data
.get((row, col))
.with_context(|| format!("Could not retrieve data at ({row},{col})"))?;
match cell {
CalData::Int(_) => Ok(ArrowDataType::Int64),
CalData::Float(_) => Ok(ArrowDataType::Float64),
CalData::String(_) => Ok(ArrowDataType::Utf8),
CalData::String(v) => match v {
v if NULL_STRING_VALUES.contains(&v.as_str()) => Ok(ArrowDataType::Null),
_ => Ok(ArrowDataType::Utf8),
},
CalData::Bool(_) => Ok(ArrowDataType::Boolean),
// Since calamine 0.24.0, a new ExcelDateTime exists for the Datetime type. It can either be
// a duration or a datetime
Expand Down Expand Up @@ -148,42 +157,46 @@ mod tests {
// First column
Cell::new((0, 0), CalData::Bool(true)),
Cell::new((1, 0), CalData::Bool(false)),
Cell::new((2, 0), CalData::Int(42)),
Cell::new((3, 0), CalData::Float(13.37)),
Cell::new((4, 0), CalData::String("hello".to_string())),
Cell::new((5, 0), CalData::Empty),
Cell::new((6, 0), CalData::Int(12)),
Cell::new((7, 0), CalData::Float(12.21)),
Cell::new((8, 0), CalData::Bool(true)),
Cell::new((9, 0), CalData::Int(1337)),
Cell::new((2, 0), CalData::String("NULL".to_string())),
Cell::new((3, 0), CalData::Int(42)),
Cell::new((4, 0), CalData::Float(13.37)),
Cell::new((5, 0), CalData::String("hello".to_string())),
Cell::new((6, 0), CalData::Empty),
Cell::new((7, 0), CalData::String("#N/A".to_string())),
Cell::new((8, 0), CalData::Int(12)),
Cell::new((9, 0), CalData::Float(12.21)),
Cell::new((10, 0), CalData::Bool(true)),
Cell::new((11, 0), CalData::Int(1337)),
])
}

#[rstest]
// pure bool
#[case(0, 2, ArrowDataType::Boolean)]
// pure int
#[case(2, 3, ArrowDataType::Int64)]
#[case(3, 4, ArrowDataType::Int64)]
// pure float
#[case(3, 4, ArrowDataType::Float64)]
#[case(4, 5, ArrowDataType::Float64)]
// pure string
#[case(4, 5, ArrowDataType::Utf8)]
#[case(5, 6, ArrowDataType::Utf8)]
// pure int + float
#[case(2, 4, ArrowDataType::Float64)]
#[case(3, 5, ArrowDataType::Float64)]
// null + int + float
#[case(2, 5, ArrowDataType::Float64)]
// float + string
#[case(3, 5, ArrowDataType::Utf8)]
#[case(4, 6, ArrowDataType::Utf8)]
// int + float + string
#[case(2, 5, ArrowDataType::Utf8)]
// int + float + string + empty
#[case(2, 6, ArrowDataType::Utf8)]
// int + null
#[case(5, 7, ArrowDataType::Int64)]
#[case(3, 6, ArrowDataType::Utf8)]
// null + int + float + string + empty + null
#[case(2, 8, ArrowDataType::Utf8)]
// empty + null + int
#[case(6, 9, ArrowDataType::Int64)]
// int + float + null
#[case(5, 8, ArrowDataType::Float64)]
#[case(7, 10, ArrowDataType::Float64)]
// int + float + bool + null
#[case(5, 9, ArrowDataType::Float64)]
#[case(7, 11, ArrowDataType::Float64)]
// int + bool
#[case(8, 10, ArrowDataType::Int64)]
#[case(10, 12, ArrowDataType::Int64)]
fn get_arrow_column_type_multi_dtype_ok(
range: Range<CalData>,
#[case] start_row: usize,
Expand Down

0 comments on commit 3d68ebc

Please sign in to comment.