feat: add nullish strings (#182)

ToucanToco · Feb 19, 2024 · 3d68ebc · 3d68ebc
1 parent 7dd5058
commit 3d68ebc
Show file tree

Hide file tree

Showing 5 changed files with 110 additions and 23 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -17,7 +17,6 @@ dependencies = [
 pandas = ["pandas>=1.4.4"]
 polars = ["polars>=0.16.14"]
 
-
 [project.urls]
 "Source Code" = "https://github.com/ToucanToco/fastexcel"
 Issues = "https://github.com/ToucanToco/fastexcel"

diff --git a/python/tests/fixtures/sheet-null-strings-empty.xlsx b/python/tests/fixtures/sheet-null-strings-empty.xlsx
diff --git a/python/tests/fixtures/sheet-null-strings.xlsx b/python/tests/fixtures/sheet-null-strings.xlsx
diff --git a/python/tests/test_fastexcel.py b/python/tests/test_fastexcel.py
@@ -1,3 +1,5 @@
+from datetime import datetime
+
 import fastexcel
 import pandas as pd
 import polars as pl
@@ -448,3 +450,76 @@ def test_sheet_with_na():
     }
     pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
     pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
+
+
+@pytest.mark.parametrize("excel_file", ["sheet-null-strings.xlsx", "sheet-null-strings-empty.xlsx"])
+def test_null_strings(excel_file: str):
+    excel_reader = fastexcel.read_excel(path_for_fixture(excel_file))
+    sheet = excel_reader.load_sheet(0)
+
+    assert sheet.height == sheet.total_height == 10
+    assert sheet.width == 6
+
+    expected = {
+        "FIRST_LABEL": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
+        "SECOND_LABEL": ["AA", "BB", "CC", "DD", "EE", "FF", "GG", "HH", "II", "JJ"],
+        "DATES_AND_NULLS": [
+            None,
+            None,
+            None,
+            datetime(2022, 12, 19, 0, 0),
+            datetime(2022, 8, 26, 0, 0),
+            datetime(2023, 5, 6, 0, 0),
+            datetime(2023, 3, 20, 0, 0),
+            datetime(2022, 8, 29, 0, 0),
+            None,
+            None,
+        ],
+        "TIMESTAMPS_AND_NULLS": [
+            None,
+            None,
+            datetime(2023, 2, 18, 6, 13, 56, 730000),
+            datetime(2022, 9, 20, 20, 0, 7, 50000),
+            datetime(2022, 9, 24, 17, 4, 31, 236000),
+            None,
+            None,
+            None,
+            datetime(2022, 9, 14, 1, 50, 58, 390000),
+            datetime(2022, 10, 21, 17, 20, 12, 223000),
+        ],
+        "INTS_AND_NULLS": [
+            2076.0,
+            2285.0,
+            39323.0,
+            None,
+            None,
+            None,
+            11953.0,
+            None,
+            30192.0,
+            None,
+        ],
+        "FLOATS_AND_NULLS": [
+            141.02023312814603,
+            778.0655928608671,
+            None,
+            497.60307287584106,
+            627.446112513911,
+            None,
+            None,
+            None,
+            488.3509486743364,
+            None,
+        ],
+    }
+
+    pd_df = pd.DataFrame(expected)
+    pd_df["DATES_AND_NULLS"] = pd_df["DATES_AND_NULLS"].dt.as_unit("ms")
+    pd_df["TIMESTAMPS_AND_NULLS"] = pd_df["TIMESTAMPS_AND_NULLS"].dt.as_unit("ms")
+    pd_assert_frame_equal(sheet.to_pandas(), pd_df)
+
+    pl_df = pl.DataFrame(expected).with_columns(
+        pl.col("DATES_AND_NULLS").dt.cast_time_unit("ms"),
+        pl.col("TIMESTAMPS_AND_NULLS").dt.cast_time_unit("ms"),
+    )
+    pl_assert_frame_equal(sheet.to_polars(), pl_df)
diff --git a/src/utils/arrow.rs b/src/utils/arrow.rs
@@ -4,14 +4,23 @@ use anyhow::{anyhow, Context, Result};
 use arrow::datatypes::{DataType as ArrowDataType, Field, Schema, TimeUnit};
 use calamine::{CellErrorType, Data as CalData, DataType, Range};
 
+/// All the possible string values that should be considered as NULL
+const NULL_STRING_VALUES: [&str; 19] = [
+    "", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND", "1.#QNAN",
+    "<NA>", "N/A", "NA", "NULL", "NaN", "None", "n/a", "nan", "null",
+];
+
 fn get_cell_type(data: &Range<CalData>, row: usize, col: usize) -> Result<ArrowDataType> {
     let cell = data
         .get((row, col))
         .with_context(|| format!("Could not retrieve data at ({row},{col})"))?;
     match cell {
         CalData::Int(_) => Ok(ArrowDataType::Int64),
         CalData::Float(_) => Ok(ArrowDataType::Float64),
-        CalData::String(_) => Ok(ArrowDataType::Utf8),
+        CalData::String(v) => match v {
+            v if NULL_STRING_VALUES.contains(&v.as_str()) => Ok(ArrowDataType::Null),
+            _ => Ok(ArrowDataType::Utf8),
+        },
         CalData::Bool(_) => Ok(ArrowDataType::Boolean),
         // Since calamine 0.24.0, a new ExcelDateTime exists for the Datetime type. It can either be
         // a duration or a datetime
@@ -148,42 +157,46 @@ mod tests {
             // First column
             Cell::new((0, 0), CalData::Bool(true)),
             Cell::new((1, 0), CalData::Bool(false)),
-            Cell::new((2, 0), CalData::Int(42)),
-            Cell::new((3, 0), CalData::Float(13.37)),
-            Cell::new((4, 0), CalData::String("hello".to_string())),
-            Cell::new((5, 0), CalData::Empty),
-            Cell::new((6, 0), CalData::Int(12)),
-            Cell::new((7, 0), CalData::Float(12.21)),
-            Cell::new((8, 0), CalData::Bool(true)),
-            Cell::new((9, 0), CalData::Int(1337)),
+            Cell::new((2, 0), CalData::String("NULL".to_string())),
+            Cell::new((3, 0), CalData::Int(42)),
+            Cell::new((4, 0), CalData::Float(13.37)),
+            Cell::new((5, 0), CalData::String("hello".to_string())),
+            Cell::new((6, 0), CalData::Empty),
+            Cell::new((7, 0), CalData::String("#N/A".to_string())),
+            Cell::new((8, 0), CalData::Int(12)),
+            Cell::new((9, 0), CalData::Float(12.21)),
+            Cell::new((10, 0), CalData::Bool(true)),
+            Cell::new((11, 0), CalData::Int(1337)),
         ])
     }
 
     #[rstest]
     // pure bool
     #[case(0, 2, ArrowDataType::Boolean)]
     // pure int
-    #[case(2, 3, ArrowDataType::Int64)]
+    #[case(3, 4, ArrowDataType::Int64)]
     // pure float
-    #[case(3, 4, ArrowDataType::Float64)]
+    #[case(4, 5, ArrowDataType::Float64)]
     // pure string
-    #[case(4, 5, ArrowDataType::Utf8)]
+    #[case(5, 6, ArrowDataType::Utf8)]
     // pure int + float
-    #[case(2, 4, ArrowDataType::Float64)]
+    #[case(3, 5, ArrowDataType::Float64)]
+    // null + int + float
+    #[case(2, 5, ArrowDataType::Float64)]
     // float + string
-    #[case(3, 5, ArrowDataType::Utf8)]
+    #[case(4, 6, ArrowDataType::Utf8)]
     // int + float + string
-    #[case(2, 5, ArrowDataType::Utf8)]
-    // int + float + string + empty
-    #[case(2, 6, ArrowDataType::Utf8)]
-    // int + null
-    #[case(5, 7, ArrowDataType::Int64)]
+    #[case(3, 6, ArrowDataType::Utf8)]
+    // null + int + float + string + empty + null
+    #[case(2, 8, ArrowDataType::Utf8)]
+    // empty + null + int
+    #[case(6, 9, ArrowDataType::Int64)]
     // int + float + null
-    #[case(5, 8, ArrowDataType::Float64)]
+    #[case(7, 10, ArrowDataType::Float64)]
     // int + float + bool + null
-    #[case(5, 9, ArrowDataType::Float64)]
+    #[case(7, 11, ArrowDataType::Float64)]
     // int + bool
-    #[case(8, 10, ArrowDataType::Int64)]
+    #[case(10, 12, ArrowDataType::Int64)]
     fn get_arrow_column_type_multi_dtype_ok(
         range: Range<CalData>,
         #[case] start_row: usize,