feat: added support for a dtypes parameter (#195)

Signed-off-by: Luka Peschke <[email protected]>
ToucanToco · Mar 4, 2024 · 5b70648 · 5b70648
1 parent 9680faf
commit 5b70648
Show file tree

Hide file tree

Showing 8 changed files with 366 additions and 11 deletions.
diff --git a/python/fastexcel/__init__.py b/python/fastexcel/__init__.py
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Literal
+
+from typing_extensions import TypeAlias
 
 if TYPE_CHECKING:
     import pandas as pd
@@ -27,6 +29,9 @@
 )
 from ._fastexcel import read_excel as _read_excel
 
+DType = Literal["null", "int", "float", "string", "boolean", "datetime", "date", "duration"]
+DTypeMap: TypeAlias = "dict[str, DType] | dict[int, DType]"
+
 
 class ExcelSheet:
     """A class representing a single sheet in an Excel File"""
@@ -64,6 +69,11 @@ def available_columns(self) -> list[str]:
         """The columns available for the given sheet"""
         return self._sheet.available_columns
 
+    @property
+    def specified_dtypes(self) -> DTypeMap | None:
+        """The dtypes specified for the sheet"""
+        return self._sheet.specified_dtypes
+
     def to_arrow(self) -> pa.RecordBatch:
         """Converts the sheet to a pyarrow `RecordBatch`"""
         return self._sheet.to_arrow()
@@ -112,6 +122,7 @@ def load_sheet_by_name(
         n_rows: int | None = None,
         schema_sample_rows: int | None = 1_000,
         use_columns: list[str] | list[int] | str | None = None,
+        dtypes: DTypeMap | None = None,
     ) -> ExcelSheet:
         """Loads a sheet by name.
 
@@ -135,6 +146,8 @@ def load_sheet_by_name(
                             - a string, a comma separated list of Excel column letters and column
                               ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in
                               `A,B,C,D,E` and `A,C,E,F`)
+        :param dtypes: An optional dict of dtypes. Keys can either be indices (in case `use_columns`
+                       is a list of ints or an Excel range), or column names
         """
         return ExcelSheet(
             self._reader.load_sheet_by_name(
@@ -145,6 +158,7 @@ def load_sheet_by_name(
                 n_rows=n_rows,
                 schema_sample_rows=schema_sample_rows,
                 use_columns=use_columns,
+                dtypes=dtypes,
             )
         )
 
@@ -158,6 +172,7 @@ def load_sheet_by_idx(
         n_rows: int | None = None,
         schema_sample_rows: int | None = 1_000,
         use_columns: list[str] | list[int] | str | None = None,
+        dtypes: DTypeMap | None = None,
     ) -> ExcelSheet:
         """Loads a sheet by index.
 
@@ -181,6 +196,8 @@ def load_sheet_by_idx(
                             - a string, a comma separated list of Excel column letters and column
                               ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in
                               `A,B,C,D,E` and `A,C,E,F`)
+        :param dtypes: An optional dict of dtypes. Keys can either be indices (in case `use_columns`
+                       is a list of ints or an Excel range), or column names
         """
         if idx < 0:
             raise ValueError(f"Expected idx to be > 0, got {idx}")
@@ -193,6 +210,7 @@ def load_sheet_by_idx(
                 n_rows=n_rows,
                 schema_sample_rows=schema_sample_rows,
                 use_columns=use_columns,
+                dtypes=dtypes,
             )
         )
 
@@ -206,6 +224,7 @@ def load_sheet(
         n_rows: int | None = None,
         schema_sample_rows: int | None = 1_000,
         use_columns: list[str] | list[int] | str | None = None,
+        dtypes: DTypeMap | None = None,
     ) -> ExcelSheet:
         """Loads a sheet by name if a string is passed or by index if an integer is passed.
 
@@ -220,6 +239,7 @@ def load_sheet(
                 n_rows=n_rows,
                 schema_sample_rows=schema_sample_rows,
                 use_columns=use_columns,
+                dtypes=dtypes,
             )
             if isinstance(idx_or_name, int)
             else self.load_sheet_by_name(
@@ -230,6 +250,7 @@ def load_sheet(
                 n_rows=n_rows,
                 schema_sample_rows=schema_sample_rows,
                 use_columns=use_columns,
+                dtypes=dtypes,
             )
         )
 

diff --git a/python/fastexcel/_fastexcel.pyi b/python/fastexcel/_fastexcel.pyi
@@ -1,7 +1,13 @@
 from __future__ import annotations
 
+from typing import Literal
+
 import pyarrow as pa
 
+_DType = Literal["null", "int", "float", "string", "boolean", "datetime", "date", "duration"]
+
+_DTypeMap = dict[str, _DType] | dict[int, _DType]
+
 class _ExcelSheet:
     @property
     def name(self) -> str:
@@ -24,6 +30,9 @@ class _ExcelSheet:
     @property
     def available_columns(self) -> list[str]:
         """The columns available for the given sheet"""
+    @property
+    def specified_dtypes(self) -> _DTypeMap | None:
+        """The dtypes specified for the sheet"""
     def to_arrow(self) -> pa.RecordBatch:
         """Converts the sheet to a pyarrow `RecordBatch`"""
 
@@ -40,6 +49,7 @@ class _ExcelReader:
         n_rows: int | None = None,
         schema_sample_rows: int | None = 1_000,
         use_columns: list[str] | list[int] | str | None = None,
+        dtypes: _DTypeMap | None = None,
     ) -> _ExcelSheet: ...
     def load_sheet_by_idx(
         self,
@@ -51,6 +61,7 @@ class _ExcelReader:
         n_rows: int | None = None,
         schema_sample_rows: int | None = 1_000,
         use_columns: list[str] | list[int] | str | None = None,
+        dtypes: _DTypeMap | None = None,
     ) -> _ExcelSheet: ...
     @property
     def sheet_names(self) -> list[str]: ...

diff --git a/python/tests/test_dtypes.py b/python/tests/test_dtypes.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from datetime import datetime
+from datetime import date, datetime
 from typing import Any
 
 import fastexcel
@@ -97,3 +97,79 @@ def test_sheet_with_mixed_dtypes_and_sample_rows(expected_data: dict[str, list[A
     pl_assert_frame_equal(
         pl_df, pl.DataFrame(expected_data, schema_overrides={"Date": pl.Datetime(time_unit="ms")})
     )
+
+
+@pytest.mark.parametrize("dtype_by_index", (True, False))
+@pytest.mark.parametrize(
+    "dtype,expected_data,expected_pd_dtype,expected_pl_dtype",
+    [
+        ("int", [123456, 44333, 44333, 87878, 87878], "int64", pl.Int64),
+        ("float", [123456.0, 44333.0, 44333.0, 87878.0, 87878.0], "float64", pl.Float64),
+        ("string", ["123456", "44333", "44333", "87878", "87878"], "object", pl.Utf8),
+        ("boolean", [True] * 5, "bool", pl.Boolean),
+        (
+            "datetime",
+            [datetime(2238, 1, 3)] + [datetime(2021, 5, 17)] * 2 + [datetime(2140, 8, 6)] * 2,
+            "datetime64[ms]",
+            pl.Datetime,
+        ),
+        (
+            "date",
+            [date(2238, 1, 3)] + [date(2021, 5, 17)] * 2 + [date(2140, 8, 6)] * 2,
+            "object",
+            pl.Date,
+        ),
+        # Conversion to duration is not supported yet: all values come back as nulls
+        ("duration", [pd.NaT] * 5, "timedelta64[ms]", pl.Duration),
+    ],
+)
+def test_sheet_with_mixed_dtypes_specify_dtypes(
+    dtype_by_index: bool,
+    dtype: fastexcel.DType,
+    expected_data: list[Any],
+    expected_pd_dtype: str,
+    expected_pl_dtype: pl.DataType,
+) -> None:
+    dtypes: fastexcel.DTypeMap = {0: dtype} if dtype_by_index else {"Employee ID": dtype}  # type:ignore[dict-item]
+    excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))
+    sheet = excel_reader.load_sheet(0, dtypes=dtypes, n_rows=5)
+    assert sheet.specified_dtypes == dtypes
+
+    pd_df = sheet.to_pandas()
+    assert pd_df["Employee ID"].dtype == expected_pd_dtype
+    assert pd_df["Employee ID"].to_list() == expected_data
+
+    pl_df = sheet.to_polars()
+    assert pl_df["Employee ID"].dtype == expected_pl_dtype
+    assert pl_df["Employee ID"].to_list() == (expected_data if dtype != "duration" else [None] * 5)
+
+
+@pytest.mark.parametrize(
+    "dtypes,expected,expected_pd_dtype,expected_pl_dtype",
+    [
+        (None, datetime(2023, 7, 21), "datetime64[ms]", pl.Datetime),
+        ({"Date": "datetime"}, datetime(2023, 7, 21), "datetime64[ms]", pl.Datetime),
+        ({"Date": "date"}, date(2023, 7, 21), "object", pl.Date),
+        ({"Date": "string"}, "2023-07-21 00:00:00", "object", pl.Utf8),
+        ({2: "datetime"}, datetime(2023, 7, 21), "datetime64[ms]", pl.Datetime),
+        ({2: "date"}, date(2023, 7, 21), "object", pl.Date),
+        ({2: "string"}, "2023-07-21 00:00:00", "object", pl.Utf8),
+    ],
+)
+def test_sheet_datetime_conversion(
+    dtypes: fastexcel.DTypeMap | None,
+    expected: Any,
+    expected_pd_dtype: str,
+    expected_pl_dtype: pl.DataType,
+) -> None:
+    excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))
+
+    sheet = excel_reader.load_sheet(0, dtypes=dtypes)
+    assert sheet.specified_dtypes == dtypes
+    pd_df = sheet.to_pandas()
+    assert pd_df["Date"].dtype == expected_pd_dtype
+    assert pd_df["Date"].to_list() == [expected] * 9
+
+    pl_df = sheet.to_polars()
+    assert pl_df["Date"].dtype == expected_pl_dtype
+    assert pl_df["Date"].to_list() == [expected] * 9
diff --git a/src/types/dtype.rs b/src/types/dtype.rs
@@ -0,0 +1,164 @@
+use std::{collections::HashMap, str::FromStr};
+
+use arrow::datatypes::{DataType as ArrowDataType, TimeUnit};
+use pyo3::{
+    types::{IntoPyDict, PyDict},
+    PyObject, Python, ToPyObject,
+};
+
+use crate::error::{FastExcelError, FastExcelErrorKind, FastExcelResult};
+
+#[derive(Debug)]
+pub(crate) enum DType {
+    Null,
+    Int,
+    Float,
+    String,
+    Bool,
+    DateTime,
+    Date,
+    Duration,
+}
+
+impl FromStr for DType {
+    type Err = FastExcelError;
+
+    fn from_str(raw_dtype: &str) -> FastExcelResult<Self> {
+        match raw_dtype {
+            "null" => Ok(Self::Null),
+            "int" => Ok(Self::Int),
+            "float" => Ok(Self::Float),
+            "string" => Ok(Self::String),
+            "boolean" => Ok(Self::Bool),
+            "datetime" => Ok(Self::DateTime),
+            "date" => Ok(Self::Date),
+            "duration" => Ok(Self::Duration),
+            _ => Err(FastExcelErrorKind::InvalidParameters(format!(
+                "unsupported dtype: \"{raw_dtype}\""
+            ))
+            .into()),
+        }
+    }
+}
+
+impl ToPyObject for DType {
+    fn to_object(&self, py: Python<'_>) -> PyObject {
+        match self {
+            DType::Null => "null",
+            DType::Int => "int",
+            DType::Float => "float",
+            DType::String => "string",
+            DType::Bool => "boolean",
+            DType::DateTime => "datetime",
+            DType::Date => "date",
+            DType::Duration => "duration",
+        }
+        .to_object(py)
+    }
+}
+
+#[derive(Debug)]
+pub(crate) enum DTypeMap {
+    ByIndex(HashMap<usize, DType>),
+    ByName(HashMap<String, DType>),
+}
+
+impl DTypeMap {
+    pub(crate) fn dtype_for_col_name(&self, col_name: &String) -> Option<&DType> {
+        match self {
+            DTypeMap::ByName(name_map) => name_map.get(col_name),
+            _ => None,
+        }
+    }
+
+    pub(crate) fn dtype_for_col_idx(&self, col_idx: usize) -> Option<&DType> {
+        match self {
+            DTypeMap::ByIndex(idx_map) => idx_map.get(&col_idx),
+            _ => None,
+        }
+    }
+}
+
+impl<S: AsRef<str>> TryFrom<HashMap<usize, S>> for DTypeMap {
+    type Error = FastExcelError;
+
+    fn try_from(value: HashMap<usize, S>) -> FastExcelResult<Self> {
+        value
+            .into_iter()
+            .map(|(column, raw_dtype)| {
+                raw_dtype
+                    .as_ref()
+                    .parse()
+                    .map(|raw_dtype| (column, raw_dtype))
+            })
+            .collect::<FastExcelResult<HashMap<_, _>>>()
+            .map(Self::ByIndex)
+    }
+}
+
+impl<S: AsRef<str>> TryFrom<HashMap<String, S>> for DTypeMap {
+    type Error = FastExcelError;
+
+    fn try_from(value: HashMap<String, S>) -> FastExcelResult<Self> {
+        value
+            .into_iter()
+            .map(|(column, raw_dtype)| {
+                raw_dtype
+                    .as_ref()
+                    .parse()
+                    .map(|raw_dtype| (column, raw_dtype))
+            })
+            .collect::<FastExcelResult<HashMap<_, _>>>()
+            .map(Self::ByName)
+    }
+}
+
+impl TryFrom<&PyDict> for DTypeMap {
+    type Error = FastExcelError;
+
+    fn try_from(py_dict: &PyDict) -> FastExcelResult<Self> {
+        if let Ok(string_map) = py_dict.extract::<HashMap<String, &str>>() {
+            string_map.try_into()
+        } else if let Ok(string_map) = py_dict.extract::<HashMap<usize, &str>>() {
+            string_map.try_into()
+        } else {
+            Err(FastExcelErrorKind::InvalidParameters(format!(
+                "unsupported dtype map: {py_dict:?}"
+            ))
+            .into())
+        }
+    }
+}
+
+impl From<&DType> for ArrowDataType {
+    fn from(dtype: &DType) -> Self {
+        match dtype {
+            DType::Null => ArrowDataType::Null,
+            DType::Int => ArrowDataType::Int64,
+            DType::Float => ArrowDataType::Float64,
+            DType::String => ArrowDataType::Utf8,
+            DType::Bool => ArrowDataType::Boolean,
+            DType::DateTime => ArrowDataType::Timestamp(TimeUnit::Millisecond, None),
+            DType::Date => ArrowDataType::Date32,
+            DType::Duration => ArrowDataType::Duration(TimeUnit::Millisecond),
+        }
+    }
+}
+
+impl ToPyObject for DTypeMap {
+    fn to_object(&self, py: Python<'_>) -> PyObject {
+        match self {
+            DTypeMap::ByIndex(idx_map) => idx_map
+                .iter()
+                .map(|(k, v)| (k, v.to_object(py)))
+                .into_py_dict(py)
+                .into(),
+
+            DTypeMap::ByName(name_map) => name_map
+                .iter()
+                .map(|(k, v)| (k, v.to_object(py)))
+                .into_py_dict(py)
+                .into(),
+        }
+    }
+}