Skip to content

Commit

Permalink
feat: added support for a dtypes parameter (#195)
Browse files Browse the repository at this point in the history
Signed-off-by: Luka Peschke <[email protected]>
  • Loading branch information
lukapeschke authored Mar 4, 2024
1 parent 9680faf commit 5b70648
Show file tree
Hide file tree
Showing 8 changed files with 366 additions and 11 deletions.
23 changes: 22 additions & 1 deletion python/fastexcel/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from __future__ import annotations

from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Literal

from typing_extensions import TypeAlias

if TYPE_CHECKING:
import pandas as pd
Expand All @@ -27,6 +29,9 @@
)
from ._fastexcel import read_excel as _read_excel

DType = Literal["null", "int", "float", "string", "boolean", "datetime", "date", "duration"]
DTypeMap: TypeAlias = "dict[str, DType] | dict[int, DType]"


class ExcelSheet:
"""A class representing a single sheet in an Excel File"""
Expand Down Expand Up @@ -64,6 +69,11 @@ def available_columns(self) -> list[str]:
"""The columns available for the given sheet"""
return self._sheet.available_columns

@property
def specified_dtypes(self) -> DTypeMap | None:
"""The dtypes specified for the sheet"""
return self._sheet.specified_dtypes

def to_arrow(self) -> pa.RecordBatch:
"""Converts the sheet to a pyarrow `RecordBatch`"""
return self._sheet.to_arrow()
Expand Down Expand Up @@ -112,6 +122,7 @@ def load_sheet_by_name(
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | str | None = None,
dtypes: DTypeMap | None = None,
) -> ExcelSheet:
"""Loads a sheet by name.
Expand All @@ -135,6 +146,8 @@ def load_sheet_by_name(
- a string, a comma separated list of Excel column letters and column
ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in
`A,B,C,D,E` and `A,C,E,F`)
:param dtypes: An optional dict of dtypes. Keys can either be indices (in case `use_columns`
is a list of ints or an Excel range), or column names
"""
return ExcelSheet(
self._reader.load_sheet_by_name(
Expand All @@ -145,6 +158,7 @@ def load_sheet_by_name(
n_rows=n_rows,
schema_sample_rows=schema_sample_rows,
use_columns=use_columns,
dtypes=dtypes,
)
)

Expand All @@ -158,6 +172,7 @@ def load_sheet_by_idx(
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | str | None = None,
dtypes: DTypeMap | None = None,
) -> ExcelSheet:
"""Loads a sheet by index.
Expand All @@ -181,6 +196,8 @@ def load_sheet_by_idx(
- a string, a comma separated list of Excel column letters and column
ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in
`A,B,C,D,E` and `A,C,E,F`)
:param dtypes: An optional dict of dtypes. Keys can either be indices (in case `use_columns`
is a list of ints or an Excel range), or column names
"""
if idx < 0:
raise ValueError(f"Expected idx to be > 0, got {idx}")
Expand All @@ -193,6 +210,7 @@ def load_sheet_by_idx(
n_rows=n_rows,
schema_sample_rows=schema_sample_rows,
use_columns=use_columns,
dtypes=dtypes,
)
)

Expand All @@ -206,6 +224,7 @@ def load_sheet(
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | str | None = None,
dtypes: DTypeMap | None = None,
) -> ExcelSheet:
"""Loads a sheet by name if a string is passed or by index if an integer is passed.
Expand All @@ -220,6 +239,7 @@ def load_sheet(
n_rows=n_rows,
schema_sample_rows=schema_sample_rows,
use_columns=use_columns,
dtypes=dtypes,
)
if isinstance(idx_or_name, int)
else self.load_sheet_by_name(
Expand All @@ -230,6 +250,7 @@ def load_sheet(
n_rows=n_rows,
schema_sample_rows=schema_sample_rows,
use_columns=use_columns,
dtypes=dtypes,
)
)

Expand Down
11 changes: 11 additions & 0 deletions python/fastexcel/_fastexcel.pyi
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
from __future__ import annotations

from typing import Literal

import pyarrow as pa

_DType = Literal["null", "int", "float", "string", "boolean", "datetime", "date", "duration"]

_DTypeMap = dict[str, _DType] | dict[int, _DType]

class _ExcelSheet:
@property
def name(self) -> str:
Expand All @@ -24,6 +30,9 @@ class _ExcelSheet:
@property
def available_columns(self) -> list[str]:
"""The columns available for the given sheet"""
@property
def specified_dtypes(self) -> _DTypeMap | None:
"""The dtypes specified for the sheet"""
def to_arrow(self) -> pa.RecordBatch:
"""Converts the sheet to a pyarrow `RecordBatch`"""

Expand All @@ -40,6 +49,7 @@ class _ExcelReader:
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | str | None = None,
dtypes: _DTypeMap | None = None,
) -> _ExcelSheet: ...
def load_sheet_by_idx(
self,
Expand All @@ -51,6 +61,7 @@ class _ExcelReader:
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | str | None = None,
dtypes: _DTypeMap | None = None,
) -> _ExcelSheet: ...
@property
def sheet_names(self) -> list[str]: ...
Expand Down
78 changes: 77 additions & 1 deletion python/tests/test_dtypes.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from __future__ import annotations

from datetime import datetime
from datetime import date, datetime
from typing import Any

import fastexcel
Expand Down Expand Up @@ -97,3 +97,79 @@ def test_sheet_with_mixed_dtypes_and_sample_rows(expected_data: dict[str, list[A
pl_assert_frame_equal(
pl_df, pl.DataFrame(expected_data, schema_overrides={"Date": pl.Datetime(time_unit="ms")})
)


@pytest.mark.parametrize("dtype_by_index", (True, False))
@pytest.mark.parametrize(
    "dtype,expected_data,expected_pd_dtype,expected_pl_dtype",
    [
        ("int", [123456, 44333, 44333, 87878, 87878], "int64", pl.Int64),
        ("float", [123456.0, 44333.0, 44333.0, 87878.0, 87878.0], "float64", pl.Float64),
        ("string", ["123456", "44333", "44333", "87878", "87878"], "object", pl.Utf8),
        ("boolean", [True] * 5, "bool", pl.Boolean),
        (
            "datetime",
            [datetime(2238, 1, 3)] + [datetime(2021, 5, 17)] * 2 + [datetime(2140, 8, 6)] * 2,
            "datetime64[ms]",
            pl.Datetime,
        ),
        (
            "date",
            [date(2238, 1, 3)] + [date(2021, 5, 17)] * 2 + [date(2140, 8, 6)] * 2,
            "object",
            pl.Date,
        ),
        # conversion to duration not supported yet
        ("duration", [pd.NaT] * 5, "timedelta64[ms]", pl.Duration),
    ],
)
def test_sheet_with_mixed_dtypes_specify_dtypes(
    dtype_by_index: bool,
    dtype: fastexcel.DType,
    expected_data: list[Any],
    expected_pd_dtype: str,
    expected_pl_dtype: pl.DataType,
) -> None:
    """Forcing a dtype on "Employee ID", by index or by name, must show up in pandas and polars."""
    dtypes: fastexcel.DTypeMap = {0: dtype} if dtype_by_index else {"Employee ID": dtype}  # type:ignore[dict-item]
    reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))
    sheet = reader.load_sheet(0, dtypes=dtypes, n_rows=5)
    # The sheet must report back exactly the mapping it was given
    assert sheet.specified_dtypes == dtypes

    pd_column = sheet.to_pandas()["Employee ID"]
    assert pd_column.dtype == expected_pd_dtype
    assert pd_column.to_list() == expected_data

    # On the polars side, the duration case is expected to come back as plain `None`s
    expected_pl_data = [None] * 5 if dtype == "duration" else expected_data
    pl_column = sheet.to_polars()["Employee ID"]
    assert pl_column.dtype == expected_pl_dtype
    assert pl_column.to_list() == expected_pl_data


@pytest.mark.parametrize(
    "dtypes,expected,expected_pd_dtype,expected_pl_dtype",
    [
        (None, datetime(2023, 7, 21), "datetime64[ms]", pl.Datetime),
        ({"Date": "datetime"}, datetime(2023, 7, 21), "datetime64[ms]", pl.Datetime),
        ({"Date": "date"}, date(2023, 7, 21), "object", pl.Date),
        ({"Date": "string"}, "2023-07-21 00:00:00", "object", pl.Utf8),
        ({2: "datetime"}, datetime(2023, 7, 21), "datetime64[ms]", pl.Datetime),
        ({2: "date"}, date(2023, 7, 21), "object", pl.Date),
        ({2: "string"}, "2023-07-21 00:00:00", "object", pl.Utf8),
    ],
)
def test_sheet_datetime_conversion(
    dtypes: fastexcel.DTypeMap | None,
    expected: Any,
    expected_pd_dtype: str,
    expected_pl_dtype: pl.DataType,
) -> None:
    """The "Date" column must convert to datetime, date or string, whether keyed by name or index."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))
    sheet = reader.load_sheet(0, dtypes=dtypes)
    # The sheet must report back exactly the mapping it was given (or None)
    assert sheet.specified_dtypes == dtypes

    # Every one of the 9 rows is expected to hold the same value
    expected_column = [expected] * 9

    pd_column = sheet.to_pandas()["Date"]
    assert pd_column.dtype == expected_pd_dtype
    assert pd_column.to_list() == expected_column

    pl_column = sheet.to_polars()["Date"]
    assert pl_column.dtype == expected_pl_dtype
    assert pl_column.to_list() == expected_column
164 changes: 164 additions & 0 deletions src/types/dtype.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
use std::{collections::HashMap, str::FromStr};

use arrow::datatypes::{DataType as ArrowDataType, TimeUnit};
use pyo3::{
types::{IntoPyDict, PyDict},
PyObject, Python, ToPyObject,
};

use crate::error::{FastExcelError, FastExcelErrorKind, FastExcelResult};

/// A column data type that can be requested through the `dtypes` parameter.
///
/// Each variant has a string name, used both when parsing user input (`FromStr`)
/// and when handing the value back to Python (`ToPyObject`).
///
/// The enum carries no data, so it is cheap to copy and compare.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum DType {
    /// Named `"null"`.
    Null,
    /// Named `"int"`.
    Int,
    /// Named `"float"`.
    Float,
    /// Named `"string"`.
    String,
    /// Named `"boolean"`.
    Bool,
    /// Named `"datetime"`.
    DateTime,
    /// Named `"date"`.
    Date,
    /// Named `"duration"`.
    Duration,
}

impl FromStr for DType {
    type Err = FastExcelError;

    /// Parses a dtype name into the matching [`DType`] variant.
    ///
    /// The comparison is exact: only the lowercase names listed below are accepted.
    ///
    /// # Errors
    ///
    /// Returns an `InvalidParameters` error quoting `raw_dtype` when the name is unknown.
    fn from_str(raw_dtype: &str) -> FastExcelResult<Self> {
        let dtype = match raw_dtype {
            "null" => Self::Null,
            "int" => Self::Int,
            "float" => Self::Float,
            "string" => Self::String,
            "boolean" => Self::Bool,
            "datetime" => Self::DateTime,
            "date" => Self::Date,
            "duration" => Self::Duration,
            _ => {
                let message = format!("unsupported dtype: \"{raw_dtype}\"");
                return Err(FastExcelErrorKind::InvalidParameters(message).into());
            }
        };
        Ok(dtype)
    }
}

impl ToPyObject for DType {
    /// Converts the dtype into a Python object built from its string name.
    ///
    /// The names are the same ones `from_str` accepts.
    fn to_object(&self, py: Python<'_>) -> PyObject {
        let name: &str = match self {
            Self::Null => "null",
            Self::Int => "int",
            Self::Float => "float",
            Self::String => "string",
            Self::Bool => "boolean",
            Self::DateTime => "datetime",
            Self::Date => "date",
            Self::Duration => "duration",
        };
        name.to_object(py)
    }
}

/// Requested dtypes for a sheet, keyed either by column index or by column name.
///
/// A map is always one kind or the other; the two key types are never mixed.
#[derive(Debug)]
pub(crate) enum DTypeMap {
    /// Maps a column index to the dtype requested for it.
    ByIndex(HashMap<usize, DType>),
    /// Maps a column name to the dtype requested for it.
    ByName(HashMap<String, DType>),
}

impl DTypeMap {
pub(crate) fn dtype_for_col_name(&self, col_name: &String) -> Option<&DType> {
match self {
DTypeMap::ByName(name_map) => name_map.get(col_name),
_ => None,
}
}

pub(crate) fn dtype_for_col_idx(&self, col_idx: usize) -> Option<&DType> {
match self {
DTypeMap::ByIndex(idx_map) => idx_map.get(&col_idx),
_ => None,
}
}
}

impl<S: AsRef<str>> TryFrom<HashMap<usize, S>> for DTypeMap {
type Error = FastExcelError;

fn try_from(value: HashMap<usize, S>) -> FastExcelResult<Self> {
value
.into_iter()
.map(|(column, raw_dtype)| {
raw_dtype
.as_ref()
.parse()
.map(|raw_dtype| (column, raw_dtype))
})
.collect::<FastExcelResult<HashMap<_, _>>>()
.map(Self::ByIndex)
}
}

impl<S: AsRef<str>> TryFrom<HashMap<String, S>> for DTypeMap {
type Error = FastExcelError;

fn try_from(value: HashMap<String, S>) -> FastExcelResult<Self> {
value
.into_iter()
.map(|(column, raw_dtype)| {
raw_dtype
.as_ref()
.parse()
.map(|raw_dtype| (column, raw_dtype))
})
.collect::<FastExcelResult<HashMap<_, _>>>()
.map(Self::ByName)
}
}

impl TryFrom<&PyDict> for DTypeMap {
    type Error = FastExcelError;

    /// Converts a Python dict into a [`DTypeMap`].
    ///
    /// Extraction is attempted with string keys first, then with `usize` keys; the first
    /// shape that extracts successfully decides the resulting variant.
    ///
    /// # Errors
    ///
    /// Returns an `InvalidParameters` error when the dict matches neither shape, or the
    /// parsing error when one of the dtype names is not supported.
    fn try_from(py_dict: &PyDict) -> FastExcelResult<Self> {
        if let Ok(by_name) = py_dict.extract::<HashMap<String, &str>>() {
            return by_name.try_into();
        }
        if let Ok(by_idx) = py_dict.extract::<HashMap<usize, &str>>() {
            return by_idx.try_into();
        }
        let message = format!("unsupported dtype map: {py_dict:?}");
        Err(FastExcelErrorKind::InvalidParameters(message).into())
    }
}

impl From<&DType> for ArrowDataType {
    /// Picks the Arrow data type used to represent the given [`DType`].
    ///
    /// Integers and floats are 64-bit, strings are `Utf8`, dates are `Date32`, and both
    /// datetimes and durations use millisecond precision (datetimes carry no timezone).
    fn from(dtype: &DType) -> Self {
        match dtype {
            DType::Null => Self::Null,
            DType::Int => Self::Int64,
            DType::Float => Self::Float64,
            DType::String => Self::Utf8,
            DType::Bool => Self::Boolean,
            DType::DateTime => Self::Timestamp(TimeUnit::Millisecond, None),
            DType::Date => Self::Date32,
            DType::Duration => Self::Duration(TimeUnit::Millisecond),
        }
    }
}

impl ToPyObject for DTypeMap {
    /// Converts the map into a Python dict.
    ///
    /// Keys keep their kind (index or name) and each value is the dtype converted with
    /// its own `to_object`.
    fn to_object(&self, py: Python<'_>) -> PyObject {
        let py_dict = match self {
            Self::ByIndex(idx_map) => idx_map
                .iter()
                .map(|(col_idx, dtype)| (col_idx, dtype.to_object(py)))
                .into_py_dict(py),
            Self::ByName(name_map) => name_map
                .iter()
                .map(|(col_name, dtype)| (col_name, dtype.to_object(py)))
                .into_py_dict(py),
        };
        py_dict.into()
    }
}
Loading

0 comments on commit 5b70648

Please sign in to comment.