feat(excelsheet): add support for multi-dtype columns (#164)

* feat(deps-dev): add rstest as a dev dependency Signed-off-by: Luka Peschke <[email protected]> * feat(excelsheet): add support for multi-dtype columns closes #160 Signed-off-by: Luka Peschke <[email protected]> * fix: use as_f64 rather than get_float Signed-off-by: Luka Peschke <[email protected]> * test: add null + int and null + int + float test cases Signed-off-by: Luka Peschke <[email protected]> * feat: add support for bools when determining the dtype of a column Signed-off-by: Luka Peschke <[email protected]> * feat: add support for int columns Signed-off-by: Luka Peschke <[email protected]> * feat: added a schema_sample_rows param Signed-off-by: Luka Peschke <[email protected]> * chore: doc --------- Signed-off-by: Luka Peschke <[email protected]> Co-authored-by: Eric Jolibois <[email protected]>
ToucanToco · Feb 13, 2024 · e243719 · e243719
1 parent cc56cef
commit e243719
Show file tree

Hide file tree

Showing 10 changed files with 412 additions and 45 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -19,3 +19,6 @@ version = "40.0.0"
 # There's a lot of stuff we don't want here, such as serde support
 default-features = false
 features = ["pyarrow"]
+
+[dev-dependencies]
+rstest = { version = "0.18.2", default-features = false }
diff --git a/Makefile b/Makefile
@@ -7,8 +7,9 @@ format	= ruff format python/ *.py
 mypy	= mypy python/ *.py
 pytest	= pytest -v
 ## Rust
-clippy	= cargo clippy
-fmt	= cargo fmt
+clippy		= cargo clippy
+fmt		= cargo fmt
+cargo-test	= cargo test
 ## Docs
 pdoc	= pdoc -o docs python/fastexcel
 
@@ -38,6 +39,7 @@ prod-install:
 	./prod_install.sh
 
 test:
+	$(cargo-test)
 	$(pytest)
 
 doc:

diff --git a/python/fastexcel/__init__.py b/python/fastexcel/__init__.py
@@ -88,17 +88,23 @@ def load_sheet_by_name(
         column_names: list[str] | None = None,
         skip_rows: int = 0,
         n_rows: int | None = None,
+        schema_sample_rows: int | None = 1_000,
     ) -> ExcelSheet:
         """Loads a sheet by name.
 
         :param name: The name of the sheet to load.
         :param header_row: The index of the row containing the column labels, default index is 0.
                            If `None`, the sheet does not have any column labels.
-        :param column_names: Overrides headers found in the document. If `column_names` is used,
-                             `header_row` will be ignored.
-        :param n_rows: Specifies how many rows should be loaded. If `None`, all rows are loaded
-        :param skip_rows: Specifies how many should be skipped after the header. If `header_row` is
-                          `None`, it skips the number of rows from the sheet's start.
+        :param column_names: Overrides headers found in the document.
+                             If `column_names` is used, `header_row` will be ignored.
+        :param n_rows: Specifies how many rows should be loaded.
+                       If `None`, all rows are loaded.
+        :param skip_rows: Specifies how many rows should be skipped after the header.
+                          If `header_row` is `None`, it skips the number of rows from the
+                          start of the sheet.
+        :param schema_sample_rows: Specifies how many rows should be used to determine
+                                   the dtype of a column.
+                                   If `None`, all rows will be used.
         """
         return ExcelSheet(
             self._reader.load_sheet_by_name(
@@ -107,6 +113,7 @@ def load_sheet_by_name(
                 column_names=column_names,
                 skip_rows=skip_rows,
                 n_rows=n_rows,
+                schema_sample_rows=schema_sample_rows,
             )
         )
 
@@ -118,17 +125,23 @@ def load_sheet_by_idx(
         column_names: list[str] | None = None,
         skip_rows: int = 0,
         n_rows: int | None = None,
+        schema_sample_rows: int | None = 1_000,
     ) -> ExcelSheet:
         """Loads a sheet by index.
 
         :param idx: The index (starting at 0) of the sheet to load.
         :param header_row: The index of the row containing the column labels, default index is 0.
                            If `None`, the sheet does not have any column labels.
-        :param column_names: Overrides headers found in the document. If `column_names` is used,
-                             `header_row` will be ignored.
-        :param n_rows: Specifies how many rows should be loaded. If `None`, all rows are loaded
-        :param skip_rows: Specifies how many should be skipped after the header. If `header_row` is
-                          `None`, it skips the number of rows from the sheet's start.
+        :param column_names: Overrides headers found in the document.
+                             If `column_names` is used, `header_row` will be ignored.
+        :param n_rows: Specifies how many rows should be loaded.
+                       If `None`, all rows are loaded.
+        :param skip_rows: Specifies how many rows should be skipped after the header.
+                          If `header_row` is `None`, it skips the number of rows from the
+                          start of the sheet.
+        :param schema_sample_rows: Specifies how many rows should be used to determine
+                                   the dtype of a column.
+                                   If `None`, all rows will be used.
         """
         if idx < 0:
             raise ValueError(f"Expected idx to be > 0, got {idx}")
@@ -139,6 +152,7 @@ def load_sheet_by_idx(
                 column_names=column_names,
                 skip_rows=skip_rows,
                 n_rows=n_rows,
+                schema_sample_rows=schema_sample_rows,
             )
         )
 
@@ -150,6 +164,7 @@ def load_sheet(
         column_names: list[str] | None = None,
         skip_rows: int = 0,
         n_rows: int | None = None,
+        schema_sample_rows: int | None = 1_000,
     ) -> ExcelSheet:
         """Loads a sheet by name if a string is passed or by index if an integer is passed.
 
@@ -162,6 +177,7 @@ def load_sheet(
                 column_names=column_names,
                 skip_rows=skip_rows,
                 n_rows=n_rows,
+                schema_sample_rows=schema_sample_rows,
             )
             if isinstance(idx_or_name, int)
             else self.load_sheet_by_name(
@@ -170,6 +186,7 @@ def load_sheet(
                 column_names=column_names,
                 skip_rows=skip_rows,
                 n_rows=n_rows,
+                schema_sample_rows=schema_sample_rows,
             )
         )
 

diff --git a/python/fastexcel/_fastexcel.pyi b/python/fastexcel/_fastexcel.pyi
@@ -32,6 +32,7 @@ class _ExcelReader:
         column_names: list[str] | None = None,
         skip_rows: int = 0,
         n_rows: int | None = None,
+        schema_sample_rows: int | None = 1_000,
     ) -> _ExcelSheet: ...
     def load_sheet_by_idx(
         self,
@@ -41,6 +42,7 @@ class _ExcelReader:
         column_names: list[str] | None = None,
         skip_rows: int = 0,
         n_rows: int | None = None,
+        schema_sample_rows: int | None = 1_000,
     ) -> _ExcelSheet: ...
     def load_sheet(
         self,
@@ -50,6 +52,7 @@ class _ExcelReader:
         column_names: list[str] | None = None,
         skip_rows: int = 0,
         n_rows: int | None = None,
+        schema_sample_rows: int | None = 1_000,
     ) -> _ExcelSheet: ...
     @property
     def sheet_names(self) -> list[str]: ...

diff --git a/python/tests/fixtures/fixture-multi-dtypes-columns.xlsx b/python/tests/fixtures/fixture-multi-dtypes-columns.xlsx