Merge branch 'main' into multi-dtype-columns

ToucanToco · Feb 9, 2024 · 1cab5d0 · 1cab5d0
2 parents 025bdc1 + cc56cef
commit 1cab5d0
Show file tree

Hide file tree

Showing 6 changed files with 71 additions and 46 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -10,7 +10,7 @@ crate-type = ["cdylib"]
 
 [dependencies]
 anyhow = "1.0.79"
-calamine = { version = "0.22.1", features = ["dates"] }
+calamine = { version = "0.24.0", features = ["dates"] }
 chrono = { version = "0.4.33", default-features = false }
 pyo3 = { version = "0.18.3", features = ["extension-module", "anyhow"] }
 

diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 ToucanToco
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/src/types/excelreader.rs b/src/types/excelreader.rs
@@ -57,7 +57,6 @@ impl ExcelReader {
         let range = self
             .sheets
             .worksheet_range(&name)
-            .with_context(|| format!("Sheet {name} not found"))?
             .with_context(|| format!("Error while loading sheet {name}"))?;
 
         let header = Header::new(header_row, column_names);

diff --git a/src/types/excelsheet.rs b/src/types/excelsheet.rs
@@ -10,7 +10,7 @@ use arrow::{
     pyarrow::PyArrowConvert,
     record_batch::RecordBatch,
 };
-use calamine::{DataType as CalDataType, Range};
+use calamine::{Data as CalData, DataType, Range};
 use chrono::NaiveDate;
 
 use pyo3::prelude::{pyclass, pymethods, PyObject, Python};
@@ -52,7 +52,7 @@ impl Pagination {
     pub(crate) fn new(
         skip_rows: usize,
         n_rows: Option<usize>,
-        range: &Range<CalDataType>,
+        range: &Range<CalData>,
     ) -> Result<Self> {
         let max_height = range.height();
         if max_height < skip_rows {
@@ -72,20 +72,20 @@ pub(crate) struct ExcelSheet {
     pub(crate) name: String,
     header: Header,
     pagination: Pagination,
-    data: Range<CalDataType>,
+    data: Range<CalData>,
     height: Option<usize>,
     total_height: Option<usize>,
     width: Option<usize>,
 }
 
 impl ExcelSheet {
-    pub(crate) fn data(&self) -> &Range<CalDataType> {
+    pub(crate) fn data(&self) -> &Range<CalData> {
         &self.data
     }
 
     pub(crate) fn new(
         name: String,
-        data: Range<CalDataType>,
+        data: Range<CalData>,
         header: Header,
         pagination: Pagination,
     ) -> Self {
@@ -141,7 +141,7 @@ impl ExcelSheet {
 }
 
 fn create_boolean_array(
-    data: &Range<CalDataType>,
+    data: &Range<CalData>,
     col: usize,
     offset: usize,
     limit: usize,
@@ -152,7 +152,7 @@ fn create_boolean_array(
 }
 
 fn create_int_array(
-    data: &Range<CalDataType>,
+    data: &Range<CalData>,
     col: usize,
     offset: usize,
     limit: usize,
@@ -163,7 +163,7 @@ fn create_int_array(
 }
 
 fn create_float_array(
-    data: &Range<CalDataType>,
+    data: &Range<CalData>,
     col: usize,
     offset: usize,
     limit: usize,
@@ -174,7 +174,7 @@ fn create_float_array(
 }
 
 fn create_string_array(
-    data: &Range<CalDataType>,
+    data: &Range<CalData>,
     col: usize,
     offset: usize,
     limit: usize,
@@ -184,20 +184,20 @@ fn create_string_array(
         // is slower for columns containing mostly/only strings (which we expect to meet more often than
         // mixed dtype columns containing mostly numbers)
         data.get((row, col)).and_then(|cell| match cell {
-            CalDataType::String(s) => Some(s.to_string()),
-            CalDataType::Float(s) => Some(s.to_string()),
-            CalDataType::Int(s) => Some(s.to_string()),
+            CalData::String(s) => Some(s.to_string()),
+            CalData::Float(s) => Some(s.to_string()),
+            CalData::Int(s) => Some(s.to_string()),
             _ => None,
         })
     })))
 }
 
-fn duration_type_to_i64(caldt: &CalDataType) -> Option<i64> {
+fn duration_type_to_i64(caldt: &CalData) -> Option<i64> {
     caldt.as_duration().map(|d| d.num_milliseconds())
 }
 
 fn create_date_array(
-    data: &Range<CalDataType>,
+    data: &Range<CalData>,
     col: usize,
     offset: usize,
     limit: usize,
@@ -211,7 +211,7 @@ fn create_date_array(
 }
 
 fn create_datetime_array(
-    data: &Range<CalDataType>,
+    data: &Range<CalData>,
     col: usize,
     offset: usize,
     limit: usize,
@@ -226,7 +226,7 @@ fn create_datetime_array(
 }
 
 fn create_duration_array(
-    data: &Range<CalDataType>,
+    data: &Range<CalData>,
     col: usize,
     offset: usize,
     limit: usize,

diff --git a/src/utils/arrow.rs b/src/utils/arrow.rs
@@ -2,32 +2,37 @@ use std::{collections::HashSet, sync::OnceLock};
 
 use anyhow::{anyhow, Context, Result};
 use arrow::datatypes::{DataType as ArrowDataType, Field, Schema, TimeUnit};
-use calamine::{DataType as CalDataType, Range};
+use calamine::{Data as CalData, DataType, Range};
 
-fn get_cell_type(data: &Range<CalDataType>, row: usize, col: usize) -> Result<ArrowDataType> {
+fn get_cell_type(data: &Range<CalData>, row: usize, col: usize) -> Result<ArrowDataType> {
     let cell = data
         .get((row, col))
         .with_context(|| format!("Could not retrieve data at ({row},{col})"))?;
     match cell {
-        CalDataType::Int(_) => Ok(ArrowDataType::Int64),
-        CalDataType::Float(_) => Ok(ArrowDataType::Float64),
-        CalDataType::String(_) => Ok(ArrowDataType::Utf8),
-        CalDataType::Bool(_) => Ok(ArrowDataType::Boolean),
-        CalDataType::DateTime(_) => Ok(ArrowDataType::Timestamp(TimeUnit::Millisecond, None)),
+        CalData::Int(_) => Ok(ArrowDataType::Int64),
+        CalData::Float(_) => Ok(ArrowDataType::Float64),
+        CalData::String(_) => Ok(ArrowDataType::Utf8),
+        CalData::Bool(_) => Ok(ArrowDataType::Boolean),
+        // Since calamine 0.24.0, a new ExcelDateTime exists for the DateTime type. It can either be
+        // a duration or a datetime
+        CalData::DateTime(excel_datetime) => Ok(if excel_datetime.is_datetime() {
+            ArrowDataType::Timestamp(TimeUnit::Millisecond, None)
+        } else {
+            ArrowDataType::Duration(TimeUnit::Millisecond)
+        }),
         // These types contain an ISO8601 representation of a date/datetime or a duration
-        CalDataType::DateTimeIso(_) => match cell.as_datetime() {
+        CalData::DateTimeIso(_) => match cell.as_datetime() {
             // If we cannot convert the cell to a datetime, we're working on a date
             Some(_) => Ok(ArrowDataType::Timestamp(TimeUnit::Millisecond, None)),
             // NOTE: not using the Date64 type on purpose, as pyarrow converts it to a datetime
             // rather than a date
             None => Ok(ArrowDataType::Date32),
         },
-        CalDataType::DurationIso(_) => Ok(ArrowDataType::Duration(TimeUnit::Millisecond)),
         // A simple duration
-        CalDataType::Duration(_) => Ok(ArrowDataType::Duration(TimeUnit::Millisecond)),
+        CalData::DurationIso(_) => Ok(ArrowDataType::Duration(TimeUnit::Millisecond)),
         // Errors and nulls
-        CalDataType::Error(err) => Err(anyhow!("Error in calamine cell: {err:?}")),
-        CalDataType::Empty => Ok(ArrowDataType::Null),
+        CalData::Error(err) => Err(anyhow!("Error in calamine cell: {err:?}")),
+        CalData::Empty => Ok(ArrowDataType::Null),
     }
 }
 
@@ -50,7 +55,7 @@ fn string_types() -> &'static HashSet<ArrowDataType> {
 }
 
 fn get_arrow_column_type(
-    data: &Range<CalDataType>,
+    data: &Range<CalData>,
     start_row: usize,
     end_row: usize,
     col: usize,
@@ -99,7 +104,7 @@ fn alias_for_name(name: &str, fields: &[Field]) -> String {
 }
 
 pub(crate) fn arrow_schema_from_column_names_and_range(
-    range: &Range<CalDataType>,
+    range: &Range<CalData>,
     column_names: &[String],
     row_idx: usize,
     row_limit: usize,
@@ -122,17 +127,17 @@ mod tests {
     use super::*;
 
     #[fixture]
-    fn range() -> Range<CalDataType> {
+    fn range() -> Range<CalData> {
         Range::from_sparse(vec![
             // First column
-            Cell::new((0, 0), CalDataType::Bool(true)),
-            Cell::new((1, 0), CalDataType::Bool(false)),
-            Cell::new((2, 0), CalDataType::Int(42)),
-            Cell::new((3, 0), CalDataType::Float(13.37)),
-            Cell::new((4, 0), CalDataType::String("hello".to_string())),
-            Cell::new((5, 0), CalDataType::Empty),
-            Cell::new((6, 0), CalDataType::Int(12)),
-            Cell::new((7, 0), CalDataType::Float(12.21)),
+            Cell::new((0, 0), CalData::Bool(true)),
+            Cell::new((1, 0), CalData::Bool(false)),
+            Cell::new((2, 0), CalData::Int(42)),
+            Cell::new((3, 0), CalData::Float(13.37)),
+            Cell::new((4, 0), CalData::String("hello".to_string())),
+            Cell::new((5, 0), CalData::Empty),
+            Cell::new((6, 0), CalData::Int(12)),
+            Cell::new((7, 0), CalData::Float(12.21)),
         ])
     }
 
@@ -158,7 +163,7 @@ mod tests {
     // int + float + null
     #[case(5, 8, ArrowDataType::Float64)]
     fn get_arrow_column_type_multi_dtype_ok(
-        range: Range<CalDataType>,
+        range: Range<CalData>,
         #[case] start_row: usize,
         #[case] end_row: usize,
         #[case] expected: ArrowDataType,