diff --git a/Cargo.lock b/Cargo.lock index 7143cb9..6c2cba7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -239,9 +239,9 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "calamine" -version = "0.22.1" +version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe0ba51a659bb6c8bffd6f7c1c5ffafcafa0c97e4769411d841c3cc5c154ab47" +checksum = "8a3a315226fdc5b1c3e33521073e1712a05944bc0664d665ff1f6ff0396334da" dependencies = [ "byteorder", "chrono", @@ -735,9 +735,9 @@ dependencies = [ [[package]] name = "quick-xml" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eff6510e86862b57b210fd8cbe8ed3f0d7d600b9c2863cd4549a2e033c66e956" +checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33" dependencies = [ "encoding_rs", "memchr", diff --git a/Cargo.toml b/Cargo.toml index 62a2515..630a385 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ crate-type = ["cdylib"] [dependencies] anyhow = "1.0.79" -calamine = { version = "0.22.1", features = ["dates"] } +calamine = { version = "0.24.0", features = ["dates"] } chrono = { version = "0.4.33", default-features = false } pyo3 = { version = "0.18.3", features = ["extension-module", "anyhow"] } diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..6e2642c --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 ToucanToco + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be 
included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/src/types/excelreader.rs b/src/types/excelreader.rs index e654307..592b235 100644 --- a/src/types/excelreader.rs +++ b/src/types/excelreader.rs @@ -57,7 +57,6 @@ impl ExcelReader { let range = self .sheets .worksheet_range(&name) - .with_context(|| format!("Sheet {name} not found"))? .with_context(|| format!("Error while loading sheet {name}"))?; let header = Header::new(header_row, column_names); diff --git a/src/types/excelsheet.rs b/src/types/excelsheet.rs index 174a16f..b12e2fd 100644 --- a/src/types/excelsheet.rs +++ b/src/types/excelsheet.rs @@ -10,7 +10,7 @@ use arrow::{ pyarrow::PyArrowConvert, record_batch::RecordBatch, }; -use calamine::{DataType as CalDataType, Range}; +use calamine::{Data as CalData, DataType, Range}; use chrono::NaiveDate; use pyo3::prelude::{pyclass, pymethods, PyObject, Python}; @@ -52,7 +52,7 @@ impl Pagination { pub(crate) fn new( skip_rows: usize, n_rows: Option, - range: &Range, + range: &Range, ) -> Result { let max_height = range.height(); if max_height < skip_rows { @@ -72,20 +72,20 @@ pub(crate) struct ExcelSheet { pub(crate) name: String, header: Header, pagination: Pagination, - data: Range, + data: Range, height: Option, total_height: Option, width: Option, } impl ExcelSheet { - pub(crate) fn data(&self) -> &Range { + pub(crate) fn data(&self) -> &Range { &self.data } pub(crate) fn new( name: String, - data: Range, + data: Range, header: Header, pagination: Pagination, ) -> Self { @@ 
-141,7 +141,7 @@ impl ExcelSheet { } fn create_boolean_array( - data: &Range, + data: &Range, col: usize, offset: usize, limit: usize, @@ -152,7 +152,7 @@ fn create_boolean_array( } fn create_int_array( - data: &Range, + data: &Range, col: usize, offset: usize, limit: usize, @@ -163,7 +163,7 @@ fn create_int_array( } fn create_float_array( - data: &Range, + data: &Range, col: usize, offset: usize, limit: usize, @@ -174,7 +174,7 @@ fn create_float_array( } fn create_string_array( - data: &Range, + data: &Range, col: usize, offset: usize, limit: usize, @@ -184,20 +184,20 @@ fn create_string_array( // is slower for columns containing mostly/only strings (which we expect to meet more often than // mixed dtype columns containing mostly numbers) data.get((row, col)).and_then(|cell| match cell { - CalDataType::String(s) => Some(s.to_string()), - CalDataType::Float(s) => Some(s.to_string()), - CalDataType::Int(s) => Some(s.to_string()), + CalData::String(s) => Some(s.to_string()), + CalData::Float(s) => Some(s.to_string()), + CalData::Int(s) => Some(s.to_string()), _ => None, }) }))) } -fn duration_type_to_i64(caldt: &CalDataType) -> Option { +fn duration_type_to_i64(caldt: &CalData) -> Option { caldt.as_duration().map(|d| d.num_milliseconds()) } fn create_date_array( - data: &Range, + data: &Range, col: usize, offset: usize, limit: usize, @@ -211,7 +211,7 @@ fn create_date_array( } fn create_datetime_array( - data: &Range, + data: &Range, col: usize, offset: usize, limit: usize, @@ -226,7 +226,7 @@ fn create_datetime_array( } fn create_duration_array( - data: &Range, + data: &Range, col: usize, offset: usize, limit: usize, diff --git a/src/utils/arrow.rs b/src/utils/arrow.rs index c13201f..de4953a 100644 --- a/src/utils/arrow.rs +++ b/src/utils/arrow.rs @@ -2,32 +2,37 @@ use std::{collections::HashSet, sync::OnceLock}; use anyhow::{anyhow, Context, Result}; use arrow::datatypes::{DataType as ArrowDataType, Field, Schema, TimeUnit}; -use calamine::{DataType as CalDataType, 
Range}; +use calamine::{Data as CalData, DataType, Range}; -fn get_cell_type(data: &Range, row: usize, col: usize) -> Result { +fn get_cell_type(data: &Range, row: usize, col: usize) -> Result { let cell = data .get((row, col)) .with_context(|| format!("Could not retrieve data at ({row},{col})"))?; match cell { - CalDataType::Int(_) => Ok(ArrowDataType::Int64), - CalDataType::Float(_) => Ok(ArrowDataType::Float64), - CalDataType::String(_) => Ok(ArrowDataType::Utf8), - CalDataType::Bool(_) => Ok(ArrowDataType::Boolean), - CalDataType::DateTime(_) => Ok(ArrowDataType::Timestamp(TimeUnit::Millisecond, None)), + CalData::Int(_) => Ok(ArrowDataType::Int64), + CalData::Float(_) => Ok(ArrowDataType::Float64), + CalData::String(_) => Ok(ArrowDataType::Utf8), + CalData::Bool(_) => Ok(ArrowDataType::Boolean), + // Since calamine 0.24.0, a new ExcelDateTime exists for the Datetime type. It can either be + // a duration or a datetime + CalData::DateTime(excel_datetime) => Ok(if excel_datetime.is_datetime() { + ArrowDataType::Timestamp(TimeUnit::Millisecond, None) + } else { + ArrowDataType::Duration(TimeUnit::Millisecond) + }), // These types contain an ISO8601 representation of a date/datetime or a duration - CalDataType::DateTimeIso(_) => match cell.as_datetime() { + CalData::DateTimeIso(_) => match cell.as_datetime() { // If we cannot convert the cell to a datetime, we're working on a date Some(_) => Ok(ArrowDataType::Timestamp(TimeUnit::Millisecond, None)), // NOTE: not using the Date64 type on purpose, as pyarrow converts it to a datetime // rather than a date None => Ok(ArrowDataType::Date32), }, - CalDataType::DurationIso(_) => Ok(ArrowDataType::Duration(TimeUnit::Millisecond)), // A simple duration - CalDataType::Duration(_) => Ok(ArrowDataType::Duration(TimeUnit::Millisecond)), + CalData::DurationIso(_) => Ok(ArrowDataType::Duration(TimeUnit::Millisecond)), // Errors and nulls - CalDataType::Error(err) => Err(anyhow!("Error in calamine cell: {err:?}")), - 
CalDataType::Empty => Ok(ArrowDataType::Null), + CalData::Error(err) => Err(anyhow!("Error in calamine cell: {err:?}")), + CalData::Empty => Ok(ArrowDataType::Null), } } @@ -50,7 +55,7 @@ fn string_types() -> &'static HashSet { } fn get_arrow_column_type( - data: &Range, + data: &Range, start_row: usize, end_row: usize, col: usize, @@ -99,7 +104,7 @@ fn alias_for_name(name: &str, fields: &[Field]) -> String { } pub(crate) fn arrow_schema_from_column_names_and_range( - range: &Range, + range: &Range, column_names: &[String], row_idx: usize, row_limit: usize, @@ -122,17 +127,17 @@ mod tests { use super::*; #[fixture] - fn range() -> Range { + fn range() -> Range { Range::from_sparse(vec![ // First column - Cell::new((0, 0), CalDataType::Bool(true)), - Cell::new((1, 0), CalDataType::Bool(false)), - Cell::new((2, 0), CalDataType::Int(42)), - Cell::new((3, 0), CalDataType::Float(13.37)), - Cell::new((4, 0), CalDataType::String("hello".to_string())), - Cell::new((5, 0), CalDataType::Empty), - Cell::new((6, 0), CalDataType::Int(12)), - Cell::new((7, 0), CalDataType::Float(12.21)), + Cell::new((0, 0), CalData::Bool(true)), + Cell::new((1, 0), CalData::Bool(false)), + Cell::new((2, 0), CalData::Int(42)), + Cell::new((3, 0), CalData::Float(13.37)), + Cell::new((4, 0), CalData::String("hello".to_string())), + Cell::new((5, 0), CalData::Empty), + Cell::new((6, 0), CalData::Int(12)), + Cell::new((7, 0), CalData::Float(12.21)), ]) } @@ -158,7 +163,7 @@ mod tests { // int + float + null #[case(5, 8, ArrowDataType::Float64)] fn get_arrow_column_type_multi_dtype_ok( - range: Range, + range: Range, #[case] start_row: usize, #[case] end_row: usize, #[case] expected: ArrowDataType,