From c034647cbc456575875946153b0c266efa31ce97 Mon Sep 17 00:00:00 2001 From: Luka Peschke Date: Wed, 7 Feb 2024 14:05:53 +0100 Subject: [PATCH 1/2] Create LICENSE (#171) closes #170 --- LICENSE | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..6e2642c --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 ToucanToco + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. From cc56cefa507d7c33b7cb4af4b5d3b114ea0f47ff Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 9 Feb 2024 13:52:02 +0100 Subject: [PATCH 2/2] chore(deps): bump calamine from 0.22.1 to 0.24.0 (#175) * chore(deps): bump calamine from 0.22.1 to 0.24.0 Bumps [calamine](https://github.com/tafia/calamine) from 0.22.1 to 0.24.0. 
- [Changelog](https://github.com/tafia/calamine/blob/master/Changelog.md) - [Commits](https://github.com/tafia/calamine/compare/v0.22.1...v0.24.0) --- updated-dependencies: - dependency-name: calamine dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] * refactor: adapt code to calamine 0.24.0 Work items: * Rename calamine::DataType to calamine::Data and bring the calamine::DataType trait into scope * Support calamine's new ExcelDateTime when determining a column's dtype * .worksheet_range's return type has changed from Option<Result<Range<DataType>>> to Result<Range<Data>> Signed-off-by: Luka Peschke --------- Signed-off-by: dependabot[bot] Signed-off-by: Luka Peschke Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Luka Peschke --- Cargo.lock | 8 ++++---- Cargo.toml | 2 +- src/types/excelreader.rs | 1 - src/types/excelsheet.rs | 26 +++++++++++++------------- src/utils/arrow.rs | 35 ++++++++++++++++++----------------- 5 files changed, 36 insertions(+), 36 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 296772d..2509252 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -239,9 +239,9 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "calamine" -version = "0.22.1" +version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe0ba51a659bb6c8bffd6f7c1c5ffafcafa0c97e4769411d841c3cc5c154ab47" +checksum = "8a3a315226fdc5b1c3e33521073e1712a05944bc0664d665ff1f6ff0396334da" dependencies = [ "byteorder", "chrono", @@ -728,9 +728,9 @@ dependencies = [ [[package]] name = "quick-xml" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eff6510e86862b57b210fd8cbe8ed3f0d7d600b9c2863cd4549a2e033c66e956" +checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33" dependencies = [ "encoding_rs", "memchr", diff --git a/Cargo.toml 
b/Cargo.toml index c023300..723c455 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ crate-type = ["cdylib"] [dependencies] anyhow = "1.0.79" -calamine = { version = "0.22.1", features = ["dates"] } +calamine = { version = "0.24.0", features = ["dates"] } chrono = { version = "0.4.33", default-features = false } pyo3 = { version = "0.18.3", features = ["extension-module", "anyhow"] } diff --git a/src/types/excelreader.rs b/src/types/excelreader.rs index e654307..592b235 100644 --- a/src/types/excelreader.rs +++ b/src/types/excelreader.rs @@ -57,7 +57,6 @@ impl ExcelReader { let range = self .sheets .worksheet_range(&name) - .with_context(|| format!("Sheet {name} not found"))? .with_context(|| format!("Error while loading sheet {name}"))?; let header = Header::new(header_row, column_names); diff --git a/src/types/excelsheet.rs b/src/types/excelsheet.rs index 36cf6ce..c10897b 100644 --- a/src/types/excelsheet.rs +++ b/src/types/excelsheet.rs @@ -10,7 +10,7 @@ use arrow::{ pyarrow::PyArrowConvert, record_batch::RecordBatch, }; -use calamine::{DataType as CalDataType, Range}; +use calamine::{Data as CalData, DataType, Range}; use chrono::NaiveDate; use pyo3::prelude::{pyclass, pymethods, PyObject, Python}; @@ -52,7 +52,7 @@ impl Pagination { pub(crate) fn new( skip_rows: usize, n_rows: Option, - range: &Range, + range: &Range, ) -> Result { let max_height = range.height(); if max_height < skip_rows { @@ -72,20 +72,20 @@ pub(crate) struct ExcelSheet { pub(crate) name: String, header: Header, pagination: Pagination, - data: Range, + data: Range, height: Option, total_height: Option, width: Option, } impl ExcelSheet { - pub(crate) fn data(&self) -> &Range { + pub(crate) fn data(&self) -> &Range { &self.data } pub(crate) fn new( name: String, - data: Range, + data: Range, header: Header, pagination: Pagination, ) -> Self { @@ -142,7 +142,7 @@ impl ExcelSheet { } fn create_boolean_array( - data: &Range, + data: &Range, col: usize, offset: usize, limit: usize, @@ 
-153,7 +153,7 @@ fn create_boolean_array( } fn create_int_array( - data: &Range, + data: &Range, col: usize, offset: usize, limit: usize, @@ -164,7 +164,7 @@ fn create_int_array( } fn create_float_array( - data: &Range, + data: &Range, col: usize, offset: usize, limit: usize, @@ -175,7 +175,7 @@ fn create_float_array( } fn create_string_array( - data: &Range, + data: &Range, col: usize, offset: usize, limit: usize, @@ -185,12 +185,12 @@ fn create_string_array( }))) } -fn duration_type_to_i64(caldt: &CalDataType) -> Option { +fn duration_type_to_i64(caldt: &CalData) -> Option { caldt.as_duration().map(|d| d.num_milliseconds()) } fn create_date_array( - data: &Range, + data: &Range, col: usize, offset: usize, limit: usize, @@ -204,7 +204,7 @@ fn create_date_array( } fn create_datetime_array( - data: &Range, + data: &Range, col: usize, offset: usize, limit: usize, @@ -219,7 +219,7 @@ fn create_datetime_array( } fn create_duration_array( - data: &Range, + data: &Range, col: usize, offset: usize, limit: usize, diff --git a/src/utils/arrow.rs b/src/utils/arrow.rs index e1d4e93..66bbf9c 100644 --- a/src/utils/arrow.rs +++ b/src/utils/arrow.rs @@ -1,35 +1,36 @@ use anyhow::{anyhow, Context, Result}; use arrow::datatypes::{DataType as ArrowDataType, Field, Schema, TimeUnit}; -use calamine::{DataType as CalDataType, Range}; +use calamine::{Data as CalData, DataType, Range}; -fn get_arrow_column_type( - data: &Range, - row: usize, - col: usize, -) -> Result { +fn get_arrow_column_type(data: &Range, row: usize, col: usize) -> Result { let cell = data .get((row, col)) .with_context(|| format!("Could not retrieve data at ({row},{col})"))?; match cell { - CalDataType::Int(_) => Ok(ArrowDataType::Int64), - CalDataType::Float(_) => Ok(ArrowDataType::Float64), - CalDataType::String(_) => Ok(ArrowDataType::Utf8), - CalDataType::Bool(_) => Ok(ArrowDataType::Boolean), - CalDataType::DateTime(_) => Ok(ArrowDataType::Timestamp(TimeUnit::Millisecond, None)), + CalData::Int(_) => 
Ok(ArrowDataType::Int64), + CalData::Float(_) => Ok(ArrowDataType::Float64), + CalData::String(_) => Ok(ArrowDataType::Utf8), + CalData::Bool(_) => Ok(ArrowDataType::Boolean), + // Since calamine 0.24.0, a new ExcelDateTime exists for the Datetime type. It can either be + // a duration or a datetime + CalData::DateTime(excel_datetime) => Ok(if excel_datetime.is_datetime() { + ArrowDataType::Timestamp(TimeUnit::Millisecond, None) + } else { + ArrowDataType::Duration(TimeUnit::Millisecond) + }), // These types contain an ISO8601 representation of a date/datetime or a duration - CalDataType::DateTimeIso(_) => match cell.as_datetime() { + CalData::DateTimeIso(_) => match cell.as_datetime() { // If we cannot convert the cell to a datetime, we're working on a date Some(_) => Ok(ArrowDataType::Timestamp(TimeUnit::Millisecond, None)), // NOTE: not using the Date64 type on purpose, as pyarrow converts it to a datetime // rather than a date None => Ok(ArrowDataType::Date32), }, - CalDataType::DurationIso(_) => Ok(ArrowDataType::Duration(TimeUnit::Millisecond)), // A simple duration - CalDataType::Duration(_) => Ok(ArrowDataType::Duration(TimeUnit::Millisecond)), + CalData::DurationIso(_) => Ok(ArrowDataType::Duration(TimeUnit::Millisecond)), // Errors and nulls - CalDataType::Error(err) => Err(anyhow!("Error in calamine cell: {err:?}")), - CalDataType::Empty => Ok(ArrowDataType::Null), + CalData::Error(err) => Err(anyhow!("Error in calamine cell: {err:?}")), + CalData::Empty => Ok(ArrowDataType::Null), } } @@ -50,7 +51,7 @@ fn alias_for_name(name: &str, fields: &[Field]) -> String { } pub(crate) fn arrow_schema_from_column_names_and_range( - range: &Range, + range: &Range, column_names: &[String], row_idx: usize, ) -> Result {