diff --git a/python/tests/fixtures/fixture-single-sheet-duplicated-columns.xlsx b/python/tests/fixtures/fixture-single-sheet-duplicated-columns.xlsx new file mode 100644 index 0000000..e07cdba Binary files /dev/null and b/python/tests/fixtures/fixture-single-sheet-duplicated-columns.xlsx differ diff --git a/python/tests/test_alias_generation.py b/python/tests/test_alias_generation.py new file mode 100644 index 0000000..562dd3b --- /dev/null +++ b/python/tests/test_alias_generation.py @@ -0,0 +1,42 @@ +from __future__ import annotations + +import fastexcel +import pandas as pd +import polars as pl +import pytest +from pandas.testing import assert_frame_equal as pd_assert_frame_equal +from polars.testing import assert_frame_equal as pl_assert_frame_equal +from utils import path_for_fixture + + +@pytest.mark.parametrize("use_columns", [None, [0, 1, 2], ["col", "col_1", "col_2"]]) +def test_alias_generation_with_use_columns(use_columns: list[str] | list[int] | None) -> None: + excel_reader = fastexcel.read_excel( + path_for_fixture("fixture-single-sheet-duplicated-columns.xlsx") + ) + + sheet = excel_reader.load_sheet(0, use_columns=use_columns) + assert sheet.available_columns == ["col", "col_1", "col_2"] + + pd_assert_frame_equal( + sheet.to_pandas(), + pd.DataFrame( + { + "col": [1.0, 2.0], + "col_1": [2019.0, 2020.0], + "col_2": pd.Series( + [pd.Timestamp("2019-02-01 00:01:02"), pd.Timestamp("2014-01-02 06:01:02")] + ).astype("datetime64[ms]"), + } + ), + ) + pl_assert_frame_equal( + sheet.to_polars(), + pl.DataFrame( + { + "col": [1.0, 2.0], + "col_1": [2019.0, 2020.0], + "col_2": ["2019-02-01 00:01:02", "2014-01-02 06:01:02"], + } + ).with_columns(pl.col("col_2").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms")), + ) diff --git a/src/types/excelsheet.rs b/src/types/excelsheet.rs index 2596dd1..c384256 100644 --- a/src/types/excelsheet.rs +++ b/src/types/excelsheet.rs @@ -1,8 +1,11 @@ use std::{cmp, collections::HashSet, str::FromStr, sync::Arc}; -use crate::error::{ - py_errors::IntoPyResult, ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult, - IdxOrName, +use crate::{ + error::{ + py_errors::IntoPyResult, ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult, + IdxOrName, + }, + utils::arrow::alias_for_name, }; use arrow::{ @@ -352,17 +355,23 @@ impl ExcelSheet { let available_columns = sheet.get_available_columns(); + let mut aliased_available_columns = Vec::with_capacity(available_columns.len()); + + available_columns.iter().for_each(|column_name| { + aliased_available_columns.push(alias_for_name(column_name, &aliased_available_columns)) + }); + // Ensuring selected columns are valid sheet .selected_columns - .validate_columns(&available_columns) + .validate_columns(&aliased_available_columns) .with_context(|| { format!( "selected columns are invalid, available columns are: {available_columns:?}" ) })?; - sheet.available_columns = available_columns; + sheet.available_columns = aliased_available_columns; Ok(sheet) } diff --git a/src/utils/arrow.rs b/src/utils/arrow.rs index 7da209b..81c95e1 100644 --- a/src/utils/arrow.rs +++ b/src/utils/arrow.rs @@ -118,20 +118,23 @@ fn get_arrow_column_type( } } -fn alias_for_name(name: &str, fields: &[Field]) -> String { - fn rec(name: &str, fields: &[Field], depth: usize) -> String { +pub(crate) fn alias_for_name(name: &str, existing_names: &[String]) -> String { + fn rec(name: &str, existing_names: &[String], depth: usize) -> String { let alias = if depth == 0 { name.to_owned() } else { format!("{name}_{depth}") }; - match fields.iter().any(|f| f.name() == &alias) { - true => rec(name, fields, depth + 1), + match existing_names + .iter() + .any(|existing_name| existing_name == &alias) + { + true => rec(name, existing_names, depth + 1), false => alias, } } - rec(name, fields, 0) + rec(name, existing_names, 0) } pub(crate) fn arrow_schema_from_column_names_and_range( @@ -142,6 +145,7 @@ pub(crate) fn arrow_schema_from_column_names_and_range( selected_columns: &SelectedColumns, ) -> FastExcelResult { let mut fields = Vec::with_capacity(column_names.len()); + let mut existing_names = Vec::with_capacity(column_names.len()); for (idx, name) in column_names.iter().enumerate() { // If we have an index for the given column, extract it and add it to the schema. Otherwise, @@ -151,7 +155,9 @@ pub(crate) fn arrow_schema_from_column_names_and_range( _ => selected_columns.idx_for_column(column_names, name, idx), } { let col_type = get_arrow_column_type(range, row_idx, row_limit, col_idx)?; - fields.push(Field::new(&alias_for_name(name, &fields), col_type, true)); + let aliased_name = alias_for_name(name, &existing_names); + fields.push(Field::new(&aliased_name, col_type, true)); + existing_names.push(aliased_name); } }