Skip to content

Commit

Permalink
fix: introduce regression on alias generation introduced by column se…
Browse files Browse the repository at this point in the history
…lection

Signed-off-by: Luka Peschke <[email protected]>
  • Loading branch information
lukapeschke committed Feb 27, 2024
1 parent 8dba9fd commit 1956a81
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 11 deletions.
Binary file not shown.
42 changes: 42 additions & 0 deletions python/tests/test_alias_generation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from __future__ import annotations

import fastexcel
import pandas as pd
import polars as pl
import pytest
from pandas.testing import assert_frame_equal as pd_assert_frame_equal
from polars.testing import assert_frame_equal as pl_assert_frame_equal
from utils import path_for_fixture


@pytest.mark.parametrize("use_columns", [None, [0, 1, 2], ["col", "col_1", "col_2"]])
def test_alias_generation_with_use_columns(use_columns: list[str] | list[int] | None) -> None:
excel_reader = fastexcel.read_excel(
path_for_fixture("fixture-single-sheet-duplicated-columns.xlsx")
)

sheet = excel_reader.load_sheet(0, use_columns=use_columns)
assert sheet.available_columns == ["col", "col_1", "col_2"]

pd_assert_frame_equal(
sheet.to_pandas(),
pd.DataFrame(
{
"col": [1.0, 2.0],
"col_1": [2019.0, 2020.0],
"col_2": pd.Series(
[pd.Timestamp("2019-02-01 00:01:02"), pd.Timestamp("2014-01-02 06:01:02")]
).astype("datetime64[ms]"),
}
),
)
pl_assert_frame_equal(
sheet.to_polars(),
pl.DataFrame(
{
"col": [1.0, 2.0],
"col_1": [2019.0, 2020.0],
"col_2": ["2019-02-01 00:01:02", "2014-01-02 06:01:02"],
}
).with_columns(pl.col("col_2").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms")),
)
19 changes: 14 additions & 5 deletions src/types/excelsheet.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
use std::{cmp, collections::HashSet, str::FromStr, sync::Arc};

use crate::error::{
py_errors::IntoPyResult, ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult,
IdxOrName,
use crate::{
error::{
py_errors::IntoPyResult, ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult,
IdxOrName,
},
utils::arrow::alias_for_name,
};

use arrow::{
Expand Down Expand Up @@ -352,17 +355,23 @@ impl ExcelSheet {

let available_columns = sheet.get_available_columns();

let mut aliased_available_columns = Vec::with_capacity(available_columns.len());

available_columns.iter().for_each(|column_name| {
aliased_available_columns.push(alias_for_name(column_name, &aliased_available_columns))
});

// Ensuring selected columns are valid
sheet
.selected_columns
.validate_columns(&available_columns)
.validate_columns(&aliased_available_columns)
.with_context(|| {
format!(
"selected columns are invalid, available columns are: {available_columns:?}"
)
})?;

sheet.available_columns = available_columns;
sheet.available_columns = aliased_available_columns;
Ok(sheet)
}

Expand Down
18 changes: 12 additions & 6 deletions src/utils/arrow.rs
Original file line number Diff line number Diff line change
Expand Up @@ -118,20 +118,23 @@ fn get_arrow_column_type(
}
}

fn alias_for_name(name: &str, fields: &[Field]) -> String {
fn rec(name: &str, fields: &[Field], depth: usize) -> String {
pub(crate) fn alias_for_name(name: &str, existing_names: &[String]) -> String {
fn rec(name: &str, existing_names: &[String], depth: usize) -> String {
let alias = if depth == 0 {
name.to_owned()
} else {
format!("{name}_{depth}")
};
match fields.iter().any(|f| f.name() == &alias) {
true => rec(name, fields, depth + 1),
match existing_names
.iter()
.any(|existing_name| existing_name == &alias)
{
true => rec(name, existing_names, depth + 1),
false => alias,
}
}

rec(name, fields, 0)
rec(name, existing_names, 0)
}

pub(crate) fn arrow_schema_from_column_names_and_range(
Expand All @@ -142,6 +145,7 @@ pub(crate) fn arrow_schema_from_column_names_and_range(
selected_columns: &SelectedColumns,
) -> FastExcelResult<Schema> {
let mut fields = Vec::with_capacity(column_names.len());
let mut existing_names = Vec::with_capacity(column_names.len());

for (idx, name) in column_names.iter().enumerate() {
// If we have an index for the given column, extract it and add it to the schema. Otherwise,
Expand All @@ -151,7 +155,9 @@ pub(crate) fn arrow_schema_from_column_names_and_range(
_ => selected_columns.idx_for_column(column_names, name, idx),
} {
let col_type = get_arrow_column_type(range, row_idx, row_limit, col_idx)?;
fields.push(Field::new(&alias_for_name(name, &fields), col_type, true));
let aliased_name = alias_for_name(name, &existing_names);
fields.push(Field::new(&aliased_name, col_type, true));
existing_names.push(aliased_name);
}
}

Expand Down

0 comments on commit 1956a81

Please sign in to comment.