From 1956a81dd2fc2ca5ff30791d79cea7e1fd22f628 Mon Sep 17 00:00:00 2001 From: Luka Peschke Date: Tue, 27 Feb 2024 18:19:54 +0100 Subject: [PATCH] fix: introduce regression on alias generation introduced by column selection Signed-off-by: Luka Peschke --- ...xture-single-sheet-duplicated-columns.xlsx | Bin 0 -> 6101 bytes python/tests/test_alias_generation.py | 42 ++++++++++++++++++ src/types/excelsheet.rs | 19 +++++--- src/utils/arrow.rs | 18 +++++--- 4 files changed, 68 insertions(+), 11 deletions(-) create mode 100644 python/tests/fixtures/fixture-single-sheet-duplicated-columns.xlsx create mode 100644 python/tests/test_alias_generation.py diff --git a/python/tests/fixtures/fixture-single-sheet-duplicated-columns.xlsx b/python/tests/fixtures/fixture-single-sheet-duplicated-columns.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..e07cdba6a49ca0e60aaf21ea4b4970c7ca1c3818 GIT binary patch literal 6101 zcmaJ_1z42Z)}Eoe8BE$jkLcsvZm^g4 z&FiZ2Q(%JA2v>&+cAW<^P3<9rmSu*Cdy*y}L`VJBm9*_`(Vev^>D>^1m!A&7!_-Dz zaH_%Go47qTlnFU8cPax++*u>J!u=bcn@K}37POza;5&Cd+o05*^gXF#i|igl^VPvQ z+LPQ^@EGEL6=Tp02XzU(ehe~c>r?fB!$w3j)G)E$4T@8LL3*JW1pv_aS3eQ{<)`Z& ziq@WjW$+StZ|n1V)&l3&-J=A!g%eynGep7J=v+$Gu~as@FS0JYD9wgkCPSqGlaT}2 zGg4g8k4gJ6rmqG$%;$McZxw-7u%2_4=eGRDB@ zfpi=)H+l@_d*)CzO~aap6mo6aW3gS|PD#tc-1;D$XCt_qJe_JDZr1BTl}vh0HvTks z4%XnO`1}Pib%}kwVsh@Jpwq;=^nzNfNeous+)11&o4=N>5p7oy&sEEDKS}$frGy8D zmntb}KE90Dn3R?J{`qiV?~O*iv|^N=i-)Up{nt0$6$R)6oqHC$w!b-WO04|GWJy^E*5P*yxFXtVy5lt6O3*B@?Pw(XHslc)0o*6!;A#YD?0_U3mG|EvuPB}c+VN=Hs5$v_eNe=^D za|cvP=0_$}WeXE}GL^sJhBTh@0;?`+Y{={3^k}?7KuKl3@#a7?D~a@T1s6Wf0!dU4$k+w zsMYro{;~#B;dwF?(B>rFc0WJN)t!($;N{oV=r;o1vXxp-Z+2L{!J-JoXx1tik(Mp9 zN2ERUW}YwpEP;8v@K#;7JDVdJ=sL~TFKepftQL7wnOmJ(G0GY^$@?wh0TiDT<4%9F zRO?rP$v}Q6XYq*u%Nh%Jy(a=u$ErRV{2h|8>4#fU%htLTeYb!e{r(fjAs|BzB(Y_JqP;nPSnVM0Y8!O7O;#@t1#@(NT9gYIyz zZ<*~1V^~U=BTPt)jbZ%>WB7kz4DRA#4}o8yYx992G=rDQd# zls+b6(rkAHnY_|u?*f)8Rlgp%$)*%KZ&Q&(KVllP*NPZ z%$b&Omkk(knxb0~Ghdz=wkge&x1DXpP&G(Ie3giWq(nX9fYckm;M*r{P01y zx1i>+T9t7HQ@L&0sbqGe@yX$0Ci7{A6!2q+KYY;sVp3?{c(6CHI$UGYBaZ^>{Dw_5 z%kxx>q5*`WJS}(S$28kj0nt=v`KgbH0sdG2Ovc|_ul6$Wf zCOq;Xu)a4fwqi}{z?;An-#;}+$4LRY2Vf23z@uG#u9~G-uIh*S- z!TUAFwMyJItpwu({2?d^oVRpCzXicxfci)~9u}gH(yW&2T+q8)8YE9Q;!jT+FPZlF zkVXCFZaYC}x&j4EgwVee_gvn3v62e>jATh0Rk)M%R!VTcP?Ad9Ho{yky|=TlF%DQ1 zw&iRT*{5t@RH_V(dnGzzc=~y61fjZ8qUlh#sdK*Gc#|(MF!heP+m<0F-+!-$FzW1$ zVUXCyc2fGhaWVBqLVBbxSEn7%+>H6V>KMAsw^1g{-#$!v8nd65madstO!d+oUmo1l z*<63+A#Ck6>RVy8JwNO`W#?ADLEWo_*=uYzA&@XCLD$!4%k8ik>$*IGf$r8Ui=Ro;9KOwGhG^mVWi9htT+#4`LiT}C*z&0}e*lY}Au~-vuoc(;%E}?9tVcU+{H>tS+u_hqjm+S!}mUYXo zJa{G57_y`21@4jXkt{vy>UJ^&j@3Cl@}Xu}<#ZCKx`ZFzN>QQe3Shp@dXrY!W0}8{ z4PDuIjR+1ZUS1O~K0GJg>5?DwSZ*{b3=2>-BibJ|>D!fmIl4!&>F(fS<93i=y>O

Rn3GcykJ-uWj&tCvtaxWj9_&u=wRbdJ^e zbypi3BX^k_Q=`Pwa&C(ls1Y>8VM4>LZY$Z@EUxIS7&f+&oq?sjY0=8}&nR~js05Ri z4PUBhvbdhp3|1vaq=3z9r`PQcHo9{Q4+}er4zB!$SCl%gy(hG1+K&xo}ge`ol`TVoRMRF)%qd4r- zC5W3p93N!f$4qf6R^rgFlV9qKRaS@;V%}6V%UzZT)YfB76McX9W~1U*BlGOzACppE zNn0(8)Q3Gc2BqeS6+)M))bpihoFws@toLyqH9v{wludo8V+nA^S=Hnf+?X0WEy{JJ zuzof3z8~FV8N?tST8t4n3&+k+V_e$}fi~<}F^{65Xb&HtK+TOmwXIKZ=RQdNi!?P=GXfwzuxsu|o7FFDm^eCYW+jcO8Oh^rxaUYy390hAd9 zosjpRKpB@iM+a6~7G%_uQ>xOVWH~Boo*suo=5X`gGPL^xUyp5TuX@3a*C#aAllzUs zymX1{^2Mw5Mw(cb$Xl0701~~IljUyw@hdr!+G7gc-bDkA-Ia49KP0l+C3`ns#W%`B z4$K_i&7Bw3pY8xG;$7agV7Yu$YximDvP<<3ni2FdD=5?5*p?N{1}{-y<3-KUC2G07AS!@e$eIdEMF!26?RQ0<&?n`kCHY zV!#A#^hCi&*#UjCL$>B1R2Q=a?0rB`%pDc#*xcw@J_tpv*dH1jW^RJ#WzE%A_FJ8#DR|LnA(2NXLZeasRxq(6O zRtvKuZFP8ld*Js&1yT~c=*sso&OW{u<}YSJj$E_6tZK&Lx(~upnpiEo5$(Mf&K^9% z{dqo}#G?1Kpz<8BXZKb2`{rdo6nMGBs^YBffz>_%S1dLj1Egd^VUKj5@LxO*TMl}# z5k=x`rI$DEi`EzfcCZt2RH3`!DJ`&|&zQjA)~yQ4e~J)oB*BM-;CReJyz6`Cwi5}c z@pzTN#3>BRTiq1HwJ{9miM?y7sibe~mtjuk`xyO=Btg7ZE?!_JZ)MQdbMl)4CaRGS zv(I{$Nw3@#oGzHYg>=HeA3IQDXM{>dv-5leA6?uAMs(DoCIR^c~ z9PldR{`Z(<2KUGjx~25@?v-2XO+!GWdbI8W)fivV##=Di)lvsOjf#PJYwU63C%YvQ zDMhN|K6_ZJzGDB8%ko7qhRq-#KT|wnszrk^e3h(cf~fdO;W~=wdkufIB`)vaO=|!UHaXtw8d?3Y+vIYF9nqOEh0D6>UMO1-*>=- z-!i6NhvRu_bn!#FgXbzD{$}~v+CTsx;*Sv_x*icTC#S1`WW|D^ZCB|1L9W|gtx~KM zszFm_Gl^{@;bL9Op=Q*b$iu?Uap?mT{37{$V{Fvo$BS=0s|x@I4l6^!E^>|*9JZvH z>eV-1oGC?N;f6icZwB+26sjx<_nAZq8Q8-VS;xooyj6V^mC3w#B^Rb^>ILdd-9JFa ziAC^KTV9gp12dE#>Eipd`zO(>`J(6HI=d=g2CK5EA9Xl9*-z`T#_Y7q_S1x^Nu<Lhg+#d1JcYEu+6**NSWOQTRiQ_GPvG>2gIK?dl%xU3wCml8N4; zDlUPeTW*X{9=$gwDEeXpsJ=8TBKi|Qy@ps0D?dj1BpvCwe?RV5&G0WAyTVU zYGpfX3C0YxrEM$#6U22!G$(>dPZ*zyo$Rg)ovRT%p4&d2g)(?K#9bleaKBJDK%-R=xuaMWD+wPPmGMBEie>N%&V$CiA_wWi5 zVY#m@pvWMG^?n71A+44mb%$I=X0Jb;o%MkD+|L4S6FxX0FArB34AsTIm|hYt@U?bD z-Kl!;cI?M+d-|mTCdaajBxY#y$pmdhk9YSyiJeQn#Z{Mur~`L0G0~=JXj7Q_9TI=s z?>n`|oQ$q37jlwDXkpsbC)1eu;=Q$k%^hB14zGhh{cM_Vr(vYRkvLSuxQZDNg$(d( z{&{`kcQya~r~Lag^mjYghd)>I!C!J4`Okkh5&xa1{;qtztG_xDzXXEpQIN{NPm6vx zaJ`+mYW{yo5Bg67zeWMSt6r}>uKJx{LX7>th5w)K=XWdDv-?$*@=J1&>p|`S{(H^x zyM^nC@2U#;C4mJ0s0n^oyw1;8dGVK&AOrD>UH;CJzpGy-sw)!zC6Y*M|5N{)(tp>! wP6z*t9E{|ew#EOb?RNv$vH4E}s>qN34~J-|p(9}o0N^68R%A-L`04Kd06R_^8UO$Q literal 0 HcmV?d00001 diff --git a/python/tests/test_alias_generation.py b/python/tests/test_alias_generation.py new file mode 100644 index 0000000..562dd3b --- /dev/null +++ b/python/tests/test_alias_generation.py @@ -0,0 +1,42 @@ +from __future__ import annotations + +import fastexcel +import pandas as pd +import polars as pl +import pytest +from pandas.testing import assert_frame_equal as pd_assert_frame_equal +from polars.testing import assert_frame_equal as pl_assert_frame_equal +from utils import path_for_fixture + + +@pytest.mark.parametrize("use_columns", [None, [0, 1, 2], ["col", "col_1", "col_2"]]) +def test_alias_generation_with_use_columns(use_columns: list[str] | list[int] | None) -> None: + excel_reader = fastexcel.read_excel( + path_for_fixture("fixture-single-sheet-duplicated-columns.xlsx") + ) + + sheet = excel_reader.load_sheet(0, use_columns=use_columns) + assert sheet.available_columns == ["col", "col_1", "col_2"] + + pd_assert_frame_equal( + sheet.to_pandas(), + pd.DataFrame( + { + "col": [1.0, 2.0], + "col_1": [2019.0, 2020.0], + "col_2": pd.Series( + [pd.Timestamp("2019-02-01 00:01:02"), pd.Timestamp("2014-01-02 06:01:02")] + ).astype("datetime64[ms]"), + } + ), + ) + pl_assert_frame_equal( + sheet.to_polars(), + pl.DataFrame( + { + "col": [1.0, 2.0], + "col_1": [2019.0, 2020.0], + "col_2": ["2019-02-01 00:01:02", "2014-01-02 06:01:02"], + } + ).with_columns(pl.col("col_2").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms")), + ) diff --git a/src/types/excelsheet.rs b/src/types/excelsheet.rs index 2596dd1..c384256 100644 --- a/src/types/excelsheet.rs +++ b/src/types/excelsheet.rs @@ -1,8 +1,11 @@ use std::{cmp, collections::HashSet, str::FromStr, sync::Arc}; -use crate::error::{ - py_errors::IntoPyResult, ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult, - IdxOrName, +use crate::{ + error::{ + py_errors::IntoPyResult, ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult, + IdxOrName, + }, + utils::arrow::alias_for_name, }; use arrow::{ @@ -352,17 +355,23 @@ impl ExcelSheet { let available_columns = sheet.get_available_columns(); + let mut aliased_available_columns = Vec::with_capacity(available_columns.len()); + + available_columns.iter().for_each(|column_name| { + aliased_available_columns.push(alias_for_name(column_name, &aliased_available_columns)) + }); + // Ensuring selected columns are valid sheet .selected_columns - .validate_columns(&available_columns) + .validate_columns(&aliased_available_columns) .with_context(|| { format!( "selected columns are invalid, available columns are: {available_columns:?}" ) })?; - sheet.available_columns = available_columns; + sheet.available_columns = aliased_available_columns; Ok(sheet) } diff --git a/src/utils/arrow.rs b/src/utils/arrow.rs index 7da209b..81c95e1 100644 --- a/src/utils/arrow.rs +++ b/src/utils/arrow.rs @@ -118,20 +118,23 @@ fn get_arrow_column_type( } } -fn alias_for_name(name: &str, fields: &[Field]) -> String { - fn rec(name: &str, fields: &[Field], depth: usize) -> String { +pub(crate) fn alias_for_name(name: &str, existing_names: &[String]) -> String { + fn rec(name: &str, existing_names: &[String], depth: usize) -> String { let alias = if depth == 0 { name.to_owned() } else { format!("{name}_{depth}") }; - match fields.iter().any(|f| f.name() == &alias) { - true => rec(name, fields, depth + 1), + match existing_names + .iter() + .any(|existing_name| existing_name == &alias) + { + true => rec(name, existing_names, depth + 1), false => alias, } } - rec(name, fields, 0) + rec(name, existing_names, 0) } pub(crate) fn arrow_schema_from_column_names_and_range( @@ -142,6 +145,7 @@ pub(crate) fn arrow_schema_from_column_names_and_range( selected_columns: &SelectedColumns, ) -> FastExcelResult { let mut fields = Vec::with_capacity(column_names.len()); + let mut existing_names = Vec::with_capacity(column_names.len()); for (idx, name) in column_names.iter().enumerate() { // If we have an index for the given column, extract it and add it to the schema. Otherwise, @@ -151,7 +155,9 @@ pub(crate) fn arrow_schema_from_column_names_and_range( _ => selected_columns.idx_for_column(column_names, name, idx), } { let col_type = get_arrow_column_type(range, row_idx, row_limit, col_idx)?; - fields.push(Field::new(&alias_for_name(name, &fields), col_type, true)); + let aliased_name = alias_for_name(name, &existing_names); + fields.push(Field::new(&aliased_name, col_type, true)); + existing_names.push(aliased_name); } }