From 9bec071fb0bdfb09c876cb9d2d9e8d81908fd2cc Mon Sep 17 00:00:00 2001 From: Eric Jolibois Date: Wed, 14 Feb 2024 12:18:24 +0100 Subject: [PATCH 1/2] chore: s/value/sheet/g --- src/types/excelsheet.rs | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/src/types/excelsheet.rs b/src/types/excelsheet.rs index e7ec2eb..bb55775 100644 --- a/src/types/excelsheet.rs +++ b/src/types/excelsheet.rs @@ -246,17 +246,17 @@ fn create_duration_array( impl TryFrom<&ExcelSheet> for Schema { type Error = anyhow::Error; - fn try_from(value: &ExcelSheet) -> Result { + fn try_from(sheet: &ExcelSheet) -> Result { // Checking how many rows we want to use to determine the dtype for a column. If sample_rows is // not provided, we sample limit rows, i.e on the entire column - let sample_rows = value.offset() + value.schema_sample_rows().unwrap_or(value.limit()); + let sample_rows = sheet.offset() + sheet.schema_sample_rows().unwrap_or(sheet.limit()); arrow_schema_from_column_names_and_range( - value.data(), - &value.column_names(), - value.offset(), + sheet.data(), + &sheet.column_names(), + sheet.offset(), // If sample_rows is higher than the sheet's limit, use the limit instead - std::cmp::min(sample_rows, value.limit()), + std::cmp::min(sample_rows, sheet.limit()), ) } } @@ -264,11 +264,11 @@ impl TryFrom<&ExcelSheet> for Schema { impl TryFrom<&ExcelSheet> for RecordBatch { type Error = anyhow::Error; - fn try_from(value: &ExcelSheet) -> Result { - let offset = value.offset(); - let limit = value.limit(); - let schema = Schema::try_from(value) - .with_context(|| format!("Could not build schema for sheet {}", value.name))?; + fn try_from(sheet: &ExcelSheet) -> Result { + let offset = sheet.offset(); + let limit = sheet.limit(); + let schema = Schema::try_from(sheet) + .with_context(|| format!("Could not build schema for sheet {}", sheet.name))?; let mut iter = schema .fields() .iter() @@ -278,25 +278,25 @@ impl TryFrom<&ExcelSheet> for RecordBatch { field.name(), match field.data_type() { ArrowDataType::Boolean => { - create_boolean_array(value.data(), col_idx, offset, limit) + create_boolean_array(sheet.data(), col_idx, offset, limit) } ArrowDataType::Int64 => { - create_int_array(value.data(), col_idx, offset, limit) + create_int_array(sheet.data(), col_idx, offset, limit) } ArrowDataType::Float64 => { - create_float_array(value.data(), col_idx, offset, limit) + create_float_array(sheet.data(), col_idx, offset, limit) } ArrowDataType::Utf8 => { - create_string_array(value.data(), col_idx, offset, limit) + create_string_array(sheet.data(), col_idx, offset, limit) } ArrowDataType::Timestamp(TimeUnit::Millisecond, None) => { - create_datetime_array(value.data(), col_idx, offset, limit) + create_datetime_array(sheet.data(), col_idx, offset, limit) } ArrowDataType::Date32 => { - create_date_array(value.data(), col_idx, offset, limit) + create_date_array(sheet.data(), col_idx, offset, limit) } ArrowDataType::Duration(TimeUnit::Millisecond) => { - create_duration_array(value.data(), col_idx, offset, limit) + create_duration_array(sheet.data(), col_idx, offset, limit) } ArrowDataType::Null => Arc::new(NullArray::new(limit - offset)), _ => unreachable!(), @@ -309,7 +309,7 @@ impl TryFrom<&ExcelSheet> for RecordBatch { Ok(RecordBatch::new_empty(Arc::new(schema))) } else { RecordBatch::try_from_iter(iter) - .with_context(|| format!("Could not convert sheet {} to RecordBatch", value.name)) + .with_context(|| format!("Could not convert sheet {} to RecordBatch", sheet.name)) } } } From 5cc29d6fc5a799f394c8630563e35d1e3529af9f Mon Sep 17 00:00:00 2001 From: Eric Jolibois Date: Wed, 14 Feb 2024 12:24:45 +0100 Subject: [PATCH 2/2] fix: consider #N/A cells as null --- python/tests/fixtures/sheet-with-na.xlsx | Bin 0 -> 8927 bytes python/tests/test_fastexcel.py | 17 +++++++++++++++++ src/utils/arrow.rs | 7 +++++-- 3 files changed, 22 insertions(+), 2 deletions(-) create mode 100644 python/tests/fixtures/sheet-with-na.xlsx diff --git a/python/tests/fixtures/sheet-with-na.xlsx b/python/tests/fixtures/sheet-with-na.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..e098fc93705b1447e98d5bdb45b7e15f37cc6ce9 GIT binary patch literal 8927 zcmeHNg{y2>zBm z+pCYa*wchnW>g~F2k%%?*I&RB8YrE>{WeB0X=lsoXL|gNQ~6#Dz26+Mr)i-koq-Ok zrFN6sRhmj0gr$*h%KRDf0x&!QhBH6Bh}P03NUg?CURxS{QfAbXV9(y}|J2{Noh3fz zMDwcoF_DcKr*V!{2wmv-3_#O9<^yphuhU3Cwfk{q?ff;fF$P5J zF7<7S`t6La8Hep1rfvrD-l72jcXucN&A-61P7la%f`~O$1nM3jU}@$Cc7Skm{jC27 z$Nyps{%PnHiE0{MK&9Kx`DyUa}Vp{@);@S+nFhGb;-j} z6oX)aKL+6QVsTr2bZ0C46*2gPq73z3mEmbut{&)YkK9rfTq~Bl@jRzar_M4IK|U;= zFeqDjV@aM$|009J#Gzab?g-Zta!i6k%5Y-wj1Z$9HNyqViz=ik8J*qA@ahJ^+^wXM z%%JJiqD^ecNMY5T@eHCqh^6({D*rwQy3REl-^!)f2yd#ReOu45)Mj!YO2eW;;1u?tI`wsd;V`xei5DfKqkc3DiryV0^q!|Gc zasURBw*&X@IPr3Jv$u41w*Sdq|Hcdwg2E79{<}+swwg*e5Eu5~HUj9C=}Cn5)t#Gu zM{oNfM*jy^_#-;Lz>`G^wq_H9FDl$9&fz}$L%p8o0yrxu1g9PBCDHizd|+Sjbl*Y28R;b&-XLVak)j>#~U#jt$=zyK zXBbDVKc1`k4BL0)Utz6Lma9;g35ffLyVFMWDbfV7CPLQUX0+(-FT2ZJ53shwk2nTT z_Mflz_f9P$R{HNDvk{pYLhe(E$Cp^2DA-qzPNsOEI?AOIQ#?+!0%>945~wt z;bk{aTO)HeL`h{L5zsqe$w_cP&2Qo$)$w@g5G$@bQOnbS5h8>d#+?T4vCQ!hu%9HB ztBz*N<&LRMK{3kCMt-7Gvt`8o&Eo+s6s%X4Mm@>D#nI?**O&_>dI6)Ws=+%rUV39r zy0gp(q@vPG%UDX}o+6XV=iJP{z*KnAs?zaY2)&q2SgCqld)j3Lwp-JRLfR1& zs~}|r(??cXKp&eE7lgM8nWYl7z(Lcwj@KDqnzd7Gb#==axrqjgzmrm5zJ|eZO(;CT zsQRIfGOF%1X~%IuxB3E;LSougBx*`=XUmpZf){xD(NW=SkH5tsQ|+x>=?tCDyblfQ zj1#_YU`Gw>K@O0v0FHCyLXX1BIydA51ja}(I~Yu-(!E)pyA^uJxSI{6KF6vPOWKG7 zZMv%bK+hgmMTgQ}adz!qt*ZjG@UV$AGMAgY)um(HyyM|30XLu6ulb7CZOVnm#~*!krn78i?1xX z#4^XDrnqqSzL`ngk9DyB-tD;(pClCZ)P(wm6XO<2QS-zf=x@b87<#`Bm*2Rf*mgJ- zFD19wQR?G4=hwbd1~KGlb&&QT9f5=Vt91jXtoS*eDC?+9>b|XmG@4eZ|7816$yV+Y zkIb`?5bBBfy+BCQ{LIv*$u{b-!Fs^?**>K+-BonduDDnv zVp@wbXOCuheYrvY$U_MEhBvQ-yOG*2~N*k_%440pnlnQYq9mSa`w;YVpG{G|L&%(Jr*J@+r zGIFpz$5$cr@a~i4>DOZyn$?*HGU@D?d6Q}xmL&G8MYl_?4rAr_;2PprF{&MgJicJN zzM!-H95QjuS%`-8!a=)Lpvbh*Nynj-cIahU>5C+gtu&UyX^*3wn)25fmC%-u^-oo_ zsw>nMbkPv0dt=a+S=Qu|C=2_Z3}JTz(kM&UhF72@0rav;nndKfzzUSfEgmK706fKN znQ{fa@5nADdU_SP+?&JmO)Aj@__Xa0`=L*{y;J?l7o!N9U+-n4JhWCWEU?wg^BfOO zmzqp9KV+HG$bd80@4#vcd}RDOJMhf~RqlUEQ9vsQ8@T*JAC(L1#a?!(%n;C*0|ssk z&BN4pJ2ZeLuv0fq=<>uzrk=}JdghqHbShF3vUQ&=!op9x^1=`BMJgjUf7~v^##Wcm z$5un(sGk$-#+lsD3?5Q634$?;Q;^=icC0_M*TrtP2V<4IVPaySV8U!DjTI~sT$YzQ zE}n77^XZ-CF&s}#WsUEOkHO}V6B=y)Hm)T+NW}t;9XZ-h@)K{MQ%)$$&02)72I?kB z3?$aexG}p~?9oc6>?H(8hYsXLhqQledtTbMH4mX!f9JGT_f}@GrsjO2PPoAF zq}+cGbzx|~|MLwr3VtIm_3mQ%?J_X!G+`%U<)%M%rN5|nQCvP(d`;q^<-W*~ICJ}n zKXYvV=ZsHjuaXo4tE0Qt9Fr9Z_Qph+->+W0>xzfut_t< zJT${lxyG3305bB#j#0dY6&Ii91o|GkACAM)eHGImbS9>nwP_xSXT{E>Gft!i*$c7o z`)epA$Kru8oh~16{7`|v33S7A?8oSxLAjK3$V;VB7KXDCf5HfV)%fwbP{`t=*j}Yh zhBw5$lvH60>BQ`X+LfdXi5I4!mYRM6ymSmZ*eReOztUEXWu8|u*s&~&Ps(bT@@l?y z)yhP^+|b^mqMJRk$l(#CUhx}0XTir0w7F!uStn`WCLYs&B)5iUDC?Aa$J?Z8?r$&# zIoj;78>htxGx94mLn(RMd%AsN)LuVcjAX+fno?9hBeI?1lwg6ac{P>S@?>9cka%I# z%;bF&RT=c zbG@vy^JSNcC;X^djj zHpZ|BFLlG2CNS`w!|@-PH|nxtgY+)mNrc=MF+|q`c?(iZ%%cn_w9mXPyZcq^f3W9H z3DR=si2L0hY#v9niwB)-Ty~k&Hr27IIH5D?=Ykt_zBu4_Q4X5o}n=?8RCH3wD8&-Ne1ScKqQA0kO+ zcLQa*6Q%GrJWKWl`;bkIX}3B(s&_l#lmWm2U*L=!A>yki`k>01;=+?CH7O|{#FbYh<*SA zBTLtM25q>scC>}%cm$~0Vysqq-GVw4A60QP%K1jK9$!7SyYPqWDBbj!RQ!TS#@>V|z&b8YCbF zqZWLF>rG=%=*?58c0d*EeZzXES>{OK#4`IYtZr!-?BMgmWXEc2XV*|~VwNbp#ez$6 zxp|O@&{~$RZ+1M6+L8eB8qI3V1nV^F1~9RuU3ohW!_Yd-mw3{N0+=ei2A8T#w*}n>sTy0umrBv%-m#$ zG)#Z-S|FWcE}4Pzo9iPZHnwCbs@H{YY&X9W=+>_l%yE3eg$y`gav z>efva&96%zpe9~=vp$WJO#>Be7>VyMR8DROZs>X$rlGNXoglSPeNLg}L8@hjBSUCv z{lfHXhQXM-!I)L&wBN37J+S!skY%M~1RyzDSW%BwnqT&x3t+7@2hcgs> zllV$AkTfm1`Dmf;I$xVzee67J%V6>aoD8w<@8f{uz)+H94$-01Yr~%AjEG5IjV8+` z3OBy3xQBT5^7PD&A&(yEjCPAUUDnxQG_W!7+g}oFkakUbqct&18k4-$Y4jvh&T+c% zvPdN(sg00J7I~VaK7hvaD%~%+o70RlCRBV<#Y-}Rec`QV5cF~`1-mdZ<|Xjy$t0HrZF!_ zGc^D~v;A&e^oeb2(miQ2LUa|o{781PmaWj~MxE2G%ATtU2Y<>3)Ve$w5QEYIH;#wq%eADhR_B^kzHEVc?WbKV8AI-t*kW;m0{E!uH)!?g48>dRkx zlIjoMR6pJFV0*Y^{z~g9&cyZ=(o>63c~9GSz0+KK-_|&XUgE{XsG5Bysaw8UX?yzB z-*q34Hv776AlJ~*)9&T-I&qeJF!lV=oi8FUO)HJ}v3=*O$0)@PU<}wevr|r*+k*G& z+!~BA?3loT`GzIvDIn3328x^98(Q{f=4-40R&9WDyO#R9e-tH1uQ4<85Q2m;DgZ$G zJN^EA!-Uv?!R`?5-`32;j02&g9#%7#AClKcC~si97MW zW^&EqWzba|dC@uFRjroKSW!7gw$VwpA{7&Se64?aE5mn0orUMrvdv(t28{&S5~(vJ z<1@spt6684@Lx$BWhAU_9DojTOs#ru;}W&T4V^QA(!8Q0{`QjA!URX4=F=dO@R9T| z7&*=XYD%jkV_hVxP;gMRKRB=Yu zAOr4Wp=(8wW20rvhgFebMFlFR{#n}f>Cnr|!lfm%DaObxQJVxL%`hF_?)SBxq8$D` z`mu77T%?gQXmt@pHYVfTV!e@fWb)zUOf!1lJOal2$Qf>?UV*#~3~!95^5nICc-Yx= zt4BDW9gs~HVb7?IT_t3{m8k4c!Q#Z>Q`Y=0iCD$EyWgth(GHwYAXf156T_`}r!E3A z6<}6C1o-jci*T+;XXl9fq0}e!{C7u=9{%(oHKmT0J%2qDR5U^}nL#%;&XSHD84hSe zxR7=x+ovz5kF_s?2aPZvVPp4@}+9#JhUgm5yv1X zEk<&6j%&@5sXe+CD}UWXOd{sEZU2vc)v_w7YMiQZfJQ#E>-Qx}Yd^=BG&Z z_gy8y6|G1a;>s@KAVeGp;RIr1;Rd$Sad)$Ivi|MG%xYNrekTwYbriNl)w$zX78%LN z%H|#F8$25ScvRKSn$6eGSni(5?YZ?*<_ey%M{kZS{ewXc^q0DYBBN8CO=6t&xdviW zZE7!wbNVzJFFylg$b+Id`RV7??W@RRbKXzV#SJl?jt!Dtmw&HPG=qe()J&C&EpBX} zgp`d-Sz2J*<+@NDT=WmSxF|d&E0E9sL?+%raqcQ|Jz*x~FAasvKlrDyrKyixDItdK zfq*R@LPxZ6ex~W>>;mC_=IjRkR|f2V@*!f;lM~JC!+?aJ7SXS9620PrX0_xv9SXUP zEvn<=VJ}zpyHv51Mc~fuC6$m5+6_<$AH`*=saMZw3ePI_Q4X6ZT_;J;QbG4=3!6N${fwz;rzfIqfLk_aE9S%wY`8koP^Mg)y}%Ir?w`wiRH` zrNxfod&fqlBzbbCHztvi$0i%gGFkaQB81*(M3P9oS}uob|3J*)-=WoMYKW152tNvh z;DP(+@Uw7n`Cs%QrtDuOD^blEq5FjIqg>Ho%~J?FYGH*`SSmd^skTH)^t4tldX`tq z7+LXZXIi6vZ1St;Y#a@@sl?loJ)CTIUsbEHcinDU(a12&y3=cr*giFvWK1ZjKQ@f_80MMDrH}whF zhyC#5hJ6M1w|-64`0|{)=?^&1m-5r*d7hhBwsmh*APv8p)c8^$mAAZt8GA}u!r4ZR zCBn4MnqmC2 zANM+uD)7Y_ALQ^Y(Pk#{c)k+aq@jap$Mh9g5vzXU$+BOsfsGAnC4@ni&4D4SJ;E!) zwX#|uU zaAN!~m+0!-5=4{F7OdRD`kPVPd#_B6y`Bk$rEPvWJ@V8tO}jL+7GRRBGP?Uu#u5pc z6F~<5{3*e|SMA^JKYUoAsq$9`e@%M++wfTNStAk%Roqssc zBKc3p|F#AF)yuCO|A!YA^4~}BE93v_;jc{khdlsrj{*Sr8?XLq{?{krpUuOl{$&2I Y=c1+xDq?j3fcuDFFoL?3X@8#mA6PJkasU7T literal 0 HcmV?d00001 diff --git a/python/tests/test_fastexcel.py b/python/tests/test_fastexcel.py index a97e5e8..d033394 100644 --- a/python/tests/test_fastexcel.py +++ b/python/tests/test_fastexcel.py @@ -431,3 +431,20 @@ def test_sheet_with_pagination_out_of_bound(): pl.col("Amazing").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms") ), ) + + +def test_sheet_with_na(): + """Test reading a sheet with #N/A cells. For now, we consider them as null""" + excel_reader = fastexcel.read_excel(path_for_fixture("sheet-with-na.xlsx")) + sheet = excel_reader.load_sheet(0) + + assert sheet.name == "Sheet1" + assert sheet.height == sheet.total_height == 2 + assert sheet.width == 2 + + expected = { + "Title": ["A", "B"], + "Amount": [None, 100.0], + } + pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) + pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) diff --git a/src/utils/arrow.rs b/src/utils/arrow.rs index 12b2df9..3523e03 100644 --- a/src/utils/arrow.rs +++ b/src/utils/arrow.rs @@ -2,7 +2,7 @@ use std::{collections::HashSet, sync::OnceLock}; use anyhow::{anyhow, Context, Result}; use arrow::datatypes::{DataType as ArrowDataType, Field, Schema, TimeUnit}; -use calamine::{Data as CalData, DataType, Range}; +use calamine::{CellErrorType, Data as CalData, DataType, Range}; fn get_cell_type(data: &Range, row: usize, col: usize) -> Result { let cell = data @@ -31,7 +31,10 @@ fn get_cell_type(data: &Range, row: usize, col: usize) -> Result Ok(ArrowDataType::Duration(TimeUnit::Millisecond)), // Errors and nulls - CalData::Error(err) => Err(anyhow!("Error in calamine cell: {err:?}")), + CalData::Error(err) => match err { + CellErrorType::NA => Ok(ArrowDataType::Null), + _ => Err(anyhow!("Error in calamine cell: {err:?}")), + }, CalData::Empty => Ok(ArrowDataType::Null), } }