From f3ec0e57b3790b3cc5149e3d746b84bb48ef80c9 Mon Sep 17 00:00:00 2001 From: Oksana Belyaeva Date: Wed, 6 Dec 2023 13:10:37 +0300 Subject: [PATCH] TLDR-538 tesseract trustai (#377) --- dedoc/scripts/accsum | Bin 0 -> 39280 bytes dedoc/scripts/calc_tesseract_benchmarks.py | 142 +++++++++++++--- dedoc/train_dataset/trainer/errors_saver.py | 2 +- resources/benchmarks/tesseract_benchmark.txt | 170 ++++++++++++++++++- tests/api_tests/test_api_format_json.py | 2 +- 5 files changed, 285 insertions(+), 31 deletions(-) create mode 100755 dedoc/scripts/accsum diff --git a/dedoc/scripts/accsum b/dedoc/scripts/accsum new file mode 100755 index 0000000000000000000000000000000000000000..1efd2a47ac37b8f777fd58074895dc33ea0cc6f1 GIT binary patch literal 39280 zcmeHwdwf*Ywf{~+G(=3|14KnVs8~>l0Y!s?CV_!L6Ny3;EjkRzOfoW=8Rr2(3B(54 zj3FAWT5h%1M{lY1Ra&i5st`~idbu@PAF0wdK07g3qqP`YWbSwEz4nHZp%El_Bj2VNsa{#1Fl!}q~f3i49 z9ESJ^i79@WBH*fHh*fKv$9N7Xr7O)=fSrzGIZbs5iRLnIII?l6LNyh_kN~BlvTj~3 zqo9TiXgV=p;a$tdb5@E^#iMjxe&+iX-o;dK`8Cz`qq@<#i1~IpN2wEGoPnln9C5bax9q%g&-T|%@%sX?rYTLO z=S?~9+~P>Ec$%y?=_lEFm(|FesJ*qRNae9O2_t^@r;*ifEeV#5`DMwMqi^26eEkEr ztS8=SICZ3^?g9hSBq*4Dz**9WLKnGT;|w&=bvo zFU^1(8Su&s_^J%$31pCeJ_Eim1O8eD{a0s@e>MaDlMHw<^cy(s$M-VG?}XrR?eca8 z`6n~r`5Ev78T5ai0skZe-kCxFEXW%;?Z@T@p|A#iQ`3I%#d6? z-n|FBNQ@P|jO(QPC4RgJ{k|Nb)KSWM25oviVtKosYmxM5afB!xlhQ$Vsj6 zY9GoEP@s;oa_0$86ABawg?)jjS9npBfQUrHf!f9p>gJ0CYiG=GMIyECfLD0J;b0gj zDqFi8rCILsx_y2T3t&Q6j#PENfIy8xzB*D-7Ym87XQeMtC%ix-qSo#A2WyGtmzmTB zV^QG^#lZ5``hyVY&%SHxEzaQlUKWf-Lv-mu3FOc{oInSjy6jBlt8X;3COsCHhSwK_&Npluv#}7dDJbj$ccOV5xG!KGO-hUBS~K&WrjnqYjl4oJ6_>i*uOQ7t}KtdkfErG3LT$l!`mvAK&cJ? zA>&RP{&U6`+i)%4V8gY1lMU};J#9Ap4aPUxa4r9k4S$d2x7ctk-)+PH#qzy2T+1hH zxUScb4L_zz*)8t|Lk6_{Co(?PhF35?(S~2mc&QE7@=hDB|x@UIvjvf*PcWBZ5G?SBg6V{Q0E z#wXhF3m7l8;qw@G+VHCxUu?s@j5pZu6^u98@S7QLv*B^ZH`?$!7=Ory-^2J88~y;} z-8TH^jQ85`#~Dx9@TV9bvf*8f=S9-({{rJ0(14L_OjCL2DD@irTN8RHvm_)^9nvf*uvZ?WO`Fy3v$A7Z@MhX0cBgbm-t_>c|% zGvj&Dbo+n8_*ff$WVN!-L>qoO~Wyvc?K8E>=U?Tl}< z;de6rkPZI{<6CU_CdRvMco*ZnHoTYdgbg2Je8`69U#{$#XIt+c#rRkoK9TW>HvD47 zOKtd-j5}?3J>!dQ_&UZLZ1{tWH`(xQjJMhFKQO-0h7U0QkPS~VzQu;0c!jc0w+)}c zcvn#QE$wmWJr7FJZNUo(;G@TakJcb)uLVEFg7;Z)>vfO59-uv^$yRwTC*iawp$~mM zLpbeIm3J-pZu$q7;XFFd6TA&NIp%KpA2jtSROoLs%XN5+>?oLs!WOUAFEIJtKJW*NVN z;?(8)@0amQC{8Zk-zno26erj2Z;|l}C{8ZjACmED6em~iUn=8gQk-14zgosmr8v26 zf0>M*Kyh-}{t_8Kn&RZD{f3O^Q=DA1zevWjDPBNvA>;qN3h_dU4;}4%lW&9%GX5aN$<_MHWcui00uzFEtNA4|Cg>ZHpd5k2pK7>?q^mWkjdEqH@h%?K5ts!ch9l2M7 zI%f#UcN03}0tMmgJOeS}!zNVH(W ziGRCt|G2~uSyY@BD%jtjgG$tg#INo5LV`l*6-K;ypEDlobGGB`aP0dnNjp16KT-k` z-@S#RiHp%C+a3E_hqBflg&OSwPcdEwW4Ge~2@Rax?l|Dw=_VE;JX1N%dUS?X$AK&;oZ&dI`o(Qhykl0DDRBNz@baZZ%8N#|$o;Ou_{_-7aEtsISc}PS{WYOBI;Z=^_lr<V$BMu}Imv8hlp3+SxrL74Pf3Wx?v z74UB;V5(|G)EKff@txb~)&d-);u`pG)xVI@`mVyex=b?|Qr6n`y4`*WbFJ%&&W0j5 z#!mFHxy4e|7grlxkq+pC*TznfMPsLdOG>|iaf`N4r3Z3lwaqNckYzV4-&N;qA2ES! zL&lie1toyFPfV4$7p~hRiw}c#gvO!+k;@b*3U5Swe*DwDSznKxGuNvL(>V$Xf~4($vu03kV)-t)#QPYN*b6&L~-} zXl~-3FOx~L;zmYW&&F5^TTeq%QwM~-xjuizDif`@4zr5#ofBkzvSWu^b2W1)Bh%W< zy4$fAwxs?9GeI@f7d|Cs-W1K>LU=#jOLDe42D3D0X5sq(BCAt5TD!7HxH&coYGwlM z|DKJtBz6)@W|L$fRsczwKuKrLK)F)gQH9~JX0Md&n0`FfO1HEVCW>S9Yu=Zoi}VPR zQ{UL=%t{sbBF*h!iIHNn}lE%nWkoovqO%5@Vnoe`M%A`2e2B0UdT|7tLjyGq~xAp zx#yG|bZ{3AFymo6qlOu`r!=Ea!$YL!r!-&9a=w%t#(+cRrm|dlO3oNgZY0YcZ!ggE z=u~7W?3kyU@DXPGmHLP*9oOv+X8hjHSjCLrCU}{$JRX9Z5x`HhJvf0A~EZ8)ZpCDoP}pP20tq- zI`T2t=Ci`9cMwvQ_^j}{9h~7b&X5u(R(?%})LiXNG~qPPd4j3?J@heb-s_H9_CN3^>Op?T+VvOBRXm z+K&I+*6p}BA|@w2)0%V^l6HcT*6I&^YH9T%v(?irh+ah&R0}+1xIbcMG7a}0l%$Mi z)q;hcOoh~CW1Ta0(!~v_DjUlyNQ5TsjQ0#wQNbY0S(O=9VZ#(Qq_8eekxEpC98=@f zLvS>6i9rsTkn3iep*yBewHEtyTuJlF7JhxC=XA2&=<_6*KGG_gjS3(WY$lmkXg@g46c4wsUSlZ-nv1Ro*Zrx zGBd5S_+v|JELACx+s!-GmAAb?k=skm_G7bU zP2_gO4lF3nbPzfsOO1Pph5KlM9D5wf+APXqO8~3{;GHG1VzgN^3XP-|1*c*O0PfY` z;;P9T&s`uhLL2d(M@Q}pbIhz!qERnM6LjRxCBgWybOB}&?u2mri0pV54X6VrQZ3>r z>FBYvNFp0%^|a-o#zVGrOU!gL97AimA(2oLtwYxoe*a}mh?a>Z-h2RM{@Ep{5vrT7 zYH~MZ+G`HLDvw@)ecPu-WSgUZQGBS)K}o{tNotX#@TnT&ECy%ejGe2=IwLV#DxD*a zpk1J}Ev$~*Du}}9DfK1h>Ojup!W38$E95!I9iz~jrvFSr zC{0H-9E6I{mjjQ&aum_*0rESxvCjV{mA_x*j|5+n|Jqdk=OX`7mA|T|%722&A3jr! z!Gelr&0m+kaKo3h+G)?-Uo9=gBLeT?Wosi1!brVqFptTu*lV5WS5SIOU3!{HtVd>G z8BZ2KEagLnxx^vMV23Pg4Gm2glye(pL14Fd?)55&FTJxHJ>vPn!56 zz*AjZwr6{8AB~6@LgaS*b2@^(a2mdMCm#4PQdT-~spG5ITfiI#%hIlIXHnG5rYw=nh!oWB{pK21VM>?m?5`P7nId=+_XI7EHc6a0cRg zh}WL$nGIK@-q|s|1lgKR`Yi0D41NceO?)eY<`C7XXk)*%xmOgfdkxD3v;gJ`HV947 zsIas|92>%2v~r?RHBQ52ueE!l8_8YJyl;>$-4Xk*ZxEq98G5w->%`>M3;M>odtfrz zO08YAvkL1Uf!g$;Ha8RJYl<#a$mMb2`Y-KTqWMy3M|JkV2sxw_Zm+^!vu(i%zw4U|P()4YO)vjd-%#ixRn5w865Az|lMGzgIWVFVbBD}E zj&%m^Vk?h$6F!*EJI3Vw4m{Jt)BOcEf|R3>&UQg}H^#yLpC~hE#yCnEM)%)yzDgz` zd>~8oU#nUb>l?k$loLr0u2N_~pHjH~AIfIKN$@T^dB@9NBZUEB<~ual?0|uW)f0b7 zjrbSfzFuN;pIpMG7r6zO8JJt-c0|1Fzc9)E_wFvG*qLPF1CMiz>FJiCH8PARWh{u`4TzL^kf>^w?#~>`H zH|0Zbp@CE5Y!QY+3`S*0OU_+@@#$QxC;6&MaU}HPMlG7^0hpggB+FLPlNbqkkmA8_ z^uXP6fLnKw0kg^hZXI1MY7g7{7pTj<09=iP>^kVyUHj;tDm^o$9zv>y)UMJjtGW)U zZeUhQ-T2|vEmG<}hNLozR`)kN8eo?0BB)E5hispS55{Zy+8pn18w*FV-10PVlS0pu zxK4P#P2J{%C&jCeh3(bC%(BM(4?IJmb>F1Wrq9sbQg~LM(ame&KW%z0Xt?j znVO5C8JnY6YdJnQ@4>zv_6Wuo9^=|MvIo_XGF9Bv&{_0me^sa zZPvia?KQoH+p69~A-hr+YKgm%mDL(qPz7tWHt$UquKyV0K__)x>-PI8D2HiidJqGT zM4${?-lj4*hU60$N3K1pb?AuZajiomqsO%luOY1rw;WDl$ zl@lH^Xmil6Ij$DCN`>psL&bHk#=|i*Vn8B6%`tEq$x0(7)ePe72adz20jW(`@u-nMFL#oqR^5g2N4)h2_OHLsIx22XzKimbwIuhPYa zvmKAf>`)56lGBbw<63oB1M+1f;TqI5gF>#=P{6W0-1L$~hPlxfEl+-|qaG+rS(cA< z6EddLMW@HYQua$dpHzdsjz^W=KvQn&<=T_AVb5VwjHQ;$*m!Z|LU} zWTVPnGnBO&U2V*o+}5F!G}rcE2u-kR@*MF;x`H|EYx-Fn_QX^Z?~ebot!7`FV=pFx z{n8uZ3XY+c=6xc1Iu#Vg+D?t3?4An;^cHUG!SKF6s|VKr`!OQs$IfqQ-it1e@$|2_ zwI}Z|zDIWxu{Pchi_7JX#pu65P5Y4>hT(*^D-A)kf}`t43rrKNY)>p#pbF)|g$9a5 z&mpKHg^@75NVM8w*`3tt)F?`J?jf>N5xtPwkAywQgLWr&p&D1++?+_FI4dV=jeS_d zWmD7*-8a#F&lpzjUHP~YKbo}_aw+$z*3%c-VvB9+LP4;qoBoJQ&E+3K zw-zw1z)QD?s85y^w;=Fvpt;fgA9(mN5k)S;F`p#nRhrq32N&2fN%SzNwdq<<;uDp4 zw_FA~dI#o0vbQ~ixtd0cvLc!|aewc(D1*7mqRB?Kdk2OvwcnEUhAa{W897L+E6dmu zDqF7JR_xDr0!f>OHok6|hG@L%rTO7dW5F#j8#wk+Ie2>;sw6rvA0?!ba?rVZ7aWS_ z&_pNns^@0tQ5Q-zFikFE9lhHqrcIMrg0&a*?TwbFX>c|9sGQgAtCNpjvy|4L(N$dz z4NR6h1RHUIn5!<5<(|HF%wOQf3taJ_S~4B<-rRg}f7Tkh2cK~6Zt1&-3M6|cMH3v= zencigW5@xQZ~N#C{sD};vgXupFt*c!g!D|1+N88{P@~sSnL+2)7tT84iKUla@;$n! z{o*5M$B2CAj-fm%G=LpL7ds~*)Z_GZeL)v(xMv*iHurmEjpckI-HF@vN|;=At%S4o zRct`#QZL38HqBdVN6b1wMQO7hbBCNo`?UYSuvjUQpQZ-SA{dw^8JyBcv|c^c8b>>K zzbYMX$YMurIP5D)1@amIR%xgEwp`{1F;r|so15(g3(}N2Xq`QhxDb}v=~Q*(T&g_O zO1vY4L9~;i|4zPV{yX`)=sq1~qx0Q|RO$Jus6F2j*?)Ln7YX1v+Hi&IA5ktwm@GW) zqWmZw+lGx|G#lIKmVW9=WED7HCwZxPFX>999BUqHYbOS#DZEVzQw`KDfg!nZ(!uv| z;@ObW3k+V8?}pFfCI#*=(o67b?jLr`oBP^1!}K*OC44FM%%^YIF86}WSy%8bwK^ES z(<#T>T}CERt)ZUqMXMA+3mmQt&%!cmxB3Z~L$>e8b>VgJTe(1H7fc+}qn%diNO_K7 zdMw>8P?zX16$~wNr68(_N6q8{2p8Gp*Cbh;H$Y1*eYMZbMDM-Q3y~R8@Ga zV~Cv(Rg@kUTM0!J8B6YVy^f@J>8EMaS$1i18Oj0g$jQ_JTzJE^qlk()T zd!4P%;S&8%v5)B8o9|v*zNGxx^6Sc7&vmBWm%w`p=DQU1CIzneo$-TmpQiu*|ABhS zOTM{7wx@T{FhSsO*Ve|u?%GwZuqPA@N2eM#V46`}Tx?uqLq#AMH9Ub}tiHkU1@L1I zhS%r!M4)eizm87Ba_YF4&@>?+!ZKSCtPOisxcx>s=Wkpg^Ea;agd@ISz&O9SB%dUw zeDjgdKSJRPK%uYBpdZX|*WxEHP&A=IZ!p~GjuJAV6vZ=)36XqZ7)m>^;%Q#Rk?hnz zC76mM4JS2jnh|i(yMX(|y@=;#Hm>-A`Sr}UuiBiBY z=6L-0ugaD$WS(`RCeTQk*NH{JsM{~o&JM-`l%&cRiFl;lCS2jGZ-~mw(LT_xl#q-} zi~yHl$~;d%{_uz@Dx|M2s|nz@O7IgO$fSJX?D_K{8;FL3e$YBkgt3Jocdai_Z?%pR zUI+{M;57?ZH7*PKwfG!gy$?SpqC{#!_+6D+H%wRQiNYKjucRMWNy97MQD0!HtpE%T zQv#EdUIc~@&6XJ%@--YyQT+Ibr*2BQkdi5&Ld>r&S5fK;5upoH6|D1leQ04_v)RGM zW%y|sSZa>XU5g(~@!{82*kvl*5nrt_4?lpiT*&|0Xofpbhf0SaEM~6?`}}?c%EOJG zfX^KePR}y@d`qe@$Fo5OCMI;g^2*Dx8gCQG?qDLj}b=PcU4M zlBP1@nldh&?m&Iaf4#dgXfv!)URMVb1?lHyltpwKKq)z@->8lSYNIg}Di{z8V{oJh ztQv|&q|K=JsHm77Ll+CwuF^IY;%l11z}RC7s;2Y%mU~R-vRLCXH0oT8P;gd^7AqsD zc(lQT0ZTUM0u;m@r9lL}u-4=Ey91tJEMhG3)HVc20=cVBRP96Y!{|o~sCiK#bI`!} zS{tm#&k#}790B>uJ9S36Kh)q3#2P(eUu{HG1e=UGVK;ufr(RUxM|+kV4!;L}fxZQ+ zH+rIwG8Q(tL&zY#B)UJ&LDFSz$j`pyQe8i3JF8)xzA#)O;6^J)-ODLZ5ex+zgWe#w z9lWzKgt{;D`F+t<#wFC0q%bW{njCPLRUrZH7c$2xn501!BuyRXp|e6Ohg2oq;Z^91 z%6l+UTHy+reNBLRU!5_(&J(N;yF<`k9rmrj4=Wip7;eh(Q$HLE3F@%M*Mx#5h$}+#oAD)6U=t;I#&w2*`BvpWZEz5`o4f!Kb zWT{3^qx@+qlY?~t;k16hZ%S!-`b8>*ogHlqon4GcwQkCUIzd0r<%vX$h0iFBU6US>e~T z+;y&~rzvWtq@UD+QL&V&4aTsTQHwG9u`oj_6*HFvE)w}-=7f@Ii!UN_F33uebz;ti zEm-#9`BYK4Gnw3j@MJvI@j2*B(0f7WzK~3g!u97Tpp!w1u`5*x+6U?f{oU?las%jJ zK_3OZrW<-d7rvNG?gRY|=qNn0;rwr;2d#b?=|O)0>IWSH-2i$OE+ZcWo%;&XgKhxb z2RaseBBSuw$-UT=n+*B^_V_A6$6#aD4;lmA06Gab5*`Iz1-b`x8|XgJI@|*qb(j!e zf=&i4#;u}C&{j}C=wW}twHN4RpqoJ-2JHbo{LjhcAZRIQ5gwXygO-5)7PJzSUTXJ) z{^nie4|?=_$RG4l&~DJ{K;H*_5LDp0`%}zJXv=$G2?FHQgdI0o8Y+#JTGxrleLVC~$=n~NX1YHd}0gtiX3wkA<;e7^l1U4x5 zf;NI40NoEd4vRDo_GM>+ZUS8bdh#IBgWe5#FQ^|+>puhfF6drR19vM9fG!3dhnqm3 zfX)Ox4exs_0bLBb8uT{MdqH=DJ_C9qUQpQ!I{F_-59$XUhv#PZg4TiV16>V@ooRVw z-FSt_YAVV)an#{?omqz$5k3y*#vRGzSxQ9yA6{OF(^vC51<< z%xe)BpK#%s=bU~jDWLRoaF#$X%xWH#*Akr5&<}n=Xn8@=_p@go2^WwD@mJ&g9rza$ zqWL$Xz=#t6UYu`%|7%c_pZ%B=ApSEr{m&(nZ>RC!F!_6de+T_;vXx(-(tiN>IpEWE zRZ4%rOg|3w{yz9$rtv>F`7^H6i|nneOs{?)*bML*9& zzfAFSMri)M;2(G?nLOXBKP=VD^v{5Q490;Qto&@hng3q!>%e!V@vkxY2f+U+_@AWl zKQj5_FkXdm={z@$UvBbeg1-^`4Qc!znEWN+kHEF@y0rSOHS4z;{059m^cG&K{&P(G z+zbBg7_aCihsjsfQ}qY`LGUxxZ!h@ofKR_IYNpTmrK$e__%UoQS6lUGUuu?r9LC$9 zf$vD;mzn&T;J*a^rZoNoCVvU|)vqU$cct+=P5x@|UjTo)l^>F9<^SNn1%8c{pIt2l z3}DZIUx;zM)yj9a=47u(>F9$@05VRiO!jQE4D=&FcZ0tqjenKNH^6@t{A<(ri%otR z_=T85GPTiC@D1=MS<~04@{I-70{$fM7hCxVQ~$UZJP&v!Xs1;Bos~_giT@1vG2HLm zlh(ew&Gy|3{&>t&!``ah zV;}fuVZNh{3&VaCz|PLzF+&7WOBAO{hCzyX&vw&_+@G7FErEB`d}yK z`3h_L=2ZF(;J*+4rB;6ST(f+Sf?xO!&tG~x^`_GA0sjQ>e~^~{_ssnFfqygB7xbId zX4~XUHu~PN)?1}kzAB#VuYU09`Ek0ll(O#~rv44!KLY-_R(>$0 z|55O_gP&=<+ynk=;AfgI_klkEex~w`8j0Uh$NKOIYyR1fo8_Af{@viy^Tnz1-)-_M z!9NP?$}7|OmrFj46@KuY;CqN)QBZVSc16M1TXP%*M#qSXf{7b)odqS|Kg?NB+Io0Z zL0Pz=nhg;=wi6YvJ z`g<98c}gC*=>qJFg=gyT&7nt^)g+-^!rTjX1a;#Q%rX; zeS_(TO!qTAj2&bQ(=)Wg*y{a%8qEtCUz=2a_l1PW5%hOXsF|rx>F>7SB|dSAg>N8C zr{<4G_-veAmH3CF1m_H#`a3QcAY6u1^UD#w2B-dR%Xbl8ic|C52;YcPf2ZXpgx6U3 zYZ0bXm$wDs`*7;-vD}X^o%%a04ptg+xq({)d>F*r~YorW`t=2MSm{^p9vB{occQ{ zA%tJXslSiXgK!e3{w|8Z-fhGX*BK$H={r!_3gg?Tmzk4!(a1y8f z-U$tGSKz$*GE@r3LWGkz_4iHip-8b7r~a-<3&Kg9H?ur$M~k&M;~Z~6IEhn#zeHew zT#HkGx1=?s`MBo8IoIOU-zC8(!Ngje`gAQG{4?t>L)e8=e+OhK!bzO^`yXNg?1fW*_oD^j`*7;-ecX?55~u#o2YvJ) ziBo^yLwpDQ38((92X*+1aq91Rlp%Z{PW>H^`w>p!)ZgzAQ!!rP)ZgtWLHIVD`g2pi|AqF#slTsLjd1cS++Se6aS-*vslTUDg75;Ie_(z! z!a$;d*j=$tou_65eonq_!OGlqv|rx%w@ zIVUD#Q`bx_EiO5Cl0u~szTsaEH0tlOT1DwgXZpLQ>3EKaVvMpMIT(EOI?2kzSImT7 zAEo1m3B8U=#}5~JeUXmm3B4{!$43gi{z=F4&2>^be&}}wiJybjWHqk`(&dj7CBw|e zIT&ozOB7aV(o=ZI_Xm~!B2jcnx~#&FGQR_B<>uf^hHY0`5f#o6$C%%LvvPB=3$NF) z>A2;)hZaVT7=!QL8WuzabFj+P>rpE+hgOlH;p9`_i%ge4!5Woe^4$^QQ+Raxfgv5< zRv)9b`v3KF3M&?Tp_bQH)uPA7I?YcpeN+ z^54ZAL&qnIh+`SQ1ak=;)E;!4!gwd+8x$-i10T-LrNF6u?`M1JVd&#rg?BNoFL1UV z#`qdTYSwq5=14tbg~9&38R_Y`gv)dPnJO-y0YT7}p*%m!(fXp2yaB`(;goZV~E zaWCs%#SJT;0YS=82K_66lb#;dqwCwsdVYYpl#W(K1bvT~?2n(;lgARqA7cCn%$;;F zThPzrlKxj&Kcez@h2>|m{3OQTVcf&`V#fC|{w(A2*$D8yWc*`!^Az_25vOAWDnjKP z!;UsX!D1xv;o7|jIF<7cwsM}tdTuXO3jWA?CNX|JE0)h5fOkIQf4D#i*0X#W;}=}0 z0Qsx|f|oO1KU3keS^m3>PrgV2mogq=d?n*&G5!PK!`16HmcOAw3C3Ce9>#r~VL9WQ zGU(Crr*p?*bBM=S&l1Le#CoV1hAYo+A)keL=gy@pXd%PNUjYB7b{w3q1dCXHH}v2t z?2!4WSL#1>-g$@R^*#-&5%i2IeYsbEFJ0q@Bj2oH=Bbl_(|3XUxI8jB5=~+JtFx5` zd9NP9Qo^$jE&s(*Ugwsgm$RN9&QqASGyRtTQJ5clVuoa}n~N10^xTpGzfbDHe9C^_ zw4g|;`Z;i=S>>z$Kd0q6Up@Z3gz?N6X1;z?=35}9@widLir!=VBdnL{s8B?FDshfj z$cPbC-{Xb;9-7YgIN(QHI~e@;_$$aULMAWt@J+m;CDC*$QZ4{5#SF{qZj1SQ+1hSz zu>J=2&$~EHUj{w&jM#8?UJh@ea!x5$3aVL85&VJhQ|Bl^m;WS*vyg=pXTZx@&yQIT z?VZpOljXs>;ye}CcB^Lj?{I(7{mz|1e@h1ZhZ*qyV*S^Yt9&06z8g6Ck9{5c zdItG^)_(%e*Ls~ZltG>zBqlrmll?HriPoBA&>7$d7U~%S7gB3q@LqNgRMLpGsr(CUgYhZ7( zPd^(>uMh8%@|@U09s*AD-D_N5?Qg$k{mon+uD1AH20c9)@OLxd`+-xxu-iEs1F2yt zpAL=#KAitdmUw~K#Qv~=6U@jUU&ZqOVEHnZU&8p3c?w{*2r~XY`_J1f-vs8x8GGqFC)6#u_nBL)QH#T@Xk{4`RC0j z9bU-g#XG7lcR1``<-+?^;Z?#Lb~k!lb+N|ARY+oqxgZ%$6UB??kys-Ng+MqIMIaWc z!z*1V7G5M)FGu-(0gnpd1@IULgCXWdSK>`6dWVYMP7^EfmaNC68m za4_JN^>l}ktBgRAlt5|l@}w#XOcchOryjheg_kVxrt6K6)`B8}ck3a2L(EsZoWdd& z_4xrrJ&hrH_08q0LkrNGaCp^AK>^qZO+nT*v53Iynsp>j@1%uep{N4z+GPqVo_5}O z=ZYxaP)uPF5orjnbj9d>IT68YaZwk)6BlxaBOaK%K0>97qC`zpQW)10j=CbK2HUX# zHuQwqlz6#Pz8C6?xR&8{)aBAZc&9Pqil}$DgpV^H77rB=?j!v0N^psQNhO5;%)O$#%HeWcHpk^cEX{-Jglo>#mz7t| zpPkN$MBOs!yvu4_4kv4H&bdOk=2b4JD6e!am^*i&W07l7c}1mzwEBI^Q2Izr`m&I3 z&mxFd7F{mripnyUmCdW1UoqP?t$14TbZZ%1Zh9|YNaMR!Hh2QAKu}pc7J%z=553hF%*Zf6Qu2lbAR5hD==OVHe{R%=b7EMnleaq7Xdqo6ZSjH=Yu3*?z z=krjB-F4~0P~Z!sL$Rp0l-{{?)qA2Y1n?$uR5b(JA65uQ$eme^Gm(7_X+l0*9xjip zhi+fpRfJdCG(B^8crYxM;az^JhU^bIoosqo3UGQFW#ljnL4yLlMrvuN+U0PyI%GA(U((_!6 zZo8qqFqTcDnJY#ty#MVHwWu)l^k`7M4@&QW25ZxcfngecLQV+>hx{R)WR6KLX;5zn zDx=&69t4PqL1|;Fe&n^UbjCIXl0CD&d#v8n( zi!QP%vI0X%J>I-e&&aGIy`@`EF9z58J$NBK;M_E8V zYqL)3U0MtjkznnN8LmjA)*Z0a|9p&q;YMG846f{e@fbeuu0!qQoTH`%H6NjJ%V^M( z=H_9^^l36RHI`~No?7Iaywv^}4`lP`k%ApZ7Lk>E$$!(Ujy^&5h(&~cr5eNeFinU) zZ!?-#mOEXW%Y_j{51ll$=!rjVU@=<>nKwr2h3uEQR;UEtvrbpL(jBH>rO+al;o7y# zgI`1NC~MMO>T=aKxoK3DGiJIPnNC?>2DvxFwNQd|QBfRO)rjT-jfNF%&>=ZjhD0%5 zllBysSInOhb=PyOJ`gKj7W4V*reOXprdWeJ(jbcKRt2C>(P&t4R;XX5NsG9^3w!)- zO2DCzKPrl)3l<|-TpvVKws|ovgCG=?17NYIffqgvbx@*XDk-~yOrgV+it>U2SuXrs zMlAv~DHOv6MKPvR`mF}Np8LO~bbEm=y>NLY4}D)y({YSzLbBH5G(@mOH`D9;hngBz zVTAQ{gNA8&YE7^2GirLe7G!EqKL_}JJd>o$zmWr)R&qLACdxyvk2y-WgXxh3v;6wL zrKY8vA(rdXPC7kbZVFnTV@t?PukUYas-Fvkh&=4}yACKlbEV6#?}KW34X4xP*K(RJ z2cMqd(s6yiRMRp}Z!bT?@>v_IqE4^xt7@vBE7SSg>%R_ZDSZVi)X%ADx`h=}`r~lg z)87h=Y(nLuLqFH2DW$O<+#DtpN64C9-_O;w&t`DFU#;n#w)FbGuco@ZIzt`T^e49T zdLKg5E`4Fm3Ut0&-%oAng}z_Rbfgw!YOg=-3s4nx`|115ntpCmpv$e*{>qkK@4IWd zPAldF_WZZo((C)znil2joD{X^|03W5oYaTt(Dws2lVjts%j>wNuR{jAv1WRGKYYu- znf@K5CHv|8yOtY>VLyx0OQuBBKf@BH;NNhP@0#iP{(6_6xH#B;+P`(VwI6JRNLu~@ zgHqvsyd+{GF)fEqujv=yS<{>QoFSE{S9e_2VKCL?D+EZFE`J~I@ASr$MCao1X(cce zZ2r#%mjp^oL$1y){Lg4pf$cg0 Y<2qk0dlUl0rT List: _get_avg(statistics[dataset]["Accuracy"])] -if __name__ == "__main__": - base_zip = "data_tesseract_benchmarks" - output_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "benchmarks")) - cache_dir = os.path.join(get_config()["intermediate_data_path"], "tesseract_data") - os.makedirs(cache_dir, exist_ok=True) - benchmark_data_path = os.path.join(cache_dir, f"{base_zip}.zip") +def __parse_symbol_info(lines: List[str]) -> Tuple[List, int]: + symbols_info = [] + matched_symbols = [(line_num, line) for line_num, line in enumerate(lines) if "Count Missed %Right" in line][-1] + start_block_line = matched_symbols[0] - if not os.path.isfile(benchmark_data_path): - wget.download("https://at.ispras.ru/owncloud/index.php/s/HqKt53BWmR8nCVG/download", benchmark_data_path) - print(f"Benchmark data downloaded to {benchmark_data_path}") - else: - print(f"Use cached benchmark data from {benchmark_data_path}") - assert os.path.isfile(benchmark_data_path) + for line in lines[start_block_line + 1:]: + # example line: "1187 11 99.07 {<\n>}" + row_values = [value.strip() for value in re.findall(r"\d+.\d*|{\S+|\W+}", line)] + row_values[-1] = row_values[-1][1:-1] # get symbol value + symbols_info.append(row_values) + # Sort errors + symbols_info = sorted(symbols_info, key=lambda row: int(row[1]), reverse=True) # by missed + + return symbols_info, start_block_line + + +def __parse_ocr_errors(lines: List[str]) -> List: + ocr_errors = [] + matched_errors = [(line_num, line) for line_num, line in enumerate(lines) if "Errors Marked Correct-Generated" in line][0] + for num, line in enumerate(lines[matched_errors[0] + 1:]): + # example line: " 2 0 { 6}-{б}" + errors = re.findall(r"(\d+)", line)[0] + chars = re.findall(r"{(.*)}-{(.*)}", line)[0] + ocr_errors.append([errors, chars[0], chars[1]]) + + return ocr_errors + +def __get_summary_symbol_error(path_reports: str) -> Texttable: + # 1 - call accsum for get summary of all reports + accuracy_script_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "accsum")) + + if os.path.exists(f"{path_reports}/../accsum_report.txt"): + os.remove(f"{path_reports}/../accsum_report.txt") + + file_reports = " ".join([os.path.join(path_reports, f) for f in os.listdir(path_reports) if os.path.isfile(os.path.join(path_reports, f))]) + + command = f"{accuracy_script_path} {file_reports} >> {path_reports}/../accsum_report.txt" + os.system(command) + accsum_report_path = os.path.join(path_reports, "../accsum_report.txt") + + # 2 - parse report info + with open(accsum_report_path, "r") as f: + lines = f.readlines() + + symbols_info, start_symbol_block_line = __parse_symbol_info(lines) + ocr_errors = __parse_ocr_errors(lines[:start_symbol_block_line - 1]) + + # 3 - calculate ocr errors according to a symbol + ocr_errors_by_symbol = {} + for symbol_info in symbols_info: + ocr_errors_by_symbol[symbol_info[-1]] = [] + for ocr_err in ocr_errors: + if ocr_err[-1] == "" or len(ocr_err[-2]) > 3 or len(ocr_err[-1]) > 3: # to ignore errors with long text (len > 3) or without text + continue + if symbol_info[-1] in ocr_err[-2]: + ocr_errors_by_symbol[symbol_info[-1]].append(f"{ocr_err[0]} & <{ocr_err[1]}> -> <{ocr_err[2]}>") + + # 4 - create table with OCR errors + ocr_err_by_symbol_table = Texttable() + title = [["Symbol", "Cnt Errors & Correct-Generated"]] + ocr_err_by_symbol_table.add_rows(title) + for symbol, value in ocr_errors_by_symbol.items(): + if len(value) != 0: + ocr_err_by_symbol_table.add_row([symbol, value]) + + return ocr_err_by_symbol_table + + +def __create_statistic_tables(statistics: dict, accuracy_values: List) -> Tuple[Texttable, Texttable]: accs = [["Dataset", "Image name", "--psm", "Amount of words", "Accuracy OCR"]] accs_common = [["Dataset", "ASCII_Spacing_Chars", "ASCII_Special_Symbols", "ASCII_Digits", "ASCII_Uppercase_Chars", "Latin1_Special_Symbols", "Cyrillic", "Amount of words", "AVG Accuracy"]] + + table_accuracy_per_image = Texttable() + accs.extend(accuracy_values) + table_accuracy_per_image.add_rows(accs) + + # calculating average accuracy for each data set + table_common = Texttable() + + for dataset_name in sorted(statistics.keys()): + row = [dataset_name] + row.extend(_get_avg_by_dataset(statistics, dataset_name)) + accs_common.append(row) + table_common.add_rows(accs_common) + + return table_common, table_accuracy_per_image + + +def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str) -> Tuple[Texttable, Texttable]: statistics = {} + accuracy_values = [] with zipfile.ZipFile(benchmark_data_path, "r") as arch_file: names_dirs = [member.filename for member in arch_file.infolist() if member.file_size > 0] @@ -115,7 +190,7 @@ def _get_avg_by_dataset(statistics: Dict, dataset: str) -> List: gt_path = os.path.join(base_zip, dataset_name, "gts", f"{base_name}.txt") imgs_path = os.path.join(base_zip, dataset_name, "imgs", img_name) - accuracy_path = os.path.join(cache_dir, f"{dataset_name}_{base_name}_accuracy.txt") + accuracy_path = os.path.join(cache_dir_accuracy, f"{dataset_name}_{base_name}_accuracy.txt") with TemporaryDirectory() as tmpdir: tmp_gt_path = os.path.join(tmpdir, "tmp_gt.txt") @@ -145,30 +220,45 @@ def _get_avg_by_dataset(statistics: Dict, dataset: str) -> List: os.system(command) statistics = _update_statistics_by_dataset(statistics, dataset_name, accuracy_path, word_cnt) - accs.append([dataset_name, base_name, psm, word_cnt, statistics[dataset_name]["Accuracy"][-1]]) + accuracy_values.append([dataset_name, base_name, psm, word_cnt, statistics[dataset_name]["Accuracy"][-1]]) except Exception as ex: print(ex) print("If you have problems with libutf8proc.so.2, try the command: `apt install -y libutf8proc-dev`") - table_aacuracy_per_image = Texttable() - table_aacuracy_per_image.add_rows(accs) + table_common, table_accuracy_per_image = __create_statistic_tables(statistics, accuracy_values) + return table_common, table_accuracy_per_image - # calculating average accuracy for each data set - table_common = Texttable() - for dataset_name in sorted(statistics.keys()): - row = [dataset_name] - row.extend(_get_avg_by_dataset(statistics, dataset_name)) - accs_common.append(row) - table_common.add_rows(accs_common) +if __name__ == "__main__": + base_zip = "data_tesseract_benchmarks" + output_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "benchmarks")) + cache_dir = os.path.join(get_config()["intermediate_data_path"], "tesseract_data") + os.makedirs(cache_dir, exist_ok=True) + cache_dir_accuracy = os.path.join(cache_dir, "accuracy") + os.makedirs(cache_dir_accuracy, exist_ok=True) + + benchmark_data_path = os.path.join(cache_dir, f"{base_zip}.zip") + if not os.path.isfile(benchmark_data_path): + wget.download("https://at.ispras.ru/owncloud/index.php/s/HqKt53BWmR8nCVG/download", benchmark_data_path) + print(f"Benchmark data downloaded to {benchmark_data_path}") + else: + print(f"Use cached benchmark data from {benchmark_data_path}") + assert os.path.isfile(benchmark_data_path) + + table_common, table_accuracy_per_image = __calculate_ocr_reports(cache_dir_accuracy, benchmark_data_path) + + table_errors = __get_summary_symbol_error(path_reports=cache_dir_accuracy) with open(os.path.join(output_dir, "tesseract_benchmark.txt"), "w") as res_file: res_file.write(f"Tesseract version is {pytesseract.get_tesseract_version()}\nTable 1 - Accuracy for each file\n") - res_file.write(table_aacuracy_per_image.draw()) + res_file.write(table_accuracy_per_image.draw()) res_file.write(f"\n\nTable 2 - AVG by each type of symbols:\n") res_file.write(table_common.draw()) + res_file.write(f"\n\nTable 3 -OCR error by symbol:\n") + res_file.write(table_errors.draw()) print(f"Tesseract version is {pytesseract.get_tesseract_version()}") - print(table_aacuracy_per_image.draw()) + print(table_accuracy_per_image.draw()) print(table_common.draw()) + print(table_errors.draw()) diff --git a/dedoc/train_dataset/trainer/errors_saver.py b/dedoc/train_dataset/trainer/errors_saver.py index ae7fd26e..1d591a96 100644 --- a/dedoc/train_dataset/trainer/errors_saver.py +++ b/dedoc/train_dataset/trainer/errors_saver.py @@ -46,7 +46,7 @@ def save_errors(self, error_cnt: Counter, errors_uids: List[str], csv_path: str, with open(path_file) as file: lines = file.readlines() lines_cnt = Counter(lines) - lines.sort(key=lambda l: (-lines_cnt[l], l)) + lines.sort(key=lambda value: (-lines_cnt[value], value)) path_out = os.path.join(self.errors_path, f"{int(1000 * len(lines) / errors_total_num):04d}_{file_name}") with open(path_out, "w") as file_out: diff --git a/resources/benchmarks/tesseract_benchmark.txt b/resources/benchmarks/tesseract_benchmark.txt index 6a59d51a..fd980a45 100644 --- a/resources/benchmarks/tesseract_benchmark.txt +++ b/resources/benchmarks/tesseract_benchmark.txt @@ -1,4 +1,5 @@ Tesseract version is 5.0.0 +Table 1 - Accuracy for each file +---------------+---------------------+-------+-----------------+--------------+ | Dataset | Image name | --psm | Amount of words | Accuracy OCR | +===============+=====================+=======+=================+==============+ @@ -18,7 +19,7 @@ Tesseract version is 5.0.0 | others | Zaklyuchenie_nevrol | 4 | 241 | 88.800 | | | oga_01 | | | | +---------------+---------------------+-------+-----------------+--------------+ -| others | napalm_doc_2_2_6 | 4 | 124 | 85.500 | +| others | napalm_doc_2_2_6 | 4 | 124 | 86.100 | +---------------+---------------------+-------+-----------------+--------------+ | tz-npa | 1.620e+14 | 4 | 695 | 99.800 | +---------------+---------------------+-------+-----------------+--------------+ @@ -74,6 +75,8 @@ Tesseract version is 5.0.0 +---------------+---------------------+-------+-----------------+--------------+ | tz-npa | ТЗ_09 | 4 | 154 | 97.500 | +---------------+---------------------+-------+-----------------+--------------+ + +Table 2 - AVG by each type of symbols: +--------+--------+--------+--------+--------+--------+--------+-------+-------+ | Datase | ASCII_ | ASCII_ | ASCII_ | ASCII_ | Latin1 | Cyrill | Amoun | AVG A | | t | Spacin | Specia | Digits | Upperc | _Speci | ic | t of | ccura | @@ -84,9 +87,170 @@ Tesseract version is 5.0.0 | h- | | | | | | | | 0 | | words | | | | | | | | | +--------+--------+--------+--------+--------+--------+--------+-------+-------+ -| others | 90.967 | 79.867 | 89.533 | 0 | 0 | 86.133 | 890 | 86.03 | +| others | 90.967 | 77.400 | 89.533 | 0 | 0 | 86.433 | 890 | 86.23 | | | | | | | | | | 3 | +--------+--------+--------+--------+--------+--------+--------+-------+-------+ | tz-npa | 99.268 | 91.064 | 92.076 | 0 | 0 | 99.480 | 7483 | 98.39 | | | | | | | | | | 6 | -+--------+--------+--------+--------+--------+--------+--------+-------+-------+ \ No newline at end of file ++--------+--------+--------+--------+--------+--------+--------+-------+-------+ + +Table 3 -OCR error by symbol: ++--------+---------------------------------------------------------------------+ +| Symbol | Cnt Errors & Correct-Generated | ++========+=====================================================================+ +| | ['3 & -> ', '2 & < 6> -> <б>', '2 & < > -> <__>', "2 & | +| | <1 > -> <'>", '2 & <и > -> <н>'] | ++--------+---------------------------------------------------------------------+ +| . | ['5 & <.> -> <,>', '3 & <3.> -> < De>', '3 & -> ', '2 & | +| | <6.> -> ', '2 & <г.> -> <Г>'] | ++--------+---------------------------------------------------------------------+ +| , | ['66 & <,> -> <.>', '3 & <ва,> -> <нь>'] | ++--------+---------------------------------------------------------------------+ +| 1 | ['6 & <1> -> <|>', '4 & <1С> -> ', "3 & <1> -> <'>", '3 & <№1> | +| | -> ', '3 & <№1»> -> ', "2 & <1 > -> <'>", '2 & <1C> -> | +| | ', '2 & <1C> -> <С>', '2 & <1> -> ', '1 & <1> -> <Г>', '1 & | +| | <1> -> <Т>'] | ++--------+---------------------------------------------------------------------+ +| е | ['6 & <е> -> <с>', '2 & <не> -> ', '2 & <ре> -> <с>', '1 & <е> | +| | -> <а>'] | ++--------+---------------------------------------------------------------------+ +| н | ['2 & <н> -> <и>', '2 & <не> -> ', '1 & <н> -> <й>', '1 & <н> | +| | -> <п>'] | ++--------+---------------------------------------------------------------------+ +| и | ['3 & <ти> -> < TH>', '3 & <тип> -> ', '2 & <и > -> <н>', '2 & | +| | <ис> -> <не>'] | ++--------+---------------------------------------------------------------------+ +| а | ['3 & <ва,> -> <нь>'] | ++--------+---------------------------------------------------------------------+ +| о | ['2 & <то> -> ', '1 & <о> -> <0>'] | ++--------+---------------------------------------------------------------------+ +| т | ['7 & <т> -> <г>', '4 & <т> -> < г>', '3 & <ти> -> < TH>', '3 & | +| | <тип> -> ', '2 & <то> -> '] | ++--------+---------------------------------------------------------------------+ +| 2 | ['2 & <28> -> ', '2 & <28> -> <ИР>', '2 & <28> -> <Я >'] | ++--------+---------------------------------------------------------------------+ +| л | ['2 & <л> -> <п>'] | ++--------+---------------------------------------------------------------------+ +| С | ['6 & <СЗВ> -> ', '4 & <1С> -> ', '4 & <ОС> -> ', '3 & | +| | <С> -> ', '2 & <СА> -> ', '1 & <С> -> <—>'] | ++--------+---------------------------------------------------------------------+ +| 3 | ['3 & <3.> -> < De>', '1 & <3> -> '] | ++--------+---------------------------------------------------------------------+ +| г | ['2 & <г.> -> <Г>', '2 & <г> -> <т >', '2 & <г> -> <т>', '2 & <гр> | +| | -> ', '2 & <гр> -> <тв>'] | ++--------+---------------------------------------------------------------------+ +| N | ['22 & -> <М>'] | ++--------+---------------------------------------------------------------------+ +| в | ['3 & <ва,> -> <нь>', '1 & <в> -> <В>', '1 & <в> -> <п>'] | ++--------+---------------------------------------------------------------------+ +| р | ['2 & <гр> -> ', '2 & <гр> -> <тв>', '2 & <ре> -> <с>'] | ++--------+---------------------------------------------------------------------+ +| Н | ['6 & <Н> -> <* П>', '6 & <Н> -> <° >', '3 & <Н> -> <¢ П>', '2 & | +| | <ЕН> -> <ек>', '2 & <Н> -> <. >', '2 & <Н> -> <И>'] | ++--------+---------------------------------------------------------------------+ +| с | ['2 & <ис> -> <не>', '1 & <с> -> ', '1 & <с> -> <©>', '1 & <с> | +| | -> <е>'] | ++--------+---------------------------------------------------------------------+ +| А | ['2 & <СА> -> '] | ++--------+---------------------------------------------------------------------+ +| И | ['3 & <И> -> ', '1 & <И> -> <Й>', '1 & <И> -> <Н>', '1 & <И> | +| | -> <П>'] | ++--------+---------------------------------------------------------------------+ +| д | ['3 & <д> -> <л>'] | ++--------+---------------------------------------------------------------------+ +| Е | ['2 & <ЕН> -> <ек>'] | ++--------+---------------------------------------------------------------------+ +| О | ['4 & <ОС> -> ', '2 & <ВО> -> <Ю>', '2 & <Об> -> <06>', '1 & | +| | <О> -> <о>'] | ++--------+---------------------------------------------------------------------+ +| П | ['1 & <П> -> <И>'] | ++--------+---------------------------------------------------------------------+ +| Т | ['4 & <Т> -> <Г>', '3 & <МРТ> -> ', '3 & <ТЗР> -> '] | ++--------+---------------------------------------------------------------------+ +| п | ['3 & <тип> -> ', '2 & <п> -> <и>', '2 & <п> -> <н>'] | ++--------+---------------------------------------------------------------------+ +| В | ['6 & <СЗВ> -> ', '2 & <ВЗ> -> <Ръ>', '2 & <ВО> -> <Ю>'] | ++--------+---------------------------------------------------------------------+ +| 0 | ['3 & <608> -> '] | ++--------+---------------------------------------------------------------------+ +| - | ['3 & <-> -> <=>', '1 & <-> -> <|>'] | ++--------+---------------------------------------------------------------------+ +| 6 | ['3 & <608> -> ', '2 & < 6> -> <б>', '2 & <6.> -> '] | ++--------+---------------------------------------------------------------------+ +| I | ['3 & -> ', '3 & -> <Ш>', '3 & -> <УП>', '1 | +| | & -> <|>'] | ++--------+---------------------------------------------------------------------+ +| М | ['3 & <МРТ> -> '] | ++--------+---------------------------------------------------------------------+ +| Р | ['3 & <МРТ> -> ', '3 & <ТЗР> -> '] | ++--------+---------------------------------------------------------------------+ +| б | ['2 & <Об> -> <06>'] | ++--------+---------------------------------------------------------------------+ +| 5 | ['2 & <75> -> <#2>'] | ++--------+---------------------------------------------------------------------+ +| ; | ['8 & <;> -> <:>'] | ++--------+---------------------------------------------------------------------+ +| ь | ['2 & <ь> -> < Ь>'] | ++--------+---------------------------------------------------------------------+ +| 8 | ['3 & <608> -> ', '2 & <28> -> ', '2 & <28> -> <ИР>', '2 & | +| | <28> -> <Я >'] | ++--------+---------------------------------------------------------------------+ +| E | ['6 & -> <ЕВР>'] | ++--------+---------------------------------------------------------------------+ +| З | ['6 & <СЗВ> -> ', '3 & <БЗ> -> <653>', '3 & <ТЗР> -> ', | +| | '2 & <ВЗ> -> <Ръ>'] | ++--------+---------------------------------------------------------------------+ +| 7 | ['2 & <75> -> <#2>'] | ++--------+---------------------------------------------------------------------+ +| ц | ['1 & <ц> -> <щ>'] | ++--------+---------------------------------------------------------------------+ +| ч | ['1 & <ч> -> <з>'] | ++--------+---------------------------------------------------------------------+ +| C | ['2 & <1C> -> ', '2 & <1C> -> <С>', '2 & -> <С>'] | ++--------+---------------------------------------------------------------------+ +| Б | ['3 & <БЗ> -> <653>'] | ++--------+---------------------------------------------------------------------+ +| Д | ['1 & <Д> -> <З>'] | ++--------+---------------------------------------------------------------------+ +| й | ['1 & <й> -> <:>'] | ++--------+---------------------------------------------------------------------+ +| Ц | ['1 & <Ц> -> <Т>'] | ++--------+---------------------------------------------------------------------+ +| P | ['6 & -> <ЕВР>'] | ++--------+---------------------------------------------------------------------+ +| R | ['6 & -> <ЕВР>'] | ++--------+---------------------------------------------------------------------+ +| a | ['4 & -> <на>', '1 & -> <а>'] | ++--------+---------------------------------------------------------------------+ +| G | ['2 & -> <С>'] | ++--------+---------------------------------------------------------------------+ +| H | ['4 & -> <на>'] | ++--------+---------------------------------------------------------------------+ +| V | ['3 & -> <УП>'] | ++--------+---------------------------------------------------------------------+ +| m | ['2 & -> '] | ++--------+---------------------------------------------------------------------+ +| | | ['1 & <|> -> <1>'] | ++--------+---------------------------------------------------------------------+ +| № | ['3 & <№1> -> ', '3 & <№1»> -> '] | ++--------+---------------------------------------------------------------------+ +| Ю | ['2 & <Ю> -> <1О>'] | ++--------+---------------------------------------------------------------------+ +| Y | ['1 & -> <У>'] | ++--------+---------------------------------------------------------------------+ +| _ | ['1 & <_> -> < >'] | ++--------+---------------------------------------------------------------------+ +| c | ['1 & -> <с>'] | ++--------+---------------------------------------------------------------------+ +| d | ['1 & -> <4>'] | ++--------+---------------------------------------------------------------------+ +| o | ['2 & -> '] | ++--------+---------------------------------------------------------------------+ +| y | ['1 & -> <у>'] | ++--------+---------------------------------------------------------------------+ +| » | ['3 & <№1»> -> '] | ++--------+---------------------------------------------------------------------+ +| щ | ['1 & <щ> -> <ш>'] | ++--------+---------------------------------------------------------------------+ +| ‚ | ['2 & <‚> -> <_,>'] | ++--------+---------------------------------------------------------------------+ \ No newline at end of file diff --git a/tests/api_tests/test_api_format_json.py b/tests/api_tests/test_api_format_json.py index 72128afc..8b81cf93 100644 --- a/tests/api_tests/test_api_format_json.py +++ b/tests/api_tests/test_api_format_json.py @@ -41,7 +41,7 @@ def test_dict_with_list(self) -> None: result = self._send_request(file_name)["content"]["structure"] first_list_items = result["subparagraphs"][0]["subparagraphs"][0]["subparagraphs"] second_list_items = result["subparagraphs"][1]["subparagraphs"][0]["subparagraphs"] - first_list_items, second_list_items = sorted([first_list_items, second_list_items], key=lambda l: -len(l)) + first_list_items, second_list_items = sorted([first_list_items, second_list_items], key=lambda value: -len(value)) nodes = result["subparagraphs"][1]["subparagraphs"] self.assertEqual("list", nodes[0]["metadata"]["paragraph_type"])