From 262c2558c596dbadd278d5b26b08492c5b4825e8 Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Wed, 20 Dec 2023 15:21:07 +0100 Subject: [PATCH 1/7] v2.0.1 --- CMakeLists.txt | 2 +- setup.py | 2 +- src/ganon/config.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9d2b3510..28c6f447 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,7 @@ # ============================================================================= cmake_minimum_required( VERSION 3.4 FATAL_ERROR ) -project( ganon VERSION 2.0.0 LANGUAGES CXX ) +project( ganon VERSION 2.0.1 LANGUAGES CXX ) # ----------------------------------------------------------------------------- # build setup diff --git a/setup.py b/setup.py index 5b7e4a63..e91c05a4 100755 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ def read(filename): setup( name="ganon", - version="2.0.0", + version="2.0.1", url="https://www.github.com/pirovc/ganon", license='MIT', author="Vitor C. Piro", diff --git a/src/ganon/config.py b/src/ganon/config.py index 59548027..366d79f6 100644 --- a/src/ganon/config.py +++ b/src/ganon/config.py @@ -8,7 +8,7 @@ class Config: - version = "2.0.0" + version = "2.0.1" path_exec = {"build": "", "classify": "", "get_seq_info": "", "genome_updater": ""} empty = False From 27843aff77d399e697dbd91bd85b3f54f3954253 Mon Sep 17 00:00:00 2001 From: pirovc <4673375+pirovc@users.noreply.github.com> Date: Wed, 3 Jan 2024 16:17:17 +0100 Subject: [PATCH 2/7] Bugfix/gtdb metadata url (#276) * fix parse tsv instead of tar * fix tests * fix tests * fix sh --- src/ganon/build_update.py | 8 ++++- src/ganon/tax_util.py | 30 ++++++++---------- .../data/build-custom/ar53_metadata.tar.gz | Bin 2364 -> 0 bytes .../data/build-custom/ar53_metadata.tsv.gz | Bin 0 -> 2274 bytes .../data/build-custom/bac120_metadata.tar.gz | Bin 2221 -> 0 bytes .../data/build-custom/bac120_metadata.tsv.gz | Bin 0 -> 2140 bytes tests/ganon/data/build-custom/filter_files.sh | 8 ++--- 
tests/ganon/integration/test_build_custom.py | 24 +++++++------- .../integration_online/test_build_custom.py | 11 ++++++- tests/ganon/integration_online/test_report.py | 4 +-- tests/ganon/utils.py | 8 +++-- 11 files changed, 54 insertions(+), 39 deletions(-) delete mode 100644 tests/ganon/data/build-custom/ar53_metadata.tar.gz create mode 100644 tests/ganon/data/build-custom/ar53_metadata.tsv.gz delete mode 100644 tests/ganon/data/build-custom/bac120_metadata.tar.gz create mode 100644 tests/ganon/data/build-custom/bac120_metadata.tsv.gz diff --git a/src/ganon/build_update.py b/src/ganon/build_update.py index 44a83ee3..e41015da 100644 --- a/src/ganon/build_update.py +++ b/src/ganon/build_update.py @@ -556,8 +556,14 @@ def write_tax(tax_file, info, tax, genome_sizes, user_bins_col, level, input_tar for target, row in info.iterrows(): tax_node = row["specialization"] if user_bins_col == "specialization" else target tax_name = row["specialization_name"] if user_bins_col == "specialization" else target - tax.add(tax_node, row["node"], name=tax_name, rank=tax_rank) + # Check if node is already present with correct parent + # in case of input-target sequence, info has repeated pairs of node/parent + if tax.latest(tax_node) is tax.undefined_node: + tax.add(tax_node, row["node"], name=tax_name, rank=tax_rank) + else: + assert tax.parent(tax_node)==row["node"] + # Write filtered taxonomy with added nodes rm_files(tax_file) tax.write(tax_file) diff --git a/src/ganon/tax_util.py b/src/ganon/tax_util.py index 81707e2b..23ff406d 100644 --- a/src/ganon/tax_util.py +++ b/src/ganon/tax_util.py @@ -54,7 +54,7 @@ def parse_genome_size_files(cfg, build_output_folder): if cfg.taxonomy == "ncbi": files = download([cfg.ncbi_url + "/genomes/ASSEMBLY_REPORTS/species_genome_size.txt.gz"], build_output_folder) elif cfg.taxonomy == "gtdb": - files = download([cfg.gtdb_url + "/ar53_metadata.tar.gz", cfg.gtdb_url + "/bac120_metadata.tar.gz"], build_output_folder) + files = 
download([cfg.gtdb_url + "/ar53_metadata.tsv.gz", cfg.gtdb_url + "/bac120_metadata.tsv.gz"], build_output_folder) else: print_log("Parsing auxiliary files for genome size", cfg.quiet) files = cfg.genome_size_files @@ -73,21 +73,19 @@ def parse_genome_size_files(cfg, build_output_folder): elif cfg.taxonomy == "gtdb": for file in files: - with tarfile.open(file, mode='r:gz') as tfile: - tfile.extractall(path=build_output_folder) - for n in tfile.getnames(): - with open(build_output_folder + "/" + n, "r") as f: - # skip first line wiht header - # col 0: accession (with GC_ RF_ prefix), col 13: genome_size, col 16: gtdb_taxonomy (d__Archaea;p__Thermoproteota;...) - next(f) - for line in f: - fields = line.rstrip().split("\t") - t = fields[16].split(";")[-1] # species taxid (leaf) - # In GTDB, several genome sizes are available for each node - # accumulate them in a list and make average - if t not in leaves_sizes: - leaves_sizes[t] = [] - leaves_sizes[t].append(int(fields[13])) + with gzip.open(file, "rt") as f: + # skip first line with header + # col 0: accession (with GC_ RF_ prefix), col 13: genome_size, col 16: gtdb_taxonomy (d__Archaea;p__Thermoproteota;...) 
+ next(f) + for line in f: + fields = line.rstrip().split("\t") + t = fields[16].split(";")[-1] # species taxid (leaf) + # In GTDB, several genome sizes are available for each node + # accumulate them in a list and make average + if t not in leaves_sizes: + leaves_sizes[t] = [] + leaves_sizes[t].append(int(fields[13])) + # Average sizes for t in list(leaves_sizes.keys()): leaves_sizes[t] = int(sum(leaves_sizes[t])/len(leaves_sizes[t])) diff --git a/tests/ganon/data/build-custom/ar53_metadata.tar.gz b/tests/ganon/data/build-custom/ar53_metadata.tar.gz deleted file mode 100644 index 0cfeceed404c5c63da600761190f29f09f818345..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2364 zcmV-C3B&duiwFP!000001MOK`bK5o+_AB!%_}I?W3;==`OrB(C<0g&0sgrEF`@(@J zh{A>hRf2S*{q;TIO(0D<$t2Ux&KS~A#5uT}>xToISAm}tqUIZ3^Q0mu@@jo|_FWr- z7E*eKq6mcqL7&hlqAeXE9MFKAVd|40B0l!RGlXe?$(f73uNcd%(KWAJ_iQh#@*%wo znedFS3aw?S9A2zt)|8D-)?5qiq&ty*ERwV=_PMBqf?-FRs(B$5ug%1s zSn%qjsDRg~dPd8o5fnKRHq3fPIc838@uA7ATy5}dk zP!`D;VCVrSt9N7f_eNBQM0N`V_vQ!o^k6bOm`&C)zvJzi>zspVJO|Ckh34F~taVye zVv#BCc-}0tj1yr;aEI~r(g@wG|0~k^zF4Ev3c;17s8OgT4aOXavfx>=li3b51$AJ< zj+B>w%qWG}$fg*H%JQR{_)(>6zznTLvCa>$(VfU=G{p-sBbi|7NNZW{tJ3H<_BA)T z*mWndZ&N^sp%1L;&~*Ub`-u>KYpy;Xw>(0?z&H7k!i20GrY|JTFkb319~bXB#}w$O zKmtSdpxRScCOa#WLg;e&aN^G_GeI{x(TWWOm5-lH$HB!`l zN)*f~Y#0N(S0cBptM^6<50VTg}%)h}APowm?>Zs+EKVY!qzSje?!_ zVl!E~QTHwKN2XyJZ4{e2b>^zhOe07gt!Cb`>>l_`$Rp}Zs6E_FXj7b-uq=S->7(UM zzgN_}j`*lYAX*a7=1e5C8J3VX`M#VscP47b8l}A#39WDW;9kqq?W-0rtV|EuIH%9l zCs!BA)#WOI>Wu_6MuCSNZr1IP2{^nY`Bo_|2ND{(>cAoRgXV3E+`5W^uDF(+m*SSkuU1XEvsc>cZ}fHC$+7=@un zSP=Wpp!rHDnH5LHcZUrm(oR$b&)qwS5Z9FS9cVi-1>?7mEqrQ+A=^KCu8Kq$K5jR1HtE#tBnL4ZM1vz8R+KrZ^}K+^l2-oZ!Ix<80ns(_$f$-Oos$G z0P7!@c()Vodj(=vrjm5at@}=V;@ai4yMil$RO75^z?m2`ab+UO1dxdvd&s9GdH!Z~ zbK_=J+3ZdClwxM)*C>sKw7}SWJ2g8B>O!h--~m5u(`$&wlV)lkn8N5;v@&Fp)2tkW zM?>uV@+yEZVAR7N3yy&}&A&3kLc%PX>LLjKr_rPgBWsos`045{gbGU29f#U}ri9?J5VrU3x1SyIrMb5&OUkx$u 
zFIE^bH8LSFzC&odT3!9?!_TONmf_tX%dppJWP$VqREQ$nKl>z^`RyLzw1>fiQUB~P z028xr_dKFNFPQ}cB#se@S{MK(g$ZG6RWgc20L0Ewa1M@`Tk=61xPAHJpTGTkg_9TW zU%yWLz2yJ{*V!jWKU7>v_FFwW4=Lyhc>?8P(g`&#l%YG|JKytt-u1)o~ac!QS3_a?Ph6pxj&>NwrAg7Nu8xY{| zAi>wh&BG5kt{gbDp4`5L9D3270GQ%FZDZnnt}KXY5FzZuu@^(mgn(H9XECVb_z~0s zfMCq?p>&`Gu_%flzXLd9B4=U0dCp$Tw3_7Xkm3E}2FpjYM zE>Gn~?sqb8iU&#r`1Ddwe;~7CIK*!LlU%-mBb9~J!$%%~){NDT4oHWD2u{m^6;l)- z20y+Zp%Ip+`cIUfX>OfBM-HgFNS~Zc*OFs8Q>!m9CbY}`4@mA+m;G(EVH{_}455Hd zeYJrwh0I+#(E()6;b0o-jluh!u??uqac^2zob(?vVEhI31bltt07g&*fsLKc1}~r} zH-8Io-Y(1q%90oZ&MmhQ1 znFVn@O9G6H(U~95lOPH(VG;cJaQ6HSNdV%+jw#WOSjgb4Wl@iXP<%MZgPYESa2Uq~ zE(dT+f@Fx}fRPZAJ9LaS5rZoNh4Kb)90xdrfQ<;))2E^DEbO^d!|Zr|zmCyxsHqMT z$L@fLN5TMz!!m2xg@xUB;ij%@uJmWnu#E-mT}JyDwouSvR04R8a1PEb}sxDcn_WMgOOA7i0l!siYEyUyF37%&3F= z!^_)?m)Adzi=PD5#UNkQv5og-P9OWr%png>7;c7?%QpQ|c>e#;290S7RT~>%zNMhy z?!ky-W%K9mq7j}YJEm|Dz+rD97Ve)u4VJk&hWip=*Qk0% z%cT-L6Emt7^>ntbD=t;CgVU-qgA-+aKxw&=DjQ_#1{q$gBjUkPm{)$*xlEzxTJwS)zSestIiI9|)FSALP2=&}I;w8hnX zkxHSHa%%)=G|B^;AHh~ z9RAUW>X688gW%cvz+N6qRtK}iTINr@-E*CD5RK=c`MA-XyOy<1%StRV#S_n)WtMRw zoCxkPzFwN4oAtj$T0a+SbXp;}vJ^E6wWPtABT*JSOLj8bfu^7i9N3Za^3NHi5F6PP zBT-p?R1-g{bS)FEwJ6s40SB8CKtQmB=&6z2r-O- zRUNv)hS&W<2){K~Uyeue5dsFj$&VBkWaY4YA!&yBQkVI-c{ezwKt}}<7_y&Vl(|To zT-c(kv{KwAlvT!+)P-dtFh(t;vS(FdtHmEPQLEWjqpO!_`CzP(qMlQtU`}Df95}ra zxn*6w=Sqo9lJUJBC|%}5OhTM?BnVbpnhy86P~S>X4J(^V#`k2DG-@kz zm@kHDWzBOV$`Ixjbwa7Q(!Jkmz8*rXo>{U5vI11CBy3=#V9Rb4oU|94$<~c}ZjnDT z4clm=*wm>rS9N9@LF#BV^Oj}Lz-K}pQD;K!;buac;>?6)0ZdOHEpNuXqULqPM?C`3 zl6W>}BB9N>W=e<+QmoQ9ITs?XyT|eai>;TAprSwSZ}5deFu>eWt#-zD%yKRtd(G z27ZhKmmrJ+>{1_oiO%~zVhj9=R<|EHT zj0Vhu7ngV;@uQGQ=eLjT1VV^Q!zc`0%7WNKgXSxtWL6v% z-yJrPNIOv#Ja?WTLYzXF?#F;OihVGC``E&#Mhp|yJ+8VAWA*W_Q}&4<#nFd{_xCF= zq!c9ixVpQ?VdzIJ3|ohpWv%`cljsb`XCy>(CvCt?Ph#V2tFo{K**1BO#8+eF#3S7{ z?E}N-imQzT9Bs67^cm>p^l!>JuJrs?P|w<8_At_&V2Aqj86IUjZOaPg6-nMb-rgo&V7_}^)uS0k zWU%sS%!_^sf5`|&vU41q_ce$y<{312>EKHa9hAs7&Mja6G{c80sN~zSQj-7VTnf0s 
z=Hf7J`&U5%k^`!^Hr?OGv3Ytj^{77{BG{loZ-kzLQ6FtKAVBe;!S}|^%MU152#VI5 zhmVj$@7fALeA1_FOx@Sog4hotOi&!VG2~1Lm<4bagF48IpceoHW3C6SgHJJwq6qRk zfEp8_h4bb)dneOslCwubm$S$5&=yz}0AgX>cE?rPk($Cd!X7(3l^eO=$-F6EXc6Gk zOFjL8%#NXm-TyPW`T!-Bg}zIUJOHg3tDPOt4hb<-%Yha9IKT{kJTJl{EKl{HC_mHO zsz65$sJlp?RHi%0i9b`TZ!jjbtNtHI&UKglZMR_@XTuEj0iF780}~%Icd4QS$ecl8 z8v2dF`<<~3sLXM1+E<+PA2VS54gCasJ>vjIaRh-)P-la8(3G411vno~!(8Zb!ZavP znRJd8CnTQr3n!I1jWaLF@Ym19WBd~2)6Q%?rAHhFIXp%=eXV9eT+fn#U}JRV#q%VH z0zz2?KOWSc|3MOfI59FU+7SyG)LItxSP0Dr!Y-@Yg;GM? zFo3MK`_!ocZ0)D2&-Au};AhwY`d9fRpBjCV0X?4UTtllhRkFv%32gxzJ0~t*mR)5;=+6J@n`l431Y+UaBmYoWBAQuoGn>Ku0(e_9=1^YxxbHtMw{wfl};Bp^Z=_Z`!Fjtm3x?EZDcB{Z7E~TBfb> zvuLb~8WhljS-lg96gn<8MuA#8nU7+tzkSvT-)xF(4WmM(LRalCrB>Vvd()@4dSRqaa73)I74fZud+QHbR)6}rg$zUkVzH`tz@w)3u9kD*4)&h z9gcE;h=3B^7^cd-9RRtv3nBehTwNcIJWxP4Z|Wm~1!>VOpDCJ7Ug{$2H@AaB30M>$ zaAYf=7nw-vObk_5NvZe{D9V&8sq=x0z!{a0YA98TjTXO6R4pf4^`V}_vd&pOqK=W! zpCYW2fzwNo4ZN#&Tq&`RQ@+z3Vk;kV67tkSkgP*(EbrM=-$+q*E9+G*DslRq9fBf1@SK@5D`$N;=X4jx7bb^_Rfi;T{E6o| z-;LoSjz1{VSXhK-P?F2j%@~zD5nKSC#_?CF*3C+Ncy^`I;AEKq%Q>bTTwA+tv2oiv zcm92Gb?$7yGTa$4z8RCl&^fz+HW&vOEK-FM;G^c&{AmPy=!K2v5IV>PjN%20aOCt3 zm{>n&`un=P-a=V5bu0AgFSk7F++gfIJ>zOEfx~L;_EyheL+jnM<>gNk%Qo<@nqX&b z`{;gZzCoPj)#oOXH()k*E-rsQU;3DM)VW%IK1U=#9wFht6<@BF|8#$?6J4lNcli~z z#po?!@NuSz(a3M=EE_Cu{+I-Oq&b1nicU8eL>0R_3Au}C?jCqi(3rq-(BG`d~=N#e?7O{q&Bi{!p9J0WtK+o6@BjU`Q`BLd`4+}&E19@##}(l(?+o(`}_yoRV@RS+=7j_Hx0w?NPoIn%r_`FkVQ?xrkq*ZkaM zUnwkM56hF-+rZ#ycw1rqKusHE-T_@xli~2q5|My(UkD#^cwcZ2l-)t4Jh06Vmsh~IAEE2}k!w_fq{dTQ0YIm2ANeMG2xCQI>TIZ)>%l7}~YvLu`idZAuOJo)8a%DDeG{ z&}K)7Qh%l$4h`Q!k72v7<;M@h5;a--cxyVgzrQUH4b|IThlSXSOkdue z{OeY@zwCr^$36Pbe8-g$Cz#UjPHB_C6Q&#I?d-%%3V)XZx3ln+yq`fmbJF8uhWaWq zue)848lhNIWvB0pIQc zqY=G>#3RS2jh7h%Z%2P29`9`(@OY2SHSRXfvqXN5!w84)9yik~~r{pz8(&aD1u804M+ep9?>9 diff --git a/tests/ganon/data/build-custom/bac120_metadata.tsv.gz b/tests/ganon/data/build-custom/bac120_metadata.tsv.gz new file mode 100644 index 
0000000000000000000000000000000000000000..0f849405b5357d728a062c795c6268b7c061c559 GIT binary patch literal 2140 zcmV-i2&4BOiwFo_rdMMC17cxgF)}b;ZDn*}WMOn+Uve@qH!gH@b^z^ITW{Pp7JgQL zh2C}-NG7DH8`n=0HwoI^xQ5;ALtX@qhNIzHqNbwcdh+Y{kd!o(Mv3hp4O(Df!xBFp zlIQyA&@9inQlgd~t2UzO>P}@FrnvI*JFYLjJHFvy$ zxlXn-tQ3;7g3nOJ+o`l_8zyA7htrz6fD=V~^zwQuWHIPu2U{+cos+F7`b~=b^m5=$ z&Ot{w6ZWYqo+w$(J8m_cZEAIQLC9Gu+H7q#xqvL_Y@k$iE@r!7t>JAi8 z&a67{TyT}uJFP&cyfQ+W)!)9Wob7gXxrI?J3$EIJb{q}@Ua7h)AgZ6wN-iBpJ75^W zOwsPg;cp#pj+w92f9nBUiQQkAN=cZ(!8Y{u_aijU{Mzkug8@@Qn?^)R` zPw61Si4YDQ>m>=*ZT`#i_FSzoXccGDF;p95puw3VC@NNDdr|DcQZNS&v`|_9K0_+L z67Xc-P zF-$c_JJ_6cNyt+RL9)8qSl+X#z7xC|R(6|8v_OV-=xi|N_hgiHawkfV z=Yv??vQn!uq`8AmkcLUs$E{_XA;sF6Ck>HRU}_^^1D6HObSL4Y1K&=z?&P^gelHYk zqqJeupw2>N-87Qa$!q$pD9*@d0*|aSL3_HHU{ju%OgRM8`YZVHEMg72`08|2(J@q^Qt(A)%j8B;PjRH!@JknyUTTk5W-=C zaJ0f6@}gvw5_sYu_U^LmHOozU_<#TK(~m(&qknjb7Q+-19C;M4l7Pk}reQ=A5_;GR z(tso$CaZ{~Nlep(P#Q&EfYK-qy@fMdZ+klD;*e$6I*01bfaF>Bj#U*qOyN4q-bme8 zScH{OlIx4z6csGzj00X|*+(I_W~Dm4`$8&kvdDquJY5dHtzEy~`F)*x|GvJx@^)Yu z9*meiw8=^0U0x#;;248NvX&ftWFF>^5r_yjjz#Ds8&iT;G{vbmI$&n~lIgFz=57yV zW$Kozi(ejCIk-XFdvVF+Rse@}${(#>z+CJ7%k|AqGs|}H)6THVzJ2uHnJwo&ccDbtMg}?p?+hX(*QTX-diDBe- zU0FKI%^#hBPc$diT9Nq%PGsQtSBSzd?3t!v07;EKv-pFAp0Uw=sm_M?x^yRiu4lo6R9%XnT7)MF{AX0^;^9#d8!I$q7^GS>1jhhA?TrB>FL4pGSnmrXAgjBV0bwsVDt)~CNRbmoD%90iWBf0MZm{I&jumV zZi=z~BvO<-kE7?tU%WRhFfYrq1i99Yby*5McX93>zaMm!fAtX?f&yhI9v(eQPi$Xe zX&X|Pr(+xjp&@Em6~~m)b9xNWOCV^1ym?;e{JrN}f7jHNUvYgmH%A%5cFQxc569qn zcza>?NKI>HUIASbo#F6hNrN~TJ_v6rxG(r8${y#xRq`&U?u-}R9R8BMf3Q*yODIrH z&YKoq6kK?f7Gkn};kSyz?+XYaE03G5c;U0dUTk0Zyv1=8j8VJ-`j3KdTf9@R0E5?Z%OL+; z+%yfmWYBikc@BPId^m)y|*HZmF zTWf~9Ka)_3HbwQ{5j5m3p;d9C;oQ0^jZ8J#+JX6Ad1p31Q%HT zPO#-~xSXJvCZWFcSl^N*{{CJ)`ZM_ZDgR6{=1d=k#8aIb@O6(VNy%qOJoO@Cyi6&0 zJN*ms_-L!c<0H1zxZ5}{68RNQQk=j&ZlPDkFH!gs^+fI@%TxcY^2q1;2^=2aS15di zW*iRN-n@l_{^q}}Rn8hQWAXoRDRMrp_&<$*Sv@qC&WxGK0yC2*bk;PUm0!=yD74iE SJ@;ROoc{;4ysJ429{>QMFDB~% literal 0 HcmV?d00001 diff --git 
a/tests/ganon/data/build-custom/filter_files.sh b/tests/ganon/data/build-custom/filter_files.sh index 5e05fad7..fd831b11 100644 --- a/tests/ganon/data/build-custom/filter_files.sh +++ b/tests/ganon/data/build-custom/filter_files.sh @@ -9,8 +9,8 @@ wget --quiet --output-document - "ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/ass wget "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz" --output-document base_files/new_taxdump.tar.gz wget "https://data.gtdb.ecogenomic.org/releases/latest/bac120_taxonomy.tsv.gz" --output-document base_files/bac120_taxonomy.tsv.gz wget "https://data.gtdb.ecogenomic.org/releases/latest/ar53_taxonomy.tsv.gz" --output-document base_files/ar53_taxonomy.tsv.gz -wget "https://data.gtdb.ecogenomic.org/releases/latest/bac120_metadata.tar.gz" --output-document base_files/bac120_metadata.tar.gz -wget "https://data.gtdb.ecogenomic.org/releases/latest/ar53_metadata.tar.gz" --output-document base_files/ar53_metadata.tar.gz +wget "https://data.gtdb.ecogenomic.org/releases/latest/bac120_metadata.tsv.gz" --output-document base_files/bac120_metadata.tsv.gz +wget "https://data.gtdb.ecogenomic.org/releases/latest/ar53_metadata.tsv.gz" --output-document base_files/ar53_metadata.tsv.gz tar xf base_files/bac120_metadata.tar.gz bac120_metadata_r207.tsv tar xf base_files/ar53_metadata.tar.gz ar53_metadata_r207.tsv tar xf base_files/new_taxdump.tar.gz taxidlineage.dmp nodes.dmp names.dmp @@ -52,8 +52,8 @@ head -n 1 base_files/bac120_metadata_r207.tsv > bac120_metadata_r207.tsv head -n 1 base_files/ar53_metadata_r207.tsv > ar53_metadata_r207.tsv join <(cut -f 1 assembly_summary.txt | sort) <(awk 'BEGIN{FS=OFS="\t"}{print substr($1,4,length($1)), $0}' base_files/bac120_metadata_r207.tsv | sort -t$'\t' -k 1,1) -t$'\t' | cut -f 2- >> bac120_metadata_r207.tsv join <(cut -f 1 assembly_summary.txt | sort) <(awk 'BEGIN{FS=OFS="\t"}{print substr($1,4,length($1)), $0}' base_files/ar53_metadata_r207.tsv | sort -t$'\t' -k 1,1) -t$'\t' | cut -f 2- >> 
ar53_metadata_r207.tsv -tar -czf bac120_metadata.tar.gz bac120_metadata_r207.tsv -tar -czf ar53_metadata.tar.gz ar53_metadata_r207.tsv +gzip -c bac120_metadata_r207.tsv > bac120_metadata.tsv.gz +gzip -c ar53_metadata_r207.tsv > ar53_metadata.tsv.gz rm bac120_metadata_r207.tsv ar53_metadata_r207.tsv # make nucl_gb.accession2taxid.gz diff --git a/tests/ganon/integration/test_build_custom.py b/tests/ganon/integration/test_build_custom.py index d59cd598..f04c2d6d 100644 --- a/tests/ganon/integration/test_build_custom.py +++ b/tests/ganon/integration/test_build_custom.py @@ -215,8 +215,8 @@ def test_taxonomy(self): params["taxonomy"] = "gtdb" params["taxonomy_files"] = [data_dir + "build-custom/ar53_taxonomy.tsv.gz", data_dir + "build-custom/bac120_taxonomy.tsv.gz"] - params["genome_size_files"] = [data_dir + "build-custom/ar53_metadata.tar.gz", - data_dir + "build-custom/bac120_metadata.tar.gz"] + params["genome_size_files"] = [data_dir + "build-custom/ar53_metadata.tsv.gz", + data_dir + "build-custom/bac120_metadata.tsv.gz"] cfg = Config("build-custom", **params) self.assertTrue(run_ganon(cfg, params["db_prefix"]), "ganon build-custom run failed") res = build_sanity_check_and_parse(vars(cfg)) @@ -302,8 +302,8 @@ def test_level_file_default(self): params["taxonomy"] = "gtdb" params["taxonomy_files"] = [data_dir + "build-custom/ar53_taxonomy.tsv.gz", data_dir + "build-custom/bac120_taxonomy.tsv.gz"] - params["genome_size_files"] = [data_dir + "build-custom/ar53_metadata.tar.gz", - data_dir + "build-custom/bac120_metadata.tar.gz"] + params["genome_size_files"] = [data_dir + "build-custom/ar53_metadata.tsv.gz", + data_dir + "build-custom/bac120_metadata.tsv.gz"] cfg = Config("build-custom", **params) self.assertTrue(run_ganon(cfg, params["db_prefix"]), "ganon build-custom run failed") res = build_sanity_check_and_parse(vars(cfg)) @@ -338,8 +338,8 @@ def test_level_file_taxrank(self): params["taxonomy"] = "gtdb" params["taxonomy_files"] = [data_dir + 
"build-custom/ar53_taxonomy.tsv.gz", data_dir + "build-custom/bac120_taxonomy.tsv.gz"] - params["genome_size_files"] = [data_dir + "build-custom/ar53_metadata.tar.gz", - data_dir + "build-custom/bac120_metadata.tar.gz"] + params["genome_size_files"] = [data_dir + "build-custom/ar53_metadata.tsv.gz", + data_dir + "build-custom/bac120_metadata.tsv.gz"] cfg = Config("build-custom", **params) self.assertTrue(run_ganon(cfg, params["db_prefix"]), "ganon build-custom run failed") res = build_sanity_check_and_parse(vars(cfg)) @@ -373,8 +373,8 @@ def test_level_file_leaves(self): params["taxonomy"] = "gtdb" params["taxonomy_files"] = [data_dir + "build-custom/ar53_taxonomy.tsv.gz", data_dir + "build-custom/bac120_taxonomy.tsv.gz"] - params["genome_size_files"] = [data_dir + "build-custom/ar53_metadata.tar.gz", - data_dir + "build-custom/bac120_metadata.tar.gz"] + params["genome_size_files"] = [data_dir + "build-custom/ar53_metadata.tsv.gz", + data_dir + "build-custom/bac120_metadata.tsv.gz"] cfg = Config("build-custom", **params) self.assertTrue(run_ganon(cfg, params["db_prefix"]), "ganon build-custom run failed") self.assertIsNotNone(build_sanity_check_and_parse(vars(cfg)), "ganon build-custom sanity check failed") @@ -415,8 +415,8 @@ def test_level_file_specialization(self): params["taxonomy"] = "gtdb" params["taxonomy_files"] = [data_dir + "build-custom/ar53_taxonomy.tsv.gz", data_dir + "build-custom/bac120_taxonomy.tsv.gz"] - params["genome_size_files"] = [data_dir + "build-custom/ar53_metadata.tar.gz", - data_dir + "build-custom/bac120_metadata.tar.gz"] + params["genome_size_files"] = [data_dir + "build-custom/ar53_metadata.tsv.gz", + data_dir + "build-custom/bac120_metadata.tsv.gz"] cfg = Config("build-custom", **params) self.assertTrue(run_ganon(cfg, params["db_prefix"]), "ganon build-custom run failed") self.assertIsNotNone(build_sanity_check_and_parse(vars(cfg)), "ganon build-custom sanity check failed") @@ -447,8 +447,8 @@ def test_level_file_specialization(self): 
params["taxonomy"] = "gtdb" params["taxonomy_files"] = [data_dir + "build-custom/ar53_taxonomy.tsv.gz", data_dir + "build-custom/bac120_taxonomy.tsv.gz"] - params["genome_size_files"] = [data_dir + "build-custom/ar53_metadata.tar.gz", - data_dir + "build-custom/bac120_metadata.tar.gz"] + params["genome_size_files"] = [data_dir + "build-custom/ar53_metadata.tsv.gz", + data_dir + "build-custom/bac120_metadata.tsv.gz"] cfg = Config("build-custom", **params) self.assertTrue(run_ganon(cfg, params["db_prefix"]), "ganon build-custom run failed") self.assertIsNotNone(build_sanity_check_and_parse(vars(cfg)), "ganon build-custom sanity check failed") diff --git a/tests/ganon/integration_online/test_build_custom.py b/tests/ganon/integration_online/test_build_custom.py index 956309b5..7adb3393 100644 --- a/tests/ganon/integration_online/test_build_custom.py +++ b/tests/ganon/integration_online/test_build_custom.py @@ -40,7 +40,7 @@ def setUpClass(self): def test_taxonomy(self): """ - ganon build-custom with --taxonomy ncbi,gtdb + ganon build-custom with --taxonomy ncbi,gtdb (downloads taxonomy) """ #ncbi params = self.default_params.copy() @@ -69,6 +69,7 @@ def test_level_sequence_default_gtdb(self): # --level default (sequence) - GTDB params = self.default_params.copy() params["db_prefix"] = self.results_dir + "test_level_sequence_default_gtdb" + params["filter_type"] = "ibf" params["input_target"] = "sequence" params["taxonomy"] = "gtdb" params["taxonomy_files"] = [data_dir + "build-custom/ar53_taxonomy.tsv.gz", @@ -86,6 +87,7 @@ def test_level_sequence_taxrank_gtdb(self): # --level genus NCBI params = self.default_params.copy() params["db_prefix"] = self.results_dir + "test_level_sequence_taxrank_gtdb" + params["filter_type"] = "ibf" params["input_target"] = "sequence" params["level"] = "genus" params["taxonomy"] = "gtdb" @@ -106,6 +108,7 @@ def test_level_sequence_leaves_gtdb(self): # --level leaves NCBI params = self.default_params.copy() params["db_prefix"] = 
self.results_dir + "test_level_sequence_leaves_ncbi" + params["filter_type"] = "ibf" params["input_target"] = "sequence" params["level"] = "leaves" params["taxonomy"] = "gtdb" @@ -123,6 +126,7 @@ def test_level_sequence_assembly(self): # --level assembly no tax params = self.default_params.copy() params["db_prefix"] = self.results_dir + "test_level_sequence_assembly" + params["filter_type"] = "ibf" params["input_target"] = "sequence" params["level"] = "assembly" params["ncbi_sequence_info"] = ["nucl_gb"] @@ -133,6 +137,7 @@ def test_level_sequence_assembly(self): # --level assembly NCBI params = self.default_params.copy() params["db_prefix"] = self.results_dir + "test_level_sequence_assembly_ncbi" + params["filter_type"] = "ibf" params["input_target"] = "sequence" params["level"] = "assembly" params["taxonomy"] = "ncbi" @@ -145,6 +150,7 @@ def test_level_sequence_assembly(self): # --level assembly GTDB params = self.default_params.copy() params["db_prefix"] = self.results_dir + "test_level_sequence_assembly_gtdb" + params["filter_type"] = "ibf" params["input_target"] = "sequence" params["level"] = "assembly" params["taxonomy"] = "gtdb" @@ -159,6 +165,7 @@ def test_ncbi_sequence_info(self): # dead_nucl nucl_gb params = self.default_params.copy() params["db_prefix"] = self.results_dir + "test_ncbi_sequence_info" + params["filter_type"] = "ibf" params["input_target"] = "sequence" params["taxonomy"] = "ncbi" params["taxonomy_files"] = data_dir + "build-custom/taxdump.tar.gz" @@ -171,6 +178,7 @@ def test_ncbi_sequence_info(self): # dead_nucl - none found params = self.default_params.copy() params["db_prefix"] = self.results_dir + "test_ncbi_sequence_info_wrong" + params["filter_type"] = "ibf" params["input_target"] = "sequence" params["taxonomy"] = "ncbi" params["taxonomy_files"] = data_dir + "build-custom/taxdump.tar.gz" @@ -181,6 +189,7 @@ def test_ncbi_sequence_info(self): # eutils params = self.default_params.copy() params["db_prefix"] = self.results_dir + 
"test_ncbi_sequence_info_eutils" + params["filter_type"] = "ibf" params["input_target"] = "sequence" params["taxonomy"] = "ncbi" params["taxonomy_files"] = data_dir + "build-custom/taxdump.tar.gz" diff --git a/tests/ganon/integration_online/test_report.py b/tests/ganon/integration_online/test_report.py index e25ce229..9499b4f9 100644 --- a/tests/ganon/integration_online/test_report.py +++ b/tests/ganon/integration_online/test_report.py @@ -114,8 +114,8 @@ def test_gtdb(self): "taxonomy": "gtdb", "taxonomy_files": [data_dir + "build-custom/ar53_taxonomy.tsv.gz", data_dir + "build-custom/bac120_taxonomy.tsv.gz"], - "genome_size_files": [data_dir + "build-custom/ar53_metadata.tar.gz", - data_dir + "build-custom/bac120_metadata.tar.gz"], + "genome_size_files": [data_dir + "build-custom/ar53_metadata.tsv.gz", + data_dir + "build-custom/bac120_metadata.tsv.gz"], "level": "species", "threads": 1, "keep_files": True, diff --git a/tests/ganon/utils.py b/tests/ganon/utils.py index 99c8d5e0..a6c558c4 100644 --- a/tests/ganon/utils.py +++ b/tests/ganon/utils.py @@ -11,6 +11,7 @@ sys.path.append('src') from ganon import ganon from ganon.config import Config +from ganon.util import download def run_ganon(cfg, prefix): """ @@ -102,7 +103,8 @@ def build_sanity_check_and_parse(params, skipped_targets: bool=False): res = {} if not check_files(params["db_prefix"], ["ibf"] if params["taxonomy"] == "skip" else ["ibf", "tax"]): - return None + if not check_files(params["db_prefix"], ["hibf"] if params["taxonomy"] == "skip" else ["hibf", "tax"]): + return None # target_info file to be read by ganon-build # res["target"] fields ['file', 'target', 'sequence'] @@ -386,8 +388,8 @@ def download_bulk_files(download_dir): "genomes/genbank/assembly_summary_genbank_historical.txt", "genomes/ASSEMBLY_REPORTS/species_genome_size.txt.gz"], "https://data.gtdb.ecogenomic.org/releases/latest/": - ["ar53_metadata.tar.gz", - "bac120_metadata.tar.gz"] + ["ar53_metadata.tsv.gz", + "bac120_metadata.tsv.gz"] 
} for url, files in bulk_files.items(): From 8f2e92fea36d08fe0e32915ce77268bdf3241415 Mon Sep 17 00:00:00 2001 From: pirovc <4673375+pirovc@users.noreply.github.com> Date: Fri, 12 Jan 2024 18:24:24 +0100 Subject: [PATCH 3/7] docs (#278) --- README.md | 2 ++ docs/custom_databases.md | 19 +++++++++++++------ docs/index.md | 2 ++ 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index b0fd3025..7487c4a1 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,8 @@ [![Build Status](https://travis-ci.com/pirovc/ganon.svg?branch=master)](https://travis-ci.com/pirovc/ganon) [![codecov](https://codecov.io/gh/pirovc/ganon/branch/master/graph/badge.svg)](https://codecov.io/gh/pirovc/ganon) [![Anaconda-Server Badge](https://anaconda.org/bioconda/ganon/badges/downloads.svg)](https://anaconda.org/bioconda/ganon) [![Anaconda-Server Badge](https://anaconda.org/bioconda/ganon/badges/platforms.svg)](https://anaconda.org/bioconda/ganon) [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/ganon/README.html) [![Publication](https://img.shields.io/badge/DOI-10.1101%2F406017-blue)](https://dx.doi.org/10.1093/bioinformatics/btaa458) +[ganon2 pre-print](https://www.biorxiv.org/content/10.1101/2023.12.07.570547) + ganon2 classifies DNA sequences against large sets of genomic reference sequences efficiently. It features: - integrated download and build of any subset from RefSeq/Genbank/GTDB with incremental updates diff --git a/docs/custom_databases.md b/docs/custom_databases.md index ec3c9696..f8d06b79 100644 --- a/docs/custom_databases.md +++ b/docs/custom_databases.md @@ -17,7 +17,7 @@ It is also possible to use **non-standard accessions and headers** to build cust If you just want to build a database without any taxonomic or target information, just sent the files with `--input`, use `--taxonomy skip` and choose between `--input-target file` or `sequence`. !!! 
warning - the target and specialization fields (2nd and 4th col) cannot be the same as the target (3rd col) + the target and specialization fields (2nd and 4th col) cannot be the same as the node (3rd col)
Examples of --input-file @@ -121,8 +121,15 @@ wget -A genomic.fna.gz -m -nd --quiet --show-progress "ftp://ftp.ncbi.nlm.nih.go wget -A genomic.fna.gz -m -nd --quiet --show-progress "ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/plastid/" wget -A genomic.fna.gz -m -nd --quiet --show-progress "ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/mitochondrion/" +# Split sequences in files and retrieve taxonomy +mkdir sequences/ +zcat plasmid.* plastid.* mitochondrion.* | awk '$0 ~ ">" {accver=(substr($1,2)); print accver}{print $0 > "sequences/"accver".fna"}' | ganon-get-seq-info.sh -e -i - | awk '{print "sequences/"$1".fna\t"$1"\t"$3}' > ppm.tsv + # Build ganon database -ganon build-custom --input plasmid.* plastid.* mitochondrion.* --db-prefix ppm --input-target sequence --level leaves --threads 32 +ganon build-custom --input-file ppm.tsv --db-prefix ppm --level species --threads 16 + +# OPTIONAL Remove temporary folder and downloaded files +rm -rf sequences/ ppm.tsv plasmid.* plastid.* mitochondrion.* ``` ### UniVec, UniVec_core @@ -132,13 +139,13 @@ ganon build-custom --input plasmid.* plastid.* mitochondrion.* --db-prefix ppm - ```bash # UniVec wget -O "UniVec.fasta" --quiet --show-progress "ftp://ftp.ncbi.nlm.nih.gov/pub/UniVec/UniVec" -grep -o '^>[^ ]*' UniVec.fasta | sed 's/^>//' | awk '{print "UniVec.fasta\t"$1"\t81077"}' > UniVec_ganon_input_file.tsv -ganon build-custom --input-file UniVec_ganon_input_file.tsv --db-prefix UniVec --input-target sequence --level leaves --threads 8 +echo -e "UniVec.fasta\tUniVec\t81077" > UniVec_ganon_input_file.tsv +ganon build-custom --input-file UniVec_ganon_input_file.tsv --db-prefix UniVec --level leaves --threads 8 # UniVec_Core wget -O "UniVec_Core.fasta" --quiet --show-progress "ftp://ftp.ncbi.nlm.nih.gov/pub/UniVec/UniVec_Core" -grep -o '^>[^ ]*' UniVec_Core.fasta | sed 's/^>//' | awk '{print "UniVec_Core.fasta\t"$1"\t81077"}' > UniVec_Core_ganon_input_file.tsv -ganon build-custom --input-file UniVec_Core_ganon_input_file.tsv
--db-prefix UniVec_Core --input-target sequence --level leaves --threads 8 +echo -e "UniVec_Core.fasta\tUniVec_Core\t81077" > UniVec_Core_ganon_input_file.tsv +ganon build-custom --input-file UniVec_Core_ganon_input_file.tsv --db-prefix UniVec_Core --level leaves --threads 8 ``` !!! note diff --git a/docs/index.md b/docs/index.md index 7b2aaa4e..ac6616e2 100644 --- a/docs/index.md +++ b/docs/index.md @@ -4,6 +4,8 @@ Code: [GitHub repository](https://github.com/pirovc/ganon) +[ganon2 pre-print](https://www.biorxiv.org/content/10.1101/2023.12.07.570547) + ganon is designed to index large sets of genomic reference sequences and to classify reads against them efficiently. The tool uses [Hierarchical Interleaved Bloom Filters](https://doi.org/10.1186/s13059-023-02971-4) as indices based on k-mers with optional minimizers. It was mainly developed, but not limited, to the metagenomics classification problem: quickly assign sequence fragments to their closest reference among thousands of references. After classification, taxonomic or sequence abundances are estimated and reported. 
## Features From b6905645716a1117270db7261fac50c9676429b6 Mon Sep 17 00:00:00 2001 From: pirovc <4673375+pirovc@users.noreply.github.com> Date: Fri, 12 Jan 2024 19:10:57 +0100 Subject: [PATCH 4/7] Bugfix/raptor symbolic link overwrite (#279) * unlink before creating link * do not allow --hash-functions 0 for hibf --- src/ganon/build_update.py | 7 +++++-- src/ganon/config.py | 6 +++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/ganon/build_update.py b/src/ganon/build_update.py index e41015da..d9d6a147 100644 --- a/src/ganon/build_update.py +++ b/src/ganon/build_update.py @@ -361,10 +361,13 @@ def build_custom(cfg, which_call: str="build_custom"): suffixes = Path(first_file).suffixes # If last one is gz, get real suffix exts = "".join(suffixes[-2:]) if suffixes[-1]==".gz" else suffixes[-1] + target_file = build_output_folder + new_target + exts + # Attempt to remove if exists + # Create symbolic link with correct name for the first file - Path(build_output_folder + new_target + exts).symlink_to(first_file) + Path(target_file).unlink(missing_ok=True) + Path(target_file).symlink_to(first_file) # Write input file for raptor (space separated) - filehibf.write(build_output_folder + new_target + exts + " " + " ".join(files[1:]) + "\n") + filehibf.write(target_file + " " + " ".join(files[1:]) + "\n") print_log("raptor prepare", cfg.quiet) run_raptor_prepare_cmd = " ".join([cfg.path_exec['raptor'], "prepare", diff --git a/src/ganon/config.py b/src/ganon/config.py index 366d79f6..9c6c6798 100644 --- a/src/ganon/config.py +++ b/src/ganon/config.py @@ -51,7 +51,7 @@ def __init__(self, which: str=None, **kwargs): build_default_advanced_args.add_argument("-p", "--max-fp", type=int_or_float(minval=0, maxval=1), metavar="", default=None, help="Max. false positive for bloom filters. Mutually exclusive --filter-size. 
Defaults to 0.001 with --filter-type hibf or 0.05 with --filter-type ibf.") build_default_advanced_args.add_argument("-k", "--kmer-size", type=unsigned_int(minval=1), metavar="", default=19, help="The k-mer size to split sequences.") build_default_advanced_args.add_argument("-w", "--window-size", type=unsigned_int(minval=1), metavar="", default=31, help="The window-size to build filter with minimizers.") - build_default_advanced_args.add_argument("-s", "--hash-functions", type=unsigned_int(minval=0, maxval=5), metavar="", default=4, help="The number of hash functions for the interleaved bloom filter [0-5]. 0 to detect optimal value.", choices=range(6)) + build_default_advanced_args.add_argument("-s", "--hash-functions", type=unsigned_int(minval=0, maxval=5), metavar="", default=4, help="The number of hash functions for the interleaved bloom filter [1-5]. With --filter-type ibf, 0 will try to set optimal value.", choices=range(6)) build_default_advanced_args.add_argument("-f", "--filter-size", type=unsigned_float(), metavar="", default=0, help="Fixed size for filter in Megabytes (MB). Mutually exclusive --max-fp. Only valid for --filter-type ibf.") build_default_advanced_args.add_argument("-j", "--mode", type=str, metavar="", default="avg", help="Create smaller or faster filters at the cost of classification speed or database size, respectively [" + ", ".join(self.choices_mode) + "]. If --filter-size is used, smaller/smallest refers to the false positive rate. By default, an average value is calculated to balance classification speed and database size. Only valid for --filter-type ibf.", choices=self.choices_mode) build_default_advanced_args.add_argument("-y", "--min-length", type=unsigned_int(minval=0), metavar="", default=0, help="Skip sequences smaller then value defined. 0 to not skip any sequence. 
Only valid for --filter-type ibf.") @@ -376,6 +376,10 @@ def validate(self): print_log("--filter-type hibf is currently only supported with --input-target file") return False + if self.filter_type == "hibf" and self.hash_functions == 0: + print_log("--filter-type hibf requires --hash-functions value between 1 and 5") + return False + if self.level == "custom" and not self.input_file: print_log("--level custom requires --input-file") return False From 4e6e535eb266e229547ea532320ad4b4e7a18a12 Mon Sep 17 00:00:00 2001 From: pirovc <4673375+pirovc@users.noreply.github.com> Date: Fri, 12 Jan 2024 20:15:46 +0100 Subject: [PATCH 5/7] Bugfix/skip tax report (#280) * use dummy tax. with skip * fix code --- src/ganon/config.py | 4 ++++ src/ganon/report.py | 27 +++++++++++++++------------ 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/src/ganon/config.py b/src/ganon/config.py index 9c6c6798..d2601c02 100644 --- a/src/ganon/config.py +++ b/src/ganon/config.py @@ -488,6 +488,10 @@ def validate(self): print_log("File not found: " + file) return False + if self.db_prefix and self.taxonomy == "skip": + print_log("To skip taxonomy, omit --db-prefix and set --taxonomy skip") + return False + elif self.which == "table": pass diff --git a/src/ganon/report.py b/src/ganon/report.py index c93d4686..cdcaa82b 100644 --- a/src/ganon/report.py +++ b/src/ganon/report.py @@ -7,7 +7,7 @@ from ganon.util import print_log from ganon.tax_util import get_genome_size, parse_genome_size_tax -from multitax import CustomTx, NcbiTx, GtdbTx +from multitax import CustomTx, NcbiTx, GtdbTx, DummyTx def report(cfg): @@ -46,20 +46,23 @@ def report(cfg): "Failed to get genome sizes from .tax files, run ganon report without -d/--db-prefix") return False else: - tx = time.time() - if cfg.taxonomy_files: - print_log("Parsing " + cfg.taxonomy + " taxonomy", cfg.quiet) + if cfg.taxonomy == "skip": + tax = DummyTx(**tax_args) else: - print_log("Downloading and parsing " + - cfg.taxonomy + " 
taxonomy", cfg.quiet) + tx = time.time() + if cfg.taxonomy_files: + print_log("Parsing " + cfg.taxonomy + " taxonomy", cfg.quiet) + else: + print_log("Downloading and parsing " + + cfg.taxonomy + " taxonomy", cfg.quiet) - if cfg.taxonomy == "ncbi": - tax = NcbiTx(files=cfg.taxonomy_files, **tax_args) - elif cfg.taxonomy == "gtdb": - tax = GtdbTx(files=cfg.taxonomy_files, **tax_args) + if cfg.taxonomy == "ncbi": + tax = NcbiTx(files=cfg.taxonomy_files, **tax_args) + elif cfg.taxonomy == "gtdb": + tax = GtdbTx(files=cfg.taxonomy_files, **tax_args) - print_log(" - done in " + str("%.2f" % - (time.time() - tx)) + "s.\n", cfg.quiet) + print_log(" - done in " + str("%.2f" % + (time.time() - tx)) + "s.\n", cfg.quiet) # In case no tax was provided, generate genome sizes (for the full tree) if cfg.report_type in ["abundance", "corr"]: From fedb30551e602fb3e10bf55f2e58524e8259d02d Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Fri, 12 Jan 2024 20:21:00 +0100 Subject: [PATCH 6/7] docs --- docs/index.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/index.md b/docs/index.md index ac6616e2..1163db4f 100644 --- a/docs/index.md +++ b/docs/index.md @@ -139,6 +139,8 @@ ctest -VV . ## Parameters +## Parameters + ``` usage: ganon [-h] [-v] {build,build-custom,update,classify,reassign,report,table} ... @@ -146,7 +148,7 @@ usage: ganon [-h] [-v] - - - - - - - - - - _ _ _ _ _ (_|(_|| |(_)| | - _| v. 2.0.0 + _| v. 2.0.1 - - - - - - - - - - positional arguments: @@ -221,8 +223,8 @@ advanced arguments: -k , --kmer-size The k-mer size to split sequences. (default: 19) -w , --window-size The window-size to build filter with minimizers. (default: 31) -s , --hash-functions - The number of hash functions for the interleaved bloom filter [0-5]. 0 to detect optimal value. - (default: 4) + The number of hash functions for the interleaved bloom filter [1-5]. With --filter-type ibf, 0 + will try to set optimal value. 
(default: 4) -f , --filter-size Fixed size for filter in Megabytes (MB). Mutually exclusive --max-fp. Only valid for --filter- type ibf. (default: 0) -j , --mode Create smaller or faster filters at the cost of classification speed or database size, @@ -312,8 +314,8 @@ advanced arguments: -k , --kmer-size The k-mer size to split sequences. (default: 19) -w , --window-size The window-size to build filter with minimizers. (default: 31) -s , --hash-functions - The number of hash functions for the interleaved bloom filter [0-5]. 0 to detect optimal value. - (default: 4) + The number of hash functions for the interleaved bloom filter [1-5]. With --filter-type ibf, 0 + will try to set optimal value. (default: 4) -f , --filter-size Fixed size for filter in Megabytes (MB). Mutually exclusive --max-fp. Only valid for --filter- type ibf. (default: 0) -j , --mode Create smaller or faster filters at the cost of classification speed or database size, From 10c1965d321f6279efcb8ad0eb247d3ee58f8857 Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Fri, 12 Jan 2024 20:25:06 +0100 Subject: [PATCH 7/7] docs --- docs/index.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/index.md b/docs/index.md index 1163db4f..dbfe57e9 100644 --- a/docs/index.md +++ b/docs/index.md @@ -139,8 +139,6 @@ ctest -VV . ## Parameters -## Parameters - ``` usage: ganon [-h] [-v] {build,build-custom,update,classify,reassign,report,table} ...