diff --git a/.github/workflows/docker-pipeline.yml b/.github/workflows/docker-pipeline.yml
index f39ae26..2475791 100644
--- a/.github/workflows/docker-pipeline.yml
+++ b/.github/workflows/docker-pipeline.yml
@@ -2,14 +2,19 @@ name: Build and push docker image for pipeline
 on:
   push:
-    tags: [ '*.*.*' ]
+    branches:
+      - "*"
+    tags:
+      - "*.*.*"
+  schedule:
+    - cron: '0 6 15 * *'  # 15th of each month at 6am
 
 env:
   REGISTRY: ghcr.io
   IMAGE_NAME: ${{ github.repository }}
 
 jobs:
-  build_pipeline_image:
+  build_and_test_pipeline_image:
     runs-on: ubuntu-latest
 
     steps:
@@ -29,9 +34,31 @@ jobs:
         with:
           images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
 
+      - name: Build image
+        uses: docker/build-push-action@v6
+        with:
+          annotations: ${{ steps.meta.outputs.annotations }}
+          build-args: |
+            MEFETCH_EMAIL=${{ secrets.MEFETCH_EMAIL }}
+            MEFETCH_API_KEY=${{ secrets.MEFETCH_API_KEY }}
+            YA16SDB_VERSION=${{ steps.meta.outputs.version }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+          labels: ${{ steps.meta.outputs.labels }}
+          load: true
+          push: false
+          tags: |
+            ${{ steps.meta.outputs.tags }}
+            gha_image
+
+      - name: Test run and unit tests
+        run: docker run gha_image /bin/bash -c "scons settings=testfiles/settings.conf && python -m unittest"
+
       - name: Build and push tag
+        if: github.ref_type == 'tag'
         uses: docker/build-push-action@v6
         with:
+          annotations: ${{ steps.meta.outputs.annotations }}
           cache-from: type=gha
           cache-to: type=gha,mode=max
           labels: ${{ steps.meta.outputs.labels }}
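The new test step exercises the pipeline against the bundled test settings inside the freshly built image before anything is pushed. A minimal sketch for reproducing it locally (assumes the image was built and tagged gha_image, e.g. with `docker build -t gha_image .`):

#!/usr/bin/env python3
"""Reproduce the CI test step outside of GitHub Actions (sketch)."""
import subprocess

# mirror the workflow: run scons with the test settings, then the unit tests
subprocess.run(
    ['docker', 'run', '--rm', 'gha_image', '/bin/bash', '-c',
     'scons settings=testfiles/settings.conf && python -m unittest'],
    check=True)  # raises CalledProcessError if either command fails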
diff --git a/Dockerfile b/Dockerfile
index b643370..0254a34 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,23 +1,20 @@
 FROM python:3.11-bookworm
-
-RUN apt-get update && apt-get upgrade -y && apt-get install --assume-yes --no-install-recommends \
-    ca-certificates git wget
-
-ADD requirements.txt /usr/local/share/ya16sdb/
-ADD bin/bootstrap.sh /usr/local/share/ya16sdb/bin/
-
+ARG MEFETCH_API_KEY MEFETCH_EMAIL YA16SDB_VERSION
+ENV \
+MEFETCH_API_KEY=${MEFETCH_API_KEY} \
+MEFETCH_EMAIL=${MEFETCH_EMAIL} \
+PIP_ROOT_USER_ACTION=ignore \
+SCONSFLAGS="--file /usr/local/share/ya16sdb/SConstruct" \
+YA16SDB_VERSION=${YA16SDB_VERSION}
+RUN apt-get update && apt-get upgrade -y && apt-get install -y ca-certificates wget
 WORKDIR /usr/local/share/ya16sdb/
-RUN ["/bin/bash", "-c", "bin/bootstrap.sh /usr/local/"]
-
-ADD .git/ /usr/local/share/ya16sdb/.git/
-ADD data/ /usr/local/share/ya16sdb/data/
-ADD bin/ /usr/local/share/ya16sdb/bin/
-ADD SConstruct ncbi.conf /usr/local/share/ya16sdb/
-
-RUN find /usr/local/share/ya16sdb/ -type f -exec chmod 644 {} \; && \
-find /usr/local/share/ya16sdb/ -type d -exec chmod 755 {} \; && \
-find /usr/local/share/ya16sdb/bin/ -type f -exec chmod 755 {} \;
-
-ENV SCONSFLAGS="--file /usr/local/share/ya16sdb/SConstruct"
-
+COPY requirements.txt SConstruct ncbi.conf ./
+COPY testfiles/ ./testfiles/
+COPY tests/ ./tests/
+COPY bin/ ./bin/
+COPY data/ ./data/
+RUN bin/bootstrap.sh /usr/local/
+RUN find . -type f -exec chmod 644 {} \; && \
+find . -type d -exec chmod 755 {} \; && \
+find ./bin/ -type f -exec chmod 755 {} \;
 
 CMD ["scons", "--dry-run"]
diff --git a/SConstruct b/SConstruct
index ad579ad..e56ef50 100644
--- a/SConstruct
+++ b/SConstruct
@@ -8,57 +8,32 @@ import configparser
 import csv
 import errno
 import os
-import SCons
 import sys
 import time
 import warnings
 
-from SCons.Script import ARGUMENTS, GetBuildFailures, Depends
+from SCons.Script import ARGUMENTS, Environment, GetBuildFailures, Depends
 
+this_dir = os.path.dirname((lambda x: x).__code__.co_filename)
 venv = os.environ.get('VIRTUAL_ENV')
 if not venv:
     warnings.warn('No active virtualenv detected, using system environment')
 
-if not os.path.exists('settings.conf'):
-    sys.exit("Can't find settings.conf")
+settings_file = ARGUMENTS.get('settings', 'settings.conf')
+if not os.path.isfile(settings_file):
+    sys.exit("Can't find " + settings_file)
-
-
-class Environment(SCons.Environment.Environment):
-    def __init__(self, singularity=None, verbosity=0, **kws):
-        self.singularity = singularity
-        self.verbosity = verbosity
-        SCons.Environment.Environment.__init__(self, **kws)
-
-    def Command(self,
-                target,
-                source,
-                action,
-                singularity=None,
-                options=None,
-                **kws):
-        # if docker and self.docker:  # TODO: implement this
-        if singularity and self.singularity:
-            self.Depends(target, singularity)
-        sactions = []
-        for a in self.Flatten(action):
-            sa = self.singularity
-            sa = '{} --bind $pipeline'.format(sa)
-            if options:
-                sa = '{} {}'.format(sa, options)
-            sa = '{} {} {}'.format(sa, singularity, a)
-            sactions.append(VirtualAction(a, sa, self.verbosity))
-        action = sactions
-        return SCons.Environment.Environment.Command(
-            self, target, source, action, **kws)
-
-
-class VirtualAction(SCons.Action.CommandAction):
-    def __init__(self, command, singularity, verbosity, **kw):
-        self.command = singularity if verbosity else command
-        SCons.Action.CommandAction.__init__(self, singularity, **kw)
-
-    def print_cmd_line(self, _, target, source, env):
-        c = env.subst(self.command, SCons.Subst.SUBST_RAW, target, source)
-        SCons.Action.CommandAction.print_cmd_line(self, c, target, source, env)
+conf = configparser.ConfigParser()
+conf.optionxform = str  # preserve case of option names
+conf.read(settings_file)
+settings = conf['DEFAULT']
+if 'ncbi_conf' in settings:
+    conf.read(settings['ncbi_conf'])
+else:
+    conf.read(os.path.join(this_dir, 'ncbi.conf'))
+true_vals = ['t', 'y', '1']
+release = ARGUMENTS.get('release', 'no').lower()[0] in true_vals
+out = os.path.join(settings['outdir'], time.strftime('%Y%m%d'))
+log = os.path.join(out, 'log.txt')
+cachedir = settings['cachedir']
 
 
 def blast_db(env, sequence_file, output_base):
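The settings handling above leans on two configparser details: values in [DEFAULT] interpolate into one another, and optionxform = str keeps option names like MEFETCH_API_KEY case-sensitive (configparser lowercases them by default). A small illustration with inline placeholder values:

import configparser

conf = configparser.ConfigParser()
conf.optionxform = str  # keep option names case-sensitive
conf.read_string("""
[DEFAULT]
outdir=test_output
cachedir=%(outdir)s/.cache

[ENV]
MEFETCH_WORKERS=100
""")
settings = conf['DEFAULT']
assert settings['cachedir'] == 'test_output/.cache'  # %(outdir)s interpolated
assert 'MEFETCH_WORKERS' in conf['ENV']              # case preserved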
@@ -69,9 +44,8 @@ def blast_db(env, sequence_file, output_base):
     blast_out = env.Command(
         target=[output_base + ext for ext in extensions],
         source=sequence_file,
-        action=('makeblastdb -dbtype nucl '
-                '-in $SOURCE -out ' + output_base),
-        singularity=settings['blast'])
+        action='makeblastdb -dbtype nucl '
+               '-in $SOURCE -out ' + output_base)
     env.Command(
         target=output_base,
         source=blast_out,
@@ -86,12 +60,10 @@ def taxonomy(fa, info, path):
     taxtable = env.Command(
         target=os.path.join(path, 'taxonomy.csv'),
         source=info,
-        action=['taxit taxtable '
-                '--seq-info $SOURCE '
-                '--out $TARGET '
-                '$tax_url'],
-        singularity=settings['taxit'],
-        options='--bind $tax_url')
+        action='taxit taxtable '
+               '--seq-info $SOURCE '
+               '--out $TARGET '
+               '$tax_url')
 
     """
     Taxtable output replacing tax_ids with taxnames
@@ -99,8 +71,7 @@ def taxonomy(fa, info, path):
     """
     lineages = env.Command(
         target=os.path.join(path, 'lineages.csv'),
         source=[taxtable, info],
-        action='taxit lineage_table --csv-table $TARGET $SOURCES',
-        singularity=settings['taxit'])
+        action='taxit lineage_table --csv-table $TARGET $SOURCES')
 
     """
     Mothur output - https://mothur.org/wiki/Taxonomy_File
@@ -108,24 +79,13 @@ def taxonomy(fa, info, path):
     """
     mothur = env.Command(
         target=os.path.join(path, 'lineages.txt'),
         source=[taxtable, info],
-        action='taxit lineage_table --taxonomy-table $TARGET $SOURCES',
-        singularity=settings['taxit'])
+        action='taxit lineage_table --taxonomy-table $TARGET $SOURCES')
 
     blast = blast_db(env, fa, os.path.join(path, 'blast'))
 
     return taxtable, lineages, mothur, blast
 
 
-true_vals = ['t', 'y', '1']
-release = ARGUMENTS.get('release', 'no').lower()[0] in true_vals
-test = ARGUMENTS.get('test', 'no').lower()[0] in true_vals
-absolute_dir = os.path.dirname((lambda x: x).__code__.co_filename)
-conf = configparser.SafeConfigParser()
-conf.read(['settings.conf', os.path.join(absolute_dir, 'ncbi.conf')])
-settings = conf['TEST'] if test else conf['DEFAULT']
-out = os.path.join(settings['outdir'], time.strftime('%Y%m%d'))
-cachedir = settings['cachedir']
-
 cfiles = {
     'genbank_cache': os.path.join(cachedir, 'records.gb'),
     'outliers_cache': os.path.join(cachedir, 'filter_outliers.csv'),
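With the Singularity wrapper classes gone, the helpers above are plain SCons env.Command calls: each declares targets, sources, and a shell action, and SCons re-runs the action only when a source changes. A minimal standalone sketch of the same pattern (hypothetical hello.txt target, not part of the pipeline):

from SCons.Script import Environment

env = Environment()
env.Decider('MD5-timestamp')  # only checksum a source if its timestamp changed
env.Command(
    target='hello.txt',
    source=None,
    action='echo hello > $TARGET')  # $TARGET is substituted by SCons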
@@ -145,46 +105,32 @@ for k, v in cfiles.items():
         os.makedirs(d)
     open(v, 'w').close()
 
-environment_variables = dict(
-    os.environ,
-    PATH=':'.join([
-        os.path.join(absolute_dir, 'bin'),
-        (os.path.join(venv, 'bin') if venv else ''),
-        '/usr/local/bin',
-        '/usr/bin',
-        '/bin']),
-)
-
+# use the default SCons ENV and extend it below
 env = Environment(
-    ENV=environment_variables,
-    log=os.path.join(out, 'log.txt'),
+    log=log,
     out=out,
-    pipeline=absolute_dir,
-    shell='bash',
+    pipeline=this_dir,
     tax_url=os.path.abspath(settings['taxonomy']),
-    verbosity=1,
     **settings
 )
+env.PrependENVPath('PATH', os.path.join(this_dir, 'bin'))
 
-env.EnsureSConsVersion(3, 0, 5)
-env.Decider('MD5')
+for k, v in os.environ.items():
+    if k.startswith('MEFETCH_'):
+        env['ENV'][k] = v
+env['ENV'].update(conf['ENV'])
+env['ENV']['MEFETCH_DB'] = 'nucleotide'
+env['ENV']['MEFETCH_LOG'] = log
+env['ENV']['MEFETCH_MODE'] = 'text'
 
-mefetch = ('mefetch -vv '
-           '-api-key $api_key '  # FIXME: what happens if no $api_key?
-           '-db nucleotide '
-           '-email $email '
-           '-log $log '
-           '-max-retry -1 '
-           '-mode text '
-           '-proc $nreq '
-           '-retry $retry')
+env.Decider('MD5-timestamp')
 
 classified = env.Command(
     source=None,
     target='$out/ncbi/classified.txt',
-    action=['esearch -db nucleotide -query "$classified" | ' +
-            mefetch + ' -format acc -out $TARGET'],
-    singularity=settings['eutils'])
+    action='esearch -db nucleotide -query "$classified" | '
+           'mefetch -vv -format acc -out $TARGET -reqs 3'
+    )
 
 """
 Candidatus Saccharibacteria
@@ -193,9 +139,8 @@ https://gitlab.labmed.uw.edu/molmicro/mkrefpkg/issues/36
 """
 tm7 = env.Command(
     source=None,
     target='$out/ncbi/tm7.txt',
-    action=['esearch -db nucleotide -query "$tm7" | ' +
-            mefetch + ' -format acc -out $TARGET'],
-    singularity=settings['eutils'])
+    action='esearch -db nucleotide -query "$tm7" | '
+           'mefetch -vv -format acc -out $TARGET')
 
 """
 Check the cache for last download_date and download list of modified
@@ -211,11 +156,9 @@ else:
     modified = env.Command(
         source=None,
         target='$out/ncbi/modified.txt',
-        action=[
-            'esearch -db nucleotide -query "($classified OR $tm7) '
-            'AND $download_date[Modification Date] : 3000[Modification Date]" | ' +
-            mefetch + ' -format acc -out $TARGET'],
-        singularity=settings['eutils'])
+        action='esearch -db nucleotide -query "($classified OR $tm7) AND '
+               '$download_date[Modification Date] : 3000[Modification Date]" | '
+               'mefetch -vv -format acc -out $TARGET -reqs 3')
 
 """
 type strains records
@@ -225,9 +168,8 @@ http://www.ncbi.nlm.nih.gov/news/01-21-2014-sequence-by-type/
 """
 types = env.Command(
     source=None,
     target='$out/ncbi/types.txt',
-    action=['esearch -db nucleotide -query "$types" | ' +
-            mefetch + ' -format acc -out $TARGET'],
-    singularity=settings['eutils'])
+    action='esearch -db nucleotide -query "$types" | '
+           'mefetch -vv -format acc -out $TARGET -reqs 3')
 
 """
 Trim accession2taxid with 16s records and update taxids
@@ -240,12 +182,10 @@ accession2taxid = env.Command(
 accession2taxid = env.Command(
     target='$out/ncbi/accession2update_taxids.csv',
     source=accession2taxid,
-    action=['taxit update_taxids '
-            '--outfile $TARGET '
-            '--unknown-action drop '
-            '$SOURCE $tax_url'],
-    singularity=settings['taxit'],
-    options='--bind $tax_url')
+    action='taxit update_taxids '
+           '--outfile $TARGET '
+           '--unknown-action drop '
+           '$SOURCE $tax_url')
 
 """
 Create a list of cached records removing:
@@ -273,18 +213,32 @@ download = env.Command(
     source=[accession2taxid, cache, settings['do_not_download']],
     action='download.py --out $TARGET $SOURCES')
 
+coordinates = env.Command(
+    target='$out/ncbi/genbank.csv',
+    source=download,
+    action='mefetch -vv '
+           '-failed $out/ncbi/ft_failed.txt '
+           '-format ft '
+           '-id $SOURCE '
+           '-retmax 10 '
+           '| ftract -feature "rrna:product:16S ribosomal RNA" -out $TARGET'
+    )
+env.Precious(coordinates)
+
 """
 download genbank records
+
+-retmax 1 when reading from -csv file
 """
 gbs = env.Command(
     target='$out/ncbi/download.gb',
-    source=download,
-    action=[mefetch + ' -format ft -id $SOURCE -retmax 1 | '
-            'ftract -feature "rrna:product:16S ribosomal RNA" '
-            '-log $log '
-            '-on-error continue '
-            '-min-length 1200 | ' +
-            mefetch + ' -csv -format gbwithparts -out $TARGET -retmax 1'])
+    source=coordinates,
+    action='mefetch -vv '
+           '-csv '
+           '-failed $out/ncbi/failed.gb '
+           '-format gb '
+           '-id $SOURCE '
+           '-out $TARGET')
 
 today = time.strftime('%d-%b-%Y')
 fa, seq_info, pubmed_info, references, refseq_info = env.Command(
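Note that the esearch | mefetch actions above no longer pass -db, -email, -log, or retry flags; the SConstruct instead exports MEFETCH_* variables through the command environment (the env['ENV'] block). A sketch of that layering, under the assumption that mefetch falls back to MEFETCH_* environment variables for unset flags:

import os

ENV = {}  # stand-in for SCons env['ENV']
# 1. MEFETCH_* values from the calling environment (e.g. the Docker ENV
#    baked in from the MEFETCH_EMAIL/MEFETCH_API_KEY build args)
ENV.update({k: v for k, v in os.environ.items() if k.startswith('MEFETCH_')})
# 2. the [ENV] section of the settings file
ENV.update({'MEFETCH_MAX_RETRY': '-1', 'MEFETCH_RETRY': '10000',
            'MEFETCH_WORKERS': '100'})
# 3. values the pipeline always enforces win last
ENV.update({'MEFETCH_DB': 'nucleotide', 'MEFETCH_MODE': 'text'})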
@@ -322,12 +276,10 @@ fa, seq_info = env.Command(
     target=['$out/ncbi/extract_genbank/update_taxids/seqs.fasta',
             '$out/ncbi/extract_genbank/update_taxids/seq_info.csv'],
     source=[fa, seq_info],
-    action=['taxit update_taxids '
-            '--unknown-action drop '
-            '${SOURCES[1]} $tax_url | '
-            'partition_refs.py ${SOURCES[0]} - $TARGETS'],
-    singularity=settings['taxit'],
-    options='--bind $tax_url')
+    action='taxit update_taxids '
+           '--unknown-action drop '
+           '${SOURCES[1]} $tax_url | '
+           'partition_refs.py ${SOURCES[0]} - $TARGETS')
 
 """
 cmsearch new sequences against rfam model
@@ -335,9 +287,8 @@ cmsearch = env.Command(
 cmsearch = env.Command(
     target='$out/ncbi/extract_genbank/update_taxids/cmsearch/table.tsv',
     source=['$pipeline/data/SSU_rRNA_bacteria.cm', fa],
-    action=('cmsearch --cpu 14 -E 0.01 --hmmonly -o /dev/null '
-            '--tblout $TARGET $SOURCES || true'),
-    singularity=settings['infernal'])
+    action='cmsearch --cpu 14 -E 0.01 --hmmonly -o /dev/null '
+           '--tblout $TARGET $SOURCES || true')
 
 """
 Fix record orientations
@@ -397,9 +348,7 @@ env.Command(
 taxtable = env.Command(
     target='$out/taxonomy.csv',
     source=seq_info,
-    action='taxit -v taxtable --seq-info $SOURCE --out $TARGET $tax_url',
-    singularity=settings['taxit'],
-    options='--bind $tax_url')
+    action='taxit -v taxtable --seq-info $SOURCE --out $TARGET $tax_url')
 
 """
 Map for WGS records without a refseq assembly accession
@@ -407,13 +356,13 @@ asm = env.Command(
 asm = env.Command(
     target='$out/ani/assembly_summary_genbank.txt',
     source=None,
-    action=('wget '
-            '--output-document $TARGET '
-            '--quiet '
-            '--retry-on-http-error 403 '
-            '--tries 100 '
-            'https://ftp.ncbi.nlm.nih.gov/'
-            'genomes/genbank/assembly_summary_genbank.txt'))
+    action='wget '
+           '--output-document $TARGET '
+           '--quiet '
+           '--retry-on-http-error 403 '
+           '--tries 100 '
+           'https://ftp.ncbi.nlm.nih.gov/'
+           'genomes/genbank/assembly_summary_genbank.txt')
 
 """
 The ANI tax check report
@@ -421,13 +370,13 @@ ani = env.Command(
 ani = env.Command(
     target='$out/ani/ANI_report_prokaryotes.txt',
     source=None,
-    action=('wget '
-            '--output-document $TARGET '
-            '--quiet '
-            '--retry-on-http-error 403 '
-            '--tries 100 '
-            'https://ftp.ncbi.nlm.nih.gov/'
-            'genomes/ASSEMBLY_REPORTS/ANI_report_prokaryotes.txt'))
+    action='wget '
+           '--output-document $TARGET '
+           '--quiet '
+           '--retry-on-http-error 403 '
+           '--tries 100 '
+           'https://ftp.ncbi.nlm.nih.gov/'
+           'genomes/ASSEMBLY_REPORTS/ANI_report_prokaryotes.txt')
 
 """
 Create feather file and initial columns
@@ -475,14 +424,14 @@ type_fa, type_info = env.Command(
     target=['$out/dedup/1200bp/types/seqs.fasta',
             '$out/dedup/1200bp/types/seq_info.csv'],
     source=[fa, feather],
-    action=['partition_refs.py '
-            '--drop-duplicate-sequences '
-            '--is_species '
-            '--is_type '
-            '--is_valid '
-            '--min-length 1200 '
-            '--prop-ambig-cutoff 0.01 '
-            '$SOURCES $TARGETS'])
+    action='partition_refs.py '
+           '--drop-duplicate-sequences '
+           '--is_species '
+           '--is_type '
+           '--is_valid '
+           '--min-length 1200 '
+           '--prop-ambig-cutoff 0.01 '
+           '$SOURCES $TARGETS')
 
 type_tax, type_lineages, types_mothur, type_blast = taxonomy(
     type_fa, type_info, '$out/dedup/1200bp/types/')
@@ -494,14 +443,14 @@ fa, seq_info = env.Command(
     target=['$out/dedup/1200bp/named/seqs.fasta',
             '$out/dedup/1200bp/named/seq_info.csv'],
     source=[fa, feather],
-    action=('partition_refs.py '
-            '--drop-duplicate-sequences '
-            '--is_species '
-            '--is_valid '
-            '--min-length 1200 '
-            '--prop-ambig-cutoff 0.01 '
-            '--species-cap %(species_cap)s '
-            '${SOURCES[:2]} $TARGETS' % settings))
+    action='partition_refs.py '
+           '--drop-duplicate-sequences '
+           '--is_species '
+           '--is_valid '
+           '--min-length 1200 '
+           '--prop-ambig-cutoff 0.01 '
+           '--species-cap %(species_cap)s '
+           '${SOURCES[:2]} $TARGETS' % settings)
 
 named = fa
@@ -539,8 +488,7 @@ fa, details = env.Command(
         '--strategy cluster '
         '--threads-per-job 14 '
         '${SOURCES[:3]}',
-        'cp ${TARGETS[1]} ' + cfiles['outliers_cache']],
-    singularity=settings['deenurp'])
+        'cp ${TARGETS[1]} ' + cfiles['outliers_cache']])
 
 """
 add distance metrics to feather file
@@ -567,16 +515,15 @@ filtered_type_tax, filtered_type_lineages, filtered_type_mothur, _ = taxonomy(
 filtered_type_hits = env.Command(
     target='$out/dedup/1200bp/named/types_vsearch.tsv',
     source=[named, filtered_type_fa],
-    action=('vsearch --usearch_global ${SOURCES[0]} '
-            '--blast6out $TARGET '
-            '--db ${SOURCES[1]} '
-            '--id 0.75 '
-            '--maxaccepts 5 '
-            '--self '  # reject same sequence hits
-            '--strand plus '
-            '--threads 14 '
-            '--top_hits_only'),
-    singularity=settings['vsearch'])
+    action='vsearch --usearch_global ${SOURCES[0]} '
+           '--blast6out $TARGET '
+           '--db ${SOURCES[1]} '
+           '--id 0.75 '
+           '--maxaccepts 5 '
+           '--self '  # reject same sequence hits
+           '--strand plus '
+           '--threads 14 '
+           '--top_hits_only')
 
 """
 This output will be used in the filter plots
@@ -587,13 +534,13 @@ filtered_type_classifications = env.Command(
     filtered_type_info,
     filtered_type_tax,
     os.path.join('$pipeline', 'data/classifier_thresholds.csv')],
-    action=['classify -vv '
-            '--lineages ${SOURCES[2]} '
-            '--rank-thresholds ${SOURCES[3]} '
-            '--seq-info ${SOURCES[1]} '
-            '--starred 101 '
-            '--out $TARGET '
-            '${SOURCES[0]}'])
+    action='classify -vv '
+           '--lineages ${SOURCES[2]} '
+           '--rank-thresholds ${SOURCES[3]} '
+           '--seq-info ${SOURCES[1]} '
+           '--starred 101 '
+           '--out $TARGET '
+           '${SOURCES[0]}')
 
 """
 Adds type_classification to feather file for Dash application
@@ -610,9 +557,7 @@ expand taxids into descendants
 """
 trusted_taxids = env.Command(
     target='$out/dedup/1200bp/named/filtered/trusted/taxids.txt',
     source=settings['trust'],
-    action='taxit get_descendants --out $TARGET $tax_url $SOURCE',
-    singularity=settings['taxit'],
-    options='--bind $tax_url')
+    action='taxit get_descendants --out $TARGET $tax_url $SOURCE')
 
 """
 expand taxids into descendants
@@ -620,9 +565,7 @@ dnt_ids = env.Command(
 """
 dnt_ids = env.Command(
     target='$out/dedup/1200bp/named/filtered/trusted/dnt_ids.txt',
     source=settings['do_not_trust'],
-    action='taxit get_descendants --out $TARGET $tax_url $SOURCE',
-    singularity=settings['taxit'],
-    options='--bind $tax_url')
+    action='taxit get_descendants --out $TARGET $tax_url $SOURCE')
 
 trusted = env.Command(
     target='$out/dedup/1200bp/named/filtered/trusted/trust_ids.txt',
@@ -641,17 +584,17 @@ fa, seq_info = env.Command(
     target=['$out/dedup/1200bp/named/filtered/trusted/seqs.fasta',
             '$out/dedup/1200bp/named/filtered/trusted/seq_info.csv'],
     source=[named, feather, trusted, dnt],
-    action=['partition_refs.py '
-            '--do_not_trust ${SOURCES[3]} '
-            '--drop-duplicate-sequences '
-            '--inliers '  # filter_outliers = True & is_out = False
-            '--is_species '
-            '--is_valid '
-            '--min-length 1200 '
-            '--prop-ambig-cutoff 0.01 '
-            '--species-cap %(species_cap)s '
-            '--trusted ${SOURCES[2]} '
-            '${SOURCES[:2]} $TARGETS' % settings])
+    action='partition_refs.py '
+           '--do_not_trust ${SOURCES[3]} '
+           '--drop-duplicate-sequences '
+           '--inliers '  # filter_outliers = True & is_out = False
+           '--is_species '
+           '--is_valid '
+           '--min-length 1200 '
+           '--prop-ambig-cutoff 0.01 '
+           '--species-cap %(species_cap)s '
+           '--trusted ${SOURCES[2]} '
+           '${SOURCES[:2]} $TARGETS' % settings)
 
 Depends([fa, seq_info], filter_outliers)
 
 taxtable, lineages, mothur, blast = taxonomy(
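The %(species_cap)s placeholders above are resolved by printf-style mapping interpolation before SCons ever sees the action string; the configparser section serves as the mapping, while $-style tokens such as ${SOURCES[:2]} are left for SCons to substitute. A short illustration:

import configparser

conf = configparser.ConfigParser()
conf.read_string('[DEFAULT]\nspecies_cap=5000\n')
# SectionProxy supports __getitem__, so %-interpolation works directly
action = '--species-cap %(species_cap)s ${SOURCES[:2]}' % conf['DEFAULT']
assert action == '--species-cap 5000 ${SOURCES[:2]}'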
@@ -681,16 +624,15 @@ FIXME: use named types not trusted types
 """
 named_type_hits = env.Command(
     target='$out/dedup/1200bp/named/trusted_vsearch.tsv',
     source=[named, fa],
-    action=('vsearch --usearch_global ${SOURCES[0]} '
-            '--blast6out $TARGET '
-            '--db ${SOURCES[1]} '
-            '--id 0.75 '
-            '--maxaccepts 5 '
-            '--self '  # reject same sequence hits
-            '--strand plus '
-            '--threads 14 '
-            '--top_hits_only'),
-    singularity=settings['deenurp'])
+    action='vsearch --usearch_global ${SOURCES[0]} '
+           '--blast6out $TARGET '
+           '--db ${SOURCES[1]} '
+           '--id 0.75 '
+           '--maxaccepts 5 '
+           '--self '  # reject same sequence hits
+           '--strand plus '
+           '--threads 14 '
+           '--top_hits_only')
 
 """
 Creates match_seqname, match_pct, match_version, match_species and
@@ -730,9 +672,8 @@ git version used to generate output
 """
 commit = env.Command(
     target='$out/git_version.txt',
-    source=os.path.join('$pipeline', '.git/objects'),
-    action=['(echo $$(hostname):$pipeline;'
-            'git --git-dir $pipeline/.git describe --tags --dirty) > $TARGET'])
+    source=None,
+    action='(echo $$(hostname):$pipeline;version.py) > $TARGET')
 
 
 def write_build_status():
diff --git a/bin/cmsearch.py b/bin/cmsearch.py
index f792546..f43b32b 100755
--- a/bin/cmsearch.py
+++ b/bin/cmsearch.py
@@ -39,11 +39,11 @@ def main():
     else:
         cmsearch = pandas.read_csv(
             args.cmsearch,
-            delim_whitespace=True,
             comment='#',
             dtype=dtypes,
             header=None,
             names=CMSEARCH_COLS,
+            sep='\\s+',
             usecols=dtypes.keys())
     # alignments are already sorted by bitscore quality
     cmsearch = cmsearch.drop_duplicates(subset='seqname', keep='first')
diff --git a/bin/version.py b/bin/version.py
new file mode 100755
index 0000000..c2f695e
--- /dev/null
+++ b/bin/version.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python3
+
+import argparse
+import os
+import subprocess
+import sys
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--out', type=argparse.FileType('w'), default=sys.stdout)
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+    if 'YA16SDB_VERSION' in os.environ:
+        ver = os.environ['YA16SDB_VERSION']
+    else:
+        try:
+            cmd = ['git', 'describe', '--tags', '--dirty']
+            ver = subprocess.check_output(cmd, text=True).strip()
+        except (subprocess.CalledProcessError, FileNotFoundError):
+            ver = ''
+    args.out.write(ver)
+
+
+if __name__ == '__main__':
+    sys.exit(main())
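In bin/cmsearch.py, sep='\s+' replaces delim_whitespace=True, which is deprecated in recent pandas releases (2.2+). The new bin/version.py lets the image report a version without git or a .git directory (neither is present in the rebuilt image): YA16SDB_VERSION wins when set, otherwise `git describe` is attempted. A usage sketch run from the repository root (the version string here is hypothetical):

import os
import subprocess

# in CI the YA16SDB_VERSION build arg carries the tag-derived version
env = dict(os.environ, YA16SDB_VERSION='2024.1')
result = subprocess.run(['bin/version.py'], env=env,
                        capture_output=True, text=True, check=True)
assert result.stdout == '2024.1'  # the env var takes precedence over git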
diff --git a/ncbi.conf b/ncbi.conf
index 133837d..fb67ff1 100644
--- a/ncbi.conf
+++ b/ncbi.conf
@@ -5,10 +5,3 @@
 classified=%(16s)s NOT(environmental samples[Organism] OR unclassified Bacteria[Organism])
 tm7=%(16s)s AND Candidatus Saccharibacteria[Organism]
 types=%(16s)s AND sequence_from_type[Filter]
-
-[TEST]
-classified=%(16s)s AND (%(test_ids)s)
-test_ids=txid104100[Organism] OR txid198480[Organism] OR txid333297[Organism] OR txid487838[Organism] OR txid1978400[Organism] OR txid749509[Organism] OR txid1138917[Organism] OR txid1140002[Organism] OR txid393538[Organism] OR txid1676683[Organism] OR txid1736139[Organism]
-test_tm7=txid1805375[Organism] OR txid1476577[Organism]
-tm7=%(16s)s AND %(test_tm7)s
-types=%(16s)s AND sequence_from_type[Filter] AND (%(test_ids)s OR %(test_tm7)s)
diff --git a/settings-example.conf b/settings-example.conf
index ed8f702..4d6f060 100644
--- a/settings-example.conf
+++ b/settings-example.conf
@@ -2,46 +2,31 @@
 outdir=output
 cachedir=%(outdir)s/.cache
 
-# sequences searching and downloading
-# api_key=
-email=
-nreq=3
-retry=10000
-
 # sequence sorting and maximum representative capping
 sort_by=is_type,is_published,is_refseq,ambig_count,modified_date,download_date,seqhash
 species_cap=5000
 
-path=
-
-# sequence selection filter lists
-do_not_download=%(path)s/lists/do_not_download.txt
-do_not_trust=%(path)s/lists/do_not_trust.txt
-trust=%(path)s/lists/trust.txt
+# optional sequence selection filter lists
+do_not_download=path/to/do_not_download.txt
+do_not_trust=path/to/do_not_trust.txt
+trust=path/to/trust.txt
 
 # taxonomy
-accession2taxid=%(path)s/ncbi/taxonomy/LATEST/accession2taxid.gz
-taxdmp=%(path)s/ncbi/taxonomy/LATEST/taxdmp.zip
-taxonomy=%(path)s/credentials/postgresql-ncbi_taxonomy-taxonomy_user.conf
-
-# virtual container runtime commands
-# docker=/user/bin/docker run  # TODO: implement
-# singularity=/usr/local/bin/singularity run
-
-# images
-blast=%(path)s/singularity/blast-2.10.1-singularity3.6.1.img
-deenurp=%(path)s/singularity/deenurp-v0.2.7-singularity3.4.1-dist.img
-eutils=%(path)s/singularity/ncbi-edirect-15.6-singularity3.7.3.img
-infernal=%(path)s/singularity/infernal-1.1.4-singularity3.7.1.img
-taxit=%(path)s/singularity/taxtastic-0.9.1-singularity3.6.1.img
-vsearch=%(path)s/singularity/vsearch-2.13.4.img
-
-[TEST]
-testfiles=testfiles
-accession2taxid=%(testfiles)s/accession2taxid.gz
-do_not_download=%(testfiles)s/do_not_download.txt
-do_not_trust=%(testfiles)s/do_not_trust.txt
-trust=%(testfiles)s/trust.txt
-outdir=test_output
-taxdmp=%(testfiles)s/taxdmp.zip
-taxonomy=%(testfiles)s/taxonomy.conf
+accession2taxid=path/to/accession2taxid.gz
+taxdmp=path/to/taxdmp.zip
+taxonomy=path/to/taxonomy.conf
+
+# this var is for dev purposes only
+# ncbi_conf=testfiles/ncbi.conf
+
+[ENV]
+
+# optional: an NCBI API key allows a higher mefetch request rate (-reqs)
+# MEFETCH_API_KEY=
+
+# MEFETCH_EMAIL must be set here if it is not already in os.environ
+# MEFETCH_EMAIL=
+
+MEFETCH_MAX_RETRY=-1
+MEFETCH_RETRY=10000
+MEFETCH_WORKERS=100
diff --git a/testfiles/ncbi.conf b/testfiles/ncbi.conf
new file mode 100644
index 0000000..2927bdc
--- /dev/null
+++ b/testfiles/ncbi.conf
@@ -0,0 +1,7 @@
+[DEFAULT]
+16s=16s[All Fields] AND rRNA[Feature Key] AND Bacteria[Organism] AND 1200 : 99999999999[Sequence Length]
+classified=%(16s)s AND (%(test_ids)s)
+test_ids=txid104100[Organism] OR txid198480[Organism] OR txid333297[Organism] OR txid487838[Organism] OR txid1978400[Organism] OR txid749509[Organism] OR txid1138917[Organism] OR txid1140002[Organism] OR txid393538[Organism] OR txid1676683[Organism] OR txid1736139[Organism]
+test_tm7=txid1805375[Organism] OR txid1476577[Organism]
+tm7=%(16s)s AND %(test_tm7)s
+types=%(16s)s AND sequence_from_type[Filter] AND (%(test_ids)s OR %(test_tm7)s)
diff --git a/testfiles/settings.conf b/testfiles/settings.conf
new file mode 100644
index 0000000..eb129d0
--- /dev/null
+++ b/testfiles/settings.conf
@@ -0,0 +1,31 @@
+[DEFAULT]
+outdir=test_output
+cachedir=%(outdir)s/.cache
+
+# sequence sorting and maximum representative capping
+sort_by=is_type,is_published,is_refseq,ambig_count,modified_date,download_date,seqhash
+species_cap=5000
+
+# optional sequence selection filter lists
+do_not_download=testfiles/do_not_download.txt
+do_not_trust=testfiles/do_not_trust.txt
+trust=testfiles/trust.txt
+
+accession2taxid=testfiles/accession2taxid.gz
+taxdmp=testfiles/taxdmp.zip
+taxonomy=testfiles/taxonomy.conf
+
+# only set this var for dev or testing purposes
+ncbi_conf=testfiles/ncbi.conf
+
+[ENV]
+
+# optional: an NCBI API key allows a higher mefetch request rate (-reqs)
+# MEFETCH_API_KEY=
+
+# MEFETCH_EMAIL must be set here if it is not already in os.environ
+# MEFETCH_EMAIL=
+
+MEFETCH_MAX_RETRY=-1
+MEFETCH_RETRY=10000
+MEFETCH_WORKERS=100
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..c2443ab
--- /dev/null
+++ b/tests/__init__.py
@@ -0,0 +1,52 @@
+import unittest
+import os
+
+
+class TestSConsPipelineOutput(unittest.TestCase):
+    def test_pipeline_output(self):
+        output_dir = 'test_output'
+
+        output_files = [
+            'blast',
+            'lineages.csv',
+            'seqs.fasta',
+            'seq_info.csv',
+            'taxonomy.csv',
+        ]
+
+        expected = {}
+        for date_dir in os.listdir(output_dir):
+            if len(date_dir) == 8 and date_dir.isnumeric():
+                out = f'{output_dir}/{date_dir}'
+                self.assertTrue(os.path.isfile(out + '/SUCCESS'))
+                expected.update({
+                    f'{out}/dedup/1200bp/types': output_files,
+                    f'{out}/dedup/1200bp/named': output_files,
+                    f'{out}/dedup/1200bp/named/filtered': [
+                        'blast',
+                        'taxonomy.csv',
+                        'outliers.csv',
+                        'unsorted.fasta'],
+                    f'{out}/dedup/1200bp/named/filtered/trusted': output_files,
+                    f'{out}/dedup/1200bp/named/filtered/types': output_files,
+                })
+        self.assertTrue(expected, 'no dated output directory found')
+
+        # Verify the folder structure and contents
+        for folder_path, contents in expected.items():
+            self.assertTrue(
+                os.path.isdir(folder_path),
+                f"Folder {folder_path} does not exist")
+            actual_contents = os.listdir(folder_path)
+            for expected_file in contents:
+                self.assertIn(
+                    expected_file,
+                    actual_contents,
+                    f"{expected_file} not found in {folder_path}")
+                file = os.path.join(folder_path, expected_file)
+                self.assertTrue(os.path.isfile(file))
+                self.assertTrue(os.stat(file).st_size > 0)  # not empty
+
+
+if __name__ == '__main__':
+    unittest.main()
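Because tests/ is a package, the suite above is picked up by plain `python -m unittest` (which is what the workflow runs after the scons pass). The equivalent explicit invocation, e.g. for running it under a debugger or with more verbosity:

import unittest

# discover tests/__init__.py from the repository root and run it verbosely
suite = unittest.defaultTestLoader.discover('.')
unittest.TextTestRunner(verbosity=2).run(suite)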