diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..0b1e1e7 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,27 @@ +**/__pycache__ +**/.venv +**/.classpath +**/.dockerignore +**/.env +**/.git +**/.gitignore +**/.project +**/.settings +**/.toolstarget +**/.vs +**/.vscode +**/*.*proj.user +**/*.dbmdl +**/*.jfm +**/bin +**/charts +**/docker-compose* +**/compose* +**/Dockerfile* +**/node_modules +**/npm-debug.log +**/obj +**/secrets.dev.yaml +**/values.dev.yaml +LICENSE +README.md diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..dbe5e8f --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,32 @@ +name: publish + + +on: [push] + + +jobs: + + publish-itsxpress-image: + + runs-on: ubuntu-latest + + + steps: + + - uses: actions/checkout@v3 + + + - name: Build the itsxpress Docker image + + run: | + + echo $CR_PAT | docker login ghcr.io -u seina001 --password-stdin + + env: + + CR_PAT: ${{ secrets.CR_PAT }} + + - name: Publish image + run: | + docker build . 
--tag ghcr.io/usda-ars-gbru/itsxpress:latest + docker push ghcr.io/usda-ars-gbru/itsxpress:latest diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml new file mode 100644 index 0000000..435a3e9 --- /dev/null +++ b/.github/workflows/python-package-conda.yml @@ -0,0 +1,55 @@ +name: GitHub Actions Weekly Build (V2) + +on: + push: + branches: + - version2 + schedule: + - cron: "0 0 */7 * *" + +jobs: + build-linux: + runs-on: ubuntu-latest + strategy: + max-parallel: 5 + + steps: + - uses: actions/checkout@v3 + - name: Set up Python 3.7 + uses: actions/setup-python@v3 + with: + python-version: '3.8.13' + - run: | + sudo apt update + sudo apt install gcc-10 g++-10 + - name: Add conda to system path + run: | + # $CONDA is an environment variable pointing to the root of the miniconda directory + echo $CONDA/bin >> $GITHUB_PATH + - name: Initiate conda channels + uses: conda-incubator/setup-miniconda@v2 + with: + activate-environment: testenv + python-version: 3.8.13 + channels: conda-forge,bioconda + allow-softlinks: true + channel-priority: flexible + show-channel-urls: true + use-only-tar-bz2: true + + - name: Set up environment + run: | + wget https://data.qiime2.org/distro/core/qiime2-2022.8-py38-linux-conda.yml + conda env create -n qiime2-2022.8 --file qiime2-2022.8-py38-linux-conda.yml + - name: Install Dependencies + run: | + source activate qiime2-2022.8 + conda install biopython + pip install pyzstd + pip install pytest + pip install . 
+ + - name: Test with pytest + run: | + source activate qiime2-2022.8 + pytest diff --git a/.gitignore b/.gitignore index 26a261c..96cdb9d 100644 --- a/.gitignore +++ b/.gitignore @@ -39,7 +39,7 @@ __pycache__/ .Python build/ develop-eggs/ -dist/ +#dist/ downloads/ eggs/ .eggs/ @@ -52,7 +52,6 @@ wheels/ *.egg-info/ .installed.cfg *.egg -MANIFEST # PyInstaller # Usually these files are written by a python script from a template diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..8b6acda --- /dev/null +++ b/Dockerfile @@ -0,0 +1,21 @@ +FROM condaforge/mambaforge:4.10.3-5 + +# # Install system dependencies +RUN apt-get update && \ + apt-get install -y build-essential && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +LABEL org.opencontainers.image.source="https://github.com/usda-ars-gbru/itsxpress" +# Install conda dependencies +RUN conda update mamba -c conda-forge +RUN mamba install -c conda-forge python=3.8.16 +RUN mamba install -c bioconda vsearch=2.22.1 hmmer=3.1b2 + +# Copy the itsxpress package files and install dependencies +COPY . /app +WORKDIR /app +RUN pip install --no-cache-dir . + +# Set the default command to run itsxpress +CMD ["itsxpress"] diff --git a/MANIFEST.in b/MANIFEST.in index 194081e..3805e29 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,8 @@ include itsxpress/ITSx_db/HMMs/* include tests/test_data/* include tests/test_data/ex_tmpdir/* +include tests/test_main_pytest.py include LICENSE.txt +include itsxpress/citations.bib +include changelog.md +include Dockerfile \ No newline at end of file diff --git a/README.rst b/README.rst index ded4a2b..0032f2a 100644 --- a/README.rst +++ b/README.rst @@ -25,24 +25,15 @@ Author * Adam R. Rivers, US Department of Agriculture, Agricultural Research Service * Sveinn V. Einarsson, US Department of Agriculture, Agricultural Research Service - Citation -------- Rivers AR, Weber KC, Gardner TG et al. 
ITSxpress: Software to rapidly trim internally transcribed spacer sequences with quality scores for marker gene -analysis [version 1]. F1000Research 2018, 7:1418 +analysis [version 1; referees: awaiting peer review]. F1000Research 2018, 7:1418 (doi: `10.12688/f1000research.15704.1`_) .. _`10.12688/f1000research.15704.1`: https://doi.org/10.12688/f1000research.15704.1 -##### - -**This is the end of life version 1 ITSxpress. -The new version 2 of ITSxpress, has the Qiime2 plugin built in with the command line version of ITSxpress. See -main branch of ITSxpress.** - -##### - Introduction ------------- @@ -54,78 +45,65 @@ et al. (2013)`_ published software the software package ITSx_ to do this. ITSxpress is designed to support the calling of exact sequence variants rather than OTUs_. This newer method of sequence error-correction requires quality score data from each -sequence, so each input sequence must be trimmed. ITSxpress makes this possible by +sequence, so each input sequence must be trimmed. ITSXpress makes this possible by taking FASTQ data, de-replicating the sequences then identifying the start and stop -sites using HMMSearch. Results are parsed and the trimmed files are returned. The ITS1, +sites using HMMSearch. Results are parsed and the trimmed files are returned. The ITS 1, ITS2 or the entire ITS region including the 5.8s rRNA gene can be selected. ITSxpress uses the hmm model from ITSx so results are comparable. -ITSxpress is also available as a `QIIME2 Plugin`_ +ITSxpress is also a QIIME2 plugin. Starting from version 2.0.0 of ITSxpress, the QIIME2 plugin is included with +the command line version of ITSxpress. The installation method will be slightly different depending on whether +QIIME2 is being used. .. _`Bengtsson-Palme et al. (2013)`: https://doi.org/10.1111/2041-210X.12073 .. _ITSx: http://microbiology.se/software/itsx/ .. _OTUs: https://doi.org/10.1038/ismej.2017.119 .. _`QIIME2 Plugin`: https://github.com/USDA-ARS-GBRU/q2_itsxpress -.. 
_`mamba installation guide`: https://mamba.readthedocs.io/en/latest/installation.html - - -Installation -------------- - -This is the installation of the final iteration of ITSxpress version 1: (BBmap is no longer used in ITSxpress version 2): - - This version should primarily be used for reproducability with other datasets, which used ITSxpress =<1.8.1 - - The new version 2 is compatible with the newer versions of Qiime2 - - **If you want to install this iteration of ITSxpress with Qiime2, then you you need to follow the install instructions here:** `QIIME2 Plugin`_ -Since this version is no longer supported, you **must** create a new conda environment in order for the depenendencies to be compatible. +Environment with or without QIIME2 +----------------------------------- +Create a new conda environment before installing ITSxpress. +If using QIIME2, follow the installation instructions on their wiki: https://docs.qiime2.org/2022.11/install/native/ -Example on how to install and create new conda environment for this version of ITSxpress. We are using mamba because it resolves packages better and faster, but conda can be substituted. +As of now ITSxpress is compatible with version 2022.8 and 2022.11 of QIIME2 - - Information on installing mamba or micromamba (either highly recommended) can be found here: `mamba installation guide`_ +Example: .. code-block:: bash - - mamba create -n ITSxpress_V1EOL python=3.8.13 - mamba activate ITSxpress_V1EOL - #or - conda create -n ITSxpress_V1EOL python=3.8.13 - conda activate ITSxpress_V1EOL - - -ITSxpress can be installed in 3 ways: --------------------------------------- + wget https://data.qiime2.org/distro/core/qiime2-2022.11-py38-osx-conda.yml + mamba env create -n qiime2-2022.11 --file qiime2-2022.11-py38-osx-conda.yml + mamba activate qiime2-2022.11 - -1. **Bioconda:** (preferred method because it handles dependencies): +If you are only installing the command line version of ITSxpress and not QIIME2: .. 
code-block:: bash - mamba install -y -c bioconda itsxpress==1.8.1 + mamba env create -n ITSxpress + mamba activate ITSxpress -2. **Pip:** https://pypi.org/project/itsxpress/: - - If using Pip, you will need to specify the versions of the dependencies listed below before installing itsxpress +Installation +------------- +Within either conda environment, described above, ITSxpress can be installed from: -.. code-block:: bash +1. Bioconda: (preferred method because it handles dependencies, **Pip is no longer maintained for ITSxpress>=2.0.0**): - mamba install -y -c bioconda hmmer==3.1b2 - mamba install -y -c bioconda bbmap==38.69 - mamba install -y -c bioconda vsearch==2.21.1 - pip install itsxpress +.. code-block:: bash + mamba install -c bioconda itsxpress -3. **The Github repository:** https://github.com/USDA-ARS-GBRU/itsxpress +2. The Github repository: https://github.com/USDA-ARS-GBRU/itsxpress .. code-block:: bash - git clone -branch 1.8.1-EOL https://github.com/USDA-ARS-GBRU/itsxpress.git + git clone https://github.com/USDA-ARS-GBRU/itsxpress.git Dependencies ------------- -This software requires Vsearch=2.21.1, BBtools=38.69, Hmmer=3.1b2 and Biopython>=1.79. Bioconda +The software requires Vsearch, Hmmer and Biopython. Bioconda takes care of this for you so it is the preferred installation method. @@ -195,8 +173,8 @@ forward and reverse gzipped FASTQ files using two cpu threads. Return a single m itsxpress --fastq r1.fastq.gz --fastq2 r2.fastq.gz --region ITS2 \ --taxa Fungi --log logfile.txt --outfile trimmed_reads.fastq.gz --threads 2 -ITSxpress can take gzipped or un-gzipped FASTQ files and it can write gzipped or -un-gzipped FASTQ files. It expects FASTQ files to end in: .fq, .fastq, .fq.gz or fastq.gz. +ITSxpress can take uncompressed, gzipped or zstd compressed FASTQ files and it can write uncompressed, gzipped or +zstd compressed FASTQ files. It expects FASTQ files to end in: .fq, .fastq, .fq.gz, fastq.gz, .fq.zst or fastq.zst. 
Use case 2: Trimming the ITS2 region from a fungal amplicon sequencing dataset with forward and reverse gzipped FASTQ files using two cpu threads. Return a forward @@ -207,20 +185,11 @@ and reverse read files for use in Dada2. itsxpress --fastq r1.fastq.gz --fastq2 r2.fastq.gz --region ITS2 \ --taxa Fungi --log logfile.txt --outfile trimmed_reads.fastq.gz --threads 2 -ITSxpress can take gzipped or un-gzipped FASTQ files and it can write gzipped or -un-gzipped FASTQ files. It expects FASTQ files to end in: .fq, .fastq, .fq.gz or fastq.gz. +ITSxpress can take uncompressed, gzipped or zstd compressed FASTQ files and it can write uncompressed, gzipped or +zstd compressed FASTQ files. It expects FASTQ files to end in: .fq, .fastq, .fq.gz, fastq.gz, .fq.zst or fastq.zst. Use case 3: Trimming the ITS2 region from a fungal amplicon sequencing dataset with -an interleaved gzipped FASTQ files using two cpu threads. Return a single merged file for use in Deblur. - -.. code-block:: bash - - itsxpress --fastq interleaved.fastq.gz --region ITS2 --taxa Fungi \ - --log logfile.txt --outfile trimmed_reads.fastq.gz --threads 2 - - -Use case 4: Trimming the ITS2 region from a fungal amplicon sequencing dataset with an single-ended gzipped FASTQ files using two cpu threads. .. code-block:: bash @@ -231,17 +200,9 @@ an single-ended gzipped FASTQ files using two cpu threads. Single ended data is less common and may come from a dataset where the reads have already been merged. -Use case 5: Trimming the ITS1 region from a Alveolata amplicon sequencing dataset with -an interleaved gzipped FASTQ files using 8 cpu threads. - -.. 
code-block:: bash - - itsxpress --fastq interleaved.fastq.gz --region ITS1 --taxa Alveolata \ - --log logfile.txt --outfile trimmed_reads.fastq.gz --threads 8 - - License information -------------------- This software is a work of the United States Department of Agriculture, Agricultural Research Service and is released under a Creative Commons CC0 public domain attribution. +======= diff --git a/changelog.md b/changelog.md new file mode 100644 index 0000000..a8e9efd --- /dev/null +++ b/changelog.md @@ -0,0 +1,86 @@ +2.0.0 (2023-06-28) +------------------ +Release Highlights +- Removed BBmap dependency + - BBmap scripts are no longer used in the pipeline, including: + - reformat.sh (interleaved files no longer supported) + - bbmerge.sh (merging of paired-end reads now done with Vsearch --fastq_mergepairs) + - merging of paired-end reads is different between Vsearch and BBmerge, so results may differ +- Updated dereplication step for newer versions of Vsearch + - Dereplication step now done using Vsearch --fastx_uniques (derep_fulllength command no longer supports fastq files) +- Qiime2 plugin version of ITSxpress is now part of the standalone package of ITSxpress + - No longer need to install q2-itsxpress separately and will be installed if Qiime2 is already installed, otherwise only the standalone version will be installed + - Qiime2 plugin version of ITSxpress is no longer maintained after q2-itsxpress v1.8.1 +- Pacbio sequences are now supported if fastq file scores are in Illumina format +Bug Fixes +- Fixed bug where the q2-itsxpress plugin was not handling single-end reads correctly, and was looking for a reverse read file + +1.8.1 (2023-06-02) +------------------ +Release Highlights + +- This is the final version that uses BBmap scripts. The next version will remove the BBmap dependency and use Vsearch for all steps. 
+ - This version can still be found on the EOL-1.8.1 branch of the repository + +- Replaced derep_fulllength with fastx_uniques command compatible with Vsearch>=2.21.1 +- Fixed version of dependencies +- Updated pip install config file to pyproject.toml +- Updated readme usage section to reference compatible Qiime2 version +- Added read count output to log file + + +1.8.0 (2019-12-9) +----------------- +- Added support for primer sets in the reverse orientation +- Fixed a bug that could cause crashes when an intermediate file was empty + +1.7.2 (2018-11-8) +----------------- +- This release fixes issue [#8](https://github.com/USDA-ARS-GBRU/itsxpress/issues/8) + - This issue caused ITSxpress to incorrectly trim about 0.2% of read pairs. Sometimes this would result in it writing blank fastq records which would cause Qiime to detect an error and stop processing. + +1.7.0 (2018-09-12) +------------------ +New Features: + +- Support for the output of unmerged paired end files. This allows users to use Dada2 for sequence variant calling. +- The API is now documented at ReadTheDocs + +1.6.4 (2018-7-26) +----------------- +- Fix for issue validating fastq.gz files that was not solved by v1.6.3 + +1.6.3 (2018-7-25) +----------------- +- Fixed issue validating fastq.gz and added tests. + +1.6.2 (2018-7-25) +----------------- +- This release fixes an error that occasionally occurred when validating FASTQ files. ITSxpress used BBtools reformat.sh which occasionally threw an exception when validating FASTQ files due to a race condition. FASTQ file validation is now done with Biopython instead. + +1.6.1 (2018-7-19) +----------------- +- Changed the default clustering identity to 99.5%. +- Experiments with fungal soil samples showed that ITSxpress and ITSx trimmed 99.822% of reads in the ITS1 region within 2 bases of each other and 99.099% of reads in the ITS2 region within 2 bases of each other at 99.5% identity. For higher accuracy, dereplication can be run at 100% identity. 
+ + +1.6.0 (2018-7-13) +----------------- +- This release adds a new feature to cluster merged sequences at less than 100% identity. This speeds up typical dataset trimming by about 10x over previous versions depending on the sample, without major effects on trimming accuracy. This feature is controlled with the --cluster_id flag. Default behavior is now to cluster at 0.987 identity. + +1.5.6 (2018-7-3) +----------------- +- Database is now included in the release + +1.5.4 (2018-6-27) +----------------- +- Fixed bug in handling of temporary files specified with the --tempfile flag +- updated readme +- fixed issues with exception handling + + +1.5.2 (2018-6-21) +----------------- +- Fixed an indexing error causing ITS trimming to be off by 1 base. +- Fixed error when raising file not found exception +- removed old readme \ No newline at end of file diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml new file mode 100644 index 0000000..b8ba69e --- /dev/null +++ b/conda.recipe/meta.yaml @@ -0,0 +1,50 @@ +package: + name: '{{ "itsxpress"|lower }}' + version: '{{ "2.0.0" }}' + +# source +source: + git_url: https://github.com/USDA-ARS-GBRU/itsxpress.git + git_tag: version2 + +build: + noarch: python + number: 2 + entry_points: + - itsxpress=itsxpress.main:main + script: python -m pip install --no-deps --ignore-installed . 
class Dedup:
    """A class to handle deduplicated sequence data.

    To speed processing Vsearch is used to remove duplicate amplicons so that the
    start and stop sites are determined only once.

    Attributes:
        matchdict (dict): a dictionary of each sequence ID as a key and
            its representative sequence ID as a value {seq1:rep1, seq2:rep1, seq3:rep2}.
        uc_file (str): the location of the .uc file containing matching information.
        rep_file (str): The location of the representative sequences file.
        seq_file (str): The location of the complete sequences file.
        fastq (str): the location of the input fastq
        fastq2 (str): the location of optional Read2 input fastq if paired
    """

    # File name endings accepted for uncompressed paired FASTQ input.
    _PLAIN_SUFFIXES = (".fq", ".fastq")

    def __init__(self, uc_file, rep_file, seq_file, fastq=None, fastq2=None):
        self.matchdict = None
        self.uc_file = uc_file
        self.rep_file = rep_file
        self.seq_file = seq_file
        self.fastq = fastq
        self.fastq2 = fastq2
        self.parse()

    def parse(self):
        """Parse the uc data file to populate the matchdict attribute.

        'S' records map a sequence to itself, 'H' records map a sequence to
        its representative; every other record type is ignored.

        Raises:
            Exception: whatever exception occurred while reading or parsing
                the uc file; it is logged and re-raised unchanged.
        """
        try:
            with open(self.uc_file, 'r') as f:
                self.matchdict = {}
                for line in f:
                    ll = line.split()
                    if not ll:
                        # Tolerate blank lines instead of failing on ll[0].
                        continue
                    datatype = ll[0]
                    ref = ll[9]
                    seq = ll[8]
                    if datatype == 'S':
                        self.matchdict[seq] = seq
                    elif datatype == 'H':
                        self.matchdict[seq] = ref
        except Exception:
            logging.exception("Could not parse the Vsearch '.uc' file.")
            raise

    @staticmethod
    def _compression(path):
        """Return 'gz', 'zst' or 'plain' according to the file name ending."""
        if path.endswith(".gz"):
            return "gz"
        if path.endswith(".zst"):
            return "zst"
        return "plain"

    @staticmethod
    def _open_text(path, compression):
        """Open path for text reading using the given compression kind."""
        if compression == "gz":
            return gzip.open(path, 'rt')
        if compression == "zst":
            return zstd.open(path, 'rt')
        return open(path, 'r')

    def _paired_input_kind(self):
        """Return the compression kind shared by fastq and fastq2.

        Raises:
            ValueError: if the two inputs are not both gzipped, both zstd
                compressed or both uncompressed (.fq / .fastq).
        """
        names = (self.fastq, self.fastq2)
        if all(name.endswith(".gz") for name in names):
            return "gz"
        if all(name.endswith(".zst") for name in names):
            return "zst"
        if all(name.endswith(self._PLAIN_SUFFIXES) for name in names):
            return "plain"
        raise ValueError("Fastq and Fastq2 files should both be gzipped (.gz), zstd compressed (.zst) or both be uncompressed. Mixed input is not accepted.")

    def _has_valid_its(self, seq_id, itspos):
        """Return True when seq_id's representative has a usable ITS start and stop.

        A sequence is kept only if it is present in matchdict, itspos knows
        its representative, both positions are truthy and start < stop.
        """
        repseq = self.matchdict.get(seq_id)
        if repseq is None:
            return False
        try:
            start, stop, _tlen = itspos.get_position(repseq)
        except KeyError:
            # No HMM hit was recorded for the representative sequence.
            return False
        return bool(start and stop and start < stop)

    def _get_paired_seq_generator(self, zipseqgen, itspos, wri_file):
        """Filter and trim paired reads for Dada2.

        Sequences where the ITS ends could not be determined are omitted.

        Args:
            zipseqgen (obj): A zipped object with two Biopython SeqIO generators
                for the forward and reverse input sequences
            itspos (obj): An itsxpress ItsPosition object
            wri_file (bool): If true the result is produced lazily for
                writing. If false the records are materialized, empty
                trimmed sequences are reported on stdout, and iterators over
                the materialized records are returned.

        Returns:
            (tuple): two iterators of trimmed Biopython SeqRecords, forward
                and reverse.
        """
        def _trim_pair(ziprecord):
            """Trim a (forward, reverse) record pair to the ITS region."""
            record1, record2 = ziprecord
            repseq = self.matchdict[record1.id]
            start, stop, tlen = itspos.get_position(repseq)
            # The reverse read starts from the other end of the merged sequence.
            r2start = tlen - stop
            return record1[start:], record2[r2start:]

        kept = filter(lambda pair: self._has_valid_its(pair[0].id, itspos), zipseqgen)
        trimmed = map(_trim_pair, kept)

        if wri_file:
            fwd, rev = tee(trimmed, 2)
            return (a for a, _ in fwd), (b for _, b in rev)

        # Check mode: materialize once so the diagnostics do not leave the
        # caller with exhausted generators.
        pairs = list(trimmed)
        empty_fwd = [a.id for a, _ in pairs if len(a.seq) == 0]
        empty_rev = [b.id for _, b in pairs if len(b.seq) == 0]
        if empty_fwd or empty_rev:
            print("Total number of sequences that are empty Split A: ", len(empty_fwd))
            print("Sequence IDs: ")
            print(empty_fwd)
            print("Total number of sequences that are empty Split B: ", len(empty_rev))
            print("Sequence IDs: ")
            print(empty_rev)
        return iter([a for a, _ in pairs]), iter([b for _, b in pairs])

    def create_paired_trimmed_seqs(self, outfile1, outfile2, gzipped, zstd_file, itspos, wri_file):
        """Writes two FASTQ files, optionally compressed, with the reads trimmed
        to the selected region.

        Args:
            outfile1 (str): The file to write the forward sequences to.
            outfile2 (str): The file to write the reverse sequences to.
            gzipped (bool): Should the output files be gzipped?
            zstd_file (bool): Should the output files be zstd compressed?
                Only consulted when gzipped is false.
            itspos (object): an ItsPosition object
            wri_file (bool): Write the files if true; otherwise only check
                for empty trimmed sequences.

        Raises:
            ValueError: if fastq and fastq2 are not both gzipped, both zstd
                compressed or both uncompressed.
        """
        def _write_seqs(seqs, outfile):
            """Write seqs to outfile in FASTQ format with the requested compression."""
            if gzipped:
                with gzip.open(outfile, 'wt') as g:
                    SeqIO.write(seqs, g, "fastq")
            elif zstd_file:
                with zstd.open(outfile, 'wt') as g:
                    SeqIO.write(seqs, g, "fastq")
            else:
                with open(outfile, 'w') as g:
                    SeqIO.write(seqs, g, "fastq")

        # Validate before opening anything so mixed input fails cleanly.
        kind = self._paired_input_kind()
        with self._open_text(self.fastq, kind) as f, self._open_text(self.fastq2, kind) as g:
            zipseqgen = zip(SeqIO.parse(f, 'fastq'), SeqIO.parse(g, 'fastq'))
            seqs1, seqs2 = self._get_paired_seq_generator(zipseqgen, itspos, wri_file)
            if wri_file:
                _write_seqs(seqs1, outfile1)
                _write_seqs(seqs2, outfile2)

    def _get_trimmed_seq_generator(self, seqgen, itspos, wri_file):
        """Filter and trim merged/single reads for Deblur.

        Sequences where the ITS ends could not be determined are omitted.

        Args:
            seqgen (obj): A Biopython SeqIO generator of all input sequences
            itspos (obj): An itsxpress ItsPosition object
            wri_file (bool): If true a lazy generator is returned. If false
                the records are materialized, empty trimmed sequences are
                reported on stdout, and an iterator over the materialized
                records is returned.

        Returns:
            (obj): An iterator that yields filtered, trimmed sequence records.
        """
        def _trim(record):
            """Trim the record down to the selected ITS region."""
            repseq = self.matchdict[record.id]
            start, stop, _tlen = itspos.get_position(repseq)
            return record[start:stop]

        kept = filter(lambda record: self._has_valid_its(record.id, itspos), seqgen)
        trimmed = map(_trim, kept)
        if wri_file:
            return trimmed

        records = list(trimmed)
        empty_ids = [record.id for record in records if len(record.seq) == 0]
        if empty_ids:
            print("Total number of sequences that are empty: ", len(empty_ids))
            print("Sequence IDs: ")
            print(empty_ids)
        return iter(records)

    def _write_single(self, seqs, outfile, gzipped, zstd_file):
        """Write seqs to outfile as FASTQ, compressing via a temp file if requested.

        Compressed output is first written uncompressed to 'temp.fa' inside
        self.tempdir and then copied into the compressed file.
        """
        if not (gzipped or zstd_file):
            with open(outfile, 'w') as g:
                SeqIO.write(seqs, g, "fastq")
            return
        # self.tempdir always exists; the raw tempdir argument may be None
        # or point at a missing directory.
        tempf = os.path.join(self.tempdir, 'temp.fa')
        with open(tempf, 'w') as g:
            SeqIO.write(seqs, g, "fastq")
        opener = gzip.open if gzipped else zstd.open
        with open(tempf, 'rb') as f_in, opener(outfile, 'wb') as f_out:
            f_out.writelines(f_in)

    def create_trimmed_seqs(self, outfile, gzipped, zstd_file, itspos, wri_file, tempdir=None):
        """Creates a FASTQ file, optionally compressed, with the reads trimmed to the
        selected region.

        Args:
            outfile (str): The file to write the sequences to.
            gzipped (bool): Should the files be gzipped?
            zstd_file (bool): Should the files be zstd compressed? (.zst files)
            itspos (object): an ItsPosition object
            wri_file (bool): Should file be written or checked for empty sequences?
            tempdir (str): Optional parent directory for the temporary
                directory; the default location is used when it is missing
                or does not exist.
        """
        if tempdir and not os.path.exists(tempdir):
            logging.warning("Specified location for tempfile ({}) does not exist, using default location.".format(tempdir))
            tempdir = None
        self.tempdir = tempfile.mkdtemp(prefix='itsxpress_', dir=tempdir or None)

        kind = self._compression(self.seq_file)
        with self._open_text(self.seq_file, kind) as f:
            seqgen = SeqIO.parse(f, 'fastq')
            seqs = self._get_trimmed_seq_generator(seqgen, itspos, wri_file)
            if wri_file:
                self._write_single(seqs, outfile, gzipped, zstd_file)
logging + +class ItsPosition: + """Class for ITS positional information derived from hmmserach domtable files. + Args: + domtable (str): the path locating the domtable file from HMMER 3 hmmsearch. + region (str): The region of the ITS to extract choices: ["ITS1", "ITS2", "ALL"]. + Attributes: + ddict (dict): A dictionary holding the scores and start and stop + positions for the selected segment of each sequence. + Example: {sample:{left:{score:31, pos:15}, right:{score:32, pos:354}, tlen:449} + leftprefix (str): the left prefix to search for (set by type variable). + rightprefix (str): the right prefix to search for (set by type variable). + """ + def _score(self, sequence, stype, score, from_pos, to_pos, tlen): + """Evaluates scores and positions from the new line of a domtable file and + updates ddict if necessary. + Args: + sequence (str): The name of the sequence. + stype (str): {'left', 'right'} + score (int): The bit score from HMMSearch. + to_pos (int): the ending position of the left sequence. + from_pos (int): The beginning position of the right sequence. + tlen (int): the length of the sequence + """ + + if stype in self.ddict[sequence]: + if score > self.ddict[sequence][stype]["score"]: + self.ddict[sequence][stype]["score"] = score + self.ddict[sequence][stype]["to_pos"] = to_pos + self.ddict[sequence][stype]["from_pos"] = from_pos + else: + self.ddict[sequence][stype] = {} + self.ddict[sequence][stype]["score"] = score + self.ddict[sequence][stype]["to_pos"] = to_pos + self.ddict[sequence][stype]["from_pos"] = from_pos + self.ddict[sequence]["tlen"] = tlen + + + def parse(self): + """Parses dom table from HMMsearch. + The dom table is parsed and the start and stop position from the top scoring + hmm math is saved. The start and stop positions of reach sequence are added to the ddict attribute. 
+ """ + try: + with open(self.domtable, 'r') as f: + for num, line in enumerate(f): + if not line.startswith("#"): + ll = line.split() + sequence = ll[0] + hmmprofile = ll[3] + score = float(ll[13]) + from_pos = int(ll[19]) + to_pos = int(ll[20]) + tlen = int(ll[2]) + if sequence not in self.ddict: + self.ddict[sequence] = {} + if hmmprofile.startswith(self.leftprefix): + self._score(sequence, 'left', score, from_pos, to_pos, tlen) + elif hmmprofile.startswith(self.rightprefix): + self._score(sequence, 'right', score, from_pos, to_pos, tlen) + except Exception as e: + logging.error("Exception occurred when parsing HMMSearh results") + raise e + + def __init__(self, domtable, region): + self.domtable = domtable + self.ddict = {} + if region == "ITS2": + self.leftprefix = '3_' + self.rightprefix = '4_' + elif region == "ITS1": + self.leftprefix = '1_' + self.rightprefix = '2_' + elif region == "ALL": + self.leftprefix = '1_' + self.rightprefix = '4_' + self.parse() + + + def get_position(self, sequence): + """ Returns the start and stop positions for a given sequence. + Args: + sequence (str): The name of the sequence. 
+ Returns: + (tuple): (start position, end position) zero indexed + Raises: + KeyError: If input sequence is not present in dictionary (no ITS start or stop sites were found) + """ + + try: + if "left" in self.ddict[sequence]: + start = int(self.ddict[sequence]["left"]["to_pos"]) + else: + start = None + if "right" in self.ddict[sequence]: + stop = int(self.ddict[sequence]["right"]["from_pos"]) - 1 + else: + stop = None + if "tlen" in self.ddict[sequence]: + tlen = int(self.ddict[sequence]["tlen"]) + else: + tlen = None + return(start, stop, tlen) + except KeyError: + logging.debug("No ITS stop or start sites were identified for sequence {}, skipping.".format(sequence)) + raise KeyError diff --git a/itsxpress/SeqSample.py b/itsxpress/SeqSample.py new file mode 100644 index 0000000..05812fd --- /dev/null +++ b/itsxpress/SeqSample.py @@ -0,0 +1,108 @@ +import os +import logging +import tempfile +import subprocess + + + +class SeqSample: + """The class for processing sequence data into trimmed sequences. 
+ Attributes: + tempdir (obj): A temporary directory object + fastq (str): The path to the input FASTQ file + uc_file (str): The path to the Vsearch uc mapping file + rep_file: (str) the path to the representative sequences FASTA file created by Vsearch + seq_file (str): the location of the fastq or fastq.gz sequence file used for analysis + """ + + + def __init__(self, fastq, tempdir=None): + if tempdir: + if not os.path.exists(tempdir): + logging.warning("Specified location for tempfile ({}) does not exist, using default location.".format(tempdir)) + self.tempdir = tempfile.mkdtemp(prefix='itsxpress_') + else: + self.tempdir = tempfile.mkdtemp(prefix='itsxpress_', dir=tempdir) + else: + self.tempdir = tempfile.mkdtemp(prefix='itsxpress_') + self.fastq = fastq + self.uc_file = None + self.rep_file = None + self.dom_file = None + self.seq_file = None + + + + def deduplicate(self, threads=1): + """Runs Vsearch dereplication to create a FASTA file of non-redundant sequences. + Args: + threads (int or str):the number of processor threads to use + """ + try: + self.uc_file=os.path.join(self.tempdir, 'uc.txt') + self.rep_file=os.path.join(self.tempdir,'rep.fa') + parameters = ["vsearch", + "--fastx_uniques", + self.seq_file, + "--fastaout", self.rep_file, + "--uc", self.uc_file, + "--strand", "both"] + p2 = subprocess.run(parameters, stderr=subprocess.PIPE) + logging.info(p2.stderr.decode('utf-8')) + p2.check_returncode() + except subprocess.CalledProcessError as e: + logging.exception("Could not perform dereplication with Vsearch. Error from Vsearch was:\n {}".format(p2.stderr.decode('utf-8'))) + raise e + except FileNotFoundError as f: + logging.error("Vsearch was not found, make sure Vsearch is installed and executable") + raise f + + + def cluster(self, threads, cluster_id=0.995): + """Runs Vsearch clustering to create a FASTA file of non-redundant sequences. 
+ Args: + threads (int or str):the number of processor threads to use + """ + try: + self.uc_file = os.path.join(self.tempdir, 'uc.txt') + self.rep_file = os.path.join(self.tempdir,'rep.fa') + parameters = ["vsearch", + "--cluster_size", self.seq_file, + "--centroids", self.rep_file, + "--uc", self.uc_file, + "--strand", "both", + "--id", str(cluster_id), + "--threads", str(threads)] + p2 = subprocess.run(parameters, stderr=subprocess.PIPE) + print(p2.stderr.decode('utf-8')) + p2.check_returncode() + except subprocess.CalledProcessError as e: + logging.exception("Could not perform clustering with Vsearch. Error from Vsearch was:\n {}".format(p2.stderr.decode('utf-8'))) + raise e + except FileNotFoundError as f: + logging.error("Vsearch was not found, make sure Vsearch is installed and executable") + raise f + + def _search(self, hmmfile, threads): + try: + self.dom_file = os.path.join(self.tempdir, 'domtbl.txt') + #Run Hmmsearch + parameters = ["hmmsearch", + "--domtblout", + self.dom_file, + "-T", "10", + "--cpu", str(threads), + "--tformat", "fasta", + "--F1", "1e-6", + "--F2", "1e-6", + "--F3", "1e-6", + hmmfile, + self.rep_file] + p4 = subprocess.run(parameters, stderr=subprocess.PIPE, stdout=subprocess.DEVNULL) + p4.check_returncode() + except subprocess.CalledProcessError as e : + logging.exception("Could not perform ITS identification with hmmserach. The error was:\n {}".format(p4.stderr.decode('utf-8'))) + raise e + except FileNotFoundError as f: + logging.error("hmmsearch was not found, make sure HMMER3 is installed and executable") + raise f \ No newline at end of file diff --git a/itsxpress/SeqSampleNotPaired.py b/itsxpress/SeqSampleNotPaired.py new file mode 100644 index 0000000..f0dbc07 --- /dev/null +++ b/itsxpress/SeqSampleNotPaired.py @@ -0,0 +1,11 @@ +from itsxpress.SeqSample import SeqSample + +class SeqSampleNotPaired(SeqSample): + """SeqSample class extended to unpaired format. 
+ """ + + def __init__(self, fastq, tempdir): + SeqSample.__init__(self, fastq, tempdir) + self.seq_file = self.fastq + self.r1 = self.fastq + self.fastq2 = None \ No newline at end of file diff --git a/itsxpress/SeqSamplePaired.py b/itsxpress/SeqSamplePaired.py new file mode 100644 index 0000000..4b674da --- /dev/null +++ b/itsxpress/SeqSamplePaired.py @@ -0,0 +1,69 @@ +from itsxpress.SeqSample import SeqSample + + + +import subprocess +import logging +import os +import pyzstd as zstd + +from Bio import SeqIO + +from itsxpress.definitions import ROOT_DIR, taxa_choices, taxa_dict, maxmismatches, maxratio + +class SeqSamplePairedNotInterleaved(SeqSample): + """SeqSample class extended to paired, two FASTQ file format. + """ + def __init__(self, fastq, tempdir, fastq2, reversed_primers=False ): + SeqSample.__init__(self, fastq, tempdir) + if reversed_primers: + self.r1 = fastq2 + self.fastq2 = fastq + else: + self.r1 = fastq + self.fastq2 = fastq2 + + def _merge_reads(self, threads,stagger): + try: + seq_file = os.path.join(self.tempdir, 'seq.fq') + if self.r1.split('.')[-1] == 'zst' and self.fastq2.split('.')[-1] == 'zst': + r1_temp = os.path.join(self.tempdir, 'r1_temp.fq') + r2_temp = os.path.join(self.tempdir, 'r2_temp.fq') + parameters = ['zstd','-d', self.r1,'-o', r1_temp] + p1 = subprocess.run(parameters, stderr=subprocess.PIPE) + p1.check_returncode() + logging.info(p1.stderr.decode('utf-8')) + parameters = ['zstd','-d', self.fastq2,'-o', r2_temp] + p1 = subprocess.run(parameters, stderr=subprocess.PIPE) + p1.check_returncode() + logging.info(p1.stderr.decode('utf-8')) + self.r1 = r1_temp + self.fastq2 = r2_temp + + if stagger: + parameters = ['vsearch', + '--fastq_mergepairs' , self.r1, + '--reverse' , self.fastq2, + '--fastqout' ,seq_file, + '--fastq_maxdiffs' , str(maxmismatches), + '--fastq_maxee' , str(2), + '--threads' ,str(threads), + '--fastq_allowmergestagger'] + else: + parameters = ['vsearch', + '--fastq_mergepairs' , self.r1, + '--reverse' , 
self.fastq2, + '--fastqout' ,seq_file, + '--fastq_maxdiffs' , str(maxmismatches), + '--fastq_maxee' , str(2), + '--threads' ,str(threads)] + p1 = subprocess.run(parameters, stderr=subprocess.PIPE) + self.seq_file = seq_file + p1.check_returncode() + logging.info(p1.stderr.decode('utf-8')) + except subprocess.CalledProcessError as e: + logging.exception("Could not perform read merging with vsearch. Error from vsearch was: \n {}".format(p1.stderr.decode('utf-8'))) + raise e + except FileNotFoundError as f: + logging.error("vsearch was not found, make sure vsearch is installed on this environment") + raise f \ No newline at end of file diff --git a/itsxpress/__init__.py b/itsxpress/__init__.py index c2d7d1b..912d5bc 100644 --- a/itsxpress/__init__.py +++ b/itsxpress/__init__.py @@ -1,3 +1,17 @@ -import itsxpress.main -import itsxpress.definitions -__all__ = ["main", "definitions"] +import itsxpress.main +import itsxpress.definitions +import itsxpress.ITSposition +import itsxpress.Dedup +import itsxpress.SeqSample +import itsxpress.SeqSampleNotPaired +import itsxpress.SeqSamplePaired + +try: + import itsxpress.q2_itsxpress + import itsxpress.plugin_setup +except ModuleNotFoundError as e: + #logging + print("{}.Could not initialize the Qiime plugin portion of ITSxpress. Command line ITSxpress will still work normally. 
If you wish to use the Qiime2 ITSxpress plugin, you need to install Qiime2 first into your environment.\n".format(e)) + pass + +__all__ = ["main", "definitions"] diff --git a/itsxpress/citations.bib b/itsxpress/citations.bib new file mode 100644 index 0000000..fd2bb59 --- /dev/null +++ b/itsxpress/citations.bib @@ -0,0 +1,9 @@ +@Article{ Rivers2018, +AUTHOR = { Rivers, AR and Weber, KC and Gardner, TG and Liu, S and Armstrong, SD}, +TITLE = {ITSxpress: Software to rapidly trim internally transcribed spacer sequences with quality scores for marker gene analysis}, +JOURNAL = {F1000Research}, +VOLUME = {7}, +YEAR = {2018}, +NUMBER = {1418}, +DOI = {10.12688/f1000research.15704.1} +} \ No newline at end of file diff --git a/itsxpress/main.py b/itsxpress/main.py index a41bf13..769ec8b 100755 --- a/itsxpress/main.py +++ b/itsxpress/main.py @@ -1,31 +1,22 @@ #!/usr/bin/env python -"""##### This is the end of life version 1 of q2_itsxpress and the command line version of ITSxpress. -The new version 2 of ITSxpress, has the Qiime2 plugin built in with command line version of ITSxpress. ##### - -ITSxpress: A python module to rapidly trim ITS amplicon sequences from FASTQ files. - -Authors: Adam Rivers, Kyle weber, Sveinn Einarsson, USDA Agricultural Research Service - +"""ITSxpress: A python module to rapidly trim ITS amplicon sequences from FASTQ files. +Authors: Adam Rivers, Kyle weber, USDA Agricultural Research Service The internally transcribed spacer region is a region between the highly conserved small subunit (SSU) of rRNA and the large subunit (LSU) of the rRNA. The eukaryotic ITS contains the 5.8s gene and two variable length spacer regions. In amplicon sequencing studies it is common practice to trim off the conserved (SSU, 5,8S or LSU) regions. Bengtsson-Palme et al. (2013) published software the software package ITSx to do this. - ITSxpress is a high-speed implementation of the methods in ITSx than also allows FASTQ files to be processed. 
Processing FASTQ files Which is essential for analyzing sequences using the newer exact Sequence Variant methods in Qiime2, Dada2, Deblur and Unoise that are replacing OTU clustering. - ITSxpress is also available as a QIIME Plugin. See https://github.com/USDA-ARS-GBRU/q2_itsxpress for details. - Process: - * Merges and error corrects reads using bbduk if reads are paired-end - * Deduplicates reads using Vmatch to eliminate redundant hmm searches + * Merges and error corrects reads using Vsearch if reads are paired-end + * Deduplicates reads using Vsearch to eliminate redundant hmm searches * Searches for conserved regions using the ITSx hmms, using HMMsearch: * Parses everything in python returning (optionally gzipped) fastq files. - Reference: Johan Bengtsson-Palme, Vilmar Veldre, Martin Ryberg, Martin Hartmann, Sara Branco, Zheng Wang, Anna Godhe, Yann Bertrand, Pierre De Wit, Marisol Sanchez, @@ -35,7 +26,11 @@ eukaryotes for use in environmental sequencing. Methods in Ecology and Evolution, 4: 914-919, 2013 (DOI: 10.1111/2041-210X.12073) """ + + + import gzip +import pyzstd as zstd import tempfile import argparse import subprocess @@ -46,9 +41,15 @@ import math from itertools import tee +from numpy import empty + from Bio import SeqIO from itsxpress.definitions import ROOT_DIR, taxa_choices, taxa_dict, maxmismatches, maxratio +from itsxpress.Dedup import Dedup +from itsxpress.ITSposition import ItsPosition +from itsxpress.SeqSamplePaired import SeqSamplePairedNotInterleaved +from itsxpress.SeqSampleNotPaired import SeqSampleNotPaired def restricted_float(x): x = float(x) @@ -71,6 +72,8 @@ def myparser(): parser.add_argument('--outfile2', '-o2', type=str, help="the trimmed read 2 Fastq file, if it \ ends in 'gz' it will be gzipped. 
If provided, reads will be returned unmerged.", default=None) parser.add_argument('--tempdir', help='The temp file directory', default=None) + parser.add_argument('--allow_staggered_reads', help='Allow merging of staggered reads with --fastq_allowmergestagger \ + for Vsearch --fastq_mergepairs. See Vsearch documentation. (Optional) Default is true.', default=True) parser.add_argument('--keeptemp' ,help="Should intermediate files be kept?", action='store_true') parser.add_argument('--region', help='', choices=["ITS2", "ITS1", "ALL"], required=True) parser.add_argument('--taxa', help='The taxonomic group sequenced.', choices=taxa_choices, default="Fungi") @@ -81,619 +84,25 @@ def myparser(): return parser - - -class ItsPosition: - """Class for ITS positional information derived from hmmserach domtable files. - - Args: - domtable (str): the path locating the domtable file from HMMER 3 hmmsearch. - region (str): The region of the ITS to extract choices: ["ITS1", "ITS2", "ALL"]. - - Attributes: - ddict (dict): A dictionary holding the scores and start and stop - positions for the selected segment of each sequence. - Example: {sample:{left:{score:31, pos:15}, right:{score:32, pos:354}, tlen:449} - leftprefix (str): the left prefix to search for (set by type variable). - rightprefix (str): the right prefix to search for (set by type variable). - - """ - def _score(self, sequence, stype, score, from_pos, to_pos, tlen): - """Evaluates scores and positions from the new line of a domtable file and - updates ddict if necessary. - - Args: - sequence (str): The name of the sequence. - stype (str): {'left', 'right'} - score (int): The bit score from HMMSearch. - to_pos (int): the ending position of the left sequence. - from_pos (int): The beginning position of the right sequence. 
- tlen (int): the length of the sequence - - """ - - if stype in self.ddict[sequence]: - if score > self.ddict[sequence][stype]["score"]: - self.ddict[sequence][stype]["score"] = score - self.ddict[sequence][stype]["to_pos"] = to_pos - self.ddict[sequence][stype]["from_pos"] = from_pos - else: - self.ddict[sequence][stype] = {} - self.ddict[sequence][stype]["score"] = score - self.ddict[sequence][stype]["to_pos"] = to_pos - self.ddict[sequence][stype]["from_pos"] = from_pos - self.ddict[sequence]["tlen"] = tlen - - - def parse(self): - """Parses dom table from HMMsearch. - The dom table is parsed and the start and stop position from the top scoring - hmm math is saved. The start and stop positions of reach sequence are added to the ddict attribute. - - """ - try: - with open(self.domtable, 'r') as f: - for num, line in enumerate(f): - if not line.startswith("#"): - ll = line.split() - sequence = ll[0] - hmmprofile = ll[3] - score = float(ll[13]) - from_pos = int(ll[19]) - to_pos = int(ll[20]) - tlen = int(ll[2]) - if sequence not in self.ddict: - self.ddict[sequence] = {} - if hmmprofile.startswith(self.leftprefix): - self._score(sequence, 'left', score, from_pos, to_pos, tlen) - elif hmmprofile.startswith(self.rightprefix): - self._score(sequence, 'right', score, from_pos, to_pos, tlen) - except Exception as e: - logging.error("Exception occurred when parsing HMMSearh results") - raise e - - def __init__(self, domtable, region): - self.domtable = domtable - self.ddict = {} - if region == "ITS2": - self.leftprefix = '3_' - self.rightprefix = '4_' - elif region == "ITS1": - self.leftprefix = '1_' - self.rightprefix = '2_' - elif region == "ALL": - self.leftprefix = '1_' - self.rightprefix = '4_' - self.parse() - - - def get_position(self, sequence): - """ Returns the start and stop positions for a given sequence. - - Args: - sequence (str): The name of the sequence. 
- - Returns: - (tuple): (start position, end position) zero indexed - - Raises: - KeyError: If input sequence is not present in dictionary (no ITS start or stop sites were found) - - """ - - try: - if "left" in self.ddict[sequence]: - start = int(self.ddict[sequence]["left"]["to_pos"]) - else: - start = None - if "right" in self.ddict[sequence]: - stop = int(self.ddict[sequence]["right"]["from_pos"]) - 1 - else: - stop = None - if "tlen" in self.ddict[sequence]: - tlen = int(self.ddict[sequence]["tlen"]) - else: - tlen = None - return(start, stop, tlen) - except KeyError: - logging.debug("No ITS stop or start sites were identified for sequence {}, skipping.".format(sequence)) - raise KeyError - -class Dedup: - """A class to handle deduplicated sequence data. - - To speed processing Vmatch is used to remove duplicate amplicons so that the - start and stop sites are determined only once. - - Attributes: - matchdict (dict): a dictionary of each sequence ID as a key and - its representative sequence ID as a value {seq1:rep1, seq2:rep1, seq3:rep2}. - uc_file (str): the location of the .uc file containing matching information. - rep_file (str): The location of the representative sequences file. - seq_file (str): The location of the complete sequences file. - fastq (str): the location of the input fastq - fastq2 (str) the location of optional Read2 input fastq if paired - - """ - - - def parse(self): - """Parse the uc data file to populate the matchdict attribute. 
- - Raises: - Exception: General exception if uc file is not parsed properly - - """ - try: - with open(self.uc_file, 'r') as f: - self.matchdict = {} - for line in f: - ll = line.split() - datatype = ll[0] - ref = ll[9] - seq = ll[8] - if datatype == 'S': - self.matchdict[seq] = seq - elif datatype == 'H': - self.matchdict[seq] = ref - except Exception as e: - logging.exception("Could not parse the Vsearch '.uc' file.") - raise e - - def __init__(self, uc_file, rep_file, seq_file, fastq=None, fastq2=None): - self.matchdict = None - self.uc_file = uc_file - self.rep_file = rep_file - self.seq_file = seq_file - self.fastq = fastq - self.fastq2 = fastq2 - self.parse() - - - def _get_paired_seq_generator(self, zipseqgen, itspos): - """This function takes a zipped object of two Biopython SeqIO sequence generators, and - returns a two generators of Biopython SeqRecords for Dada2. Sequences where the ITS ends could - not be determined are omitted. - - Args: - zipseqgen (obj): A zipped object with two Biopython SeqIO generators - for the forward and reverse input sequences - ispos (obj): An itsxpress ItsPosition object - - Returns: - (obj): A two Python SeqRecord generators that yield filtered, trimmed sequence records. 
- - """ - def _filterfunc(ziprecord): - """ Filters records down to those that contain a valid ITS start and stop position - - Args: - record (obj): a Biopython SeqRecord object - - Returns: - bool: True if an ITS start and stop positions are present false otherwise - - """ - try: - record1, record2 = ziprecord - if record1.id in self.matchdict: - repseq = self.matchdict[record1.id] - start, stop, tlen = itspos.get_position(repseq) - if start and stop: - if start < stop: - return True - else: - return False - except KeyError: - return False - - - - def _map_func(ziprecord): - """Trims the record down to the selected ITS region - - Args: - record (obj): a Biopython SeqRecord object - - Returns: - obj: two Biopython SeqRecord objects; forward and reverse reads trimmed to the ITS region - """ - record1, record2 = ziprecord - repseq = self.matchdict[record1.id] - start, stop, tlen = itspos.get_position(repseq) - r2start = tlen - stop - return record1[start:], record2[r2start:] - - def _split_gen(gen): - gen_a, gen_b = tee(gen, 2) - return (a for a, b in gen_a), (b for a, b in gen_b) - - filt = filter(_filterfunc, zipseqgen) - gen1 = map(_map_func, filt) - gen1_split_a, gen1_split_b = _split_gen(gen1) - return gen1_split_a, gen1_split_b - - - def create_paired_trimmed_seqs(self, outfile1, outfile2, gzipped, itspos): - """Writes two FASTQ files, optionally gzipped, with the reads trimmed to the - selected region. - - Args: - outfile1 (str): The file to write the forward sequences to. - outfile2 (str): The file to write the reverse sequences to. - gzip (bool): Should the output files be gzipped? - itspos (object): an ItsPosition object - - Returns: - str: Name of the file written - - """ - - def _write_seqs(seqs, outfile): - """Helper function to optionally write sequences in compressed format - - Args: - seqs (obj): A biopython SeqRecord generators - outfile (str): A file name to writ the fastq data to. 
- - """ - if gzipped: - with gzip.open(outfile, 'wt') as g: - SeqIO.write(seqs, g, "fastq") - else: - with open(outfile, 'w') as g: - SeqIO.write(seqs, g, "fastq") - - def _create_gen(f, g): - """Create a sequence generator - - Args: - f (str): a file name for read 1 fastq data - g (str): a file name for read 2 fastq data - - """ - seqgen1 = SeqIO.parse(f, 'fastq') - seqgen2 = SeqIO.parse(g, 'fastq') - zipseqgen = zip(seqgen1, seqgen2) - seqs1, seqs2 = self._get_paired_seq_generator(zipseqgen, itspos) - _write_seqs(seqs1, outfile1) - _write_seqs(seqs2, outfile2) - - try: - if self.fastq.endswith(".gz") and self.fastq2.endswith(".gz"): - with gzip.open(self.fastq, 'rt') as f: - with gzip.open(self.fastq2, 'rt') as g: - _create_gen(f, g) - - elif not (self.fastq2.endswith(".gz") or self.fastq2.endswith(".gz")): - with open(self.fastq, 'r') as f: - with open(self.fastq2, 'r') as g: - _create_gen(f, g) - else: - raise ValueError("Fastq and Fastq2 files should both be gzipped or both be un-gzipped. Mixed input is not accepted.") - - except Exception as e: - raise e - - - def _get_trimmed_seq_generator(self, seqgen, itspos): - """This function takes a Biopython SeqIO sequence generator, and - returns a generator of trimmed sequences suitable for Deblur. Sequences where the ITS ends could - not be determined are omitted. - - Args: - seqgen (obj): A Biopython SeqIO generator of all input sequences - ispos (obj): An itsxpress ItsPosition object - - Returns: - (obj): A map object generator that yields filtered, trimmed sequence records. 
- - """ - def _filterfunc(record): - """ Filters records down to those that contain a valid ITS start and stop position - - Args: - record (obj): a Biopython SeqRecord object - - Returns: - bool: True if an ITS start and stop positions are present false otherwise - - """ - try: - if record.id in self.matchdict: - repseq = self.matchdict[record.id] - start, stop, tlen = itspos.get_position(repseq) - if start and stop: - if start < stop: - return True - else: - return False - except KeyError: - return False - - - - def map_func(record): - """Trims the record down to the selected ITS region - - Args: - record (obj): a Biopython SeqRecord object - - Returns: - obj: a Biopython SeqRecord object trimmed to the ITS region - """ - repseq = self.matchdict[record.id] - start, stop, tlen = itspos.get_position(repseq) - return record[start:stop] - - filt = filter(_filterfunc, seqgen) - return map(map_func, filt) - - - def create_trimmed_seqs(self, outfile, gzipped, itspos): - """Creates a FASTQ file, optionally gzipped, with the reads trimmed to the - selected region. - - Args: - outfile (str): The file to write the sequences to. - gzip (bool): Should the files be gzipped? - itspos (object): an ItsPosition object - - """ - - def _write_seqs(): - if gzipped: - with gzip.open(outfile, 'wt') as g: - SeqIO.write(seqs, g, "fastq") - else: - with open(outfile, 'w') as g: - SeqIO.write(seqs, g, "fastq") - - if self.seq_file.endswith(".gz"): - with gzip.open(self.seq_file, 'rt') as f: - seqgen = SeqIO.parse(f, 'fastq') - seqs = self._get_trimmed_seq_generator(seqgen, itspos) - _write_seqs() - - else: - with open(self.seq_file, 'r') as f: - seqgen = SeqIO.parse(f, 'fastq') - seqs = self._get_trimmed_seq_generator(seqgen, itspos) - _write_seqs() - - -class SeqSample: - """The class for processing sequence data into trimmed sequences. 
- - Attributes: - tempdir (obj): A temporary directory object - fastq (str): The path to the input FASTQ file - uc_file (str): The path to the Vsearch uc mapping file - rep_file: (str) the path to the representative sequences FASTA file created by Vsearch - seq_file (str): the location of the fastq or fastq.gz sequence file used for analysis - - """ - - - def __init__(self, fastq, tempdir=None): - if tempdir: - if not os.path.exists(tempdir): - logging.warning("Specified location for tempfile ({}) does not exist, using default location.".format(tempdir)) - self.tempdir = tempfile.mkdtemp(prefix='itsxpress_') - else: - self.tempdir = tempfile.mkdtemp(prefix='itsxpress_', dir=tempdir) - else: - self.tempdir = tempfile.mkdtemp(prefix='itsxpress_') - self.fastq = fastq - self.uc_file = None - self.rep_file = None - self.dom_file = None - self.seq_file = None - - - - def deduplicate(self, threads=1): - """Runs Vsearch dereplication to create a FASTA file of non-redundant sequences. - - Args: - threads (int or str):the number of processor threads to use - - """ - try: - self.uc_file=os.path.join(self.tempdir, 'uc.txt') - self.rep_file=os.path.join(self.tempdir,'rep.fa') - parameters = ["vsearch", - "--fastx_uniques", - self.seq_file, - "--fastaout", self.rep_file, - "--uc", self.uc_file, - "--strand", "both"] - p2 = subprocess.run(parameters, stderr=subprocess.PIPE) - logging.info(p2.stderr.decode('utf-8')) - p2.check_returncode() - except subprocess.CalledProcessError as e: - logging.exception("Could not perform dereplication with Vsearch. Error from Vsearch was:\n {}".format(p2.stderr.decode('utf-8'))) - raise e - except FileNotFoundError as f: - logging.error("Vsearch was not found, make sure Vsearch is installed and executable") - raise f - - - def cluster(self, threads=1, cluster_id=0.995): - """Runs Vsearch clustering to create a FASTA file of non-redundant sequences. 
- - Args: - threads (int or str):the number of processor threads to use - - """ - try: - self.uc_file = os.path.join(self.tempdir, 'uc.txt') - self.rep_file = os.path.join(self.tempdir,'rep.fa') - parameters = ["vsearch", - "--cluster_size", self.seq_file, - "--centroids", self.rep_file, - "--uc", self.uc_file, - "--strand", "both", - "--id", str(cluster_id), - "--threads", str(threads)] - p2 = subprocess.run(parameters, stderr=subprocess.PIPE) - print(p2.stderr.decode('utf-8')) - p2.check_returncode() - except subprocess.CalledProcessError as e: - logging.exception("Could not perform clustering with Vsearch. Error from Vsearch was:\n {}".format(p2.stderr.decode('utf-8'))) - raise e - except FileNotFoundError as f: - logging.error("Vsearch was not found, make sure Vsearch is installed and executable") - raise f - - def _search(self, hmmfile, threads=1): - try: - self.dom_file = os.path.join(self.tempdir, 'domtbl.txt') - #Run Hmmsearch - parameters = ["hmmsearch", - "--domtblout", - self.dom_file, - "-T", "10", - "--cpu", str(threads), - "--tformat", "fasta", - "--F1", "1e-6", - "--F2", "1e-6", - "--F3", "1e-6", - hmmfile, - self.rep_file] - p4 = subprocess.run(parameters, stderr=subprocess.PIPE, stdout=subprocess.DEVNULL) - p4.check_returncode() - except subprocess.CalledProcessError as e : - logging.exception("Could not perform ITS identification with hmmserach. The error was:\n {}".format(p4.stderr.decode('utf-8'))) - raise e - except FileNotFoundError as f: - logging.error("hmmsearch was not found, make sure HMMER3 is installed and executable") - raise f - -class SeqSamplePairedInterleaved(SeqSample): - """SeqSample class extended to paired, interleaved format. 
- - """ - def split_interleaved(self, reversed_primers=False): - try: - seq_r1 = os.path.join(self.tempdir, 'seq_r1.fq.gz') - seq_r2 = os.path.join(self.tempdir, 'seq_r2.fq.gz') - parameters = ['reformat.sh', - 'in=' + self.fastq, - 'out=' + seq_r1, - 'out2=' + seq_r2, - ] - p1 = subprocess.run(parameters, stderr=subprocess.PIPE) - p1.check_returncode() - if reversed_primers: - self.r1 = seq_r2 - self.fastq2 = seq_r1 - else: - self.r1 = seq_r1 - self.fastq2 = seq_r2 - logging.info(p1.stderr.decode('utf-8')) - except subprocess.CalledProcessError as e: - logging.exception("could not perform read merging with BBmerge. Error from BBmerge was: \n {}".format(p1.stderr.decode('utf-8'))) - raise e - except FileNotFoundError as f: - logging.error("BBmerge was not found, make sure BBmerge is executable") - raise f - - def __init__(self, fastq, tempdir, reversed_primers=False): - SeqSample.__init__(self, fastq, tempdir) - self.split_interleaved(reversed_primers=reversed_primers) - - def _merge_reads(self, threads): - try: - seq_file = os.path.join(self.tempdir, 'seq.fq.gz') - parameters = ['bbmerge.sh', - 'in=' + self.fastq, - 'out=' + seq_file, - 't=' + str(threads), - 'maxmismatches=' + str(maxmismatches), - 'maxratio=' + str(maxratio)] - p1 = subprocess.run(parameters, stderr=subprocess.PIPE) - self.seq_file = seq_file - p1.check_returncode() - logging.info(p1.stderr.decode('utf-8')) - except subprocess.CalledProcessError as e: - logging.exception("could not perform read merging with BBmerge. Error from BBmerge was: \n {}".format(p1.stderr.decode('utf-8'))) - raise e - except FileNotFoundError as f: - logging.error("BBmerge was not found, make sure BBmerge is executable") - raise f - -class SeqSamplePairedNotInterleaved(SeqSample): - """SeqSample class extended to paired, two FASTQ file format. 
- - """ - def __init__(self, fastq, tempdir, fastq2, reversed_primers=False ): - SeqSample.__init__(self, fastq, tempdir) - if reversed_primers: - self.r1 = fastq2 - self.fastq2 = fastq - else: - self.r1 = fastq - self.fastq2 = fastq2 - - def _merge_reads(self, threads): - try: - seq_file = os.path.join(self.tempdir, 'seq.fq.gz') - parameters = ['bbmerge.sh', - 'in=' + self.r1, - 'in2=' + self.fastq2, - 'out=' + seq_file, - 't=' + str(threads), - 'maxmismatches=' + str(maxmismatches), - 'maxratio=' + str(maxratio)] - p1 = subprocess.run(parameters, stderr=subprocess.PIPE) - self.seq_file = seq_file - p1.check_returncode() - logging.info(p1.stderr.decode('utf-8')) - except subprocess.CalledProcessError as e: - logging.exception("Could not perform read merging with BBmerge. Error from BBmerge was: \n {}".format(p1.stderr.decode('utf-8'))) - raise e - except FileNotFoundError as f: - logging.error("BBmerge was not found, make sure BBmerge is executable") - raise f - -class SeqSampleNotPaired(SeqSample): - """SeqSample class extended to unpaired format. - - """ - - def __init__(self, fastq, tempdir): - SeqSample.__init__(self, fastq, tempdir) - self.seq_file = self.fastq - self.r1 = self.fastq - self.fastq2 = None - ## Utility Functions def _is_paired(fastq, fastq2, single_end): """Determines the workflow based on file inputs. - Args: - """ if fastq and fastq2: paired_end = True - interleaved = False elif single_end: paired_end = False - interleaved = False else: paired_end = True - interleaved = True - return paired_end, interleaved + return paired_end def _logger_setup(logfile): """Set up logging to a logfile and the terminal standard out. 
- Args: fastq (str): The path to a fastq or fastq.gz file fastq2 (str): The path to a fastq or fastq.gz file for the reverse sequences - """ try: logging.basicConfig(level=logging.DEBUG, @@ -716,24 +125,63 @@ def _logger_setup(logfile): def _check_fastqs(fastq, fastq2=None): """Verifies the input files are valid fastq or fastq.gz files. - + Add interleaved explanation Args: fastq (str): The path to a fastq or fastq.gz file fastq2 (str): The path to a fastq or fastq.gz file for the reverse sequences - Raises: ValueError: If Biopython detected invalid FASTQ files """ - def core(file): + def check_interleaved(file): + try: + if file.endswith('.gz'): + f = gzip.open(file, 'rt') + elif file.endswith('.zst'): + f = zstd.open(file, 'rt') + else: + f = open(file, 'r') + lines = f.readlines() + L1 = lines[0:96:8] + L2 = lines[3:96:8] + L1_old = [i.strip().split('/', 1)[0] for i in L1] + L2_old = [i.strip().split('/', 1)[0] for i in L2] + L1_new = [i.strip().split(' ', 1)[0] for i in L1] + L2_new = [i.strip().split(' ', 1)[0] for i in L2] + assert L1_old != L2_old or L1_new != L2_new + + except AssertionError as a: + logging.error("'File may be interleaved. ITSxpress will run with errors. 
Check BBmap reformat.sh to split interleaved files before using ITSxpress.") + raise a + except IOError as f: + logging.error("File may be wrong format for interleaved file check.") + raise f + + + def core(file): try: if file.endswith('.gz'): f = gzip.open(file, 'rt') + elif file.endswith('.zst'): + f = zstd.open(file, 'rt') else: f = open(file, 'r') n = 0 for record in SeqIO.parse(f, 'fastq'): while n < 100: n += 1 + check_interleaved(file) + f.close() + if fastq2: + if fastq2.endswith('.gz'): + f = gzip.open(fastq2, 'rt') + elif file.endswith('.zst'): + f = zstd.open(file, 'rt') + else: + f = open(fastq2, 'r') + n = 0 + for record in SeqIO.parse(f, 'fastq'): + while n < 100: + n += 1 f.close() except ValueError as e: logging.error("There appears to be an issue with the format of input file {}.".format(file)) @@ -756,9 +204,12 @@ def _check_total_reads(file, file2 = None): def core(file): if file.endswith('.gz'): f = gzip.open(file, 'rt') + elif file.endswith('.zst'): + f = zstd.open(file, 'rt') else: f = open(file, 'r') n = 0 + for i, line in enumerate(f): if i % 4 == 0: n += 1 @@ -767,39 +218,30 @@ def core(file): reads = core(file) logging.info("Total number of reads in file {} is {}.".format(file, reads)) + #Why is file2 False? if file2: reads = core(file2) logging.info("Total number of reads in file {} is {}.".format(file2, reads)) -# workflow - def main(args=None): """Run Complete ITS trimming workflow. - """ # Set up logging t0 = time.time() parser = myparser() if not args: args = parser.parse_args() - _logger_setup(args.log) try: - print(args.cluster_id) logging.info("Verifying the input sequences.") _check_fastqs(args.fastq, args.fastq2) # Parse input types - paired_end, interleaved = _is_paired(args.fastq, args.fastq2, args.single_end) - # create SeqSample objects and merge if needed - if paired_end and interleaved: - logging.info("Sequences are paired-end and interleaved. 
They will be merged using BBmerge.") - sobj = SeqSamplePairedInterleaved(fastq=args.fastq, tempdir=args.tempdir, reversed_primers=args.reversed_primers ) - sobj._merge_reads(threads=str(args.threads)) - elif paired_end and not interleaved: - logging.info("Sequences are paired-end in two files. They will be merged using BBmerge.") + paired_end = _is_paired(args.fastq, args.fastq2, args.single_end) + if paired_end: + logging.info("Sequences are paired-end in two files. They will be merged using Vsearch.") sobj = SeqSamplePairedNotInterleaved(fastq=args.fastq, fastq2=args.fastq2, tempdir=args.tempdir, reversed_primers=args.reversed_primers) - sobj._merge_reads(threads=str(args.threads)) - elif not paired_end and not interleaved: + sobj._merge_reads(threads=str(args.threads), stagger=args.allow_staggered_reads) + elif not paired_end: logging.info("Sequences are assumed to be single-end.") sobj = SeqSampleNotPaired(fastq=args.fastq, tempdir=args.tempdir) logging.info("Temporary directory is: {}".format(sobj.tempdir)) @@ -818,28 +260,45 @@ def main(args=None): its_pos = ItsPosition(domtable=sobj.dom_file, region=args.region) # Create deduplication object dedup_obj = Dedup(uc_file=sobj.uc_file, rep_file=sobj.rep_file, seq_file=sobj.seq_file, fastq=sobj.r1, fastq2=sobj.fastq2) - # Create trimmed sequences - logging.info("Writing out sequences") + # Trim sequences if args.outfile2: if args.outfile.split('.')[-1] == 'gz' and args.outfile2.split('.')[-1] == 'gz': - dedup_obj.create_paired_trimmed_seqs(args.outfile, args.outfile2, gzipped=True, itspos=its_pos) + dedup_obj.create_paired_trimmed_seqs(args.outfile, args.outfile2, gzipped=True,zstd_file = False, itspos=its_pos,wri_file=True) + #dedup_obj.create_paired_trimmed_seqs(args.outfile, args.outfile2, gzipped=True,zstd_file = False, itspos=its_pos,wri_file=False) + elif args.outfile.split('.')[-1] == 'zst' and args.outfile2.split('.')[-1] == 'zst': + dedup_obj.create_paired_trimmed_seqs(args.outfile, args.outfile2, 
gzipped=False, zstd_file = True, itspos=its_pos,wri_file=True) + #dedup_obj.create_paired_trimmed_seqs(args.outfile, args.outfile2, gzipped=False, zstd_file = True, itspos=its_pos,wri_file=False) else: - dedup_obj.create_paired_trimmed_seqs(args.outfile, args.outfile2, gzipped=False, itspos=its_pos) + dedup_obj.create_paired_trimmed_seqs(args.outfile, args.outfile2, gzipped=False,zstd_file = False, itspos=its_pos,wri_file=True) + #dedup_obj.create_paired_trimmed_seqs(args.outfile, args.outfile2, gzipped=False,zstd_file = False, itspos=its_pos,wri_file=False) + else: - if args.outfile.split('.')[-1] =='gz': - dedup_obj.create_trimmed_seqs(args.outfile, gzipped=True, itspos=its_pos) + if args.outfile.split('.')[-1] == 'gz': + dedup_obj.create_trimmed_seqs(args.outfile, gzipped=True,zstd_file = False, itspos=its_pos,wri_file=True,tempdir=sobj.tempdir) + #dedup_obj.create_trimmed_seqs(args.outfile, gzipped=True,zstd_file = False, itspos=its_pos,wri_file=False) + #add function with above create_trimmed_seqs + #use said function to check for 0 length seqs + elif args.outfile.split('.')[-1] == 'zst': + dedup_obj.create_trimmed_seqs(args.outfile, gzipped=False, zstd_file = True, itspos=its_pos,wri_file=True,tempdir=sobj.tempdir) + #dedup_obj.create_trimmed_seqs(args.outfile, gzipped=False, zstd_file = True, itspos=its_pos,wri_file=False) else: - dedup_obj.create_trimmed_seqs(args.outfile, gzipped=False, itspos=its_pos) - + dedup_obj.create_trimmed_seqs(args.outfile, gzipped=False,zstd_file = False, itspos=its_pos,wri_file=True,tempdir=sobj.tempdir) + #dedup_obj.create_trimmed_seqs(args.outfile, gzipped=False,zstd_file = False, itspos=its_pos,wri_file=False) # Count reads after trimming logging.info("Counting reads after trimming.") if args.outfile2: - _check_total_reads(args.fastq, args.fastq2) + if args.fastq2: + _check_total_reads(args.fastq, args.fastq2) + else: + _check_total_reads(args.fastq) _check_total_reads(args.outfile, args.outfile2) else: - 
_check_total_reads(args.fastq) + if args.fastq2: + _check_total_reads(args.fastq, args.fastq2) + else: + _check_total_reads(args.fastq) _check_total_reads(args.outfile) - + t1 = time.time() fmttime = time.strftime("%H:%M:%S", time.gmtime(t1-t0)) logging.info("ITSxpress ran in {}".format(fmttime)) diff --git a/itsxpress/plugin_setup.py b/itsxpress/plugin_setup.py new file mode 100644 index 0000000..8df05b3 --- /dev/null +++ b/itsxpress/plugin_setup.py @@ -0,0 +1,188 @@ +from q2_types.per_sample_sequences import (SequencesWithQuality, + PairedEndSequencesWithQuality, + JoinedSequencesWithQuality) +from q2_types.sample_data import SampleData +from qiime2.plugin import (Plugin, + Str, + Choices, + Int, + Float, + Range, + Bool, + Citations) + +from itsxpress.q2_itsxpress import (trim_single, + trim_pair, + trim_pair_output_unmerged, + default_cluster_id) + +plugin = Plugin( + name='itsxpress', + version='2.0.0', + package='itsxpress', + website='https://github.com/USDA-ARS-GBRU/q2_itsxpress ' + 'ITSxpress: https://github.com/USDA-ARS-GBRU/itsxpress', + description='ITSxpress trims amplicon reads down to their ITS region. ' + 'ITSxpress is designed to support the calling of exact sequence variants ' + 'rather than OTUs. This newer method of sequence error-correction requires ' + 'quality score data from each sequence, so each input sequence must be trimmed. ' + 'ITSxpress makes this possible by taking FASTQ data, de-replicating the ' + 'sequences then identifying the start and stop sites using HMMSearch. ' + 'Results are parsed and the trimmed files are returned. ' + 'The ITS 1, ITS2 or the entire ITS region including the 5.8s rRNA' + 'gene can be selected. ALL requires very long reads so it is not routinely' + 'used with Illumina data. 
ITSxpress uses the hmm models from ITSx so results are comparable.', + short_description='Plugin for using ITSxpress to rapidly trim the\n' + 'internally transcribed spacer (ITS) region of FASTQ files.', + citations=Citations.load('citations.bib', package='itsxpress') +) + +taxaList = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'L', 'M', 'ALL', 'O', 'P', 'Q', 'R', 'S', 'T', 'U'] + +plugin.methods.register_function( + function=trim_single, + inputs={'per_sample_sequences': SampleData[SequencesWithQuality]}, + parameters={'region': Str % Choices(['ITS2', 'ITS1', 'ALL']), + 'taxa': Str % Choices(taxaList), + 'threads': Int, + 'cluster_id': Float % Range(0.995, 1.0, inclusive_start=True, inclusive_end=True)}, + outputs=[('trimmed', SampleData[SequencesWithQuality])], + input_descriptions={'per_sample_sequences': 'The artifact that contains the sequence file(s).' + ' Either Joined Paired or just a single fastq.' + ' One file sequences in the qza data folder.'}, + parameter_descriptions={ + 'region': ('\nThe regions ITS2, ITS1, and ALL that can be selected from.'), + 'taxa': ('\nThe selected taxonomic group sequenced that can be selected from.'), + 'threads': ('\nThe number of processor threads to use in the run.'),\ + 'cluster_id': ('\nThe percent identity for clustering reads, set to 1 for exact dereplication.') + }, + output_descriptions={'trimmed': 'The trimmed sequences from ITSxpress.'}, + name='Trim single-end reads', + description='ITSxpress trimSingle is used for qza types with\n' + 'SquencesWithQuality or JoinedSequencesWithQuality.' 
+ ' This means the qza must be in the\n' + 'SingleLanePerSampleSingleEndFastqDirFmt, and only contain\n' + 'one fastq file.\n' + '\nThe taxa list and variable for it:\n' + '\nA = Alveolata\n' + '\nB = Bryophyta\n' + '\nC = Bacillariophyta\n' + '\nD = Amoebozoa\n' + '\nE = Euglenozoa\n' + '\nF = Fungi\n' + '\nG = Chlorophyta (green algae)\n' + '\nH = Rhodophyta (red algae)\n' + '\nI = Phaeophyceae (brown algae)\n' + '\nL = Marchantiophyta (liverworts)\n' + '\nM = Metazoa\n' + '\nO = Oomycota\n' + '\nP = Haptophyceae (prymnesiophytes)\n' + '\nQ = Raphidophyceae\n' + '\nR = Rhizaria\n' + '\nS = Synurophyceae\n' + '\nT = Tracheophyta (higher plants)\n' + '\nU = Eustigmatophyceae\n' + '\nALL = All' +) + +plugin.methods.register_function( + function=trim_pair, + inputs={'per_sample_sequences': SampleData[PairedEndSequencesWithQuality]}, + parameters={'region': Str % Choices(['ITS2', 'ITS1', 'ALL']), + 'taxa': Str % Choices(taxaList), + 'threads': Int, + 'reversed_primers': Bool, + 'allow_staggered_reads': Bool, + 'cluster_id': Float % Range(0.995, 1.0, inclusive_start=True, inclusive_end=True)}, + outputs=[('trimmed', SampleData[JoinedSequencesWithQuality])], + input_descriptions={'per_sample_sequences': 'The artifact that contains the sequence file(s). ' + 'Only Paired can be used. ' + 'Two files sequences in the qza data folder'}, + parameter_descriptions={ + 'region': ('\nThe regions ITS2, ITS1, and ALL that can be selected from.'), + 'taxa': ('\nThe selected taxonomic group sequenced that can be selected from.'), + 'threads': ('\nThe number of processor threads to use in the run.'), + 'cluster_id': ('\nThe percent identity for clustering reads, set to 1 for exact dereplication.'), + 'allow_staggered_reads': ('\nAllows merging of staggered reads.'), + 'reversed_primers': ('\n Primers are in reverse orientation as in Taylor et al. 
2016, DOI:10.1128/AEM.02576-16.') + }, + output_descriptions={'trimmed': 'The resulting trimmed sequences from ITSxpress'}, + name='Trim paired-end reads, output merged reads for use with Deblur', + description='ITSxpress trimPair takes the qza type \n' + 'PairedEndSquencesWithQuality. The qza\n' + 'format must be the SingleLanePerSamplePairedEndFastqDirFmt\n' + 'and only contain two fastq files.\n' + 'The function returns merged, trimmed reads for use by Deblur\n' + '\nThe taxa list and variable for it:\n' + '\nA = Alveolata\n' + '\nB = Bryophyta\n' + '\nC = Bacillariophyta\n' + '\nD = Amoebozoa\n' + '\nE = Euglenozoa\n' + '\nF = Fungi\n' + '\nG = Chlorophyta (green algae)\n' + '\nH = Rhodophyta (red algae)\n' + '\nI = Phaeophyceae (brown algae)\n' + '\nL = Marchantiophyta (liverworts)\n' + '\nM = Metazoa\n' + '\nO = Oomycota\n' + '\nP = Haptophyceae (prymnesiophytes)\n' + '\nQ = Raphidophyceae\n' + '\nR = Rhizaria\n' + '\nS = Synurophyceae\n' + '\nT = Tracheophyta (higher plants)\n' + '\nU = Eustigmatophyceae\n' + '\nALL = All' + +) + +plugin.methods.register_function( + function=trim_pair_output_unmerged, + inputs={'per_sample_sequences': SampleData[PairedEndSequencesWithQuality]}, + parameters={'region': Str % Choices(['ITS2', 'ITS1', 'ALL']), + 'taxa': Str % Choices(taxaList), + 'threads': Int, + 'reversed_primers': Bool, + 'allow_staggered_reads': Bool, + 'cluster_id': Float % Range(0.995, 1.0, inclusive_start=True, inclusive_end=True)}, + outputs=[('trimmed', SampleData[PairedEndSequencesWithQuality])], + input_descriptions={'per_sample_sequences': 'The artifact that contains the sequence file(s). ' + 'Only Paired can be used. 
' + 'Two files sequences in the qza data folder'}, + parameter_descriptions={ + 'region': ('\nThe regions ITS2, ITS1, and ALL that can be selected from.'), + 'taxa': ('\nThe selected taxonomic group sequenced that can be selected from.'), + 'threads': ('\nThe number of processor threads to use in the run.'), + 'cluster_id': ('\nThe percent identity for clustering reads, set to 1 for exact dereplication.'), + 'allow_staggered_reads': ('\nAllows merging of staggered reads.'), + 'reversed_primers': ('\n Primers are in reverse orientation as in Taylor et al. 2016, DOI:10.1128/AEM.02576-16.') + }, + output_descriptions={'trimmed': 'The resulting trimmed sequences from ITSxpress'}, + name='Trim paired-end reads, output unmerged reads for use with Dada2', + description='ITSxpress trimPairUnmerged takes the qza type \n' + 'PairedEndSquencesWithQuality. The qza\n' + 'format must be the SingleLanePerSamplePairedEndFastqDirFmt\n' + 'and only contain two fastq files.\n' + 'The function returns unmerged, trimmed reads for use by Dada2.\n' + '\nThe taxa list and variable for it:\n' + '\nA = Alveolata\n' + '\nB = Bryophyta\n' + '\nC = Bacillariophyta\n' + '\nD = Amoebozoa\n' + '\nE = Euglenozoa\n' + '\nF = Fungi\n' + '\nG = Chlorophyta (green algae)\n' + '\nH = Rhodophyta (red algae)\n' + '\nI = Phaeophyceae (brown algae)\n' + '\nL = Marchantiophyta (liverworts)\n' + '\nM = Metazoa\n' + '\nO = Oomycota\n' + '\nP = Haptophyceae (prymnesiophytes)\n' + '\nQ = Raphidophyceae\n' + '\nR = Rhizaria\n' + '\nS = Synurophyceae\n' + '\nT = Tracheophyta (higher plants)\n' + '\nU = Eustigmatophyceae\n' + '\nALL = All' + +) diff --git a/itsxpress/q2_itsxpress.py b/itsxpress/q2_itsxpress.py new file mode 100644 index 0000000..26dd3b0 --- /dev/null +++ b/itsxpress/q2_itsxpress.py @@ -0,0 +1,255 @@ +#!/usr/bin/env python +"""A python module to integrate ITSxpress into QIIME for the trimming of amplicon sequences. 
+ +Authors: Adam Rivers, Kyle Weber, Sveinn Einarsson USDA Agricultural Research Service + +The internally transcribed spacer region is a region between the highly conserved small +subunit (SSU) of rRNA and the large subunit (LSU) of the rRNA. The eukaryotic ITS contains +the 5.8s gene and two variable length spacer regions. In amplicon sequencing studies it is +common practice to trim off the conserved (SSU, 5,8S or LSU) regions. Bengtsson-Palme +et al. (2013) published software the software package ITSx to do this. + +ITSxpress is a high-speed implementation of the methods in ITSx than also allows FASTQ +files to be processed. Processing FASTQ files Which is essential for analyzing +sequences using the newer exact Sequence Variant methods in Qiime2, Dada2, Deblur +and Unoise that are replacing OTU clustering. + +""" +import os +import pathlib +import shutil +import math + +import pandas as pd +from q2_types.per_sample_sequences import (SingleLanePerSamplePairedEndFastqDirFmt, + SingleLanePerSampleSingleEndFastqDirFmt, + CasavaOneEightSingleLanePerSampleDirFmt) +from itsxpress import main as itsxpress +from itsxpress.definitions import (taxa_dict, + ROOT_DIR) + +default_cluster_id=0.995 + + +def _set_fastqs_and_check(fastq: str, + fastq2: str, + sample_id: str, + single_end: bool, + reversed_primers: bool, + allow_staggered_reads: bool, + threads: int) -> object: + """Checks and writes the fastqs as well as if they are paired end and single end. + + Args: + fastq (str): The path to the forward reads file. + fastq2 (str): The path to the reverse reads file. + sample_id (str): The Sample ID. + single_end (bool): If the sequences are singled ended or not + threads (int): The amount of threads to use + + Returns: + (object): The sobj object + + Raises: + ValueError1: for FASTQ format issue. 
+ + """ + # checking fastqs + try: + itsxpress._check_fastqs(fastq=fastq, fastq2=fastq2) + # Parse input types + paired_end = itsxpress._is_paired(fastq=fastq, + fastq2=fastq2, + single_end=single_end) + except (NotADirectoryError, + FileNotFoundError): + + raise ValueError("There is a problem with the fastq file(s) you selected") + # Create SeqSample objects and merge if needed. + + + if paired_end: + sobj = itsxpress.SeqSamplePairedNotInterleaved(fastq=fastq, + fastq2=fastq2, + tempdir=None, + reversed_primers=reversed_primers) + sobj._merge_reads(threads=threads,stagger=allow_staggered_reads) + return sobj + + elif not paired_end: + sobj = itsxpress.SeqSampleNotPaired(fastq=fastq, + tempdir=None) + return sobj + + +def _taxa_prefix_to_taxa(taxa_prefix: str) -> str: + """Turns the taxa prefix letter into the taxa + + Args: + taxa_prefix (str): The taxa prefix that will be converted to taxa. + + Returns: + (str): The Taxa + + """ + taxa_dic = {"A": "Alveolata", "B": "Bryophyta", "C": "Bacillariophyta", "D": "Amoebozoa", "E": "Euglenozoa", + "F": "Fungi", "G": "Chlorophyta", "H": "Rhodophyta", "I": "Phaeophyceae", "L": "Marchantiophyta", + "M": "Metazoa", "O": "Oomycota", "P": "Haptophyceae", "Q": "Raphidophyceae", "R": "Rhizaria", + "S": "Synurophyceae", "T": "Tracheophyta", "U": "Eustigmatophyceae", "ALL": "All"} + taxa_choice = taxa_dic[taxa_prefix] + return taxa_choice + + +# First command Trim for SingleLanePerSampleSingleEndFastqDirFmt +def trim_single(per_sample_sequences: SingleLanePerSampleSingleEndFastqDirFmt, + region: str, + taxa: str = "F", + threads: int = 1, + cluster_id: float = default_cluster_id) -> CasavaOneEightSingleLanePerSampleDirFmt: + results = main(per_sample_sequences=per_sample_sequences, + threads=threads, + taxa=taxa, + region=region, + paired_in=False, + paired_out=False, + reversed_primers=False, + allow_staggered_reads = False, + cluster_id=cluster_id) + return results + + +# Second command Trim for 
SingleLanePerSamplePairedEndFastqDirFmt +def trim_pair(per_sample_sequences: SingleLanePerSamplePairedEndFastqDirFmt, + region: str, + taxa: str = "F", + threads: int = 1, + reversed_primers: bool = False, + allow_staggered_reads: bool = True, + cluster_id: float = default_cluster_id) -> CasavaOneEightSingleLanePerSampleDirFmt: + results = main(per_sample_sequences=per_sample_sequences, + threads=threads, + taxa=taxa, + region=region, + paired_in=True, + paired_out=False, + reversed_primers=reversed_primers, + allow_staggered_reads = allow_staggered_reads, + cluster_id=cluster_id) + return results + +# Second command Trim for SingleLanePerSamplePairedEndFastqDirFmt +def trim_pair_output_unmerged(per_sample_sequences: SingleLanePerSamplePairedEndFastqDirFmt, + region: str, + taxa: str = "F", + threads: int = 1, + reversed_primers: bool = False, + allow_staggered_reads: bool = True, + cluster_id: float = default_cluster_id) -> CasavaOneEightSingleLanePerSampleDirFmt: + results = main(per_sample_sequences=per_sample_sequences, + threads=threads, + taxa=taxa, + region=region, + paired_in=True, + paired_out=True, + reversed_primers=reversed_primers, + allow_staggered_reads = allow_staggered_reads, + cluster_id=cluster_id) + return results +# The ITSxpress handling +def main(per_sample_sequences, + threads: int, + taxa: str, + region: str, + paired_in: bool, + paired_out: bool, + reversed_primers: bool, + allow_staggered_reads: bool, + cluster_id: float) -> CasavaOneEightSingleLanePerSampleDirFmt: + """The main communication between the plugin and the ITSxpress program. + + Args: + per_sample_sequences (SingleLanePerSampleSingleEndFastqDirFmt or SingleLanePerSamplePairedEndFastqDirFmt): + the input sequences. + threads (int) : The number of threads to use. + taxa (str): The taxa to be used for the search. + region (str) : The region to be used for the search. + paired_in (bool): Declares if input files are paired. 
+ paired_out (bool): Declares if output files should be paired. + allow_staggered_reads (bool): Allows merging of staggered reads. Default True. + cluster_id (float):The percent identity for clustering reads, set to 1 for exact dereplication. + + Returns: + (CasavaOneEightSingleLanePerSampleDirFmt): A catch-all output type for + both single and paired-end reads. + + Raises: + ValueError1: hmmsearch error. + + """ + # Setting the taxa + taxa = _taxa_prefix_to_taxa(taxa) + samples = per_sample_sequences.manifest.view(pd.DataFrame) + # Creating result dir + results = CasavaOneEightSingleLanePerSampleDirFmt() + # Running the for loop for each sample + for sample in samples.itertuples(): + # writing fastqs and their attributes and checking the files + sobj = _set_fastqs_and_check( + fastq=sample.forward, + fastq2=sample.reverse if paired_in else None, + sample_id=sample.Index, + single_end=False if paired_in else True, + reversed_primers=reversed_primers, + allow_staggered_reads=allow_staggered_reads, + threads=threads) + # Deduplicate + if math.isclose(cluster_id, 1,rel_tol=1e-05): + sobj.deduplicate(threads=threads) + else: + sobj.cluster(threads=threads, cluster_id=cluster_id) + try: + # HMMSearch for ITS regions + hmmfile = os.path.join(ROOT_DIR, "ITSx_db", "HMMs", taxa_dict[taxa]) + sobj._search(hmmfile=hmmfile, threads=threads) + except (ModuleNotFoundError, + FileNotFoundError, + NotADirectoryError): + + raise ValueError("hmmsearch was not found, make sure HMMER3 is installed and executable") + + # Parse HMMseach output. + its_pos = itsxpress.ItsPosition(domtable=sobj.dom_file, + region=region) + # Create deduplication object. + dedup_obj = itsxpress.Dedup(uc_file=sobj.uc_file, + rep_file=sobj.rep_file, + seq_file=sobj.seq_file, + fastq=sobj.r1, + fastq2=sobj.fastq2) + + # Copy the original filename, that way we preserve all filename fields. + out_path_fwd = os.path.join(str(results), + pathlib.Path(sample.forward).name) + + # Create trimmed sequences. 
+ if paired_out: + # Copy the original filename, that way we preserve all filename fields. + out_path_rev = os.path.join(str(results), + pathlib.Path(sample.reverse).name) + dedup_obj.create_paired_trimmed_seqs(out_path_fwd, + out_path_rev, + gzipped=True, + zstd_file=False, + itspos=its_pos, + wri_file=True,) + else: + dedup_obj.create_trimmed_seqs(out_path_fwd, + gzipped=True, + zstd_file=False, + itspos=its_pos, + wri_file=True, + tempdir=sobj.tempdir) + # Deleting the temp files. + shutil.rmtree(sobj.tempdir) + # Writing out the results. + return results diff --git a/meta.yaml b/meta.yaml index 9f63ef1..5c7b8d3 100644 --- a/meta.yaml +++ b/meta.yaml @@ -1,20 +1,15 @@ -{% set name = "itsxpress" %} -{% set version = "1.8.1" %} -{% set file_ext = "tar.gz" %} -{% set hash_type = "sha256" %} -{% set hash_value = "d93e70f0a7f0c9f7e62892f3b3ff261d4d6bacb87010d167c983de25f0005ec9" %} - -package: - name: '{{ name|lower }}' - version: '{{ version }}' +package: + name: '{{ "itsxpress"|lower }}' + version: '{{ "2.0.0" }}' +# source source: - url: https://files.pythonhosted.org/packages/source/{{ name[0] }}/{{ name }}/{{ name }}-{{ version }}.{{ file_ext }} - '{{ hash_type }}': '{{ hash_value }}' + git_url: https://github.com/USDA-ARS-GBRU/itsxpress.git + git_tag: version2 build: noarch: python - number: 2 + number: 0 entry_points: - itsxpress=itsxpress.main:main script: python -m pip install --no-deps --ignore-installed . 
@@ -22,14 +17,14 @@ build: requirements: host: - pip - - python >=3.8 + - python =3.8.16 run: - pip - - python >=3.8 + - python =3.8.16 - biopython >=1.79 - - bioconda::hmmer=3.1b2 - - bioconda::bbmap=38.96 - - bioconda::vsearch=2.21.1 + - bioconda::hmmer =3.1b2 + - bioconda::vsearch =2.22.1 + - pyzstd test: imports: @@ -48,6 +43,7 @@ about: extra: recipe-maintainers: - arivers + - sveinn.einarsson identifiers: - biotools:ITSxpress - doi:10.5281/zenodo.1304349 diff --git a/pyproject.toml b/pyproject.toml index 2e73405..8a6788e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,10 +2,10 @@ requires = ["setuptools"] build-backend = "setuptools.build_meta" - [project] name = "itsxpress" -version = "1.8.1" +version = "2.0.0" + description = "Rapidly trim sequences down to their Internally Transcribed Spacer ITS regions" authors = [ {name = "Adam R. Rivers", email = "adam.rivers@usda.gov"}, @@ -16,23 +16,15 @@ maintainers = [ license = {file = "LICENSE.txt"} readme = "README.rst" keywords = ["Amplicon", "sequencing", "fungal", "ITS"] -requires-python = ">=3.7" +requires-python = ">=3.5" dependencies = [ "biopython >=1.79", "pyzstd >=0.15.3", + "pandas", ] -classifiers = [ - "Topic :: Scientific/Engineering :: Bio-Informatics", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Development Status :: 7 - Inactive" -] - - - [project.urls] repository = "http://github.com/usda-ars-gbru/itsxpress" @@ -41,6 +33,11 @@ tests = [ "pytest" ] +[tool.setuptools.package-data] +itsxpress = ["*.bib"] [project.scripts] -itsxpress="itsxpress.main:main" \ No newline at end of file +itsxpress="itsxpress.main:main" + +[project.entry-points."qiime2.plugins"] +itsxpress="itsxpress.plugin_setup:plugin" diff --git a/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/VERSION b/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/VERSION new file mode 100644 index 0000000..81f4084 --- /dev/null +++ 
b/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/VERSION @@ -0,0 +1,3 @@ +QIIME 2 +archive: 4 +framework: 2018.6.0 diff --git a/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/data/4774-1-MSITS3_0_L001_R1_001.fastq.gz b/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/data/4774-1-MSITS3_0_L001_R1_001.fastq.gz new file mode 100644 index 0000000..add64b1 Binary files /dev/null and b/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/data/4774-1-MSITS3_0_L001_R1_001.fastq.gz differ diff --git a/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/data/4774-1-MSITS3_1_L001_R2_001.fastq.gz b/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/data/4774-1-MSITS3_1_L001_R2_001.fastq.gz new file mode 100644 index 0000000..b7df534 Binary files /dev/null and b/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/data/4774-1-MSITS3_1_L001_R2_001.fastq.gz differ diff --git a/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/data/MANIFEST b/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/data/MANIFEST new file mode 100644 index 0000000..4866944 --- /dev/null +++ b/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/data/MANIFEST @@ -0,0 +1,3 @@ +sample-id,filename,direction +4774-1-MSITS3,4774-1-MSITS3_0_L001_R1_001.fastq.gz,forward +4774-1-MSITS3,4774-1-MSITS3_1_L001_R2_001.fastq.gz,reverse diff --git a/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/data/metadata.yml b/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/data/metadata.yml new file mode 100644 index 0000000..8ffe2a9 --- /dev/null +++ b/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/data/metadata.yml @@ -0,0 +1 @@ +{phred-offset: 33} diff --git a/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/metadata.yaml b/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/metadata.yaml new file mode 100644 index 0000000..2bc1526 --- /dev/null +++ b/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/metadata.yaml @@ 
-0,0 +1,3 @@ +uuid: d9955749-00d5-44ae-a628-4b2da43000e1 +type: SampleData[SequencesWithQuality] +format: SingleLanePerSampleSingleEndFastqDirFmt diff --git a/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/provenance/VERSION b/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/provenance/VERSION new file mode 100644 index 0000000..81f4084 --- /dev/null +++ b/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/provenance/VERSION @@ -0,0 +1,3 @@ +QIIME 2 +archive: 4 +framework: 2018.6.0 diff --git a/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/provenance/action/action.yaml b/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/provenance/action/action.yaml new file mode 100644 index 0000000..b06d080 --- /dev/null +++ b/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/provenance/action/action.yaml @@ -0,0 +1,171 @@ +execution: + uuid: 8c3ffeb7-0739-4fc3-85e2-0c971ba82bff + runtime: + start: 2018-06-29T22:43:06.685342-04:00 + end: 2018-06-29T22:43:10.624569-04:00 + duration: 3 seconds, and 939227 microseconds + +action: + type: method + plugin: !ref 'environment:plugins:itsxpress' + action: trim_pair + inputs: + - per_sample_sequences: 445cf54a-bf06-4852-8010-13a60fa1598c + parameters: + - region: ITS2 + - taxa: F + - threads: 1 + output-name: trimmed + +transformers: + inputs: + per_sample_sequences: + - from: SingleLanePerSamplePairedEndFastqDirFmt + to: SingleLanePerSamplePairedEndFastqDirFmt + output: + - from: SingleLanePerSampleSingleEndFastqDirFmt + to: SingleLanePerSampleSingleEndFastqDirFmt + +environment: + platform: linux-x86_64 + python: |- + 3.5.5 | packaged by conda-forge | (default, Apr 6 2018, 13:41:05) + [GCC 4.8.2 20140120 (Red Hat 4.8.2-15)] + framework: + version: 2018.6.0 + website: https://qiime2.org + citations: + - !cite 'framework|qiime2:2018.6.0|0' + plugins: + types: + version: 2018.6.0 + website: https://github.com/qiime2/q2-types + itsxpress: + version: '1.4' + website: 
'https://github.com/kweber1/q2_itsxpress ITSxpress: + https://github.com/USDA-ARS-GBRU/itsxpress' + python-packages: + widgetsnbextension: 3.2.1 + wheel: 0.31.0 + webencodings: '0.5' + wcwidth: 0.1.7 + urllib3: '1.22' + unifrac: 0.9.2 + traitlets: 4.3.2 + tornado: 5.0.2 + testpath: 0.3.1 + terminado: 0.8.1 + statsmodels: 0.9.0 + six: 1.11.0 + simplegeneric: 0.8.1 + setuptools: 39.2.0 + Send2Trash: 1.5.0 + seaborn: 0.8.1 + scipy: 0.19.1 + scikit-learn: 0.19.1 + scikit-bio: 0.5.1 + requests: 2.19.1 + qiime2: 2018.6.0 + q2templates: 2018.6.0 + q2cli: 2018.6.0 + q2-vsearch: 2018.6.0 + q2-types: 2018.6.0 + q2-taxa: 2018.6.0 + q2-sample-classifier: 2018.6.0 + q2-quality-filter: 2018.6.0 + q2-quality-control: 2018.6.0 + q2-phylogeny: 2018.6.0 + q2-metadata: 2018.6.0 + q2-longitudinal: 2018.6.0 + q2-gneiss: 2018.6.0 + q2-feature-table: 2018.6.0 + q2-feature-classifier: 2018.6.0 + q2-emperor: 2018.6.0 + q2-diversity: 2018.6.0 + q2-demux: 2018.6.0 + q2-deblur: 2018.6.0 + q2-dada2: 2018.6.0 + q2-cutadapt: 2018.6.0 + q2-composition: 2018.6.0 + q2-alignment: 2018.6.0 + pyzmq: 17.0.0 + PyYAML: '3.12' + pytz: '2018.4' + python-dateutil: 2.7.3 + PySocks: 1.6.8 + pyparsing: 2.2.0 + pyOpenSSL: 18.0.0 + Pygments: 2.2.0 + pyflakes: 1.6.0 + pycparser: '2.18' + pycodestyle: 2.3.1 + ptyprocess: 0.5.2 + psutil: 5.4.6 + prompt-toolkit: 1.0.15 + pip: 9.0.3 + pickleshare: 0.7.4 + pexpect: 4.6.0 + patsy: 0.5.0 + parso: 0.2.1 + pandocfilters: 1.4.2 + pandas: 0.23.1 + packaging: '17.1' + numpy: 1.14.5 + notebook: 5.5.0 + nose: 1.3.7 + nbformat: 4.4.0 + nbconvert: 5.3.1 + natsort: 5.3.2 + msgpack: 0.5.6 + mkl-random: 1.0.1 + mkl-fft: 1.0.0 + mistune: 0.8.3 + mccabe: 0.6.1 + matplotlib: 2.2.2 + MarkupSafe: '1.0' + lockfile: 0.12.2 + kiwisolver: 1.0.1 + jupyter-core: 4.4.0 + jupyter-client: 5.2.3 + jsonschema: 2.6.0 + Jinja2: '2.10' + jedi: 0.12.0 + itsxpress: 1.5.3 + ipywidgets: 7.2.1 + ipython: 6.4.0 + ipython-genutils: 0.2.0 + ipykernel: 4.8.2 + idna: '2.7' + html5lib: 1.0.1 + h5py: 2.7.0 + 
future: 0.16.0 + flake8: 3.5.0 + fastcluster: 1.1.25 + entrypoints: 0.2.3 + emperor: 1.0.0b16 + docopt: 0.6.2 + decorator: 4.3.0 + deblur: 1.0.4 + Cython: 0.28.3 + cutadapt: '1.16' + cryptography: 2.2.1 + coveralls: 1.3.0 + coverage: 4.5.1 + click: '6.7' + chardet: 3.0.4 + cffi: 1.11.5 + certifi: 2018.4.16 + CacheControl: 0.12.4 + bokeh: 0.12.16 + bleach: 2.1.3 + biopython: '1.71' + bibtexparser: 1.0.1 + backcall: 0.1.0 + asn1crypto: 0.24.0 + biom-format: 2.1.6 + cycler: 0.10.0 + gneiss: 0.4.4 + q2-itsxpress: '1.4' + ijson: '2.3' + tzlocal: '1.3' + xopen: 0.3.2 diff --git a/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/provenance/artifacts/445cf54a-bf06-4852-8010-13a60fa1598c/VERSION b/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/provenance/artifacts/445cf54a-bf06-4852-8010-13a60fa1598c/VERSION new file mode 100644 index 0000000..17e0eb8 --- /dev/null +++ b/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/provenance/artifacts/445cf54a-bf06-4852-8010-13a60fa1598c/VERSION @@ -0,0 +1,3 @@ +QIIME 2 +archive: 4 +framework: 2018.4.0 diff --git a/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/provenance/artifacts/445cf54a-bf06-4852-8010-13a60fa1598c/action/action.yaml b/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/provenance/artifacts/445cf54a-bf06-4852-8010-13a60fa1598c/action/action.yaml new file mode 100644 index 0000000..fec3d36 --- /dev/null +++ b/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/provenance/artifacts/445cf54a-bf06-4852-8010-13a60fa1598c/action/action.yaml @@ -0,0 +1,150 @@ +execution: + uuid: db1f6056-6580-4d04-b930-a5dcc8c159ba + runtime: + start: 2018-06-23T12:59:59.178077-04:00 + end: 2018-06-23T12:59:59.297050-04:00 + duration: 118973 microseconds + +action: + type: import + format: PairedEndFastqManifestPhred33 + manifest: + - name: MANIFEST.csv + md5sum: 6c50df3891f0022644fd5345c27f0e73 + +transformers: + output: + - from: PairedEndFastqManifestPhred33 + to: 
SingleLanePerSamplePairedEndFastqDirFmt + plugin: !ref 'environment:plugins:types' + +environment: + platform: linux-x86_64 + python: |- + 3.5.5 | packaged by conda-forge | (default, Apr 6 2018, 13:41:05) + [GCC 4.8.2 20140120 (Red Hat 4.8.2-15)] + framework: + version: 2018.4.0 + website: https://qiime2.org + citations: + - !cite 'framework|qiime2:2018.4.0|0' + plugins: + types: + version: 2018.4.0 + website: https://github.com/qiime2/q2-types + python-packages: + widgetsnbextension: 3.2.1 + wheel: 0.31.0 + webencodings: '0.5' + wcwidth: 0.1.7 + urllib3: '1.22' + unifrac: 0.9.2 + traitlets: 4.3.2 + tornado: 5.0.2 + testpath: 0.3.1 + terminado: 0.8.1 + statsmodels: 0.8.0 + six: 1.11.0 + simplegeneric: 0.8.1 + setuptools: 39.0.1 + Send2Trash: 1.5.0 + seaborn: 0.8.1 + scipy: 0.19.1 + scikit-learn: 0.19.1 + scikit-bio: 0.5.1 + requests: 2.18.4 + qiime2: 2018.4.0 + q2templates: 2018.4.0 + q2cli: 2018.4.0 + q2-vsearch: 2018.4.0 + q2-types: 2018.4.0 + q2-taxa: 2018.4.0 + q2-sample-classifier: 2018.4.0 + q2-quality-filter: 2018.4.0 + q2-quality-control: 2018.4.0 + q2-phylogeny: 2018.4.0 + q2-metadata: 2018.4.0 + q2-longitudinal: 2018.4.0 + q2-gneiss: 2018.4.0 + q2-feature-table: 2018.4.0 + q2-feature-classifier: 2018.4.0 + q2-emperor: 2018.4.0 + q2-diversity: 2018.4.0 + q2-demux: 2018.4.0 + q2-deblur: 2018.4.0 + q2-dada2: 2018.4.0 + q2-cutadapt: 2018.4.0 + q2-composition: 2018.4.0 + q2-alignment: 2018.4.0 + pyzmq: 17.0.0 + PyYAML: '3.12' + pytz: '2018.4' + python-dateutil: 2.7.2 + PySocks: 1.6.8 + pyparsing: 2.2.0 + pyOpenSSL: 17.5.0 + Pygments: 2.2.0 + pycparser: '2.18' + ptyprocess: 0.5.2 + psutil: 5.4.5 + prompt-toolkit: 1.0.15 + pip: 9.0.3 + pickleshare: 0.7.4 + pexpect: 4.5.0 + patsy: 0.5.0 + parso: 0.2.0 + pandocfilters: 1.4.2 + pandas: 0.22.0 + packaging: '17.1' + numpy: 1.12.1 + notebook: 5.4.1 + nose: 1.3.7 + nbformat: 4.4.0 + nbconvert: 5.3.1 + natsort: 5.3.0 + msgpack: 0.5.6 + mistune: 0.8.3 + matplotlib: 2.2.2 + MarkupSafe: '1.0' + lockfile: 0.12.2 + 
kiwisolver: 1.0.1 + jupyter-core: 4.4.0 + jupyter-client: 5.2.3 + jsonschema: 2.6.0 + Jinja2: '2.10' + jedi: 0.12.0 + itsxpress: 1.5.3 + ipywidgets: 7.2.1 + ipython: 6.3.1 + ipython-genutils: 0.2.0 + ipykernel: 4.8.2 + idna: '2.6' + html5lib: 1.0.1 + h5py: 2.7.0 + future: 0.16.0 + fastcluster: 1.1.24 + entrypoints: 0.2.3 + emperor: 1.0.0b16 + decorator: 4.3.0 + deblur: 1.0.4 + Cython: 0.28.2 + cutadapt: '1.16' + cryptography: 2.2.1 + click: '6.7' + chardet: 3.0.4 + cffi: 1.11.5 + certifi: 2018.4.16 + CacheControl: 0.12.4 + bokeh: 0.12.15 + bleach: 2.1.3 + biopython: '1.71' + bibtexparser: 1.0.1 + backcall: 0.1.0 + asn1crypto: 0.24.0 + biom-format: 2.1.6 + cycler: 0.10.0 + itsxpressqiime2: '1.0' + gneiss: 0.4.2 + ijson: '2.3' + tzlocal: '1.3' + xopen: 0.3.2 diff --git a/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/provenance/artifacts/445cf54a-bf06-4852-8010-13a60fa1598c/citations.bib b/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/provenance/artifacts/445cf54a-bf06-4852-8010-13a60fa1598c/citations.bib new file mode 100644 index 0000000..4bea5fa --- /dev/null +++ b/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/provenance/artifacts/445cf54a-bf06-4852-8010-13a60fa1598c/citations.bib @@ -0,0 +1,12 @@ +@article{framework|qiime2:2018.4.0|0, + author = {Caporaso, J Gregory and Kuczynski, Justin and Stombaugh, Jesse and Bittinger, Kyle and Bushman, Frederic D and Costello, Elizabeth K and Fierer, Noah and Peña, Antonio Gonzalez and Goodrich, Julia K and Gordon, Jeffrey I and Huttley, Gavin A and Kelley, Scott T and Knights, Dan and Koenig, Jeremy E and Ley, Ruth E and Lozupone, Catherine A and McDonald, Daniel and Muegge, Brian D and Pirrung, Meg and Reeder, Jens and Sevinsky, Joel R and Turnbaugh, Peter J and Walters, William A and Widmann, Jeremy and Yatsunenko, Tanya and Zaneveld, Jesse and Knight, Rob}, + doi = {10.1038/nmeth.f.303}, + journal = {Nature methods}, + number = {5}, + pages = {335}, + publisher = {Nature Publishing Group}, 
+ title = {QIIME allows analysis of high-throughput community sequencing data}, + volume = {7}, + year = {2010} +} + diff --git a/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/provenance/artifacts/445cf54a-bf06-4852-8010-13a60fa1598c/metadata.yaml b/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/provenance/artifacts/445cf54a-bf06-4852-8010-13a60fa1598c/metadata.yaml new file mode 100644 index 0000000..fcd0b03 --- /dev/null +++ b/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/provenance/artifacts/445cf54a-bf06-4852-8010-13a60fa1598c/metadata.yaml @@ -0,0 +1,3 @@ +uuid: 445cf54a-bf06-4852-8010-13a60fa1598c +type: SampleData[PairedEndSequencesWithQuality] +format: SingleLanePerSamplePairedEndFastqDirFmt diff --git a/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/provenance/citations.bib b/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/provenance/citations.bib new file mode 100644 index 0000000..fd4d2db --- /dev/null +++ b/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/provenance/citations.bib @@ -0,0 +1,12 @@ +@article{framework|qiime2:2018.6.0|0, + author = {Caporaso, J Gregory and Kuczynski, Justin and Stombaugh, Jesse and Bittinger, Kyle and Bushman, Frederic D and Costello, Elizabeth K and Fierer, Noah and Peña, Antonio Gonzalez and Goodrich, Julia K and Gordon, Jeffrey I and Huttley, Gavin A and Kelley, Scott T and Knights, Dan and Koenig, Jeremy E and Ley, Ruth E and Lozupone, Catherine A and McDonald, Daniel and Muegge, Brian D and Pirrung, Meg and Reeder, Jens and Sevinsky, Joel R and Turnbaugh, Peter J and Walters, William A and Widmann, Jeremy and Yatsunenko, Tanya and Zaneveld, Jesse and Knight, Rob}, + doi = {10.1038/nmeth.f.303}, + journal = {Nature methods}, + number = {5}, + pages = {335}, + publisher = {Nature Publishing Group}, + title = {QIIME allows analysis of high-throughput community sequencing data}, + volume = {7}, + year = {2010} +} + diff --git 
a/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/provenance/metadata.yaml b/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/provenance/metadata.yaml new file mode 100644 index 0000000..2bc1526 --- /dev/null +++ b/tests/test_data/out/d9955749-00d5-44ae-a628-4b2da43000e1/provenance/metadata.yaml @@ -0,0 +1,3 @@ +uuid: d9955749-00d5-44ae-a628-4b2da43000e1 +type: SampleData[SequencesWithQuality] +format: SingleLanePerSampleSingleEndFastqDirFmt diff --git a/tests/test_data/paired.qza b/tests/test_data/paired.qza new file mode 100644 index 0000000..6549d34 Binary files /dev/null and b/tests/test_data/paired.qza differ diff --git a/tests/test_data/paired/445cf54a-bf06-4852-8010-13a60fa1598c/VERSION b/tests/test_data/paired/445cf54a-bf06-4852-8010-13a60fa1598c/VERSION new file mode 100644 index 0000000..17e0eb8 --- /dev/null +++ b/tests/test_data/paired/445cf54a-bf06-4852-8010-13a60fa1598c/VERSION @@ -0,0 +1,3 @@ +QIIME 2 +archive: 4 +framework: 2018.4.0 diff --git a/tests/test_data/paired/445cf54a-bf06-4852-8010-13a60fa1598c/data/4774-1-MSITS3_0_L001_R1_001.fastq.gz b/tests/test_data/paired/445cf54a-bf06-4852-8010-13a60fa1598c/data/4774-1-MSITS3_0_L001_R1_001.fastq.gz new file mode 100644 index 0000000..add64b1 Binary files /dev/null and b/tests/test_data/paired/445cf54a-bf06-4852-8010-13a60fa1598c/data/4774-1-MSITS3_0_L001_R1_001.fastq.gz differ diff --git a/tests/test_data/paired/445cf54a-bf06-4852-8010-13a60fa1598c/data/4774-1-MSITS3_1_L001_R2_001.fastq.gz b/tests/test_data/paired/445cf54a-bf06-4852-8010-13a60fa1598c/data/4774-1-MSITS3_1_L001_R2_001.fastq.gz new file mode 100644 index 0000000..b7df534 Binary files /dev/null and b/tests/test_data/paired/445cf54a-bf06-4852-8010-13a60fa1598c/data/4774-1-MSITS3_1_L001_R2_001.fastq.gz differ diff --git a/tests/test_data/paired/445cf54a-bf06-4852-8010-13a60fa1598c/data/MANIFEST b/tests/test_data/paired/445cf54a-bf06-4852-8010-13a60fa1598c/data/MANIFEST new file mode 100644 index 0000000..4866944 --- 
/dev/null +++ b/tests/test_data/paired/445cf54a-bf06-4852-8010-13a60fa1598c/data/MANIFEST @@ -0,0 +1,3 @@ +sample-id,filename,direction +4774-1-MSITS3,4774-1-MSITS3_0_L001_R1_001.fastq.gz,forward +4774-1-MSITS3,4774-1-MSITS3_1_L001_R2_001.fastq.gz,reverse diff --git a/tests/test_data/paired/445cf54a-bf06-4852-8010-13a60fa1598c/data/metadata.yml b/tests/test_data/paired/445cf54a-bf06-4852-8010-13a60fa1598c/data/metadata.yml new file mode 100644 index 0000000..8ffe2a9 --- /dev/null +++ b/tests/test_data/paired/445cf54a-bf06-4852-8010-13a60fa1598c/data/metadata.yml @@ -0,0 +1 @@ +{phred-offset: 33} diff --git a/tests/test_data/paired/445cf54a-bf06-4852-8010-13a60fa1598c/metadata.yaml b/tests/test_data/paired/445cf54a-bf06-4852-8010-13a60fa1598c/metadata.yaml new file mode 100644 index 0000000..fcd0b03 --- /dev/null +++ b/tests/test_data/paired/445cf54a-bf06-4852-8010-13a60fa1598c/metadata.yaml @@ -0,0 +1,3 @@ +uuid: 445cf54a-bf06-4852-8010-13a60fa1598c +type: SampleData[PairedEndSequencesWithQuality] +format: SingleLanePerSamplePairedEndFastqDirFmt diff --git a/tests/test_data/paired/445cf54a-bf06-4852-8010-13a60fa1598c/provenance/VERSION b/tests/test_data/paired/445cf54a-bf06-4852-8010-13a60fa1598c/provenance/VERSION new file mode 100644 index 0000000..17e0eb8 --- /dev/null +++ b/tests/test_data/paired/445cf54a-bf06-4852-8010-13a60fa1598c/provenance/VERSION @@ -0,0 +1,3 @@ +QIIME 2 +archive: 4 +framework: 2018.4.0 diff --git a/tests/test_data/paired/445cf54a-bf06-4852-8010-13a60fa1598c/provenance/action/action.yaml b/tests/test_data/paired/445cf54a-bf06-4852-8010-13a60fa1598c/provenance/action/action.yaml new file mode 100644 index 0000000..fec3d36 --- /dev/null +++ b/tests/test_data/paired/445cf54a-bf06-4852-8010-13a60fa1598c/provenance/action/action.yaml @@ -0,0 +1,150 @@ +execution: + uuid: db1f6056-6580-4d04-b930-a5dcc8c159ba + runtime: + start: 2018-06-23T12:59:59.178077-04:00 + end: 2018-06-23T12:59:59.297050-04:00 + duration: 118973 microseconds + +action: + 
type: import + format: PairedEndFastqManifestPhred33 + manifest: + - name: MANIFEST.csv + md5sum: 6c50df3891f0022644fd5345c27f0e73 + +transformers: + output: + - from: PairedEndFastqManifestPhred33 + to: SingleLanePerSamplePairedEndFastqDirFmt + plugin: !ref 'environment:plugins:types' + +environment: + platform: linux-x86_64 + python: |- + 3.5.5 | packaged by conda-forge | (default, Apr 6 2018, 13:41:05) + [GCC 4.8.2 20140120 (Red Hat 4.8.2-15)] + framework: + version: 2018.4.0 + website: https://qiime2.org + citations: + - !cite 'framework|qiime2:2018.4.0|0' + plugins: + types: + version: 2018.4.0 + website: https://github.com/qiime2/q2-types + python-packages: + widgetsnbextension: 3.2.1 + wheel: 0.31.0 + webencodings: '0.5' + wcwidth: 0.1.7 + urllib3: '1.22' + unifrac: 0.9.2 + traitlets: 4.3.2 + tornado: 5.0.2 + testpath: 0.3.1 + terminado: 0.8.1 + statsmodels: 0.8.0 + six: 1.11.0 + simplegeneric: 0.8.1 + setuptools: 39.0.1 + Send2Trash: 1.5.0 + seaborn: 0.8.1 + scipy: 0.19.1 + scikit-learn: 0.19.1 + scikit-bio: 0.5.1 + requests: 2.18.4 + qiime2: 2018.4.0 + q2templates: 2018.4.0 + q2cli: 2018.4.0 + q2-vsearch: 2018.4.0 + q2-types: 2018.4.0 + q2-taxa: 2018.4.0 + q2-sample-classifier: 2018.4.0 + q2-quality-filter: 2018.4.0 + q2-quality-control: 2018.4.0 + q2-phylogeny: 2018.4.0 + q2-metadata: 2018.4.0 + q2-longitudinal: 2018.4.0 + q2-gneiss: 2018.4.0 + q2-feature-table: 2018.4.0 + q2-feature-classifier: 2018.4.0 + q2-emperor: 2018.4.0 + q2-diversity: 2018.4.0 + q2-demux: 2018.4.0 + q2-deblur: 2018.4.0 + q2-dada2: 2018.4.0 + q2-cutadapt: 2018.4.0 + q2-composition: 2018.4.0 + q2-alignment: 2018.4.0 + pyzmq: 17.0.0 + PyYAML: '3.12' + pytz: '2018.4' + python-dateutil: 2.7.2 + PySocks: 1.6.8 + pyparsing: 2.2.0 + pyOpenSSL: 17.5.0 + Pygments: 2.2.0 + pycparser: '2.18' + ptyprocess: 0.5.2 + psutil: 5.4.5 + prompt-toolkit: 1.0.15 + pip: 9.0.3 + pickleshare: 0.7.4 + pexpect: 4.5.0 + patsy: 0.5.0 + parso: 0.2.0 + pandocfilters: 1.4.2 + pandas: 0.22.0 + packaging: '17.1' + 
numpy: 1.12.1 + notebook: 5.4.1 + nose: 1.3.7 + nbformat: 4.4.0 + nbconvert: 5.3.1 + natsort: 5.3.0 + msgpack: 0.5.6 + mistune: 0.8.3 + matplotlib: 2.2.2 + MarkupSafe: '1.0' + lockfile: 0.12.2 + kiwisolver: 1.0.1 + jupyter-core: 4.4.0 + jupyter-client: 5.2.3 + jsonschema: 2.6.0 + Jinja2: '2.10' + jedi: 0.12.0 + itsxpress: 1.5.3 + ipywidgets: 7.2.1 + ipython: 6.3.1 + ipython-genutils: 0.2.0 + ipykernel: 4.8.2 + idna: '2.6' + html5lib: 1.0.1 + h5py: 2.7.0 + future: 0.16.0 + fastcluster: 1.1.24 + entrypoints: 0.2.3 + emperor: 1.0.0b16 + decorator: 4.3.0 + deblur: 1.0.4 + Cython: 0.28.2 + cutadapt: '1.16' + cryptography: 2.2.1 + click: '6.7' + chardet: 3.0.4 + cffi: 1.11.5 + certifi: 2018.4.16 + CacheControl: 0.12.4 + bokeh: 0.12.15 + bleach: 2.1.3 + biopython: '1.71' + bibtexparser: 1.0.1 + backcall: 0.1.0 + asn1crypto: 0.24.0 + biom-format: 2.1.6 + cycler: 0.10.0 + itsxpressqiime2: '1.0' + gneiss: 0.4.2 + ijson: '2.3' + tzlocal: '1.3' + xopen: 0.3.2 diff --git a/tests/test_data/paired/445cf54a-bf06-4852-8010-13a60fa1598c/provenance/citations.bib b/tests/test_data/paired/445cf54a-bf06-4852-8010-13a60fa1598c/provenance/citations.bib new file mode 100644 index 0000000..4bea5fa --- /dev/null +++ b/tests/test_data/paired/445cf54a-bf06-4852-8010-13a60fa1598c/provenance/citations.bib @@ -0,0 +1,12 @@ +@article{framework|qiime2:2018.4.0|0, + author = {Caporaso, J Gregory and Kuczynski, Justin and Stombaugh, Jesse and Bittinger, Kyle and Bushman, Frederic D and Costello, Elizabeth K and Fierer, Noah and Peña, Antonio Gonzalez and Goodrich, Julia K and Gordon, Jeffrey I and Huttley, Gavin A and Kelley, Scott T and Knights, Dan and Koenig, Jeremy E and Ley, Ruth E and Lozupone, Catherine A and McDonald, Daniel and Muegge, Brian D and Pirrung, Meg and Reeder, Jens and Sevinsky, Joel R and Turnbaugh, Peter J and Walters, William A and Widmann, Jeremy and Yatsunenko, Tanya and Zaneveld, Jesse and Knight, Rob}, + doi = {10.1038/nmeth.f.303}, + journal = {Nature methods}, + number = 
{5}, + pages = {335}, + publisher = {Nature Publishing Group}, + title = {QIIME allows analysis of high-throughput community sequencing data}, + volume = {7}, + year = {2010} +} + diff --git a/tests/test_data/paired/445cf54a-bf06-4852-8010-13a60fa1598c/provenance/metadata.yaml b/tests/test_data/paired/445cf54a-bf06-4852-8010-13a60fa1598c/provenance/metadata.yaml new file mode 100644 index 0000000..fcd0b03 --- /dev/null +++ b/tests/test_data/paired/445cf54a-bf06-4852-8010-13a60fa1598c/provenance/metadata.yaml @@ -0,0 +1,3 @@ +uuid: 445cf54a-bf06-4852-8010-13a60fa1598c +type: SampleData[PairedEndSequencesWithQuality] +format: SingleLanePerSamplePairedEndFastqDirFmt diff --git a/tests/test_data/pairedAllForward/445cf54a-bf06-4852-8010-13a60fa1598c/VERSION b/tests/test_data/pairedAllForward/445cf54a-bf06-4852-8010-13a60fa1598c/VERSION new file mode 100644 index 0000000..17e0eb8 --- /dev/null +++ b/tests/test_data/pairedAllForward/445cf54a-bf06-4852-8010-13a60fa1598c/VERSION @@ -0,0 +1,3 @@ +QIIME 2 +archive: 4 +framework: 2018.4.0 diff --git a/tests/test_data/pairedAllForward/445cf54a-bf06-4852-8010-13a60fa1598c/data/4774-1-MSITS3_0_L001_R1_001.fastq.gz b/tests/test_data/pairedAllForward/445cf54a-bf06-4852-8010-13a60fa1598c/data/4774-1-MSITS3_0_L001_R1_001.fastq.gz new file mode 100644 index 0000000..add64b1 Binary files /dev/null and b/tests/test_data/pairedAllForward/445cf54a-bf06-4852-8010-13a60fa1598c/data/4774-1-MSITS3_0_L001_R1_001.fastq.gz differ diff --git a/tests/test_data/pairedAllForward/445cf54a-bf06-4852-8010-13a60fa1598c/data/4774-1-MSITS3_1_L001_R2_001.fastq.gz b/tests/test_data/pairedAllForward/445cf54a-bf06-4852-8010-13a60fa1598c/data/4774-1-MSITS3_1_L001_R2_001.fastq.gz new file mode 100644 index 0000000..b7df534 Binary files /dev/null and b/tests/test_data/pairedAllForward/445cf54a-bf06-4852-8010-13a60fa1598c/data/4774-1-MSITS3_1_L001_R2_001.fastq.gz differ diff --git 
a/tests/test_data/pairedAllForward/445cf54a-bf06-4852-8010-13a60fa1598c/data/MANIFEST b/tests/test_data/pairedAllForward/445cf54a-bf06-4852-8010-13a60fa1598c/data/MANIFEST new file mode 100644 index 0000000..93e02ef --- /dev/null +++ b/tests/test_data/pairedAllForward/445cf54a-bf06-4852-8010-13a60fa1598c/data/MANIFEST @@ -0,0 +1,4 @@ +sample-id,filename,direction +###### +lllllllll# +4774-1-MSITS3,4774-1-MSITS3_0_L001_R1_001.fastq.gz,forward \ No newline at end of file diff --git a/tests/test_data/pairedAllForward/445cf54a-bf06-4852-8010-13a60fa1598c/data/metadata.yml b/tests/test_data/pairedAllForward/445cf54a-bf06-4852-8010-13a60fa1598c/data/metadata.yml new file mode 100644 index 0000000..8ffe2a9 --- /dev/null +++ b/tests/test_data/pairedAllForward/445cf54a-bf06-4852-8010-13a60fa1598c/data/metadata.yml @@ -0,0 +1 @@ +{phred-offset: 33} diff --git a/tests/test_data/pairedAllForward/445cf54a-bf06-4852-8010-13a60fa1598c/metadata.yaml b/tests/test_data/pairedAllForward/445cf54a-bf06-4852-8010-13a60fa1598c/metadata.yaml new file mode 100644 index 0000000..fcd0b03 --- /dev/null +++ b/tests/test_data/pairedAllForward/445cf54a-bf06-4852-8010-13a60fa1598c/metadata.yaml @@ -0,0 +1,3 @@ +uuid: 445cf54a-bf06-4852-8010-13a60fa1598c +type: SampleData[PairedEndSequencesWithQuality] +format: SingleLanePerSamplePairedEndFastqDirFmt diff --git a/tests/test_data/pairedAllForward/445cf54a-bf06-4852-8010-13a60fa1598c/provenance/VERSION b/tests/test_data/pairedAllForward/445cf54a-bf06-4852-8010-13a60fa1598c/provenance/VERSION new file mode 100644 index 0000000..17e0eb8 --- /dev/null +++ b/tests/test_data/pairedAllForward/445cf54a-bf06-4852-8010-13a60fa1598c/provenance/VERSION @@ -0,0 +1,3 @@ +QIIME 2 +archive: 4 +framework: 2018.4.0 diff --git a/tests/test_data/pairedAllForward/445cf54a-bf06-4852-8010-13a60fa1598c/provenance/action/action.yaml b/tests/test_data/pairedAllForward/445cf54a-bf06-4852-8010-13a60fa1598c/provenance/action/action.yaml new file mode 100644 index 
0000000..fec3d36 --- /dev/null +++ b/tests/test_data/pairedAllForward/445cf54a-bf06-4852-8010-13a60fa1598c/provenance/action/action.yaml @@ -0,0 +1,150 @@ +execution: + uuid: db1f6056-6580-4d04-b930-a5dcc8c159ba + runtime: + start: 2018-06-23T12:59:59.178077-04:00 + end: 2018-06-23T12:59:59.297050-04:00 + duration: 118973 microseconds + +action: + type: import + format: PairedEndFastqManifestPhred33 + manifest: + - name: MANIFEST.csv + md5sum: 6c50df3891f0022644fd5345c27f0e73 + +transformers: + output: + - from: PairedEndFastqManifestPhred33 + to: SingleLanePerSamplePairedEndFastqDirFmt + plugin: !ref 'environment:plugins:types' + +environment: + platform: linux-x86_64 + python: |- + 3.5.5 | packaged by conda-forge | (default, Apr 6 2018, 13:41:05) + [GCC 4.8.2 20140120 (Red Hat 4.8.2-15)] + framework: + version: 2018.4.0 + website: https://qiime2.org + citations: + - !cite 'framework|qiime2:2018.4.0|0' + plugins: + types: + version: 2018.4.0 + website: https://github.com/qiime2/q2-types + python-packages: + widgetsnbextension: 3.2.1 + wheel: 0.31.0 + webencodings: '0.5' + wcwidth: 0.1.7 + urllib3: '1.22' + unifrac: 0.9.2 + traitlets: 4.3.2 + tornado: 5.0.2 + testpath: 0.3.1 + terminado: 0.8.1 + statsmodels: 0.8.0 + six: 1.11.0 + simplegeneric: 0.8.1 + setuptools: 39.0.1 + Send2Trash: 1.5.0 + seaborn: 0.8.1 + scipy: 0.19.1 + scikit-learn: 0.19.1 + scikit-bio: 0.5.1 + requests: 2.18.4 + qiime2: 2018.4.0 + q2templates: 2018.4.0 + q2cli: 2018.4.0 + q2-vsearch: 2018.4.0 + q2-types: 2018.4.0 + q2-taxa: 2018.4.0 + q2-sample-classifier: 2018.4.0 + q2-quality-filter: 2018.4.0 + q2-quality-control: 2018.4.0 + q2-phylogeny: 2018.4.0 + q2-metadata: 2018.4.0 + q2-longitudinal: 2018.4.0 + q2-gneiss: 2018.4.0 + q2-feature-table: 2018.4.0 + q2-feature-classifier: 2018.4.0 + q2-emperor: 2018.4.0 + q2-diversity: 2018.4.0 + q2-demux: 2018.4.0 + q2-deblur: 2018.4.0 + q2-dada2: 2018.4.0 + q2-cutadapt: 2018.4.0 + q2-composition: 2018.4.0 + q2-alignment: 2018.4.0 + pyzmq: 17.0.0 + 
PyYAML: '3.12' + pytz: '2018.4' + python-dateutil: 2.7.2 + PySocks: 1.6.8 + pyparsing: 2.2.0 + pyOpenSSL: 17.5.0 + Pygments: 2.2.0 + pycparser: '2.18' + ptyprocess: 0.5.2 + psutil: 5.4.5 + prompt-toolkit: 1.0.15 + pip: 9.0.3 + pickleshare: 0.7.4 + pexpect: 4.5.0 + patsy: 0.5.0 + parso: 0.2.0 + pandocfilters: 1.4.2 + pandas: 0.22.0 + packaging: '17.1' + numpy: 1.12.1 + notebook: 5.4.1 + nose: 1.3.7 + nbformat: 4.4.0 + nbconvert: 5.3.1 + natsort: 5.3.0 + msgpack: 0.5.6 + mistune: 0.8.3 + matplotlib: 2.2.2 + MarkupSafe: '1.0' + lockfile: 0.12.2 + kiwisolver: 1.0.1 + jupyter-core: 4.4.0 + jupyter-client: 5.2.3 + jsonschema: 2.6.0 + Jinja2: '2.10' + jedi: 0.12.0 + itsxpress: 1.5.3 + ipywidgets: 7.2.1 + ipython: 6.3.1 + ipython-genutils: 0.2.0 + ipykernel: 4.8.2 + idna: '2.6' + html5lib: 1.0.1 + h5py: 2.7.0 + future: 0.16.0 + fastcluster: 1.1.24 + entrypoints: 0.2.3 + emperor: 1.0.0b16 + decorator: 4.3.0 + deblur: 1.0.4 + Cython: 0.28.2 + cutadapt: '1.16' + cryptography: 2.2.1 + click: '6.7' + chardet: 3.0.4 + cffi: 1.11.5 + certifi: 2018.4.16 + CacheControl: 0.12.4 + bokeh: 0.12.15 + bleach: 2.1.3 + biopython: '1.71' + bibtexparser: 1.0.1 + backcall: 0.1.0 + asn1crypto: 0.24.0 + biom-format: 2.1.6 + cycler: 0.10.0 + itsxpressqiime2: '1.0' + gneiss: 0.4.2 + ijson: '2.3' + tzlocal: '1.3' + xopen: 0.3.2 diff --git a/tests/test_data/pairedAllForward/445cf54a-bf06-4852-8010-13a60fa1598c/provenance/citations.bib b/tests/test_data/pairedAllForward/445cf54a-bf06-4852-8010-13a60fa1598c/provenance/citations.bib new file mode 100644 index 0000000..4bea5fa --- /dev/null +++ b/tests/test_data/pairedAllForward/445cf54a-bf06-4852-8010-13a60fa1598c/provenance/citations.bib @@ -0,0 +1,12 @@ +@article{framework|qiime2:2018.4.0|0, + author = {Caporaso, J Gregory and Kuczynski, Justin and Stombaugh, Jesse and Bittinger, Kyle and Bushman, Frederic D and Costello, Elizabeth K and Fierer, Noah and Peña, Antonio Gonzalez and Goodrich, Julia K and Gordon, Jeffrey I and Huttley, Gavin A and 
Kelley, Scott T and Knights, Dan and Koenig, Jeremy E and Ley, Ruth E and Lozupone, Catherine A and McDonald, Daniel and Muegge, Brian D and Pirrung, Meg and Reeder, Jens and Sevinsky, Joel R and Turnbaugh, Peter J and Walters, William A and Widmann, Jeremy and Yatsunenko, Tanya and Zaneveld, Jesse and Knight, Rob}, + doi = {10.1038/nmeth.f.303}, + journal = {Nature methods}, + number = {5}, + pages = {335}, + publisher = {Nature Publishing Group}, + title = {QIIME allows analysis of high-throughput community sequencing data}, + volume = {7}, + year = {2010} +} + diff --git a/tests/test_data/pairedAllForward/445cf54a-bf06-4852-8010-13a60fa1598c/provenance/metadata.yaml b/tests/test_data/pairedAllForward/445cf54a-bf06-4852-8010-13a60fa1598c/provenance/metadata.yaml new file mode 100644 index 0000000..fcd0b03 --- /dev/null +++ b/tests/test_data/pairedAllForward/445cf54a-bf06-4852-8010-13a60fa1598c/provenance/metadata.yaml @@ -0,0 +1,3 @@ +uuid: 445cf54a-bf06-4852-8010-13a60fa1598c +type: SampleData[PairedEndSequencesWithQuality] +format: SingleLanePerSamplePairedEndFastqDirFmt diff --git a/tests/test_data/pairedBrokenMissingData/50d5f31a-a761-4c04-990c-e7668fe6bf00/VERSION b/tests/test_data/pairedBrokenMissingData/50d5f31a-a761-4c04-990c-e7668fe6bf00/VERSION new file mode 100644 index 0000000..81f4084 --- /dev/null +++ b/tests/test_data/pairedBrokenMissingData/50d5f31a-a761-4c04-990c-e7668fe6bf00/VERSION @@ -0,0 +1,3 @@ +QIIME 2 +archive: 4 +framework: 2018.6.0 diff --git a/tests/test_data/pairedBrokenMissingData/50d5f31a-a761-4c04-990c-e7668fe6bf00/data/metadata.yml b/tests/test_data/pairedBrokenMissingData/50d5f31a-a761-4c04-990c-e7668fe6bf00/data/metadata.yml new file mode 100644 index 0000000..8ffe2a9 --- /dev/null +++ b/tests/test_data/pairedBrokenMissingData/50d5f31a-a761-4c04-990c-e7668fe6bf00/data/metadata.yml @@ -0,0 +1 @@ +{phred-offset: 33} diff --git a/tests/test_data/pairedBrokenMissingData/50d5f31a-a761-4c04-990c-e7668fe6bf00/metadata.yaml 
b/tests/test_data/pairedBrokenMissingData/50d5f31a-a761-4c04-990c-e7668fe6bf00/metadata.yaml new file mode 100644 index 0000000..bc644c4 --- /dev/null +++ b/tests/test_data/pairedBrokenMissingData/50d5f31a-a761-4c04-990c-e7668fe6bf00/metadata.yaml @@ -0,0 +1,3 @@ +uuid: 50d5f31a-a761-4c04-990c-e7668fe6bf00 +type: SampleData[PairedEndSequencesWithQuality] +format: SingleLanePerSamplePairedEndFastqDirFmt diff --git a/tests/test_data/pairedBrokenMissingData/50d5f31a-a761-4c04-990c-e7668fe6bf00/provenance/VERSION b/tests/test_data/pairedBrokenMissingData/50d5f31a-a761-4c04-990c-e7668fe6bf00/provenance/VERSION new file mode 100644 index 0000000..81f4084 --- /dev/null +++ b/tests/test_data/pairedBrokenMissingData/50d5f31a-a761-4c04-990c-e7668fe6bf00/provenance/VERSION @@ -0,0 +1,3 @@ +QIIME 2 +archive: 4 +framework: 2018.6.0 diff --git a/tests/test_data/pairedBrokenMissingData/50d5f31a-a761-4c04-990c-e7668fe6bf00/provenance/action/action.yaml b/tests/test_data/pairedBrokenMissingData/50d5f31a-a761-4c04-990c-e7668fe6bf00/provenance/action/action.yaml new file mode 100644 index 0000000..7bbef15 --- /dev/null +++ b/tests/test_data/pairedBrokenMissingData/50d5f31a-a761-4c04-990c-e7668fe6bf00/provenance/action/action.yaml @@ -0,0 +1,152 @@ +execution: + uuid: 525c43f6-f470-4bcc-b24a-5a1ab7fbc0f5 + runtime: + start: 2018-06-27T20:41:59.719751-04:00 + end: 2018-06-27T20:41:59.830734-04:00 + duration: 110983 microseconds + +action: + type: import + format: PairedEndFastqManifestPhred33 + manifest: + - name: MANIFEST + md5sum: 5ccaf81480adb423a4747b2a64d8a810 + +transformers: + output: + - from: PairedEndFastqManifestPhred33 + to: SingleLanePerSamplePairedEndFastqDirFmt + plugin: !ref 'environment:plugins:types' + +environment: + platform: linux-x86_64 + python: |- + 3.5.5 | packaged by conda-forge | (default, Apr 6 2018, 13:41:05) + [GCC 4.8.2 20140120 (Red Hat 4.8.2-15)] + framework: + version: 2018.6.0 + website: https://qiime2.org + citations: + - !cite 
'framework|qiime2:2018.6.0|0' + plugins: + types: + version: 2018.6.0 + website: https://github.com/qiime2/q2-types + python-packages: + widgetsnbextension: 3.2.1 + wheel: 0.31.0 + webencodings: '0.5' + wcwidth: 0.1.7 + urllib3: '1.22' + unifrac: 0.9.2 + traitlets: 4.3.2 + tornado: 5.0.2 + testpath: 0.3.1 + terminado: 0.8.1 + statsmodels: 0.9.0 + six: 1.11.0 + simplegeneric: 0.8.1 + setuptools: 39.2.0 + Send2Trash: 1.5.0 + seaborn: 0.8.1 + scipy: 0.19.1 + scikit-learn: 0.19.1 + scikit-bio: 0.5.1 + requests: 2.19.1 + qiime2: 2018.6.0 + q2templates: 2018.6.0 + q2cli: 2018.6.0 + q2-vsearch: 2018.6.0 + q2-types: 2018.6.0 + q2-taxa: 2018.6.0 + q2-sample-classifier: 2018.6.0 + q2-quality-filter: 2018.6.0 + q2-quality-control: 2018.6.0 + q2-phylogeny: 2018.6.0 + q2-metadata: 2018.6.0 + q2-longitudinal: 2018.6.0 + q2-gneiss: 2018.6.0 + q2-feature-table: 2018.6.0 + q2-feature-classifier: 2018.6.0 + q2-emperor: 2018.6.0 + q2-diversity: 2018.6.0 + q2-demux: 2018.6.0 + q2-deblur: 2018.6.0 + q2-dada2: 2018.6.0 + q2-cutadapt: 2018.6.0 + q2-composition: 2018.6.0 + q2-alignment: 2018.6.0 + pyzmq: 17.0.0 + PyYAML: '3.12' + pytz: '2018.4' + python-dateutil: 2.7.3 + PySocks: 1.6.8 + pyparsing: 2.2.0 + pyOpenSSL: 18.0.0 + Pygments: 2.2.0 + pycparser: '2.18' + ptyprocess: 0.5.2 + psutil: 5.4.6 + prompt-toolkit: 1.0.15 + pip: 9.0.3 + pickleshare: 0.7.4 + pexpect: 4.6.0 + patsy: 0.5.0 + parso: 0.2.1 + pandocfilters: 1.4.2 + pandas: 0.23.1 + packaging: '17.1' + numpy: 1.14.5 + notebook: 5.5.0 + nose: 1.3.7 + nbformat: 4.4.0 + nbconvert: 5.3.1 + natsort: 5.3.2 + msgpack: 0.5.6 + mkl-random: 1.0.1 + mkl-fft: 1.0.0 + mistune: 0.8.3 + matplotlib: 2.2.2 + MarkupSafe: '1.0' + lockfile: 0.12.2 + kiwisolver: 1.0.1 + jupyter-core: 4.4.0 + jupyter-client: 5.2.3 + jsonschema: 2.6.0 + Jinja2: '2.10' + jedi: 0.12.0 + itsxpress: 1.5.3 + ipywidgets: 7.2.1 + ipython: 6.4.0 + ipython-genutils: 0.2.0 + ipykernel: 4.8.2 + idna: '2.7' + html5lib: 1.0.1 + h5py: 2.7.0 + future: 0.16.0 + fastcluster: 1.1.25 + 
entrypoints: 0.2.3 + emperor: 1.0.0b16 + decorator: 4.3.0 + deblur: 1.0.4 + Cython: 0.28.3 + cutadapt: '1.16' + cryptography: 2.2.1 + click: '6.7' + chardet: 3.0.4 + cffi: 1.11.5 + certifi: 2018.4.16 + CacheControl: 0.12.4 + bokeh: 0.12.16 + bleach: 2.1.3 + biopython: '1.71' + bibtexparser: 1.0.1 + backcall: 0.1.0 + asn1crypto: 0.24.0 + biom-format: 2.1.6 + cycler: 0.10.0 + gneiss: 0.4.4 + itsxpressqiime2: '1.2' + ijson: '2.3' + tzlocal: '1.3' + xopen: 0.3.2 diff --git a/tests/test_data/pairedBrokenMissingData/50d5f31a-a761-4c04-990c-e7668fe6bf00/provenance/citations.bib b/tests/test_data/pairedBrokenMissingData/50d5f31a-a761-4c04-990c-e7668fe6bf00/provenance/citations.bib new file mode 100644 index 0000000..fd4d2db --- /dev/null +++ b/tests/test_data/pairedBrokenMissingData/50d5f31a-a761-4c04-990c-e7668fe6bf00/provenance/citations.bib @@ -0,0 +1,12 @@ +@article{framework|qiime2:2018.6.0|0, + author = {Caporaso, J Gregory and Kuczynski, Justin and Stombaugh, Jesse and Bittinger, Kyle and Bushman, Frederic D and Costello, Elizabeth K and Fierer, Noah and Peña, Antonio Gonzalez and Goodrich, Julia K and Gordon, Jeffrey I and Huttley, Gavin A and Kelley, Scott T and Knights, Dan and Koenig, Jeremy E and Ley, Ruth E and Lozupone, Catherine A and McDonald, Daniel and Muegge, Brian D and Pirrung, Meg and Reeder, Jens and Sevinsky, Joel R and Turnbaugh, Peter J and Walters, William A and Widmann, Jeremy and Yatsunenko, Tanya and Zaneveld, Jesse and Knight, Rob}, + doi = {10.1038/nmeth.f.303}, + journal = {Nature methods}, + number = {5}, + pages = {335}, + publisher = {Nature Publishing Group}, + title = {QIIME allows analysis of high-throughput community sequencing data}, + volume = {7}, + year = {2010} +} + diff --git a/tests/test_data/pairedBrokenMissingData/50d5f31a-a761-4c04-990c-e7668fe6bf00/provenance/metadata.yaml b/tests/test_data/pairedBrokenMissingData/50d5f31a-a761-4c04-990c-e7668fe6bf00/provenance/metadata.yaml new file mode 100644 index 0000000..bc644c4 --- 
/dev/null +++ b/tests/test_data/pairedBrokenMissingData/50d5f31a-a761-4c04-990c-e7668fe6bf00/provenance/metadata.yaml @@ -0,0 +1,3 @@ +uuid: 50d5f31a-a761-4c04-990c-e7668fe6bf00 +type: SampleData[PairedEndSequencesWithQuality] +format: SingleLanePerSamplePairedEndFastqDirFmt diff --git a/tests/test_data/pairedBrokenWithMANIFEST/50d5f31a-a761-4c04-990c-e7668fe6bf00/VERSION b/tests/test_data/pairedBrokenWithMANIFEST/50d5f31a-a761-4c04-990c-e7668fe6bf00/VERSION new file mode 100644 index 0000000..81f4084 --- /dev/null +++ b/tests/test_data/pairedBrokenWithMANIFEST/50d5f31a-a761-4c04-990c-e7668fe6bf00/VERSION @@ -0,0 +1,3 @@ +QIIME 2 +archive: 4 +framework: 2018.6.0 diff --git a/tests/test_data/pairedBrokenWithMANIFEST/50d5f31a-a761-4c04-990c-e7668fe6bf00/data/MANIFEST b/tests/test_data/pairedBrokenWithMANIFEST/50d5f31a-a761-4c04-990c-e7668fe6bf00/data/MANIFEST new file mode 100644 index 0000000..4866944 --- /dev/null +++ b/tests/test_data/pairedBrokenWithMANIFEST/50d5f31a-a761-4c04-990c-e7668fe6bf00/data/MANIFEST @@ -0,0 +1,3 @@ +sample-id,filename,direction +4774-1-MSITS3,4774-1-MSITS3_0_L001_R1_001.fastq.gz,forward +4774-1-MSITS3,4774-1-MSITS3_1_L001_R2_001.fastq.gz,reverse diff --git a/tests/test_data/pairedBrokenWithMANIFEST/50d5f31a-a761-4c04-990c-e7668fe6bf00/data/metadata.yml b/tests/test_data/pairedBrokenWithMANIFEST/50d5f31a-a761-4c04-990c-e7668fe6bf00/data/metadata.yml new file mode 100644 index 0000000..8ffe2a9 --- /dev/null +++ b/tests/test_data/pairedBrokenWithMANIFEST/50d5f31a-a761-4c04-990c-e7668fe6bf00/data/metadata.yml @@ -0,0 +1 @@ +{phred-offset: 33} diff --git a/tests/test_data/pairedBrokenWithMANIFEST/50d5f31a-a761-4c04-990c-e7668fe6bf00/metadata.yaml b/tests/test_data/pairedBrokenWithMANIFEST/50d5f31a-a761-4c04-990c-e7668fe6bf00/metadata.yaml new file mode 100644 index 0000000..bc644c4 --- /dev/null +++ b/tests/test_data/pairedBrokenWithMANIFEST/50d5f31a-a761-4c04-990c-e7668fe6bf00/metadata.yaml @@ -0,0 +1,3 @@ +uuid: 
50d5f31a-a761-4c04-990c-e7668fe6bf00 +type: SampleData[PairedEndSequencesWithQuality] +format: SingleLanePerSamplePairedEndFastqDirFmt diff --git a/tests/test_data/pairedBrokenWithMANIFEST/50d5f31a-a761-4c04-990c-e7668fe6bf00/provenance/VERSION b/tests/test_data/pairedBrokenWithMANIFEST/50d5f31a-a761-4c04-990c-e7668fe6bf00/provenance/VERSION new file mode 100644 index 0000000..81f4084 --- /dev/null +++ b/tests/test_data/pairedBrokenWithMANIFEST/50d5f31a-a761-4c04-990c-e7668fe6bf00/provenance/VERSION @@ -0,0 +1,3 @@ +QIIME 2 +archive: 4 +framework: 2018.6.0 diff --git a/tests/test_data/pairedBrokenWithMANIFEST/50d5f31a-a761-4c04-990c-e7668fe6bf00/provenance/action/action.yaml b/tests/test_data/pairedBrokenWithMANIFEST/50d5f31a-a761-4c04-990c-e7668fe6bf00/provenance/action/action.yaml new file mode 100644 index 0000000..7bbef15 --- /dev/null +++ b/tests/test_data/pairedBrokenWithMANIFEST/50d5f31a-a761-4c04-990c-e7668fe6bf00/provenance/action/action.yaml @@ -0,0 +1,152 @@ +execution: + uuid: 525c43f6-f470-4bcc-b24a-5a1ab7fbc0f5 + runtime: + start: 2018-06-27T20:41:59.719751-04:00 + end: 2018-06-27T20:41:59.830734-04:00 + duration: 110983 microseconds + +action: + type: import + format: PairedEndFastqManifestPhred33 + manifest: + - name: MANIFEST + md5sum: 5ccaf81480adb423a4747b2a64d8a810 + +transformers: + output: + - from: PairedEndFastqManifestPhred33 + to: SingleLanePerSamplePairedEndFastqDirFmt + plugin: !ref 'environment:plugins:types' + +environment: + platform: linux-x86_64 + python: |- + 3.5.5 | packaged by conda-forge | (default, Apr 6 2018, 13:41:05) + [GCC 4.8.2 20140120 (Red Hat 4.8.2-15)] + framework: + version: 2018.6.0 + website: https://qiime2.org + citations: + - !cite 'framework|qiime2:2018.6.0|0' + plugins: + types: + version: 2018.6.0 + website: https://github.com/qiime2/q2-types + python-packages: + widgetsnbextension: 3.2.1 + wheel: 0.31.0 + webencodings: '0.5' + wcwidth: 0.1.7 + urllib3: '1.22' + unifrac: 0.9.2 + traitlets: 4.3.2 + tornado: 
5.0.2 + testpath: 0.3.1 + terminado: 0.8.1 + statsmodels: 0.9.0 + six: 1.11.0 + simplegeneric: 0.8.1 + setuptools: 39.2.0 + Send2Trash: 1.5.0 + seaborn: 0.8.1 + scipy: 0.19.1 + scikit-learn: 0.19.1 + scikit-bio: 0.5.1 + requests: 2.19.1 + qiime2: 2018.6.0 + q2templates: 2018.6.0 + q2cli: 2018.6.0 + q2-vsearch: 2018.6.0 + q2-types: 2018.6.0 + q2-taxa: 2018.6.0 + q2-sample-classifier: 2018.6.0 + q2-quality-filter: 2018.6.0 + q2-quality-control: 2018.6.0 + q2-phylogeny: 2018.6.0 + q2-metadata: 2018.6.0 + q2-longitudinal: 2018.6.0 + q2-gneiss: 2018.6.0 + q2-feature-table: 2018.6.0 + q2-feature-classifier: 2018.6.0 + q2-emperor: 2018.6.0 + q2-diversity: 2018.6.0 + q2-demux: 2018.6.0 + q2-deblur: 2018.6.0 + q2-dada2: 2018.6.0 + q2-cutadapt: 2018.6.0 + q2-composition: 2018.6.0 + q2-alignment: 2018.6.0 + pyzmq: 17.0.0 + PyYAML: '3.12' + pytz: '2018.4' + python-dateutil: 2.7.3 + PySocks: 1.6.8 + pyparsing: 2.2.0 + pyOpenSSL: 18.0.0 + Pygments: 2.2.0 + pycparser: '2.18' + ptyprocess: 0.5.2 + psutil: 5.4.6 + prompt-toolkit: 1.0.15 + pip: 9.0.3 + pickleshare: 0.7.4 + pexpect: 4.6.0 + patsy: 0.5.0 + parso: 0.2.1 + pandocfilters: 1.4.2 + pandas: 0.23.1 + packaging: '17.1' + numpy: 1.14.5 + notebook: 5.5.0 + nose: 1.3.7 + nbformat: 4.4.0 + nbconvert: 5.3.1 + natsort: 5.3.2 + msgpack: 0.5.6 + mkl-random: 1.0.1 + mkl-fft: 1.0.0 + mistune: 0.8.3 + matplotlib: 2.2.2 + MarkupSafe: '1.0' + lockfile: 0.12.2 + kiwisolver: 1.0.1 + jupyter-core: 4.4.0 + jupyter-client: 5.2.3 + jsonschema: 2.6.0 + Jinja2: '2.10' + jedi: 0.12.0 + itsxpress: 1.5.3 + ipywidgets: 7.2.1 + ipython: 6.4.0 + ipython-genutils: 0.2.0 + ipykernel: 4.8.2 + idna: '2.7' + html5lib: 1.0.1 + h5py: 2.7.0 + future: 0.16.0 + fastcluster: 1.1.25 + entrypoints: 0.2.3 + emperor: 1.0.0b16 + decorator: 4.3.0 + deblur: 1.0.4 + Cython: 0.28.3 + cutadapt: '1.16' + cryptography: 2.2.1 + click: '6.7' + chardet: 3.0.4 + cffi: 1.11.5 + certifi: 2018.4.16 + CacheControl: 0.12.4 + bokeh: 0.12.16 + bleach: 2.1.3 + biopython: '1.71' + 
bibtexparser: 1.0.1 + backcall: 0.1.0 + asn1crypto: 0.24.0 + biom-format: 2.1.6 + cycler: 0.10.0 + gneiss: 0.4.4 + itsxpressqiime2: '1.2' + ijson: '2.3' + tzlocal: '1.3' + xopen: 0.3.2 diff --git a/tests/test_data/pairedBrokenWithMANIFEST/50d5f31a-a761-4c04-990c-e7668fe6bf00/provenance/citations.bib b/tests/test_data/pairedBrokenWithMANIFEST/50d5f31a-a761-4c04-990c-e7668fe6bf00/provenance/citations.bib new file mode 100644 index 0000000..fd4d2db --- /dev/null +++ b/tests/test_data/pairedBrokenWithMANIFEST/50d5f31a-a761-4c04-990c-e7668fe6bf00/provenance/citations.bib @@ -0,0 +1,12 @@ +@article{framework|qiime2:2018.6.0|0, + author = {Caporaso, J Gregory and Kuczynski, Justin and Stombaugh, Jesse and Bittinger, Kyle and Bushman, Frederic D and Costello, Elizabeth K and Fierer, Noah and Peña, Antonio Gonzalez and Goodrich, Julia K and Gordon, Jeffrey I and Huttley, Gavin A and Kelley, Scott T and Knights, Dan and Koenig, Jeremy E and Ley, Ruth E and Lozupone, Catherine A and McDonald, Daniel and Muegge, Brian D and Pirrung, Meg and Reeder, Jens and Sevinsky, Joel R and Turnbaugh, Peter J and Walters, William A and Widmann, Jeremy and Yatsunenko, Tanya and Zaneveld, Jesse and Knight, Rob}, + doi = {10.1038/nmeth.f.303}, + journal = {Nature methods}, + number = {5}, + pages = {335}, + publisher = {Nature Publishing Group}, + title = {QIIME allows analysis of high-throughput community sequencing data}, + volume = {7}, + year = {2010} +} + diff --git a/tests/test_data/pairedBrokenWithMANIFEST/50d5f31a-a761-4c04-990c-e7668fe6bf00/provenance/metadata.yaml b/tests/test_data/pairedBrokenWithMANIFEST/50d5f31a-a761-4c04-990c-e7668fe6bf00/provenance/metadata.yaml new file mode 100644 index 0000000..bc644c4 --- /dev/null +++ b/tests/test_data/pairedBrokenWithMANIFEST/50d5f31a-a761-4c04-990c-e7668fe6bf00/provenance/metadata.yaml @@ -0,0 +1,3 @@ +uuid: 50d5f31a-a761-4c04-990c-e7668fe6bf00 +type: SampleData[PairedEndSequencesWithQuality] +format: 
SingleLanePerSamplePairedEndFastqDirFmt diff --git a/tests/test_data/singleIn/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/VERSION b/tests/test_data/singleIn/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/VERSION new file mode 100644 index 0000000..81f4084 --- /dev/null +++ b/tests/test_data/singleIn/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/VERSION @@ -0,0 +1,3 @@ +QIIME 2 +archive: 4 +framework: 2018.6.0 diff --git a/tests/test_data/singleIn/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/data/4774-1-MSITS3_0_L001_R1_001.fastq.gz b/tests/test_data/singleIn/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/data/4774-1-MSITS3_0_L001_R1_001.fastq.gz new file mode 100644 index 0000000..a2bac52 Binary files /dev/null and b/tests/test_data/singleIn/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/data/4774-1-MSITS3_0_L001_R1_001.fastq.gz differ diff --git a/tests/test_data/singleIn/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/data/MANIFEST b/tests/test_data/singleIn/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/data/MANIFEST new file mode 100644 index 0000000..64013f6 --- /dev/null +++ b/tests/test_data/singleIn/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/data/MANIFEST @@ -0,0 +1,2 @@ +sample-id,filename,direction +4774-1-MSITS3,4774-1-MSITS3_0_L001_R1_001.fastq.gz,forward diff --git a/tests/test_data/singleIn/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/data/metadata.yml b/tests/test_data/singleIn/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/data/metadata.yml new file mode 100644 index 0000000..8ffe2a9 --- /dev/null +++ b/tests/test_data/singleIn/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/data/metadata.yml @@ -0,0 +1 @@ +{phred-offset: 33} diff --git a/tests/test_data/singleIn/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/metadata.yaml b/tests/test_data/singleIn/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/metadata.yaml new file mode 100644 index 0000000..14aa44c --- /dev/null +++ b/tests/test_data/singleIn/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/metadata.yaml @@ -0,0 +1,3 @@ +uuid: cfd0e65b-05fb-4329-9618-15ecd0aec9b3 +type: SampleData[SequencesWithQuality] +format: 
SingleLanePerSampleSingleEndFastqDirFmt diff --git a/tests/test_data/singleIn/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/provenance/VERSION b/tests/test_data/singleIn/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/provenance/VERSION new file mode 100644 index 0000000..81f4084 --- /dev/null +++ b/tests/test_data/singleIn/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/provenance/VERSION @@ -0,0 +1,3 @@ +QIIME 2 +archive: 4 +framework: 2018.6.0 diff --git a/tests/test_data/singleIn/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/provenance/action/action.yaml b/tests/test_data/singleIn/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/provenance/action/action.yaml new file mode 100644 index 0000000..b1f6b96 --- /dev/null +++ b/tests/test_data/singleIn/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/provenance/action/action.yaml @@ -0,0 +1,159 @@ +execution: + uuid: 9a7a6bfd-492d-4198-89ad-143534527de5 + runtime: + start: 2018-06-29T23:16:41.457881-04:00 + end: 2018-06-29T23:16:41.589065-04:00 + duration: 131184 microseconds + +action: + type: import + format: SingleEndFastqManifestPhred33 + manifest: + - name: MANIFEST + md5sum: b625947a7812b500fbfdedcf6d52d255 + +transformers: + output: + - from: SingleEndFastqManifestPhred33 + to: SingleLanePerSampleSingleEndFastqDirFmt + plugin: !ref 'environment:plugins:types' + +environment: + platform: linux-x86_64 + python: |- + 3.5.5 | packaged by conda-forge | (default, Apr 6 2018, 13:41:05) + [GCC 4.8.2 20140120 (Red Hat 4.8.2-15)] + framework: + version: 2018.6.0 + website: https://qiime2.org + citations: + - !cite 'framework|qiime2:2018.6.0|0' + plugins: + types: + version: 2018.6.0 + website: https://github.com/qiime2/q2-types + python-packages: + widgetsnbextension: 3.2.1 + wheel: 0.31.0 + webencodings: '0.5' + wcwidth: 0.1.7 + urllib3: '1.22' + unifrac: 0.9.2 + traitlets: 4.3.2 + tornado: 5.0.2 + testpath: 0.3.1 + terminado: 0.8.1 + statsmodels: 0.9.0 + six: 1.11.0 + simplegeneric: 0.8.1 + setuptools: 39.2.0 + Send2Trash: 1.5.0 + seaborn: 0.8.1 + scipy: 0.19.1 + scikit-learn: 
0.19.1 + scikit-bio: 0.5.1 + requests: 2.19.1 + qiime2: 2018.6.0 + q2templates: 2018.6.0 + q2cli: 2018.6.0 + q2-vsearch: 2018.6.0 + q2-types: 2018.6.0 + q2-taxa: 2018.6.0 + q2-sample-classifier: 2018.6.0 + q2-quality-filter: 2018.6.0 + q2-quality-control: 2018.6.0 + q2-phylogeny: 2018.6.0 + q2-metadata: 2018.6.0 + q2-longitudinal: 2018.6.0 + q2-gneiss: 2018.6.0 + q2-feature-table: 2018.6.0 + q2-feature-classifier: 2018.6.0 + q2-emperor: 2018.6.0 + q2-diversity: 2018.6.0 + q2-demux: 2018.6.0 + q2-deblur: 2018.6.0 + q2-dada2: 2018.6.0 + q2-cutadapt: 2018.6.0 + q2-composition: 2018.6.0 + q2-alignment: 2018.6.0 + pyzmq: 17.0.0 + PyYAML: '3.12' + pytz: '2018.4' + python-dateutil: 2.7.3 + PySocks: 1.6.8 + pyparsing: 2.2.0 + pyOpenSSL: 18.0.0 + Pygments: 2.2.0 + pyflakes: 1.6.0 + pycparser: '2.18' + pycodestyle: 2.3.1 + ptyprocess: 0.5.2 + psutil: 5.4.6 + prompt-toolkit: 1.0.15 + pip: 9.0.3 + pickleshare: 0.7.4 + pexpect: 4.6.0 + patsy: 0.5.0 + parso: 0.2.1 + pandocfilters: 1.4.2 + pandas: 0.23.1 + packaging: '17.1' + numpy: 1.14.5 + notebook: 5.5.0 + nose: 1.3.7 + nbformat: 4.4.0 + nbconvert: 5.3.1 + natsort: 5.3.2 + msgpack: 0.5.6 + mkl-random: 1.0.1 + mkl-fft: 1.0.0 + mistune: 0.8.3 + mccabe: 0.6.1 + matplotlib: 2.2.2 + MarkupSafe: '1.0' + lockfile: 0.12.2 + kiwisolver: 1.0.1 + jupyter-core: 4.4.0 + jupyter-client: 5.2.3 + jsonschema: 2.6.0 + Jinja2: '2.10' + jedi: 0.12.0 + itsxpress: 1.5.3 + ipywidgets: 7.2.1 + ipython: 6.4.0 + ipython-genutils: 0.2.0 + ipykernel: 4.8.2 + idna: '2.7' + html5lib: 1.0.1 + h5py: 2.7.0 + future: 0.16.0 + flake8: 3.5.0 + fastcluster: 1.1.25 + entrypoints: 0.2.3 + emperor: 1.0.0b16 + docopt: 0.6.2 + decorator: 4.3.0 + deblur: 1.0.4 + Cython: 0.28.3 + cutadapt: '1.16' + cryptography: 2.2.1 + coveralls: 1.3.0 + coverage: 4.5.1 + click: '6.7' + chardet: 3.0.4 + cffi: 1.11.5 + certifi: 2018.4.16 + CacheControl: 0.12.4 + bokeh: 0.12.16 + bleach: 2.1.3 + biopython: '1.71' + bibtexparser: 1.0.1 + backcall: 0.1.0 + asn1crypto: 0.24.0 + biom-format: 
2.1.6 + cycler: 0.10.0 + gneiss: 0.4.4 + q2-itsxpress: '1.4' + ijson: '2.3' + tzlocal: '1.3' + xopen: 0.3.2 diff --git a/tests/test_data/singleIn/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/provenance/citations.bib b/tests/test_data/singleIn/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/provenance/citations.bib new file mode 100644 index 0000000..fd4d2db --- /dev/null +++ b/tests/test_data/singleIn/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/provenance/citations.bib @@ -0,0 +1,12 @@ +@article{framework|qiime2:2018.6.0|0, + author = {Caporaso, J Gregory and Kuczynski, Justin and Stombaugh, Jesse and Bittinger, Kyle and Bushman, Frederic D and Costello, Elizabeth K and Fierer, Noah and Peña, Antonio Gonzalez and Goodrich, Julia K and Gordon, Jeffrey I and Huttley, Gavin A and Kelley, Scott T and Knights, Dan and Koenig, Jeremy E and Ley, Ruth E and Lozupone, Catherine A and McDonald, Daniel and Muegge, Brian D and Pirrung, Meg and Reeder, Jens and Sevinsky, Joel R and Turnbaugh, Peter J and Walters, William A and Widmann, Jeremy and Yatsunenko, Tanya and Zaneveld, Jesse and Knight, Rob}, + doi = {10.1038/nmeth.f.303}, + journal = {Nature methods}, + number = {5}, + pages = {335}, + publisher = {Nature Publishing Group}, + title = {QIIME allows analysis of high-throughput community sequencing data}, + volume = {7}, + year = {2010} +} + diff --git a/tests/test_data/singleIn/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/provenance/metadata.yaml b/tests/test_data/singleIn/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/provenance/metadata.yaml new file mode 100644 index 0000000..2184003 --- /dev/null +++ b/tests/test_data/singleIn/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/provenance/metadata.yaml @@ -0,0 +1,3 @@ +uuid: cfd0e65b-05fb-4329-9618-15ecd0aec9b3 +type: SampleData[JoinedSequencesWithQuality] +format: SingleLanePerSampleSingleEndFastqDirFmt diff --git a/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/VERSION b/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/VERSION new file mode 
100644 index 0000000..81f4084 --- /dev/null +++ b/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/VERSION @@ -0,0 +1,3 @@ +QIIME 2 +archive: 4 +framework: 2018.6.0 diff --git a/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/data/4774-1-MSITS3_0_L001_R1_001.fastq.gz b/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/data/4774-1-MSITS3_0_L001_R1_001.fastq.gz new file mode 100644 index 0000000..312483a Binary files /dev/null and b/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/data/4774-1-MSITS3_0_L001_R1_001.fastq.gz differ diff --git a/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/data/MANIFEST b/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/data/MANIFEST new file mode 100644 index 0000000..64013f6 --- /dev/null +++ b/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/data/MANIFEST @@ -0,0 +1,2 @@ +sample-id,filename,direction +4774-1-MSITS3,4774-1-MSITS3_0_L001_R1_001.fastq.gz,forward diff --git a/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/data/metadata.yml b/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/data/metadata.yml new file mode 100644 index 0000000..8ffe2a9 --- /dev/null +++ b/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/data/metadata.yml @@ -0,0 +1 @@ +{phred-offset: 33} diff --git a/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/metadata.yaml b/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/metadata.yaml new file mode 100644 index 0000000..e44a75e --- /dev/null +++ b/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/metadata.yaml @@ -0,0 +1,3 @@ +uuid: 75aea4f5-f10e-421e-91d2-feda9fe7b2e1 +type: SampleData[SequencesWithQuality] +format: SingleLanePerSampleSingleEndFastqDirFmt diff --git a/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/provenance/VERSION b/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/provenance/VERSION 
new file mode 100644 index 0000000..81f4084 --- /dev/null +++ b/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/provenance/VERSION @@ -0,0 +1,3 @@ +QIIME 2 +archive: 4 +framework: 2018.6.0 diff --git a/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/provenance/action/action.yaml b/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/provenance/action/action.yaml new file mode 100644 index 0000000..2cdfe3f --- /dev/null +++ b/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/provenance/action/action.yaml @@ -0,0 +1,171 @@ +execution: + uuid: 116338df-3449-4007-b722-7890b894ce8b + runtime: + start: 2018-06-29T23:17:07.569082-04:00 + end: 2018-06-29T23:17:11.895562-04:00 + duration: 4 seconds, and 326480 microseconds + +action: + type: method + plugin: !ref 'environment:plugins:itsxpress' + action: trim_single + inputs: + - per_sample_sequences: cfd0e65b-05fb-4329-9618-15ecd0aec9b3 + parameters: + - region: ITS2 + - taxa: F + - threads: 20 + output-name: trimmed + +transformers: + inputs: + per_sample_sequences: + - from: SingleLanePerSampleSingleEndFastqDirFmt + to: SingleLanePerSampleSingleEndFastqDirFmt + output: + - from: SingleLanePerSampleSingleEndFastqDirFmt + to: SingleLanePerSampleSingleEndFastqDirFmt + +environment: + platform: linux-x86_64 + python: |- + 3.5.5 | packaged by conda-forge | (default, Apr 6 2018, 13:41:05) + [GCC 4.8.2 20140120 (Red Hat 4.8.2-15)] + framework: + version: 2018.6.0 + website: https://qiime2.org + citations: + - !cite 'framework|qiime2:2018.6.0|0' + plugins: + types: + version: 2018.6.0 + website: https://github.com/qiime2/q2-types + itsxpress: + version: '1.4' + website: 'https://github.com/kweber1/q2_itsxpress ITSxpress: + https://github.com/USDA-ARS-GBRU/itsxpress' + python-packages: + widgetsnbextension: 3.2.1 + wheel: 0.31.0 + webencodings: '0.5' + wcwidth: 0.1.7 + urllib3: '1.22' + unifrac: 0.9.2 + traitlets: 4.3.2 + tornado: 5.0.2 + testpath: 0.3.1 + terminado: 0.8.1 + 
statsmodels: 0.9.0 + six: 1.11.0 + simplegeneric: 0.8.1 + setuptools: 39.2.0 + Send2Trash: 1.5.0 + seaborn: 0.8.1 + scipy: 0.19.1 + scikit-learn: 0.19.1 + scikit-bio: 0.5.1 + requests: 2.19.1 + qiime2: 2018.6.0 + q2templates: 2018.6.0 + q2cli: 2018.6.0 + q2-vsearch: 2018.6.0 + q2-types: 2018.6.0 + q2-taxa: 2018.6.0 + q2-sample-classifier: 2018.6.0 + q2-quality-filter: 2018.6.0 + q2-quality-control: 2018.6.0 + q2-phylogeny: 2018.6.0 + q2-metadata: 2018.6.0 + q2-longitudinal: 2018.6.0 + q2-gneiss: 2018.6.0 + q2-feature-table: 2018.6.0 + q2-feature-classifier: 2018.6.0 + q2-emperor: 2018.6.0 + q2-diversity: 2018.6.0 + q2-demux: 2018.6.0 + q2-deblur: 2018.6.0 + q2-dada2: 2018.6.0 + q2-cutadapt: 2018.6.0 + q2-composition: 2018.6.0 + q2-alignment: 2018.6.0 + pyzmq: 17.0.0 + PyYAML: '3.12' + pytz: '2018.4' + python-dateutil: 2.7.3 + PySocks: 1.6.8 + pyparsing: 2.2.0 + pyOpenSSL: 18.0.0 + Pygments: 2.2.0 + pyflakes: 1.6.0 + pycparser: '2.18' + pycodestyle: 2.3.1 + ptyprocess: 0.5.2 + psutil: 5.4.6 + prompt-toolkit: 1.0.15 + pip: 9.0.3 + pickleshare: 0.7.4 + pexpect: 4.6.0 + patsy: 0.5.0 + parso: 0.2.1 + pandocfilters: 1.4.2 + pandas: 0.23.1 + packaging: '17.1' + numpy: 1.14.5 + notebook: 5.5.0 + nose: 1.3.7 + nbformat: 4.4.0 + nbconvert: 5.3.1 + natsort: 5.3.2 + msgpack: 0.5.6 + mkl-random: 1.0.1 + mkl-fft: 1.0.0 + mistune: 0.8.3 + mccabe: 0.6.1 + matplotlib: 2.2.2 + MarkupSafe: '1.0' + lockfile: 0.12.2 + kiwisolver: 1.0.1 + jupyter-core: 4.4.0 + jupyter-client: 5.2.3 + jsonschema: 2.6.0 + Jinja2: '2.10' + jedi: 0.12.0 + itsxpress: 1.5.3 + ipywidgets: 7.2.1 + ipython: 6.4.0 + ipython-genutils: 0.2.0 + ipykernel: 4.8.2 + idna: '2.7' + html5lib: 1.0.1 + h5py: 2.7.0 + future: 0.16.0 + flake8: 3.5.0 + fastcluster: 1.1.25 + entrypoints: 0.2.3 + emperor: 1.0.0b16 + docopt: 0.6.2 + decorator: 4.3.0 + deblur: 1.0.4 + Cython: 0.28.3 + cutadapt: '1.16' + cryptography: 2.2.1 + coveralls: 1.3.0 + coverage: 4.5.1 + click: '6.7' + chardet: 3.0.4 + cffi: 1.11.5 + certifi: 2018.4.16 + 
CacheControl: 0.12.4 + bokeh: 0.12.16 + bleach: 2.1.3 + biopython: '1.71' + bibtexparser: 1.0.1 + backcall: 0.1.0 + asn1crypto: 0.24.0 + biom-format: 2.1.6 + cycler: 0.10.0 + gneiss: 0.4.4 + q2-itsxpress: '1.4' + ijson: '2.3' + tzlocal: '1.3' + xopen: 0.3.2 diff --git a/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/provenance/artifacts/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/VERSION b/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/provenance/artifacts/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/VERSION new file mode 100644 index 0000000..81f4084 --- /dev/null +++ b/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/provenance/artifacts/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/VERSION @@ -0,0 +1,3 @@ +QIIME 2 +archive: 4 +framework: 2018.6.0 diff --git a/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/provenance/artifacts/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/action/action.yaml b/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/provenance/artifacts/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/action/action.yaml new file mode 100644 index 0000000..b1f6b96 --- /dev/null +++ b/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/provenance/artifacts/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/action/action.yaml @@ -0,0 +1,159 @@ +execution: + uuid: 9a7a6bfd-492d-4198-89ad-143534527de5 + runtime: + start: 2018-06-29T23:16:41.457881-04:00 + end: 2018-06-29T23:16:41.589065-04:00 + duration: 131184 microseconds + +action: + type: import + format: SingleEndFastqManifestPhred33 + manifest: + - name: MANIFEST + md5sum: b625947a7812b500fbfdedcf6d52d255 + +transformers: + output: + - from: SingleEndFastqManifestPhred33 + to: SingleLanePerSampleSingleEndFastqDirFmt + plugin: !ref 'environment:plugins:types' + +environment: + platform: linux-x86_64 + python: |- + 3.5.5 | packaged by conda-forge | (default, Apr 6 2018, 13:41:05) + [GCC 4.8.2 20140120 (Red Hat 4.8.2-15)] + framework: + version: 2018.6.0 + website: 
https://qiime2.org + citations: + - !cite 'framework|qiime2:2018.6.0|0' + plugins: + types: + version: 2018.6.0 + website: https://github.com/qiime2/q2-types + python-packages: + widgetsnbextension: 3.2.1 + wheel: 0.31.0 + webencodings: '0.5' + wcwidth: 0.1.7 + urllib3: '1.22' + unifrac: 0.9.2 + traitlets: 4.3.2 + tornado: 5.0.2 + testpath: 0.3.1 + terminado: 0.8.1 + statsmodels: 0.9.0 + six: 1.11.0 + simplegeneric: 0.8.1 + setuptools: 39.2.0 + Send2Trash: 1.5.0 + seaborn: 0.8.1 + scipy: 0.19.1 + scikit-learn: 0.19.1 + scikit-bio: 0.5.1 + requests: 2.19.1 + qiime2: 2018.6.0 + q2templates: 2018.6.0 + q2cli: 2018.6.0 + q2-vsearch: 2018.6.0 + q2-types: 2018.6.0 + q2-taxa: 2018.6.0 + q2-sample-classifier: 2018.6.0 + q2-quality-filter: 2018.6.0 + q2-quality-control: 2018.6.0 + q2-phylogeny: 2018.6.0 + q2-metadata: 2018.6.0 + q2-longitudinal: 2018.6.0 + q2-gneiss: 2018.6.0 + q2-feature-table: 2018.6.0 + q2-feature-classifier: 2018.6.0 + q2-emperor: 2018.6.0 + q2-diversity: 2018.6.0 + q2-demux: 2018.6.0 + q2-deblur: 2018.6.0 + q2-dada2: 2018.6.0 + q2-cutadapt: 2018.6.0 + q2-composition: 2018.6.0 + q2-alignment: 2018.6.0 + pyzmq: 17.0.0 + PyYAML: '3.12' + pytz: '2018.4' + python-dateutil: 2.7.3 + PySocks: 1.6.8 + pyparsing: 2.2.0 + pyOpenSSL: 18.0.0 + Pygments: 2.2.0 + pyflakes: 1.6.0 + pycparser: '2.18' + pycodestyle: 2.3.1 + ptyprocess: 0.5.2 + psutil: 5.4.6 + prompt-toolkit: 1.0.15 + pip: 9.0.3 + pickleshare: 0.7.4 + pexpect: 4.6.0 + patsy: 0.5.0 + parso: 0.2.1 + pandocfilters: 1.4.2 + pandas: 0.23.1 + packaging: '17.1' + numpy: 1.14.5 + notebook: 5.5.0 + nose: 1.3.7 + nbformat: 4.4.0 + nbconvert: 5.3.1 + natsort: 5.3.2 + msgpack: 0.5.6 + mkl-random: 1.0.1 + mkl-fft: 1.0.0 + mistune: 0.8.3 + mccabe: 0.6.1 + matplotlib: 2.2.2 + MarkupSafe: '1.0' + lockfile: 0.12.2 + kiwisolver: 1.0.1 + jupyter-core: 4.4.0 + jupyter-client: 5.2.3 + jsonschema: 2.6.0 + Jinja2: '2.10' + jedi: 0.12.0 + itsxpress: 1.5.3 + ipywidgets: 7.2.1 + ipython: 6.4.0 + ipython-genutils: 0.2.0 + 
ipykernel: 4.8.2 + idna: '2.7' + html5lib: 1.0.1 + h5py: 2.7.0 + future: 0.16.0 + flake8: 3.5.0 + fastcluster: 1.1.25 + entrypoints: 0.2.3 + emperor: 1.0.0b16 + docopt: 0.6.2 + decorator: 4.3.0 + deblur: 1.0.4 + Cython: 0.28.3 + cutadapt: '1.16' + cryptography: 2.2.1 + coveralls: 1.3.0 + coverage: 4.5.1 + click: '6.7' + chardet: 3.0.4 + cffi: 1.11.5 + certifi: 2018.4.16 + CacheControl: 0.12.4 + bokeh: 0.12.16 + bleach: 2.1.3 + biopython: '1.71' + bibtexparser: 1.0.1 + backcall: 0.1.0 + asn1crypto: 0.24.0 + biom-format: 2.1.6 + cycler: 0.10.0 + gneiss: 0.4.4 + q2-itsxpress: '1.4' + ijson: '2.3' + tzlocal: '1.3' + xopen: 0.3.2 diff --git a/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/provenance/artifacts/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/citations.bib b/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/provenance/artifacts/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/citations.bib new file mode 100644 index 0000000..fd4d2db --- /dev/null +++ b/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/provenance/artifacts/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/citations.bib @@ -0,0 +1,12 @@ +@article{framework|qiime2:2018.6.0|0, + author = {Caporaso, J Gregory and Kuczynski, Justin and Stombaugh, Jesse and Bittinger, Kyle and Bushman, Frederic D and Costello, Elizabeth K and Fierer, Noah and Peña, Antonio Gonzalez and Goodrich, Julia K and Gordon, Jeffrey I and Huttley, Gavin A and Kelley, Scott T and Knights, Dan and Koenig, Jeremy E and Ley, Ruth E and Lozupone, Catherine A and McDonald, Daniel and Muegge, Brian D and Pirrung, Meg and Reeder, Jens and Sevinsky, Joel R and Turnbaugh, Peter J and Walters, William A and Widmann, Jeremy and Yatsunenko, Tanya and Zaneveld, Jesse and Knight, Rob}, + doi = {10.1038/nmeth.f.303}, + journal = {Nature methods}, + number = {5}, + pages = {335}, + publisher = {Nature Publishing Group}, + title = {QIIME allows analysis of high-throughput community sequencing data}, + volume = {7}, + year = {2010} +} 
+ diff --git a/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/provenance/artifacts/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/metadata.yaml b/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/provenance/artifacts/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/metadata.yaml new file mode 100644 index 0000000..2184003 --- /dev/null +++ b/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/provenance/artifacts/cfd0e65b-05fb-4329-9618-15ecd0aec9b3/metadata.yaml @@ -0,0 +1,3 @@ +uuid: cfd0e65b-05fb-4329-9618-15ecd0aec9b3 +type: SampleData[JoinedSequencesWithQuality] +format: SingleLanePerSampleSingleEndFastqDirFmt diff --git a/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/provenance/citations.bib b/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/provenance/citations.bib new file mode 100644 index 0000000..fd4d2db --- /dev/null +++ b/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/provenance/citations.bib @@ -0,0 +1,12 @@ +@article{framework|qiime2:2018.6.0|0, + author = {Caporaso, J Gregory and Kuczynski, Justin and Stombaugh, Jesse and Bittinger, Kyle and Bushman, Frederic D and Costello, Elizabeth K and Fierer, Noah and Peña, Antonio Gonzalez and Goodrich, Julia K and Gordon, Jeffrey I and Huttley, Gavin A and Kelley, Scott T and Knights, Dan and Koenig, Jeremy E and Ley, Ruth E and Lozupone, Catherine A and McDonald, Daniel and Muegge, Brian D and Pirrung, Meg and Reeder, Jens and Sevinsky, Joel R and Turnbaugh, Peter J and Walters, William A and Widmann, Jeremy and Yatsunenko, Tanya and Zaneveld, Jesse and Knight, Rob}, + doi = {10.1038/nmeth.f.303}, + journal = {Nature methods}, + number = {5}, + pages = {335}, + publisher = {Nature Publishing Group}, + title = {QIIME allows analysis of high-throughput community sequencing data}, + volume = {7}, + year = {2010} +} + diff --git a/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/provenance/metadata.yaml 
b/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/provenance/metadata.yaml new file mode 100644 index 0000000..e44a75e --- /dev/null +++ b/tests/test_data/singleOut/75aea4f5-f10e-421e-91d2-feda9fe7b2e1/provenance/metadata.yaml @@ -0,0 +1,3 @@ +uuid: 75aea4f5-f10e-421e-91d2-feda9fe7b2e1 +type: SampleData[SequencesWithQuality] +format: SingleLanePerSampleSingleEndFastqDirFmt diff --git a/tests/test_main.py b/tests/test_main.py deleted file mode 100644 index 86b9181..0000000 --- a/tests/test_main.py +++ /dev/null @@ -1,244 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import tempfile -import shutil -import gzip -import subprocess -import filecmp - -from Bio import SeqIO -import pytest - -import itsxpress - -TEST_DIR = os.path.dirname(os.path.abspath(__file__)) -from itsxpress.definitions import ROOT_DIR, taxa_dict -hmmfile = os.path.join(ROOT_DIR,"ITSx_db","HMMs", taxa_dict["Fungi"]) - - - -def test_check_fastqs(): - fastq = os.path.join(TEST_DIR, "test_data", "4774-1-MSITS3_R1.fastq") - fastq2 = os.path.join(TEST_DIR, "test_data", "broken.fastq") - pytest.raises(ValueError, itsxpress.main._check_fastqs, fastq, fastq2) - -def test_check_fastq_gzs(): - fastq = os.path.join(TEST_DIR, "test_data", "4774-1-MSITS3_R1.fastq.gz") - fastq2 = os.path.join(TEST_DIR, "test_data", "broken.fastq.gz") - pytest.raises(ValueError, itsxpress.main._check_fastqs, fastq, fastq2) - -def test_its_position_init(): - itspos = itsxpress.main.ItsPosition(os.path.join(TEST_DIR, "test_data", "ex_tmpdir", "domtbl.txt"), "ITS2") - exp1 = {'tlen': 341, 'right': {'score': 59.1, 'to_pos': 326, 'from_pos': 282}, 'left': {'score': 52.2, 'to_pos': 128, 'from_pos': 84}} - print(itspos.ddict["M02696:28:000000000-ATWK5:1:1101:19331:3209"]) - assert (exp1 == itspos.ddict["M02696:28:000000000-ATWK5:1:1101:19331:3209"]) - exp2 = {'right': {'score': 34.0, 'to_pos': 370, 'from_pos': 327}, 'tlen': 385} - print(itspos.ddict["M02696:28:000000000-ATWK5:1:1101:23011:4341"]) - assert (exp2 == 
itspos.ddict["M02696:28:000000000-ATWK5:1:1101:23011:4341"]) - assert (len(itspos.ddict) == 137) - -def test_dedup(): - uc = os.path.join(TEST_DIR, "test_data", "ex_tmpdir", "uc.txt") - seq = os.path.join(TEST_DIR, "test_data", "ex_tmpdir", "seq.fq.gz") - rep = os.path.join(TEST_DIR, "test_data", "ex_tmpdir", "rep.fa") - dedup = itsxpress.main.Dedup( uc_file=uc, rep_file=rep, seq_file=seq) - # Check length of records - assert len(dedup.matchdict) == 227 - # Check that non-representative seqs are logged - assert dedup.matchdict['M02696:28:000000000-ATWK5:1:1101:11740:1800'] == 'M02696:28:000000000-ATWK5:1:1101:10899:1561' - # Check that representative seqs are logged - assert dedup.matchdict["M02696:28:000000000-ATWK5:1:1101:23011:4341"] == 'M02696:28:000000000-ATWK5:1:1101:23011:4341' - -def test_dedup_create_trimmed_seqs(): - tf = tempfile.mkdtemp() - print(tf) - uc = os.path.join(TEST_DIR, "test_data", "ex_tmpdir", "uc.txt") - seq = os.path.join(TEST_DIR, "test_data", "ex_tmpdir", "seq.fq.gz") - rep = os.path.join(TEST_DIR, "test_data", "ex_tmpdir", "rep.fa") - dedup = itsxpress.main.Dedup( uc_file=uc, rep_file=rep, seq_file=seq) - itspos = itsxpress.main.ItsPosition(os.path.join(TEST_DIR, "test_data", "ex_tmpdir", "domtbl.txt"), "ITS2") - # Check non gzipped - dedup.create_trimmed_seqs(os.path.join(tf,"testout.fastq"), gzipped=False, itspos=itspos) - with open(os.path.join(tf,"testout.fastq"), 'r') as f: - recs = SeqIO.parse(f, "fastq") - n = 0 - length = 0 - for rec in recs: - n += 1 - length += len(rec) - print("n: {}, length: {}".format(n, length)) - assert n == 226 - assert length == 42637 - shutil.rmtree(tf) - -def test_dedup_create_trimmed_seqs_gzipped(): - tf = tempfile.mkdtemp() - print(tf) - uc = os.path.join(TEST_DIR, "test_data", "ex_tmpdir", "uc.txt") - seq = os.path.join(TEST_DIR, "test_data", "ex_tmpdir", "seq.fq.gz") - rep = os.path.join(TEST_DIR, "test_data", "ex_tmpdir", "rep.fa") - dedup = itsxpress.main.Dedup( uc_file=uc, rep_file=rep, 
seq_file=seq) - itspos = itsxpress.main.ItsPosition(os.path.join(TEST_DIR, "test_data", "ex_tmpdir", "domtbl.txt"), "ITS2") - # Check gzipped - dedup.create_trimmed_seqs(os.path.join(tf,"testout.fastq.gz"), gzipped=True, itspos=itspos) - with gzip.open(os.path.join(tf,"testout.fastq.gz"), 'rt') as f: - recs = SeqIO.parse(f, "fastq") - n = 0 - length = 0 - for rec in recs: - n += 1 - length += len(rec) - print("n: {}, length: {}".format(n,length)) - assert n == 226 - assert length == 42637 - shutil.rmtree(tf) - - -#Following test is removed in ITSxpress version 2.0.0, as interleaved files are no longer supported. - -def test_seq_sample_paired_interleaved(): - fastq = os.path.join(TEST_DIR, "test_data", "4774-1-MSITS3_interleaved.fastq") - sobj = itsxpress.main.SeqSamplePairedInterleaved(fastq=fastq, tempdir=".") - sobj._merge_reads(threads=1) - sobj.deduplicate(threads=1) - sobj._search(hmmfile=hmmfile, threads=1) - shutil.rmtree(sobj.tempdir) - -#Following test will fail if Vsearch version is less than 2.20 or hmmer is not installed -def test_seq_sample_not_paired(): - fastq = os.path.join(TEST_DIR, "test_data", "4774-1-MSITS3_merged.fastq") - sobj = itsxpress.main.SeqSampleNotPaired(fastq=fastq, tempdir=".") - sobj.deduplicate(threads=1) - sobj._search(hmmfile=hmmfile, threads=1) - shutil.rmtree(sobj.tempdir) - -def test_seq_sample_not_paired_clustered(): - fastq = os.path.join(TEST_DIR, "test_data", "4774-1-MSITS3_merged.fastq") - sobj = itsxpress.main.SeqSampleNotPaired(fastq=fastq, tempdir=".") - sobj.cluster(threads=1, cluster_id=0.995) - sobj._search(hmmfile=hmmfile, threads=1) - shutil.rmtree(sobj.tempdir) - -def test_seq_sample_paired_not_interleaved(): - fastq = os.path.join(TEST_DIR, "test_data", "4774-1-MSITS3_R1.fastq") - fastq2 = os.path.join(TEST_DIR, "test_data", "4774-1-MSITS3_R2.fastq") - sobj = itsxpress.main.SeqSamplePairedNotInterleaved(fastq=fastq, tempdir=".", fastq2=fastq2) - sobj._merge_reads(threads=1) - sobj.deduplicate(threads=1) - 
sobj._search(hmmfile=hmmfile, threads=1) - shutil.rmtree(sobj.tempdir) - - -def test_is_paired(): - paired_end,interleaved = itsxpress.main._is_paired("fastq1.fq", "fastq2.fq", single_end=False) - assert paired_end == True - - paired_end,interleaved= itsxpress.main._is_paired("fastq1.fq", None, single_end=False) - assert paired_end == True - - paired_end,interleaved= itsxpress.main._is_paired("fastq1.fq", None, single_end=True) - assert paired_end == False - - -def test_myparser(): - parser = itsxpress.main.myparser() - args = parser.parse_args(['--fastq', 'test.fastq','--outfile', 'test.out','--tempdir', 'dirt','--region','ITS1','--taxa', 'Fungi']) - assert (args.fastq == 'test.fastq') - -#Following test is removed as interleaved files aren't supported anymore -def test_main_interleaved(): - parser = itsxpress.main.myparser() - tf = tempfile.mkdtemp() - print(tf) - fastq = os.path.join(TEST_DIR, "test_data", "4774-1-MSITS3_interleaved.fastq") - outfile = os.path.join(tf,'testout.fastq') - args = parser.parse_args(['--fastq', fastq,'--outfile', outfile, '--region','ITS2', '--taxa', 'Fungi', '--keeptemp']) - itsxpress.main.main(args=args) - seqs = SeqIO.parse(outfile, 'fastq') - n = sum(1 for _ in seqs) - print(n) - assert(n == 227) - shutil.rmtree(tf) - -def test_main_paired(): - parser = itsxpress.main.myparser() - tf = tempfile.mkdtemp() - fastq = os.path.join(TEST_DIR, "test_data", "4774-1-MSITS3_R1.fastq") - fastq2 = os.path.join(TEST_DIR, "test_data", "4774-1-MSITS3_R2.fastq") - outfile = os.path.join(tf,'testout.fastq') - validation = os.path.join(TEST_DIR, "test_data", "testout.fastq") - args = parser.parse_args(['--fastq', fastq, '--fastq2', fastq2, '--outfile', outfile, '--region','ITS2', '--taxa', 'Fungi', '--threads', '1']) - itsxpress.main.main(args=args) - seqs = SeqIO.parse(outfile, 'fastq') - n = sum(1 for _ in seqs) - assert (n == 227) - shutil.rmtree(tf) - -def test_main_merged(): - parser = itsxpress.main.myparser() - tf = tempfile.mkdtemp() - 
fastq = os.path.join(TEST_DIR, "test_data", "4774-1-MSITS3_merged.fastq") - outfile = os.path.join(tf,'testout.fastq') - validation = os.path.join(TEST_DIR, "test_data", "testout.fastq") - args = parser.parse_args(['--fastq', fastq, '--single_end', '--outfile', outfile, '--region','ITS2', '--taxa', 'Fungi', '--threads', '1']) - itsxpress.main.main(args=args) - seqs = SeqIO.parse(outfile, 'fastq') - n = sum(1 for _ in seqs) - print(n) - assert (n == 226) - shutil.rmtree(tf) - -def test_main_paired_no_cluster(): - parser = itsxpress.main.myparser() - tf = tempfile.mkdtemp() - fastq = os.path.join(TEST_DIR, "test_data", "4774-1-MSITS3_R1.fastq") - fastq2 = os.path.join(TEST_DIR, "test_data", "4774-1-MSITS3_R2.fastq") - outfile = os.path.join(tf,'testout.fastq') - validation = os.path.join(TEST_DIR, "test_data", "testout.fastq") - args = parser.parse_args(['--fastq', fastq, '--fastq2', fastq2, '--outfile', outfile, '--region','ITS2', '--taxa', 'Fungi', '--cluster_id', '1', '--threads', '1']) - itsxpress.main.main(args=args) - seqs = SeqIO.parse(outfile, 'fastq') - n = sum(1 for _ in seqs) - assert (n==227) - shutil.rmtree(tf) - - -def test_get_paired_seq_generator(): - uc = os.path.join(TEST_DIR, "test_data", "ex_tmpdir", "uc.txt") - seq = os.path.join(TEST_DIR, "test_data", "ex_tmpdir", "seq.fq.gz") - rep = os.path.join(TEST_DIR, "test_data", "ex_tmpdir", "rep.fa") - fastq = os.path.join(TEST_DIR, "test_data", "4774-1-MSITS3_R1.fastq") - fastq2 = os.path.join(TEST_DIR, "test_data", "4774-1-MSITS3_R2.fastq") - dedup = itsxpress.main.Dedup( uc_file=uc, rep_file=rep, seq_file=seq, fastq=fastq, fastq2=fastq2) - itspos = itsxpress.main.ItsPosition(os.path.join(TEST_DIR, "test_data", "ex_tmpdir", "domtbl.txt"), "ITS2") - f = open(fastq, 'r') - g = open(fastq2, 'r') - seqgen1 = SeqIO.parse(f, 'fastq') - seqgen2 = SeqIO.parse(g, 'fastq') - zipseqgen = zip(seqgen1, seqgen2) - seqs1, seqs2 = dedup._get_paired_seq_generator(zipseqgen, itspos) - n1 = 0 - n2 = 0 - for rec in 
seqs1: - n1 += 1 - for rec in seqs2: - n2 += 1 - assert (n1==226) - assert (n2==226) - -def test_create_paired_trimmed_seqs(): - uc = os.path.join(TEST_DIR, "test_data", "ex_tmpdir", "uc.txt") - seq = os.path.join(TEST_DIR, "test_data", "ex_tmpdir", "seq.fq.gz") - rep = os.path.join(TEST_DIR, "test_data", "ex_tmpdir", "rep.fa") - fastq = os.path.join(TEST_DIR, "test_data", "4774-1-MSITS3_R1.fastq") - fastq2 = os.path.join(TEST_DIR, "test_data", "4774-1-MSITS3_R2.fastq") - dedup = itsxpress.main.Dedup(uc_file=uc, rep_file=rep, seq_file=seq, fastq=fastq, fastq2=fastq2) - itspos = itsxpress.main.ItsPosition(os.path.join(TEST_DIR, "test_data", "ex_tmpdir", "domtbl.txt"), "ITS2") - tf = tempfile.mkdtemp(".") - print(tf) - t1 = os.path.join(tf,'t2_r1.fq') - t2 = os.path.join(tf,'t2_r2.fq') - dedup.create_paired_trimmed_seqs(t1, t2, False, itspos) - assert (filecmp.cmp(t1, os.path.join(TEST_DIR, "test_data", "t2_r1.fq"))) - assert (filecmp.cmp(t2, os.path.join(TEST_DIR, "test_data", "t2_r2.fq"))) - shutil.rmtree(tf) \ No newline at end of file diff --git a/tests/test_main_pytest.py b/tests/test_main_pytest.py new file mode 100755 index 0000000..e852af0 --- /dev/null +++ b/tests/test_main_pytest.py @@ -0,0 +1,512 @@ +# -*- coding: utf-8 -*- +import os +import tempfile +import shutil +import gzip +import pyzstd as zstd +import subprocess +import filecmp +import sys + +from Bio import SeqIO +import pytest + +import itsxpress + +TEST_DIR = os.path.dirname(os.path.abspath(__name__)) +from itsxpress.definitions import ROOT_DIR, taxa_dict +hmmfile = os.path.join(ROOT_DIR,"ITSx_db","HMMs", taxa_dict["Fungi"]) + + + +def test_check_fastqs(): + fastq = os.path.join(TEST_DIR, "test_data", "4774-1-MSITS3_R1.fastq") + fastq2 = os.path.join(TEST_DIR, "test_data", "broken.fastq") + pytest.raises(ValueError, itsxpress.main._check_fastqs, fastq, fastq2) + +def test_check_fastq_gzs(): + fastq = os.path.join(TEST_DIR,"test_data", "4774-1-MSITS3_R1.fastq.gz") + fastq2 = 
os.path.join(TEST_DIR,"test_data", "broken.fastq.gz") + pytest.raises(ValueError, itsxpress.main._check_fastqs, fastq, fastq2) + +def test_its_position_init(): + itspos = itsxpress.main.ItsPosition(os.path.join(TEST_DIR,"test_data", "ex_tmpdir", "domtbl.txt"), "ITS2") + exp1 = {'tlen': 341, 'right': {'score': 59.1, 'to_pos': 326, 'from_pos': 282}, 'left': {'score': 52.2, 'to_pos': 128, 'from_pos': 84}} + print(itspos.ddict["M02696:28:000000000-ATWK5:1:1101:19331:3209"]) + assert (exp1 == itspos.ddict["M02696:28:000000000-ATWK5:1:1101:19331:3209"]) + exp2 = {'right': {'score': 34.0, 'to_pos': 370, 'from_pos': 327}, 'tlen': 385} + print(itspos.ddict["M02696:28:000000000-ATWK5:1:1101:23011:4341"]) + assert (exp2 == itspos.ddict["M02696:28:000000000-ATWK5:1:1101:23011:4341"]) + assert (len(itspos.ddict) == 137) + +def test_dedup(): + uc = os.path.join(TEST_DIR,"test_data", "ex_tmpdir", "uc.txt") + seq = os.path.join(TEST_DIR,"test_data", "ex_tmpdir", "seq.fq.gz") + rep = os.path.join(TEST_DIR,"test_data", "ex_tmpdir", "rep.fa") + dedup = itsxpress.main.Dedup( uc_file=uc, rep_file=rep, seq_file=seq) + # Check length of records + assert len(dedup.matchdict) == 227 + # Check that non-representative seqs are logged + assert dedup.matchdict['M02696:28:000000000-ATWK5:1:1101:11740:1800'] == 'M02696:28:000000000-ATWK5:1:1101:10899:1561' + # Check that representative seqs are logged + assert dedup.matchdict["M02696:28:000000000-ATWK5:1:1101:23011:4341"] == 'M02696:28:000000000-ATWK5:1:1101:23011:4341' + +def test_dedup_create_trimmed_seqs(): + tf = tempfile.mkdtemp() + print(tf) + uc = os.path.join(TEST_DIR,"test_data", "ex_tmpdir", "uc.txt") + seq = os.path.join(TEST_DIR,"test_data", "ex_tmpdir", "seq.fq.gz") + rep = os.path.join(TEST_DIR,"test_data", "ex_tmpdir", "rep.fa") + dedup = itsxpress.main.Dedup( uc_file=uc, rep_file=rep, seq_file=seq) + itspos = itsxpress.main.ItsPosition(os.path.join(TEST_DIR,"test_data", "ex_tmpdir", "domtbl.txt"), "ITS2") + # Check non gzipped + 
dedup.create_trimmed_seqs(os.path.join(tf,"testout.fastq"), gzipped=False, zstd_file=False, wri_file=True, itspos=itspos,tempdir=tf) + with open(os.path.join(tf,"testout.fastq"), 'r') as f: + recs = SeqIO.parse(f, "fastq") + n = 0 + length = 0 + for rec in recs: + n += 1 + length += len(rec) + print("n: {}, length: {}".format(n, length)) + assert n == 226 + assert length == 42637 + shutil.rmtree(tf) + +def test_dedup_create_trimmed_seqs_gzipped(): + tf = tempfile.mkdtemp() + print(tf) + uc = os.path.join(TEST_DIR,"test_data", "ex_tmpdir", "uc.txt") + seq = os.path.join(TEST_DIR,"test_data", "ex_tmpdir", "seq.fq.gz") + rep = os.path.join(TEST_DIR,"test_data", "ex_tmpdir", "rep.fa") + dedup = itsxpress.main.Dedup( uc_file=uc, rep_file=rep, seq_file=seq) + itspos = itsxpress.main.ItsPosition(os.path.join(TEST_DIR, "test_data", "ex_tmpdir", "domtbl.txt"), "ITS2") + # Check gzipped + dedup.create_trimmed_seqs(os.path.join(tf,"testout.fastq.gz"), gzipped=True, zstd_file=False, wri_file=True, itspos=itspos,tempdir=tf) + with gzip.open(os.path.join(tf,"testout.fastq.gz"), 'rt') as f: + recs = SeqIO.parse(f, "fastq") + n = 0 + length = 0 + for rec in recs: + n += 1 + length += len(rec) + print("n: {}, length: {}".format(n,length)) + assert n == 226 + assert length == 42637 + shutil.rmtree(tf) + +def test_dedup_create_trimmed_seqs_zst(): + tf = tempfile.mkdtemp() + print(tf) + uc = os.path.join(TEST_DIR,"test_data", "ex_tmpdir", "uc.txt") + seq = os.path.join(TEST_DIR,"test_data", "ex_tmpdir", "seq.fq.gz") + rep = os.path.join(TEST_DIR,"test_data", "ex_tmpdir", "rep.fa") + dedup = itsxpress.main.Dedup( uc_file=uc, rep_file=rep, seq_file=seq) + itspos = itsxpress.main.ItsPosition(os.path.join(TEST_DIR,"test_data", "ex_tmpdir", "domtbl.txt"), "ITS2") + # Check zstd compression + dedup.create_trimmed_seqs(os.path.join(tf,"testout.fastq.zst"), gzipped=False, zstd_file=True, wri_file=True, itspos=itspos,tempdir=tf) + with zstd.open(os.path.join(tf,"testout.fastq.zst"), 'rt') as 
f: + recs = SeqIO.parse(f, "fastq") + n = 0 + length = 0 + for rec in recs: + n += 1 + length += len(rec) + print("n: {}, length: {}".format(n,length)) + assert n == 226 + assert length == 42637 + shutil.rmtree(tf) + +#Following test is removed in ITSxpress version 2.0.0, as interleaved files are no longer supported. + +# def test_seq_sample_paired_interleaved(): +# fastq = os.path.join(TEST_DIR, "test_data", "4774-1-MSITS3_interleaved.fastq") +# sobj = itsxpress.main.SeqSamplePairedInterleaved(fastq=fastq, tempdir=".") +# sobj._merge_reads(threads=1) +# sobj.deduplicate(threads=1) +# sobj._search(hmmfile=hmmfile, threads=1) +# shutil.rmtree(sobj.tempdir) + +#Following test will fail if Vsearch version is less than 2.20 or hmmer is not installed +def test_seq_sample_not_paired(): + fastq = os.path.join(TEST_DIR,"test_data", "4774-1-MSITS3_merged.fastq") + sobj = itsxpress.main.SeqSampleNotPaired(fastq=fastq, tempdir=".") + sobj.deduplicate(threads=1) + sobj._search(hmmfile=hmmfile, threads=1) + shutil.rmtree(sobj.tempdir) + +def test_seq_sample_not_paired_clustered(): + fastq = os.path.join(TEST_DIR,"test_data", "4774-1-MSITS3_merged.fastq") + sobj = itsxpress.main.SeqSampleNotPaired(fastq=fastq, tempdir=".") + sobj.cluster(threads=1, cluster_id=0.995) + sobj._search(hmmfile=hmmfile, threads=1) + shutil.rmtree(sobj.tempdir) + +def test_seq_sample_paired_not_interleaved(): + fastq = os.path.join(TEST_DIR, "test_data", "4774-1-MSITS3_R1.fastq") + fastq2 = os.path.join(TEST_DIR, "test_data", "4774-1-MSITS3_R2.fastq") + sobj = itsxpress.main.SeqSamplePairedNotInterleaved(fastq=fastq, tempdir=".", fastq2=fastq2) + sobj._merge_reads(stagger = True, threads=1) + sobj.deduplicate(threads=1) + sobj._search(hmmfile=hmmfile, threads=1) + shutil.rmtree(sobj.tempdir) + + +def test_is_paired(): + paired_end= itsxpress.main._is_paired("fastq1.fq", "fastq2.fq", single_end=False) + assert paired_end == True + + paired_end= itsxpress.main._is_paired("fastq1.fq", None, 
single_end=False) + assert paired_end == True + + paired_end= itsxpress.main._is_paired("fastq1.fq", None, single_end=True) + assert paired_end == False + + +def test_myparser(): + parser = itsxpress.main.myparser() + args = parser.parse_args(['--fastq', 'test.fastq','--outfile', 'test.out','--tempdir', 'dirt','--region','ITS1','--taxa', 'Fungi']) + assert (args.fastq == 'test.fastq') + +#Following test is removed as interleaved files aren't supported in version 2 of itsxpress +# def test_main_interleaved(): +# parser = itsxpress.main.myparser() +# tf = tempfile.mkdtemp() +# print(tf) +# fastq = os.path.join(TEST_DIR, "test_data", "4774-1-MSITS3_interleaved.fastq") +# outfile = os.path.join(tf,'testout.fastq') +# args = parser.parse_args(['--fastq', fastq,'--outfile', outfile, '--region','ITS2', '--taxa', 'Fungi', '--keeptemp']) +# itsxpress.main.main(args=args) +# seqs = SeqIO.parse(outfile, 'fastq') +# n = sum(1 for _ in seqs) +# print(n) +# assert (n == 227) +# shutil.rmtree(tf) + +def test_main_paired(): + parser = itsxpress.main.myparser() + tf = tempfile.mkdtemp() + fastq = os.path.join(TEST_DIR,"test_data", "4774-1-MSITS3_R1.fastq") + fastq2 = os.path.join(TEST_DIR,"test_data", "4774-1-MSITS3_R2.fastq") + outfile = os.path.join(tf,'testout.fastq') + validation = os.path.join(TEST_DIR,"test_data", "testout.fastq") + args = parser.parse_args(['--fastq', fastq, '--fastq2', fastq2, '--outfile', outfile, '--region','ITS2', '--taxa', 'Fungi', '--threads', '1']) + itsxpress.main.main(args=args) + seqs = SeqIO.parse(outfile, 'fastq') + n = sum(1 for _ in seqs) + #assert (n == 227) + assert (n==235) + shutil.rmtree(tf) + +def test_main_merged(): + parser = itsxpress.main.myparser() + tf = tempfile.mkdtemp() + fastq = os.path.join(TEST_DIR,"test_data", "4774-1-MSITS3_merged.fastq") + outfile = os.path.join(tf,'testout.fastq') + validation = os.path.join(TEST_DIR,"test_data", "testout.fastq") + args = parser.parse_args(['--fastq', fastq, '--single_end', '--outfile', 
outfile, '--region','ITS2', '--taxa', 'Fungi', '--threads', '1']) + itsxpress.main.main(args=args) + seqs = SeqIO.parse(outfile, 'fastq') + n = sum(1 for _ in seqs) + print(n) + assert (n == 226) + shutil.rmtree(tf) + +def test_main_paired_no_cluster(): + parser = itsxpress.main.myparser() + tf = tempfile.mkdtemp() + fastq = os.path.join(TEST_DIR, "test_data", "4774-1-MSITS3_R1.fastq") + fastq2 = os.path.join(TEST_DIR,"test_data", "4774-1-MSITS3_R2.fastq") + outfile = os.path.join(tf,'testout.fastq') + validation = os.path.join(TEST_DIR, "test_data", "testout.fastq") + args = parser.parse_args(['--fastq', fastq, '--fastq2', fastq2, '--outfile', outfile, '--region','ITS2', '--taxa', 'Fungi', '--cluster_id', '1', '--threads', '1']) + itsxpress.main.main(args=args) + seqs = SeqIO.parse(outfile, 'fastq') + n = sum(1 for _ in seqs) + #assert (n==227) + assert (n==235) + shutil.rmtree(tf) + + +def test_get_paired_seq_generator(): + uc = os.path.join(TEST_DIR, "test_data", "ex_tmpdir", "uc.txt") + seq = os.path.join(TEST_DIR,"test_data", "ex_tmpdir", "seq.fq.gz") + rep = os.path.join(TEST_DIR,"test_data", "ex_tmpdir", "rep.fa") + fastq = os.path.join(TEST_DIR,"test_data", "4774-1-MSITS3_R1.fastq") + fastq2 = os.path.join(TEST_DIR, "test_data", "4774-1-MSITS3_R2.fastq") + dedup = itsxpress.main.Dedup( uc_file=uc, rep_file=rep, seq_file=seq, fastq=fastq, fastq2=fastq2) + itspos = itsxpress.main.ItsPosition(os.path.join(TEST_DIR, "test_data", "ex_tmpdir", "domtbl.txt"), "ITS2") + f = open(fastq, 'r') + g = open(fastq2, 'r') + seqgen1 = SeqIO.parse(f, 'fastq') + seqgen2 = SeqIO.parse(g, 'fastq') + zipseqgen = zip(seqgen1, seqgen2) + seqs1, seqs2 = dedup._get_paired_seq_generator(zipseqgen, itspos, wri_file = True) + n1 = 0 + n2 = 0 + for rec in seqs1: + n1 += 1 + for rec in seqs2: + n2 += 1 + assert (n1==226) + assert (n2==226) + +def test_create_paired_trimmed_seqs(): + uc = os.path.join(TEST_DIR, "test_data", "ex_tmpdir", "uc.txt") + seq = os.path.join(TEST_DIR, 
"test_data", "ex_tmpdir", "seq.fq.gz") + rep = os.path.join(TEST_DIR, "test_data", "ex_tmpdir", "rep.fa") + fastq = os.path.join(TEST_DIR, "test_data", "4774-1-MSITS3_R1.fastq") + fastq2 = os.path.join(TEST_DIR, "test_data", "4774-1-MSITS3_R2.fastq") + dedup = itsxpress.main.Dedup(uc_file=uc, rep_file=rep, seq_file=seq, fastq=fastq, fastq2=fastq2) + itspos = itsxpress.main.ItsPosition(os.path.join(TEST_DIR,"test_data", "ex_tmpdir", "domtbl.txt"), "ITS2") + tf = tempfile.mkdtemp(".") + print(tf) + t1 = os.path.join(tf,'t2_r1.fq') + t2 = os.path.join(tf,'t2_r2.fq') + dedup.create_paired_trimmed_seqs(t1, t2, False,False, itspos,True) + assert (filecmp.cmp(t1, os.path.join(TEST_DIR, "test_data", "t2_r1.fq"))) + assert (filecmp.cmp(t2, os.path.join(TEST_DIR, "test_data", "t2_r2.fq"))) + shutil.rmtree(tf) + +try: + import qiime2 + from qiime2.util import redirected_stdio + import os + import unittest + from sys import getsizeof + + #from nose.tools import eq_, raises + from q2_types.per_sample_sequences import (SingleLanePerSampleSingleEndFastqDirFmt, + SingleLanePerSamplePairedEndFastqDirFmt, + FastqManifestFormat) + + import pandas as pd + import itsxpress.q2_itsxpress as q2_itsxpress + import itsxpress.plugin_setup + # The test data dir + TEST_DIR = os.path.join(os.path.dirname(os.path.abspath(__name__)),"tests") + # Test info 1 + TEST_FILE = os.path.join(TEST_DIR, + "test_data", + "paired", + "445cf54a-bf06-4852-8010-13a60fa1598c", + "data") + + TEST_DATA = SingleLanePerSamplePairedEndFastqDirFmt(TEST_FILE, "r") + # Test info 2 + TEST_FILE_PBMD = os.path.join(TEST_DIR, + "test_data", + "pairedBrokenMissingData", + "50d5f31a-a761-4c04-990c-e7668fe6bf00", + "data") + + TEST_DATA_PBMD = SingleLanePerSamplePairedEndFastqDirFmt(TEST_FILE_PBMD, "r") + # Test info 3 + TEST_FILE_PAF = os.path.join(TEST_DIR, + "test_data", + "pairedAllForward", + "445cf54a-bf06-4852-8010-13a60fa1598c", + "data") + TEST_DATA_PAF = SingleLanePerSamplePairedEndFastqDirFmt(TEST_FILE_PAF, "r") + # 
Test info 4 + TEST_FILE_OUT = os.path.join(TEST_DIR, + "test_data", + "out", + "d9955749-00d5-44ae-a628-4b2da43000e1", + "data") + TEST_DATA_OUT = SingleLanePerSamplePairedEndFastqDirFmt(TEST_FILE_OUT, "r") + # Test info 5 + TEST_FILE_SINGLEOUT = os.path.join(TEST_DIR, + "test_data", + "singleOut", + "75aea4f5-f10e-421e-91d2-feda9fe7b2e1", + "data") + TEST_DATA_SINGLEOUT = SingleLanePerSamplePairedEndFastqDirFmt(TEST_FILE_SINGLEOUT, "r") + # Test info 6 + TEST_FILE_SINGLEIN = os.path.join(TEST_DIR, + "test_data", + "singleIn", + "cfd0e65b-05fb-4329-9618-15ecd0aec9b3", + "data") + TEST_DATA_SINGLEIN = SingleLanePerSampleSingleEndFastqDirFmt(TEST_FILE_SINGLEIN, "r") + # Test artifact1 + ARTIFACT_TYPE_P = "SampleData[PairedEndSequencesWithQuality]" + # Test artifact2 + ARTIFACT_TYPE_S = "SampleData[SequencesWithQuality]" + + + def test_taxa_prefix_to_taxa(): + exp1 = q2_itsxpress._taxa_prefix_to_taxa(taxa_prefix="A") + assert exp1, "Alveolata" + + + class SetFastqsAndCheckTests(unittest.TestCase): + def test_single(self): + samples = TEST_DATA_SINGLEIN.manifest.view(pd.DataFrame) + for sample in samples.itertuples(): + obs = q2_itsxpress._set_fastqs_and_check(fastq=sample.forward, + fastq2=None, + sample_id=sample.Index, + single_end=True, + reversed_primers=False, + allow_staggered_reads=False, + threads=1) + self.assertTrue("4774-1-MSITS3" in obs.fastq) + def test_paired(self): + samples = TEST_DATA.manifest.view(pd.DataFrame) + for sample in samples.itertuples(): + obs = q2_itsxpress._set_fastqs_and_check(fastq=sample.forward, + fastq2=sample.reverse, + sample_id=sample.Index, + single_end=True, + reversed_primers=False, + allow_staggered_reads=False, + threads=1) + self.assertTrue("4774-1-MSITS3" in obs.fastq) + + #This test seems to test bbmap? 
It doesn't fail in V2 + # def test_trim_pair_no_bb(self): + # samples = TEST_DATA.manifest.view(pd.DataFrame) + # for sample in samples.itertuples(): + # with pytest.raises(ValueError): + # q2_itsxpress._set_fastqs_and_check(fastq=sample.forward, + # fastq2=sample.reverse, + # sample_id=sample.Index, + # single_end=False, + # reversed_primers=False, + # threads=1) + # with pytest.raises(ValueError): + # q2_itsxpress._set_fastqs_and_check(fastq=sample.forward, + # fastq2=sample.reverse, + # sample_id=sample.Index, + # single_end=True, + # reversed_primers=False, + # threads=1) + + def test_trim_single_no_cluster(): + threads = 1 + taxa = "F" + sample = None # replace with actual sample object + # Fix for missing argument + obs = q2_itsxpress._set_fastqs_and_check(fastq=sample.forward, + fastq2=None, + sample_id=sample.Index, + single_end=True, + reversed_primers=False, + allow_staggered_reads=False, + threads=threads, + cluster=False) + + + +#This test seems to test bbmap? It doesn't fail in V2 + # def test_trim_pair_no_bb(self): + # samples = TEST_DATA.manifest.view(pd.DataFrame) + # for sample in samples.itertuples(): + # with pytest.raises(ValueError): + # q2_itsxpress._set_fastqs_and_check(fastq=sample.forward, + # fastq2=sample.reverse, + # sample_id=sample.Index, + # single_end=False, + # reversed_primers=False, + # threads=1) + # with pytest.raises(ValueError): + # q2_itsxpress._set_fastqs_and_check(fastq=sample.forward, + # fastq2=sample.reverse, + # sample_id=sample.Index, + # single_end=True, + # reversed_primers=False, + # threads=1) + + def test_trim_single_no_cluster(): + threads = 1 + taxa = "F" + region = "ITS2" + cluster_id = 1 + + exp1 = q2_itsxpress.trim_single(per_sample_sequences=TEST_DATA_SINGLEIN, + threads=threads, + taxa=taxa, + region=region, + cluster_id=cluster_id) + exp2 = getsizeof(exp1) + exp3 = getsizeof(TEST_DATA_SINGLEOUT) + assert exp2, exp3 +# #Testing if there is no hmmer package? 
+# def test_trim_pair_no_hmmer(): +# threads = 1 +# taxa = "F" +# region = "ITS2" + +# pytest.raises(ValueError, lambda: q2_itsxpress.trim_pair(per_sample_sequences=TEST_DATA, +# threads=threads, +# taxa=taxa, +# region=region)) + + class TrimTests(unittest.TestCase): + def setUp(self): + self.plugin = qiime2.sdk.PluginManager().plugins['itsxpress'] + self.trim_single_fn = self.plugin.methods['trim_single'] + self.trim_paired_fn = self.plugin.methods['trim_pair'] + self.trim_paired_unmerged_fn = \ + self.plugin.methods['trim_pair_output_unmerged'] + + self.se_seqs = qiime2.Artifact.import_data( + 'SampleData[SequencesWithQuality]', + TEST_FILE_SINGLEIN, + 'SingleLanePerSampleSingleEndFastqDirFmt') + self.pe_seqs = qiime2.Artifact.import_data( + 'SampleData[PairedEndSequencesWithQuality]', + TEST_FILE, + 'SingleLanePerSamplePairedEndFastqDirFmt') + + def test_trim_single_success(self): + with redirected_stdio(stderr=os.devnull): + obs_artifact, = self.trim_single_fn(self.se_seqs, 'ITS2') + self.assertEqual(str(obs_artifact.type), + 'SampleData[SequencesWithQuality]') + + obs_dir = obs_artifact.view(SingleLanePerSampleSingleEndFastqDirFmt) + self.assertEqual(getsizeof(obs_dir), getsizeof(TEST_DATA_SINGLEOUT)) + + obs = obs_artifact.view(SingleLanePerSampleSingleEndFastqDirFmt) + obs_manifest = list(obs.manifest.view(FastqManifestFormat).open()) + exp_manifest = [ + 'sample-id,filename,direction\n', + '4774-1-MSITS3,4774-1-MSITS3_0_L001_R1_001.fastq.gz,forward\n'] + self.assertEqual(obs_manifest, exp_manifest) + + def test_trim_pair_success(self): + with redirected_stdio(stderr=os.devnull): + obs_artifact, = self.trim_paired_fn(self.pe_seqs, 'ITS2') + self.assertEqual(str(obs_artifact.type), + 'SampleData[JoinedSequencesWithQuality]') + + obs_dir = obs_artifact.view(SingleLanePerSampleSingleEndFastqDirFmt) + self.assertEqual(getsizeof(obs_dir), getsizeof(TEST_DATA_OUT)) + + obs = obs_artifact.view(SingleLanePerSampleSingleEndFastqDirFmt) + obs_manifest = 
list(obs.manifest.view(FastqManifestFormat).open()) + exp_manifest = [ + 'sample-id,filename,direction\n', + '4774-1-MSITS3,4774-1-MSITS3_0_L001_R1_001.fastq.gz,forward\n'] + self.assertEqual(obs_manifest, exp_manifest) + + def test_trim_pair_output_unmerged_success(self): + with redirected_stdio(stderr=os.devnull): + obs_artifact, = self.trim_paired_unmerged_fn(self.pe_seqs, 'ITS2') + self.assertEqual(str(obs_artifact.type), + 'SampleData[PairedEndSequencesWithQuality]') + obs = obs_artifact.view(SingleLanePerSamplePairedEndFastqDirFmt) + obs_manifest = list(obs.manifest.view(FastqManifestFormat).open()) + exp_manifest = [ + 'sample-id,filename,direction\n', + '4774-1-MSITS3,4774-1-MSITS3_0_L001_R1_001.fastq.gz,forward\n', + '4774-1-MSITS3,4774-1-MSITS3_1_L001_R2_001.fastq.gz,reverse\n'] + self.assertEqual(obs_manifest, exp_manifest) + + +except ModuleNotFoundError as e: + #logging + print("{}.Could not initialize the Qiime plugin portion of ITSxpress. Command line ITSxpress will still work normally. If you wish to use the Qiime2 ITSxpress plugin, you need to install Qiime2 first into your environment.\n".format(e)) + pass \ No newline at end of file