From 5d1e22f1f33bfb33d4ef2284135c816bc228a019 Mon Sep 17 00:00:00 2001 From: Colin Wood <68213641+colinvwood@users.noreply.github.com> Date: Fri, 13 Dec 2024 16:19:59 -0700 Subject: [PATCH] support q-score filtering on both forward and reverse reads (#72) --- .gitignore | 1 + q2_quality_filter/_filter.py | 570 +++++++++---- q2_quality_filter/_format.py | 19 +- q2_quality_filter/_transformer.py | 17 +- q2_quality_filter/plugin_setup.py | 36 +- q2_quality_filter/test/test_filter.py | 362 --------- q2_quality_filter/{test => tests}/__init__.py | 0 .../{test => tests}/data/numeric_ids.qza | Bin .../Human-Kneecap2_S2_L001_R1_001.fastq.gz | Bin 0 -> 205 bytes .../Human-Kneecap2_S2_L001_R2_001.fastq.gz | Bin 0 -> 270 bytes .../Human-Kneecap3_S3_L001_R1_001.fastq.gz | Bin 0 -> 289 bytes .../Human-Kneecap3_S3_L001_R2_001.fastq.gz | Bin 0 -> 270 bytes .../Human-Kneecap_S1_L001_R1_001.fastq.gz | Bin 0 -> 291 bytes .../Human-Kneecap_S1_L001_R2_001.fastq.gz | Bin 0 -> 270 bytes .../tests/data/paired-end-data/MANIFEST | 9 + .../tests/data/paired-end-data/metadata.yml | 1 + .../{test => tests}/data/real_data.qza | Bin .../{test => tests}/data/real_data_joined.qza | Bin .../{test => tests}/data/simple.fastq.gz | Bin .../{test => tests}/data/simple.qza | Bin .../{test => tests}/data/stats-1.txt | 0 .../{test => tests}/data/stats-numeric.txt | 0 q2_quality_filter/tests/test_filter.py | 764 ++++++++++++++++++ setup.py | 2 +- 24 files changed, 1245 insertions(+), 536 deletions(-) delete mode 100644 q2_quality_filter/test/test_filter.py rename q2_quality_filter/{test => tests}/__init__.py (100%) rename q2_quality_filter/{test => tests}/data/numeric_ids.qza (100%) create mode 100644 q2_quality_filter/tests/data/paired-end-data/Human-Kneecap2_S2_L001_R1_001.fastq.gz create mode 100644 q2_quality_filter/tests/data/paired-end-data/Human-Kneecap2_S2_L001_R2_001.fastq.gz create mode 100644 q2_quality_filter/tests/data/paired-end-data/Human-Kneecap3_S3_L001_R1_001.fastq.gz create mode 100644 q2_quality_filter/tests/data/paired-end-data/Human-Kneecap3_S3_L001_R2_001.fastq.gz create mode 100644 q2_quality_filter/tests/data/paired-end-data/Human-Kneecap_S1_L001_R1_001.fastq.gz create mode 100644 q2_quality_filter/tests/data/paired-end-data/Human-Kneecap_S1_L001_R2_001.fastq.gz create mode 100644 q2_quality_filter/tests/data/paired-end-data/MANIFEST create mode 100644 q2_quality_filter/tests/data/paired-end-data/metadata.yml rename q2_quality_filter/{test => tests}/data/real_data.qza (100%) rename q2_quality_filter/{test => tests}/data/real_data_joined.qza (100%) rename q2_quality_filter/{test => tests}/data/simple.fastq.gz (100%) rename q2_quality_filter/{test => tests}/data/simple.qza (100%) rename q2_quality_filter/{test => tests}/data/stats-1.txt (100%) rename q2_quality_filter/{test => tests}/data/stats-numeric.txt (100%) create mode 100644 q2_quality_filter/tests/test_filter.py diff --git a/.gitignore b/.gitignore index 66446a6..874cc3c 100644 --- a/.gitignore +++ b/.gitignore @@ -66,6 +66,7 @@ target/ # qiime *.qza +!q2_quality_filter/tests/data/*.qza *.qzv .DS_Store diff --git a/q2_quality_filter/_filter.py b/q2_quality_filter/_filter.py index fcac4df..6411b5a 100644 --- a/q2_quality_filter/_filter.py +++ b/q2_quality_filter/_filter.py @@ -6,162 +6,434 @@ # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- -import itertools +from dataclasses import dataclass +from enum import Enum import gzip +import os +from pathlib import Path + import yaml import pandas as pd - import numpy as np + from q2_types.per_sample_sequences import ( - SingleLanePerSampleSingleEndFastqDirFmt, - FastqManifestFormat, YamlFormat, FastqGzFormat) + SingleLanePerSamplePairedEndFastqDirFmt, + FastqManifestFormat, + YamlFormat, +) + +from q2_quality_filter._format import _ReadDirectionUnion, _ReadDirectionTypes + +@dataclass +class FastqRecord: + sequence_header: bytes + sequence: bytes + quality_header: bytes + quality_scores: bytes -def _read_fastq_seqs(filepath, phred_offset): - # This function is adapted from @jairideout's SO post: - # http://stackoverflow.com/a/39302117/3424666 + +def _read_fastq_records(filepath: str): + ''' + A generator for a fastq file that yields sequence records. The fastq file + is assumed to be gzipped. + + Parameters + ---------- + filepath : str + The filepath to the fastq.gz file. + + Yields + ------ + SequenceRecord + A sequence record representing a record from the fastq file. + ''' fh = gzip.open(filepath, 'rb') - for seq_header, seq, qual_header, qual in itertools.zip_longest(*[fh] * 4): - qual = qual.strip() - qual_parsed = np.frombuffer(memoryview(qual), dtype=np.uint8) - qual_parsed = qual_parsed - phred_offset - yield (seq_header.strip(), seq.strip(), qual_header.strip(), - qual, qual_parsed) - - -def _runs_of_ones(arr): - """Find the location and length of runs - - This method assumes the input array is boolean - """ - # http://stackoverflow.com/a/1066838 - # make sure all runs of ones are well-bounded - bounded = np.hstack(([0], arr, [0])) - # get 1 at run starts and -1 at run ends - difs = np.diff(bounded) - run_starts, = np.where(difs > 0) - run_ends, = np.where(difs < 0) - return run_starts, run_ends - run_starts - - -def _truncate(sequence_record, position): - """Truncate a record up to a specified position""" - seq = sequence_record[1][:position] - qual = sequence_record[3][:position] - qual_parsed = sequence_record[4][:position] - return (sequence_record[0], seq, sequence_record[2], qual, qual_parsed) - - -# defaults as used Bokulich et al, Nature Methods 2013, -# same as QIIME 1.9.1 -_default_params = { - 'min_quality': 4, - 'quality_window': 3, - 'min_length_fraction': 0.75, - 'max_ambiguous': 0 -} - - -# TODO: fix up demux fmt writing a la q2-cutadapt -def q_score(demux: SingleLanePerSampleSingleEndFastqDirFmt, - min_quality: int = _default_params['min_quality'], - quality_window: int = _default_params['quality_window'], - min_length_fraction: - float = _default_params['min_length_fraction'], - max_ambiguous: int = _default_params['max_ambiguous']) \ - -> (SingleLanePerSampleSingleEndFastqDirFmt, - pd.DataFrame): - result = SingleLanePerSampleSingleEndFastqDirFmt() + while True: + try: + sequence_header = next(fh) + sequence = next(fh) + quality_header = next(fh) + quality_scores = next(fh) + except StopIteration: + fh.close() + break + + yield FastqRecord( + sequence_header.strip(), + sequence.strip(), + quality_header.strip(), + quality_scores.strip() + ) + + +def _find_low_quality_window( + quality_scores: bytes, + phred_offset: int, + min_quality: int, + window_length: int +) -> int | None: + ''' + Searches a sequence of quality scores for subsequences (windows) of length + `window_length` that consist of quality scores each less than + `min_quality`. If one or more such windows exist then the index of the + first position of the first such window is returned (which will be the + truncation position). Otherwise None is returned. + + Parameters + ---------- + quality_scores : bytes + The quality scores byte string for a fastq record. + phred_offset : int + The PHRED offset encoding of the quality scores. + min_quality : int + The minimum quality that a base must have in order to not be considered + part of a low quality window. + window_length : int + The length of the low quality window to search for. + + Returns + ------- + int or None + The index of the first position of the first low quality window found + or None if no such window is found. + + ''' + # parse and adjust quality scores + quality_scores_parsed = np.frombuffer( + quality_scores, np.uint8 + ) + quality_scores_adjusted = quality_scores_parsed - phred_offset + less_than_min_quality = quality_scores_adjusted < min_quality + + # use a convolution to detect bad quality windows + window = np.ones(window_length, dtype=int) + convolution = np.convolve(less_than_min_quality, window, mode='valid') + window_indices = np.where(convolution == window_length)[0] + + if len(window_indices) == 0: + return None + + return window_indices[0] + + +def _truncate(fastq_record: FastqRecord, position: int) -> FastqRecord: + ''' + Truncates a fastq record's sequence and quality scores to a specified + `position`. Note that `position` is the first position that is excluded + from the resulting record. + + Parameters + ---------- + fastq_record : FastqRecord + The fastq record to truncate + position : int + The truncation position + + Returns + ------- + FastqRecord + The truncated fastq record. + ''' + fastq_record.sequence = fastq_record.sequence[:position] + fastq_record.quality_scores = fastq_record.quality_scores[:position] + + return fastq_record + + +class RecordStatus(Enum): + UNTRUNCATED = 1 + TRUNCATED = 2 + SHORT = 3 + AMBIGUOUS = 4 + TRUNCATED_AMBIGUOUS = 5 + + +def _process_record( + fastq_record: FastqRecord, + phred_offset: int, + min_quality: int, + window_length: int, + min_length_fraction: float, + max_ambiguous: int, +) -> tuple[FastqRecord, RecordStatus]: + ''' + Processes a fastq record by detecting low quality windows, truncating + before the first such window if found, detecting if a truncated record is + too short, and finally detecting if the number of ambiguous bases is too + high. + + Parameters + ---------- + fastq_record : FastqRecord | None + The fastq record to be processed. None if record does not exist (for + convenience when reverse reads are not present). + phred_offset : int + The PHRED encoding of the record's quality scores. + min_quality : int + The minimum quality that a base must have in order to not be considered + part of a low quality window. + window_length : int + The length of the low quality window to search for. + min_length_fraction : float + The fraction of its original length a record must be greater than to + be retained. + max_ambiguous : int + The maximum number of ambiguous bases a record may contain to be + retained. + + Returns + ------- + tuple[FastqRecord, RecordStatus] + A tuple containing the processed record and its status. + ''' + if fastq_record is None: + return None, None + + status = RecordStatus.UNTRUNCATED + + # search for low quality window + truncation_position = _find_low_quality_window( + fastq_record.quality_scores, + phred_offset, + min_quality, + window_length + ) + + # check if truncation should be performed mark short if necessary + if truncation_position is not None: + initial_record_length = len(fastq_record.sequence) + fastq_record = _truncate(fastq_record, truncation_position) + status = RecordStatus.TRUNCATED + + trunc_fraction = truncation_position / initial_record_length + if trunc_fraction <= min_length_fraction: + status = RecordStatus.SHORT + + return fastq_record, status + + # mark ambiguous if too many ambiguous bases are present + if fastq_record.sequence.count(b'N') > max_ambiguous: + if status == RecordStatus.TRUNCATED: + status = RecordStatus.TRUNCATED_AMBIGUOUS + else: + status = RecordStatus.AMBIGUOUS + + return fastq_record, status + + +def _is_retained( + forward_status: RecordStatus, + reverse_status: RecordStatus | None, + filtering_stats_df: pd.DataFrame, + sample_id: str +) -> bool: + ''' + Determines whether a fastq record or pair of fastq records will retained + in the output. The `reverse_status` is None in the case of single-end + reads. + + Parameters + ---------- + forward_status : RecordStatus + The status of the record from the forward fastq file. + reverse_status : RecordStatus or None + The status of the record from the reverse fastq file if it exists + otherwise None. + filtering_stats_df : pd.DataFrame + The data structure that tracks filtering stats. + sample_id : str + The sample id that the record(s) belongs to. + + Returns + ------- + bool + True if the record(s) is to be retained, False otherwise. + ''' + filtering_stats_df.loc[sample_id, 'total-input-reads'] += 1 + + if (RecordStatus.SHORT in (forward_status, reverse_status)): + filtering_stats_df.loc[sample_id, 'reads-truncated'] += 1 + filtering_stats_df.loc[ + sample_id, 'reads-too-short-after-truncation' + ] += 1 + return False + + if (RecordStatus.AMBIGUOUS in (forward_status, reverse_status)): + filtering_stats_df.loc[ + sample_id, 'reads-exceeding-maximum-ambiguous-bases' + ] += 1 + return False + + if (RecordStatus.TRUNCATED_AMBIGUOUS in (forward_status, reverse_status)): + filtering_stats_df.loc[sample_id, 'reads-truncated'] += 1 + filtering_stats_df.loc[ + sample_id, 'reads-exceeding-maximum-ambiguous-bases' + ] += 1 + return False + + if (RecordStatus.TRUNCATED in (forward_status, reverse_status)): + filtering_stats_df.loc[sample_id, 'reads-truncated'] += 1 + + filtering_stats_df.loc[sample_id, 'total-retained-reads'] += 1 + + return True + + +def _write_record(fastq_record: FastqRecord, fh: gzip.GzipFile) -> None: + ''' + Writes a fastq record to an open fastq file. + + Parameters + ---------- + fastq_record : FastqRecord + The fastq record to be written. + fh : GzipFile + The output fastq file handler. + + Returns + ------- + None + ''' + fh.write(fastq_record.sequence_header + b'\n') + fh.write(fastq_record.sequence + b'\n') + fh.write(fastq_record.quality_header + b'\n') + fh.write(fastq_record.quality_scores + b'\n') + + +def q_score( + demux: _ReadDirectionTypes, + min_quality: int = 4, + quality_window: int = 3, + min_length_fraction: float = 0.75, + max_ambiguous: int = 0 +) -> (_ReadDirectionUnion, pd.DataFrame): + ''' + Parameter defaults as used in Bokulich et al, Nature Methods 2013, same as + QIIME 1.9.1. + ''' + # we need to use a union type of single-end or paired-end formats + # which will be transformed by the framework to the appropriate return type + union_format = _ReadDirectionUnion() + union_format.format = type(demux)() + + result = union_format.format manifest = FastqManifestFormat() manifest_fh = manifest.open() manifest_fh.write('sample-id,filename,direction\n') - manifest_fh.write('# direction is not meaningful in this file as these\n') - manifest_fh.write('# data may be derived from forward, reverse, or \n') - manifest_fh.write('# joined reads\n') - log_records_truncated_counts = {} - log_records_max_ambig_counts = {} - log_records_tooshort_counts = {} - log_records_totalread_counts = {} - log_records_totalkept_counts = {} + paired = isinstance(result, SingleLanePerSamplePairedEndFastqDirFmt) + # parse phred offset and load the input demux manifest metadata_view = demux.metadata.view(YamlFormat).open() - phred_offset = yaml.load(metadata_view, - Loader=yaml.SafeLoader)['phred-offset'] - demux_manifest = demux.manifest.view(demux.manifest.format) - demux_manifest = pd.read_csv(demux_manifest.open(), dtype=str) - demux_manifest.set_index('filename', inplace=True) - - iterator = demux.sequences.iter_views(FastqGzFormat) - for bc_id, (fname, fp) in enumerate(iterator): - sample_id = demux_manifest.loc[str(fname)]['sample-id'] - - log_records_truncated_counts[sample_id] = 0 - log_records_max_ambig_counts[sample_id] = 0 - log_records_tooshort_counts[sample_id] = 0 - log_records_totalread_counts[sample_id] = 0 - log_records_totalkept_counts[sample_id] = 0 - - # per q2-demux, barcode ID, lane number and read number are not - # relevant here - path = result.sequences.path_maker(sample_id=sample_id, - barcode_id=bc_id, - lane_number=1, - read_number=1) - - # we do not open a writer by default in the event that all sequences - # for a sample are filtered out; an empty fastq file is not a valid - # fastq file. - writer = None - for sequence_record in _read_fastq_seqs(str(fp), phred_offset): - log_records_totalread_counts[sample_id] += 1 - - # determine the length of the runs below quality threshold - # NOTE: QIIME 1.x used <= the quality threshold and the parameter - # -q was interpreted as the maximum unacceptable PHRED score. In - # QIIME 2.x, we're now interpreting this as the minimum - # acceptable score. - qual_below_threshold = sequence_record[4] < min_quality - run_starts, run_lengths = _runs_of_ones(qual_below_threshold) - bad_windows = np.argwhere(run_lengths > quality_window) - - # if there is a run of sufficient size, truncate it - if bad_windows.size > 0: - log_records_truncated_counts[sample_id] += 1 - - full_length = len(sequence_record[1]) - sequence_record = _truncate(sequence_record, - run_starts[bad_windows[0]][0]) - trunc_length = len(sequence_record[1]) - - # do not keep the read if it is too short following truncation - if round(trunc_length / full_length, 3) <= min_length_fraction: - log_records_tooshort_counts[sample_id] += 1 - continue - - # do not keep the read if there are too many ambiguous bases - if sequence_record[1].count(b'N') > max_ambiguous: - log_records_max_ambig_counts[sample_id] += 1 - continue - - fastq_lines = b'\n'.join(sequence_record[:4]) + b'\n' - - if writer is None: - writer = gzip.open(str(path), mode='w') - writer.write(fastq_lines) - - log_records_totalkept_counts[sample_id] += 1 - - if writer is not None: - manifest_fh.write('%s,%s,%s\n' % (sample_id, path.name, 'forward')) - writer.close() - - if set(log_records_totalkept_counts.values()) == {0, }: - raise ValueError("All sequences from all samples were filtered out. " - "The parameter choices may be too stringent for the " - "data.") - + phred_offset = yaml.load( + metadata_view, Loader=yaml.SafeLoader + )['phred-offset'] + demux_manifest_df = demux.manifest.view(pd.DataFrame) + + # initialize filtering stats tracking dataframe + filtering_stats_df = pd.DataFrame( + data=0, + index=demux_manifest_df.index, + columns=[ + 'total-input-reads', + 'total-retained-reads', + 'reads-truncated', + 'reads-too-short-after-truncation', + 'reads-exceeding-maximum-ambiguous-bases' + ] + ) + + for barcode_id, sample_id in enumerate(demux_manifest_df.index.values): + # get/create filepath(s) of input/output fastq file(s) + forward_input_fp = demux_manifest_df.loc[sample_id, 'forward'] + forward_output_fp = Path(result.path) / Path(forward_input_fp).name + if paired: + reverse_input_fp = demux_manifest_df.loc[sample_id, 'reverse'] + reverse_output_fp = Path(result.path) / Path(reverse_input_fp).name + + # open output filehandle(s) and create fastq record iterator + forward_fh = gzip.open(forward_output_fp, mode='wb') + + if paired: + reverse_fh = gzip.open(reverse_output_fp, mode='wb') + + forward_iterator = _read_fastq_records(str(forward_input_fp)) + reverse_iterator = _read_fastq_records(str(reverse_input_fp)) + iterator = zip(forward_iterator, reverse_iterator) + else: + iterator = _read_fastq_records(str(forward_input_fp)) + + for fastq_record in iterator: + if paired: + forward_record, reverse_record = fastq_record + else: + forward_record = fastq_record + reverse_record = None + + # process records + forward_record, forward_status = _process_record( + fastq_record=forward_record, + phred_offset=phred_offset, + min_quality=min_quality, + window_length=quality_window + 1, + min_length_fraction=min_length_fraction, + max_ambiguous=max_ambiguous + ) + reverse_record, reverse_status = _process_record( + fastq_record=reverse_record, + phred_offset=phred_offset, + min_quality=min_quality, + window_length=quality_window + 1, + min_length_fraction=min_length_fraction, + max_ambiguous=max_ambiguous + ) + + # see if record(s) retained and update filtering stats + retained = _is_retained( + forward_status, + reverse_status, + filtering_stats_df, + sample_id + ) + + # if retained write to output file(s) + if retained: + if paired: + _write_record(forward_record, forward_fh) + _write_record(reverse_record, reverse_fh) + else: + _write_record(forward_record, forward_fh) + + # close output file(s) and update manifest if record(s) retained, + # otherwise delete the empty file(s) + forward_fh.close() + if paired: + reverse_fh.close() + + if filtering_stats_df.loc[sample_id, 'total-retained-reads'] > 0: + manifest_fh.write( + f'{sample_id},{forward_output_fp.name},forward\n' + ) + if paired: + manifest_fh.write( + f'{sample_id},{reverse_output_fp.name},reverse\n' + ) + else: + os.remove(forward_output_fp) + if paired: + os.remove(reverse_output_fp) + + # error if all samples retained no reads + if filtering_stats_df['total-retained-reads'].sum() == 0: + msg = ( + 'All sequences from all samples were filtered. The parameter ' + 'choices may have been too stringent for the data.' + ) + raise ValueError(msg) + + # write output manifest and metadata files to format manifest_fh.close() result.manifest.write_data(manifest, FastqManifestFormat) @@ -169,18 +441,4 @@ def q_score(demux: SingleLanePerSampleSingleEndFastqDirFmt, metadata.path.write_text(yaml.dump({'phred-offset': phred_offset})) result.metadata.write_data(metadata, YamlFormat) - columns = ['sample-id', 'total-input-reads', 'total-retained-reads', - 'reads-truncated', - 'reads-too-short-after-truncation', - 'reads-exceeding-maximum-ambiguous-bases'] - stats = [] - for id_, _ in sorted(log_records_truncated_counts.items()): - stats.append([id_, log_records_totalread_counts[id_], - log_records_totalkept_counts[id_], - log_records_truncated_counts[id_], - log_records_tooshort_counts[id_], - log_records_max_ambig_counts[id_]]) - - stats = pd.DataFrame(stats, columns=columns).set_index('sample-id') - - return result, stats + return union_format, filtering_stats_df diff --git a/q2_quality_filter/_format.py b/q2_quality_filter/_format.py index c1e62c1..2a0a159 100644 --- a/q2_quality_filter/_format.py +++ b/q2_quality_filter/_format.py @@ -6,11 +6,18 @@ # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- +from typing import Union + import qiime2.plugin.model as model +from q2_types.per_sample_sequences import ( + SingleLanePerSampleSingleEndFastqDirFmt, + SingleLanePerSamplePairedEndFastqDirFmt, +) + class QualityFilterStatsFmt(model.TextFileFormat): - def sniff(self): + def _validate_(self, level): line = open(str(self)).readline() hdr = line.strip().split(',') expected = ['sample-id', 'total-input-reads', @@ -23,3 +30,13 @@ def sniff(self): QualityFilterStatsDirFmt = model.SingleFileDirectoryFormat( 'QualityFilterStatsDirFmt', 'stats.csv', QualityFilterStatsFmt) + + +_ReadDirectionTypes = Union[ + SingleLanePerSamplePairedEndFastqDirFmt, + SingleLanePerSampleSingleEndFastqDirFmt +] + + +class _ReadDirectionUnion: + format: _ReadDirectionTypes diff --git a/q2_quality_filter/_transformer.py b/q2_quality_filter/_transformer.py index c908551..0bcf28c 100644 --- a/q2_quality_filter/_transformer.py +++ b/q2_quality_filter/_transformer.py @@ -10,7 +10,12 @@ import qiime2 from .plugin_setup import plugin -from ._format import QualityFilterStatsFmt +from ._format import QualityFilterStatsFmt, _ReadDirectionUnion + +from q2_types.per_sample_sequences import ( + SingleLanePerSampleSingleEndFastqDirFmt, + SingleLanePerSamplePairedEndFastqDirFmt, +) @plugin.register_transformer @@ -45,3 +50,13 @@ def _2(ff: QualityFilterStatsFmt) -> pd.DataFrame: @plugin.register_transformer def _3(ff: QualityFilterStatsFmt) -> qiime2.Metadata: return qiime2.Metadata(_stats_to_df(ff)) + + +@plugin.register_transformer +def _4(ff: _ReadDirectionUnion) -> SingleLanePerSampleSingleEndFastqDirFmt: + return ff.format + + +@plugin.register_transformer +def _5(ff: _ReadDirectionUnion) -> SingleLanePerSamplePairedEndFastqDirFmt: + return ff.format diff --git a/q2_quality_filter/plugin_setup.py b/q2_quality_filter/plugin_setup.py index f077e7c..c7ce738 100644 --- a/q2_quality_filter/plugin_setup.py +++ b/q2_quality_filter/plugin_setup.py @@ -43,9 +43,10 @@ artifact_format=QualityFilterStatsDirFmt) InputMap, OutputMap = qiime2.plugin.TypeMap({ - SampleData[SequencesWithQuality | PairedEndSequencesWithQuality]: + SampleData[SequencesWithQuality]: SampleData[SequencesWithQuality], - + SampleData[PairedEndSequencesWithQuality]: + SampleData[PairedEndSequencesWithQuality], SampleData[JoinedSequencesWithQuality]: SampleData[JoinedSequencesWithQuality], }) @@ -62,19 +63,24 @@ } _q_score_parameter_descriptions = { - 'min_quality': 'The minimum acceptable PHRED score. All PHRED scores ' - 'less that this value are considered to be low PHRED ' - 'scores.', - 'quality_window': 'The maximum number of low PHRED scores that ' - 'can be observed in direct succession before ' - 'truncating a sequence read.', - 'min_length_fraction': 'The minimum length that a sequence read can ' - 'be following truncation and still be ' - 'retained. This length should be provided ' - 'as a fraction of the input sequence length.', - 'max_ambiguous': 'The maximum number of ambiguous (i.e., N) base ' - 'calls. This is applied after trimming sequences ' - 'based on `min_length_fraction`.' + 'min_quality': ( + 'The minimum acceptable PHRED score. All PHRED scores less that this ' + 'value are considered to be low PHRED scores.' + ), + 'quality_window': ( + 'The maximum number of low PHRED scores that can be observed in ' + 'direct succession before truncating a sequence read. Note that ' + 'truncation is performed such that the entirety of the low quality ' + 'window is truncated.' + ), + 'min_length_fraction': ( + 'Filter truncated reads whose length fraction (truncated length ' + 'divided by original length) is less than or equal to this value.' + ), + 'max_ambiguous': ( + 'The maximum number of ambiguous (i.e., N) base calls. This is ' + 'applied after trimming sequences based on `min_length_fraction`.' + ) } _q_score_output_descriptions = { diff --git a/q2_quality_filter/test/test_filter.py b/q2_quality_filter/test/test_filter.py deleted file mode 100644 index 242b9f2..0000000 --- a/q2_quality_filter/test/test_filter.py +++ /dev/null @@ -1,362 +0,0 @@ -# ---------------------------------------------------------------------------- -# Copyright (c) 2016-2023, QIIME 2 development team. -# -# Distributed under the terms of the Modified BSD License. -# -# The full license is in the file LICENSE, distributed with this software. -# ---------------------------------------------------------------------------- - -import unittest -import gzip -import os - -import pandas as pd -import pandas.testing as pdt -import qiime2 -from qiime2.sdk import Artifact -import numpy as np -import numpy.testing as npt -from qiime2.plugin.testing import TestPluginBase -from qiime2.util import redirected_stdio -from q2_types.per_sample_sequences import ( - FastqGzFormat, - SingleLanePerSampleSingleEndFastqDirFmt, -) - -from q2_quality_filter._filter import ( - _read_fastq_seqs, - _runs_of_ones, - _truncate, -) -from q2_quality_filter._format import QualityFilterStatsFmt - - -class FilterTests(TestPluginBase): - package = 'q2_quality_filter.test' - - def test_read_fastq_seqs(self): - exp = [(b'@foo', b'ATGC', b'+', b'IIII', np.array([40, 40, 40, 40])), - (b'@bar', b'TGCA', b'+', b'ABCD', np.array([32, 33, 34, 35]))] - obs = list(_read_fastq_seqs(self.get_data_path('simple.fastq.gz'), 33)) - self.assertEqual(len(obs), 2) - - for o, e in zip(obs, exp): - self.assertEqual(o[:4], e[:4]) - npt.assert_equal(o[4], e[4]) - - def test_runs_of_ones(self): - data = [np.array([0, 0, 0, 0, 0, 0], dtype=bool), - np.array([1, 0, 1, 0, 1, 0], dtype=bool), - np.array([1, 1, 1, 1, 1, 1], dtype=bool), - np.array([0, 1, 1, 1, 0, 0], dtype=bool), - np.array([0, 0, 0, 0, 0, 1], dtype=bool)] - - exp_starts = [np.array([]), np.array([0, 2, 4]), np.array([0]), - np.array([1]), np.array([5])] - exp_lengths = [np.array([]), np.array([1, 1, 1]), np.array([6]), - np.array([3]), np.array([1])] - - for i, d in enumerate(data): - o_starts, o_lengths = _runs_of_ones(d) - npt.assert_equal(o_starts, exp_starts[i]) - npt.assert_equal(o_lengths, exp_lengths[i]) - - def test_truncate(self): - data = [('@x', 'ATGCG', '+', 'IIIIA', np.array([40, 40, 40, 40, 32])), - ('@y', 'TGCAC', '+', 'ABCDA', np.array([32, 33, 34, 35, 32]))] - - exp1 = [('@x', 'A', '+', 'I', np.array([40])), - ('@y', 'T', '+', 'A', np.array([32]))] - - exp2 = [('@x', 'AT', '+', 'II', np.array([40, 40])), - ('@y', 'TG', '+', 'AB', np.array([32, 33]))] - - for i, d in enumerate(data): - o1 = _truncate(d, 1) - o2 = _truncate(d, 2) - self.assertEqual(o1[:4], exp1[i][:4]) - npt.assert_equal(o1[4], exp1[i][4]) - self.assertEqual(o2[:4], exp2[i][:4]) - npt.assert_equal(o2[4], exp2[i][4]) - - def test_q_score_all_dropped(self): - ar = Artifact.load(self.get_data_path('simple.qza')) - - with self.assertRaisesRegex(ValueError, "filtered out"): - with redirected_stdio(stdout=os.devnull): - self.plugin.methods['q_score'](ar, min_quality=50) - - def test_q_score_numeric_ids(self): - ar = Artifact.load(self.get_data_path('numeric_ids.qza')) - exp_sids = {'00123', '0.4560'} - - with redirected_stdio(stdout=os.devnull): - obs_ar, stats_ar = self.plugin.methods['q_score']( - ar, min_quality=20) - obs = obs_ar.view(SingleLanePerSampleSingleEndFastqDirFmt) - stats = stats_ar.view(pd.DataFrame) - obs_manifest = obs.manifest.view(obs.manifest.format) - obs_manifest = pd.read_csv(obs_manifest.open(), dtype=str, comment='#') - obs_manifest.set_index('sample-id', inplace=True) - - obs_sids = set(obs_manifest.index) - self.assertEqual(obs_sids, exp_sids) - self.assertEqual(set(stats.index), exp_sids) - - def test_q_score(self): - ar = Artifact.load(self.get_data_path('simple.qza')) - with redirected_stdio(stdout=os.devnull): - obs_drop_ambig_ar, stats_ar = self.plugin.methods['q_score']( - ar, quality_window=2, min_quality=20, min_length_fraction=0.25) - obs_drop_ambig = obs_drop_ambig_ar.view( - SingleLanePerSampleSingleEndFastqDirFmt) - stats = stats_ar.view(pd.DataFrame) - - exp_drop_ambig = ["@foo_1", - "ATGCATGC", - "+", - "DDDDBBDD"] - columns = ['sample-id', 'total-input-reads', 'total-retained-reads', - 'reads-truncated', - 'reads-too-short-after-truncation', - 'reads-exceeding-maximum-ambiguous-bases'] - exp_drop_ambig_stats = pd.DataFrame([('foo', 2, 1, 0, 0, 1), - ('bar', 1, 0, 0, 0, 1)], - columns=columns) - exp_drop_ambig_stats = exp_drop_ambig_stats.set_index('sample-id') - obs = [] - iterator = obs_drop_ambig.sequences.iter_views(FastqGzFormat) - for sample_id, fp in iterator: - obs.extend([x.strip() for x in gzip.open(str(fp), 'rt')]) - self.assertEqual(obs, exp_drop_ambig) - pdt.assert_frame_equal(stats, exp_drop_ambig_stats.loc[stats.index]) - - with redirected_stdio(stdout=os.devnull): - obs_trunc_ar, stats_ar = self.plugin.methods['q_score']( - ar, quality_window=1, min_quality=33, min_length_fraction=0.25) - obs_trunc = obs_trunc_ar.view(SingleLanePerSampleSingleEndFastqDirFmt) - stats = stats_ar.view(pd.DataFrame) - - exp_trunc = ["@foo_1", - "ATGCATGC", - "+", - "DDDDBBDD", - "@bar_1", - "ATA", - "+", - "DDD"] - exp_trunc_stats = pd.DataFrame([('foo', 2, 1, 0, 0, 1), - ('bar', 1, 1, 1, 0, 0)], - columns=columns) - exp_trunc_stats = exp_trunc_stats.set_index('sample-id') - - obs = [] - for sample_id, fp in obs_trunc.sequences.iter_views(FastqGzFormat): - obs.extend([x.strip() for x in gzip.open(str(fp), 'rt')]) - self.assertEqual(sorted(obs), sorted(exp_trunc)) - pdt.assert_frame_equal(stats, exp_trunc_stats.loc[stats.index]) - - def test_q_score_real(self): - ar = Artifact.load(self.get_data_path('real_data.qza')) - with redirected_stdio(stdout=os.devnull): - obs_ar, stats_ar = self.plugin.methods['q_score']( - ar, min_quality=40, min_length_fraction=0.24) - obs_result = obs_ar.view(SingleLanePerSampleSingleEndFastqDirFmt) - stats = stats_ar.view(pd.DataFrame) - - # All input reads are represented here in their post-quality filtered - # form. Reads that are commented out were manually identified as being - # filtered by the q_score method. For the commented reads, the comments - # denote why the read is not retained. - - # The first read, @HWI-EAS440_0386:1:32:15467:1432#0/1, is 25% of - # total read length and is indicative of a sequence at the - # min_length_fraction boundary. - exp_result = [ - "@HWI-EAS440_0386:1:32:15467:1432#0/1", - "TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTT", - "+", - "hhhhhhhhhhhhfghhhghghghhhchhhahhhhhfhh", - - # too short - # "@HWI-EAS440_0386:1:36:9986:17043#0/1", - # "TACGTAGGTGGCAAGCGTTATCCGGATTTATTG", - # "+", - # "hhhhhhhhhhhhhhhhhhhhhhhhhffhhghhh", - - "@HWI-EAS440_0386:1:37:13343:14820#0/1", - ("TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGAT" - "GGATGTTTAAGTCAGTTGTG"), - "+", - ("hhhhhhhhhhhhhfhhhhhfhhhhghhhhghhhhhhhhhgghhhgghhhgghh" - "hgdhhhhghghhhdhhhhgh"), - - "@HWI-EAS440_0386:1:41:18215:15404#0/1", - "TACGTAGGTGGCGAGCGTTGTCCGGAATTATTGGGCGTAAAGAGCATGTA", - "+", - "hhhhhhhhhhhhghhhhhhhhhhhhffhhghhhhghhghgghghhhhhgh", - - # too short - # "@HWI-EAS440_0386:1:42:5423:19606#0/1", - # "TACGTAGGGAGCAAGCGTT", - # "+", - # "hhhhghhhhhhhhhghhfh", - - "@HWI-EAS440_0386:1:52:7507:5841#0/1", - "TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTT", - "+", - "hhhhhhhhhghhfghhhhhhhhhhgfhhhghhhghdhh", - - "@HWI-EAS440_0386:1:53:18599:4074#0/1", - "TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTG", - "+", - "hhhhfhhhhhfhhhhhhfhffhghhfgghggghdcbh", - - # too short - # "@HWI-EAS440_0386:1:55:16425:9514#0/1", - # "TACGGAGGATCCGAGCGTTATCCGGATT", - # "+", - # "hhhhhhhhhhhhfghhhghghhhhhbgh", - - "@HWI-EAS440_0386:1:65:12049:5619#0/1", - "TACGTAGGTGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGTG", - "+", - "hhhhhhhhhhhhhhhhhhhhhhhhhfhhhhhhhghdhghhhhhghcfh", - - # @HWI-EAS440_0386:1:95:4837:16388#0/1 - # starts off < Q40 - ] - - columns = ['sample-id', 'total-input-reads', 'total-retained-reads', - 'reads-truncated', - 'reads-too-short-after-truncation', - 'reads-exceeding-maximum-ambiguous-bases'] - exp_stats = pd.DataFrame([('foo', 10, 6, 10, 4, 0)], - columns=columns) - exp_stats = exp_stats.set_index('sample-id') - obs = [] - iterator = obs_result.sequences.iter_views(FastqGzFormat) - for sample_id, fp in iterator: - obs.extend([x.strip() for x in gzip.open(str(fp), 'rt')]) - self.assertEqual(obs, exp_result) - pdt.assert_frame_equal(stats, exp_stats.loc[stats.index]) - - def test_q_score_real_joined(self): - ar = Artifact.load(self.get_data_path('real_data_joined.qza')) - with redirected_stdio(stdout=os.devnull): - obs_ar, stats_ar = self.plugin.methods['q_score']( - ar, min_quality=40, min_length_fraction=0.24) - obs_result = obs_ar.view(SingleLanePerSampleSingleEndFastqDirFmt) - stats = stats_ar.view(pd.DataFrame) - - # All input reads are represented here in their post-quality filtered - # form. Reads that are commented out were manually identified as being - # filtered by the q_score method. For the commented reads, the comments - # denote why the read is not retained. - - # The first read, @HWI-EAS440_0386:1:32:15467:1432#0/1, is 25% of - # total read length and is indicative of a sequence at the - # min_length_fraction boundary. - exp_result = [ - "@HWI-EAS440_0386:1:32:15467:1432#0/1", - "TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTT", - "+", - "hhhhhhhhhhhhfghhhghghghhhchhhahhhhhfhh", - - # too short - # "@HWI-EAS440_0386:1:36:9986:17043#0/1", - # "TACGTAGGTGGCAAGCGTTATCCGGATTTATTG", - # "+", - # "hhhhhhhhhhhhhhhhhhhhhhhhhffhhghhh", - - "@HWI-EAS440_0386:1:37:13343:14820#0/1", - "TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGAT" - "GGATGTTTAAGTCAGTTGTG", - "+", - "hhhhhhhhhhhhhfhhhhhfhhhhghhhhghhhhhhhhhgghhhgghhhgghh" - "hgdhhhhghghhhdhhhhgh", - - "@HWI-EAS440_0386:1:41:18215:15404#0/1", - "TACGTAGGTGGCGAGCGTTGTCCGGAATTATTGGGCGTAAAGAGCATGTA", - "+", - "hhhhhhhhhhhhghhhhhhhhhhhhffhhghhhhghhghgghghhhhhgh", - - # too short - # "@HWI-EAS440_0386:1:42:5423:19606#0/1", - # "TACGTAGGGAGCAAGCGTT", - # "+", - # "hhhhghhhhhhhhhghhfh", - - "@HWI-EAS440_0386:1:52:7507:5841#0/1", - "TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTT", - "+", - "hhhhhhhhhghhfghhhhhhhhhhgfhhhghhhghdhh", - - "@HWI-EAS440_0386:1:53:18599:4074#0/1", - "TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTG", - "+", - "hhhhfhhhhhfhhhhhhfhffhghhfgghggghdcbh", - - # too short - # "@HWI-EAS440_0386:1:55:16425:9514#0/1", - # "TACGGAGGATCCGAGCGTTATCCGGATT", - # "+", - # "hhhhhhhhhhhhfghhhghghhhhhbgh", - - "@HWI-EAS440_0386:1:65:12049:5619#0/1", - "TACGTAGGTGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGTG", - "+", - "hhhhhhhhhhhhhhhhhhhhhhhhhfhhhhhhhghdhghhhhhghcfh", - - # @HWI-EAS440_0386:1:95:4837:16388#0/1 - # starts off < Q40 - ] - - columns = ['sample-id', 'total-input-reads', 'total-retained-reads', - 'reads-truncated', - 'reads-too-short-after-truncation', - 'reads-exceeding-maximum-ambiguous-bases'] - exp_stats = pd.DataFrame([('foo', 10, 6, 10, 4, 0)], - columns=columns) - exp_stats = exp_stats.set_index('sample-id') - obs = [] - iterator = obs_result.sequences.iter_views(FastqGzFormat) - for sample_id, fp in iterator: - obs.extend([x.strip() for x in gzip.open(str(fp), 'rt')]) - self.assertEqual(obs, exp_result) - pdt.assert_frame_equal(stats, exp_stats.loc[stats.index]) - - -class TransformerTests(TestPluginBase): - package = 'q2_quality_filter.test' - - def test_stats_to_metadata(self): - filepath = self.get_data_path('stats-1.txt') - format = QualityFilterStatsFmt(filepath, mode='r') - transformer = self.get_transformer(QualityFilterStatsFmt, - qiime2.Metadata) - obs = transformer(format) - self.assertEqual(obs.id_count, 34) - self.assertEqual(obs.column_count, 5) - self.assertEqual(obs.id_header, 'sample-id') - - def test_numeric_ids(self): - filepath = self.get_data_path('stats-numeric.txt') - format = QualityFilterStatsFmt(filepath, mode='r') - transformer = self.get_transformer(QualityFilterStatsFmt, - qiime2.Metadata) - obs = transformer(format) - self.assertEqual(obs.id_count, 34) - self.assertEqual(obs.column_count, 5) - self.assertEqual(obs.id_header, 'sample-id') - - -class TestUsageExamples(TestPluginBase): - package = 'q2_quality_filter.test' - - def test_examples(self): - self.execute_examples() - - -if __name__ == '__main__': - unittest.main() diff --git a/q2_quality_filter/test/__init__.py b/q2_quality_filter/tests/__init__.py similarity index 100% rename from q2_quality_filter/test/__init__.py rename to q2_quality_filter/tests/__init__.py diff --git a/q2_quality_filter/test/data/numeric_ids.qza b/q2_quality_filter/tests/data/numeric_ids.qza similarity index 100% rename from q2_quality_filter/test/data/numeric_ids.qza rename to q2_quality_filter/tests/data/numeric_ids.qza diff --git a/q2_quality_filter/tests/data/paired-end-data/Human-Kneecap2_S2_L001_R1_001.fastq.gz b/q2_quality_filter/tests/data/paired-end-data/Human-Kneecap2_S2_L001_R1_001.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..b3d1f5982b7bfa0a4cf4e55642b27edf414ea61f GIT binary patch literal 205 zcmV;;05bm{iwFp2ms4i|14wmkVQwu;Ze?X-VQ^nlGG9zEFfm_JF<&q+F)n6db98Y4 z#gDNL!Y~X(_k0Bt3qo>|nk1fTdGLT>^8bGeb}8+MkPw`#Eo6KX`I$R4;&^@kAp4e#-Lz5XZ{W HZ~_1Tw@zF@ literal 0 HcmV?d00001 diff --git a/q2_quality_filter/tests/data/paired-end-data/Human-Kneecap2_S2_L001_R2_001.fastq.gz b/q2_quality_filter/tests/data/paired-end-data/Human-Kneecap2_S2_L001_R2_001.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..60b4683d1896ac2248657ac9ad5dfd66b21ffcb8 GIT binary patch literal 270 zcmV+p0rCDHiwFq7mQ!Z{14wmkVQwu;Ze?X-VQ^nlGG9zEFfm_JGG8z-F)n6db98Y4 zjgdiW#4rp+_dZ4U^?;L%6~&oSt75E0X;-@cS>`*?E;0>`FtYUZ4SKkKrNqlphG8n{ zCoYmd$5aNW35hwUGUjP{8OwJ;8BEl1)XZEQHIllyyY+#pC<=!G0(^pd%V?1?Hxa?e zG`t3`%=D)EfrmA!)Z0q2*LQtyw8oX-i1AWu_n?&eSQ|obI~Ug1UmghEu!+9$Xb(G#c078Db3DU5mx)r||Iagz|H&h# Uc$Rjw`FW!705+oik#GV40Pm-MssI20 literal 0 HcmV?d00001 diff --git a/q2_quality_filter/tests/data/paired-end-data/Human-Kneecap3_S3_L001_R1_001.fastq.gz b/q2_quality_filter/tests/data/paired-end-data/Human-Kneecap3_S3_L001_R1_001.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..7b373f3de6b08c68351f97ba1aa741604fbde1fd GIT binary patch literal 289 zcmV++0p9)}iwFp8nNw!~14wmkVQwu;Ze?X-VQ^nlF<(qDFfm_JF<&q+F)n6db98Y4 zm66SEgD?<;?|zEjQ6wAVVlRcTjDHSQd-D4~OV=bMQq(Hx%7XFCAo~;Rm)1T6GRBfV z=EUx^*JQ-i^6=hi>YWX5hQ3LpDyWRI54f$`By;6jWeZU)5=T%pw zPMth&z%nn+8#!#C4+>#EuPGtt5;Mmpa!N6#oMMXmCIcJt(siAcl(tP=sP#Ed?*=!J zb)&!W;L5{y9=`FEdgm!u9!BelUn=iBXnhGFJYG_ixH8Yi&zG5ya(Q7MO0V&laU}b$ n-T!!6qX9hU@$!38`*?E;0>`FtYUZ4SKkKrNqlphG8n{ zCoYmd$5aNW35hwUGUjP{8OwJ;8BEl1)XZEQHIllyyY+#pC<=!G0(^pd%V?1?Hxa?e zG`t3`%=D)EfrmA!)Z0q2*LQtyw8oX-i1AWu_n?&eSQ|obI~Ug1UmghEu!+9$Xb(G#c078Db3DU5mx)r||Iagz|H&h# Uc$Rjw`FW!705+oik#GV4013r@!vFvP literal 0 HcmV?d00001 diff --git a/q2_quality_filter/tests/data/paired-end-data/Human-Kneecap_S1_L001_R1_001.fastq.gz b/q2_quality_filter/tests/data/paired-end-data/Human-Kneecap_S1_L001_R1_001.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..a74e913d9ae2f4faa36984b002ba65d5a34f492e GIT binary patch literal 291 zcmV+;0o?u{iwFpHQdegH14wmkVQwu;Ze?X-VQ^nlF<(qDFfm_JF<&q+F)n6db98Y4 zm61zsgfI|9_dZ4TXe1lsU^juGDSsAGvhw}Uk~YJ9q(quY%RqHiA@s!hrL_-%jIpGz zb7J?|Yck?$xq0t2_0EPjL*FD)6;wvq2i#6=lDTrNvO-jg#E}%WaJ}^8CCHI_liVJv zQzy?Gu*{3|Mm{#s2Zb=7*OZWRiJ4;)Ii(m=PBF!O$-suZbX{j9rEL=zYJJYryTJ`) z-RN&TIC%KZ!#AE%?>yz;VYHt3rSi^$)|UXn<0VCjEAwppe3=O;mlq}r&%^o}j~Pd@ p@7n#3r!^YDa~?0hCq<4yUwNhp*m?4C{mJs%Qa{<4Fx+qg003r_ilqPm literal 0 HcmV?d00001 diff --git a/q2_quality_filter/tests/data/paired-end-data/Human-Kneecap_S1_L001_R2_001.fastq.gz b/q2_quality_filter/tests/data/paired-end-data/Human-Kneecap_S1_L001_R2_001.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..668164d4643801fc6ac9f0c17777ce5389b2eb1e GIT binary patch literal 270 zcmV+p0rCDHiwFqXmQ!Z{14wmkVQwu;Ze?X-VQ^nlF<(qDFfm_JGG8z-F)n6db98Y4 zjgdiW#4rp+_dZ4U^?;L%6~&oSt75E0X;-@cS>`*?E;0>`FtYUZ4SKkKrNqlphG8n{ zCoYmd$5aNW35hwUGUjP{8OwJ;8BEl1)XZEQHIllyyY+#pC<=!G0(^pd%V?1?Hxa?e zG`t3`%=D)EfrmA!)Z0q2*LQtyw8oX-i1AWu_n?&eSQ|obI~Ug1UmghEu!+9$Xb(G#c078Db3DU5mx)r||Iagz|H&h# Uc$Rjw`FW!705+oik#GV4013r@!vFvP literal 0 HcmV?d00001 diff --git a/q2_quality_filter/tests/data/paired-end-data/MANIFEST b/q2_quality_filter/tests/data/paired-end-data/MANIFEST new file mode 100644 index 0000000..fc29edb --- /dev/null +++ b/q2_quality_filter/tests/data/paired-end-data/MANIFEST @@ -0,0 +1,9 @@ +# Produced by Super Duper Sequencing Machine + +sample-id,filename,direction +Human-Kneecap,Human-Kneecap_S1_L001_R1_001.fastq.gz,forward +Human-Kneecap,Human-Kneecap_S1_L001_R2_001.fastq.gz,reverse +Human-Kneecap2,Human-Kneecap2_S2_L001_R1_001.fastq.gz,forward +Human-Kneecap2,Human-Kneecap2_S2_L001_R2_001.fastq.gz,reverse +Human-Kneecap3,Human-Kneecap3_S3_L001_R1_001.fastq.gz,forward +Human-Kneecap3,Human-Kneecap3_S3_L001_R2_001.fastq.gz,reverse diff --git a/q2_quality_filter/tests/data/paired-end-data/metadata.yml b/q2_quality_filter/tests/data/paired-end-data/metadata.yml new file mode 100644 index 0000000..5e35def --- /dev/null +++ b/q2_quality_filter/tests/data/paired-end-data/metadata.yml @@ -0,0 +1 @@ +{ phred-offset: 33 } diff --git a/q2_quality_filter/test/data/real_data.qza b/q2_quality_filter/tests/data/real_data.qza similarity index 100% rename from q2_quality_filter/test/data/real_data.qza rename to q2_quality_filter/tests/data/real_data.qza diff --git a/q2_quality_filter/test/data/real_data_joined.qza b/q2_quality_filter/tests/data/real_data_joined.qza similarity index 100% rename from q2_quality_filter/test/data/real_data_joined.qza rename to q2_quality_filter/tests/data/real_data_joined.qza diff --git a/q2_quality_filter/test/data/simple.fastq.gz b/q2_quality_filter/tests/data/simple.fastq.gz similarity index 100% rename from q2_quality_filter/test/data/simple.fastq.gz rename to q2_quality_filter/tests/data/simple.fastq.gz diff --git a/q2_quality_filter/test/data/simple.qza b/q2_quality_filter/tests/data/simple.qza similarity index 100% rename from q2_quality_filter/test/data/simple.qza rename to q2_quality_filter/tests/data/simple.qza diff --git a/q2_quality_filter/test/data/stats-1.txt b/q2_quality_filter/tests/data/stats-1.txt similarity index 100% rename from q2_quality_filter/test/data/stats-1.txt rename to q2_quality_filter/tests/data/stats-1.txt diff --git a/q2_quality_filter/test/data/stats-numeric.txt b/q2_quality_filter/tests/data/stats-numeric.txt similarity index 100% rename from q2_quality_filter/test/data/stats-numeric.txt rename to q2_quality_filter/tests/data/stats-numeric.txt diff --git a/q2_quality_filter/tests/test_filter.py b/q2_quality_filter/tests/test_filter.py new file mode 100644 index 0000000..256ba76 --- /dev/null +++ b/q2_quality_filter/tests/test_filter.py @@ -0,0 +1,764 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2016-2024, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- + +import pandas as pd +import pandas.testing as pdt + +from copy import copy +import gzip +import os +from pathlib import Path +import tempfile +import unittest + +import qiime2 +from qiime2.sdk import Artifact +from qiime2.plugin.testing import TestPluginBase +from qiime2.util import redirected_stdio +from q2_types.sample_data import SampleData +from q2_types.per_sample_sequences import ( + FastqGzFormat, + SingleLanePerSampleSingleEndFastqDirFmt, + SingleLanePerSamplePairedEndFastqDirFmt, + PairedEndSequencesWithQuality, +) + +from q2_quality_filter._filter import ( + FastqRecord, + _read_fastq_records, + _find_low_quality_window, + _truncate, + RecordStatus, + _process_record, + _is_retained, + _write_record, +) +from q2_quality_filter._format import QualityFilterStatsFmt + + +class HelperMethodTests(TestPluginBase): + package = 'q2_quality_filter.tests' + + def test_read_fastq_records(self): + exp = [ + FastqRecord(b'@foo', b'ATGC', b'+', b'IIII'), + FastqRecord(b'@bar', b'TGCA', b'+', b'ABCD') + ] + + obs = list( + _read_fastq_records(self.get_data_path('simple.fastq.gz')) + ) + + self.assertEqual(len(obs), 2) + + attrs = [ + 'sequence_header', 'sequence', 'quality_header', 'quality_scores' + ] + for exp_record, obs_record in zip(exp, obs): + for attr in attrs: + self.assertEqual( + exp_record.__getattribute__(attr), + obs_record.__getattribute__(attr) + ) + + def test_find_low_quality_window(self): + # test no low quality window returns none + # 'M' has quality score of 44 with PHRED offset of 33 + quality_scores = b'M' * 10 + obs = _find_low_quality_window( + quality_scores, phred_offset=33, min_quality=20, window_length=2 + ) + self.assertEqual(obs, None) + + # test that `min_quality` bases are not considered part of a window + # (only scores that are lower) + obs = _find_low_quality_window( + quality_scores, phred_offset=33, min_quality=44, window_length=2 + ) + self.assertEqual(obs, None) + + # test windows detected correctly + # quality scores: M => 44; + => 10 + quality_scores = b'MMM++MM' + obs = _find_low_quality_window( + quality_scores, phred_offset=33, min_quality=15, window_length=2 + ) + self.assertEqual(obs, 3) + + quality_scores = b'M++MM+++MM' + obs = _find_low_quality_window( + quality_scores, phred_offset=33, min_quality=15, window_length=3 + ) + self.assertEqual(obs, 5) + + quality_scores = b'++MMMM' + obs = _find_low_quality_window( + quality_scores, phred_offset=33, min_quality=11, window_length=2 + ) + self.assertEqual(obs, 0) + + quality_scores = b'M++MMMM+++' + obs = _find_low_quality_window( + quality_scores, phred_offset=33, min_quality=11, window_length=3 + ) + self.assertEqual(obs, 7) + + # low quality window is detected when it's longer than window length + quality_scores = b'MMMMMMM+++' + obs = _find_low_quality_window( + quality_scores, phred_offset=33, min_quality=11, window_length=2 + ) + self.assertEqual(obs, 7) + + # test when multiple windows exist, first window is returned + quality_scores = b'ML++MMM+++' + obs = _find_low_quality_window( + quality_scores, phred_offset=33, min_quality=20, window_length=2 + ) + self.assertEqual(obs, 2) + + quality_scores = b'++ML+++M+++MM++' + obs = _find_low_quality_window( + quality_scores, phred_offset=33, min_quality=20, window_length=3 + ) + self.assertEqual(obs, 4) + # test that when all windows are too short, None is returned + quality_scores = b'++ML+++M+++MM++' + obs = _find_low_quality_window( + quality_scores, phred_offset=33, min_quality=20, window_length=4 + ) + self.assertEqual(obs, None) + + def test_truncate(self): + fastq_record = FastqRecord( + b'@header', b'ATTCTGTA', b'+', b'MMLMLL++' + ) + + truncated = _truncate(copy(fastq_record), position=4) + exp = FastqRecord( + b'@header', b'ATTC', b'+', b'MMLM' + ) + self.assertEqual(truncated, exp) + + truncated = _truncate(copy(fastq_record), position=7) + exp = FastqRecord( + b'@header', b'ATTCTGT', b'+', b'MMLMLL+' + ) + self.assertEqual(truncated, exp) + + truncated = _truncate(copy(fastq_record), position=1) + exp = FastqRecord( + b'@header', b'A', b'+', b'M' + ) + self.assertEqual(truncated, exp) + + truncated = _truncate(copy(fastq_record), position=0) + exp = FastqRecord( + b'@header', b'', b'+', b'' + ) + self.assertEqual(truncated, exp) + + def test_process_record(self): + # truncation + fastq_record = FastqRecord( + b'@header', b'ATTCTGTA', b'+', b'MMLMLL++' + ) + processed_record, status = _process_record( + copy(fastq_record), + phred_offset=33, + min_quality=15, + window_length=2, + min_length_fraction=0.5, + max_ambiguous=0 + ) + exp_record = FastqRecord( + b'@header', b'ATTCTG', b'+', b'MMLMLL' + ) + exp_status = RecordStatus.TRUNCATED + self.assertEqual(processed_record, exp_record) + self.assertEqual(status, exp_status) + + # no truncation + processed_record, status = _process_record( + copy(fastq_record), + phred_offset=33, + min_quality=5, + window_length=2, + min_length_fraction=0.5, + max_ambiguous=0 + ) + exp_record = fastq_record + exp_status = RecordStatus.UNTRUNCATED + self.assertEqual(processed_record, exp_record) + self.assertEqual(status, exp_status) + + # ambiguous + fastq_record = FastqRecord( + b'@header', b'ATTCTNTN', b'+', b'MMLMLL++' + ) + processed_record, status = _process_record( + copy(fastq_record), + phred_offset=33, + min_quality=5, + window_length=2, + min_length_fraction=0.5, + max_ambiguous=1 + ) + exp_record = FastqRecord( + b'@header', b'ATTCTNTN', b'+', b'MMLMLL++' + ) + exp_status = RecordStatus.AMBIGUOUS + self.assertEqual(processed_record, exp_record) + self.assertEqual(status, exp_status) + + # truncation and ambiguous + fastq_record = FastqRecord( + b'@header', b'ATTCTNTA', b'+', b'MMLMLL++' + ) + processed_record, status = _process_record( + copy(fastq_record), + phred_offset=33, + min_quality=15, + window_length=2, + min_length_fraction=0.5, + max_ambiguous=0 + ) + exp_record = FastqRecord( + b'@header', b'ATTCTN', b'+', b'MMLMLL' + ) + exp_status = RecordStatus.TRUNCATED_AMBIGUOUS + self.assertEqual(processed_record, exp_record) + self.assertEqual(status, exp_status) + + # truncation and too short + fastq_record = FastqRecord( + b'@header', b'ATTCTGTA', b'+', b'MMLMLL++' + ) + processed_record, status = _process_record( + copy(fastq_record), + phred_offset=33, + min_quality=15, + window_length=2, + min_length_fraction=0.9, + max_ambiguous=0 + ) + exp_record = FastqRecord( + b'@header', b'ATTCTG', b'+', b'MMLMLL' + ) + exp_status = RecordStatus.SHORT + self.assertEqual(processed_record, exp_record) + self.assertEqual(status, exp_status) + + def test_is_retained(self): + filtering_stats_df = pd.DataFrame( + data=0, + index=['sample-a', 'sample-b', 'sample-c'], + columns=[ + 'total-input-reads', + 'total-retained-reads', + 'reads-truncated', + 'reads-too-short-after-truncation', + 'reads-exceeding-maximum-ambiguous-bases' + ] + ) + + # retained and truncated + retained = _is_retained( + forward_status=RecordStatus.TRUNCATED, + reverse_status=RecordStatus.UNTRUNCATED, + filtering_stats_df=filtering_stats_df, + sample_id='sample-a' + ) + self.assertTrue(retained) + self.assertEqual( + filtering_stats_df.loc['sample-a', 'total-retained-reads'], 1 + ) + self.assertEqual( + filtering_stats_df.loc['sample-a', 'reads-truncated'], 1 + ) + filtering_stats_df.iloc[:, :] = 0 + + # forward read only, retained + retained = _is_retained( + forward_status=RecordStatus.TRUNCATED, + reverse_status=None, + filtering_stats_df=filtering_stats_df, + sample_id='sample-a' + ) + self.assertTrue(retained) + self.assertEqual( + filtering_stats_df.loc['sample-a', 'total-retained-reads'], 1 + ) + self.assertEqual( + filtering_stats_df.loc['sample-a', 'reads-truncated'], 1 + ) + self.assertEqual( + filtering_stats_df.loc[ + 'sample-a', 'reads-too-short-after-truncation' + ], + 0 + ) + filtering_stats_df.iloc[:, :] = 0 + + # forward read only, short + retained = _is_retained( + forward_status=RecordStatus.SHORT, + reverse_status=None, + filtering_stats_df=filtering_stats_df, + sample_id='sample-b' + ) + self.assertFalse(retained) + self.assertEqual( + filtering_stats_df.loc['sample-b', 'total-retained-reads'], 0 + ) + self.assertEqual( + filtering_stats_df.loc['sample-b', 'reads-truncated'], 1 + ) + self.assertEqual( + filtering_stats_df.loc[ + 'sample-b', 'reads-too-short-after-truncation' + ], + 1 + ) + filtering_stats_df.iloc[:, :] = 0 + + # one read untruncated, one read truncated and ambiguous + retained = _is_retained( + forward_status=RecordStatus.UNTRUNCATED, + reverse_status=RecordStatus.TRUNCATED_AMBIGUOUS, + filtering_stats_df=filtering_stats_df, + sample_id='sample-a' + ) + self.assertFalse(retained) + self.assertEqual( + filtering_stats_df.loc['sample-a', 'total-retained-reads'], 0 + ) + self.assertEqual( + filtering_stats_df.loc[ + 'sample-a', 'reads-exceeding-maximum-ambiguous-bases' + ], + 1 + ) + self.assertEqual( + filtering_stats_df.loc['sample-a', 'reads-truncated'], 1 + ) + filtering_stats_df.iloc[:, :] = 0 + + def test_write_record(self): + fastq_record = FastqRecord( + b'@header', b'ATTCTGTA', b'+', b'MMLMLL++' + ) + + with tempfile.TemporaryDirectory() as tempdir: + fp = Path(tempdir) / 'file.fastq.gz' + + with gzip.open(fp, 'wb') as fh: + _write_record(fastq_record, fh) + + with gzip.open(fp, 'rb') as fh: + contents = fh.read() + exp = b'@header\nATTCTGTA\n+\nMMLMLL++\n' + self.assertEqual(contents, exp) + + with gzip.open(fp, 'ab') as fh: + _write_record(fastq_record, fh) + _write_record(fastq_record, fh) + + with gzip.open(fp, 'rb') as fh: + contents = fh.read() + exp = b'@header\nATTCTGTA\n+\nMMLMLL++\n' * 3 + self.assertEqual(contents, exp) + + +class QScoreSingleEndTests(TestPluginBase): + package = 'q2_quality_filter.tests' + + def test_q_score_all_dropped(self): + ar = Artifact.load(self.get_data_path('simple.qza')) + + with self.assertRaisesRegex( + ValueError, 'All sequences from all samples were filtered' + ): + with redirected_stdio(stdout=os.devnull): + self.plugin.methods['q_score'](ar, min_quality=50) + + def test_q_score_numeric_ids(self): + ar = Artifact.load(self.get_data_path('numeric_ids.qza')) + exp_sids = {'00123', '0.4560'} + + with redirected_stdio(stdout=os.devnull): + obs_ar, stats_ar = self.plugin.methods['q_score']( + ar, min_quality=2) + obs = obs_ar.view(SingleLanePerSampleSingleEndFastqDirFmt) + stats = stats_ar.view(pd.DataFrame) + obs_manifest = obs.manifest.view(obs.manifest.format) + obs_manifest = pd.read_csv(obs_manifest.open(), dtype=str, comment='#') + obs_manifest.set_index('sample-id', inplace=True) + + obs_sids = set(obs_manifest.index) + self.assertEqual(obs_sids, exp_sids) + self.assertEqual(set(stats.index), exp_sids) + + def test_q_score(self): + ar = Artifact.load(self.get_data_path('simple.qza')) + with redirected_stdio(stdout=os.devnull): + obs_drop_ambig_ar, stats_ar = self.plugin.methods['q_score']( + ar, quality_window=2, min_quality=20, min_length_fraction=0.25) + obs_drop_ambig = obs_drop_ambig_ar.view( + SingleLanePerSampleSingleEndFastqDirFmt) + stats = stats_ar.view(pd.DataFrame) + + exp_drop_ambig = ["@foo_1", + "ATGCATGC", + "+", + "DDDDBBDD"] + columns = ['sample-id', 'total-input-reads', 'total-retained-reads', + 'reads-truncated', + 'reads-too-short-after-truncation', + 'reads-exceeding-maximum-ambiguous-bases'] + exp_drop_ambig_stats = pd.DataFrame([('foo', 2, 1, 0, 0, 1), + ('bar', 1, 0, 0, 0, 1)], + columns=columns) + exp_drop_ambig_stats = exp_drop_ambig_stats.set_index('sample-id') + obs = [] + iterator = obs_drop_ambig.sequences.iter_views(FastqGzFormat) + for sample_id, fp in iterator: + obs.extend([x.strip() for x in gzip.open(str(fp), 'rt')]) + self.assertEqual(obs, exp_drop_ambig) + pdt.assert_frame_equal(stats, exp_drop_ambig_stats.loc[stats.index]) + + with redirected_stdio(stdout=os.devnull): + obs_trunc_ar, stats_ar = self.plugin.methods['q_score']( + ar, quality_window=1, min_quality=33, min_length_fraction=0.25) + obs_trunc = obs_trunc_ar.view(SingleLanePerSampleSingleEndFastqDirFmt) + stats = stats_ar.view(pd.DataFrame) + + exp_trunc = ["@foo_1", + "ATGCATGC", + "+", + "DDDDBBDD", + "@bar_1", + "ATA", + "+", + "DDD"] + exp_trunc_stats = pd.DataFrame([('foo', 2, 1, 0, 0, 1), + ('bar', 1, 1, 1, 0, 0)], + columns=columns) + exp_trunc_stats = exp_trunc_stats.set_index('sample-id') + + obs = [] + for sample_id, fp in obs_trunc.sequences.iter_views(FastqGzFormat): + obs.extend([x.strip() for x in gzip.open(str(fp), 'rt')]) + self.assertEqual(sorted(obs), sorted(exp_trunc)) + pdt.assert_frame_equal(stats, exp_trunc_stats.loc[stats.index]) + + def test_q_score_real(self): + self.maxDiff = None + + ar = Artifact.load(self.get_data_path('real_data.qza')) + with redirected_stdio(stdout=os.devnull): + obs_ar, stats_ar = self.plugin.methods['q_score']( + ar, min_quality=40, min_length_fraction=0.24) + obs_result = obs_ar.view(SingleLanePerSampleSingleEndFastqDirFmt) + stats = stats_ar.view(pd.DataFrame) + + # All input reads are represented here in their post-quality filtered + # form. Reads that are commented out were manually identified as being + # filtered by the q_score method. For the commented reads, the comments + # denote why the read is not retained. + + # The first read, @HWI-EAS440_0386:1:32:15467:1432#0/1, is 25% of + # total read length and is indicative of a sequence at the + # min_length_fraction boundary. + exp_result = [ + "@HWI-EAS440_0386:1:32:15467:1432#0/1", + "TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTT", + "+", + "hhhhhhhhhhhhfghhhghghghhhchhhahhhhhfhh", + + # too short + # "@HWI-EAS440_0386:1:36:9986:17043#0/1", + # "TACGTAGGTGGCAAGCGTTATCCGGATTTATTG", + # "+", + # "hhhhhhhhhhhhhhhhhhhhhhhhhffhhghhh", + + "@HWI-EAS440_0386:1:37:13343:14820#0/1", + ("TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGAT" + "GGATGTTTAAGTCAGTTGTG"), + "+", + ("hhhhhhhhhhhhhfhhhhhfhhhhghhhhghhhhhhhhhgghhhgghhhgghh" + "hgdhhhhghghhhdhhhhgh"), + + "@HWI-EAS440_0386:1:41:18215:15404#0/1", + "TACGTAGGTGGCGAGCGTTGTCCGGAATTATTGGGCGTAAAGAGCATGTA", + "+", + "hhhhhhhhhhhhghhhhhhhhhhhhffhhghhhhghhghgghghhhhhgh", + + # too short + # "@HWI-EAS440_0386:1:42:5423:19606#0/1", + # "TACGTAGGGAGCAAGCGTT", + # "+", + # "hhhhghhhhhhhhhghhfh", + + "@HWI-EAS440_0386:1:52:7507:5841#0/1", + "TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTT", + "+", + "hhhhhhhhhghhfghhhhhhhhhhgfhhhghhhghdhh", + + "@HWI-EAS440_0386:1:53:18599:4074#0/1", + "TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTG", + "+", + "hhhhfhhhhhfhhhhhhfhffhghhfgghggghdcbh", + + # too short + # "@HWI-EAS440_0386:1:55:16425:9514#0/1", + # "TACGGAGGATCCGAGCGTTATCCGGATT", + # "+", + # "hhhhhhhhhhhhfghhhghghhhhhbgh", + + "@HWI-EAS440_0386:1:65:12049:5619#0/1", + "TACGTAGGTGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGTG", + "+", + "hhhhhhhhhhhhhhhhhhhhhhhhhfhhhhhhhghdhghhhhhghcfh", + + # @HWI-EAS440_0386:1:95:4837:16388#0/1 + # starts off < Q40 + ] + + columns = ['sample-id', 'total-input-reads', 'total-retained-reads', + 'reads-truncated', + 'reads-too-short-after-truncation', + 'reads-exceeding-maximum-ambiguous-bases'] + exp_stats = pd.DataFrame([('foo', 10, 6, 10, 4, 0)], + columns=columns) + exp_stats = exp_stats.set_index('sample-id') + obs = [] + iterator = obs_result.sequences.iter_views(FastqGzFormat) + for sample_id, fp in iterator: + obs.extend([x.strip() for x in gzip.open(str(fp), 'rt')]) + self.assertEqual(obs, exp_result) + pdt.assert_frame_equal(stats, exp_stats.loc[stats.index]) + + def test_q_score_real_joined(self): + ar = Artifact.load(self.get_data_path('real_data_joined.qza')) + with redirected_stdio(stdout=os.devnull): + obs_ar, stats_ar = self.plugin.methods['q_score']( + ar, min_quality=40, min_length_fraction=0.24) + obs_result = obs_ar.view(SingleLanePerSampleSingleEndFastqDirFmt) + stats = stats_ar.view(pd.DataFrame) + + # All input reads are represented here in their post-quality filtered + # form. Reads that are commented out were manually identified as being + # filtered by the q_score method. For the commented reads, the comments + # denote why the read is not retained. + + # The first read, @HWI-EAS440_0386:1:32:15467:1432#0/1, is 25% of + # total read length and is indicative of a sequence at the + # min_length_fraction boundary. + exp_result = [ + "@HWI-EAS440_0386:1:32:15467:1432#0/1", + "TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTT", + "+", + "hhhhhhhhhhhhfghhhghghghhhchhhahhhhhfhh", + + # too short + # "@HWI-EAS440_0386:1:36:9986:17043#0/1", + # "TACGTAGGTGGCAAGCGTTATCCGGATTTATTG", + # "+", + # "hhhhhhhhhhhhhhhhhhhhhhhhhffhhghhh", + + "@HWI-EAS440_0386:1:37:13343:14820#0/1", + "TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGAT" + "GGATGTTTAAGTCAGTTGTG", + "+", + "hhhhhhhhhhhhhfhhhhhfhhhhghhhhghhhhhhhhhgghhhgghhhgghh" + "hgdhhhhghghhhdhhhhgh", + + "@HWI-EAS440_0386:1:41:18215:15404#0/1", + "TACGTAGGTGGCGAGCGTTGTCCGGAATTATTGGGCGTAAAGAGCATGTA", + "+", + "hhhhhhhhhhhhghhhhhhhhhhhhffhhghhhhghhghgghghhhhhgh", + + # too short + # "@HWI-EAS440_0386:1:42:5423:19606#0/1", + # "TACGTAGGGAGCAAGCGTT", + # "+", + # "hhhhghhhhhhhhhghhfh", + + "@HWI-EAS440_0386:1:52:7507:5841#0/1", + "TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTT", + "+", + "hhhhhhhhhghhfghhhhhhhhhhgfhhhghhhghdhh", + + "@HWI-EAS440_0386:1:53:18599:4074#0/1", + "TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTG", + "+", + "hhhhfhhhhhfhhhhhhfhffhghhfgghggghdcbh", + + # too short + # "@HWI-EAS440_0386:1:55:16425:9514#0/1", + # "TACGGAGGATCCGAGCGTTATCCGGATT", + # "+", + # "hhhhhhhhhhhhfghhhghghhhhhbgh", + + "@HWI-EAS440_0386:1:65:12049:5619#0/1", + "TACGTAGGTGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGTG", + "+", + "hhhhhhhhhhhhhhhhhhhhhhhhhfhhhhhhhghdhghhhhhghcfh", + + # @HWI-EAS440_0386:1:95:4837:16388#0/1 + # starts off < Q40 + ] + + columns = ['sample-id', 'total-input-reads', 'total-retained-reads', + 'reads-truncated', + 'reads-too-short-after-truncation', + 'reads-exceeding-maximum-ambiguous-bases'] + exp_stats = pd.DataFrame([('foo', 10, 6, 10, 4, 0)], + columns=columns) + exp_stats = exp_stats.set_index('sample-id') + obs = [] + iterator = obs_result.sequences.iter_views(FastqGzFormat) + for sample_id, fp in iterator: + obs.extend([x.strip() for x in gzip.open(str(fp), 'rt')]) + self.assertEqual(obs, exp_result) + pdt.assert_frame_equal(stats, exp_stats.loc[stats.index]) + + +class QScorePairedEndTests(TestPluginBase): + package = 'q2_quality_filter.tests' + + def _get_header_diff( + self, forward_record: FastqRecord, reverse_record: FastqRecord + ) -> int: + zipped_headers = zip( + forward_record.sequence_header, reverse_record.sequence_header + ) + + diff = 0 + for forward_byte, reverse_byte in zipped_headers: + if forward_byte != reverse_byte: + diff += 1 + + return diff + + def _assert_records_match(self, manifest_df: pd.DataFrame): + for forward_fp, reverse_fp in zip( + manifest_df['forward'], manifest_df['reverse'] + ): + forward_iterator = _read_fastq_records(forward_fp) + reverse_iterator = _read_fastq_records(reverse_fp) + iterator = zip(forward_iterator, reverse_iterator) + + for forward_record, reverse_record in iterator: + # headers differ in one position to indicate read direction + self.assertEqual( + self._get_header_diff(forward_record, reverse_record), 1 + ) + + def test_paired_end_sequences(self): + demux_artifact = Artifact.import_data( + SampleData[PairedEndSequencesWithQuality], + self.get_data_path('paired-end-data'), + ) + + output_seqs, stats = self.plugin.methods['q_score']( + demux_artifact, + min_quality=15, + quality_window=2, + min_length_fraction=0.8, + max_ambiguous=2 + ) + output_demux_format = output_seqs.view( + SingleLanePerSamplePairedEndFastqDirFmt + ) + demux_manifest_df = output_demux_format.manifest.view(pd.DataFrame) + + # corresponding records should have matching headers + self._assert_records_match(demux_manifest_df) + + # "Human-Kneecap2_S2" is dropped because the R1 reads have low q scores + exp_sample_ids = ['Human-Kneecap', 'Human-Kneecap3'] + self.assertEqual( + set(demux_manifest_df.index), set(exp_sample_ids) + ) + self.assertEqual(len(demux_manifest_df), 2) + + # assert truncation positions are correct + sample1_forward_exp = [ + # first record dropped because of R2 scores + b'@M00899:113:000000000-A5K20:1:1101:25454:3578 1:N:0:2', + b'CCTACGGGAGGCAGCAGTGAGGAATATTGGTCAATGGGCGAGAGCCTGAACCAGCCAAGTA', + b'+', + b'8ACCCGD@AA=18=======;CEFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGF', + b'@M00899:113:000000000-A5K20:1:1101:25177:3605 1:N:0:2', + b'CCTACGGGAGGCAGCAGTGAGGAATATTGGTCAATGGACGGAAGTCTGAACCAGCCAAGTAGCGTGCAGGATGAC', # noqa + b'+', + b'88BCCEDAD9018======;;CCFGGGGFGGGFGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGG', # noqa + ] + + with gzip.open( + demux_manifest_df.loc['Human-Kneecap', 'forward'] + ) as fh: + sample1_forward_obs = [line.strip() for line in fh.readlines()] + + self.assertEqual(sample1_forward_exp, sample1_forward_obs) + + sample1_reverse_exp = [ + # first record dropped because of R2 scores + b'@M00899:113:000000000-A5K20:1:1101:25454:3578 2:N:0:2', + b'GACTACCGGGGTATCTAATCCTGTTCGATACCCGCACCTTCGAGCTTCAGCGTCAGTTGCGCTCCCGTCAGCTGC', # noqa + b'+', + b'CCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG', # noqa + b'@M00899:113:000000000-A5K20:1:1101:25177:3605 2:N:0:2', + b'GACTACTGGGGTATCTAATCCTGTTTGATACCCGCACCTTCGAGCTTAAGCGTCAGTTGCGCTCCCGTCAGCTGC', # noqa + b'+', + b'CCCCCGG9FFGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG', # noqa + ] + + with gzip.open( + demux_manifest_df.loc['Human-Kneecap', 'reverse'] + ) as fh: + sample1_reverse_obs = [line.strip() for line in fh.readlines()] + + self.assertEqual(sample1_reverse_exp, sample1_reverse_obs) + + +class TransformerTests(TestPluginBase): + package = 'q2_quality_filter.tests' + + def test_stats_to_metadata(self): + filepath = self.get_data_path('stats-1.txt') + format = QualityFilterStatsFmt(filepath, mode='r') + transformer = self.get_transformer(QualityFilterStatsFmt, + qiime2.Metadata) + obs = transformer(format) + self.assertEqual(obs.id_count, 34) + self.assertEqual(obs.column_count, 5) + self.assertEqual(obs.id_header, 'sample-id') + + def test_numeric_ids(self): + filepath = self.get_data_path('stats-numeric.txt') + format = QualityFilterStatsFmt(filepath, mode='r') + transformer = self.get_transformer(QualityFilterStatsFmt, + qiime2.Metadata) + obs = transformer(format) + self.assertEqual(obs.id_count, 34) + self.assertEqual(obs.column_count, 5) + self.assertEqual(obs.id_header, 'sample-id') + + +class TestUsageExamples(TestPluginBase): + package = 'q2_quality_filter.tests' + + def test_examples(self): + self.execute_examples() + + +if __name__ == '__main__': + unittest.main() diff --git a/setup.py b/setup.py index 052bb33..843953f 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ }, package_data={ "q2_quality_filter": ["citations.bib"], - "q2_quality_filter.test": ["data/*"], + "q2_quality_filter.tests": ["data/*", "data/paired-end-data/*"], }, zip_safe=False, )