From 137880aab8656104263e371ed80c61af76e9ee0e Mon Sep 17 00:00:00 2001 From: Colin Wood Date: Wed, 4 Dec 2024 11:51:47 -0700 Subject: [PATCH 01/24] WIP: q_score refactor --- q2_quality_filter/_filter.py | 344 ++++++++++++++++++++++------------- q2_quality_filter/_format.py | 2 +- 2 files changed, 221 insertions(+), 125 deletions(-) diff --git a/q2_quality_filter/_filter.py b/q2_quality_filter/_filter.py index fcac4df..d5622aa 100644 --- a/q2_quality_filter/_filter.py +++ b/q2_quality_filter/_filter.py @@ -6,8 +6,10 @@ # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- -import itertools +from dataclasses import dataclass import gzip +import os +from typing import BinaryIO import yaml import pandas as pd @@ -17,39 +19,141 @@ FastqManifestFormat, YamlFormat, FastqGzFormat) -def _read_fastq_seqs(filepath, phred_offset): - # This function is adapted from @jairideout's SO post: - # http://stackoverflow.com/a/39302117/3424666 - fh = gzip.open(filepath, 'rb') - for seq_header, seq, qual_header, qual in itertools.zip_longest(*[fh] * 4): - qual = qual.strip() - qual_parsed = np.frombuffer(memoryview(qual), dtype=np.uint8) - qual_parsed = qual_parsed - phred_offset - yield (seq_header.strip(), seq.strip(), qual_header.strip(), - qual, qual_parsed) - +@dataclass +class FastqRecord: + sequence_header: bytes + sequence: bytes + quality_header: bytes + quality_scores: bytes -def _runs_of_ones(arr): - """Find the location and length of runs - This method assumes the input array is boolean - """ - # http://stackoverflow.com/a/1066838 - # make sure all runs of ones are well-bounded - bounded = np.hstack(([0], arr, [0])) - # get 1 at run starts and -1 at run ends - difs = np.diff(bounded) - run_starts, = np.where(difs > 0) - run_ends, = np.where(difs < 0) - return run_starts, run_ends - run_starts +def _read_fastq_records(filepath: str): + ''' + A generator for a fastq file that yields sequence records. The fastq file + is assumed to be gzipped. + Parameters + ---------- + filepath : str + The filepath to the fastq.gz file. -def _truncate(sequence_record, position): - """Truncate a record up to a specified position""" - seq = sequence_record[1][:position] - qual = sequence_record[3][:position] - qual_parsed = sequence_record[4][:position] - return (sequence_record[0], seq, sequence_record[2], qual, qual_parsed) + Yields + ------ + SequenceRecord + A sequence record representing a record from the fastq file. + ''' + fh = gzip.open(filepath, 'rb') + while True: + try: + sequence_header = next(fh) + sequence = next(fh) + quality_header = next(fh) + quality_scores = next(fh) + except StopIteration: + fh.close() + break + + yield FastqRecord( + sequence_header.strip(), + sequence.strip(), + quality_header.strip(), + quality_scores.strip() + ) + + +def _find_low_quality_window( + quality_scores: bytes, + phred_offset: int, + min_quality: int, + window_length: int +) -> int | None: + ''' + Searches a sequence of quality scores for subsequences (windows) of length + `window_length` that consist of quality scores each less than + `min_quality`. If one or more such windows exist then the index of the + first position of the first such window is returned (which will be the + truncation position). Otherwise None is returned. + + Parameters + ---------- + quality_scores : bytes + The quality scores byte string for a fastq record. + phred_offset : int + The PHRED offset encoding of the quality scores. + min_quality : int + The minimum quality that a base must have in order to not be considered + part of a low quality window. + window_length : int + The length of the low quality window to search for. + + Returns + ------- + int or None + The index of the first position of the first low quality window found + or None if no such window is found. + + ''' + # parse and adjust quality scores + quality_scores_parsed = np.frombuffer( + quality_scores, np.uint8 + ) + quality_scores_adjusted = quality_scores_parsed - phred_offset + less_than_min_quality = quality_scores_adjusted < min_quality + + # use a convolution to detect bad quality windows + window = np.ones(window_length, dtype=int) + convolution = np.convolve(less_than_min_quality, window, mode='valid') + window_indices = np.where(convolution == window_length)[0] + + if len(window_indices) == 0: + return None + + return window_indices[0] + + +def _truncate(fastq_record: FastqRecord, position: int) -> FastqRecord: + ''' + Truncates a fastq record's sequence and quality scores to a specified + `position`. Note that `position` is the first position that is excluded + from the resulting record. + + Parameters + ---------- + fastq_record : FastqRecord + The fastq record to truncate + position : int + The truncation position + + Returns + ------- + FastqRecord + The truncated fastq record. + ''' + fastq_record.sequence = fastq_record.sequence[:position] + fastq_record.quality_scores = fastq_record.quality_scores[:position] + + return fastq_record + + +def _write_record(fastq_record: FastqRecord, fh: BinaryIO) -> None: + ''' + Writes a fastq record to an open fastq file. + + Parameters + ---------- + fastq_record : FastqRecord + The fastq record to be written. + fh : BinaryIO + The output fastq file handler. + + Returns + ------- + None + ''' + fh.write(fastq_record.sequence_header + b'\n') + fh.write(fastq_record.sequence + b'\n') + fh.write(fastq_record.quality_header + b'\n') + fh.write(fastq_record.quality_scores + b'\n') # defaults as used Bokulich et al, Nature Methods 2013, @@ -62,15 +166,15 @@ def _truncate(sequence_record, position): } -# TODO: fix up demux fmt writing a la q2-cutadapt -def q_score(demux: SingleLanePerSampleSingleEndFastqDirFmt, - min_quality: int = _default_params['min_quality'], - quality_window: int = _default_params['quality_window'], - min_length_fraction: - float = _default_params['min_length_fraction'], - max_ambiguous: int = _default_params['max_ambiguous']) \ - -> (SingleLanePerSampleSingleEndFastqDirFmt, - pd.DataFrame): +def q_score( + demux: SingleLanePerSampleSingleEndFastqDirFmt, + min_quality: int = _default_params['min_quality'], + quality_window: int = _default_params['quality_window'], + min_length_fraction: float = _default_params['min_length_fraction'], + max_ambiguous: int = _default_params['max_ambiguous'] +) -> (SingleLanePerSampleSingleEndFastqDirFmt, pd.DataFrame): + + # create the output format and its manifest format result = SingleLanePerSampleSingleEndFastqDirFmt() manifest = FastqManifestFormat() @@ -80,12 +184,7 @@ def q_score(demux: SingleLanePerSampleSingleEndFastqDirFmt, manifest_fh.write('# data may be derived from forward, reverse, or \n') manifest_fh.write('# joined reads\n') - log_records_truncated_counts = {} - log_records_max_ambig_counts = {} - log_records_tooshort_counts = {} - log_records_totalread_counts = {} - log_records_totalkept_counts = {} - + # load the input demux manifest metadata_view = demux.metadata.view(YamlFormat).open() phred_offset = yaml.load(metadata_view, Loader=yaml.SafeLoader)['phred-offset'] @@ -93,75 +192,85 @@ def q_score(demux: SingleLanePerSampleSingleEndFastqDirFmt, demux_manifest = pd.read_csv(demux_manifest.open(), dtype=str) demux_manifest.set_index('filename', inplace=True) + filtering_stats_df = pd.DataFrame( + data=0, + index=demux_manifest['sample-id'], + columns=[ + 'total-input-reads', 'total-retained-reads', 'reads-truncated', + 'reads-too-short-after-truncation', + 'reads-exceeding-maximum-ambiguous-bases' + ] + ) + iterator = demux.sequences.iter_views(FastqGzFormat) - for bc_id, (fname, fp) in enumerate(iterator): - sample_id = demux_manifest.loc[str(fname)]['sample-id'] - - log_records_truncated_counts[sample_id] = 0 - log_records_max_ambig_counts[sample_id] = 0 - log_records_tooshort_counts[sample_id] = 0 - log_records_totalread_counts[sample_id] = 0 - log_records_totalkept_counts[sample_id] = 0 - - # per q2-demux, barcode ID, lane number and read number are not - # relevant here - path = result.sequences.path_maker(sample_id=sample_id, - barcode_id=bc_id, - lane_number=1, - read_number=1) - - # we do not open a writer by default in the event that all sequences - # for a sample are filtered out; an empty fastq file is not a valid - # fastq file. - writer = None - for sequence_record in _read_fastq_seqs(str(fp), phred_offset): - log_records_totalread_counts[sample_id] += 1 - - # determine the length of the runs below quality threshold - # NOTE: QIIME 1.x used <= the quality threshold and the parameter - # -q was interpreted as the maximum unacceptable PHRED score. In - # QIIME 2.x, we're now interpreting this as the minimum - # acceptable score. - qual_below_threshold = sequence_record[4] < min_quality - run_starts, run_lengths = _runs_of_ones(qual_below_threshold) - bad_windows = np.argwhere(run_lengths > quality_window) - - # if there is a run of sufficient size, truncate it - if bad_windows.size > 0: - log_records_truncated_counts[sample_id] += 1 - - full_length = len(sequence_record[1]) - sequence_record = _truncate(sequence_record, - run_starts[bad_windows[0]][0]) - trunc_length = len(sequence_record[1]) - - # do not keep the read if it is too short following truncation - if round(trunc_length / full_length, 3) <= min_length_fraction: - log_records_tooshort_counts[sample_id] += 1 + for barcode_id, (filename, filepath) in enumerate(iterator): + sample_id = demux_manifest.loc[str(filename)]['sample-id'] + + # barcode ID, lane number and read number are not relevant here + path = result.sequences.path_maker( + sample_id=sample_id, + barcode_id=barcode_id, + lane_number=1, + read_number=1 + ) + + output_fh = gzip.open(path, mode='wb') + + for fastq_record in _read_fastq_records(str(filepath)): + filtering_stats_df.loc[sample_id, 'total-input-reads'] += 1 + + # search for low quality window + truncation_position = _find_low_quality_window( + fastq_record.quality_scores, + phred_offset, + min_quality, + quality_window + ) + + # truncate fastq record if necessary and discard if it has been + # made too short + initial_record_length = len(fastq_record.sequence) + if truncation_position is not None: + fastq_record = _truncate(fastq_record, truncation_position) + filtering_stats_df.loc[sample_id, 'reads-truncated'] += 1 + + trunc_fraction = truncation_position / initial_record_length + if trunc_fraction < min_length_fraction: + filtering_stats_df.loc[ + sample_id, 'reads-too-short-after-truncation' + ] += 1 continue - # do not keep the read if there are too many ambiguous bases - if sequence_record[1].count(b'N') > max_ambiguous: - log_records_max_ambig_counts[sample_id] += 1 + # discard record if there are too many ambiguous bases + if fastq_record.sequence.count(b'N') > max_ambiguous: + filtering_stats_df.loc[ + sample_id, 'reads-exceeding-maximum-ambiguous-bases' + ] += 1 continue - fastq_lines = b'\n'.join(sequence_record[:4]) + b'\n' - - if writer is None: - writer = gzip.open(str(path), mode='w') - writer.write(fastq_lines) - - log_records_totalkept_counts[sample_id] += 1 - - if writer is not None: - manifest_fh.write('%s,%s,%s\n' % (sample_id, path.name, 'forward')) - writer.close() - - if set(log_records_totalkept_counts.values()) == {0, }: - raise ValueError("All sequences from all samples were filtered out. " - "The parameter choices may be too stringent for the " - "data.") - + # write record to output file + _write_record(fastq_record, output_fh) + filtering_stats_df.loc[sample_id, 'total-retained-reads'] += 1 + + # close output file and update manifest if records were retained, + # otherwise delete the empty file + output_fh.close() + if filtering_stats_df.loc[sample_id, 'total-retained-reads'] > 0: + # TODO + direction = 'forward' + manifest_fh.write(f'{sample_id},{path.name},{direction}\n') + else: + os.remove(path) + + # error if all samples retained no reads + if filtering_stats_df['total-retained-reads'].sum() == 0: + msg = ( + 'All sequences from all samples were filtered. The parameter ' + 'choices may have been too stringent for the data.' + ) + raise ValueError(msg) + + # write output manifest and metadata files to format manifest_fh.close() result.manifest.write_data(manifest, FastqManifestFormat) @@ -169,18 +278,5 @@ def q_score(demux: SingleLanePerSampleSingleEndFastqDirFmt, metadata.path.write_text(yaml.dump({'phred-offset': phred_offset})) result.metadata.write_data(metadata, YamlFormat) - columns = ['sample-id', 'total-input-reads', 'total-retained-reads', - 'reads-truncated', - 'reads-too-short-after-truncation', - 'reads-exceeding-maximum-ambiguous-bases'] - stats = [] - for id_, _ in sorted(log_records_truncated_counts.items()): - stats.append([id_, log_records_totalread_counts[id_], - log_records_totalkept_counts[id_], - log_records_truncated_counts[id_], - log_records_tooshort_counts[id_], - log_records_max_ambig_counts[id_]]) - - stats = pd.DataFrame(stats, columns=columns).set_index('sample-id') - - return result, stats + print('stats df', filtering_stats_df) + return result, filtering_stats_df diff --git a/q2_quality_filter/_format.py b/q2_quality_filter/_format.py index c1e62c1..21d12f1 100644 --- a/q2_quality_filter/_format.py +++ b/q2_quality_filter/_format.py @@ -10,7 +10,7 @@ class QualityFilterStatsFmt(model.TextFileFormat): - def sniff(self): + def _validate_(self, level): line = open(str(self)).readline() hdr = line.strip().split(',') expected = ['sample-id', 'total-input-reads', From 8f6b0c8524a3d2f07901099c6bcb7cc15e2431fd Mon Sep 17 00:00:00 2001 From: Colin Wood Date: Wed, 4 Dec 2024 14:51:33 -0700 Subject: [PATCH 02/24] remove old tests, add new tests --- q2_quality_filter/_filter.py | 50 +++---- q2_quality_filter/test/test_filter.py | 186 +++++++++++++++++++------- 2 files changed, 166 insertions(+), 70 deletions(-) diff --git a/q2_quality_filter/_filter.py b/q2_quality_filter/_filter.py index d5622aa..cde4def 100644 --- a/q2_quality_filter/_filter.py +++ b/q2_quality_filter/_filter.py @@ -9,7 +9,6 @@ from dataclasses import dataclass import gzip import os -from typing import BinaryIO import yaml import pandas as pd @@ -135,7 +134,7 @@ def _truncate(fastq_record: FastqRecord, position: int) -> FastqRecord: return fastq_record -def _write_record(fastq_record: FastqRecord, fh: BinaryIO) -> None: +def _write_record(fastq_record: FastqRecord, fh: gzip.GzipFile) -> None: ''' Writes a fastq record to an open fastq file. @@ -143,7 +142,7 @@ def _write_record(fastq_record: FastqRecord, fh: BinaryIO) -> None: ---------- fastq_record : FastqRecord The fastq record to be written. - fh : BinaryIO + fh : GzipFile The output fastq file handler. Returns @@ -156,33 +155,33 @@ def _write_record(fastq_record: FastqRecord, fh: BinaryIO) -> None: fh.write(fastq_record.quality_scores + b'\n') -# defaults as used Bokulich et al, Nature Methods 2013, -# same as QIIME 1.9.1 -_default_params = { - 'min_quality': 4, - 'quality_window': 3, - 'min_length_fraction': 0.75, - 'max_ambiguous': 0 -} - - def q_score( demux: SingleLanePerSampleSingleEndFastqDirFmt, - min_quality: int = _default_params['min_quality'], - quality_window: int = _default_params['quality_window'], - min_length_fraction: float = _default_params['min_length_fraction'], - max_ambiguous: int = _default_params['max_ambiguous'] + min_quality: int = 4, + quality_window: int = 3, + min_length_fraction: float = 0.75, + max_ambiguous: int = 0 ) -> (SingleLanePerSampleSingleEndFastqDirFmt, pd.DataFrame): + ''' + Parameter defaults as used in Bokulich et al, Nature Methods 2013, same as + QIIME 1.9.1. + + Parameters + ---------- + Returns + ------- + tuple[SingleLanePerSampleSingleEndFastqDirFmt, pd.DataFrame] + + ''' + + # TODO: paired/single handling # create the output format and its manifest format result = SingleLanePerSampleSingleEndFastqDirFmt() manifest = FastqManifestFormat() manifest_fh = manifest.open() manifest_fh.write('sample-id,filename,direction\n') - manifest_fh.write('# direction is not meaningful in this file as these\n') - manifest_fh.write('# data may be derived from forward, reverse, or \n') - manifest_fh.write('# joined reads\n') # load the input demux manifest metadata_view = demux.metadata.view(YamlFormat).open() @@ -192,6 +191,12 @@ def q_score( demux_manifest = pd.read_csv(demux_manifest.open(), dtype=str) demux_manifest.set_index('filename', inplace=True) + # HACK: we have to deal with comment lines that may be present in the + # manifest that used to be written by this method + demux_manifest = demux_manifest[ + ~demux_manifest['sample-id'].str.startswith('#') + ] + filtering_stats_df = pd.DataFrame( data=0, index=demux_manifest['sample-id'], @@ -224,7 +229,7 @@ def q_score( fastq_record.quality_scores, phred_offset, min_quality, - quality_window + quality_window + 1 ) # truncate fastq record if necessary and discard if it has been @@ -235,7 +240,7 @@ def q_score( filtering_stats_df.loc[sample_id, 'reads-truncated'] += 1 trunc_fraction = truncation_position / initial_record_length - if trunc_fraction < min_length_fraction: + if trunc_fraction <= min_length_fraction: filtering_stats_df.loc[ sample_id, 'reads-too-short-after-truncation' ] += 1 @@ -278,5 +283,4 @@ def q_score( metadata.path.write_text(yaml.dump({'phred-offset': phred_offset})) result.metadata.write_data(metadata, YamlFormat) - print('stats df', filtering_stats_df) return result, filtering_stats_df diff --git a/q2_quality_filter/test/test_filter.py b/q2_quality_filter/test/test_filter.py index 242b9f2..144f9f9 100644 --- a/q2_quality_filter/test/test_filter.py +++ b/q2_quality_filter/test/test_filter.py @@ -1,21 +1,22 @@ # ---------------------------------------------------------------------------- -# Copyright (c) 2016-2023, QIIME 2 development team. +# Copyright (c) 2016-2024, QIIME 2 development team. # # Distributed under the terms of the Modified BSD License. # # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- -import unittest +from copy import copy import gzip import os +from pathlib import Path +import tempfile +import unittest import pandas as pd import pandas.testing as pdt import qiime2 from qiime2.sdk import Artifact -import numpy as np -import numpy.testing as npt from qiime2.plugin.testing import TestPluginBase from qiime2.util import redirected_stdio from q2_types.per_sample_sequences import ( @@ -24,9 +25,11 @@ ) from q2_quality_filter._filter import ( - _read_fastq_seqs, - _runs_of_ones, + FastqRecord, + _read_fastq_records, + _find_low_quality_window, _truncate, + _write_record, ) from q2_quality_filter._format import QualityFilterStatsFmt @@ -34,55 +37,142 @@ class FilterTests(TestPluginBase): package = 'q2_quality_filter.test' - def test_read_fastq_seqs(self): - exp = [(b'@foo', b'ATGC', b'+', b'IIII', np.array([40, 40, 40, 40])), - (b'@bar', b'TGCA', b'+', b'ABCD', np.array([32, 33, 34, 35]))] - obs = list(_read_fastq_seqs(self.get_data_path('simple.fastq.gz'), 33)) - self.assertEqual(len(obs), 2) + def test_read_fastq_records(self): + exp = [ + FastqRecord(b'@foo', b'ATGC', b'+', b'IIII'), + FastqRecord(b'@bar', b'TGCA', b'+', b'ABCD') + ] - for o, e in zip(obs, exp): - self.assertEqual(o[:4], e[:4]) - npt.assert_equal(o[4], e[4]) + obs = list( + _read_fastq_records(self.get_data_path('simple.fastq.gz')) + ) - def test_runs_of_ones(self): - data = [np.array([0, 0, 0, 0, 0, 0], dtype=bool), - np.array([1, 0, 1, 0, 1, 0], dtype=bool), - np.array([1, 1, 1, 1, 1, 1], dtype=bool), - np.array([0, 1, 1, 1, 0, 0], dtype=bool), - np.array([0, 0, 0, 0, 0, 1], dtype=bool)] - - exp_starts = [np.array([]), np.array([0, 2, 4]), np.array([0]), - np.array([1]), np.array([5])] - exp_lengths = [np.array([]), np.array([1, 1, 1]), np.array([6]), - np.array([3]), np.array([1])] + self.assertEqual(len(obs), 2) - for i, d in enumerate(data): - o_starts, o_lengths = _runs_of_ones(d) - npt.assert_equal(o_starts, exp_starts[i]) - npt.assert_equal(o_lengths, exp_lengths[i]) + attrs = [ + 'sequence_header', 'sequence', 'quality_header', 'quality_scores' + ] + for exp_record, obs_record in zip(exp, obs): + for attr in attrs: + self.assertEqual( + exp_record.__getattribute__(attr), + obs_record.__getattribute__(attr) + ) + + def test_find_low_quality_window(self): + # test no low quality window returns none + # 'M' has quality score of 44 with PHRED offset of 33 + quality_scores = b'M' * 10 + obs = _find_low_quality_window( + quality_scores, phred_offset=33, min_quality=20, window_length=2 + ) + self.assertEqual(obs, None) + + # test that `min_quality` bases are not considered part of a window + # (only scores that are lower) + obs = _find_low_quality_window( + quality_scores, phred_offset=33, min_quality=44, window_length=2 + ) + + # test windows detected correctly + # quality scores: M => 44; + => 10 + quality_scores = b'MMM++MM' + obs = _find_low_quality_window( + quality_scores, phred_offset=33, min_quality=15, window_length=2 + ) + self.assertEqual(obs, 3) + + quality_scores = b'M++MM+++MM' + obs = _find_low_quality_window( + quality_scores, phred_offset=33, min_quality=15, window_length=3 + ) + self.assertEqual(obs, 5) + + quality_scores = b'++MMMM' + obs = _find_low_quality_window( + quality_scores, phred_offset=33, min_quality=11, window_length=2 + ) + self.assertEqual(obs, 0) + + quality_scores = b'M++MMMM+++' + obs = _find_low_quality_window( + quality_scores, phred_offset=33, min_quality=11, window_length=3 + ) + self.assertEqual(obs, 7) + + # test when multiple windows exist, first window is returned + quality_scores = b'ML++MMM+++' + obs = _find_low_quality_window( + quality_scores, phred_offset=33, min_quality=20, window_length=2 + ) + self.assertEqual(obs, 2) + + quality_scores = b'++ML+++M+++MM++' + obs = _find_low_quality_window( + quality_scores, phred_offset=33, min_quality=20, window_length=3 + ) + self.assertEqual(obs, 4) def test_truncate(self): - data = [('@x', 'ATGCG', '+', 'IIIIA', np.array([40, 40, 40, 40, 32])), - ('@y', 'TGCAC', '+', 'ABCDA', np.array([32, 33, 34, 35, 32]))] - - exp1 = [('@x', 'A', '+', 'I', np.array([40])), - ('@y', 'T', '+', 'A', np.array([32]))] - - exp2 = [('@x', 'AT', '+', 'II', np.array([40, 40])), - ('@y', 'TG', '+', 'AB', np.array([32, 33]))] - - for i, d in enumerate(data): - o1 = _truncate(d, 1) - o2 = _truncate(d, 2) - self.assertEqual(o1[:4], exp1[i][:4]) - npt.assert_equal(o1[4], exp1[i][4]) - self.assertEqual(o2[:4], exp2[i][:4]) - npt.assert_equal(o2[4], exp2[i][4]) + fastq_record = FastqRecord( + b'@header', b'ATTCTGTA', b'+', b'MMLMLL++' + ) + + truncated = _truncate(copy(fastq_record), position=4) + exp = FastqRecord( + b'@header', b'ATTC', b'+', b'MMLM' + ) + self.assertEqual(truncated, exp) + + truncated = _truncate(copy(fastq_record), position=7) + exp = FastqRecord( + b'@header', b'ATTCTGT', b'+', b'MMLMLL+' + ) + self.assertEqual(truncated, exp) + + truncated = _truncate(copy(fastq_record), position=1) + exp = FastqRecord( + b'@header', b'A', b'+', b'M' + ) + self.assertEqual(truncated, exp) + + truncated = _truncate(copy(fastq_record), position=0) + exp = FastqRecord( + b'@header', b'', b'+', b'' + ) + self.assertEqual(truncated, exp) + + def test_write_record(self): + fastq_record = FastqRecord( + b'@header', b'ATTCTGTA', b'+', b'MMLMLL++' + ) + + with tempfile.TemporaryDirectory() as tempdir: + fp = Path(tempdir) / 'file.fastq.gz' + + with gzip.open(fp, 'wb') as fh: + _write_record(fastq_record, fh) + + with gzip.open(fp, 'rb') as fh: + contents = fh.read() + exp = b'@header\nATTCTGTA\n+\nMMLMLL++\n' + self.assertEqual(contents, exp) + + with gzip.open(fp, 'ab') as fh: + _write_record(fastq_record, fh) + _write_record(fastq_record, fh) + + with gzip.open(fp, 'rb') as fh: + contents = fh.read() + exp = b'@header\nATTCTGTA\n+\nMMLMLL++\n' * 3 + self.assertEqual(contents, exp) def test_q_score_all_dropped(self): ar = Artifact.load(self.get_data_path('simple.qza')) - with self.assertRaisesRegex(ValueError, "filtered out"): + with self.assertRaisesRegex( + ValueError, 'All sequences from all samples were filtered' + ): with redirected_stdio(stdout=os.devnull): self.plugin.methods['q_score'](ar, min_quality=50) @@ -92,7 +182,7 @@ def test_q_score_numeric_ids(self): with redirected_stdio(stdout=os.devnull): obs_ar, stats_ar = self.plugin.methods['q_score']( - ar, min_quality=20) + ar, min_quality=2) obs = obs_ar.view(SingleLanePerSampleSingleEndFastqDirFmt) stats = stats_ar.view(pd.DataFrame) obs_manifest = obs.manifest.view(obs.manifest.format) @@ -157,6 +247,8 @@ def test_q_score(self): pdt.assert_frame_equal(stats, exp_trunc_stats.loc[stats.index]) def test_q_score_real(self): + self.maxDiff = None + ar = Artifact.load(self.get_data_path('real_data.qza')) with redirected_stdio(stdout=os.devnull): obs_ar, stats_ar = self.plugin.methods['q_score']( From 8730f63e61a60ee1f39944b73fb4284acb4c89b4 Mon Sep 17 00:00:00 2001 From: Colin Wood Date: Wed, 4 Dec 2024 14:58:37 -0700 Subject: [PATCH 03/24] parameter help text refactor --- q2_quality_filter/plugin_setup.py | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/q2_quality_filter/plugin_setup.py b/q2_quality_filter/plugin_setup.py index f077e7c..3caec6f 100644 --- a/q2_quality_filter/plugin_setup.py +++ b/q2_quality_filter/plugin_setup.py @@ -62,19 +62,24 @@ } _q_score_parameter_descriptions = { - 'min_quality': 'The minimum acceptable PHRED score. All PHRED scores ' - 'less that this value are considered to be low PHRED ' - 'scores.', - 'quality_window': 'The maximum number of low PHRED scores that ' - 'can be observed in direct succession before ' - 'truncating a sequence read.', - 'min_length_fraction': 'The minimum length that a sequence read can ' - 'be following truncation and still be ' - 'retained. This length should be provided ' - 'as a fraction of the input sequence length.', - 'max_ambiguous': 'The maximum number of ambiguous (i.e., N) base ' - 'calls. This is applied after trimming sequences ' - 'based on `min_length_fraction`.' + 'min_quality': ( + 'The minimum acceptable PHRED score. All PHRED scores less that this ' + 'value are considered to be low PHRED scores.' + ), + 'quality_window': ( + 'The maximum number of low PHRED scores that can be observed in ' + 'direct succession before truncating a sequence read. Note that ' + 'truncation is performed such that the entirety of the low quality ' + 'window is truncated.' + ), + 'min_length_fraction': ( + 'Filter truncated reads whose length fraction (truncated length ' + 'divided by original length) is less than or equal to this value.' + ), + 'max_ambiguous': ( + 'The maximum number of ambiguous (i.e., N) base calls. This is ' + 'applied after trimming sequences based on `min_length_fraction`.' + ) } _q_score_output_descriptions = { From a1c5f578199a081781f6b2acc0d38bd61f8bcd3a Mon Sep 17 00:00:00 2001 From: Colin Wood Date: Wed, 4 Dec 2024 16:14:09 -0700 Subject: [PATCH 04/24] figure out single/paired end type map --- q2_quality_filter/_filter.py | 46 +++++++++++++++---------------- q2_quality_filter/_format.py | 18 ++++++++++++ q2_quality_filter/_transformer.py | 17 +++++++++++- 3 files changed, 57 insertions(+), 24 deletions(-) diff --git a/q2_quality_filter/_filter.py b/q2_quality_filter/_filter.py index cde4def..926ad22 100644 --- a/q2_quality_filter/_filter.py +++ b/q2_quality_filter/_filter.py @@ -9,13 +9,21 @@ from dataclasses import dataclass import gzip import os +from typing import Union import yaml import pandas as pd import numpy as np + from q2_types.per_sample_sequences import ( - SingleLanePerSampleSingleEndFastqDirFmt, - FastqManifestFormat, YamlFormat, FastqGzFormat) + SingleLanePerSampleSingleEndFastqDirFmt, + SingleLanePerSamplePairedEndFastqDirFmt, + FastqManifestFormat, + YamlFormat, + FastqGzFormat, +) + +from q2_quality_filter._format import _ReadDirectionUnion @dataclass @@ -156,28 +164,26 @@ def _write_record(fastq_record: FastqRecord, fh: gzip.GzipFile) -> None: def q_score( - demux: SingleLanePerSampleSingleEndFastqDirFmt, + demux: Union[ + SingleLanePerSampleSingleEndFastqDirFmt, + SingleLanePerSamplePairedEndFastqDirFmt + ], min_quality: int = 4, quality_window: int = 3, min_length_fraction: float = 0.75, max_ambiguous: int = 0 -) -> (SingleLanePerSampleSingleEndFastqDirFmt, pd.DataFrame): +) -> (_ReadDirectionUnion, pd.DataFrame): ''' Parameter defaults as used in Bokulich et al, Nature Methods 2013, same as QIIME 1.9.1. - - Parameters - ---------- - - Returns - ------- - tuple[SingleLanePerSampleSingleEndFastqDirFmt, pd.DataFrame] - ''' - # TODO: paired/single handling - # create the output format and its manifest format - result = SingleLanePerSampleSingleEndFastqDirFmt() + # we need to use a union type of single-end and paired-end formats + # which will be transformed by the framework to the appropriate return type + output_format = _ReadDirectionUnion() + output_format.format = type(demux)() + + result = output_format.format manifest = FastqManifestFormat() manifest_fh = manifest.open() @@ -188,15 +194,9 @@ def q_score( phred_offset = yaml.load(metadata_view, Loader=yaml.SafeLoader)['phred-offset'] demux_manifest = demux.manifest.view(demux.manifest.format) - demux_manifest = pd.read_csv(demux_manifest.open(), dtype=str) + demux_manifest = pd.read_csv(demux_manifest.open(), dtype=str, comment='#') demux_manifest.set_index('filename', inplace=True) - # HACK: we have to deal with comment lines that may be present in the - # manifest that used to be written by this method - demux_manifest = demux_manifest[ - ~demux_manifest['sample-id'].str.startswith('#') - ] - filtering_stats_df = pd.DataFrame( data=0, index=demux_manifest['sample-id'], @@ -283,4 +283,4 @@ def q_score( metadata.path.write_text(yaml.dump({'phred-offset': phred_offset})) result.metadata.write_data(metadata, YamlFormat) - return result, filtering_stats_df + return output_format, filtering_stats_df diff --git a/q2_quality_filter/_format.py b/q2_quality_filter/_format.py index 21d12f1..ca02a06 100644 --- a/q2_quality_filter/_format.py +++ b/q2_quality_filter/_format.py @@ -6,8 +6,15 @@ # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- +from typing import Union + import qiime2.plugin.model as model +from q2_types.per_sample_sequences import ( + SingleLanePerSampleSingleEndFastqDirFmt, + SingleLanePerSamplePairedEndFastqDirFmt, +) + class QualityFilterStatsFmt(model.TextFileFormat): def _validate_(self, level): @@ -23,3 +30,14 @@ def _validate_(self, level): QualityFilterStatsDirFmt = model.SingleFileDirectoryFormat( 'QualityFilterStatsDirFmt', 'stats.csv', QualityFilterStatsFmt) + + +_ReadDirectionTypes = Union[ + SingleLanePerSampleSingleEndFastqDirFmt, + SingleLanePerSamplePairedEndFastqDirFmt, + None +] + + +class _ReadDirectionUnion: + format: _ReadDirectionTypes = None diff --git a/q2_quality_filter/_transformer.py b/q2_quality_filter/_transformer.py index c908551..0bcf28c 100644 --- a/q2_quality_filter/_transformer.py +++ b/q2_quality_filter/_transformer.py @@ -10,7 +10,12 @@ import qiime2 from .plugin_setup import plugin -from ._format import QualityFilterStatsFmt +from ._format import QualityFilterStatsFmt, _ReadDirectionUnion + +from q2_types.per_sample_sequences import ( + SingleLanePerSampleSingleEndFastqDirFmt, + SingleLanePerSamplePairedEndFastqDirFmt, +) @plugin.register_transformer @@ -45,3 +50,13 @@ def _2(ff: QualityFilterStatsFmt) -> pd.DataFrame: @plugin.register_transformer def _3(ff: QualityFilterStatsFmt) -> qiime2.Metadata: return qiime2.Metadata(_stats_to_df(ff)) + + +@plugin.register_transformer +def _4(ff: _ReadDirectionUnion) -> SingleLanePerSampleSingleEndFastqDirFmt: + return ff.format + + +@plugin.register_transformer +def _5(ff: _ReadDirectionUnion) -> SingleLanePerSamplePairedEndFastqDirFmt: + return ff.format From cbf336ce75399ff3c2797af10e5d1ebf164bc17a Mon Sep 17 00:00:00 2001 From: Colin Wood Date: Wed, 4 Dec 2024 16:46:55 -0700 Subject: [PATCH 05/24] WIP: support paired end reads --- q2_quality_filter/_filter.py | 50 ++++++++++++++++++++++++------------ q2_quality_filter/_format.py | 3 +-- 2 files changed, 34 insertions(+), 19 deletions(-) diff --git a/q2_quality_filter/_filter.py b/q2_quality_filter/_filter.py index 926ad22..5f5d75c 100644 --- a/q2_quality_filter/_filter.py +++ b/q2_quality_filter/_filter.py @@ -10,9 +10,10 @@ import gzip import os from typing import Union +from pathlib import Path + import yaml import pandas as pd - import numpy as np from q2_types.per_sample_sequences import ( @@ -177,13 +178,12 @@ def q_score( Parameter defaults as used in Bokulich et al, Nature Methods 2013, same as QIIME 1.9.1. ''' - # we need to use a union type of single-end and paired-end formats # which will be transformed by the framework to the appropriate return type - output_format = _ReadDirectionUnion() - output_format.format = type(demux)() + union_format = _ReadDirectionUnion() + union_format.format = type(demux)() - result = output_format.format + result = union_format.format manifest = FastqManifestFormat() manifest_fh = manifest.open() @@ -194,12 +194,14 @@ def q_score( phred_offset = yaml.load(metadata_view, Loader=yaml.SafeLoader)['phred-offset'] demux_manifest = demux.manifest.view(demux.manifest.format) - demux_manifest = pd.read_csv(demux_manifest.open(), dtype=str, comment='#') - demux_manifest.set_index('filename', inplace=True) + demux_manifest_df = pd.read_csv( + demux_manifest.open(), dtype=str, comment='#' + ) + demux_manifest_df.set_index('filename', inplace=True) filtering_stats_df = pd.DataFrame( data=0, - index=demux_manifest['sample-id'], + index=demux_manifest_df['sample-id'], columns=[ 'total-input-reads', 'total-retained-reads', 'reads-truncated', 'reads-too-short-after-truncation', @@ -207,20 +209,33 @@ def q_score( ] ) - iterator = demux.sequences.iter_views(FastqGzFormat) - for barcode_id, (filename, filepath) in enumerate(iterator): - sample_id = demux_manifest.loc[str(filename)]['sample-id'] + for barcode_id, filename in enumerate(demux_manifest_df.index.values): + # determine read number + if 'R1' in str(filename): + read_number = 1 + elif 'R2' in str(filename): + read_number = 2 + else: + msg = ( + 'Unrecognized fastq file name. Netiher "R1" nor "R2" ' + f'were found in the file {filename}.' + ) + raise ValueError(msg) + + # look up sample id + sample_id = demux_manifest_df.loc[str(filename)]['sample-id'] - # barcode ID, lane number and read number are not relevant here + # create path for output fastq file path = result.sequences.path_maker( sample_id=sample_id, barcode_id=barcode_id, lane_number=1, - read_number=1 + read_number=read_number ) output_fh = gzip.open(path, mode='wb') + filepath = Path(demux.path) / filename for fastq_record in _read_fastq_records(str(filepath)): filtering_stats_df.loc[sample_id, 'total-input-reads'] += 1 @@ -261,9 +276,10 @@ def q_score( # otherwise delete the empty file output_fh.close() if filtering_stats_df.loc[sample_id, 'total-retained-reads'] > 0: - # TODO - direction = 'forward' - manifest_fh.write(f'{sample_id},{path.name},{direction}\n') + if read_number == 1: + manifest_fh.write(f'{sample_id},{path.name},forward\n') + elif read_number == 2: + manifest_fh.write(f'{sample_id},{path.name},reverse\n') else: os.remove(path) @@ -283,4 +299,4 @@ def q_score( metadata.path.write_text(yaml.dump({'phred-offset': phred_offset})) result.metadata.write_data(metadata, YamlFormat) - return output_format, filtering_stats_df + return union_format, filtering_stats_df diff --git a/q2_quality_filter/_format.py b/q2_quality_filter/_format.py index ca02a06..ac03d03 100644 --- a/q2_quality_filter/_format.py +++ b/q2_quality_filter/_format.py @@ -35,9 +35,8 @@ def _validate_(self, level): _ReadDirectionTypes = Union[ SingleLanePerSampleSingleEndFastqDirFmt, SingleLanePerSamplePairedEndFastqDirFmt, - None ] class _ReadDirectionUnion: - format: _ReadDirectionTypes = None + format: _ReadDirectionTypes From 321b1e94759c4928649f4b5a49d251cc3e510447 Mon Sep 17 00:00:00 2001 From: Colin Wood Date: Thu, 5 Dec 2024 16:29:20 -0700 Subject: [PATCH 06/24] WIP: paired end read processing --- q2_quality_filter/_filter.py | 313 ++++++++++++++++++++++++++++------- 1 file changed, 256 insertions(+), 57 deletions(-) diff --git a/q2_quality_filter/_filter.py b/q2_quality_filter/_filter.py index 5f5d75c..8daf83f 100644 --- a/q2_quality_filter/_filter.py +++ b/q2_quality_filter/_filter.py @@ -7,6 +7,7 @@ # ---------------------------------------------------------------------------- from dataclasses import dataclass +from enum import Enum import gzip import os from typing import Union @@ -21,7 +22,6 @@ SingleLanePerSamplePairedEndFastqDirFmt, FastqManifestFormat, YamlFormat, - FastqGzFormat, ) from q2_quality_filter._format import _ReadDirectionUnion @@ -143,6 +143,177 @@ def _truncate(fastq_record: FastqRecord, position: int) -> FastqRecord: return fastq_record +class RecordStatus(Enum): + UNTRUNCATED = 1 + TRUNCATED = 2 + SHORT = 3 + AMBIGUOUS = 4 + TRUNCATED_AMBIGUOUS = 5 + + +def _process_record( + fastq_record: FastqRecord, + phred_offset: int, + min_quality: int, + window_length: int, + min_length_fraction: float, + max_ambiguous: int, +) -> tuple[FastqRecord, RecordStatus]: + ''' + Processes a fastq record by detecting low quality windows, truncating if + one or more such windows are found, detecting if a truncated record is too + short, and finally detecting if the number of ambiguous bases is too high. + + Parameters + ---------- + fastq_record : FastqRecord | None + The fastq record to be processed. None if record does not exist (for + convenience when reverse reads are not present). + phred_offset : int + The PHRED encoding of the record's quality scores. + min_quality : int + The minimum quality that a base must have in order to not be considered + part of a low quality window. + window_length : int + The length of the low quality window to search for. + min_length_fraction : float + The fraction of its original length a record must be greater than to + be retained. + max_ambiguous : int + The maximum number of ambiguous bases a record may contain to be + retained. + + Returns + ------- + tuple[FastqRecord, RecordStatus] + A tuple containing the processed record and its status. + ''' + if fastq_record is None: + return None, None + + status = RecordStatus.UNTRUNCATED + + # search for low quality window + truncation_position = _find_low_quality_window( + fastq_record.quality_scores, + phred_offset, + min_quality, + window_length + ) + + # check if truncation should be performed mark short if necessary + initial_record_length = len(fastq_record.sequence) + if truncation_position is not None: + fastq_record = _truncate(fastq_record, truncation_position) + status = RecordStatus.TRUNCATED + + trunc_fraction = truncation_position / initial_record_length + if trunc_fraction <= min_length_fraction: + status = RecordStatus.SHORT + + return fastq_record, status + + # mark ambiguous if too many ambiguous bases are present + if fastq_record.sequence.count(b'N') > max_ambiguous: + if status == RecordStatus.TRUNCATED: + status = RecordStatus.TRUNCATED_AMBIGUOUS + else: + status = RecordStatus.AMBIGUOUS + + return fastq_record, status + + +def _is_retained( + forward_status: RecordStatus, + reverse_status: RecordStatus | None, + filtering_stats_df: pd.DataFrame, + sample_id: str +) -> bool: + ''' + Determines whether a fastq record or pair of fastq records will retained + in the output. The `reverse_status` is None in the case of single-end + reads. + + Parameters + ---------- + forward_status : RecordStatus + The status of the record from the forward fastq file. + reverse_status : RecordStatus or None + The status of the record from the reverse fastq file if it exists + otherwise None. + filtering_stats_df : pd.DataFrame + The data structure that tracks filtering stats. + sample_id : str + The sample id that the record(s) belongs to. + + Returns + ------- + bool + True if the record(s) is to be retained, False otherwise. + ''' + filtering_stats_df.loc[sample_id, 'total-input-reads'] += 1 + + if (RecordStatus.SHORT in (forward_status, reverse_status)): + filtering_stats_df.loc[sample_id, 'reads-truncated'] += 1 + filtering_stats_df.loc[ + sample_id, 'reads-too-short-after-truncation' + ] += 1 + return False + + if (RecordStatus.AMBIGUOUS in (forward_status, reverse_status)): + filtering_stats_df.loc[ + sample_id, 'reads-exceeding-maximum-ambiguous-bases' + ] += 1 + return False + + if (RecordStatus.TRUNCATED_AMBIGUOUS in (forward_status, reverse_status)): + filtering_stats_df.loc[sample_id, 'reads-truncated'] += 1 + filtering_stats_df.loc[ + sample_id, 'reads-exceeding-maximum-ambiguous-bases' + ] += 1 + return False + + if (RecordStatus.TRUNCATED in (forward_status, reverse_status)): + filtering_stats_df.loc[sample_id, 'reads-truncated'] += 1 + + filtering_stats_df.loc[sample_id, 'total-retained-reads'] += 1 + + return True + + +def _align_records( + forward_record: FastqRecord, reverse_record: FastqRecord +) -> tuple[FastqRecord, FastqRecord]: + ''' + Align a forward record and reverse record to the same truncation length. + Note that if either (forward or reverse) truncation resulted in the record + falling below the minimum length fraction then this was already handled + upstream. + + Parameters + ---------- + forward_record : FastqRecord + The record from the forward fastq file. + reverse_record : FastqRecord + The record from the reverse fastq file. + + Returns + ------- + tuple[FastqRecord, FastqRecord] + The length-aligned forward and reverse records. + ''' + if len(forward_record.sequence) < len(reverse_record.sequence): + reverse_record = _truncate( + reverse_record, len(forward_record.sequence) + ) + elif len(reverse_record.sequence) < len(forward_record.sequence): + forward_record = _truncate( + forward_record, len(reverse_record.sequence) + ) + + return forward_record, reverse_record + + def _write_record(fastq_record: FastqRecord, fh: gzip.GzipFile) -> None: ''' Writes a fastq record to an open fastq file. @@ -189,6 +360,11 @@ def q_score( manifest_fh = manifest.open() manifest_fh.write('sample-id,filename,direction\n') + if isinstance(result, SingleLanePerSamplePairedEndFastqDirFmt): + paired = True + else: + paired = False + # load the input demux manifest metadata_view = demux.metadata.view(YamlFormat).open() phred_offset = yaml.load(metadata_view, @@ -210,78 +386,101 @@ def q_score( ) for barcode_id, filename in enumerate(demux_manifest_df.index.values): - # determine read number - if 'R1' in str(filename): - read_number = 1 - elif 'R2' in str(filename): - read_number = 2 - else: - msg = ( - 'Unrecognized fastq file name. Netiher "R1" nor "R2" ' - f'were found in the file {filename}.' - ) - raise ValueError(msg) + if 'R2' in str(filename): + # we handle a read pair in the iteration for R1 + continue # look up sample id sample_id = demux_manifest_df.loc[str(filename)]['sample-id'] # create path for output fastq file - path = result.sequences.path_maker( + forward_path = result.sequences.path_maker( sample_id=sample_id, barcode_id=barcode_id, lane_number=1, - read_number=read_number + read_number=2 ) + if paired: + reverse_path = result.sequences.path_maker( + sample_id=sample_id, + barcode_id=barcode_id, + lane_number=1, + read_number=2 + ) + + forward_fh = gzip.open(forward_path, mode='wb') + forward_input_fp = Path(demux.path) / filename - output_fh = gzip.open(path, mode='wb') + if paired: + reverse_fh = gzip.open(reverse_path, mode='wb') + reverse_input_fp = Path(demux.path) / filename.replace('R1', 'R2') - filepath = Path(demux.path) / filename - for fastq_record in _read_fastq_records(str(filepath)): - filtering_stats_df.loc[sample_id, 'total-input-reads'] += 1 + forward_iterator = _read_fastq_records(str(forward_input_fp)) + reverse_iterator = _read_fastq_records(str(reverse_input_fp)) + iterator = zip(forward_iterator, reverse_iterator) + else: + iterator = _read_fastq_records(str(forward_input_fp)) + + for fastq_record in iterator: + # process record(s) + if paired: + forward_record, reverse_record = fastq_record + else: + forward_record = fastq_record + reverse_record = None + + forward_record, forward_status = _process_record( + fastq_record=forward_record, + phred_offset=phred_offset, + min_quality=min_quality, + window_length=quality_window + 1, + min_length_fraction=min_length_fraction, + max_ambiguous=max_ambiguous + ) + reverse_record, reverse_status = _process_record( + fastq_record=reverse_record, + phred_offset=phred_offset, + min_quality=min_quality, + window_length=quality_window + 1, + min_length_fraction=min_length_fraction, + max_ambiguous=max_ambiguous + ) - # search for low quality window - truncation_position = _find_low_quality_window( - fastq_record.quality_scores, - phred_offset, - min_quality, - quality_window + 1 + # see if record(s) is retained and update filtering stats + retained = _is_retained( + forward_status, + reverse_status, + filtering_stats_df, + sample_id ) - # truncate fastq record if necessary and discard if it has been - # made too short - initial_record_length = len(fastq_record.sequence) - if truncation_position is not None: - fastq_record = _truncate(fastq_record, truncation_position) - filtering_stats_df.loc[sample_id, 'reads-truncated'] += 1 - - trunc_fraction = truncation_position / initial_record_length - if trunc_fraction <= min_length_fraction: - filtering_stats_df.loc[ - sample_id, 'reads-too-short-after-truncation' - ] += 1 - continue - - # discard record if there are too many ambiguous bases - if fastq_record.sequence.count(b'N') > max_ambiguous: - filtering_stats_df.loc[ - sample_id, 'reads-exceeding-maximum-ambiguous-bases' - ] += 1 - continue - - # write record to output file - _write_record(fastq_record, output_fh) - filtering_stats_df.loc[sample_id, 'total-retained-reads'] += 1 - - # close output file and update manifest if records were retained, - # otherwise delete the empty file - output_fh.close() + # if retained, align truncations and write to output files + if retained: + if paired: + forward_record, reverse_record = _align_records( + forward_record, reverse_record + ) + _write_record(forward_record, forward_fh) + _write_record(reverse_record, reverse_fh) + else: + _write_record(forward_record, forward_fh) + + # close output file(s) and update manifest if record(s) is retained, + # otherwise delete the empty file(s) + forward_fh.close() + if paired: + reverse_fh.close() + if filtering_stats_df.loc[sample_id, 'total-retained-reads'] > 0: - if read_number == 1: - manifest_fh.write(f'{sample_id},{path.name},forward\n') - elif read_number == 2: - manifest_fh.write(f'{sample_id},{path.name},reverse\n') + manifest_fh.write(f'{sample_id},{forward_path.name},forward\n') + if paired: + manifest_fh.write( + f'{sample_id},{reverse_path.name},reverse\n' + ) else: - os.remove(path) + os.remove(forward_path) + if paired: + os.remove(reverse_path) # error if all samples retained no reads if filtering_stats_df['total-retained-reads'].sum() == 0: From 5d5f273f03512d9fe61b66febef4a3b00d94c5f0 Mon Sep 17 00:00:00 2001 From: Colin Wood Date: Fri, 6 Dec 2024 10:43:56 -0700 Subject: [PATCH 07/24] _process_record and _align_records tests --- q2_quality_filter/_filter.py | 13 +-- q2_quality_filter/test/test_filter.py | 139 ++++++++++++++++++++++++++ 2 files changed, 146 insertions(+), 6 deletions(-) diff --git a/q2_quality_filter/_filter.py b/q2_quality_filter/_filter.py index 8daf83f..fe67cde 100644 --- a/q2_quality_filter/_filter.py +++ b/q2_quality_filter/_filter.py @@ -349,7 +349,7 @@ def q_score( Parameter defaults as used in Bokulich et al, Nature Methods 2013, same as QIIME 1.9.1. ''' - # we need to use a union type of single-end and paired-end formats + # we need to use a union type of single-end or paired-end formats # which will be transformed by the framework to the appropriate return type union_format = _ReadDirectionUnion() union_format.format = type(demux)() @@ -393,7 +393,7 @@ def q_score( # look up sample id sample_id = demux_manifest_df.loc[str(filename)]['sample-id'] - # create path for output fastq file + # create path(s) for output fastq file forward_path = result.sequences.path_maker( sample_id=sample_id, barcode_id=barcode_id, @@ -422,13 +422,13 @@ def q_score( iterator = _read_fastq_records(str(forward_input_fp)) for fastq_record in iterator: - # process record(s) if paired: forward_record, reverse_record = fastq_record else: forward_record = fastq_record reverse_record = None + # process records forward_record, forward_status = _process_record( fastq_record=forward_record, phred_offset=phred_offset, @@ -446,7 +446,7 @@ def q_score( max_ambiguous=max_ambiguous ) - # see if record(s) is retained and update filtering stats + # see if record(s) retained and update filtering stats retained = _is_retained( forward_status, reverse_status, @@ -454,9 +454,10 @@ def q_score( sample_id ) - # if retained, align truncations and write to output files + # if retained write to output file(s) if retained: if paired: + # align truncations if paired forward_record, reverse_record = _align_records( forward_record, reverse_record ) @@ -465,7 +466,7 @@ def q_score( else: _write_record(forward_record, forward_fh) - # close output file(s) and update manifest if record(s) is retained, + # close output file(s) and update manifest if record(s) retained, # otherwise delete the empty file(s) forward_fh.close() if paired: diff --git a/q2_quality_filter/test/test_filter.py b/q2_quality_filter/test/test_filter.py index 144f9f9..998e91d 100644 --- a/q2_quality_filter/test/test_filter.py +++ b/q2_quality_filter/test/test_filter.py @@ -29,6 +29,9 @@ _read_fastq_records, _find_low_quality_window, _truncate, + RecordStatus, + _process_record, + _align_records, _write_record, ) from q2_quality_filter._format import QualityFilterStatsFmt @@ -142,6 +145,142 @@ def test_truncate(self): ) self.assertEqual(truncated, exp) + def test_process_record(self): + # truncation + fastq_record = FastqRecord( + b'@header', b'ATTCTGTA', b'+', b'MMLMLL++' + ) + processed_record, status = _process_record( + copy(fastq_record), + phred_offset=33, + min_quality=15, + window_length=2, + min_length_fraction=0.5, + max_ambiguous=0 + ) + exp_record = FastqRecord( + b'@header', b'ATTCTG', b'+', b'MMLMLL' + ) + exp_status = RecordStatus.TRUNCATED + self.assertEqual(processed_record, exp_record) + self.assertEqual(status, exp_status) + + # no truncation + processed_record, status = _process_record( + copy(fastq_record), + phred_offset=33, + min_quality=5, + window_length=2, + min_length_fraction=0.5, + max_ambiguous=0 + ) + exp_record = fastq_record + exp_status = RecordStatus.UNTRUNCATED + self.assertEqual(processed_record, exp_record) + self.assertEqual(status, exp_status) + + # ambiguous + fastq_record = FastqRecord( + b'@header', b'ATTCTNTN', b'+', b'MMLMLL++' + ) + processed_record, status = _process_record( + copy(fastq_record), + phred_offset=33, + min_quality=5, + window_length=2, + min_length_fraction=0.5, + max_ambiguous=1 + ) + exp_record = FastqRecord( + b'@header', b'ATTCTNTN', b'+', b'MMLMLL++' + ) + exp_status = RecordStatus.AMBIGUOUS + self.assertEqual(processed_record, exp_record) + self.assertEqual(status, exp_status) + + # truncation and ambiguous + fastq_record = FastqRecord( + b'@header', b'ATTCTNTA', b'+', b'MMLMLL++' + ) + processed_record, status = _process_record( + copy(fastq_record), + phred_offset=33, + min_quality=15, + window_length=2, + min_length_fraction=0.5, + max_ambiguous=0 + ) + exp_record = FastqRecord( + b'@header', b'ATTCTN', b'+', b'MMLMLL' + ) + exp_status = RecordStatus.TRUNCATED_AMBIGUOUS + self.assertEqual(processed_record, exp_record) + self.assertEqual(status, exp_status) + + # truncation and too short + fastq_record = FastqRecord( + b'@header', b'ATTCTGTA', b'+', b'MMLMLL++' + ) + processed_record, status = _process_record( + copy(fastq_record), + phred_offset=33, + min_quality=15, + window_length=2, + min_length_fraction=0.9, + max_ambiguous=0 + ) + exp_record = FastqRecord( + b'@header', b'ATTCTG', b'+', b'MMLMLL' + ) + exp_status = RecordStatus.SHORT + self.assertEqual(processed_record, exp_record) + self.assertEqual(status, exp_status) + + def test_align_records(self): + # records unchanged if equal lengths + forward_record = FastqRecord( + b'@header', b'ATTCTGTA', b'+', b'MMLMLL++' + ) + reverse_record = FastqRecord( + b'@header', b'TTAGCATC', b'+', b'+MM+MLM+' + ) + obs_forward_record, obs_reverse_record = _align_records( + forward_record, reverse_record + ) + self.assertEqual(obs_forward_record, forward_record) + self.assertEqual(obs_reverse_record, reverse_record) + + # longer record truncated to shorter record + forward_record = FastqRecord( + b'@header', b'ATTCTGTA', b'+', b'MMLMLL++' + ) + reverse_record = FastqRecord( + b'@header', b'TTAGCA', b'+', b'+MM+ML' + ) + obs_forward_record, obs_reverse_record = _align_records( + forward_record, reverse_record + ) + exp_forward_record = FastqRecord( + b'@header', b'ATTCTG', b'+', b'MMLMLL' + ) + self.assertEqual(obs_forward_record, exp_forward_record) + self.assertEqual(obs_reverse_record, reverse_record) + + forward_record = FastqRecord( + b'@header', b'ATTC', b'+', b'MMLM' + ) + reverse_record = FastqRecord( + b'@header', b'TTAGCATC', b'+', b'+MM+MLM+' + ) + obs_forward_record, obs_reverse_record = _align_records( + forward_record, reverse_record + ) + exp_reverse_record = FastqRecord( + b'@header', b'TTAG', b'+', b'+MM+' + ) + self.assertEqual(obs_forward_record, forward_record) + self.assertEqual(obs_reverse_record, exp_reverse_record) + def test_write_record(self): fastq_record = FastqRecord( b'@header', b'ATTCTGTA', b'+', b'MMLMLL++' From de3e4a4452af02222c2d49049d3b266c5ab3f184 Mon Sep 17 00:00:00 2001 From: Colin Wood Date: Fri, 6 Dec 2024 11:33:35 -0700 Subject: [PATCH 08/24] _is_retained tests --- q2_quality_filter/_filter.py | 4 +- q2_quality_filter/test/test_filter.py | 96 +++++++++++++++++++++++++++ 2 files changed, 99 insertions(+), 1 deletion(-) diff --git a/q2_quality_filter/_filter.py b/q2_quality_filter/_filter.py index fe67cde..407ce5c 100644 --- a/q2_quality_filter/_filter.py +++ b/q2_quality_filter/_filter.py @@ -379,7 +379,9 @@ def q_score( data=0, index=demux_manifest_df['sample-id'], columns=[ - 'total-input-reads', 'total-retained-reads', 'reads-truncated', + 'total-input-reads', + 'total-retained-reads', + 'reads-truncated', 'reads-too-short-after-truncation', 'reads-exceeding-maximum-ambiguous-bases' ] diff --git a/q2_quality_filter/test/test_filter.py b/q2_quality_filter/test/test_filter.py index 998e91d..0c691dc 100644 --- a/q2_quality_filter/test/test_filter.py +++ b/q2_quality_filter/test/test_filter.py @@ -31,6 +31,7 @@ _truncate, RecordStatus, _process_record, + _is_retained, _align_records, _write_record, ) @@ -236,6 +237,101 @@ def test_process_record(self): self.assertEqual(processed_record, exp_record) self.assertEqual(status, exp_status) + def test_is_retained(self): + filtering_stats_df = pd.DataFrame( + data=0, + index=['sample-a', 'sample-b', 'sample-c'], + columns=[ + 'total-input-reads', + 'total-retained-reads', + 'reads-truncated', + 'reads-too-short-after-truncation', + 'reads-exceeding-maximum-ambiguous-bases' + ] + ) + + # retained and truncated + retained = _is_retained( + forward_status=RecordStatus.TRUNCATED, + reverse_status=RecordStatus.UNTRUNCATED, + filtering_stats_df=filtering_stats_df, + sample_id='sample-a' + ) + self.assertTrue(retained) + self.assertEqual( + filtering_stats_df.loc['sample-a', 'total-retained-reads'], 1 + ) + self.assertEqual( + filtering_stats_df.loc['sample-a', 'reads-truncated'], 1 + ) + filtering_stats_df.iloc[:, :] = 0 + + # forward read only, retained + retained = _is_retained( + forward_status=RecordStatus.TRUNCATED, + reverse_status=None, + filtering_stats_df=filtering_stats_df, + sample_id='sample-a' + ) + self.assertTrue(retained) + self.assertEqual( + filtering_stats_df.loc['sample-a', 'total-retained-reads'], 1 + ) + self.assertEqual( + filtering_stats_df.loc['sample-a', 'reads-truncated'], 1 + ) + self.assertEqual( + filtering_stats_df.loc[ + 'sample-a', 'reads-too-short-after-truncation' + ], + 0 + ) + filtering_stats_df.iloc[:, :] = 0 + + # forward read only, short + retained = _is_retained( + forward_status=RecordStatus.SHORT, + reverse_status=None, + filtering_stats_df=filtering_stats_df, + sample_id='sample-a' + ) + self.assertFalse(retained) + self.assertEqual( + filtering_stats_df.loc['sample-a', 'total-retained-reads'], 0 + ) + self.assertEqual( + filtering_stats_df.loc['sample-a', 'reads-truncated'], 1 + ) + self.assertEqual( + filtering_stats_df.loc[ + 'sample-a', 'reads-too-short-after-truncation' + ], + 1 + ) + filtering_stats_df.iloc[:, :] = 0 + + # one read untruncated, one read truncated and ambiguous + retained = _is_retained( + forward_status=RecordStatus.UNTRUNCATED, + reverse_status=RecordStatus.TRUNCATED_AMBIGUOUS, + filtering_stats_df=filtering_stats_df, + sample_id='sample-a' + ) + self.assertFalse(retained) + self.assertEqual( + filtering_stats_df.loc['sample-a', 'total-retained-reads'], 0 + ) + self.assertEqual( + filtering_stats_df.loc[ + 'sample-a', 'reads-exceeding-maximum-ambiguous-bases' + ], + 1 + ) + self.assertEqual( + filtering_stats_df.loc['sample-a', 'reads-truncated'], 1 + ) + filtering_stats_df.iloc[:, :] = 0 + def test_align_records(self): # records unchanged if equal lengths forward_record = FastqRecord( From 79376283a1fa81a2199ed0d4a02c231232eb9275 Mon Sep 17 00:00:00 2001 From: Colin Wood Date: Fri, 6 Dec 2024 11:44:04 -0700 Subject: [PATCH 09/24] tests restructuring --- q2_quality_filter/test/data/numeric_ids.qza | Bin 3945 -> 0 bytes q2_quality_filter/test/data/real_data.qza | Bin 4576 -> 0 bytes .../test/data/real_data_joined.qza | Bin 4880 -> 0 bytes q2_quality_filter/test/data/simple.qza | Bin 3870 -> 0 bytes q2_quality_filter/{test => tests}/__init__.py | 0 .../Human-Kneecap_S1_L001_R2_001.fastq.gz | Bin 0 -> 827 bytes .../tests/data/paired-end-data/MANIFEST | 7 +++++++ .../{test => tests}/data/simple.fastq.gz | Bin .../{test => tests}/data/stats-1.txt | 0 .../{test => tests}/data/stats-numeric.txt | 0 .../{test => tests}/test_filter.py | 16 ++++++++++++---- setup.py | 2 +- 12 files changed, 20 insertions(+), 5 deletions(-) delete mode 100644 q2_quality_filter/test/data/numeric_ids.qza delete mode 100644 q2_quality_filter/test/data/real_data.qza delete mode 100644 q2_quality_filter/test/data/real_data_joined.qza delete mode 100644 q2_quality_filter/test/data/simple.qza rename q2_quality_filter/{test => tests}/__init__.py (100%) create mode 100644 q2_quality_filter/tests/data/paired-end-data/Human-Kneecap_S1_L001_R2_001.fastq.gz create mode 100644 q2_quality_filter/tests/data/paired-end-data/MANIFEST rename q2_quality_filter/{test => tests}/data/simple.fastq.gz (100%) rename q2_quality_filter/{test => tests}/data/stats-1.txt (100%) rename q2_quality_filter/{test => tests}/data/stats-numeric.txt (100%) rename q2_quality_filter/{test => tests}/test_filter.py (98%) diff --git a/q2_quality_filter/test/data/numeric_ids.qza b/q2_quality_filter/test/data/numeric_ids.qza deleted file mode 100644 index d704134097901297da85d0c2f9f45c519aa88ed8..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3945 zcmd6pc~sL^7RM8!5J41KOl1c_qnM3^eT@(y1d>2NDB?g!Si`0Skwr0KDTWpl0xp1T zVY3#2f(3*T&=HixuvRFFEGmi%t0*cu(bj`a$2r(Df6V)xd)|5f-0$bUbMNnc$pp|E zMF0Q*225hj2}907{)a^;wkY&PK}Q)H)AZ?vFuDoSABOSsH-Y&gO{g$B9p#VqMbXhF z27ZW8KPHt&Z8f{vk3LfuAL{q$h`1aN}Ex?9G_s0|WHxT7w-!kl?gU?cw*J|t7i*upi zNko~2g@q_L|BPWr9KI*6Y+mjnD`KUCTkuk&f0@q-7^k-TrZ@?&ed9`id%ENJEUl zb7X?cHHwEz`}|Dr&&BQS85+{+`Y8ITh*Sp@5-B45)Ac#sl7f1Pcs)g7_jTMzls@_o zY~+8!9(?*GY)ujNNNXz@0jp`?ohbSS>g(Gd@Q#;SUY!SXM!1u_iV7<#W~k1I-?^4d zd9(1+QXX6WdF9<@K8M8uiWNWsFo_fqc(f>1-Nzp<-U&;@@4-=A^@w~}Cw@TS*n{ul z;{^o1(*XX)E(a(04Zd(fzZ;D*KzFmqvu%>ImXeYwv)GhbYln~IWJHU@cG#5)^BQQf z3^b5_0ko|sST^GPS}{auRa}jd2v0#2hN4*2sC`6SnYYl8AK{%)2d5qZiJ(V>)wm!H zs~$**34K1(A7)}zLJBYI5ELA`q9k``;KzS3#_ev_KJ!9hZYaM*B#%(Pk zK544?T2`ZS(wJJV@V5*@+cWtP?YyOmnX7N3xziC6R#@p7iM(Y^Dw#g=qqDpuN3-i zbKexknBs=rj;lB2yFbt>U3(*qU1NSjI&U)Qn-u5$<Um)qy}9%_DSi9oI7L@5$4~2C;0{sgexm6H6WDErf|7A55Ig zocnv`enlPZ(<3c8D56oB?{Ez-~G?&|H<|w)|);0%B+O z{YZh=@{7I~^$;oQu@=VA3vzd@--z4B!8jwMg!s8$K+JR?!FHk?ycTL-`!Yd@b3N8w z;`-egG|)p*Db8A>7#TR;RH&1l$k9W?k8hBL#7-{HU(7mnPa#%Qwoi|NCpZ93@7OoI zt5t8p{+`Sb$)KZc=eu<7g=9^DQoMYQRFl-=)gz~%V1#>27x*qHpe;7a`6g(Xbo8pG z8UA8h=C|DJ5Qrbsr8kFSzpO7?J7AW*(GhEE)`~e)T(mWGGO4hcC{t6b#+8-yW%68i zGw^;bKJHr2n1Pdd5zYVTYC5;0(RjDt7_-iqhG?v|ZdD_`&AC>m=3iLT($Q0YzjaPc z4tB8kCnKPyd=LMms@Kk5Kc+V`R*EkhL} zA-vq12;VDIU*PqJ4Lw$K=c$Hq1fkn}vRnL)4$d^fD_*NePzvrHa3vJ6z-l$hM)L^J z9U*t4YxVL|-P}>4XXQ4_2J)p$_27}(WH$!5Ak}pJGOQ9CwQgi@w~x=_PZ))Yu@r>b zZGSnL)3ZB&{UH?oev9chT<@u;mm6H zME9ph0h^t?u7#0;Zx+$sW_C5Mn`x9Zym~Jo{13 zn#!TXL8FB;6Bw^MCrgs!cj1TI&pv62>1$LwFg)F8d<)h@j6ZCoau=I*OGP*OE%2rq zgqKO7KQ1llmMrT%nrWA=T6>y5A@D+SBKnAFH^NffX6>o8GfS}X@B2F}XniU=%$j{U zxzM2=WKns?;kw7B6?-PFNMW7k_Fj9*r=wivmz1os%@VdrZ8)p1{cQ^ErL`XJs(=h! z^UNe)zfb{Akq`fE4{7?DAyeT|*mEVEli^X9wH{!~Ah&;I{BXb{n-TQfB>u6qw>cZwD$&O)R59MU zp-V|l!+TWLILU33{H3x*bKE6H`mogT{OfRmsl%93h*O^vVLDi;-cb4^pof@R_9lVezK+F`Ndu2N`40DYl>Rw zguga|xPknn!DIEWR^v-{QoNkcd+vj=`?Pqh@-?e0=QF7E!ou5qqwlRdqr3m?{mq#>cjhHJPgE9An{UoxQHaoTFTl2CT58cfl0w}aG0o-l$eOPB|=I<+*;TP?`eUv z@U#%RZsFv}(^^0q2&eNOd#zaw5wjCa#jEgA(gX#$NzUpC+X<2u-V8d_?9A+{arR=S zLr=%bs=3HC`G{e$Q>2UP#XW7di&?9IB}{|%{*si7{$o<3+Hj3h*-*JHP{6et#}3-) zw@>iD2Agt@(X9I|%|#oZ5EN#ek}cc@YW+n>qNtaT(4he6zv8EaV3f6WP?~7S#>R%1 z9(-ICs`{=8s`gSFKB+3G`c?zhF5FVA4M)JmdPTcLJJe7x(NPfK8?V!00*UwDAR`Ys zYmSn46hD)|qvQ!|$fHpz$~w9-_o}ID;B-nWUy0<;3(^k8P^&B$(au=j>;!|(=MvvVQFAh;AF3mzr*GFBut)V-8x zXl?vD5+eLf9?hY=q$~45rw(N?9ESKW$@^9$PLA0^O?;{voo$+Gf@oNQHe57o>iayF zs!OYKW3G$UkpTczhk^bjB0Xmu-V^WcWao^>nd!jI)ZuWrnYM@-94;bcZQCS ztRlO#%N|BWwIoV30D#@`7LI0rRPY}^0NUbO-6zE9l(+W7p*mT!@+~LzYxA|=N`|Q; zNH`S7rHU^HW>JGQ#HZZl4M;&nZnAOtxD!IoC6#+#dpm&DQc&H1?cmDY@>IrHUpA9PPuW|!NQ-leridYR}vsUwLQTi#aD@gCMKUDfJl^t4bl zNh!*)6{7Z9C7rxwZ(!JIB}4oEeR42TGeh(1#)6INtLtw3l{_T~I*G^Pff7r7ocFk4_N9A)tL6b+0na!qF&8FspI8PJW!mzeTON|K9DdAb z&YW{!X*ju9WvO2N%FC)d7a^yyCeMqIjIT^!vJY_!FU)9G*1-LrFpUM{CnubycI1bB z<-`IcN=Gh#H1j7h4(~<~Da}_bxnVa}+>R&XvSTsNVm|ZD1Yfu;ZgjeJ$^K!?d1xVJ z8!c`@skkCqBh)sZ1!+Xzqa>bd*@o!74=tM7LUeK)njTPyqr{ztIQoenZeA8gv*$fZ z7dS?UXV6)Nj#62VQVn(ti~&J}-#x4SS$tl+z)_{O6$g{^x%ria_Ueo2p06rVV5^hQ zVxW*yAm~l@Q%4UwolaoRf4f8hMHQBd1o5H>fL&XPD9e8JH-URWOu7|{U_-1KnkP(R zg6Bg5Qyo8clv;#$tdRUtsAN}z)W*aFi1ped2JkyMR>GXR+6d8amVs~Vc!vOUP6%QU zEWpoQ7I3oh^|kw|Y-w8X0ew#EB?7HurNHC$dXM`qMN=~`0;QQYb@SaqO?w;RfJo0C zR8kg1nwPwh+o}udykR4occVrsRY^w z=-YZFtPqroulacfvL7o}h1!y`nX1=#3pCPZk)a#BK0F%Hm~4lQ@nNM`T!OZ|oeq|{ zsQkMK_8m6tl2aN9P?H`-e$~E?Lwb|(1r-O9!$d5KYgSa@(QSsaPdFzq<7ww_O37z| zn8c0Ctz6m=9Y9QzuJ}v7#c!|APoAuFb^2=T%-DTso!h_G~9wa3J zoS&^+EV$O@z7=qD;c9_ZrJk%naYaF&b&n;p!1=!EnYG+buA#HVjs+WrJach$uFjT(-BaCfxEa!p*!{UgH|n?s{qH zQsPa_y_MPLuxN>jbs^@?-?dD*S}>JqARh0K+PaJ6F02R~i7ysRKc!#aP_QitE77Dc zuR8%U0KSc`Y*y?Xru3R8SC|{U6zCBghy}G^&Ko7>?m9D-Ru&h)6Y5 zMz#!a!LAz9DOhS_l#OFG=;O%Z?CJ-g?d$y}kzK^cyDr1+2G5oRYWQGQq=wRMVjhOu zXo|ZzukeMcm2NJb^o~b=#j*yRpXjk-{oeB(vPD}vCG(8BVC}=`@Uqfm$E;d3*i1Q< zom5yNUvz&*o=5ETw$cCvnduG8av)lU}yDmNg%B|s>oL1Xv)b%Rf zy;Z(@pZ50`M4neRxAiG8g%`c9o#CS_#ZF-DN zW{B%HTV>h_s&v$XJE})hxSb;>h5OEq#{<*OXE;40YpFa}<@E>RHZ~hWT_M;<>b_fu z)7h;#4LmI#bz#s}I=gO!gVBQrrLeUyh)@+I7uP4Yn&+L)^y#NPyrfFN`CHqnKq0g7hOq>jOLL`m7sB%>+ARK%qR%sTI9Q0 z5!U4%JE2GX$fo#pbD>Q`>11MBXZSp&x#IBZtdGCLJ!hALQV)ZT5HH<4tH5HIP z5*z=Jkj2;$pBnk&B(c2<(j;VFuvqqN(ybtY@qj{@iAND#qU!vtidE0CUJ_sMcp*6( zMHb7ZRx~ffEvv_V<({5-PH>u!HdbIEd67)pcTe4rPrNy@g9gvMjLp)MV2|Ogr@x(c z_Pw_ucRk)bk@n#@nJ^~3xTZ}O-eJKl`B)>z0}~nn@&LzBfHMI=mp2bYmMQZ-uu#fo z#yJjlMhujkKSBF(vUZYMFmTo~zSvl8q-^#J|0sWK(#D2)M6Yq6zt?odjQ@&q(R5F# zaBY{|u`l74whZWJ=^7uc9WE{~-L8?(bWuTpomR5c&UG0?(HD-k2PY*&a&!K+B$G;~ z=QWhRHQ(6c6_JSR;xBemp1E+Y$MfpMD0%(1MkLAIe7gbQ??T*Udw_1^mF9%||jx(Oz)Gen*lcn@?zhnI!M{6z0jOutA zAV-_nUwaQ~9sf2Wf#$;dihYhh3dNUzmv-O>EMX4$eYpi`PH ztsF6O&sUs*P5u>e0b4p%K|aKo>E%T?*(Q{0oA-G~yqV@XK*%v$_2c7nM*a$QA5F2r z`-wdqRr}JQBG>1js8f05>#3}Pk)n1r>fvcd9U(C9n-`U#@HBmztu(ufP$AY8-f#=E>AOv@uV ztWeuti~IU(Xx!_kK{AY0pPcVZg!B$)esv|aqP?ofse9j}-*PSBI4*1^(eDj}$R@sI zw)<_7F^KA2MFMNLKFctqWA9>Aa4fmz!vhcC>OK?SkV^cSE6GKd3MyCA$KecDt}U5q zdE6n!^o0A$$;|A4SjBZR{6o8?tV>xbek=>Km9xY-;Lrw(E)(8faagY|5u;+9{ERB@K0p_0D0g~-uy4d z@O%6pd9iQ!9H3vp|CXM=X6HvX>`#ugfV;oM@iV6Edwo9|X})onQU0X9|1{wIo$C7{ xIGQy6TxkK}RKHI3gL?SUEb)zm?%1!B{QG2~rAAJ1gh_SyNrM4^DB7dbKL8~KKG6UG diff --git a/q2_quality_filter/test/data/real_data_joined.qza b/q2_quality_filter/test/data/real_data_joined.qza deleted file mode 100644 index 2aab35c91bde6cc2910dc991f2d0029bdd4f0850..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4880 zcmdT|cTm$^mktCZfFfOxDosif0wkdbB1lJyR4Jho0Vx6@(xjspq)N2_QWONFigb+h zrZh!LC;;P&Au;xNPy1BR4_Ng!>+AZSUL1OzUPhC!^w>}(-+Fr=-twKP&f90?b7vBy~3 zS!1lBKGrVI{GBB?q5!U-_`#G@- z!Z}e+>orL~3eFClE#ez^r}4ZTVTJMRE1bBEfBBFbO5$hb9WB9+VR%XYpm|YYukH+} zX;*VwT6IGH`8uU{c$j2Yt6#RJ2hh28GvRI*=fO zXj8ARuX`E7h#FvxcWq$JSES_!s3i30hLuJsm4lCL1wQX~`CZnqd^YKmvwyc207CD|+PJeRI6%;b`QSMK1KyB@E& zIJ_cEKWL#_A~TvI-tdJC2Q#4+N=0ipiB`O_F0ii?*e$B#N zYP)!C?Cw5eZF)=7E4h}=*8Q`Gm_*(T=^u6EGxC8qkpTb=hoSwCfDB#j>@oHpE{?ADc9!}u%PTM#%u-Lx z5(X24qOCnK?$GPLBn@PjcDW)!phXGXH2~1V-rCvnS3Cdu0MMP>=`n?*Ro?m$1=i1- zQ|O>KZYK25^Wj->ZLh@;->+AYGuf3g-^qDQN zS#Gm$jZjjvuOWENdCeSLwlSaFyxjTxS;2gD(+2~l`uu9g%6l|U2rra=R#O7p+$Ky# z{}#cZaz(eF5o4`^dRkWC0HyL;NuUpOx@OXAD@QXhkP(j1d9Jg+zMwBJbS@LO_r)*L z?76FXVbr%r{$1_}Ic4)kj2jWL6lNVN6GcwY@N3vh?P? z=@fsg1o-TT1F`yFu>-yqb_;Z1tKI|BvujohB+kv8D&~b`%J=<>A4Sm1C;omO%-OgxzQyh1IeRAH;9j{bYt+exlKf2b+Sk;TKeq zX3U+7PN@%g!KD=4Gu@60X6VYBkiAe= zgWAhL6H`lV{z%Cw{tu5>n}kd!sKf*&OUW-qNcFWyuTM>pa(F*v0KSvwz<#-+Ife_E zV-TD?*(1zWa7TOuC@jQR6-?h+=N+fPnXL;OHa>@5#L_s|3#Y9$d&aqy&CHgE$gpl0 z6uU=Q47S1mF_@3)>3P&L0_3fHwtZmNRTS-4#EuW}Qu=7n+vHPY!+=)9=(b1NH+;Ae zH~LL(Fl|n5D3SgE7&~}9TE;3@cn4rYxF{<$zzzxdtoT)d60PhxM8vv)FTYm2QT~l} zB0;H!N5ny(_l8ZOdhtDRt{qO(MVIUZz}uWy@~ciw+%g-C)ezPF`L(4S|N{=mCV-^XbcN8Mi9N1NtZ*yYg|hWExxYWZ%Q+D#@|+>Zt;)A-MbCL zo6;Ddwmx7uiPYK(c#y)FEjEM2{(*dKoX5OA8pu#c|U!gT`lb88D*(( zo)cBmz_SlIQ(aFO(G6EPf6wO8)smUqYsi!vnVq``zS7!|v6Kpt z946!DmXd8L$ZH+C>Za4A*GRhJ>)S8)5<#EaF955;RpjGMxoT&*7*p<2KD^CWJ=IW{ zu*%a$2Y2e_7+(A0?JLKzbzbPpY$Ms%gQ4~rZogB2-`>kVRb1}4@8wMoH>|y@wX3cD z{|iC?DeT7*oZoo;UupZ#8sB?|h(?|O0LEwl0LA~sU94>}j&81^|K5I0PT|I8Zc~*^ z0q=`?%&=l2!2=&QN4N<>rgqh#=v%>=f}}WEJ*gzM@N6{k%Vxt?EDj>GRC!|0V<~z0 z@nt;qw;}aHtR&@+&n7t3+j%X+0j;=rTx>a`%M({`?S5u55#eeVhbl@U+DJRPcy1@% ztE&w@JxC?k+~*wcxI$v4C$lQI~^VSKvr<2A&0G$0H2O3PdPMM6burk-Y6|{FamVD|>8|QodG$4G zbBxL}JBp7G0sTFZJM98o;|GhPTmb>w8;pa3!WUvv)XcE8pBF2h*{4Y|(eX287F0v7 z8`5mBU1$&@q3IH2ifE-+_0|FT>|MCc7lwWk!@tmg+7Ya>vJdCTl_HdCHAFV7#gq*0yqA?CdP!QL7HrHA%Wd2m5aJK_#DXnH@)<;v1>;6cg& zYq#1ekSk@r|KT)i`wL=gAcO^WExu^@>zOutMr8>S%eD&-H zcrf_X#FOVen#Im61Kz#r&y-G=b;n(_zI&Bnh*f|PRI*rXuH4n5BKB0S3fCUt3BG>Q z%nqLE=YN9yWLbUGdBV_DYLnayK&GsMQM=XoD+#JWT!cm7jbl z;Vfq6t?!xeknuk6ZWvdY4CM%dP+|ODPNo(Pq!}u+1K&%aW9sRQ9u>%nrSUIS>>UL2 zt9_@O)fw%QP>3i%e*tXI3C>^m=BL54t3n?#G?@-v9Fw*}lvhWZiki- z9q&O{!3CQ8P`slhGZ|_Y6WR=zT|q>0`g9 zyfqb~*Gm|-F4ng8e#KTRM2^!feFQzDv4AF@Bvqfz8<*AJ4=8%*9cZU1UBugFN^9|A)S%~)L zqN~?w#Vcc|g9fKJl9uC7XzDjT&JoyP1zEHUEosFx=2XSGO)ujG19v;i9OD&B0Bdte z7pvTJ6*)@Yj!cOXjXBRthj@DRbO$ttD-j>>*TkJJ^9o3zH8H7w7Sb$nSG^M!d~r0w z%iMo5vjP)kJh67xl8|QX>q(cz@!2@)Ll1#c!G|GES_WG!c`_#IJ)aT2!b52n0!>EP zM+5H_P0dKZisoCEVO_;7UsTQLT=F(!TlR@Mh(3p0iZ+I2xOx%h z4u3k;BaEbfgmE0-`a8eBoBEEhTPTjn@3&d$h~@v#1V0C{|6JL@X`p`&^*8m%_Cp2X3?H#>um)?0#=cw!aah~6GU(faYbAP}0bKUpv`B))}c;pevI7Kl+l@Ty^C_#(hK~RR_)zy^Mp>Ar*T577=%G!9?UTrOc224W@szM?L z;tBXbJR}TH@|JI}&>9| zMh39`W^(y8nZrD==%mwu8oBrQx7GB2rhp@?{!&MWq!&S-;opD};LX-yZ)StBK%=g}VZLPGWD zXa(tT2;i?a+-j^3sx#!(7-6n(f^KVy_3FQ zOjJQZW1JaJUV1X(3n!bR6pqNRxUlT;kV*wGc|@gs#-SPhNQ~HYCw{#;OCyxUelv{i z9u!Tv)uO)x*=^BIv*0B z2O6;lrCezPg_V!7G6{V@k?S43DWPNjzQIHvypKEa|9v<$f1#xR{hZd>YR_-u004q` z0RZD)aD;ac^z`*n`D6R~8;m*l(q|OL6W)1nNxAlUHGcU+7^0UOOXxPk2!j%KrV4acU0n;#n!)Y?( zwNSkQjP(PzZF^5I=LG44^jfC!!DySC?xq%&@xs;j?@Kx~lSp}%vw4KA?U~Gsc z{>+C6laN`JBNebkqt!sJ>Qan;V5(%Yxowe!)Ooqm5v8OOB_B={f5KFqd8d=64Wzv3 zOHgkAq0DFM;PeWrB5Rx->MfWwbvj4h>0yVe2+>$0 zG0O7lgNYSI>V$Pk6EX_`j=o@UT4n@vB;zWZX~q?g$4Q6+llIfE!@jIdkB%*7OC3Wj zz#V9Q>;OzP9gqOHL1+qBp%AUbuHZx>!D+~zd%BgnlqkPkkc5Ueij(_PZp*sX3oRRpP;LW}LJAV9f(8m~V?i}> z_9v}iVrRP31h+cNpI?Y_im8ewcWUPo>NA7&?fm_O?TdP8`;+SljjSvoT62zqLIyO$ zT_;is-!2&@O7klR7mzsAu8V)B3C#3HRK1%hrv>Vbm0j4WJkcounf7iO5OP+kQ(Bob zJYUg|JcX8QdBr|(^<(05-(rVTmD)b@DP=yz9j|)=`cLL-I6kf%S>+aTJw^ynjP};( zw>ctf9Czbk$67mJ5iz5vxZ3Q z+dPP!MLCAau#=P7uxJM=(!SD%6tLXc6+>+up@=*`QtZN}tRyx_9kvkA7UA!w*WIB7 z^HQtsuEpox7wGANKaO4)j)KHKv0-l&)y+ErKB|`m30(4QNe7sp%D)pQs6Z66J$?z} zFV*diZWGRz4C4!bTpa5;%UY1PP=VdC{}Hx9O#OaZx=ePCE>}@0*0G>tNNbqtk4d-E zFG{Pq;K!>U43Um6!^b7xS?fysF!r`DY~`e0_;$zIy;apA%Y!NU_7bphL`xl-)c!4{v5)x(LYi!qH1SdGmYnl|ZY0X*vf$<&^5! z&(>vqJv0MO7U2@H$~dk?9$i#Zi?lYj^$}M9uUGl>xaCA}GwH$M5v{xLbmoTJyaTsb z>iC{(oZA8lK@MS2X~e9v#**3aRg3Mq{;h7RXjni4h{khSG-HsbbR>=XP?hf3EMA;) z0)9=?eAMkOSkt<@Fy9zGGaK3)WR?6RGVfJtSZYm~EOeUUK?g=Ch0Gy&g{$n6=u+|@ zSRiKYG%Fz6+9i~B*H$}m6pU1OO<`@?nZ9EyHmXMzBDwe+$Zt3h^5Ur$_X}YWPqJpX zcO>>5osFs<6{m$d?^+lctUo(~M4FFuc zIW~Qplt%Oz*O?TRSq`a!nk1+*=0x7kml}`WM>nI~vpl4C%yQkiI&d|S*7efPqq|L0 zvQe)=vRPEXMSN&tpA~|IRT%hV%4WPcUl)bEzHRD--=GKvb2H~>rr$rZ0{|3q>t^%+ z`NlG5W(L}g%xIQPI{$BY%Ph$Zs~eKzKPCAe(3M$|nLWQro+2l({!EqMNR&B0GdynO zR|Nh{{*5O1dk-*k<2MZoIeoJlKcYzHa=vf3b)NgSC}bkrCd>Iw>Ia?2jA$Doi@zZ9 YcVM$ZurmM<0NBd-D2y8-aIc?!2TIDJ`v3p{ diff --git a/q2_quality_filter/test/__init__.py b/q2_quality_filter/tests/__init__.py similarity index 100% rename from q2_quality_filter/test/__init__.py rename to q2_quality_filter/tests/__init__.py diff --git a/q2_quality_filter/tests/data/paired-end-data/Human-Kneecap_S1_L001_R2_001.fastq.gz b/q2_quality_filter/tests/data/paired-end-data/Human-Kneecap_S1_L001_R2_001.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..db834f3b8cf319529c7b53a0cd0f481876da5044 GIT binary patch literal 827 zcmV-B1H}9viwFoA{Y6*+14wmkVQwu;Ze?X-VQ^nlF<(qDFfm_JGG8z-F)n6db98Y4 z#gxmI+aL^v_j!sgF~)a{0h1Ua1B;$^<@bMlnVGhyo07y9NcttmANU`AuZ^+8 zFxmcPG-3We_IAM49xqzYy&dPtd>idgi{>Q_qAQS2pirEaFwCX6@ThRsrYWZ7B0zWYp>RgC zbepMqsd)tI0xO^%36l_Nk}j&C2gu4)xHLd52YN&bIRiL@>LPgLuCS1MP zf))4)>F%@>!Z>H86jF%U8LcX*XJI1jfIeZ+_Bz&H%sS><)Jv`UVKrhAT1;9fDb%F= zt)6TnEv;W&hl7yoA_$63zOPV=Ilm?R{hn4~vJ zOwyYyv;|@lUkto5nG1Af3dakRBv9}<${d{!-49Hj#1NC2&;`;Y)#Je=!48F@vsOMq zEu@`L<^uae_QVj2QU_(Tz_}HuNrEzjbBmN8^++gZC}wHC?27Vl&>o;onNK&BKsiJb z0M0+_^O3%G{dR2WUHV7;uVmBfJ%yANO<|7lT;4M*DZPe3%JH;tQ&48Sx;1%U(g4t( zl8?y&3%4Lcboc#szvJS7n?r8vy=tW2aH@=h-C%;Xc9V;&Y9g@k+1A|hYV0~|gk^t6 zZ!Or{rfuhW||D@$W>!3z6Z8$m1Q6@PSA=l&LQE_C};he}d%SM|!gG zinzU_?8K>(yN0Br^H5r!p3>_n2`NiQM-!R#q!l zEt}~kt%qw;mD-F+)};+OvI=SBhA-CMMkbxp+-l1jhUe%s@}Bc<=$ug1 Date: Fri, 6 Dec 2024 15:42:33 -0700 Subject: [PATCH 10/24] type map --- q2_quality_filter/plugin_setup.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/q2_quality_filter/plugin_setup.py b/q2_quality_filter/plugin_setup.py index 3caec6f..c7ce738 100644 --- a/q2_quality_filter/plugin_setup.py +++ b/q2_quality_filter/plugin_setup.py @@ -43,9 +43,10 @@ artifact_format=QualityFilterStatsDirFmt) InputMap, OutputMap = qiime2.plugin.TypeMap({ - SampleData[SequencesWithQuality | PairedEndSequencesWithQuality]: + SampleData[SequencesWithQuality]: SampleData[SequencesWithQuality], - + SampleData[PairedEndSequencesWithQuality]: + SampleData[PairedEndSequencesWithQuality], SampleData[JoinedSequencesWithQuality]: SampleData[JoinedSequencesWithQuality], }) From 1dad260d6fb43cfdd08438347ea8e58e4d106238 Mon Sep 17 00:00:00 2001 From: Colin Wood Date: Fri, 6 Dec 2024 15:43:36 -0700 Subject: [PATCH 11/24] finalize paired end handling, paired end tests --- q2_quality_filter/_filter.py | 42 ++++++------- .../Human-Kneecap_S1_L001_R1_001.fastq.gz | Bin 0 -> 763 bytes .../tests/data/paired-end-data/metadata.yml | 1 + q2_quality_filter/tests/test_filter.py | 59 +++++++++++++++++- 4 files changed, 77 insertions(+), 25 deletions(-) create mode 100644 q2_quality_filter/tests/data/paired-end-data/Human-Kneecap_S1_L001_R1_001.fastq.gz create mode 100644 q2_quality_filter/tests/data/paired-end-data/metadata.yml diff --git a/q2_quality_filter/_filter.py b/q2_quality_filter/_filter.py index 407ce5c..911636b 100644 --- a/q2_quality_filter/_filter.py +++ b/q2_quality_filter/_filter.py @@ -11,7 +11,6 @@ import gzip import os from typing import Union -from pathlib import Path import yaml import pandas as pd @@ -68,6 +67,8 @@ def _read_fastq_records(filepath: str): quality_scores.strip() ) + fh.close() + def _find_low_quality_window( quality_scores: bytes, @@ -337,8 +338,8 @@ def _write_record(fastq_record: FastqRecord, fh: gzip.GzipFile) -> None: def q_score( demux: Union[ - SingleLanePerSampleSingleEndFastqDirFmt, - SingleLanePerSamplePairedEndFastqDirFmt + SingleLanePerSamplePairedEndFastqDirFmt, + SingleLanePerSampleSingleEndFastqDirFmt ], min_quality: int = 4, quality_window: int = 3, @@ -365,19 +366,17 @@ def q_score( else: paired = False - # load the input demux manifest + # parse phred offset and load the input demux manifest metadata_view = demux.metadata.view(YamlFormat).open() - phred_offset = yaml.load(metadata_view, - Loader=yaml.SafeLoader)['phred-offset'] - demux_manifest = demux.manifest.view(demux.manifest.format) - demux_manifest_df = pd.read_csv( - demux_manifest.open(), dtype=str, comment='#' - ) - demux_manifest_df.set_index('filename', inplace=True) + phred_offset = yaml.load( + metadata_view, Loader=yaml.SafeLoader + )['phred-offset'] + demux_manifest_df = demux.manifest.view(pd.DataFrame) + # initialize filtering stats tracking dataframe filtering_stats_df = pd.DataFrame( data=0, - index=demux_manifest_df['sample-id'], + index=demux_manifest_df.index, columns=[ 'total-input-reads', 'total-retained-reads', @@ -387,20 +386,18 @@ def q_score( ] ) - for barcode_id, filename in enumerate(demux_manifest_df.index.values): - if 'R2' in str(filename): - # we handle a read pair in the iteration for R1 - continue - - # look up sample id - sample_id = demux_manifest_df.loc[str(filename)]['sample-id'] + for barcode_id, sample_id in enumerate(demux_manifest_df.index.values): + # get filepath(s) of input fastq file(s) + forward_input_fp = demux_manifest_df.loc[sample_id, 'forward'] + if paired: + reverse_input_fp = demux_manifest_df.loc[sample_id, 'reverse'] - # create path(s) for output fastq file + # create path(s) for output fastq file(s) forward_path = result.sequences.path_maker( sample_id=sample_id, barcode_id=barcode_id, lane_number=1, - read_number=2 + read_number=1 ) if paired: reverse_path = result.sequences.path_maker( @@ -410,12 +407,11 @@ def q_score( read_number=2 ) + # open output filehandle(s) and create fastq record iterator forward_fh = gzip.open(forward_path, mode='wb') - forward_input_fp = Path(demux.path) / filename if paired: reverse_fh = gzip.open(reverse_path, mode='wb') - reverse_input_fp = Path(demux.path) / filename.replace('R1', 'R2') forward_iterator = _read_fastq_records(str(forward_input_fp)) reverse_iterator = _read_fastq_records(str(reverse_input_fp)) diff --git a/q2_quality_filter/tests/data/paired-end-data/Human-Kneecap_S1_L001_R1_001.fastq.gz b/q2_quality_filter/tests/data/paired-end-data/Human-Kneecap_S1_L001_R1_001.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..c2a52db4149ccc5d9c59973ea01e6acd3956467f GIT binary patch literal 763 zcmVjdc#% zwuk02UeWyQ8fZn^$irIGKxcaQt%ctp(?g<+l*%B~U)CvN#+*eYnW;%MVx^ogS3e0N z`H8zuQC}%#%2|@3JhBFelvBx5lZr+2X+f!~=|m_75kx7yVChFfYB!~ZCM#O3eac$| z>5xh>nNe-B6ipgke6x0>Eom!M=N!O;bJMcMNX#iP2Ayz>Aw-TLh9WT0Mm)>27b|R=Xol!Epf-=8}!f-tw3k8?U|fk zq=co>Sl#;$#z3>IKst8Mon#$c_hz?o4o~CCll01bbZk1`y>!<328IDQKe+F`pV_oe zKTa+zp&yRJ0E0JvTL2n+I;?+ZvNMAjpf{tvGD$m=cw-W;Os0HeGVM%ap`~ndL8Ab- zgQl1je~Ha?b7fzPP*Ja>Qu%AFGOpCtEXrwtPs_c5pYS)j_~J*T-gvU7?% zrOm`<>glRXYQ0pt%7sa*XDNjhacb>U$5}QrJun_}y6{#bm~T|Z z$3m&psibNYzc)&jaFZwf!jZyVb0vDFc~hcU2k2>-PSXjHx2j@v=tWxA=4mEqJn=%q zNsgpn7T25smVonPOt#|){i!kk#^k>D`HPq=_g!VOMtlL2_#c?WkC?P_O~!!1H5Vq& zXR6m1f5P1(Jn|=!(sc(D^E int: + zipped_headers = zip( + forward_record.sequence_header, reverse_record.sequence_header + ) + + diff = 0 + for forward_byte, reverse_byte in zipped_headers: + if forward_byte != reverse_byte: + diff += 1 + + return diff + + def _assert_records_match(self, manifest_df: pd.DataFrame): + for forward_fp, reverse_fp in zip( + manifest_df['forward'], manifest_df['reverse'] + ): + forward_iterator = _read_fastq_records(forward_fp) + reverse_iterator = _read_fastq_records(reverse_fp) + iterator = zip(forward_iterator, reverse_iterator) + + for forward_record, reverse_record in iterator: + # headers differ in one position to indicate read direction + self.assertEqual( + self._get_header_diff(forward_record, reverse_record), 1 + ) + self.assertEqual( + len(forward_record.sequence), len(reverse_record.sequence) + ) + + def test_paired_end_sequences(self): + demux_artifact = Artifact.import_data( + SampleData[PairedEndSequencesWithQuality], + self.get_data_path('paired-end-data'), + ) + + output_seqs, stats = self.plugin.methods['q_score']( + demux_artifact, + min_quality=10, + quality_window=5, + min_length_fraction=0.8, + max_ambiguous=2 + ) + output_demux_format = output_seqs.view( + SingleLanePerSamplePairedEndFastqDirFmt + ) + demux_manifest_df = output_demux_format.manifest.view(pd.DataFrame) + + self._assert_records_match(demux_manifest_df) + class TransformerTests(TestPluginBase): package = 'q2_quality_filter.tests' From e3a1fc6ba652bffff98c5796a677c389dc30f300 Mon Sep 17 00:00:00 2001 From: Colin Wood Date: Mon, 9 Dec 2024 13:44:07 -0700 Subject: [PATCH 12/24] paired end tests --- q2_quality_filter/_filter.py | 34 +++++------- q2_quality_filter/_format.py | 4 +- .../Human-Kneecap2_S2_L001_R1_001.fastq.gz | Bin 0 -> 205 bytes .../Human-Kneecap2_S2_L001_R2_001.fastq.gz | Bin 0 -> 270 bytes .../Human-Kneecap3_S3_L001_R1_001.fastq.gz | Bin 0 -> 289 bytes .../Human-Kneecap3_S3_L001_R2_001.fastq.gz | Bin 0 -> 270 bytes .../Human-Kneecap_S1_L001_R1_001.fastq.gz | Bin 763 -> 291 bytes .../Human-Kneecap_S1_L001_R2_001.fastq.gz | Bin 827 -> 270 bytes .../tests/data/paired-end-data/MANIFEST | 6 ++- q2_quality_filter/tests/test_filter.py | 51 +++++++++++++++++- 10 files changed, 67 insertions(+), 28 deletions(-) create mode 100644 q2_quality_filter/tests/data/paired-end-data/Human-Kneecap2_S2_L001_R1_001.fastq.gz create mode 100644 q2_quality_filter/tests/data/paired-end-data/Human-Kneecap2_S2_L001_R2_001.fastq.gz create mode 100644 q2_quality_filter/tests/data/paired-end-data/Human-Kneecap3_S3_L001_R1_001.fastq.gz create mode 100644 q2_quality_filter/tests/data/paired-end-data/Human-Kneecap3_S3_L001_R2_001.fastq.gz diff --git a/q2_quality_filter/_filter.py b/q2_quality_filter/_filter.py index 911636b..6390505 100644 --- a/q2_quality_filter/_filter.py +++ b/q2_quality_filter/_filter.py @@ -10,6 +10,7 @@ from enum import Enum import gzip import os +from pathlib import Path from typing import Union import yaml @@ -387,31 +388,18 @@ def q_score( ) for barcode_id, sample_id in enumerate(demux_manifest_df.index.values): - # get filepath(s) of input fastq file(s) + # get/create filepath(s) of input/output fastq file(s) forward_input_fp = demux_manifest_df.loc[sample_id, 'forward'] + forward_output_fp = Path(result.path) / Path(forward_input_fp).name if paired: reverse_input_fp = demux_manifest_df.loc[sample_id, 'reverse'] - - # create path(s) for output fastq file(s) - forward_path = result.sequences.path_maker( - sample_id=sample_id, - barcode_id=barcode_id, - lane_number=1, - read_number=1 - ) - if paired: - reverse_path = result.sequences.path_maker( - sample_id=sample_id, - barcode_id=barcode_id, - lane_number=1, - read_number=2 - ) + reverse_output_fp = Path(result.path) / Path(reverse_input_fp).name # open output filehandle(s) and create fastq record iterator - forward_fh = gzip.open(forward_path, mode='wb') + forward_fh = gzip.open(forward_output_fp, mode='wb') if paired: - reverse_fh = gzip.open(reverse_path, mode='wb') + reverse_fh = gzip.open(reverse_output_fp, mode='wb') forward_iterator = _read_fastq_records(str(forward_input_fp)) reverse_iterator = _read_fastq_records(str(reverse_input_fp)) @@ -471,15 +459,17 @@ def q_score( reverse_fh.close() if filtering_stats_df.loc[sample_id, 'total-retained-reads'] > 0: - manifest_fh.write(f'{sample_id},{forward_path.name},forward\n') + manifest_fh.write( + f'{sample_id},{forward_output_fp.name},forward\n' + ) if paired: manifest_fh.write( - f'{sample_id},{reverse_path.name},reverse\n' + f'{sample_id},{reverse_output_fp.name},reverse\n' ) else: - os.remove(forward_path) + os.remove(forward_output_fp) if paired: - os.remove(reverse_path) + os.remove(reverse_output_fp) # error if all samples retained no reads if filtering_stats_df['total-retained-reads'].sum() == 0: diff --git a/q2_quality_filter/_format.py b/q2_quality_filter/_format.py index ac03d03..2a0a159 100644 --- a/q2_quality_filter/_format.py +++ b/q2_quality_filter/_format.py @@ -33,8 +33,8 @@ def _validate_(self, level): _ReadDirectionTypes = Union[ - SingleLanePerSampleSingleEndFastqDirFmt, - SingleLanePerSamplePairedEndFastqDirFmt, + SingleLanePerSamplePairedEndFastqDirFmt, + SingleLanePerSampleSingleEndFastqDirFmt ] diff --git a/q2_quality_filter/tests/data/paired-end-data/Human-Kneecap2_S2_L001_R1_001.fastq.gz b/q2_quality_filter/tests/data/paired-end-data/Human-Kneecap2_S2_L001_R1_001.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..b3d1f5982b7bfa0a4cf4e55642b27edf414ea61f GIT binary patch literal 205 zcmV;;05bm{iwFp2ms4i|14wmkVQwu;Ze?X-VQ^nlGG9zEFfm_JF<&q+F)n6db98Y4 z#gDNL!Y~X(_k0Bt3qo>|nk1fTdGLT>^8bGeb}8+MkPw`#Eo6KX`I$R4;&^@kAp4e#-Lz5XZ{W HZ~_1Tw@zF@ literal 0 HcmV?d00001 diff --git a/q2_quality_filter/tests/data/paired-end-data/Human-Kneecap2_S2_L001_R2_001.fastq.gz b/q2_quality_filter/tests/data/paired-end-data/Human-Kneecap2_S2_L001_R2_001.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..60b4683d1896ac2248657ac9ad5dfd66b21ffcb8 GIT binary patch literal 270 zcmV+p0rCDHiwFq7mQ!Z{14wmkVQwu;Ze?X-VQ^nlGG9zEFfm_JGG8z-F)n6db98Y4 zjgdiW#4rp+_dZ4U^?;L%6~&oSt75E0X;-@cS>`*?E;0>`FtYUZ4SKkKrNqlphG8n{ zCoYmd$5aNW35hwUGUjP{8OwJ;8BEl1)XZEQHIllyyY+#pC<=!G0(^pd%V?1?Hxa?e zG`t3`%=D)EfrmA!)Z0q2*LQtyw8oX-i1AWu_n?&eSQ|obI~Ug1UmghEu!+9$Xb(G#c078Db3DU5mx)r||Iagz|H&h# Uc$Rjw`FW!705+oik#GV40Pm-MssI20 literal 0 HcmV?d00001 diff --git a/q2_quality_filter/tests/data/paired-end-data/Human-Kneecap3_S3_L001_R1_001.fastq.gz b/q2_quality_filter/tests/data/paired-end-data/Human-Kneecap3_S3_L001_R1_001.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..7b373f3de6b08c68351f97ba1aa741604fbde1fd GIT binary patch literal 289 zcmV++0p9)}iwFp8nNw!~14wmkVQwu;Ze?X-VQ^nlF<(qDFfm_JF<&q+F)n6db98Y4 zm66SEgD?<;?|zEjQ6wAVVlRcTjDHSQd-D4~OV=bMQq(Hx%7XFCAo~;Rm)1T6GRBfV z=EUx^*JQ-i^6=hi>YWX5hQ3LpDyWRI54f$`By;6jWeZU)5=T%pw zPMth&z%nn+8#!#C4+>#EuPGtt5;Mmpa!N6#oMMXmCIcJt(siAcl(tP=sP#Ed?*=!J zb)&!W;L5{y9=`FEdgm!u9!BelUn=iBXnhGFJYG_ixH8Yi&zG5ya(Q7MO0V&laU}b$ n-T!!6qX9hU@$!38`*?E;0>`FtYUZ4SKkKrNqlphG8n{ zCoYmd$5aNW35hwUGUjP{8OwJ;8BEl1)XZEQHIllyyY+#pC<=!G0(^pd%V?1?Hxa?e zG`t3`%=D)EfrmA!)Z0q2*LQtyw8oX-i1AWu_n?&eSQ|obI~Ug1UmghEu!+9$Xb(G#c078Db3DU5mx)r||Iagz|H&h# Uc$Rjw`FW!705+oik#GV4013r@!vFvP literal 0 HcmV?d00001 diff --git a/q2_quality_filter/tests/data/paired-end-data/Human-Kneecap_S1_L001_R1_001.fastq.gz b/q2_quality_filter/tests/data/paired-end-data/Human-Kneecap_S1_L001_R1_001.fastq.gz index c2a52db4149ccc5d9c59973ea01e6acd3956467f..a74e913d9ae2f4faa36984b002ba65d5a34f492e 100644 GIT binary patch delta 264 zcmV+j0r&p<1)~B8ABzYGby8Pnkq9M!m61zsgfI|9_dZ4TXe1lsU^juGDSsAGvhw}U zk~YJ9q(quY%RqHiA@s!hrL_-%jIpGzb7J?|Yck?$xq0t2_0EPjL*FD)6;wvq2i#6= zlDTrNvO-jg#E}%WaJ}^8CCHI_liVJvQzy?Gu*{3|Mm{#s2Zb=7*OZWRiJ4<}6FH?A zQ%*6(eaXOvymVb>C8cc>7ixXZ)4Rb9WZmd*JUDpx&cipJQtv$F;9<0$_@(mBgVvV- z!s8`Hi7WGL{Ct@SDVG-}3(v#)8jl%AvhUjckEb;nz;hlizb8eGL0@^M3D|k^asA2i O+fqN-m@wRM0ssI4eS+u! delta 740 zcmVjdc#%wuk02UeWyQ8fZn^$irIGKxcaQt%ctp(?g<+ zl*%B~U)CvN#+*eYnW;%MVx^ogS3e0N`H8zuQC}%#%2|@3JhBFelvBx5lZr)u^Jzh; zs_8^11`$Lly=rsyRF#UM+4!^E4aV%I6%wgLBie#z@R5Fb17)j3GphA%-F_ z(MCMWvllCDn`nu`i(MV91kYxFT>)3}9Ei7=W-$k8Cep&`GDxOOH#x;LYEAJKfz!fB z@eEc!rBrmDS1B!V$KxCH&=IXbXR_^?oL{7brO{a3`wqrHv#dZmcF&z;9bETjw{Z?n z7+GlLnR zH>15WNjsBxV-l}SrhH>E?Mz~!rEGISqX4&qrkE9fiOqF$WnYU>QLm&@`D?5)uGH2n z%4va5%NHniPP5YYF{k@kpvvSur@1z=bBa5sl{Zv%sh>*|x=OL?K`Wf<>8ea>y;Qo& zg-NStDTNhrYVB0VSvE6&Jun_}y6{#bm~T|Z$3m&psibNYzc)&jaFZwf!jZyVb0vDF zc~hcU2k2>-PSXjHx2j@v=tWxA=4mEqJn=%qNsgpn7T25smVonPOt#|){i!kk#^k>D z`HPq=_g!VOMtlL2_#c?WkC?P_O~!!1H5Vq&XR6m1f5P1(Jn|<~lG1et6Z1R2>z@-{ zwH)4s8`^g3{p~A+O5PCmay$3fJ7rSu`T>GjDK2{Z&odS#oyTrCc^Ml(h?CcZoQ9c~ Wz)RD0Z3_W;Jp2I*w8rzi2LJ$;2V|E3 diff --git a/q2_quality_filter/tests/data/paired-end-data/Human-Kneecap_S1_L001_R2_001.fastq.gz b/q2_quality_filter/tests/data/paired-end-data/Human-Kneecap_S1_L001_R2_001.fastq.gz index db834f3b8cf319529c7b53a0cd0f481876da5044..668164d4643801fc6ac9f0c17777ce5389b2eb1e 100644 GIT binary patch delta 243 zcmV`*?E;0>`FtYUZ4SKkKrNqlphG8n{CoYmd$5aNW35hwUGUjP{8OwJ;8BEl1)XZEQ zHIllyyY+#pC<=!G0(^pd%V?1?Hxa?eG`t3`%=D)EfrmA!)Z0q2*LQtyw8oWU;E3^3 zYxkg)8+=st-WoPwC238Ci04rM=Na=gU(1wd-tlyt9gmu9JXjk-Z#x&(*Iyn8-LQ$i t@n{b_jCMSFigP@}JeP@5-v7@tkN?Rdr+AijwE20W@BlWV{gH42007xBcANkJ delta 805 zcmV+=1KRwK0=otXABzYG8U00Akq9M!#gxmI+aL^v_j!sgF~)a{0h1Ua1B;$^<@bM< zN;q>lnVGhyo07y9NcttmANU`AuZ^+8FxmcPG-3We_IAM49xqzYy&dPtd>idgi{>Q< zIPeQO=OY|w;{}d<97iSxKmarj48R493wSSAoZ=PzIfQ^N@?sQ_V5T6_q5N=v>Y^)< zPM}bnmN3kvxbUcO)}|?@vvixOdZ~E?>H;gE9to2WYLYIhpa;mxRJb%i zEeCo;3ONHwwS=E(M@b`#l=CBoemK3<7wxrHSAaV{nimkfTa_R?!j{x3R)rBlG~QDT zPJ_cZd#Vi8(QS|};hr2MU(dbtSaVHeyFhKreBtrVP3^*9JQciT8J zCc8CdIZbs{aVAo;onNK&BKsiJb0M0+_^O3%G{dR2WUHV7;uVmBfJ%yAN zO<|7lT;4M*DZPe3%JH;tQ&48Sx;1%U(g4t(l8?y&3%4Lcboc#szvJS7n?r8vy=tW2 zaH@=h-C%;Xc9V;&Y9g@k+1A|hYV0~|gk^t6ZzT&!^;PM27!NUm-$#0~@QS#-qwgtnzr~Q#W8L{H9>*y{p>KJE zdr2Y2IP2A1;UqHow5Ku_u=kj86^Y#UHC9$DS1p_ACas5SQkB|_N!FzeIkF0A Date: Mon, 9 Dec 2024 13:44:56 -0700 Subject: [PATCH 13/24] package data --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 7ba04c9..038979c 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ }, package_data={ "q2_quality_filter": ["citations.bib"], - "q2_quality_filter.tests": ["data/*"], + "q2_quality_filter.tests": ["data/**/*"], }, zip_safe=False, ) From 5ecb2648eea09aa1f9193851b7be425d04c48262 Mon Sep 17 00:00:00 2001 From: Colin Wood Date: Mon, 9 Dec 2024 14:01:17 -0700 Subject: [PATCH 14/24] package data pt. 2 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 038979c..843953f 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ }, package_data={ "q2_quality_filter": ["citations.bib"], - "q2_quality_filter.tests": ["data/**/*"], + "q2_quality_filter.tests": ["data/*", "data/paired-end-data/*"], }, zip_safe=False, ) From e174ffc5f3d0153583af557dfbd8fbeed192f4ab Mon Sep 17 00:00:00 2001 From: Colin Wood Date: Mon, 9 Dec 2024 15:53:59 -0700 Subject: [PATCH 15/24] gitignore qza override --- .gitignore | 1 + q2_quality_filter/tests/data/numeric_ids.qza | Bin 0 -> 3945 bytes q2_quality_filter/tests/data/real_data.qza | Bin 0 -> 4576 bytes .../tests/data/real_data_joined.qza | Bin 0 -> 4880 bytes q2_quality_filter/tests/data/simple.qza | Bin 0 -> 3870 bytes 5 files changed, 1 insertion(+) create mode 100644 q2_quality_filter/tests/data/numeric_ids.qza create mode 100644 q2_quality_filter/tests/data/real_data.qza create mode 100644 q2_quality_filter/tests/data/real_data_joined.qza create mode 100644 q2_quality_filter/tests/data/simple.qza diff --git a/.gitignore b/.gitignore index 66446a6..874cc3c 100644 --- a/.gitignore +++ b/.gitignore @@ -66,6 +66,7 @@ target/ # qiime *.qza +!q2_quality_filter/tests/data/*.qza *.qzv .DS_Store diff --git a/q2_quality_filter/tests/data/numeric_ids.qza b/q2_quality_filter/tests/data/numeric_ids.qza new file mode 100644 index 0000000000000000000000000000000000000000..d704134097901297da85d0c2f9f45c519aa88ed8 GIT binary patch literal 3945 zcmd6pc~sL^7RM8!5J41KOl1c_qnM3^eT@(y1d>2NDB?g!Si`0Skwr0KDTWpl0xp1T zVY3#2f(3*T&=HixuvRFFEGmi%t0*cu(bj`a$2r(Df6V)xd)|5f-0$bUbMNnc$pp|E zMF0Q*225hj2}907{)a^;wkY&PK}Q)H)AZ?vFuDoSABOSsH-Y&gO{g$B9p#VqMbXhF z27ZW8KPHt&Z8f{vk3LfuAL{q$h`1aN}Ex?9G_s0|WHxT7w-!kl?gU?cw*J|t7i*upi zNko~2g@q_L|BPWr9KI*6Y+mjnD`KUCTkuk&f0@q-7^k-TrZ@?&ed9`id%ENJEUl zb7X?cHHwEz`}|Dr&&BQS85+{+`Y8ITh*Sp@5-B45)Ac#sl7f1Pcs)g7_jTMzls@_o zY~+8!9(?*GY)ujNNNXz@0jp`?ohbSS>g(Gd@Q#;SUY!SXM!1u_iV7<#W~k1I-?^4d zd9(1+QXX6WdF9<@K8M8uiWNWsFo_fqc(f>1-Nzp<-U&;@@4-=A^@w~}Cw@TS*n{ul z;{^o1(*XX)E(a(04Zd(fzZ;D*KzFmqvu%>ImXeYwv)GhbYln~IWJHU@cG#5)^BQQf z3^b5_0ko|sST^GPS}{auRa}jd2v0#2hN4*2sC`6SnYYl8AK{%)2d5qZiJ(V>)wm!H zs~$**34K1(A7)}zLJBYI5ELA`q9k``;KzS3#_ev_KJ!9hZYaM*B#%(Pk zK544?T2`ZS(wJJV@V5*@+cWtP?YyOmnX7N3xziC6R#@p7iM(Y^Dw#g=qqDpuN3-i zbKexknBs=rj;lB2yFbt>U3(*qU1NSjI&U)Qn-u5$<Um)qy}9%_DSi9oI7L@5$4~2C;0{sgexm6H6WDErf|7A55Ig zocnv`enlPZ(<3c8D56oB?{Ez-~G?&|H<|w)|);0%B+O z{YZh=@{7I~^$;oQu@=VA3vzd@--z4B!8jwMg!s8$K+JR?!FHk?ycTL-`!Yd@b3N8w z;`-egG|)p*Db8A>7#TR;RH&1l$k9W?k8hBL#7-{HU(7mnPa#%Qwoi|NCpZ93@7OoI zt5t8p{+`Sb$)KZc=eu<7g=9^DQoMYQRFl-=)gz~%V1#>27x*qHpe;7a`6g(Xbo8pG z8UA8h=C|DJ5Qrbsr8kFSzpO7?J7AW*(GhEE)`~e)T(mWGGO4hcC{t6b#+8-yW%68i zGw^;bKJHr2n1Pdd5zYVTYC5;0(RjDt7_-iqhG?v|ZdD_`&AC>m=3iLT($Q0YzjaPc z4tB8kCnKPyd=LMms@Kk5Kc+V`R*EkhL} zA-vq12;VDIU*PqJ4Lw$K=c$Hq1fkn}vRnL)4$d^fD_*NePzvrHa3vJ6z-l$hM)L^J z9U*t4YxVL|-P}>4XXQ4_2J)p$_27}(WH$!5Ak}pJGOQ9CwQgi@w~x=_PZ))Yu@r>b zZGSnL)3ZB&{UH?oev9chT<@u;mm6H zME9ph0h^t?u7#0;Zx+$sW_C5Mn`x9Zym~Jo{13 zn#!TXL8FB;6Bw^MCrgs!cj1TI&pv62>1$LwFg)F8d<)h@j6ZCoau=I*OGP*OE%2rq zgqKO7KQ1llmMrT%nrWA=T6>y5A@D+SBKnAFH^NffX6>o8GfS}X@B2F}XniU=%$j{U zxzM2=WKns?;kw7B6?-PFNMW7k_Fj9*r=wivmz1os%@VdrZ8)p1{cQ^ErL`XJs(=h! z^UNe)zfb{Akq`fE4{7?DAyeT|*mEVEli^X9wH{!~Ah&;I{BXb{n-TQfB>u6qw>cZwD$&O)R59MU zp-V|l!+TWLILU33{H3x*bKE6H`mogT{OfRmsl%93h*O^vVLDi;-cb4^pof@R_9lVezK+F`Ndu2N`40DYl>Rw zguga|xPknn!DIEWR^v-{QoNkcd+vj=`?Pqh@-?e0=QF7E!ou5qqwlRdqr3m?{mq#>cjhHJPgE9An{UoxQHaoTFTl2CT58cfl0w}aG0o-l$eOPB|=I<+*;TP?`eUv z@U#%RZsFv}(^^0q2&eNOd#zaw5wjCa#jEgA(gX#$NzUpC+X<2u-V8d_?9A+{arR=S zLr=%bs=3HC`G{e$Q>2UP#XW7di&?9IB}{|%{*si7{$o<3+Hj3h*-*JHP{6et#}3-) zw@>iD2Agt@(X9I|%|#oZ5EN#ek}cc@YW+n>qNtaT(4he6zv8EaV3f6WP?~7S#>R%1 z9(-ICs`{=8s`gSFKB+3G`c?zhF5FVA4M)JmdPTcLJJe7x(NPfK8?V!00*UwDAR`Ys zYmSn46hD)|qvQ!|$fHpz$~w9-_o}ID;B-nWUy0<;3(^k8P^&B$(au=j>;!|(=MvvVQFAh;AF3mzr*GFBut)V-8x zXl?vD5+eLf9?hY=q$~45rw(N?9ESKW$@^9$PLA0^O?;{voo$+Gf@oNQHe57o>iayF zs!OYKW3G$UkpTczhk^bjB0Xmu-V^WcWao^>nd!jI)ZuWrnYM@-94;bcZQCS ztRlO#%N|BWwIoV30D#@`7LI0rRPY}^0NUbO-6zE9l(+W7p*mT!@+~LzYxA|=N`|Q; zNH`S7rHU^HW>JGQ#HZZl4M;&nZnAOtxD!IoC6#+#dpm&DQc&H1?cmDY@>IrHUpA9PPuW|!NQ-leridYR}vsUwLQTi#aD@gCMKUDfJl^t4bl zNh!*)6{7Z9C7rxwZ(!JIB}4oEeR42TGeh(1#)6INtLtw3l{_T~I*G^Pff7r7ocFk4_N9A)tL6b+0na!qF&8FspI8PJW!mzeTON|K9DdAb z&YW{!X*ju9WvO2N%FC)d7a^yyCeMqIjIT^!vJY_!FU)9G*1-LrFpUM{CnubycI1bB z<-`IcN=Gh#H1j7h4(~<~Da}_bxnVa}+>R&XvSTsNVm|ZD1Yfu;ZgjeJ$^K!?d1xVJ z8!c`@skkCqBh)sZ1!+Xzqa>bd*@o!74=tM7LUeK)njTPyqr{ztIQoenZeA8gv*$fZ z7dS?UXV6)Nj#62VQVn(ti~&J}-#x4SS$tl+z)_{O6$g{^x%ria_Ueo2p06rVV5^hQ zVxW*yAm~l@Q%4UwolaoRf4f8hMHQBd1o5H>fL&XPD9e8JH-URWOu7|{U_-1KnkP(R zg6Bg5Qyo8clv;#$tdRUtsAN}z)W*aFi1ped2JkyMR>GXR+6d8amVs~Vc!vOUP6%QU zEWpoQ7I3oh^|kw|Y-w8X0ew#EB?7HurNHC$dXM`qMN=~`0;QQYb@SaqO?w;RfJo0C zR8kg1nwPwh+o}udykR4occVrsRY^w z=-YZFtPqroulacfvL7o}h1!y`nX1=#3pCPZk)a#BK0F%Hm~4lQ@nNM`T!OZ|oeq|{ zsQkMK_8m6tl2aN9P?H`-e$~E?Lwb|(1r-O9!$d5KYgSa@(QSsaPdFzq<7ww_O37z| zn8c0Ctz6m=9Y9QzuJ}v7#c!|APoAuFb^2=T%-DTso!h_G~9wa3J zoS&^+EV$O@z7=qD;c9_ZrJk%naYaF&b&n;p!1=!EnYG+buA#HVjs+WrJach$uFjT(-BaCfxEa!p*!{UgH|n?s{qH zQsPa_y_MPLuxN>jbs^@?-?dD*S}>JqARh0K+PaJ6F02R~i7ysRKc!#aP_QitE77Dc zuR8%U0KSc`Y*y?Xru3R8SC|{U6zCBghy}G^&Ko7>?m9D-Ru&h)6Y5 zMz#!a!LAz9DOhS_l#OFG=;O%Z?CJ-g?d$y}kzK^cyDr1+2G5oRYWQGQq=wRMVjhOu zXo|ZzukeMcm2NJb^o~b=#j*yRpXjk-{oeB(vPD}vCG(8BVC}=`@Uqfm$E;d3*i1Q< zom5yNUvz&*o=5ETw$cCvnduG8av)lU}yDmNg%B|s>oL1Xv)b%Rf zy;Z(@pZ50`M4neRxAiG8g%`c9o#CS_#ZF-DN zW{B%HTV>h_s&v$XJE})hxSb;>h5OEq#{<*OXE;40YpFa}<@E>RHZ~hWT_M;<>b_fu z)7h;#4LmI#bz#s}I=gO!gVBQrrLeUyh)@+I7uP4Yn&+L)^y#NPyrfFN`CHqnKq0g7hOq>jOLL`m7sB%>+ARK%qR%sTI9Q0 z5!U4%JE2GX$fo#pbD>Q`>11MBXZSp&x#IBZtdGCLJ!hALQV)ZT5HH<4tH5HIP z5*z=Jkj2;$pBnk&B(c2<(j;VFuvqqN(ybtY@qj{@iAND#qU!vtidE0CUJ_sMcp*6( zMHb7ZRx~ffEvv_V<({5-PH>u!HdbIEd67)pcTe4rPrNy@g9gvMjLp)MV2|Ogr@x(c z_Pw_ucRk)bk@n#@nJ^~3xTZ}O-eJKl`B)>z0}~nn@&LzBfHMI=mp2bYmMQZ-uu#fo z#yJjlMhujkKSBF(vUZYMFmTo~zSvl8q-^#J|0sWK(#D2)M6Yq6zt?odjQ@&q(R5F# zaBY{|u`l74whZWJ=^7uc9WE{~-L8?(bWuTpomR5c&UG0?(HD-k2PY*&a&!K+B$G;~ z=QWhRHQ(6c6_JSR;xBemp1E+Y$MfpMD0%(1MkLAIe7gbQ??T*Udw_1^mF9%||jx(Oz)Gen*lcn@?zhnI!M{6z0jOutA zAV-_nUwaQ~9sf2Wf#$;dihYhh3dNUzmv-O>EMX4$eYpi`PH ztsF6O&sUs*P5u>e0b4p%K|aKo>E%T?*(Q{0oA-G~yqV@XK*%v$_2c7nM*a$QA5F2r z`-wdqRr}JQBG>1js8f05>#3}Pk)n1r>fvcd9U(C9n-`U#@HBmztu(ufP$AY8-f#=E>AOv@uV ztWeuti~IU(Xx!_kK{AY0pPcVZg!B$)esv|aqP?ofse9j}-*PSBI4*1^(eDj}$R@sI zw)<_7F^KA2MFMNLKFctqWA9>Aa4fmz!vhcC>OK?SkV^cSE6GKd3MyCA$KecDt}U5q zdE6n!^o0A$$;|A4SjBZR{6o8?tV>xbek=>Km9xY-;Lrw(E)(8faagY|5u;+9{ERB@K0p_0D0g~-uy4d z@O%6pd9iQ!9H3vp|CXM=X6HvX>`#ugfV;oM@iV6Edwo9|X})onQU0X9|1{wIo$C7{ xIGQy6TxkK}RKHI3gL?SUEb)zm?%1!B{QG2~rAAJ1gh_SyNrM4^DB7dbKL8~KKG6UG literal 0 HcmV?d00001 diff --git a/q2_quality_filter/tests/data/real_data_joined.qza b/q2_quality_filter/tests/data/real_data_joined.qza new file mode 100644 index 0000000000000000000000000000000000000000..2aab35c91bde6cc2910dc991f2d0029bdd4f0850 GIT binary patch literal 4880 zcmdT|cTm$^mktCZfFfOxDosif0wkdbB1lJyR4Jho0Vx6@(xjspq)N2_QWONFigb+h zrZh!LC;;P&Au;xNPy1BR4_Ng!>+AZSUL1OzUPhC!^w>}(-+Fr=-twKP&f90?b7vBy~3 zS!1lBKGrVI{GBB?q5!U-_`#G@- z!Z}e+>orL~3eFClE#ez^r}4ZTVTJMRE1bBEfBBFbO5$hb9WB9+VR%XYpm|YYukH+} zX;*VwT6IGH`8uU{c$j2Yt6#RJ2hh28GvRI*=fO zXj8ARuX`E7h#FvxcWq$JSES_!s3i30hLuJsm4lCL1wQX~`CZnqd^YKmvwyc207CD|+PJeRI6%;b`QSMK1KyB@E& zIJ_cEKWL#_A~TvI-tdJC2Q#4+N=0ipiB`O_F0ii?*e$B#N zYP)!C?Cw5eZF)=7E4h}=*8Q`Gm_*(T=^u6EGxC8qkpTb=hoSwCfDB#j>@oHpE{?ADc9!}u%PTM#%u-Lx z5(X24qOCnK?$GPLBn@PjcDW)!phXGXH2~1V-rCvnS3Cdu0MMP>=`n?*Ro?m$1=i1- zQ|O>KZYK25^Wj->ZLh@;->+AYGuf3g-^qDQN zS#Gm$jZjjvuOWENdCeSLwlSaFyxjTxS;2gD(+2~l`uu9g%6l|U2rra=R#O7p+$Ky# z{}#cZaz(eF5o4`^dRkWC0HyL;NuUpOx@OXAD@QXhkP(j1d9Jg+zMwBJbS@LO_r)*L z?76FXVbr%r{$1_}Ic4)kj2jWL6lNVN6GcwY@N3vh?P? z=@fsg1o-TT1F`yFu>-yqb_;Z1tKI|BvujohB+kv8D&~b`%J=<>A4Sm1C;omO%-OgxzQyh1IeRAH;9j{bYt+exlKf2b+Sk;TKeq zX3U+7PN@%g!KD=4Gu@60X6VYBkiAe= zgWAhL6H`lV{z%Cw{tu5>n}kd!sKf*&OUW-qNcFWyuTM>pa(F*v0KSvwz<#-+Ife_E zV-TD?*(1zWa7TOuC@jQR6-?h+=N+fPnXL;OHa>@5#L_s|3#Y9$d&aqy&CHgE$gpl0 z6uU=Q47S1mF_@3)>3P&L0_3fHwtZmNRTS-4#EuW}Qu=7n+vHPY!+=)9=(b1NH+;Ae zH~LL(Fl|n5D3SgE7&~}9TE;3@cn4rYxF{<$zzzxdtoT)d60PhxM8vv)FTYm2QT~l} zB0;H!N5ny(_l8ZOdhtDRt{qO(MVIUZz}uWy@~ciw+%g-C)ezPF`L(4S|N{=mCV-^XbcN8Mi9N1NtZ*yYg|hWExxYWZ%Q+D#@|+>Zt;)A-MbCL zo6;Ddwmx7uiPYK(c#y)FEjEM2{(*dKoX5OA8pu#c|U!gT`lb88D*(( zo)cBmz_SlIQ(aFO(G6EPf6wO8)smUqYsi!vnVq``zS7!|v6Kpt z946!DmXd8L$ZH+C>Za4A*GRhJ>)S8)5<#EaF955;RpjGMxoT&*7*p<2KD^CWJ=IW{ zu*%a$2Y2e_7+(A0?JLKzbzbPpY$Ms%gQ4~rZogB2-`>kVRb1}4@8wMoH>|y@wX3cD z{|iC?DeT7*oZoo;UupZ#8sB?|h(?|O0LEwl0LA~sU94>}j&81^|K5I0PT|I8Zc~*^ z0q=`?%&=l2!2=&QN4N<>rgqh#=v%>=f}}WEJ*gzM@N6{k%Vxt?EDj>GRC!|0V<~z0 z@nt;qw;}aHtR&@+&n7t3+j%X+0j;=rTx>a`%M({`?S5u55#eeVhbl@U+DJRPcy1@% ztE&w@JxC?k+~*wcxI$v4C$lQI~^VSKvr<2A&0G$0H2O3PdPMM6burk-Y6|{FamVD|>8|QodG$4G zbBxL}JBp7G0sTFZJM98o;|GhPTmb>w8;pa3!WUvv)XcE8pBF2h*{4Y|(eX287F0v7 z8`5mBU1$&@q3IH2ifE-+_0|FT>|MCc7lwWk!@tmg+7Ya>vJdCTl_HdCHAFV7#gq*0yqA?CdP!QL7HrHA%Wd2m5aJK_#DXnH@)<;v1>;6cg& zYq#1ekSk@r|KT)i`wL=gAcO^WExu^@>zOutMr8>S%eD&-H zcrf_X#FOVen#Im61Kz#r&y-G=b;n(_zI&Bnh*f|PRI*rXuH4n5BKB0S3fCUt3BG>Q z%nqLE=YN9yWLbUGdBV_DYLnayK&GsMQM=XoD+#JWT!cm7jbl z;Vfq6t?!xeknuk6ZWvdY4CM%dP+|ODPNo(Pq!}u+1K&%aW9sRQ9u>%nrSUIS>>UL2 zt9_@O)fw%QP>3i%e*tXI3C>^m=BL54t3n?#G?@-v9Fw*}lvhWZiki- z9q&O{!3CQ8P`slhGZ|_Y6WR=zT|q>0`g9 zyfqb~*Gm|-F4ng8e#KTRM2^!feFQzDv4AF@Bvqfz8<*AJ4=8%*9cZU1UBugFN^9|A)S%~)L zqN~?w#Vcc|g9fKJl9uC7XzDjT&JoyP1zEHUEosFx=2XSGO)ujG19v;i9OD&B0Bdte z7pvTJ6*)@Yj!cOXjXBRthj@DRbO$ttD-j>>*TkJJ^9o3zH8H7w7Sb$nSG^M!d~r0w z%iMo5vjP)kJh67xl8|QX>q(cz@!2@)Ll1#c!G|GES_WG!c`_#IJ)aT2!b52n0!>EP zM+5H_P0dKZisoCEVO_;7UsTQLT=F(!TlR@Mh(3p0iZ+I2xOx%h z4u3k;BaEbfgmE0-`a8eBoBEEhTPTjn@3&d$h~@v#1V0C{|6JL@X`p`&^*8m%_Cp2X3?H#>um)?0#=cw!aah~6GU(faYbAP}0bKUpv`B))}c;pevI7Kl+l@Ty^C_#(hK~RR_)zy^Mp>Ar*T577=%G!9?UTrOc224W@szM?L z;tBXbJR}TH@|JI}&>9| zMh39`W^(y8nZrD==%mwu8oBrQx7GB2rhp@?{!&MWq!&S-;opD};LX-yZ)StBK%=g}VZLPGWD zXa(tT2;i?a+-j^3sx#!(7-6n(f^KVy_3FQ zOjJQZW1JaJUV1X(3n!bR6pqNRxUlT;kV*wGc|@gs#-SPhNQ~HYCw{#;OCyxUelv{i z9u!Tv)uO)x*=^BIv*0B z2O6;lrCezPg_V!7G6{V@k?S43DWPNjzQIHvypKEa|9v<$f1#xR{hZd>YR_-u004q` z0RZD)aD;ac^z`*n`D6R~8;m*l(q|OL6W)1nNxAlUHGcU+7^0UOOXxPk2!j%KrV4acU0n;#n!)Y?( zwNSkQjP(PzZF^5I=LG44^jfC!!DySC?xq%&@xs;j?@Kx~lSp}%vw4KA?U~Gsc z{>+C6laN`JBNebkqt!sJ>Qan;V5(%Yxowe!)Ooqm5v8OOB_B={f5KFqd8d=64Wzv3 zOHgkAq0DFM;PeWrB5Rx->MfWwbvj4h>0yVe2+>$0 zG0O7lgNYSI>V$Pk6EX_`j=o@UT4n@vB;zWZX~q?g$4Q6+llIfE!@jIdkB%*7OC3Wj zz#V9Q>;OzP9gqOHL1+qBp%AUbuHZx>!D+~zd%BgnlqkPkkc5Ueij(_PZp*sX3oRRpP;LW}LJAV9f(8m~V?i}> z_9v}iVrRP31h+cNpI?Y_im8ewcWUPo>NA7&?fm_O?TdP8`;+SljjSvoT62zqLIyO$ zT_;is-!2&@O7klR7mzsAu8V)B3C#3HRK1%hrv>Vbm0j4WJkcounf7iO5OP+kQ(Bob zJYUg|JcX8QdBr|(^<(05-(rVTmD)b@DP=yz9j|)=`cLL-I6kf%S>+aTJw^ynjP};( zw>ctf9Czbk$67mJ5iz5vxZ3Q z+dPP!MLCAau#=P7uxJM=(!SD%6tLXc6+>+up@=*`QtZN}tRyx_9kvkA7UA!w*WIB7 z^HQtsuEpox7wGANKaO4)j)KHKv0-l&)y+ErKB|`m30(4QNe7sp%D)pQs6Z66J$?z} zFV*diZWGRz4C4!bTpa5;%UY1PP=VdC{}Hx9O#OaZx=ePCE>}@0*0G>tNNbqtk4d-E zFG{Pq;K!>U43Um6!^b7xS?fysF!r`DY~`e0_;$zIy;apA%Y!NU_7bphL`xl-)c!4{v5)x(LYi!qH1SdGmYnl|ZY0X*vf$<&^5! z&(>vqJv0MO7U2@H$~dk?9$i#Zi?lYj^$}M9uUGl>xaCA}GwH$M5v{xLbmoTJyaTsb z>iC{(oZA8lK@MS2X~e9v#**3aRg3Mq{;h7RXjni4h{khSG-HsbbR>=XP?hf3EMA;) z0)9=?eAMkOSkt<@Fy9zGGaK3)WR?6RGVfJtSZYm~EOeUUK?g=Ch0Gy&g{$n6=u+|@ zSRiKYG%Fz6+9i~B*H$}m6pU1OO<`@?nZ9EyHmXMzBDwe+$Zt3h^5Ur$_X}YWPqJpX zcO>>5osFs<6{m$d?^+lctUo(~M4FFuc zIW~Qplt%Oz*O?TRSq`a!nk1+*=0x7kml}`WM>nI~vpl4C%yQkiI&d|S*7efPqq|L0 zvQe)=vRPEXMSN&tpA~|IRT%hV%4WPcUl)bEzHRD--=GKvb2H~>rr$rZ0{|3q>t^%+ z`NlG5W(L}g%xIQPI{$BY%Ph$Zs~eKzKPCAe(3M$|nLWQro+2l({!EqMNR&B0GdynO zR|Nh{{*5O1dk-*k<2MZoIeoJlKcYzHa=vf3b)NgSC}bkrCd>Iw>Ia?2jA$Doi@zZ9 YcVM$ZurmM<0NBd-D2y8-aIc?!2TIDJ`v3p{ literal 0 HcmV?d00001 From 46ce37323cf852864c0442a349e255cd6ea95adf Mon Sep 17 00:00:00 2001 From: Colin Wood Date: Wed, 11 Dec 2024 09:40:40 -0700 Subject: [PATCH 16/24] don't do truncation alignment --- q2_quality_filter/_filter.py | 37 ----------------- q2_quality_filter/tests/test_filter.py | 57 ++------------------------ 2 files changed, 4 insertions(+), 90 deletions(-) diff --git a/q2_quality_filter/_filter.py b/q2_quality_filter/_filter.py index 6390505..4f8a53d 100644 --- a/q2_quality_filter/_filter.py +++ b/q2_quality_filter/_filter.py @@ -283,39 +283,6 @@ def _is_retained( return True -def _align_records( - forward_record: FastqRecord, reverse_record: FastqRecord -) -> tuple[FastqRecord, FastqRecord]: - ''' - Align a forward record and reverse record to the same truncation length. - Note that if either (forward or reverse) truncation resulted in the record - falling below the minimum length fraction then this was already handled - upstream. - - Parameters - ---------- - forward_record : FastqRecord - The record from the forward fastq file. - reverse_record : FastqRecord - The record from the reverse fastq file. - - Returns - ------- - tuple[FastqRecord, FastqRecord] - The length-aligned forward and reverse records. - ''' - if len(forward_record.sequence) < len(reverse_record.sequence): - reverse_record = _truncate( - reverse_record, len(forward_record.sequence) - ) - elif len(reverse_record.sequence) < len(forward_record.sequence): - forward_record = _truncate( - forward_record, len(reverse_record.sequence) - ) - - return forward_record, reverse_record - - def _write_record(fastq_record: FastqRecord, fh: gzip.GzipFile) -> None: ''' Writes a fastq record to an open fastq file. @@ -443,10 +410,6 @@ def q_score( # if retained write to output file(s) if retained: if paired: - # align truncations if paired - forward_record, reverse_record = _align_records( - forward_record, reverse_record - ) _write_record(forward_record, forward_fh) _write_record(reverse_record, reverse_fh) else: diff --git a/q2_quality_filter/tests/test_filter.py b/q2_quality_filter/tests/test_filter.py index 2014674..ff1fecc 100644 --- a/q2_quality_filter/tests/test_filter.py +++ b/q2_quality_filter/tests/test_filter.py @@ -36,7 +36,6 @@ RecordStatus, _process_record, _is_retained, - _align_records, _write_record, ) from q2_quality_filter._format import QualityFilterStatsFmt @@ -336,51 +335,6 @@ def test_is_retained(self): ) filtering_stats_df.iloc[:, :] = 0 - def test_align_records(self): - # records unchanged if equal lengths - forward_record = FastqRecord( - b'@header', b'ATTCTGTA', b'+', b'MMLMLL++' - ) - reverse_record = FastqRecord( - b'@header', b'TTAGCATC', b'+', b'+MM+MLM+' - ) - obs_forward_record, obs_reverse_record = _align_records( - forward_record, reverse_record - ) - self.assertEqual(obs_forward_record, forward_record) - self.assertEqual(obs_reverse_record, reverse_record) - - # longer record truncated to shorter record - forward_record = FastqRecord( - b'@header', b'ATTCTGTA', b'+', b'MMLMLL++' - ) - reverse_record = FastqRecord( - b'@header', b'TTAGCA', b'+', b'+MM+ML' - ) - obs_forward_record, obs_reverse_record = _align_records( - forward_record, reverse_record - ) - exp_forward_record = FastqRecord( - b'@header', b'ATTCTG', b'+', b'MMLMLL' - ) - self.assertEqual(obs_forward_record, exp_forward_record) - self.assertEqual(obs_reverse_record, reverse_record) - - forward_record = FastqRecord( - b'@header', b'ATTC', b'+', b'MMLM' - ) - reverse_record = FastqRecord( - b'@header', b'TTAGCATC', b'+', b'+MM+MLM+' - ) - obs_forward_record, obs_reverse_record = _align_records( - forward_record, reverse_record - ) - exp_reverse_record = FastqRecord( - b'@header', b'TTAG', b'+', b'+MM+' - ) - self.assertEqual(obs_forward_record, forward_record) - self.assertEqual(obs_reverse_record, exp_reverse_record) - def test_write_record(self): fastq_record = FastqRecord( b'@header', b'ATTCTGTA', b'+', b'MMLMLL++' @@ -692,9 +646,6 @@ def _assert_records_match(self, manifest_df: pd.DataFrame): self.assertEqual( self._get_header_diff(forward_record, reverse_record), 1 ) - self.assertEqual( - len(forward_record.sequence), len(reverse_record.sequence) - ) def test_paired_end_sequences(self): demux_artifact = Artifact.import_data( @@ -714,10 +665,10 @@ def test_paired_end_sequences(self): ) demux_manifest_df = output_demux_format.manifest.view(pd.DataFrame) - # corresponding records should be same length and have matching headers + # corresponding records should have matching headers self._assert_records_match(demux_manifest_df) - # "Human-Kneecap2_S2" is dropped because the R2 reads have low q scores + # "Human-Kneecap2_S2" is dropped because the R1 reads have low q scores exp_sample_ids = ['Human-Kneecap', 'Human-Kneecap3'] self.assertEqual( set(demux_manifest_df.index), set(exp_sample_ids) @@ -747,9 +698,9 @@ def test_paired_end_sequences(self): sample1_reverse_exp = [ # first record dropped because of R2 scores b'@M00899:113:000000000-A5K20:1:1101:25454:3578 2:N:0:2', - b'GACTACCGGGGTATCTAATCCTGTTCGATACCCGCACCTTCGAGCTTCAGCGTCAGTTGCG', + b'GACTACCGGGGTATCTAATCCTGTTCGATACCCGCACCTTCGAGCTTCAGCGTCAGTTGCGCTCCCGTCAGCTGC', # noqa b'+', - b'CCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGG', + b'CCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG', # noqa b'@M00899:113:000000000-A5K20:1:1101:25177:3605 2:N:0:2', b'GACTACTGGGGTATCTAATCCTGTTTGATACCCGCACCTTCGAGCTTAAGCGTCAGTTGCGCTCCCGTCAGCTGC', # noqa b'+', From 2901aa7459431f38aecc329a11c5545f45e1e7b6 Mon Sep 17 00:00:00 2001 From: Colin Wood <68213641+colinvwood@users.noreply.github.com> Date: Thu, 12 Dec 2024 15:05:26 -0700 Subject: [PATCH 17/24] Update q2_quality_filter/_filter.py Co-authored-by: Greg Caporaso <192372+gregcaporaso@users.noreply.github.com> --- q2_quality_filter/_filter.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/q2_quality_filter/_filter.py b/q2_quality_filter/_filter.py index 4f8a53d..0b05ff9 100644 --- a/q2_quality_filter/_filter.py +++ b/q2_quality_filter/_filter.py @@ -329,10 +329,7 @@ def q_score( manifest_fh = manifest.open() manifest_fh.write('sample-id,filename,direction\n') - if isinstance(result, SingleLanePerSamplePairedEndFastqDirFmt): - paired = True - else: - paired = False + paired = isinstance(result, SingleLanePerSamplePairedEndFastqDirFmt) # parse phred offset and load the input demux manifest metadata_view = demux.metadata.view(YamlFormat).open() From 5898dabd6c521693c06ad7055e7bed6d17e8c5db Mon Sep 17 00:00:00 2001 From: Colin Wood <68213641+colinvwood@users.noreply.github.com> Date: Thu, 12 Dec 2024 15:06:02 -0700 Subject: [PATCH 18/24] Update q2_quality_filter/tests/test_filter.py Co-authored-by: Greg Caporaso <192372+gregcaporaso@users.noreply.github.com> --- q2_quality_filter/tests/test_filter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/q2_quality_filter/tests/test_filter.py b/q2_quality_filter/tests/test_filter.py index ff1fecc..76ebce4 100644 --- a/q2_quality_filter/tests/test_filter.py +++ b/q2_quality_filter/tests/test_filter.py @@ -80,7 +80,7 @@ def test_find_low_quality_window(self): obs = _find_low_quality_window( quality_scores, phred_offset=33, min_quality=44, window_length=2 ) - +self.assertEqual(obs, None) # test windows detected correctly # quality scores: M => 44; + => 10 quality_scores = b'MMM++MM' From 06e17256571c59ebbe07f9ccfc2639d80e3b8af4 Mon Sep 17 00:00:00 2001 From: Colin Wood <68213641+colinvwood@users.noreply.github.com> Date: Thu, 12 Dec 2024 15:06:15 -0700 Subject: [PATCH 19/24] Update q2_quality_filter/tests/test_filter.py Co-authored-by: Greg Caporaso <192372+gregcaporaso@users.noreply.github.com> --- q2_quality_filter/tests/test_filter.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/q2_quality_filter/tests/test_filter.py b/q2_quality_filter/tests/test_filter.py index 76ebce4..cda30b5 100644 --- a/q2_quality_filter/tests/test_filter.py +++ b/q2_quality_filter/tests/test_filter.py @@ -119,6 +119,12 @@ def test_find_low_quality_window(self): quality_scores, phred_offset=33, min_quality=20, window_length=3 ) self.assertEqual(obs, 4) + # test that when all windows are too short, None is returned + quality_scores = b'++ML+++M+++MM++' + obs = _find_low_quality_window( + quality_scores, phred_offset=33, min_quality=20, window_length=4 + ) + self.assertEqual(obs, None) def test_truncate(self): fastq_record = FastqRecord( From 23752a00ec8ed326a64806fae970f84722a38c21 Mon Sep 17 00:00:00 2001 From: Colin Wood <68213641+colinvwood@users.noreply.github.com> Date: Thu, 12 Dec 2024 15:06:27 -0700 Subject: [PATCH 20/24] Update q2_quality_filter/_filter.py Co-authored-by: Greg Caporaso <192372+gregcaporaso@users.noreply.github.com> --- q2_quality_filter/_filter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/q2_quality_filter/_filter.py b/q2_quality_filter/_filter.py index 0b05ff9..3038afb 100644 --- a/q2_quality_filter/_filter.py +++ b/q2_quality_filter/_filter.py @@ -162,8 +162,8 @@ def _process_record( max_ambiguous: int, ) -> tuple[FastqRecord, RecordStatus]: ''' - Processes a fastq record by detecting low quality windows, truncating if - one or more such windows are found, detecting if a truncated record is too + Processes a fastq record by detecting low quality windows, truncating + before the first such window if found, detecting if a truncated record is too short, and finally detecting if the number of ambiguous bases is too high. Parameters From 0e934db4d3bc7209731434c1abc8f680e60e9c77 Mon Sep 17 00:00:00 2001 From: Colin Wood <68213641+colinvwood@users.noreply.github.com> Date: Thu, 12 Dec 2024 15:07:11 -0700 Subject: [PATCH 21/24] Update q2_quality_filter/tests/test_filter.py Co-authored-by: Greg Caporaso <192372+gregcaporaso@users.noreply.github.com> --- q2_quality_filter/tests/test_filter.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/q2_quality_filter/tests/test_filter.py b/q2_quality_filter/tests/test_filter.py index cda30b5..5390d1e 100644 --- a/q2_quality_filter/tests/test_filter.py +++ b/q2_quality_filter/tests/test_filter.py @@ -107,6 +107,13 @@ def test_find_low_quality_window(self): ) self.assertEqual(obs, 7) + # low quality window is detected when it's longer than window length + quality_scores = b'MMMMMMM+++' + obs = _find_low_quality_window( + quality_scores, phred_offset=33, min_quality=11, window_length=2 + ) + self.assertEqual(obs, 7) + # test when multiple windows exist, first window is returned quality_scores = b'ML++MMM+++' obs = _find_low_quality_window( From 8ccf732634df7bcabee9b92ad2f7e64001a4e0f3 Mon Sep 17 00:00:00 2001 From: Colin Wood <68213641+colinvwood@users.noreply.github.com> Date: Thu, 12 Dec 2024 15:07:40 -0700 Subject: [PATCH 22/24] Update q2_quality_filter/_filter.py Co-authored-by: Greg Caporaso <192372+gregcaporaso@users.noreply.github.com> --- q2_quality_filter/_filter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/q2_quality_filter/_filter.py b/q2_quality_filter/_filter.py index 3038afb..fc42b08 100644 --- a/q2_quality_filter/_filter.py +++ b/q2_quality_filter/_filter.py @@ -204,8 +204,8 @@ def _process_record( ) # check if truncation should be performed mark short if necessary - initial_record_length = len(fastq_record.sequence) if truncation_position is not None: + initial_record_length = len(fastq_record.sequence) fastq_record = _truncate(fastq_record, truncation_position) status = RecordStatus.TRUNCATED From 678be72df799396dd24a6e93386237f69dcdde69 Mon Sep 17 00:00:00 2001 From: Colin Wood Date: Fri, 13 Dec 2024 10:24:55 -0700 Subject: [PATCH 23/24] suggestions --- q2_quality_filter/_filter.py | 7 +++---- q2_quality_filter/tests/test_filter.py | 11 ++++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/q2_quality_filter/_filter.py b/q2_quality_filter/_filter.py index fc42b08..618cef2 100644 --- a/q2_quality_filter/_filter.py +++ b/q2_quality_filter/_filter.py @@ -68,8 +68,6 @@ def _read_fastq_records(filepath: str): quality_scores.strip() ) - fh.close() - def _find_low_quality_window( quality_scores: bytes, @@ -163,8 +161,9 @@ def _process_record( ) -> tuple[FastqRecord, RecordStatus]: ''' Processes a fastq record by detecting low quality windows, truncating - before the first such window if found, detecting if a truncated record is too - short, and finally detecting if the number of ambiguous bases is too high. + before the first such window if found, detecting if a truncated record is + too short, and finally detecting if the number of ambiguous bases is too + high. Parameters ---------- diff --git a/q2_quality_filter/tests/test_filter.py b/q2_quality_filter/tests/test_filter.py index 5390d1e..256ba76 100644 --- a/q2_quality_filter/tests/test_filter.py +++ b/q2_quality_filter/tests/test_filter.py @@ -80,7 +80,8 @@ def test_find_low_quality_window(self): obs = _find_low_quality_window( quality_scores, phred_offset=33, min_quality=44, window_length=2 ) -self.assertEqual(obs, None) + self.assertEqual(obs, None) + # test windows detected correctly # quality scores: M => 44; + => 10 quality_scores = b'MMM++MM' @@ -309,18 +310,18 @@ def test_is_retained(self): forward_status=RecordStatus.SHORT, reverse_status=None, filtering_stats_df=filtering_stats_df, - sample_id='sample-a' + sample_id='sample-b' ) self.assertFalse(retained) self.assertEqual( - filtering_stats_df.loc['sample-a', 'total-retained-reads'], 0 + filtering_stats_df.loc['sample-b', 'total-retained-reads'], 0 ) self.assertEqual( - filtering_stats_df.loc['sample-a', 'reads-truncated'], 1 + filtering_stats_df.loc['sample-b', 'reads-truncated'], 1 ) self.assertEqual( filtering_stats_df.loc[ - 'sample-a', 'reads-too-short-after-truncation' + 'sample-b', 'reads-too-short-after-truncation' ], 1 ) From ff0f7eebd2c09c72554e8acba3921e962360e7f1 Mon Sep 17 00:00:00 2001 From: Colin Wood Date: Fri, 13 Dec 2024 15:50:36 -0700 Subject: [PATCH 24/24] use _ReadDirectionTypes in q_score signature --- q2_quality_filter/_filter.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/q2_quality_filter/_filter.py b/q2_quality_filter/_filter.py index 618cef2..6411b5a 100644 --- a/q2_quality_filter/_filter.py +++ b/q2_quality_filter/_filter.py @@ -11,20 +11,18 @@ import gzip import os from pathlib import Path -from typing import Union import yaml import pandas as pd import numpy as np from q2_types.per_sample_sequences import ( - SingleLanePerSampleSingleEndFastqDirFmt, SingleLanePerSamplePairedEndFastqDirFmt, FastqManifestFormat, YamlFormat, ) -from q2_quality_filter._format import _ReadDirectionUnion +from q2_quality_filter._format import _ReadDirectionUnion, _ReadDirectionTypes @dataclass @@ -304,10 +302,7 @@ def _write_record(fastq_record: FastqRecord, fh: gzip.GzipFile) -> None: def q_score( - demux: Union[ - SingleLanePerSamplePairedEndFastqDirFmt, - SingleLanePerSampleSingleEndFastqDirFmt - ], + demux: _ReadDirectionTypes, min_quality: int = 4, quality_window: int = 3, min_length_fraction: float = 0.75,