From 23ed4c1dba78f153e48f0add33d234fe77605744 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 2 Jul 2024 10:05:59 +0000 Subject: [PATCH] linting fixes --- anglerfish/anglerfish.py | 24 +++++++-------- anglerfish/demux/demux.py | 53 +++++++++++++++++++++++---------- anglerfish/demux/samplesheet.py | 12 ++++---- 3 files changed, 54 insertions(+), 35 deletions(-) diff --git a/anglerfish/anglerfish.py b/anglerfish/anglerfish.py index 2519ef6..2ad7730 100755 --- a/anglerfish/anglerfish.py +++ b/anglerfish/anglerfish.py @@ -13,7 +13,6 @@ import numpy as np import pkg_resources -from .demux.adaptor import Adaptor from .demux.demux import ( Alignment, categorize_matches, @@ -101,20 +100,19 @@ def run_demux(args): # Iterate across samplesheet rows conforming to the adaptor-barcode combinations out_fastqs = [] for adaptor_name, ont_barcode in adaptor_barcode_sets: - subset_rows = ss.subset_rows(adaptor_name=adaptor_name, ont_barcode=ont_barcode) # Grab the fastq files for the current entries assert all([subset_rows[0].fastq == row.fastq for row in subset_rows]) fastq_path = subset_rows[0].fastq - fastq_files = glob.glob(fastq_path) + fastq_files = glob.glob(fastq_path) # If there are multiple ONT barcodes, we need to add the ONT barcode to the adaptor name if ont_barcode: adaptor_bc_name = f"{adaptor_name}_{ont_barcode}" else: adaptor_bc_name = adaptor_name - + # Align alignment_path: str = os.path.join(args.out_fastq, f"{adaptor_bc_name}.paf") adaptor_fasta_path: str = os.path.join(args.out_fastq, f"{adaptor_name}.fasta") @@ -144,8 +142,6 @@ def run_demux(args): report.add_alignment_stat(stats) # Demux - no_matches = [] - matches = [] flipped_i7 = False flipped_i5 = False flips = { @@ -158,7 +154,7 @@ def run_demux(args): log.info( f" Force reverse complementing {args.force_rc} index for adaptor {adaptor_name}. Lenient mode is disabled" ) - no_matches, matches = cluster_matches( + unmatched_bed, matched_bed = cluster_matches( subset_rows, fragments, args.max_distance, @@ -197,7 +193,7 @@ def run_demux(args): log.warning( " Lenient mode: Could not find any barcode reverse complements with unambiguously more matches. Using original index orientation for all adaptors. Please study the results carefully!" ) - no_matches, matches = flipped["original"] + unmatched_bed, matched_bed = flipped["original"] elif ( best_flip != "None" and len(flipped[best_flip][1]) @@ -206,20 +202,22 @@ def run_demux(args): log.info( f" Lenient mode: Reverse complementing {best_flip} index for adaptor {adaptor_name} found at least {args.lenient_factor} times more matches" ) - no_matches, matches = flipped[best_flip] + unmatched_bed, matched_bed = flipped[best_flip] flipped_i7, flipped_i5 = flips[best_flip].values() else: log.info( f" Lenient mode: using original index orientation for {adaptor_name}" ) - no_matches, matches = flipped["original"] + unmatched_bed, matched_bed = flipped["original"] else: - no_matches, matches = cluster_matches( + unmatched_bed, matched_bed = cluster_matches( subset_rows, fragments, args.max_distance ) out_pool = [] - for k, v in groupby(sorted(matches, key=lambda x: x[3]), key=lambda y: y[3]): + for k, v in groupby( + sorted(matched_bed, key=lambda x: x[3]), key=lambda y: y[3] + ): # To avoid collisions in fastq filenames, we add the ONT barcode to the sample name fq_prefix = k if ont_barcode: @@ -264,7 +262,7 @@ def run_demux(args): ) # Top unmatched indexes - nomatch_count = Counter([x[3] for x in no_matches]) + nomatch_count = Counter([x[3] for x in unmatched_bed]) if args.max_unknowns == 0: args.max_unknowns = len([sample for sample in ss]) + 10 diff --git a/anglerfish/demux/demux.py b/anglerfish/demux/demux.py index cfd51b5..88ec6ca 100644 --- a/anglerfish/demux/demux.py +++ b/anglerfish/demux/demux.py @@ -223,7 +223,9 @@ def cluster_matches( i7_reversed: bool = False, i5_reversed: bool = False, ) -> tuple[list, list]: - """Inputs: + """Return the BED coordinates of unmatching and matching reads. + + Inputs: - matches: dict of reads to their respective alignments - entries: samplesheet entries for a given adaptor-barcode combination - max_distance: maximum allowed Levenshtein distance between index read and index sequence @@ -243,7 +245,6 @@ def cluster_matches( # Iterate over each read and it's alignments for read, alignments in matches.items(): - # Determine which alignment is i5 and which is i7 if ( alignments[0].adapter_name[-2:] == "i5" @@ -260,14 +261,12 @@ def cluster_matches( else: log.debug(f" {read} has no valid illumina fragment") continue - + entries_index_dists = [] # Iterate over sample sheet entries and collect the distances between their index and the read's index for entry in entries: - # Parse i5 index read if entry.adaptor.i5.index_seq is not None: - if i5_reversed: i5_seq = str(Seq(entry.adaptor.i5.index_seq).reverse_complement()) else: @@ -303,10 +302,22 @@ def cluster_matches( entries_index_dists.append(i5_index_dist + i7_index_dist) # Find the list idx of the minimum distance between the read's index and the indices of the samples in the sheet - entries_min_distance_idx = min(range(len(entries_index_dists)), key=entries_index_dists.__getitem__) + entries_min_index_dist_loc = min( + range(len(entries_index_dists)), key=entries_index_dists.__getitem__ + ) + entry_min_index_dist = entries[entries_min_index_dist_loc] # If two samples in the sheet are equidistant from the read, skip the read - if len([i for i, j in enumerate(entries_index_dists) if j == entries_index_dists[entries_min_distance_idx]]) > 1: + if ( + len( + [ + i + for i, j in enumerate(entries_index_dists) + if j == entries_index_dists[entries_min_index_dist_loc] + ] + ) + > 1 + ): continue # Get the coordinates of the read insert @@ -318,25 +329,35 @@ def cluster_matches( continue # If the read is too far from the closest sample in the sheet - if entries_index_dists[entries_min_distance_idx] > max_distance: + if entries_index_dists[entries_min_index_dist_loc] > max_distance: # If the read has sensible lengths, add it to the unmatched beds - if len(i7_index_read) + len(i5_index_read) == len(entry.adaptor.i7.index_seq or "") + len( - entry.adaptor.i5.index_seq or "" - ): - fi75 = "+".join([i for i in [i7_index_read, i5_index_read] if not i == ""]) + if len(i7_index_read) + len(i5_index_read) == len( + entry.adaptor.i7.index_seq or "" + ) + len(entry.adaptor.i5.index_seq or ""): + fi75 = "+".join( + [i for i in [i7_index_read, i5_index_read] if not i == ""] + ) unmatched_bed.append([read, start_insert, end_insert, fi75, "999", "."]) # Otherwise, skip the read else: continue else: # Add the read to the matched beds - import ipdb; ipdb.set_trace() matched_bed.append( - [read, start_insert, end_insert, entries_index_dists[entries_min_distance_idx][0], "999", "."] + [ + read, + start_insert, + end_insert, + entry_min_index_dist.sample_name, + "999", + ".", + ] ) - log.debug(f" Matched {len(matched_bed)} reads, unmatched {len(unmatched_bed)} reads") - + log.debug( + f" Matched {len(matched_bed)} reads, unmatched {len(unmatched_bed)} reads" + ) + return unmatched_bed, matched_bed diff --git a/anglerfish/demux/samplesheet.py b/anglerfish/demux/samplesheet.py index ffa741f..1998f33 100644 --- a/anglerfish/demux/samplesheet.py +++ b/anglerfish/demux/samplesheet.py @@ -1,6 +1,5 @@ import csv import glob -import os import re from dataclasses import dataclass from itertools import combinations @@ -162,19 +161,21 @@ def get_fastastring(self, adaptor_name: str | None = None) -> str: def get_adaptor_barcode_sets( self, - ) -> list[tuple[str, str]]: + ) -> list[tuple[str, str | None]]: """Return a set of unique adaptor-barcode pairings in the samplesheet.""" # Get a set corresponding to the unique pairings of adaptors and ONT barcodes in the samplesheet - adaptor_barcode_sets: list[tuple[str, str]] = list( + adaptor_barcode_sets: list[tuple[str, str | None]] = list( set([(row.adaptor.name, row.ont_barcode) for row in self.rows]) ) return adaptor_barcode_sets - def subset_rows(self, adaptor_name: str, ont_barcode: str|None) -> list[SampleSheetEntry]: + def subset_rows( + self, adaptor_name: str, ont_barcode: str | None + ) -> list[SampleSheetEntry]: """Return a subset of samplesheet rows based on logical criteria.""" - + subset_rows = [] for row in self.rows: @@ -185,7 +186,6 @@ def subset_rows(self, adaptor_name: str, ont_barcode: str|None) -> list[SampleSh return subset_rows - def __iter__(self): return iter(self.rows)