Skip to content

Commit

Permalink
linting fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
kedhammar committed Jul 2, 2024
1 parent 5c97195 commit 23ed4c1
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 35 deletions.
24 changes: 11 additions & 13 deletions anglerfish/anglerfish.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
import numpy as np
import pkg_resources

from .demux.adaptor import Adaptor
from .demux.demux import (
Alignment,
categorize_matches,
Expand Down Expand Up @@ -101,20 +100,19 @@ def run_demux(args):
# Iterate across samplesheet rows conforming to the adaptor-barcode combinations
out_fastqs = []
for adaptor_name, ont_barcode in adaptor_barcode_sets:

subset_rows = ss.subset_rows(adaptor_name=adaptor_name, ont_barcode=ont_barcode)

# Grab the fastq files for the current entries
assert all([subset_rows[0].fastq == row.fastq for row in subset_rows])
fastq_path = subset_rows[0].fastq
fastq_files = glob.glob(fastq_path)
fastq_files = glob.glob(fastq_path)

# If there are multiple ONT barcodes, we need to add the ONT barcode to the adaptor name
if ont_barcode:
adaptor_bc_name = f"{adaptor_name}_{ont_barcode}"
else:
adaptor_bc_name = adaptor_name

# Align
alignment_path: str = os.path.join(args.out_fastq, f"{adaptor_bc_name}.paf")
adaptor_fasta_path: str = os.path.join(args.out_fastq, f"{adaptor_name}.fasta")
Expand Down Expand Up @@ -144,8 +142,6 @@ def run_demux(args):
report.add_alignment_stat(stats)

# Demux
no_matches = []
matches = []
flipped_i7 = False
flipped_i5 = False
flips = {
Expand All @@ -158,7 +154,7 @@ def run_demux(args):
log.info(
f" Force reverse complementing {args.force_rc} index for adaptor {adaptor_name}. Lenient mode is disabled"
)
no_matches, matches = cluster_matches(
unmatched_bed, matched_bed = cluster_matches(
subset_rows,
fragments,
args.max_distance,
Expand Down Expand Up @@ -197,7 +193,7 @@ def run_demux(args):
log.warning(
" Lenient mode: Could not find any barcode reverse complements with unambiguously more matches. Using original index orientation for all adaptors. Please study the results carefully!"
)
no_matches, matches = flipped["original"]
unmatched_bed, matched_bed = flipped["original"]
elif (
best_flip != "None"
and len(flipped[best_flip][1])
Expand All @@ -206,20 +202,22 @@ def run_demux(args):
log.info(
f" Lenient mode: Reverse complementing {best_flip} index for adaptor {adaptor_name} found at least {args.lenient_factor} times more matches"
)
no_matches, matches = flipped[best_flip]
unmatched_bed, matched_bed = flipped[best_flip]
flipped_i7, flipped_i5 = flips[best_flip].values()
else:
log.info(
f" Lenient mode: using original index orientation for {adaptor_name}"
)
no_matches, matches = flipped["original"]
unmatched_bed, matched_bed = flipped["original"]
else:
no_matches, matches = cluster_matches(
unmatched_bed, matched_bed = cluster_matches(
subset_rows, fragments, args.max_distance
)

out_pool = []
for k, v in groupby(sorted(matches, key=lambda x: x[3]), key=lambda y: y[3]):
for k, v in groupby(
sorted(matched_bed, key=lambda x: x[3]), key=lambda y: y[3]
):
# To avoid collisions in fastq filenames, we add the ONT barcode to the sample name
fq_prefix = k
if ont_barcode:
Expand Down Expand Up @@ -264,7 +262,7 @@ def run_demux(args):
)

# Top unmatched indexes
nomatch_count = Counter([x[3] for x in no_matches])
nomatch_count = Counter([x[3] for x in unmatched_bed])
if args.max_unknowns == 0:
args.max_unknowns = len([sample for sample in ss]) + 10

Expand Down
53 changes: 37 additions & 16 deletions anglerfish/demux/demux.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,9 @@ def cluster_matches(
i7_reversed: bool = False,
i5_reversed: bool = False,
) -> tuple[list, list]:
"""Inputs:
"""Return the BED coordinates of unmatching and matching reads.
Inputs:
- matches: dict of reads to their respective alignments
- entries: samplesheet entries for a given adaptor-barcode combination
- max_distance: maximum allowed Levenshtein distance between index read and index sequence
Expand All @@ -243,7 +245,6 @@ def cluster_matches(

# Iterate over each read and its alignments
for read, alignments in matches.items():

# Determine which alignment is i5 and which is i7
if (
alignments[0].adapter_name[-2:] == "i5"
Expand All @@ -260,14 +261,12 @@ def cluster_matches(
else:
log.debug(f" {read} has no valid illumina fragment")
continue

entries_index_dists = []
# Iterate over sample sheet entries and collect the distances between their index and the read's index
for entry in entries:

# Parse i5 index read
if entry.adaptor.i5.index_seq is not None:

if i5_reversed:
i5_seq = str(Seq(entry.adaptor.i5.index_seq).reverse_complement())
else:
Expand Down Expand Up @@ -303,10 +302,22 @@ def cluster_matches(
entries_index_dists.append(i5_index_dist + i7_index_dist)

# Find the list idx of the minimum distance between the read's index and the indices of the samples in the sheet
entries_min_distance_idx = min(range(len(entries_index_dists)), key=entries_index_dists.__getitem__)
entries_min_index_dist_loc = min(
range(len(entries_index_dists)), key=entries_index_dists.__getitem__
)
entry_min_index_dist = entries[entries_min_index_dist_loc]

# If two samples in the sheet are equidistant from the read, skip the read
if len([i for i, j in enumerate(entries_index_dists) if j == entries_index_dists[entries_min_distance_idx]]) > 1:
if (
len(
[
i
for i, j in enumerate(entries_index_dists)
if j == entries_index_dists[entries_min_index_dist_loc]
]
)
> 1
):
continue

# Get the coordinates of the read insert
Expand All @@ -318,25 +329,35 @@ def cluster_matches(
continue

# If the read is too far from the closest sample in the sheet
if entries_index_dists[entries_min_distance_idx] > max_distance:
if entries_index_dists[entries_min_index_dist_loc] > max_distance:
# If the read has sensible lengths, add it to the unmatched beds
if len(i7_index_read) + len(i5_index_read) == len(entry.adaptor.i7.index_seq or "") + len(
entry.adaptor.i5.index_seq or ""
):
fi75 = "+".join([i for i in [i7_index_read, i5_index_read] if not i == ""])
if len(i7_index_read) + len(i5_index_read) == len(
entry.adaptor.i7.index_seq or ""
) + len(entry.adaptor.i5.index_seq or ""):
fi75 = "+".join(
[i for i in [i7_index_read, i5_index_read] if not i == ""]
)
unmatched_bed.append([read, start_insert, end_insert, fi75, "999", "."])
# Otherwise, skip the read
else:
continue
else:
# Add the read to the matched beds
# NOTE(review): the `import ipdb; ipdb.set_trace()` on the next line is a leftover debugger breakpoint and must be removed before merge
import ipdb; ipdb.set_trace()
matched_bed.append(
[read, start_insert, end_insert, entries_index_dists[entries_min_distance_idx][0], "999", "."]
[
read,
start_insert,
end_insert,
entry_min_index_dist.sample_name,
"999",
".",
]
)

log.debug(f" Matched {len(matched_bed)} reads, unmatched {len(unmatched_bed)} reads")

log.debug(
f" Matched {len(matched_bed)} reads, unmatched {len(unmatched_bed)} reads"
)

return unmatched_bed, matched_bed


Expand Down
12 changes: 6 additions & 6 deletions anglerfish/demux/samplesheet.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import csv
import glob
import os
import re
from dataclasses import dataclass
from itertools import combinations
Expand Down Expand Up @@ -162,19 +161,21 @@ def get_fastastring(self, adaptor_name: str | None = None) -> str:

def get_adaptor_barcode_sets(
self,
) -> list[tuple[str, str]]:
) -> list[tuple[str, str | None]]:
"""Return a set of unique adaptor-barcode pairings in the samplesheet."""

# Get a set corresponding to the unique pairings of adaptors and ONT barcodes in the samplesheet
adaptor_barcode_sets: list[tuple[str, str]] = list(
adaptor_barcode_sets: list[tuple[str, str | None]] = list(
set([(row.adaptor.name, row.ont_barcode) for row in self.rows])
)

return adaptor_barcode_sets

def subset_rows(self, adaptor_name: str, ont_barcode: str|None) -> list[SampleSheetEntry]:
def subset_rows(
self, adaptor_name: str, ont_barcode: str | None
) -> list[SampleSheetEntry]:
"""Return a subset of samplesheet rows based on logical criteria."""

subset_rows = []

for row in self.rows:
Expand All @@ -185,7 +186,6 @@ def subset_rows(self, adaptor_name: str, ont_barcode: str|None) -> list[SampleSh

return subset_rows


def __iter__(self):
return iter(self.rows)

Expand Down

0 comments on commit 23ed4c1

Please sign in to comment.