Skip to content

Commit

Permalink
linting fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
kedhammar committed Jul 2, 2024
1 parent 5c97195 commit 23ed4c1
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 35 deletions.
24 changes: 11 additions & 13 deletions anglerfish/anglerfish.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
import numpy as np
import pkg_resources

from .demux.adaptor import Adaptor
from .demux.demux import (
Alignment,
categorize_matches,
Expand Down Expand Up @@ -101,20 +100,19 @@ def run_demux(args):
# Iterate across samplesheet rows conforming to the adaptor-barcode combinations
out_fastqs = []
for adaptor_name, ont_barcode in adaptor_barcode_sets:

subset_rows = ss.subset_rows(adaptor_name=adaptor_name, ont_barcode=ont_barcode)

# Grab the fastq files for the current entries
assert all([subset_rows[0].fastq == row.fastq for row in subset_rows])
fastq_path = subset_rows[0].fastq
fastq_files = glob.glob(fastq_path)
fastq_files = glob.glob(fastq_path)

# If there are multiple ONT barcodes, we need to add the ONT barcode to the adaptor name
if ont_barcode:
adaptor_bc_name = f"{adaptor_name}_{ont_barcode}"
else:
adaptor_bc_name = adaptor_name

# Align
alignment_path: str = os.path.join(args.out_fastq, f"{adaptor_bc_name}.paf")
adaptor_fasta_path: str = os.path.join(args.out_fastq, f"{adaptor_name}.fasta")
Expand Down Expand Up @@ -144,8 +142,6 @@ def run_demux(args):
report.add_alignment_stat(stats)

# Demux
no_matches = []
matches = []
flipped_i7 = False
flipped_i5 = False
flips = {
Expand All @@ -158,7 +154,7 @@ def run_demux(args):
log.info(
f" Force reverse complementing {args.force_rc} index for adaptor {adaptor_name}. Lenient mode is disabled"
)
no_matches, matches = cluster_matches(
unmatched_bed, matched_bed = cluster_matches(
subset_rows,
fragments,
args.max_distance,
Expand Down Expand Up @@ -197,7 +193,7 @@ def run_demux(args):
log.warning(
" Lenient mode: Could not find any barcode reverse complements with unambiguously more matches. Using original index orientation for all adaptors. Please study the results carefully!"
)
no_matches, matches = flipped["original"]
unmatched_bed, matched_bed = flipped["original"]
elif (
best_flip != "None"
and len(flipped[best_flip][1])
Expand All @@ -206,20 +202,22 @@ def run_demux(args):
log.info(
f" Lenient mode: Reverse complementing {best_flip} index for adaptor {adaptor_name} found at least {args.lenient_factor} times more matches"
)
no_matches, matches = flipped[best_flip]
unmatched_bed, matched_bed = flipped[best_flip]
flipped_i7, flipped_i5 = flips[best_flip].values()
else:
log.info(
f" Lenient mode: using original index orientation for {adaptor_name}"
)
no_matches, matches = flipped["original"]
unmatched_bed, matched_bed = flipped["original"]
else:
no_matches, matches = cluster_matches(
unmatched_bed, matched_bed = cluster_matches(
subset_rows, fragments, args.max_distance
)

out_pool = []
for k, v in groupby(sorted(matches, key=lambda x: x[3]), key=lambda y: y[3]):
for k, v in groupby(
sorted(matched_bed, key=lambda x: x[3]), key=lambda y: y[3]
):
# To avoid collisions in fastq filenames, we add the ONT barcode to the sample name
fq_prefix = k
if ont_barcode:
Expand Down Expand Up @@ -264,7 +262,7 @@ def run_demux(args):
)

# Top unmatched indexes
nomatch_count = Counter([x[3] for x in no_matches])
nomatch_count = Counter([x[3] for x in unmatched_bed])
if args.max_unknowns == 0:
args.max_unknowns = len([sample for sample in ss]) + 10

Expand Down
53 changes: 37 additions & 16 deletions anglerfish/demux/demux.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,9 @@ def cluster_matches(
i7_reversed: bool = False,
i5_reversed: bool = False,
) -> tuple[list, list]:
"""Inputs:
"""Return the BED coordinates of unmatching and matching reads.
Inputs:
- matches: dict of reads to their respective alignments
- entries: samplesheet entries for a given adaptor-barcode combination
- max_distance: maximum allowed Levenshtein distance between index read and index sequence
Expand All @@ -243,7 +245,6 @@ def cluster_matches(

# Iterate over each read and its alignments
for read, alignments in matches.items():

# Determine which alignment is i5 and which is i7
if (
alignments[0].adapter_name[-2:] == "i5"
Expand All @@ -260,14 +261,12 @@ def cluster_matches(
else:
log.debug(f" {read} has no valid illumina fragment")
continue

entries_index_dists = []
# Iterate over sample sheet entries and collect the distances between their index and the read's index
for entry in entries:

# Parse i5 index read
if entry.adaptor.i5.index_seq is not None:

if i5_reversed:
i5_seq = str(Seq(entry.adaptor.i5.index_seq).reverse_complement())
else:
Expand Down Expand Up @@ -303,10 +302,22 @@ def cluster_matches(
entries_index_dists.append(i5_index_dist + i7_index_dist)

# Find the list idx of the minimum distance between the read's index and the indices of the samples in the sheet
entries_min_distance_idx = min(range(len(entries_index_dists)), key=entries_index_dists.__getitem__)
entries_min_index_dist_loc = min(
range(len(entries_index_dists)), key=entries_index_dists.__getitem__
)
entry_min_index_dist = entries[entries_min_index_dist_loc]

# If two samples in the sheet are equidistant from the read, skip the read
if len([i for i, j in enumerate(entries_index_dists) if j == entries_index_dists[entries_min_distance_idx]]) > 1:
if (
len(
[
i
for i, j in enumerate(entries_index_dists)
if j == entries_index_dists[entries_min_index_dist_loc]
]
)
> 1
):
continue

# Get the coordinates of the read insert
Expand All @@ -318,25 +329,35 @@ def cluster_matches(
continue

# If the read is too far from the closest sample in the sheet
if entries_index_dists[entries_min_distance_idx] > max_distance:
if entries_index_dists[entries_min_index_dist_loc] > max_distance:
# If the read has sensible lengths, add it to the unmatched beds
if len(i7_index_read) + len(i5_index_read) == len(entry.adaptor.i7.index_seq or "") + len(
entry.adaptor.i5.index_seq or ""
):
fi75 = "+".join([i for i in [i7_index_read, i5_index_read] if not i == ""])
if len(i7_index_read) + len(i5_index_read) == len(
entry.adaptor.i7.index_seq or ""
) + len(entry.adaptor.i5.index_seq or ""):
fi75 = "+".join(
[i for i in [i7_index_read, i5_index_read] if not i == ""]
)
unmatched_bed.append([read, start_insert, end_insert, fi75, "999", "."])
# Otherwise, skip the read
else:
continue
else:
# Add the read to the matched beds
# NOTE(review): the `import ipdb; ipdb.set_trace()` on the next line is a leftover debugger breakpoint and must be removed before merge
import ipdb; ipdb.set_trace()
matched_bed.append(
[read, start_insert, end_insert, entries_index_dists[entries_min_distance_idx][0], "999", "."]
[
read,
start_insert,
end_insert,
entry_min_index_dist.sample_name,
"999",
".",
]
)

log.debug(f" Matched {len(matched_bed)} reads, unmatched {len(unmatched_bed)} reads")

log.debug(
f" Matched {len(matched_bed)} reads, unmatched {len(unmatched_bed)} reads"
)

return unmatched_bed, matched_bed


Expand Down
12 changes: 6 additions & 6 deletions anglerfish/demux/samplesheet.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import csv
import glob
import os
import re
from dataclasses import dataclass
from itertools import combinations
Expand Down Expand Up @@ -162,19 +161,21 @@ def get_fastastring(self, adaptor_name: str | None = None) -> str:

def get_adaptor_barcode_sets(
self,
) -> list[tuple[str, str]]:
) -> list[tuple[str, str | None]]:
"""Return a set of unique adaptor-barcode pairings in the samplesheet."""

# Get a set corresponding to the unique pairings of adaptors and ONT barcodes in the samplesheet
adaptor_barcode_sets: list[tuple[str, str]] = list(
adaptor_barcode_sets: list[tuple[str, str | None]] = list(
set([(row.adaptor.name, row.ont_barcode) for row in self.rows])
)

return adaptor_barcode_sets

def subset_rows(self, adaptor_name: str, ont_barcode: str|None) -> list[SampleSheetEntry]:
def subset_rows(
self, adaptor_name: str, ont_barcode: str | None
) -> list[SampleSheetEntry]:
"""Return a subset of samplesheet rows based on logical criteria."""

subset_rows = []

for row in self.rows:
Expand All @@ -185,7 +186,6 @@ def subset_rows(self, adaptor_name: str, ont_barcode: str|None) -> list[SampleSh

return subset_rows


def __iter__(self):
return iter(self.rows)

Expand Down

0 comments on commit 23ed4c1

Please sign in to comment.