add outer flank regions for IlluminaBarcodeParser #88

Merged (7 commits), Apr 10, 2024
Changes from all commits
6 changes: 6 additions & 0 deletions CHANGELOG.rst
@@ -6,6 +6,12 @@ All notable changes to this project will be documented in this file.

The format is based on `Keep a Changelog <https://keepachangelog.com>`_.

1.6.0
-----
- Added ability to parse a second upstream / downstream region in ``IlluminaBarcodeParser`` via the new ``upstream2`` and ``downstream2`` parameters. Also modified ``IlluminaBarcodeParser`` so that reads are only parsed if they are long enough to fully cover the region containing the barcode and the specified upstream / downstream sequences; per the docs this is how it was supposed to function before, but it did not. This also adds another row ("read too short") to the fates from the barcode parser, as well as the ``outer_flank_fates`` option to separately report reads that fail only the additional upstream and downstream regions.
- Change default color scheme of heatmaps made by ``CodonVariantTable``, as the current one is obsolete.
- Remove obsolete ``guide=False`` from some ``plotnine`` plots in examples / tests (this argument was removed in ``plotnine`` version 0.13).

1.5.0
-----
- Remove use of deprecated ``scipy`` functions like ``flip`` to use ``numpy`` alternatives instead (fixes [this issue](https://github.com/jbloomlab/dms_variants/issues/86)).
2 changes: 1 addition & 1 deletion dms_variants/__init__.py
@@ -10,5 +10,5 @@

__author__ = "`the Bloom lab <https://research.fhcrc.org/bloom/en.html>`_"
__email__ = "[email protected]"
__version__ = "1.5.0"
__version__ = "1.6.0"
__url__ = "https://github.com/jbloomlab/dms_variants"
3 changes: 1 addition & 2 deletions dms_variants/codonvarianttable.py
@@ -1934,7 +1934,6 @@ def plotMutHeatmap(
expand=(0, 0),
)
+ p9.ylab(mut_desc)
+ p9.scale_fill_cmap("gnuplot")
)

if samples is None:
@@ -1999,7 +1998,7 @@ def plotMutFreqs(
assert "target" not in set(df.columns).union(set(n_variants.columns))

df = (
df.groupby(["library", "sample", "mutation_type", "site"])
df.groupby(["library", "sample", "mutation_type", "site"], observed=False)
.aggregate({"count": "sum"})
.reset_index()
.merge(n_variants, on=["library", "sample"])
180 changes: 123 additions & 57 deletions dms_variants/illuminabarcodeparser.py
@@ -30,15 +30,19 @@ class IlluminaBarcodeParser:
----
Barcodes should be read by R1 and optionally R2. Expected arrangement is

5'-[R2_start]-upstream-barcode-downstream-[R1_start]-3'
5'-[R2_start]-upstream2-upstream-barcode-downstream-downstream2-[R1_start]-3'

R1 anneals downstream of barcode and reads backwards. If R2 is used,
it anneals upstream of barcode and reads forward. There can be sequences
(`upstream` and `downstream`) on either side of the barcode: `downstream`
must fully cover region between R1 start and barcode, and if using R2
then `upstream` must fully cover region between R2 start and barcode.
However, it is fine if R1 reads backwards past `upstream`, and if `R2`
reads forward past `downstream`. The `upstream2` and `downstream2`
parameters can be used to require additional flanking sequences.
Normally these would simply be rolled into `upstream` and `downstream`,
but specifying them separately is useful when they are actually
additional indices for which you want different mismatch criteria.
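The mismatch parameters rely on the fuzzy-matching syntax of the third-party `regex` module (which the parser imports), where `{s<=n}` allows up to `n` substitutions. A minimal self-contained sketch with hypothetical flank and barcode sequences:

```python
# Sketch of the fuzzy matching behind the *_mismatch parameters, using the
# third-party `regex` module (not stdlib `re`); sequences are hypothetical.
import regex

# Allow up to 1 substitution in the 4 nt flank "ACGG" before a 4 nt barcode.
matcher = regex.compile("(ACGG){s<=1}(?P<bc>[ACTG]{4})")

assert matcher.fullmatch("ACGGTTTT").group("bc") == "TTTT"  # exact flank
assert matcher.fullmatch("ACTGTTTT") is not None  # 1 mismatch: still parsed
assert matcher.fullmatch("AATGTTTT") is None  # 2 mismatches: rejected
```

Because `{s<=n}` allows substitutions only (no indels), the flank always consumes exactly its own length, so the barcode group stays aligned.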

Parameters
----------
@@ -72,12 +76,20 @@ class IlluminaBarcodeParser:
Length of barcodes.
upstream : str
Sequence upstream of barcode.
upstream2 : str
Second sequence upstream of barcode.
downstream : str
Sequence downstream of barcode.
downstream2 : str
Second sequence downstream of barcode.
upstream_mismatch : int
Max number of mismatches allowed in `upstream`.
upstream2_mismatch : int
Max number of mismatches allowed in `upstream2`.
downstream_mismatch : int
Max number of mismatches allowed in `downstream`.
downstream2_mismatch : int
Max number of mismatches allowed in `downstream2`.
valid_barcodes : None or set
If not `None`, set of barcodes to retain.
bc_orientation : {'R1', 'R2'}
@@ -101,9 +113,13 @@ def __init__(
*,
bclen=None,
upstream="",
upstream2="",
downstream="",
downstream2="",
upstream_mismatch=0,
upstream2_mismatch=0,
downstream_mismatch=0,
downstream2_mismatch=0,
valid_barcodes=None,
bc_orientation="R1",
minq=20,
@@ -112,16 +128,20 @@
):
"""See main class doc string."""
self.bclen = bclen
if regex.match(f"^[{self.VALID_NTS}]*$", upstream):
self.upstream = upstream
else:
raise ValueError(f"invalid chars in upstream {upstream}")
if regex.match(f"^[{self.VALID_NTS}]*$", downstream):
self.downstream = downstream
else:
raise ValueError(f"invalid chars in downstream {downstream}")
for param_name, param_val in [
("upstream", upstream),
("downstream", downstream),
("upstream2", upstream2),
("downstream2", downstream2),
]:
if regex.match(f"^[{self.VALID_NTS}]*$", param_val):
setattr(self, param_name, param_val)
else:
raise ValueError(f"invalid chars in {param_name} {param_val}")
self.upstream_mismatch = upstream_mismatch
self.downstream_mismatch = downstream_mismatch
self.upstream2_mismatch = upstream2_mismatch
self.downstream2_mismatch = downstream2_mismatch
self.valid_barcodes = valid_barcodes
if self.valid_barcodes is not None:
self.valid_barcodes = set(self.valid_barcodes)
@@ -142,15 +162,61 @@ def __init__(
self.list_all_valid_barcodes = list_all_valid_barcodes

# specify information about R1 / R2 matches
self._bcend = {
"R1": self.bclen + len(self.downstream),
"R2": self.bclen + len(self.upstream),
}
self._rcdownstream = reverse_complement(self.downstream)
self._rcupstream = reverse_complement(self.upstream)
self._matches = {"R1": {}, "R2": {}} # match objects by read length
self._rcdownstream2 = reverse_complement(self.downstream2)
self._rcupstream2 = reverse_complement(self.upstream2)

# build the regex read matches
self._matchers = {
"R1": regex.compile(
f"({self._rcdownstream2})"
+ f"{{s<={self.downstream2_mismatch}}}"
+ f"({self._rcdownstream})"
+ f"{{s<={self.downstream_mismatch}}}"
+ f"(?P<bc>[ACTG]{{{self.bclen}}})"
+ f"({self._rcupstream})"
+ f"{{s<={self.upstream_mismatch}}}"
+ f"({self._rcupstream2})"
+ f"{{s<={self.upstream2_mismatch}}}"
),
"R2": regex.compile(
f"({self.upstream2})"
+ f"{{s<={self.upstream2_mismatch}}}"
+ f"({self.upstream})"
+ f"{{s<={self.upstream_mismatch}}}"
+ f"(?P<bc>[ACTG]{{{self.bclen}}})"
+ f"({self.downstream})"
+ f"{{s<={self.downstream_mismatch}}}"
+ f"({self.downstream2})"
+ f"{{s<={self.downstream2_mismatch}}}"
),
}
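The R1 patterns above are built from reverse complements of the flanking sequences, since R1 reads the strand opposite the barcode layout. A hypothetical minimal `reverse_complement`, consistent with how the parser uses the helper it imports (the real implementation lives elsewhere in the package):

```python
# Hypothetical minimal reverse_complement consistent with the parser's usage;
# the actual helper is imported from elsewhere in dms_variants.
def reverse_complement(seq):
    """Reverse complement of a DNA sequence (A<->T, C<->G, N->N)."""
    return seq.translate(str.maketrans("ACGTN", "TGCAN"))[::-1]

assert reverse_complement("ACCG") == "CGGT"
assert reverse_complement("GATTN") == "NAATC"
```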

# build matchers that do not have upstream2 or downstream2 if needed
self._has_flank2 = (len(self.upstream2) > 0) or (len(self.downstream2) > 0)
self._matchers_no_flank2 = {
"R1": regex.compile(
f"[{self.VALID_NTS}]{{{len(self.downstream2)}}}"
+ f"({self._rcdownstream})"
+ f"{{s<={self.downstream_mismatch}}}"
+ f"(?P<bc>[ACTG]{{{self.bclen}}})"
+ f"({self._rcupstream})"
+ f"{{s<={self.upstream_mismatch}}}"
+ f"[{self.VALID_NTS}]{{{len(self.upstream2)}}}"
),
"R2": regex.compile(
f"[{self.VALID_NTS}]{{{len(self.upstream2)}}}"
+ f"({self.upstream})"
+ f"{{s<={self.upstream_mismatch}}}"
+ f"(?P<bc>[ACTG]{{{self.bclen}}})"
+ f"({self.downstream})"
+ f"{{s<={self.downstream_mismatch}}}"
+ f"[{self.VALID_NTS}]{{{len(self.downstream2)}}}"
),
}
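The two matcher families above drive the fate assignment in `parse`: a read that fails the full matcher but passes the `no_flank2` variant can be counted as "invalid outer flank" rather than "unparseable barcode". A self-contained sketch with hypothetical sequences (R2 orientation, `bclen=4`, zero mismatches allowed):

```python
# Sketch of the outer-flank fate logic using hypothetical flanks; the parser
# builds analogous patterns for each read orientation.
import regex

upstream2, upstream = "AAC", "GGT"
full = regex.compile(
    f"({upstream2}){{s<=0}}({upstream}){{s<=0}}(?P<bc>[ACTG]{{4}})"
)
# Wildcard over the upstream2 positions, so only the inner flank is checked.
no_flank2 = regex.compile(
    f"[ACGTN]{{{len(upstream2)}}}({upstream}){{s<=0}}(?P<bc>[ACTG]{{4}})"
)

good = "AACGGTTTTT"  # both flanks valid -> candidate "valid barcode"
bad_outer = "TTTGGTTTTT"  # only upstream2 wrong -> "invalid outer flank"

assert full.fullmatch(good).group("bc") == "TTTT"
assert full.fullmatch(bad_outer) is None
assert no_flank2.fullmatch(bad_outer) is not None
```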

def parse(self, r1files, *, r2files=None, add_cols=None):
def parse(self, r1files, *, r2files=None, add_cols=None, outer_flank_fates=False):
"""Parse barcodes from files.

Parameters
Expand All @@ -162,6 +228,11 @@ def parse(self, r1files, *, r2files=None, add_cols=None):
add_cols : None or dict
If dict, specify names and values (i.e., sample or library names)
to be added to returned data frames.
outer_flank_fates : bool
If `True` and outer flanking regions are in use, the output fates
separately report reads that fail only the outer flanking regions
(`upstream2` or `downstream2`). Otherwise, such failures are grouped
with the "unparseable barcode" fate.

Returns
-------
@@ -177,6 +248,9 @@ def parse(self, r1files, *, r2files=None, add_cols=None):
- "R1 / R2 disagree" (if using `r2files`)
- "low quality barcode": sequencing quality low
- "unparseable barcode": invalid flank sequence, N in barcode
- "read too short": read is too short to cover specified region
- "invalid outer flank": if using `outer_flank_fates` and the read
  fails only `upstream2` or `downstream2`

Note that these data frames also include any columns specified by
`add_cols`.
@@ -210,21 +284,30 @@ def parse(self, r1files, *, r2files=None, add_cols=None):
"low quality barcode": 0,
"invalid barcode": 0,
"valid barcode": 0,
"read too short": 0,
}
if not r1only:
fates["R1 / R2 disagree"] = 0

# max length of interest for reads
max_len = self.bclen + len(self.upstream) + len(self.downstream)
if outer_flank_fates and self._has_flank2:
fates["invalid outer flank"] = 0

# min length of interest for reads
minlen = (
self.bclen
+ len(self.upstream)
+ len(self.downstream)
+ len(self.upstream2)
+ len(self.downstream2)
)

for filetup in zip(*fileslist):
if r1only:
assert len(filetup) == 1
iterator = iterate_fastq(filetup[0], check_pair=1, trim=max_len)
iterator = iterate_fastq(filetup[0], check_pair=1, trim=minlen)
else:
assert len(filetup) == 2, f"{filetup}\n{fileslist}"
iterator = iterate_fastq_pair(
filetup[0], filetup[1], r1trim=max_len, r2trim=max_len
filetup[0], filetup[1], r1trim=minlen, r2trim=minlen
)

for entry in iterator:
@@ -242,44 +325,18 @@ def parse(self, r1files, *, r2files=None, add_cols=None):
fates["failed chastity filter"] += 1
continue

matches = {}
for read, r in zip(reads, readlist):
rlen = len(r)
if any(len(r) < minlen for r in readlist):
fates["read too short"] += 1
continue

# get or build matcher for read of this length
len_past_bc = rlen - self._bcend[read]
if len_past_bc < 0:
raise ValueError(f"{read} too short: {rlen}")
elif rlen in self._matches[read]:
matcher = self._matches[read][rlen]
else:
if read == "R1":
match_str = (
f"^({self._rcdownstream})"
f"{{s<={self.downstream_mismatch}}}"
f"(?P<bc>[ACTG]{{{self.bclen}}})"
f"({self._rcupstream[: len_past_bc]})"
f"{{s<={self.upstream_mismatch}}}"
)
else:
assert read == "R2"
match_str = (
f"^({self.upstream})"
f"{{s<={self.upstream_mismatch}}}"
f"(?P<bc>[ACTG]{{{self.bclen}}})"
f"({self.downstream[: len_past_bc]})"
f"{{s<={self.downstream_mismatch}}}"
)
matcher = regex.compile(match_str, flags=regex.BESTMATCH)
self._matches[read][rlen] = matcher

m = matcher.match(r)
if m:
matches[read] = m
else:
break
assert all(len(r) == minlen for r in readlist)

matches = {
read: self._matchers[read].fullmatch(r)
for (read, r) in zip(reads, readlist)
}

if len(matches) == len(reads):
if all(m is not None for m in matches.values()):
bc = {}
bc_q = {}
for read, q in zip(reads, qlist):
@@ -321,6 +378,15 @@ def parse(self, r1files, *, r2files=None, add_cols=None):
fates["low quality barcode"] += 1
else:
fates["R1 / R2 disagree"] += 1
elif (
outer_flank_fates
and self._has_flank2
and all(
self._matchers_no_flank2[read].fullmatch(r) is not None
for (read, r) in zip(reads, readlist)
)
):
fates["invalid outer flank"] += 1
else:
# invalid flanking sequence or N in barcode
fates["unparseable barcode"] += 1
2 changes: 1 addition & 1 deletion notebooks/codonvariant_sim_data.ipynb
@@ -3584,7 +3584,7 @@
" axis_text_x=element_text(angle=90),\n",
" panel_grid_major_x=element_blank(), # no vertical grid lines\n",
" )\n",
" + scale_fill_manual(values=CBPALETTE[1:], guide=False)\n",
" + scale_fill_manual(values=CBPALETTE[1:])\n",
")\n",
"_ = p.draw(show=True)"
]
2 changes: 1 addition & 1 deletion notebooks/codonvariant_sim_data_multi_targets.ipynb
@@ -5430,7 +5430,7 @@
" axis_text_x=element_text(angle=90),\n",
" panel_grid_major_x=element_blank(), # no vertical grid lines\n",
" )\n",
" + scale_fill_manual(values=CBPALETTE[1:], guide=False)\n",
" + scale_fill_manual(values=CBPALETTE[1:])\n",
")\n",
"\n",
"_ = p.draw(show=True)"
2 changes: 1 addition & 1 deletion notebooks/multi_latent_phenos.ipynb
@@ -768,7 +768,7 @@
" axis_text_x=element_text(angle=90),\n",
" panel_grid_major_x=element_blank(), # no vertical grid lines\n",
" )\n",
" + scale_fill_manual(values=CBPALETTE[1:], guide=False)\n",
" + scale_fill_manual(values=CBPALETTE[1:])\n",
")\n",
"_ = p.draw(show=True)"
]
2 changes: 1 addition & 1 deletion notebooks/narrow_bottleneck.ipynb
@@ -835,7 +835,7 @@
" axis_text_x=element_text(angle=90),\n",
" panel_grid_major_x=element_blank(), # no vertical grid lines\n",
" )\n",
" + scale_fill_manual(values=CBPALETTE[1:], guide=False)\n",
" + scale_fill_manual(values=CBPALETTE[1:])\n",
")\n",
"_ = p.draw(show=True)"
]