From eccd4d650d0f10e9560539ba1370787d692c0ec6 Mon Sep 17 00:00:00 2001
From: jbloom <jbloom@fredhutch.org>
Date: Wed, 10 Apr 2024 15:18:45 -0700
Subject: [PATCH] add `outer_flank_fates` to classify outer flank failures
 differently in barcode parsing

---
 CHANGELOG.rst                                 |   2 +-
 dms_variants/illuminabarcodeparser.py         |  52 ++++++-
 ...arcodeparser_toy_example_w_upstream2.ipynb | 137 ++++++++++++------
 3 files changed, 141 insertions(+), 50 deletions(-)
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 13f6241..01359ae 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -8,7 +8,7 @@ The format is based on `Keep a Changelog <https://keepachangelog.com>`_.
 
 1.6.0
 -----
-- Added ability to parse second upstream / downstream region in ``IlluminaBarcodeParser`` by adding ``upstream2`` and ``downstream`` parameters. Also modified ``IlluminaBarcodeParser`` so that reads will only be parsed if they are long enough to fully cover the region containing the barcodes and specified upstream / downstream sequences. Based on docs, this is how it was supposed to function before but did not. Additionally, this adds another row ("reads too short") to the fates from the barcode parser.
+- Added ability to parse second upstream / downstream region in ``IlluminaBarcodeParser`` by adding ``upstream2`` and ``downstream2`` parameters. Also modified ``IlluminaBarcodeParser`` so that reads will only be parsed if they are long enough to fully cover the region containing the barcodes and specified upstream / downstream sequences. Based on docs, this is how it was supposed to function before but did not. Additionally, this adds another row ("read too short") to the fates from the barcode parser, as well as the ``outer_flank_fates`` option to ``parse`` to separately report reads that fail only the additional (``upstream2`` / ``downstream2``) flanking regions.
 
 1.5.0
 -----
diff --git a/dms_variants/illuminabarcodeparser.py b/dms_variants/illuminabarcodeparser.py
index b6e194f..26e7da8 100644
--- a/dms_variants/illuminabarcodeparser.py
+++ b/dms_variants/illuminabarcodeparser.py
@@ -38,7 +38,11 @@ class IlluminaBarcodeParser:
     must fully cover region between R1 start and barcode, and if using R2
     then `upstream` must fully cover region between R2 start and barcode.
     However, it is fine if R1 reads backwards past `upstream`, and if `R2`
-    reads forward past `downstream`.
+    reads forward past `downstream`. The `upstream2` and `downstream2`
+    can be used to require additional flanking sequences. Normally these
+    would just be rolled into `upstream` and `downstream`, but you might
+    specify separately if you are actually using these to parse additional
+    indices that you might want to set different mismatch criteria for.
 
     Parameters
     ----------
@@ -179,7 +183,7 @@ def __init__(
             "R2": regex.compile(
                 f"({self.upstream2})"
                 + f"{{s<={self.upstream2_mismatch}}}"
-                + f"^({self.upstream})"
+                + f"({self.upstream})"
                 + f"{{s<={self.upstream_mismatch}}}"
                 + f"(?P<bc>[ACTG]{{{self.bclen}}})"
                 + f"({self.downstream})"
@@ -189,7 +193,30 @@ def __init__(
             ),
         }
 
-    def parse(self, r1files, *, r2files=None, add_cols=None):
+        # matchers allowing any valid nts in place of upstream2 / downstream2
+        self._has_flank2 = (len(self.upstream2) > 0) or (len(self.downstream2) > 0)
+        self._matchers_no_flank2 = {
+            "R1": regex.compile(
+                f"[{self.VALID_NTS}]{{{len(self.downstream2)}}}"
+                + f"({self._rcdownstream})"
+                + f"{{s<={self.downstream_mismatch}}}"
+                + f"(?P<bc>[ACTG]{{{self.bclen}}})"
+                + f"({self._rcupstream})"
+                + f"{{s<={self.upstream_mismatch}}}"
+                + f"[{self.VALID_NTS}]{{{len(self.upstream2)}}}"
+            ),
+            "R2": regex.compile(
+                f"[{self.VALID_NTS}]{{{len(self.upstream2)}}}"
+                + f"({self.upstream})"  # no `^`: it cannot match after the prefix
+                + f"{{s<={self.upstream_mismatch}}}"
+                + f"(?P<bc>[ACTG]{{{self.bclen}}})"
+                + f"({self.downstream})"
+                + f"{{s<={self.downstream_mismatch}}}"
+                + f"[{self.VALID_NTS}]{{{len(self.downstream2)}}}"
+            ),
+        }
+
+    def parse(self, r1files, *, r2files=None, add_cols=None, outer_flank_fates=False):
         """Parse barcodes from files.
 
         Parameters
@@ -201,6 +228,11 @@ def parse(self, r1files, *, r2files=None, add_cols=None):
         add_cols : None or dict
             If dict, specify names and values (i.e., sample or library names)
             to be aded to returned data frames.
+        outer_flank_fates : bool
+            If `True` and the parser uses outer flanking regions, report reads
+            that fail only the outer flanking regions (`upstream2` or
+            `downstream2`) as their own fate in the output. Otherwise, such
+            failures are grouped with the "unparseable barcode" fate.
 
         Returns
         -------
@@ -216,6 +248,9 @@ def parse(self, r1files, *, r2files=None, add_cols=None):
                   - "R1 / R2 disagree" (if using `r2files`)
                   - "low quality barcode": sequencing quality low
                   - "unparseable barcode": invalid flank sequence, N in barcode
+                  - "read too short": read is too short to cover specified region
+                  - "invalid outer flank": if using `outer_flank_fates` and
+                    only `upstream2` or `downstream2` fails to match.
 
             Note that these data frames also include any columns specified by
             `add_cols`.
@@ -253,6 +288,8 @@ def parse(self, r1files, *, r2files=None, add_cols=None):
         }
         if not r1only:
             fates["R1 / R2 disagree"] = 0
+        if outer_flank_fates and self._has_flank2:
+            fates["invalid outer flank"] = 0
 
         # min length of interest for reads
         minlen = (
@@ -341,6 +378,15 @@ def parse(self, r1files, *, r2files=None, add_cols=None):
                                 fates["low quality barcode"] += 1
                         else:
                             fates["R1 / R2 disagree"] += 1
+                elif (
+                    outer_flank_fates
+                    and self._has_flank2
+                    and all(
+                        self._matchers_no_flank2[read].fullmatch(r) is not None
+                        for (read, r) in zip(reads, readlist)
+                    )
+                ):
+                    fates["invalid outer flank"] += 1
                 else:
                     # invalid flanking sequence or N in barcode
                     fates["unparseable barcode"] += 1
diff --git a/tests/illuminabarcodeparser_toy_example_w_upstream2.ipynb b/tests/illuminabarcodeparser_toy_example_w_upstream2.ipynb
index ed98f41..75d7ed2 100644
--- a/tests/illuminabarcodeparser_toy_example_w_upstream2.ipynb
+++ b/tests/illuminabarcodeparser_toy_example_w_upstream2.ipynb
@@ -17,11 +17,11 @@
    "execution_count": 1,
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2024-04-10T21:48:06.599078Z",
-     "iopub.status.busy": "2024-04-10T21:48:06.598706Z",
-     "iopub.status.idle": "2024-04-10T21:48:07.574491Z",
-     "shell.execute_reply": "2024-04-10T21:48:07.573386Z",
-     "shell.execute_reply.started": "2024-04-10T21:48:06.599045Z"
+     "iopub.execute_input": "2024-04-10T22:17:38.979735Z",
+     "iopub.status.busy": "2024-04-10T22:17:38.979371Z",
+     "iopub.status.idle": "2024-04-10T22:17:39.999532Z",
+     "shell.execute_reply": "2024-04-10T22:17:39.998756Z",
+     "shell.execute_reply.started": "2024-04-10T22:17:38.979703Z"
     }
    },
    "outputs": [],
@@ -45,11 +45,11 @@
    "execution_count": 2,
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2024-04-10T21:48:07.578862Z",
-     "iopub.status.busy": "2024-04-10T21:48:07.578511Z",
-     "iopub.status.idle": "2024-04-10T21:48:07.584685Z",
-     "shell.execute_reply": "2024-04-10T21:48:07.583848Z",
-     "shell.execute_reply.started": "2024-04-10T21:48:07.578834Z"
+     "iopub.execute_input": "2024-04-10T22:17:40.003968Z",
+     "iopub.status.busy": "2024-04-10T22:17:40.003644Z",
+     "iopub.status.idle": "2024-04-10T22:17:40.011300Z",
+     "shell.execute_reply": "2024-04-10T22:17:40.010467Z",
+     "shell.execute_reply.started": "2024-04-10T22:17:40.003937Z"
     }
    },
    "outputs": [],
@@ -72,11 +72,11 @@
    "execution_count": 3,
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2024-04-10T21:48:07.588864Z",
-     "iopub.status.busy": "2024-04-10T21:48:07.588522Z",
-     "iopub.status.idle": "2024-04-10T21:48:07.597119Z",
-     "shell.execute_reply": "2024-04-10T21:48:07.596272Z",
-     "shell.execute_reply.started": "2024-04-10T21:48:07.588833Z"
+     "iopub.execute_input": "2024-04-10T22:17:40.015148Z",
+     "iopub.status.busy": "2024-04-10T22:17:40.014722Z",
+     "iopub.status.idle": "2024-04-10T22:17:40.022961Z",
+     "shell.execute_reply": "2024-04-10T22:17:40.022166Z",
+     "shell.execute_reply.started": "2024-04-10T22:17:40.015118Z"
     }
    },
    "outputs": [],
@@ -84,12 +84,7 @@
     "r1file = tempfile.NamedTemporaryFile(mode=\"w\")\n",
     "\n",
     "# valid TACG barcode, full flanking regions\n",
-    "_ = r1file.write(\n",
-    "    \"@valid_CGTA_barcode\\n\"\n",
-    "    \"CGTATCATGTTGC\\n\"\n",
-    "    \"+\\n\"\n",
-    "    \"?????????????\\n\"\n",
-    ")\n",
+    "_ = r1file.write(\"@valid_CGTA_barcode\\n\" \"CGTATCATGTTGC\\n\" \"+\\n\" \"?????????????\\n\")\n",
     "\n",
     "# valid CGTA barcode, partial flanking regions\n",
     "_ = r1file.write(\n",
@@ -108,9 +103,7 @@
     ")\n",
     "\n",
     "# some sites low quality\n",
-    "_ = r1file.write(\n",
-    "    \"@low_quality_site\\n\" \"CGTATCATGTTGC\\n\" \"+\\n\" \"???+?????????\\n\"\n",
-    ")\n",
+    "_ = r1file.write(\"@low_quality_site\\n\" \"CGTATCATGTTGC\\n\" \"+\\n\" \"???+?????????\\n\")\n",
     "\n",
     "# N in barcode\n",
     "_ = r1file.write(\"@N_in_barcode\\n\" \"CGTNTCATGTTGC\\n\" \"+\\n\" \"?????????????\\n\")\n",
@@ -143,11 +136,11 @@
    "execution_count": 4,
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2024-04-10T21:48:07.601083Z",
-     "iopub.status.busy": "2024-04-10T21:48:07.600756Z",
-     "iopub.status.idle": "2024-04-10T21:48:07.621543Z",
-     "shell.execute_reply": "2024-04-10T21:48:07.620283Z",
-     "shell.execute_reply.started": "2024-04-10T21:48:07.601055Z"
+     "iopub.execute_input": "2024-04-10T22:17:40.026647Z",
+     "iopub.status.busy": "2024-04-10T22:17:40.026403Z",
+     "iopub.status.idle": "2024-04-10T22:17:40.039583Z",
+     "shell.execute_reply": "2024-04-10T22:17:40.038894Z",
+     "shell.execute_reply.started": "2024-04-10T22:17:40.026620Z"
     }
    },
    "outputs": [
@@ -186,11 +179,11 @@
    "execution_count": 5,
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2024-04-10T21:48:07.626568Z",
-     "iopub.status.busy": "2024-04-10T21:48:07.626231Z",
-     "iopub.status.idle": "2024-04-10T21:48:07.642139Z",
-     "shell.execute_reply": "2024-04-10T21:48:07.641184Z",
-     "shell.execute_reply.started": "2024-04-10T21:48:07.626531Z"
+     "iopub.execute_input": "2024-04-10T22:17:40.042962Z",
+     "iopub.status.busy": "2024-04-10T22:17:40.042642Z",
+     "iopub.status.idle": "2024-04-10T22:17:40.056823Z",
+     "shell.execute_reply": "2024-04-10T22:17:40.056209Z",
+     "shell.execute_reply.started": "2024-04-10T22:17:40.042936Z"
     },
     "scrolled": true
    },
@@ -229,7 +222,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Now create a parser that allows mismatch in `upstream` and `upstream2`, and check that we recover barcode:"
+    "Now classify outer flank failures differently from unparseable barcodes:"
    ]
   },
   {
@@ -237,11 +230,63 @@
    "execution_count": 6,
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2024-04-10T21:48:07.643592Z",
-     "iopub.status.busy": "2024-04-10T21:48:07.643257Z",
-     "iopub.status.idle": "2024-04-10T21:48:07.656472Z",
-     "shell.execute_reply": "2024-04-10T21:48:07.655830Z",
-     "shell.execute_reply.started": "2024-04-10T21:48:07.643560Z"
+     "iopub.execute_input": "2024-04-10T22:17:40.058324Z",
+     "iopub.status.busy": "2024-04-10T22:17:40.057954Z",
+     "iopub.status.idle": "2024-04-10T22:17:40.069187Z",
+     "shell.execute_reply": "2024-04-10T22:17:40.068353Z",
+     "shell.execute_reply.started": "2024-04-10T22:17:40.058295Z"
+    },
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  barcode  count\n",
+      "0    CGTA      1\n",
+      "1    GCCG      1\n",
+      "2    GGAG      1\n",
+      "                     fate  count\n",
+      "0           valid barcode      3\n",
+      "1     invalid outer flank      1\n",
+      "2     low quality barcode      1\n",
+      "3          read too short      1\n",
+      "4     unparseable barcode      1\n",
+      "5  failed chastity filter      0\n",
+      "6         invalid barcode      0\n"
+     ]
+    }
+   ],
+   "source": [
+    "parser_mismatch = IlluminaBarcodeParser(\n",
+    "    bclen=4,\n",
+    "    upstream=\"ACATGA\",\n",
+    "    upstream2=\"GCA\",\n",
+    "    upstream_mismatch=1,\n",
+    ")\n",
+    "barcodes_mismatch, fates_mismatch = parser_mismatch.parse(r1file.name, outer_flank_fates=True)\n",
+    "print(barcodes_mismatch)\n",
+    "print(fates_mismatch)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now create a parser that allows mismatch in `upstream` and `upstream2`, and check that we recover barcode:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-04-10T22:17:40.070486Z",
+     "iopub.status.busy": "2024-04-10T22:17:40.070109Z",
+     "iopub.status.idle": "2024-04-10T22:17:40.082614Z",
+     "shell.execute_reply": "2024-04-10T22:17:40.081961Z",
+     "shell.execute_reply.started": "2024-04-10T22:17:40.070458Z"
     }
    },
    "outputs": [
@@ -286,14 +331,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 8,
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2024-04-10T21:48:07.657977Z",
-     "iopub.status.busy": "2024-04-10T21:48:07.657654Z",
-     "iopub.status.idle": "2024-04-10T21:48:07.661132Z",
-     "shell.execute_reply": "2024-04-10T21:48:07.660520Z",
-     "shell.execute_reply.started": "2024-04-10T21:48:07.657944Z"
+     "iopub.execute_input": "2024-04-10T22:17:40.083879Z",
+     "iopub.status.busy": "2024-04-10T22:17:40.083467Z",
+     "iopub.status.idle": "2024-04-10T22:17:40.087525Z",
+     "shell.execute_reply": "2024-04-10T22:17:40.086863Z",
+     "shell.execute_reply.started": "2024-04-10T22:17:40.083850Z"
     }
    },
    "outputs": [],