Skip to content

Commit

Permalink
add outer_flank_fates to classify outer flank failures differently …
Browse files Browse the repository at this point in the history
…in barcode parsing
  • Loading branch information
jbloom committed Apr 10, 2024
1 parent cb75b89 commit eccd4d6
Show file tree
Hide file tree
Showing 3 changed files with 141 additions and 50 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ The format is based on `Keep a Changelog <https://keepachangelog.com>`_.

1.6.0
-----
- Added ability to parse second upstream / downstream region in ``IlluminaBarcodeParser`` by adding ``upstream2`` and ``downstream`` parameters. Also modified ``IlluminaBarcodeParser`` so that reads will only be parsed if they are long enough to fully cover the region containing the barcodes and specified upstream / downstream sequences. Based on docs, this is how it was supposed to function before but did not. Additionally, this adds another row ("reads too short") to the fates from the barcode parser.
- Added ability to parse second upstream / downstream region in ``IlluminaBarcodeParser`` by adding ``upstream2`` and ``downstream2`` parameters. Also modified ``IlluminaBarcodeParser`` so that reads will only be parsed if they are long enough to fully cover the region containing the barcodes and specified upstream / downstream sequences. Based on docs, this is how it was supposed to function before but did not. Additionally, this adds another row ("read too short") to the fates from the barcode parser, as well as the ``outer_flank_fates`` option to ``IlluminaBarcodeParser.parse`` to separately report reads that fail only the additional (outer) upstream and downstream regions.

1.5.0
-----
Expand Down
52 changes: 49 additions & 3 deletions dms_variants/illuminabarcodeparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,11 @@ class IlluminaBarcodeParser:
must fully cover region between R1 start and barcode, and if using R2
then `upstream` must fully cover region between R2 start and barcode.
However, it is fine if R1 reads backwards past `upstream`, and if `R2`
reads forward past `downstream`.
reads forward past `downstream`. The `upstream2` and `downstream2` parameters
can be used to require additional flanking sequences. Normally these
would just be rolled into `upstream` and `downstream`, but you might
specify them separately if you are actually using them to parse additional
indices for which you want to set different mismatch criteria.
Parameters
----------
Expand Down Expand Up @@ -179,7 +183,7 @@ def __init__(
"R2": regex.compile(
f"({self.upstream2})"
+ f"{{s<={self.upstream2_mismatch}}}"
+ f"^({self.upstream})"
+ f"({self.upstream})"
+ f"{{s<={self.upstream_mismatch}}}"
+ f"(?P<bc>[ACTG]{{{self.bclen}}})"
+ f"({self.downstream})"
Expand All @@ -189,7 +193,30 @@ def __init__(
),
}

def parse(self, r1files, *, r2files=None, add_cols=None):
# build matchers that do not have upstream2 or downstream2 if needed
self._has_flank2 = (len(self.upstream2) > 0) or (len(self.downstream2) > 0)
self._matchers_no_flank2 = {
"R1": regex.compile(
f"[{self.VALID_NTS}]{{{len(self.downstream2)}}}"
+ f"({self._rcdownstream})"
+ f"{{s<={self.downstream_mismatch}}}"
+ f"(?P<bc>[ACTG]{{{self.bclen}}})"
+ f"({self._rcupstream})"
+ f"{{s<={self.upstream_mismatch}}}"
+ f"[{self.VALID_NTS}]{{{len(self.upstream2)}}}"
),
"R2": regex.compile(
f"[{self.VALID_NTS}]{{{len(self.upstream2)}}}"
+ f"^({self.upstream})"
+ f"{{s<={self.upstream_mismatch}}}"
+ f"(?P<bc>[ACTG]{{{self.bclen}}})"
+ f"({self.downstream})"
+ f"{{s<={self.downstream_mismatch}}}"
+ f"[{self.VALID_NTS}]{{{len(self.downstream2)}}}"
),
}

def parse(self, r1files, *, r2files=None, add_cols=None, outer_flank_fates=False):
"""Parse barcodes from files.
Parameters
Expand All @@ -201,6 +228,11 @@ def parse(self, r1files, *, r2files=None, add_cols=None):
add_cols : None or dict
If dict, specify names and values (i.e., sample or library names)
to be added to returned data frames.
outer_flank_fates : bool
If `True` and outer flanking regions are being used, then the output
fates separately report reads that fail only the outer flanking regions
(`upstream2` or `downstream2`). Otherwise, such failures are grouped
with the "unparseable barcode" fate.
Returns
-------
Expand All @@ -216,6 +248,9 @@ def parse(self, r1files, *, r2files=None, add_cols=None):
- "R1 / R2 disagree" (if using `r2files`)
- "low quality barcode": sequencing quality low
- "unparseable barcode": invalid flank sequence, N in barcode
- "read too short": read is too short to cover specified region
- "invalid outer flank": only `upstream2` or `downstream2` fails to
match (reported only if using `outer_flank_fates`)
Note that these data frames also include any columns specified by
`add_cols`.
Expand Down Expand Up @@ -253,6 +288,8 @@ def parse(self, r1files, *, r2files=None, add_cols=None):
}
if not r1only:
fates["R1 / R2 disagree"] = 0
if outer_flank_fates and self._has_flank2:
fates["invalid outer flank"] = 0

# min length of interest for reads
minlen = (
Expand Down Expand Up @@ -341,6 +378,15 @@ def parse(self, r1files, *, r2files=None, add_cols=None):
fates["low quality barcode"] += 1
else:
fates["R1 / R2 disagree"] += 1
elif (
outer_flank_fates
and self._has_flank2
and all(
self._matchers_no_flank2[read].fullmatch(r) is not None
for (read, r) in zip(reads, readlist)
)
):
fates["invalid outer flank"] += 1
else:
# invalid flanking sequence or N in barcode
fates["unparseable barcode"] += 1
Expand Down
137 changes: 91 additions & 46 deletions tests/illuminabarcodeparser_toy_example_w_upstream2.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,11 @@
"execution_count": 1,
"metadata": {
"execution": {
"iopub.execute_input": "2024-04-10T21:48:06.599078Z",
"iopub.status.busy": "2024-04-10T21:48:06.598706Z",
"iopub.status.idle": "2024-04-10T21:48:07.574491Z",
"shell.execute_reply": "2024-04-10T21:48:07.573386Z",
"shell.execute_reply.started": "2024-04-10T21:48:06.599045Z"
"iopub.execute_input": "2024-04-10T22:17:38.979735Z",
"iopub.status.busy": "2024-04-10T22:17:38.979371Z",
"iopub.status.idle": "2024-04-10T22:17:39.999532Z",
"shell.execute_reply": "2024-04-10T22:17:39.998756Z",
"shell.execute_reply.started": "2024-04-10T22:17:38.979703Z"
}
},
"outputs": [],
Expand All @@ -45,11 +45,11 @@
"execution_count": 2,
"metadata": {
"execution": {
"iopub.execute_input": "2024-04-10T21:48:07.578862Z",
"iopub.status.busy": "2024-04-10T21:48:07.578511Z",
"iopub.status.idle": "2024-04-10T21:48:07.584685Z",
"shell.execute_reply": "2024-04-10T21:48:07.583848Z",
"shell.execute_reply.started": "2024-04-10T21:48:07.578834Z"
"iopub.execute_input": "2024-04-10T22:17:40.003968Z",
"iopub.status.busy": "2024-04-10T22:17:40.003644Z",
"iopub.status.idle": "2024-04-10T22:17:40.011300Z",
"shell.execute_reply": "2024-04-10T22:17:40.010467Z",
"shell.execute_reply.started": "2024-04-10T22:17:40.003937Z"
}
},
"outputs": [],
Expand All @@ -72,24 +72,19 @@
"execution_count": 3,
"metadata": {
"execution": {
"iopub.execute_input": "2024-04-10T21:48:07.588864Z",
"iopub.status.busy": "2024-04-10T21:48:07.588522Z",
"iopub.status.idle": "2024-04-10T21:48:07.597119Z",
"shell.execute_reply": "2024-04-10T21:48:07.596272Z",
"shell.execute_reply.started": "2024-04-10T21:48:07.588833Z"
"iopub.execute_input": "2024-04-10T22:17:40.015148Z",
"iopub.status.busy": "2024-04-10T22:17:40.014722Z",
"iopub.status.idle": "2024-04-10T22:17:40.022961Z",
"shell.execute_reply": "2024-04-10T22:17:40.022166Z",
"shell.execute_reply.started": "2024-04-10T22:17:40.015118Z"
}
},
"outputs": [],
"source": [
"r1file = tempfile.NamedTemporaryFile(mode=\"w\")\n",
"\n",
"# valid TACG barcode, full flanking regions\n",
"_ = r1file.write(\n",
" \"@valid_CGTA_barcode\\n\"\n",
" \"CGTATCATGTTGC\\n\"\n",
" \"+\\n\"\n",
" \"?????????????\\n\"\n",
")\n",
"_ = r1file.write(\"@valid_CGTA_barcode\\n\" \"CGTATCATGTTGC\\n\" \"+\\n\" \"?????????????\\n\")\n",
"\n",
"# valid CGTA barcode, partial flanking regions\n",
"_ = r1file.write(\n",
Expand All @@ -108,9 +103,7 @@
")\n",
"\n",
"# some sites low quality\n",
"_ = r1file.write(\n",
" \"@low_quality_site\\n\" \"CGTATCATGTTGC\\n\" \"+\\n\" \"???+?????????\\n\"\n",
")\n",
"_ = r1file.write(\"@low_quality_site\\n\" \"CGTATCATGTTGC\\n\" \"+\\n\" \"???+?????????\\n\")\n",
"\n",
"# N in barcode\n",
"_ = r1file.write(\"@N_in_barcode\\n\" \"CGTNTCATGTTGC\\n\" \"+\\n\" \"?????????????\\n\")\n",
Expand Down Expand Up @@ -143,11 +136,11 @@
"execution_count": 4,
"metadata": {
"execution": {
"iopub.execute_input": "2024-04-10T21:48:07.601083Z",
"iopub.status.busy": "2024-04-10T21:48:07.600756Z",
"iopub.status.idle": "2024-04-10T21:48:07.621543Z",
"shell.execute_reply": "2024-04-10T21:48:07.620283Z",
"shell.execute_reply.started": "2024-04-10T21:48:07.601055Z"
"iopub.execute_input": "2024-04-10T22:17:40.026647Z",
"iopub.status.busy": "2024-04-10T22:17:40.026403Z",
"iopub.status.idle": "2024-04-10T22:17:40.039583Z",
"shell.execute_reply": "2024-04-10T22:17:40.038894Z",
"shell.execute_reply.started": "2024-04-10T22:17:40.026620Z"
}
},
"outputs": [
Expand Down Expand Up @@ -186,11 +179,11 @@
"execution_count": 5,
"metadata": {
"execution": {
"iopub.execute_input": "2024-04-10T21:48:07.626568Z",
"iopub.status.busy": "2024-04-10T21:48:07.626231Z",
"iopub.status.idle": "2024-04-10T21:48:07.642139Z",
"shell.execute_reply": "2024-04-10T21:48:07.641184Z",
"shell.execute_reply.started": "2024-04-10T21:48:07.626531Z"
"iopub.execute_input": "2024-04-10T22:17:40.042962Z",
"iopub.status.busy": "2024-04-10T22:17:40.042642Z",
"iopub.status.idle": "2024-04-10T22:17:40.056823Z",
"shell.execute_reply": "2024-04-10T22:17:40.056209Z",
"shell.execute_reply.started": "2024-04-10T22:17:40.042936Z"
},
"scrolled": true
},
Expand Down Expand Up @@ -229,19 +222,71 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Now create a parser that allows mismatch in `upstream` and `upstream2`, and check that we recover barcode:"
"Now classify outer flank failures differently from unparseable barcodes:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"execution": {
"iopub.execute_input": "2024-04-10T21:48:07.643592Z",
"iopub.status.busy": "2024-04-10T21:48:07.643257Z",
"iopub.status.idle": "2024-04-10T21:48:07.656472Z",
"shell.execute_reply": "2024-04-10T21:48:07.655830Z",
"shell.execute_reply.started": "2024-04-10T21:48:07.643560Z"
"iopub.execute_input": "2024-04-10T22:17:40.058324Z",
"iopub.status.busy": "2024-04-10T22:17:40.057954Z",
"iopub.status.idle": "2024-04-10T22:17:40.069187Z",
"shell.execute_reply": "2024-04-10T22:17:40.068353Z",
"shell.execute_reply.started": "2024-04-10T22:17:40.058295Z"
},
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" barcode count\n",
"0 CGTA 1\n",
"1 GCCG 1\n",
"2 GGAG 1\n",
" fate count\n",
"0 valid barcode 3\n",
"1 invalid outer flank 1\n",
"2 low quality barcode 1\n",
"3 read too short 1\n",
"4 unparseable barcode 1\n",
"5 failed chastity filter 0\n",
"6 invalid barcode 0\n"
]
}
],
"source": [
"parser_mismatch = IlluminaBarcodeParser(\n",
" bclen=4,\n",
" upstream=\"ACATGA\",\n",
" upstream2=\"GCA\",\n",
" upstream_mismatch=1,\n",
")\n",
"barcodes_mismatch, fates_mismatch = parser_mismatch.parse(r1file.name, outer_flank_fates=True)\n",
"print(barcodes_mismatch)\n",
"print(fates_mismatch)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now create a parser that allows mismatch in `upstream` and `upstream2`, and check that we recover barcode:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"execution": {
"iopub.execute_input": "2024-04-10T22:17:40.070486Z",
"iopub.status.busy": "2024-04-10T22:17:40.070109Z",
"iopub.status.idle": "2024-04-10T22:17:40.082614Z",
"shell.execute_reply": "2024-04-10T22:17:40.081961Z",
"shell.execute_reply.started": "2024-04-10T22:17:40.070458Z"
}
},
"outputs": [
Expand Down Expand Up @@ -286,14 +331,14 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 8,
"metadata": {
"execution": {
"iopub.execute_input": "2024-04-10T21:48:07.657977Z",
"iopub.status.busy": "2024-04-10T21:48:07.657654Z",
"iopub.status.idle": "2024-04-10T21:48:07.661132Z",
"shell.execute_reply": "2024-04-10T21:48:07.660520Z",
"shell.execute_reply.started": "2024-04-10T21:48:07.657944Z"
"iopub.execute_input": "2024-04-10T22:17:40.083879Z",
"iopub.status.busy": "2024-04-10T22:17:40.083467Z",
"iopub.status.idle": "2024-04-10T22:17:40.087525Z",
"shell.execute_reply": "2024-04-10T22:17:40.086863Z",
"shell.execute_reply.started": "2024-04-10T22:17:40.083850Z"
}
},
"outputs": [],
Expand Down

0 comments on commit eccd4d6

Please sign in to comment.