fixed seqspec index and seqspec onlist to use the RegionCoordinate class

pachterlab · Mar 11, 2024 · f2ad2dd · f2ad2dd
1 parent c22574c
commit f2ad2dd
Show file tree

Hide file tree

Showing 12 changed files with 301 additions and 152 deletions.
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
@@ -17,6 +17,8 @@
 - `Assay` attribute `sequencer` changed to `sequence_protocol`
 - `Assay` function `get_modality` changed to `get_libspec`
 - `Region` function `update_attr` uses the `max_len` to generate `random` and `onlist` sequence lengths instead of `min_len`
+- `Region` function `get_region_by_type` changed to `get_region_by_region_type` to disambiguate between `region_type` and `sequence_type`
+- `seqspec onlist` (by default) searches for onlists in the `Region`s intersected by the `Read` passed to `-r`.
 
 ### Added
 
@@ -25,7 +27,6 @@
 - Add `sequence_spec` to the seqspec json schema
 - Add `Read` object to specification document
 - Add `Read` generator to website GUI
-- Add prior version seqspec schema to seqspec/schema (note to self, this must be done for every release)
 - Add pattern matching to `date` in `Assay` (expected date format: DAY MONTH YEAR, where day is one or two digits, month is the full month name starting with a capital letter, and year is the full year)
 - Add `library_kit` to `Assay` object (kit that adds seq adapters)
 - Add `library_protocol` to `Assay` object (library that generates insert)
@@ -45,6 +46,8 @@
   - check that the min len is less than or equal to the max len
   - check that the length of the sequence is between min and max len
     - Note: a strong assumption in `seqspec print` is that the sequence has a length equal to the `max_len` for visualization purposes
+- Add `RegionCoordinate` object that maps `Region` min/max lengths to 0-indexed positions
+- `seqspec onlist` searches for onlists in a `Region` based on the `--region` flag
 
 ### Removed
 

diff --git a/examples/seqspec-dev.ipynb b/examples/seqspec-dev.ipynb
@@ -294,7 +294,7 @@
         "from datetime import datetime\n",
         "import matplotlib.dates as mdates\n",
         "\n",
-        "from seqspec.utils import load_spec, get_cuts\n",
+        "from seqspec.utils import load_spec, project_regions_to_coordinates\n",
         "from seqspec.seqspec_index import run_index\n",
         "from seqspec.seqspec_find import run_find\n",
         "import os\n",
@@ -331,7 +331,7 @@
     {
       "cell_type": "code",
       "source": [
-        "from seqspec.utils import get_cuts\n",
+        "from seqspec.utils import project_regions_to_coordinates\n",
         "\n",
         "def complement_nucleotide(nucleotide):\n",
         "    complements = {\n",
@@ -353,7 +353,7 @@
         "  p = []\n",
         "  n = []\n",
         "  leaves = libspec.get_leaves()\n",
-        "  cuts = get_cuts(leaves)\n",
+        "  cuts = project_regions_to_coordinates(leaves)\n",
         "  for idx, read in enumerate(seqspec, 1):\n",
         "    read_len = read.max_len\n",
         "    read_id = read.read_id\n",
@@ -450,4 +450,4 @@
       "outputs": []
     }
   ]
-}
+}
diff --git a/examples/seqspec_dev2.ipynb b/examples/seqspec_dev2.ipynb
@@ -97,7 +97,7 @@
     {
       "cell_type": "code",
       "source": [
-        "def get_cuts(regions, cuts=[]):\n",
+        "def project_regions_to_coordinates(regions, cuts=[]):\n",
         "    if not cuts:\n",
         "        cuts = []\n",
         "    prev = 0\n",
@@ -156,7 +156,7 @@
         "    rgn = leaves[primer_idx + 1:]\n",
         "\n",
         "# get the cuts for all of the atomic elements (tuples of 0-indexed start stop)\n",
-        "cuts = get_cuts(rgn)\n",
+        "cuts = project_regions_to_coordinates(rgn)\n",
         "\n",
         "# associate each cut with its region type\n",
         "for idx, r in enumerate(rgn):\n",

diff --git a/seqspec/Region.py b/seqspec/Region.py
@@ -129,14 +129,14 @@ def get_region_by_id(self, region_id, found=[]):
                 found = r.get_region_by_id(region_id, found)
         return found
 
-    def get_region_by_type(self, region_type, found=[]):
+    def get_region_by_region_type(self, region_type, found=[]):
         if not found:
             found = []
         if self.region_type == region_type:
             found.append(self)
         if self.regions:
             for r in self.regions:
-                found = r.get_region_by_type(region_type, found)
+                found = r.get_region_by_region_type(region_type, found)
         return found
 
     def get_onlist_regions(self, found=[]):
@@ -222,6 +222,123 @@ def update_region_by_id(
                 target_region.max_len = max_len
         return
 
+    def reverse(self):
+        if self.regions:
+            # recurse into the sub regions, visiting them in reverse order (the list itself is not reordered)
+            for r in self.regions[::-1]:
+                r.reverse()
+        else:
+            # reverse the actual sequence
+            self.sequence = self.sequence[::-1]
+        return
+
+    def complement(self):
+        if self.regions:
+            for r in self.regions:
+                r.complement()
+        else:
+            self.sequence = complement_sequence(self.sequence)
+
+
+def complement_nucleotide(nucleotide):
+    complements = {
+        "A": "T",
+        "T": "A",
+        "G": "C",
+        "C": "G",
+        "R": "Y",
+        "Y": "R",
+        "S": "S",
+        "W": "W",
+        "K": "M",
+        "M": "K",
+        "B": "V",
+        "D": "H",
+        "V": "B",
+        "H": "D",
+        "N": "N",
+        "X": "X",
+    }
+    return complements.get(
+        nucleotide, "N"
+    )  # Default to 'N' if nucleotide is not recognized
+
+
+def complement_sequence(sequence):
+    return "".join(complement_nucleotide(n) for n in sequence.upper())
+
+
+class RegionCoordinate(Region):
+    def __init__(
+        self,
+        region: Region,
+        start: int = 0,
+        stop: int = 0,
+    ):
+        super().__init__(
+            region.region_id,
+            region.region_type,
+            region.name,
+            region.sequence_type,
+            region.sequence,
+            region.min_len,
+            region.max_len,
+            region.onlist,
+            region.regions,
+        )
+        self.start = start
+        self.stop = stop
+
+    def __repr__(self):
+        return f"RegionCoordinate {self.name} [{self.region_type}]: ({self.start}, {self.stop})"
+
+    def __str__(self):
+        return f"RegionCoordinate {self.name} [{self.region_type}]: ({self.start}, {self.stop})"
+
+    def __eq__(self, other):
+        return self.start == other.start and self.stop == other.stop
+
+
+def project_regions_to_coordinates(
+    regions: List[Region], rcs: List[RegionCoordinate] = []
+) -> List[RegionCoordinate]:
+    if not rcs:
+        rcs = []
+    prev = 0
+    for r in regions:
+        nxt = prev + r.max_len
+        rc = RegionCoordinate(r, prev, nxt)
+        rcs.append(rc)
+        prev = nxt
+    return rcs
+
+
+def itx_read(
+    region_coordinates: List[RegionCoordinate], read_start: int, read_stop: int
+):
+    # return the region_coordinates that intersect the read [read_start, read_stop), clipped (in place) to the read
+    new_rcs = []
+
+    for idx, rc in enumerate(region_coordinates):
+        # read start after rc ends, ignore
+        if read_start >= rc.stop:
+            continue
+        # read stop before rc starts, ignore
+        if read_stop <= rc.start:
+            continue
+
+        # every rc reaching this point overlaps the read; clip it to the read bounds
+
+        # read start in rc, update start
+        if read_start >= rc.start:
+            rc.start = read_start
+        # read stop in rc, update stop
+        if read_stop < rc.stop:
+            rc.stop = read_stop
+        new_rcs.append(rc)
+
+    return new_rcs
+
 
 class Onlist(yaml.YAMLObject):
     yaml_tag = "!Onlist"

diff --git a/seqspec/seqspec_check.py b/seqspec/seqspec_check.py
@@ -103,8 +103,10 @@ def run_check(schema, spec, spec_fn):
     # get all of the regions with type fastq in the spec and check that those files exist relative to the path of the spec
     fqrgns = []
     for m in modes:
-        fqrgns += [i for i in spec.get_libspec(m).get_region_by_type("fastq")]
-        fqrgns += [i for i in spec.get_libspec(m).get_region_by_type("fastq_link")]
+        fqrgns += [i for i in spec.get_libspec(m).get_region_by_region_type("fastq")]
+        fqrgns += [
+            i for i in spec.get_libspec(m).get_region_by_region_type("fastq_link")
+        ]
     for fqrgn in fqrgns:
         if fqrgn.region_type == "fastq":
             check = path.join(path.dirname(spec_fn), fqrgn.region_id)

diff --git a/seqspec/seqspec_find.py b/seqspec/seqspec_find.py
@@ -75,5 +75,5 @@ def run_find(spec: Assay, modality: str, region_id: str):
 
 def run_find_by_type(spec: Assay, modality: str, region_type: str):
     m = spec.get_libspec(modality)
-    regions = m.get_region_by_type(region_type)
+    regions = m.get_region_by_region_type(region_type)
     return regions