speed up reading products

EBI-Metagenomics · Nov 30, 2023 · ff87872 · ff87872
1 parent 3ebe8e8
commit ff87872
Show file tree

Hide file tree

Showing 7 changed files with 107 additions and 53 deletions.
diff --git a/snap/deciphon_snap/hit.py b/snap/deciphon_snap/hit.py
@@ -41,12 +41,10 @@ def matches(self):
         matches: list[Match] = []
         offset = self._interval.pyinterval.start
         for x in self._match_list[self.match_list_interval.slice]:
-            x.position = offset
-            if x.state.startswith("I"):
-                offset += len(x.query)
-            if x.state.startswith("M"):
-                offset += len(x.query)
-            matches.append(x)
+            y = Match(raw=x.raw, start=x.start, end=x.end, position=offset)
+            if y.is_match_state or y.is_insert_state:
+                offset += y.query_size
+            matches.append(y)
         return matches
 
 
@@ -91,11 +89,11 @@ def make(cls: Type[T], match_list: MatchList) -> T:
         match_stop = 0
 
         for i, x in enumerate(match_list):
-            if not hit_start_found and is_core_state(x.state):
+            if not hit_start_found and x.is_core_state:
                 match_start = i
                 hit_start_found = True
 
-            if hit_start_found and not is_core_state(x.state):
+            if hit_start_found and not x.is_core_state:
                 hit_end_found = True
 
             if hit_end_found:
@@ -107,10 +105,6 @@ def make(cls: Type[T], match_list: MatchList) -> T:
                 hit_start_found = False
                 hit_end_found = False
 
-            offset += len(x.query)
+            offset += x.query_size
 
         return cls.model_validate(hits)
-
-
-def is_core_state(state: str):
-    return state.startswith("M") or state.startswith("I") or state.startswith("D")
diff --git a/snap/deciphon_snap/match.py b/snap/deciphon_snap/match.py
@@ -1,10 +1,11 @@
 from __future__ import annotations
-from enum import Enum
 
+from dataclasses import dataclass
+from enum import Enum
 from functools import lru_cache
 from typing import List, overload
 
-from pydantic import BaseModel, ConfigDict, RootModel
+from pydantic import BaseModel, ConfigDict
 
 from deciphon_snap.amino import AminoInterval
 from deciphon_snap.interval import PyInterval
@@ -19,26 +20,69 @@ class MatchElemName(Enum):
     AMINO = 4
 
 
-class Match(BaseModel):
-    query: str
-    state: str
-    codon: str
-    amino: str
-    _position: int | None = None
+@dataclass(slots=True, frozen=True)
+class Match:
+    raw: str
+    start: int
+    end: int
+    position: int = -1
 
     @classmethod
     def from_string(cls, x: str):
-        y = x.split(",", 3)
-        return cls(query=y[0], state=y[1], codon=y[2], amino=y[3])
+        return cls(raw=x, start=0, end=len(x))
+
+    @property
+    def query(self):
+        """Query field: text from ``start`` up to the record's first comma."""
+        begin = self.start
+        comma = self.raw.find(",", begin, self.end)
+        return self.raw[begin:comma]
+
+    @property
+    def state(self):
+        """State field: text between the record's first and second commas."""
+        raw, stop = self.raw, self.end
+        begin = raw.find(",", self.start, stop) + 1
+        return raw[begin : raw.find(",", begin, stop)]
+
+    @property
+    def codon(self):
+        """Codon field: text between the record's second and third commas."""
+        raw, stop = self.raw, self.end
+        begin = self.start
+        for _ in range(2):  # step past the query and state fields
+            begin = raw.find(",", begin, stop) + 1
+        return raw[begin : raw.find(",", begin, stop)]
+
+    @property
+    def amino(self):
+        """Amino field: everything after the record's third comma, up to ``end``."""
+        raw, stop = self.raw, self.end
+        begin = self.start
+        for _ in range(3):  # step past the query, state and codon fields
+            begin = raw.find(",", begin, stop) + 1
+        return raw[begin:stop]
+
+    @property
+    def query_size(self) -> int:
+        """Length of the query field, taken from the first comma's offset.
+
+        No substring is built; the result is the distance from ``start`` to
+        the first comma found before ``end``.
+        """
+        first_comma = self.raw.find(",", self.start, self.end)
+        return first_comma - self.start
 
     @property
-    def position(self):
-        assert self._position is not None
-        return self._position
+    def _state_symbol(self):
+        """Return the single character that follows the record's first comma."""
+        after_query = self.raw.find(",", self.start, self.end) + 1
+        return self.raw[after_query]
 
-    @position.setter
-    def position(self, x: int):
-        self._position = x
+    @property
+    def is_insert_state(self):
+        """Whether the state symbol is ``"I"``."""
+        symbol = self._state_symbol
+        return symbol == "I"
+
+    @property
+    def is_match_state(self):
+        """Whether the state symbol is ``"M"``."""
+        symbol = self._state_symbol
+        return symbol == "M"
+
+    @property
+    def is_delete_state(self):
+        """Whether the state symbol is ``"D"``."""
+        symbol = self._state_symbol
+        return symbol == "D"
+
+    @property
+    def is_core_state(self):
+        """Whether the state symbol is one of ``"I"``, ``"M"`` or ``"D"``."""
+        return self._state_symbol in ("I", "M", "D")
 
     def __str__(self):
         query = self.query if len(self.query) > 0 else "∅"
@@ -48,12 +92,14 @@ def __str__(self):
         return f"({query},{state},{codon},{amino})"
 
 
-class MatchList(RootModel):
+@dataclass(slots=True, frozen=True)
+class MatchList:
     root: List[Match]
 
     @classmethod
     def from_string(cls, x: str):
-        return cls.model_validate([Match.from_string(i) for i in x.split(";")])
+        y = [i for i in ifind(x, ";")]
+        return cls([Match(raw=x, start=i[0], end=i[1]) for i in y])
 
     def __len__(self):
         return len(self.root)
@@ -68,7 +114,7 @@ def __getitem__(self, i: slice) -> MatchList:
 
     def __getitem__(self, i: int | slice):
         if isinstance(i, slice):
-            return MatchList.model_validate(self.root[i])
+            return MatchList(self.root[i])
         match = self.root[i]
         assert isinstance(match, Match)
         return match
@@ -150,3 +196,13 @@ def codon(self):
     @property
     def amino(self):
         return self.evaluate().amino
+
+
+def ifind(x: str, delim: str):
+    """Yield ``(start, end)`` offsets of each ``delim``-separated field of ``x``.
+
+    Each pair is a half-open range into ``x`` itself, so callers can address
+    a field without copying it. A string that does not contain ``delim``
+    yields a single pair spanning the whole string.
+
+    Raises:
+        ValueError: on first iteration, if ``delim`` is empty (mirrors
+            ``str.split``); an empty delimiter has no meaningful fields.
+    """
+    if not delim:
+        raise ValueError("empty separator")
+    # Advance past the whole delimiter; a fixed step of 1 would leave the
+    # tail of a multi-character delimiter inside the next field.
+    step = len(delim)
+    start = 0
+    end = x.find(delim, start)
+    while end != -1:
+        yield (start, end)
+        start = end + step
+        end = x.find(delim, start)
+    yield (start, len(x))
diff --git a/snap/deciphon_snap/prod.py b/snap/deciphon_snap/prod.py
@@ -60,13 +60,12 @@ def hits(self):
 
     @property
     def matches(self):
-        matches = []
+        matches: list[Match] = []
         i = 0
         for x in self.match_list:
-            match = Match.model_validate(x)
-            match.position = i
+            match = Match(raw=x.raw, start=x.start, end=x.end, position=i)
             matches.append(match)
-            i += len(match.query)
+            i += match.query_size
         return MatchList(root=matches)
 
     @property

diff --git a/snap/deciphon_snap/query_interval.py b/snap/deciphon_snap/query_interval.py
@@ -14,7 +14,7 @@ def __init__(self, match_list: MatchList):
         offset = 0
         for x in match_list:
             self._offset.append(offset)
-            offset += len(x.query)
+            offset += x.query_size
         self._offset.append(offset)
 
     def make(self, match_list_interval: MatchListInterval) -> QueryInterval:

diff --git a/snap/deciphon_snap/snap_file.py b/snap/deciphon_snap/snap_file.py
@@ -1,23 +1,19 @@
 from __future__ import annotations
 
-import csv
 from typing import List
 
 import prettytable as pt
 from h3result.read_h3result import read_h3result
 
 from deciphon_snap.hmmer import H3Result
+from deciphon_snap.interval import PyInterval
 from deciphon_snap.match import LazyMatchList
-from deciphon_snap.prod import Prod
-from deciphon_snap.prod import ProdList
+from deciphon_snap.prod import Prod, ProdList
 from deciphon_snap.shorten import shorten
 from deciphon_snap.stringify import stringify
-from deciphon_snap.interval import PyInterval
 
 __all__ = ["SnapFile"]
 
-csv.field_size_limit(8388608)
-
 
 class SnapFile:
     def __init__(self, filesystem):
@@ -35,8 +31,9 @@ def __init__(self, filesystem):
 
         with fs.open(prod_file, "rb") as file:
             prods: List[Prod] = []
-            reader = csv.DictReader((stringify(x) for x in file), delimiter="\t")
-            for idx, row in enumerate(reader):
+            rows = [stringify(x) for x in file]
+            fieldnames = csv_fieldnames(rows[0])
+            for idx, row in enumerate((csv_parse(fieldnames, r) for r in rows[1:])):
                 seq_id = int(row["sequence"])
                 profile = str(row["profile"])
                 with fs.open(f"{hmmer_dir}/{seq_id}/{profile}.h3r", "rb") as f2:
@@ -82,3 +79,11 @@ def __str__(self):
 
         header = f"shape: ({num_products}, {num_fields})"
         return header + "\n" + x.get_string()
+
+
+def csv_fieldnames(row: str):
+    """Return the column names from a tab-separated header line.
+
+    Surrounding whitespace, including the line terminator, is stripped
+    before the line is split on tabs.
+    """
+    header = row.strip()
+    return header.split("\t")
+
+
+def csv_parse(fieldnames: list[str], row: str):
+    """Map one tab-separated ``row`` onto ``fieldnames``.
+
+    Only the trailing line terminator is removed. Stripping all surrounding
+    whitespace would also eat the tabs that delimit an empty first or last
+    field, shifting or dropping values. Names and fields are paired with
+    ``zip``, so any surplus on either side is ignored.
+    """
+    fields = row.rstrip("\r\n").split("\t")
+    return dict(zip(fieldnames, fields))
diff --git a/snap/pyproject.toml b/snap/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "deciphon-snap"
-version = "0.8.1"
+version = "0.8.2"
 description = "Reader for Deciphon snap files."
 authors = ["Danilo Horta <[email protected]>"]
 license = "MIT"

diff --git a/snap/tests/test_hits.py b/snap/tests/test_hits.py
@@ -42,16 +42,16 @@ def test_hits():
 
     x = match_list[hits[0].match_list_interval.slice]
     assert len(x) == 5
-    assert repr(x[0]) == repr(Match.from_string("GTG,M1,GTT,V"))
-    assert repr(x[1]) == repr(Match.from_string("AAA,I2,AAA,K"))
-    assert repr(x[2]) == repr(Match.from_string("ACC,M3,ACC,T"))
-    assert repr(x[3]) == repr(Match.from_string(",D4,,"))
-    assert repr(x[4]) == repr(Match.from_string(",D5,,"))
+    assert str(x[0]) == str(Match.from_string("GTG,M1,GTT,V"))
+    assert str(x[1]) == str(Match.from_string("AAA,I2,AAA,K"))
+    assert str(x[2]) == str(Match.from_string("ACC,M3,ACC,T"))
+    assert str(x[3]) == str(Match.from_string(",D4,,"))
+    assert str(x[4]) == str(Match.from_string(",D5,,"))
 
     x = match_list[hits[1].match_list_interval.slice]
     assert len(x) == 2
-    assert repr(x[0]) == repr(Match.from_string("AAA,M258,AAA,K"))
-    assert repr(x[1]) == repr(Match.from_string("CCG,M259,CCG,P"))
+    assert str(x[0]) == str(Match.from_string("AAA,M258,AAA,K"))
+    assert str(x[1]) == str(Match.from_string("CCG,M259,CCG,P"))
 
     assert match_list[hits[0].match_list_interval.slice].query == "GTGAAAACC"
     assert match_list[hits[1].match_list_interval.slice].query == "AAACCG"