Skip to content

Commit

Permalink
speed up reading products
Browse files Browse the repository at this point in the history
  • Loading branch information
horta committed Nov 30, 2023
1 parent 3ebe8e8 commit ff87872
Show file tree
Hide file tree
Showing 7 changed files with 107 additions and 53 deletions.
20 changes: 7 additions & 13 deletions snap/deciphon_snap/hit.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,10 @@ def matches(self):
matches: list[Match] = []
offset = self._interval.pyinterval.start
for x in self._match_list[self.match_list_interval.slice]:
x.position = offset
if x.state.startswith("I"):
offset += len(x.query)
if x.state.startswith("M"):
offset += len(x.query)
matches.append(x)
y = Match(raw=x.raw, start=x.start, end=x.end, position=offset)
if y.is_match_state or y.is_insert_state:
offset += y.query_size
matches.append(y)
return matches


Expand Down Expand Up @@ -91,11 +89,11 @@ def make(cls: Type[T], match_list: MatchList) -> T:
match_stop = 0

for i, x in enumerate(match_list):
if not hit_start_found and is_core_state(x.state):
if not hit_start_found and x.is_core_state:
match_start = i
hit_start_found = True

if hit_start_found and not is_core_state(x.state):
if hit_start_found and not x.is_core_state:
hit_end_found = True

if hit_end_found:
Expand All @@ -107,10 +105,6 @@ def make(cls: Type[T], match_list: MatchList) -> T:
hit_start_found = False
hit_end_found = False

offset += len(x.query)
offset += x.query_size

return cls.model_validate(hits)


def is_core_state(state: str) -> bool:
    """Return True when the state name *state* begins with ``M``, ``I`` or ``D``."""
    # str.startswith accepts a tuple of prefixes and tests them in one call.
    return state.startswith(("M", "I", "D"))
94 changes: 75 additions & 19 deletions snap/deciphon_snap/match.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
from __future__ import annotations
from enum import Enum

from dataclasses import dataclass
from enum import Enum
from functools import lru_cache
from typing import List, overload

from pydantic import BaseModel, ConfigDict, RootModel
from pydantic import BaseModel, ConfigDict

from deciphon_snap.amino import AminoInterval
from deciphon_snap.interval import PyInterval
Expand All @@ -19,26 +20,69 @@ class MatchElemName(Enum):
AMINO = 4


class Match(BaseModel):
query: str
state: str
codon: str
amino: str
_position: int | None = None
@dataclass(slots=True, frozen=True)
class Match:
raw: str
start: int
end: int
position: int = -1

@classmethod
def from_string(cls, x: str):
y = x.split(",", 3)
return cls(query=y[0], state=y[1], codon=y[2], amino=y[3])
return cls(raw=x, start=0, end=len(x))

@property
def query(self):
    """Return the first comma-separated field of ``raw[start:end]``.

    The field runs from ``start`` up to (not including) the first comma
    found inside the ``[start, end)`` window of ``raw``.
    """
    begin = self.start
    first_comma = self.raw.find(",", begin, self.end)
    return self.raw[begin:first_comma]

@property
def state(self):
    """Return the second comma-separated field of ``raw[start:end]``."""
    raw, stop = self.raw, self.end
    # Step past the first comma to land on the first character of the field.
    begin = raw.find(",", self.start, stop) + 1
    closing_comma = raw.find(",", begin, stop)
    return raw[begin:closing_comma]

@property
def codon(self):
    """Return the third comma-separated field of ``raw[start:end]``."""
    raw, stop = self.raw, self.end
    begin = self.start
    # Skip over the two leading fields, one comma at a time.
    for _ in range(2):
        begin = raw.find(",", begin, stop) + 1
    closing_comma = raw.find(",", begin, stop)
    return raw[begin:closing_comma]

@property
def amino(self):
    """Return the text after the third comma of ``raw[start:end]``.

    Everything from just past the third comma up to ``end`` is returned, so
    this is the fourth (last) field of the span.
    """
    raw, stop = self.raw, self.end
    begin = self.start
    # Skip over the three leading fields, one comma at a time.
    for _ in range(3):
        begin = raw.find(",", begin, stop) + 1
    return raw[begin:stop]

@property
def query_size(self) -> int:
    """Return the length of the first comma-separated field of ``raw[start:end]``.

    Computed from the position of the first comma alone, without slicing
    ``raw``.
    """
    begin = self.start
    first_comma = self.raw.find(",", begin, self.end)
    return first_comma - begin

@property
def position(self):
assert self._position is not None
return self._position
def _state_symbol(self):
i = self.start
i = self.raw.find(",", i, self.end) + 1
return self.raw[i]

@position.setter
def position(self, x: int):
self._position = x
@property
def is_insert_state(self):
    """Return True when ``_state_symbol`` is ``"I"``."""
    symbol = self._state_symbol
    return symbol == "I"

@property
def is_match_state(self):
    """Return True when ``_state_symbol`` is ``"M"``."""
    symbol = self._state_symbol
    return symbol == "M"

@property
def is_delete_state(self):
    """Return True when ``_state_symbol`` is ``"D"``."""
    symbol = self._state_symbol
    return symbol == "D"

@property
def is_core_state(self):
    """Return True when ``_state_symbol`` is one of ``"I"``, ``"M"`` or ``"D"``."""
    # A tuple (not the string "IMD") keeps this an exact whole-value comparison.
    return self._state_symbol in ("I", "M", "D")

def __str__(self):
query = self.query if len(self.query) > 0 else "∅"
Expand All @@ -48,12 +92,14 @@ def __str__(self):
return f"({query},{state},{codon},{amino})"


class MatchList(RootModel):
@dataclass(slots=True, frozen=True)
class MatchList:
root: List[Match]

@classmethod
def from_string(cls, x: str):
return cls.model_validate([Match.from_string(i) for i in x.split(";")])
y = [i for i in ifind(x, ";")]
return cls([Match(raw=x, start=i[0], end=i[1]) for i in y])

def __len__(self):
return len(self.root)
Expand All @@ -68,7 +114,7 @@ def __getitem__(self, i: slice) -> MatchList:

def __getitem__(self, i: int | slice):
if isinstance(i, slice):
return MatchList.model_validate(self.root[i])
return MatchList(self.root[i])
match = self.root[i]
assert isinstance(match, Match)
return match
Expand Down Expand Up @@ -150,3 +196,13 @@ def codon(self):
@property
def amino(self):
    """Return the ``amino`` attribute of the object returned by ``self.evaluate()``."""
    evaluated = self.evaluate()
    return evaluated.amino


def ifind(x: str, delim: str):
    """Yield the ``(start, end)`` span of every *delim*-separated field of *x*.

    Each pair is half-open, so ``x[start:end]`` is the field text without
    the delimiter. The spans cover the same substrings that
    ``x.split(delim)`` returns, but no substring is created here.

    Args:
        x: The text to scan.
        delim: The separator between fields; may be longer than one
            character.

    Yields:
        ``(start, end)`` index tuples, in order. At least one pair is always
        produced; for an empty *x* it is ``(0, 0)``.

    Raises:
        ValueError: If *delim* is empty (same rule as ``str.split``). As this
            is a generator, the error surfaces on the first iteration.
    """
    if not delim:
        raise ValueError("empty separator")
    step = len(delim)
    start = 0
    end = x.find(delim, start)
    while end != -1:
        yield (start, end)
        # Resume after the whole delimiter, not just its first character.
        start = end + step
        end = x.find(delim, start)
    # Trailing field: whatever follows the last delimiter (possibly empty).
    yield (start, len(x))
7 changes: 3 additions & 4 deletions snap/deciphon_snap/prod.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,13 +60,12 @@ def hits(self):

@property
def matches(self):
matches = []
matches: list[Match] = []
i = 0
for x in self.match_list:
match = Match.model_validate(x)
match.position = i
match = Match(raw=x.raw, start=x.start, end=x.end, position=i)
matches.append(match)
i += len(match.query)
i += match.query_size
return MatchList(root=matches)

@property
Expand Down
2 changes: 1 addition & 1 deletion snap/deciphon_snap/query_interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def __init__(self, match_list: MatchList):
offset = 0
for x in match_list:
self._offset.append(offset)
offset += len(x.query)
offset += x.query_size
self._offset.append(offset)

def make(self, match_list_interval: MatchListInterval) -> QueryInterval:
Expand Down
21 changes: 13 additions & 8 deletions snap/deciphon_snap/snap_file.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,19 @@
from __future__ import annotations

import csv
from typing import List

import prettytable as pt
from h3result.read_h3result import read_h3result

from deciphon_snap.hmmer import H3Result
from deciphon_snap.interval import PyInterval
from deciphon_snap.match import LazyMatchList
from deciphon_snap.prod import Prod
from deciphon_snap.prod import ProdList
from deciphon_snap.prod import Prod, ProdList
from deciphon_snap.shorten import shorten
from deciphon_snap.stringify import stringify
from deciphon_snap.interval import PyInterval

__all__ = ["SnapFile"]

csv.field_size_limit(8388608)


class SnapFile:
def __init__(self, filesystem):
Expand All @@ -35,8 +31,9 @@ def __init__(self, filesystem):

with fs.open(prod_file, "rb") as file:
prods: List[Prod] = []
reader = csv.DictReader((stringify(x) for x in file), delimiter="\t")
for idx, row in enumerate(reader):
rows = [stringify(x) for x in file]
fieldnames = csv_fieldnames(rows[0])
for idx, row in enumerate((csv_parse(fieldnames, r) for r in rows[1:])):
seq_id = int(row["sequence"])
profile = str(row["profile"])
with fs.open(f"{hmmer_dir}/{seq_id}/{profile}.h3r", "rb") as f2:
Expand Down Expand Up @@ -82,3 +79,11 @@ def __str__(self):

header = f"shape: ({num_products}, {num_fields})"
return header + "\n" + x.get_string()


def csv_fieldnames(row: str) -> list[str]:
    """Split a tab-separated header line into its column names.

    Only the trailing line terminator is removed before splitting. A bare
    ``strip()`` would also remove leading and trailing tabs, dropping empty
    first or last columns and changing the column count.

    Args:
        row: One header line, with or without its trailing newline.

    Returns:
        The column names in order, one per tab-separated field.
    """
    return row.rstrip("\r\n").split("\t")


def csv_parse(fieldnames: list[str], row: str) -> dict[str, str]:
    """Map one tab-separated data line onto *fieldnames*.

    Only the trailing line terminator is removed before splitting. A bare
    ``strip()`` would also remove leading and trailing tabs, so an empty
    first field would shift every value onto the wrong column name and an
    empty last field would be lost.

    Args:
        fieldnames: Column names, in column order.
        row: One data line, with or without its trailing newline.

    Returns:
        A dict from column name to field text. Pairing stops at the shorter
        of the two sequences, so surplus names or surplus fields are ignored.
    """
    fields = row.rstrip("\r\n").split("\t")
    return dict(zip(fieldnames, fields))
2 changes: 1 addition & 1 deletion snap/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "deciphon-snap"
version = "0.8.1"
version = "0.8.2"
description = "Reader for Deciphon snap files."
authors = ["Danilo Horta <[email protected]>"]
license = "MIT"
Expand Down
14 changes: 7 additions & 7 deletions snap/tests/test_hits.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,16 +42,16 @@ def test_hits():

x = match_list[hits[0].match_list_interval.slice]
assert len(x) == 5
assert repr(x[0]) == repr(Match.from_string("GTG,M1,GTT,V"))
assert repr(x[1]) == repr(Match.from_string("AAA,I2,AAA,K"))
assert repr(x[2]) == repr(Match.from_string("ACC,M3,ACC,T"))
assert repr(x[3]) == repr(Match.from_string(",D4,,"))
assert repr(x[4]) == repr(Match.from_string(",D5,,"))
assert str(x[0]) == str(Match.from_string("GTG,M1,GTT,V"))
assert str(x[1]) == str(Match.from_string("AAA,I2,AAA,K"))
assert str(x[2]) == str(Match.from_string("ACC,M3,ACC,T"))
assert str(x[3]) == str(Match.from_string(",D4,,"))
assert str(x[4]) == str(Match.from_string(",D5,,"))

x = match_list[hits[1].match_list_interval.slice]
assert len(x) == 2
assert repr(x[0]) == repr(Match.from_string("AAA,M258,AAA,K"))
assert repr(x[1]) == repr(Match.from_string("CCG,M259,CCG,P"))
assert str(x[0]) == str(Match.from_string("AAA,M258,AAA,K"))
assert str(x[1]) == str(Match.from_string("CCG,M259,CCG,P"))

assert match_list[hits[0].match_list_interval.slice].query == "GTGAAAACC"
assert match_list[hits[1].match_list_interval.slice].query == "AAACCG"
Expand Down

0 comments on commit ff87872

Please sign in to comment.