Skip to content

Add more support for typing, fix some typing-related edge case bugs #125

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 49 additions & 19 deletions psm_utils/peptidoform.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

from collections import defaultdict
from typing import Iterable, List, Tuple, Union
from typing import Iterable, List, Tuple, TypedDict, Union, cast

import numpy as np
from pyteomics import mass, proforma
Expand Down Expand Up @@ -29,8 +29,10 @@ def __init__(self, proforma_sequence: Union[str, proforma.ProForma]) -> None:
----------
parsed_sequence : list
List of tuples with residue and modifications for each location.
properties : dict[str, Any]
Dict with sequence-wide properties.
properties : :py:class:`PeptidoformProperties`
Dictionary with properties of the peptidoform, including N- and C-terminal
modifications, unlocalized modifications, labile modifications, fixed
modifications, and charge state.

Examples
--------
Expand All @@ -39,6 +41,10 @@ def __init__(self, proforma_sequence: Union[str, proforma.ProForma]) -> None:
711.2567622919099

"""
self.parsed_sequence: List[Tuple[str, List[proforma.TagBase] | None]]
self.properties: PeptidoformProperties

# Parse ProForma
if isinstance(proforma_sequence, str):
try:
self.parsed_sequence, self.properties = proforma.parse(proforma_sequence)
Expand Down Expand Up @@ -66,13 +72,21 @@ def __str__(self) -> str:
def __hash__(self) -> int:
return hash(self.proforma)

def __eq__(self, __o: Union[Peptidoform, str]) -> bool:
def __eq__(self, __o: object) -> bool:
if isinstance(__o, str):
return self.proforma == __o
elif isinstance(__o, Peptidoform):
elif isinstance(__o, Peptidoform): # type: ignore[return]
return self.proforma == __o.proforma
else:
raise TypeError(f"Cannot compare {type(__o)} with Peptidoform.")
raise TypeError(f"Unsupported comparison type for Peptidoform: {type(__o)}")

def __lt__(self, __o: object) -> bool:
    """
    Order peptidoforms by their ProForma string.

    Parameters
    ----------
    __o : object
        ProForma string or other peptidoform to compare against.

    Returns
    -------
    bool
        True if the ProForma string of this peptidoform sorts before the other
        one. For any other operand type ``NotImplemented`` is returned, so that
        Python can try the reflected operation of the other operand and, if that
        fails too, raise the ``TypeError`` itself.

    """
    if isinstance(__o, str):
        return self.proforma < __o
    if isinstance(__o, Peptidoform):
        return self.proforma < __o.proforma
    # Standard protocol for unsupported operands; `a < b` still ends in TypeError
    return NotImplemented

def __iter__(self) -> Iterable[Tuple[str, Union[None, List[proforma.TagBase]]]]:
    """Iterate over the (residue, modifications) tuples in ``parsed_sequence``."""
    return iter(self.parsed_sequence)
Expand Down Expand Up @@ -188,8 +202,9 @@ def sequential_composition(self) -> list[mass.Composition]:
# Get compositions for fixed modifications by amino acid
fixed_rules = {}
for rule in self.properties["fixed_modifications"]:
for aa in rule.targets:
fixed_rules[aa] = rule.modification_tag.composition
if rule.targets is not None:
for aa in rule.targets:
fixed_rules[aa] = rule.modification_tag.composition

comp_list = []

Expand Down Expand Up @@ -220,6 +235,7 @@ def sequential_composition(self) -> list[mass.Composition]:
# Localized modifications
if tags:
for tag in tags:
tag = cast(proforma.ModificationBase, tag)
try:
position_comp += tag.composition
except (AttributeError, KeyError) as e:
Expand Down Expand Up @@ -275,7 +291,7 @@ def composition(self) -> mass.Composition:
return comp

@property
def sequential_theoretical_mass(self) -> float:
def sequential_theoretical_mass(self) -> list[float]:
"""
Monoisotopic mass of both termini and each (modified) residue.

Expand All @@ -296,8 +312,9 @@ def sequential_theoretical_mass(self) -> float:
"""
fixed_rules = {}
for rule in self.properties["fixed_modifications"]:
for aa in rule.targets:
fixed_rules[aa] = rule.modification_tag.mass
if rule.targets is not None:
for aa in rule.targets:
fixed_rules[aa] = rule.modification_tag.mass

mass_list = []

Expand Down Expand Up @@ -326,6 +343,7 @@ def sequential_theoretical_mass(self) -> float:
# Localized modifications
if tags:
for tag in tags:
tag = cast(proforma.ModificationBase, tag)
try:
position_mass += tag.mass
except (AttributeError, KeyError) as e:
Expand Down Expand Up @@ -496,15 +514,14 @@ def add_fixed_modifications(

"""
if isinstance(modification_rules, dict):
modification_rules = modification_rules.items()
modification_rules = [
modification_rules = list(modification_rules.items())

parsed_modification_rules = [
proforma.ModificationRule(proforma.process_tag_tokens(mod), targets)
for mod, targets in modification_rules
]
if self.properties["fixed_modifications"]:
self.properties["fixed_modifications"].extend(modification_rules)
else:
self.properties["fixed_modifications"] = modification_rules

self.properties.setdefault("fixed_modifications", []).extend(parsed_modification_rules)

def apply_fixed_modifications(self):
"""
Expand All @@ -530,8 +547,9 @@ def apply_fixed_modifications(self):
# Setup target_aa -> modification_list dictionary
rule_dict = defaultdict(list)
for rule in self.properties["fixed_modifications"]:
for target_aa in rule.targets:
rule_dict[target_aa].append(rule.modification_tag)
if rule.targets is not None:
for target_aa in rule.targets:
rule_dict[target_aa].append(rule.modification_tag)

# Apply modifications to sequence
for i, (aa, site_mods) in enumerate(self.parsed_sequence):
Expand All @@ -553,6 +571,18 @@ def apply_fixed_modifications(self):
self.properties["fixed_modifications"] = []


class PeptidoformProperties(TypedDict):
    """
    Property items of a :py:class:`Peptidoform`.

    Describes the sequence-wide ``properties`` dictionary that
    :py:class:`Peptidoform` receives from ``proforma.parse`` next to the parsed
    sequence.
    """

    # N-terminal modifications; may be None
    n_term: list[proforma.ModificationBase] | None
    # C-terminal modifications; may be None
    c_term: list[proforma.ModificationBase] | None
    # Unlocalized modifications (presumably those without an assigned position
    # -- TODO confirm against the pyteomics ProForma documentation)
    unlocalized_modifications: list[proforma.ModificationBase]
    # Labile modifications
    labile_modifications: list[proforma.ModificationBase]
    # Fixed modification rules; each rule is read through its ``targets`` and
    # ``modification_tag`` attributes
    fixed_modifications: list[proforma.ModificationRule]
    # Charge state of the peptidoform
    # NOTE(review): PSM.get_precursor_charge is annotated ``int | None``; check
    # whether this item can be None as well
    charge_state: proforma.ChargeState
    # Presumably global stable-isotope labels -- TODO confirm against pyteomics
    isotopes: list[proforma.StableIsotope]


def format_number_as_string(num):
"""Format number as string for ProForma mass modifications."""
# Using this method over `:+g` string formatting to avoid rounding and scientific notation
Expand Down
17 changes: 11 additions & 6 deletions psm_utils/psm.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
class PSM(BaseModel):
"""Data class representing a peptide-spectrum match (PSM)."""

peptidoform: Union[Peptidoform, str]
spectrum_id: Union[str]
peptidoform: Union[Peptidoform, str] # type: ignore
spectrum_id: str
run: Optional[str] = None
collection: Optional[str] = None
spectrum: Optional[Any] = None
Expand Down Expand Up @@ -89,25 +89,30 @@ def __init__(self, **data):
super().__init__(**data)
# Parse peptidoform
if isinstance(self.peptidoform, str):
self.peptidoform = Peptidoform(self.peptidoform)
self.peptidoform: Peptidoform = Peptidoform(self.peptidoform)
elif not isinstance(self.peptidoform, Peptidoform):
raise TypeError(
f"Peptidoform or str expected for `peptidoform`, not `{type(self.peptidoform)}`."
)

def __getitem__(self, item) -> any:
def __getitem__(self, item) -> Any:
return getattr(self, item)

def __setitem__(self, item, value: any) -> None:
def __setitem__(self, item, value: Any) -> None:
setattr(self, item, value)

@property
def precursor_mz_error(self) -> float:
    """
    Observed minus theoretical precursor m/z, in Da.

    Raises
    ------
    ValueError
        If the theoretical m/z of the peptidoform or the observed precursor m/z
        is None.

    """
    expected_mz = self.peptidoform.theoretical_mz
    # Both values must be available before the difference can be taken
    if expected_mz is not None and self.precursor_mz is not None:
        return self.precursor_mz - expected_mz
    raise ValueError(
        "Cannot calculate precursor m/z error: "
        "precursor m/z is not set or theoretical m/z cannot be calculated."
    )

def get_precursor_charge(self) -> int:
def get_precursor_charge(self) -> int | None:
"""Precursor charge, as embedded in :py:attr:`PSM.peptidoform`."""
return self.peptidoform.precursor_charge

Expand Down
47 changes: 27 additions & 20 deletions psm_utils/psm_list.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

import re
from typing import Iterable, List, Sequence
from typing import Iterator, List, Sequence, cast

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -83,13 +83,13 @@ def __str__(self):
def __add__(self, other):
    """Return a new PSMList with the PSMs of this list followed by those of ``other``."""
    combined = self.psm_list + other.psm_list
    return PSMList(psm_list=combined)

def __iter__(self) -> Iterable[PSM]:
def __iter__(self) -> Iterator[PSM]: # type: ignore[override]
return self.psm_list.__iter__()

def __len__(self) -> int:
    """Number of items in ``psm_list``."""
    return len(self.psm_list)

def __getitem__(self, item) -> PSM | list[PSM]:
def __getitem__(self, item) -> PSM | PSMList | np.ndarray:
if isinstance(item, (int, np.integer)):
# Return single PSM by index
return self.psm_list[item]
Expand Down Expand Up @@ -127,16 +127,18 @@ def __setitem__(self, item, values: Sequence) -> None:
@property
def collections(self) -> list:
"""List of collections in :py:class:`PSMList`."""
if (self["collection"] != None).any(): # noqa: E711
return list(np.unique(self["collection"]))
collection_array = np.asarray(self["collection"])
if (collection_array != None).any(): # noqa: E711
return np.unique(collection_array).tolist()
else:
return [None]

@property
def runs(self) -> list:
"""List of runs in :py:class:`PSMList`."""
if (self["run"] != None).any(): # noqa: E711
return list(np.unique(self["run"]))
run_array = np.asarray(self["run"])
if (run_array != None).any(): # noqa: E711
return np.unique(run_array).tolist()
else:
return [None]

Expand Down Expand Up @@ -168,14 +170,14 @@ def set_ranks(self, lower_score_better: bool = False):
"""Set identification ranks for all PSMs in :py:class:`PSMList`."""
columns = ["collection", "run", "spectrum_id", "score"]
self["rank"] = (
pd.DataFrame(self[columns], columns=columns)
pd.DataFrame(np.array([self[c] for c in columns]).transpose(), columns=columns)
.sort_values("score", ascending=lower_score_better)
.fillna(0) # groupby does not play well with None values
.groupby(["collection", "run", "spectrum_id"])
.cumcount()
.sort_index()
+ 1 # 1-based counting
)
).to_list()

def get_rank1_psms(self, *args, **kwargs) -> PSMList:
"""
Expand All @@ -184,9 +186,10 @@ def get_rank1_psms(self, *args, **kwargs) -> PSMList:
First runs :py:meth:`~set_ranks` with ``*args`` and ``**kwargs`` if any PSM
has no rank yet.
"""
if None in self["rank"]:
rank_array = np.asarray(self["rank"])
if None in rank_array:
self.set_ranks(*args, **kwargs)
return self[self["rank"] == 1]
return PSMList(psm_list=[self.psm_list[i] for i in np.flatnonzero(rank_array == 1)])

def find_decoys(self, decoy_pattern: str) -> None:
"""
Expand All @@ -211,9 +214,12 @@ def find_decoys(self, decoy_pattern: str) -> None:
>>> psm_list.find_decoys(r"^DECOY_")

"""
decoy_pattern = re.compile(decoy_pattern)
pattern = re.compile(decoy_pattern)
for psm in self:
psm.is_decoy = all([decoy_pattern.search(p) is not None for p in psm.protein_list])
if psm.protein_list is not None:
psm.is_decoy = all(pattern.search(p) is not None for p in psm.protein_list)
else:
psm.is_decoy = None

def calculate_qvalues(self, reverse: bool = True, **kwargs) -> None:
"""
Expand All @@ -233,7 +239,7 @@ def calculate_qvalues(self, reverse: bool = True, **kwargs) -> None:

"""
for key in ["score", "is_decoy"]:
if (self[key] == None).any(): # noqa: E711 (self[key] is a Numpy array)
if (np.asarray(self[key]) == None).any(): # noqa: E711 (self[key] is a Numpy array)
raise ValueError(
f"Cannot calculate q-values if not all PSMs have `{key}` assigned."
)
Expand Down Expand Up @@ -294,16 +300,17 @@ def add_fixed_modifications(

"""
if isinstance(modification_rules, dict):
modification_rules = modification_rules.items()
modification_rules = [
modification_rules = list(modification_rules.items())

parsed_modification_rules = [
proforma.ModificationRule(proforma.process_tag_tokens(mod), targets)
for mod, targets in modification_rules
]

for psm in self.psm_list:
if psm.peptidoform.properties["fixed_modifications"]:
psm.peptidoform.properties["fixed_modifications"].extend(modification_rules)
else:
psm.peptidoform.properties["fixed_modifications"] = modification_rules
psm.peptidoform.properties.setdefault("fixed_modifications", []).extend( # type: ignore[union-attr]
cast(list, parsed_modification_rules)
)

def apply_fixed_modifications(self):
"""
Expand Down
Loading