diff --git a/psm_utils/peptidoform.py b/psm_utils/peptidoform.py index f92e297..7f0923a 100644 --- a/psm_utils/peptidoform.py +++ b/psm_utils/peptidoform.py @@ -1,7 +1,7 @@ from __future__ import annotations from collections import defaultdict -from typing import Iterable, List, Tuple, Union +from typing import Iterable, List, Tuple, TypedDict, Union, cast import numpy as np from pyteomics import mass, proforma @@ -29,8 +29,10 @@ def __init__(self, proforma_sequence: Union[str, proforma.ProForma]) -> None: ---------- parsed_sequence : list List of tuples with residue and modifications for each location. - properties : dict[str, Any] - Dict with sequence-wide properties. + properties : :py:class:`PeptidoformProperties` + Dictionary with properties of the peptidoform, including N- and C-terminal + modifications, unlocalized modifications, labile modifications, fixed + modifications, and charge state. Examples -------- @@ -39,6 +41,10 @@ def __init__(self, proforma_sequence: Union[str, proforma.ProForma]) -> None: 711.2567622919099 """ + self.parsed_sequence: List[Tuple[str, List[proforma.TagBase] | None]] + self.properties: PeptidoformProperties + + # Parse ProForma if isinstance(proforma_sequence, str): try: self.parsed_sequence, self.properties = proforma.parse(proforma_sequence) @@ -66,13 +72,21 @@ def __str__(self) -> str: def __hash__(self) -> int: return hash(self.proforma) - def __eq__(self, __o: Union[Peptidoform, str]) -> bool: + def __eq__(self, __o: object) -> bool: if isinstance(__o, str): return self.proforma == __o - elif isinstance(__o, Peptidoform): + elif isinstance(__o, Peptidoform): # type: ignore[return] return self.proforma == __o.proforma else: - raise TypeError(f"Cannot compare {type(__o)} with Peptidoform.") + raise TypeError(f"Unsupported comparison type for Peptidoform: {type(__o)}") + + def __lt__(self, __o: object) -> bool: + if isinstance(__o, str): + return self.proforma < __o + elif isinstance(__o, Peptidoform): + return self.proforma < __o.proforma + else: + raise TypeError(f"Unsupported comparison type for Peptidoform: {type(__o)}") def __iter__(self) -> Iterable[Tuple[str, Union[None, List[proforma.TagBase]]]]: return self.parsed_sequence.__iter__() @@ -188,8 +202,9 @@ def sequential_composition(self) -> list[mass.Composition]: # Get compositions for fixed modifications by amino acid fixed_rules = {} for rule in self.properties["fixed_modifications"]: - for aa in rule.targets: - fixed_rules[aa] = rule.modification_tag.composition + if rule.targets is not None: + for aa in rule.targets: + fixed_rules[aa] = rule.modification_tag.composition comp_list = [] @@ -220,6 +235,7 @@ def sequential_composition(self) -> list[mass.Composition]: # Localized modifications if tags: for tag in tags: + tag = cast(proforma.ModificationBase, tag) try: position_comp += tag.composition except (AttributeError, KeyError) as e: @@ -275,7 +291,7 @@ def composition(self) -> mass.Composition: return comp @property - def sequential_theoretical_mass(self) -> float: + def sequential_theoretical_mass(self) -> list[float]: """ Monoisotopic mass of both termini and each (modified) residue. @@ -296,8 +312,9 @@ def sequential_theoretical_mass(self) -> float: """ fixed_rules = {} for rule in self.properties["fixed_modifications"]: - for aa in rule.targets: - fixed_rules[aa] = rule.modification_tag.mass + if rule.targets is not None: + for aa in rule.targets: + fixed_rules[aa] = rule.modification_tag.mass mass_list = [] @@ -326,6 +343,7 @@ def sequential_theoretical_mass(self) -> float: # Localized modifications if tags: for tag in tags: + tag = cast(proforma.ModificationBase, tag) try: position_mass += tag.mass except (AttributeError, KeyError) as e: @@ -496,15 +514,14 @@ def add_fixed_modifications( """ if isinstance(modification_rules, dict): - modification_rules = modification_rules.items() - modification_rules = [ + modification_rules = list(modification_rules.items()) + + parsed_modification_rules = [ proforma.ModificationRule(proforma.process_tag_tokens(mod), targets) for mod, targets in modification_rules ] - if self.properties["fixed_modifications"]: - self.properties["fixed_modifications"].extend(modification_rules) - else: - self.properties["fixed_modifications"] = modification_rules + + self.properties.setdefault("fixed_modifications", []).extend(parsed_modification_rules) def apply_fixed_modifications(self): """ @@ -530,8 +547,9 @@ def apply_fixed_modifications(self): # Setup target_aa -> modification_list dictionary rule_dict = defaultdict(list) for rule in self.properties["fixed_modifications"]: - for target_aa in rule.targets: - rule_dict[target_aa].append(rule.modification_tag) + if rule.targets is not None: + for target_aa in rule.targets: + rule_dict[target_aa].append(rule.modification_tag) # Apply modifications to sequence for i, (aa, site_mods) in enumerate(self.parsed_sequence): @@ -553,6 +571,18 @@ def apply_fixed_modifications(self): self.properties["fixed_modifications"] = [] +class PeptidoformProperties(TypedDict): + """Property items of a :py:class:`Peptidoform`.""" + + n_term: list[proforma.ModificationBase] | None + c_term: list[proforma.ModificationBase] | None + unlocalized_modifications: list[proforma.ModificationBase] + labile_modifications: list[proforma.ModificationBase] + fixed_modifications: list[proforma.ModificationRule] + charge_state: proforma.ChargeState + isotopes: list[proforma.StableIsotope] + + def format_number_as_string(num): """Format number as string for ProForma mass modifications.""" # Using this method over `:+g` string formatting to avoid rounding and scientific notation diff --git a/psm_utils/psm.py b/psm_utils/psm.py index 2d01d08..9888d72 100644 --- a/psm_utils/psm.py +++ b/psm_utils/psm.py @@ -10,8 +10,8 @@ class PSM(BaseModel): """Data class representing a peptide-spectrum match (PSM).""" - peptidoform: Union[Peptidoform, str] - spectrum_id: Union[str] + peptidoform: Union[Peptidoform, str] # type: ignore + spectrum_id: str run: Optional[str] = None collection: Optional[str] = None spectrum: Optional[Any] = None @@ -89,25 +89,30 @@ def __init__(self, **data): super().__init__(**data) # Parse peptidoform if isinstance(self.peptidoform, str): - self.peptidoform = Peptidoform(self.peptidoform) + self.peptidoform: Peptidoform = Peptidoform(self.peptidoform) elif not isinstance(self.peptidoform, Peptidoform): raise TypeError( f"Peptidoform or str expected for `peptidoform`, not `{type(self.peptidoform)}`." ) - def __getitem__(self, item) -> any: + def __getitem__(self, item) -> Any: return getattr(self, item) - def __setitem__(self, item, value: any) -> None: + def __setitem__(self, item, value: Any) -> None: setattr(self, item, value) @property def precursor_mz_error(self) -> float: """Difference between observed and theoretical m/z in Da.""" theoretical_mz = self.peptidoform.theoretical_mz + if theoretical_mz is None or self.precursor_mz is None: + raise ValueError( + "Cannot calculate precursor m/z error: " + "precursor m/z is not set or theoretical m/z cannot be calculated." + ) return self.precursor_mz - theoretical_mz - def get_precursor_charge(self) -> int: + def get_precursor_charge(self) -> int | None: """Precursor charge, as embedded in :py:attr:`PSM.peptidoform`.""" return self.peptidoform.precursor_charge diff --git a/psm_utils/psm_list.py b/psm_utils/psm_list.py index c38ffdf..8190611 100644 --- a/psm_utils/psm_list.py +++ b/psm_utils/psm_list.py @@ -1,7 +1,7 @@ from __future__ import annotations import re -from typing import Iterable, List, Sequence +from typing import Iterator, List, Sequence, cast import numpy as np import pandas as pd @@ -83,13 +83,13 @@ def __str__(self): def __add__(self, other): return PSMList(psm_list=self.psm_list + other.psm_list) - def __iter__(self) -> Iterable[PSM]: + def __iter__(self) -> Iterator[PSM]: # type: ignore[override] return self.psm_list.__iter__() def __len__(self) -> int: return self.psm_list.__len__() - def __getitem__(self, item) -> PSM | list[PSM]: + def __getitem__(self, item) -> PSM | PSMList | np.ndarray: if isinstance(item, (int, np.integer)): # Return single PSM by index return self.psm_list[item] @@ -127,16 +127,18 @@ def __setitem__(self, item, values: Sequence) -> None: @property def collections(self) -> list: """List of collections in :py:class:`PSMList`.""" - if (self["collection"] != None).any(): # noqa: E711 - return list(np.unique(self["collection"])) + collection_array = np.asarray(self["collection"]) + if (collection_array != None).any(): # noqa: E711 + return np.unique(collection_array).tolist() else: return [None] @property def runs(self) -> list: """List of runs in :py:class:`PSMList`.""" - if (self["run"] != None).any(): # noqa: E711 - return list(np.unique(self["run"])) + run_array = np.asarray(self["run"]) + if (run_array != None).any(): # noqa: E711 + return np.unique(run_array).tolist() else: return [None] @@ -168,14 +170,14 @@ def set_ranks(self, lower_score_better: bool = False): """Set identification ranks for all PSMs in :py:class:`PSMList`.""" columns = ["collection", "run", "spectrum_id", "score"] self["rank"] = ( - pd.DataFrame(self[columns], columns=columns) + pd.DataFrame(np.array([self[c] for c in columns]).transpose(), columns=columns) .sort_values("score", ascending=lower_score_better) .fillna(0) # groupby does not play well with None values .groupby(["collection", "run", "spectrum_id"]) .cumcount() .sort_index() + 1 # 1-based counting - ) + ).to_list() def get_rank1_psms(self, *args, **kwargs) -> PSMList: """ @@ -184,9 +186,10 @@ def get_rank1_psms(self, *args, **kwargs) -> PSMList: First runs :py:meth:`~set_ranks` with ``*args`` and ``**kwargs`` if if any PSM has no rank yet. """ - if None in self["rank"]: + rank_array = np.asarray(self["rank"]) + if None in rank_array: self.set_ranks(*args, **kwargs) - return self[self["rank"] == 1] + return PSMList(psm_list=[self.psm_list[i] for i in np.flatnonzero(rank_array == 1)]) def find_decoys(self, decoy_pattern: str) -> None: """ @@ -211,9 +214,12 @@ def find_decoys(self, decoy_pattern: str) -> None: >>> psm_list.find_decoys(r"^DECOY_") """ - decoy_pattern = re.compile(decoy_pattern) + pattern = re.compile(decoy_pattern) for psm in self: - psm.is_decoy = all([decoy_pattern.search(p) is not None for p in psm.protein_list]) + if psm.protein_list is not None: + psm.is_decoy = all(pattern.search(p) is not None for p in psm.protein_list) + else: + psm.is_decoy = None def calculate_qvalues(self, reverse: bool = True, **kwargs) -> None: """ @@ -233,7 +239,7 @@ def calculate_qvalues(self, reverse: bool = True, **kwargs) -> None: """ for key in ["score", "is_decoy"]: - if (self[key] == None).any(): # noqa: E711 (self[key] is a Numpy array) + if (np.asarray(self[key]) == None).any(): # noqa: E711 (self[key] is a Numpy array) raise ValueError( f"Cannot calculate q-values if not all PSMs have `{key}` assigned." ) @@ -294,16 +300,17 @@ def add_fixed_modifications( """ if isinstance(modification_rules, dict): - modification_rules = modification_rules.items() - modification_rules = [ + modification_rules = list(modification_rules.items()) + + parsed_modification_rules = [ proforma.ModificationRule(proforma.process_tag_tokens(mod), targets) for mod, targets in modification_rules ] + for psm in self.psm_list: - if psm.peptidoform.properties["fixed_modifications"]: - psm.peptidoform.properties["fixed_modifications"].extend(modification_rules) - else: - psm.peptidoform.properties["fixed_modifications"] = modification_rules + psm.peptidoform.properties.setdefault("fixed_modifications", []).extend( # type: ignore[union-attr] + cast(list, parsed_modification_rules) + ) def apply_fixed_modifications(self): """