diff --git a/ms2pip/_utils/encoder.py b/ms2pip/_utils/encoder.py index a5c304b9..da1dc0be 100644 --- a/ms2pip/_utils/encoder.py +++ b/ms2pip/_utils/encoder.py @@ -64,6 +64,7 @@ class Encoder: """Modification-aware encoding of peptidoforms.""" + def __init__(self) -> None: """ Modification-aware encoding of peptidoforms. @@ -98,7 +99,7 @@ def __exit__(self, exc_type, exc_value, traceback): self.remove_encoder_files() def __repr__(self) -> str: - return "{}.{}({})".format( + return "{}.{}(modifications={})".format( self.__class__.__module__, self.__class__.__qualname__, self.modifications, @@ -168,7 +169,7 @@ def _configure_modification(self, target: str, modification: proforma.TagBase): logger.warning(f"Skipping modification for invalid amino acid: {target}") return None - self.modifications[(target, modification.key)] = { + self.modifications[(target, str(modification))] = { "mod_id": self._next_mod_id, "mass_shift": modification.mass, "amino_acid": target, @@ -180,42 +181,42 @@ def _configure_modification(self, target: str, modification: proforma.TagBase): def _configure_from_peptidoform(self, peptidoform: Peptidoform): """Configure encoder with modifications from single Peptidoform.""" # Get unique modifications from psm + unique_modifications = dict() try: - unique_modifications = set() for aa, mods in peptidoform.parsed_sequence: if mods: - unique_modifications.update([(aa, mod) for mod in mods]) + unique_modifications.update({(aa, str(mod)): mod for mod in mods}) for term in ["n_term", "c_term"]: if peptidoform.properties[term]: unique_modifications.update( - [(term, mod) for mod in peptidoform.properties[term]] + {(term, str(mod)): mod for mod in peptidoform.properties[term]} ) except KeyError as e: raise exceptions.UnresolvableModificationError(e.args[0]) from e # Add modification entries - for target, mod in unique_modifications: + for (target, _), mod in unique_modifications.items(): self._configure_modification(target, mod) def _configure_from_psm_list(self, psm_list: PSMList): """Configure encoder with modifications from PSMList.""" # Get unique modifications from psm_list + unique_modifications = dict() try: - unique_modifications = set() for psm in psm_list: for aa, mods in psm.peptidoform.parsed_sequence: if mods: - unique_modifications.update([(aa, mod) for mod in mods]) + unique_modifications.update({(aa, str(mod)): mod for mod in mods}) for term in ["n_term", "c_term"]: if psm.peptidoform.properties[term]: unique_modifications.update( - [(term, mod) for mod in psm.peptidoform.properties[term]] + {(term, str(mod)): mod for mod in psm.peptidoform.properties[term]} ) except KeyError as e: raise exceptions.UnresolvableModificationError(e.args[0]) from e # Add modification entries - for target, mod in unique_modifications: + for (target, _), mod in unique_modifications.items(): self._configure_modification(target, mod) def write_encoder_files(self) -> str: @@ -295,8 +296,8 @@ def encode_peptidoform(self, peptidoform: Peptidoform) -> np.ndarray: def _generate_encoding(peptidoform) -> Generator[int, None, None]: if peptidoform.properties["n_term"]: - mod_key = peptidoform.properties["n_term"][0].key - yield self.modifications["n_term", mod_key]["mod_id"] + mod_str = str(peptidoform.properties["n_term"][0]) + yield self.modifications["n_term", mod_str]["mod_id"] else: yield 0 @@ -305,15 +306,15 @@ def _generate_encoding(peptidoform) -> Generator[int, None, None]: if not mods: yield AMINO_ACID_IDS[aa] else: - yield self.modifications[aa, mods[0].key]["mod_id"] + yield self.modifications[aa, str(mods[0])]["mod_id"] except KeyError as e: raise exceptions.InvalidAminoAcidError( f"Unsupported amino acid found in peptide `{peptidoform.proforma}`" ) from e if peptidoform.properties["c_term"]: - mod_key = peptidoform.properties["c_term"][0].key - yield self.modifications["c_term", mod_key]["mod_id"] + mod_str = str(peptidoform.properties["c_term"][0]) + yield self.modifications["c_term", mod_str]["mod_id"] else: yield 0 diff --git a/tests/test_encoder.py b/tests/test_encoder.py new file mode 100644 index 00000000..5feb5f96 --- /dev/null +++ b/tests/test_encoder.py @@ -0,0 +1,91 @@ +import pytest +from psm_utils import Peptidoform, PSM, PSMList + +from ms2pip._utils.encoder import Encoder + + +class TestEncoder: + def test_from_peptidoform(self): + test_cases = [ + # Peptidoform, {(target, label): (amino_acid, amino_acid_id, mass_shift)} + ("ACDEK", {}), + ("AC[+57.021464]DEK", {("C", "+57.021464"): ("C", 1, 57.021464)}), + ("AC[U:4]", {("C", "UNIMOD:4"): ("C", 1, 57.021464)}), + ("AC[formula:H3C2NO]", {("C", "Formula:H3C2NO"): ("C", 1, 57.021464)}), + ("[Acetyl]-ACDE", {("n_term", "Acetyl"): ("n_term", -1, 42.010565)}), + ("ACDE-[Amidated]", {("c_term", "Amidated"): ("c_term", -2, -0.984016)}), + ( + "AC[+57.021464]DE-[Amidated]", + { + ("C", "+57.021464"): ("C", 1, 57.021464), + ("c_term", "Amidated"): ("c_term", -2, -0.984016), + }, + ), + ( + "[Acetyl]-AC[+57.021464]DE", + { + ("n_term", "Acetyl"): ("n_term", -1, 42.010565), + ("C", "+57.021464"): ("C", 1, 57.021464), + }, + ), + ] + + for peptidoform, expected_mods in test_cases: + encoder = Encoder.from_peptidoform(Peptidoform(peptidoform)) + for key, modification in encoder.modifications.items(): + for item_key, expected_item in zip( + ["amino_acid", "amino_acid_id", "mass_shift"], expected_mods[key] + ): + if isinstance(expected_item, float): + assert modification[item_key] == pytest.approx(expected_item) + else: + assert modification[item_key] == expected_item + + def test_from_psm_list(self): + psm_list = PSMList(psm_list=[ + PSM(peptidoform="AC[+57.021464]DEK", spectrum_id=0), + PSM(peptidoform="AC[U:4]", spectrum_id=1), + PSM(peptidoform="AC[formula:H3C2NO]", spectrum_id=2), + PSM(peptidoform="[Acetyl]-ACDE", spectrum_id=3), + PSM(peptidoform="ACDE-[Amidated]",spectrum_id= 4) + ]) + expected = { + ("C", "+57.021464"): { + "mod_id": 38, + "mass_shift": 57.021464, + "amino_acid": "C", + "amino_acid_id": 1, + }, + ("C", "UNIMOD:4"): { + "mod_id": 39, + "mass_shift": 57.021464, + "amino_acid": "C", + "amino_acid_id": 1, + }, + ("C", "Formula:H3C2NO"): { + "mod_id": 40, + "mass_shift": 57.02146372057, + "amino_acid": "C", + "amino_acid_id": 1, + }, + ("n_term", "Acetyl"): { + "mod_id": 41, + "mass_shift": 42.010565, + "amino_acid": "n_term", + "amino_acid_id": -1, + }, + ("c_term", "Amidated"): { + "mod_id": 42, + "mass_shift": -0.984016, + "amino_acid": "c_term", + "amino_acid_id": -2, + }, + } + + encoder = Encoder.from_psm_list(psm_list) + for modification_key, modification_dict in encoder.modifications.items(): + for item_key, expected_item in expected[modification_key].items(): + if isinstance(expected_item, float): + assert modification_dict[item_key] == pytest.approx(expected_item) + else: + assert modification_dict[item_key] == expected_item diff --git a/tests/test_modifications.py b/tests/test_modifications.py deleted file mode 100644 index 1a37766a..00000000 --- a/tests/test_modifications.py +++ /dev/null @@ -1,29 +0,0 @@ -import ms2pip.peptides - - -class TestModifications: - def test_add_from_ms2pip_modstrings(self): - mods = ms2pip.peptides.Modifications() - mods.add_from_ms2pip_modstrings([ - "Oxidation,15.994915,opt,M", - "Acetyl,42.010565,opt,N-term", - "Methyl,14.01565,opt,L", - ]) - - assert mods.modifications['ptm']["Oxidation"]["amino_acid"] == "M" - assert mods.modifications['ptm']["Acetyl"]["mass_shift"] == 42.010565 - assert mods.modifications['ptm']["Methyl"]["mass_shift"] == 14.01565 - - def test_get_mass_shifts(self): - mods = ms2pip.peptides.Modifications() - - mods.add_from_ms2pip_modstrings([ - "Oxidation,15.994915,opt,M" - ]) - assert mods.mass_shifts["Oxidation"] == 15.994915 - - # Test cache clear after adding new modifications - mods.add_from_ms2pip_modstrings([ - "Acetyl,42.010565,opt,N-term", - ]) - assert mods.mass_shifts["Acetyl"] == 42.010565